Compare commits

...

20 Commits

Author SHA1 Message Date
liyang
4b580f4037 feat: release binary to aws s3 (#1881) 2023-07-04 22:33:35 +08:00
Weny Xu
ee16262b45 feat: add create table procedure (#1845)
* feat: add create table procedure

* feat: change table_info type from vec u8 to RawTableInfo

* feat: return create table status

* fix: fix uncaught error

* refactor: use a notifier to respond to callers

* chore: apply suggestions from CR

* chore: apply suggestions from CR

* chore: add comment

* chore: apply suggestions from CR

* refactor: make CreateMetadata step after DatanodeCreateTable step
2023-07-04 22:24:43 +08:00
Yingwen
f37b394f1a fix: check table existence in create table procedure (#1880)
* fix: check table existence in table procedures

* fix: use correct error variant

* chore: address review comments

* chore: address comments

* test: change error code
2023-07-04 22:01:27 +08:00
Eugene Tolbakov
ccee60f37d feat(http_body_limit): add initial support for DefaultBodyLimit (#1860)
* feat(http_body_limit): add initial support for DefaultBodyLimit

* fix: address CR suggestions

* fix: adjust the const for default http body limit

* fix: adjust the toml_str for the test

* fix: address CR suggestions

* fix: body_limit units in example config toml files

* fix: address clippy suggestions
2023-07-04 20:56:56 +08:00
Ruihang Xia
bee8323bae chore: bump sqlness to 0.5.0 (#1877)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-07-04 19:49:12 +08:00
Weny Xu
000df8cf1e feat: add ddl client (#1856)
* feat: add ddl client

* chore: apply suggestions from CR

* chore: apply suggestions from CR
2023-07-04 19:32:02 +08:00
Yingwen
884731a2c8 chore: initialize mito2 crate (#1875) 2023-07-04 17:55:00 +08:00
shuiyisong
2922c25a16 chore: stop caching None in CachedMetaKvBackend (#1871)
* chore: dont cache none

* fix: test case

* chore: add comment

* chore: minor rewrite
2023-07-04 17:17:48 +08:00
Lei, HUANG
4dec06ec86 chore: bump version 0.3.2 (#1876)
bump version 0.3.2
2023-07-04 17:04:27 +08:00
Lei, HUANG
3b6f70cde3 feat: initial twcs impl (#1851)
* feat: initial twcs impl

* chore: rename SimplePicker to LeveledPicker

* rename some structs

* Remove Compaction strategy

* make compaction picker a trait object

* make compaction picker configurable for every region

* chore: add some test for ttl

* add some tests

* fix: some style issues in cr

* feat: enable twcs when creating tables

* feat: allow config time window when creating tables

* fix: some cr comments
2023-07-04 16:42:27 +08:00
Yingwen
b8e92292d2 feat: Implement a new scan mode using a chain reader (#1857)
* feat: add log

* feat: print more info

* feat: use chain reader

* fix: panic on getting first range

* fix: prev not updated

* fix: reverse readers and iter backward

* chore: don't print windows in log

* feat: consider memtable range

Also fix the issue of using an incorrect comparison method to sort time ranges.

* fix: merge memtable window with sst's

* feat: add use_chain_reader option

* feat: skip empty memtables

* chore: change log level

* fix: memtable range not ordered

* style: fix clippy

* chore: address review comments

* chore: print region id in log
2023-07-04 16:01:34 +08:00
Ruihang Xia
746fe8b4fe fix: use mark-deletion for system catalog (#1874)
* fix: use mark-deletion for system catalog

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix the default value

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean tables

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-07-04 16:00:39 +08:00
JeremyHi
20f2fc4a2a feat: add leader kv store cache for metadata (#1853)
* feat: add leader kv store cache for metadata

* refactor: create cache internal

* fix: race condition

* fix: race condition on read
2023-07-04 15:49:42 +08:00
Yingwen
2ef84f64f1 feat(servers): enlarge default body limit to 64M (#1873) 2023-07-04 07:13:14 +00:00
fys
451cc02d8d chore: add feature for metrics-process, default enable (#1870)
chore: add feature for metrics process, default enable
2023-07-04 13:28:33 +08:00
Lei, HUANG
b466ef6cb6 fix: libz dependency (#1867) 2023-07-03 10:08:53 +00:00
LFC
5b42e15105 refactor: add TableInfoKey and TableRegionKey (#1865)
* refactor: add TableInfoKey and TableRegionKey

* refactor: move KvBackend to common-meta

* fix: resolve PR comments
2023-07-03 18:01:20 +08:00
shuiyisong
e1bb7acfe5 fix: return err msg if use wrong database in MySQL (#1866) 2023-07-03 17:31:09 +08:00
Lei, HUANG
2c0c4672b4 feat: support building binary for centos7 (#1863)
feat: support building binary for centos7
2023-07-03 14:13:55 +08:00
Cao Zhengjia
e54415e723 feat: Make heartbeat intervals configurable in Frontend and Datanode (#1864)
* update frontend options and config

* fix format
2023-07-03 12:08:47 +08:00
128 changed files with 4800 additions and 906 deletions

View File

@@ -127,6 +127,21 @@ jobs:
name: ${{ matrix.file }}.sha256sum
path: target/${{ matrix.arch }}/${{ env.CARGO_PROFILE }}/${{ matrix.file }}.sha256sum
- name: Configure tag
shell: bash
if: github.event_name == 'push'
run: |
VERSION=${{ github.ref_name }}
echo "TAG=${VERSION:1}" >> $GITHUB_ENV
- name: Upload to S3
run: |
aws s3 sync target/${{ matrix.arch }}/${{ env.CARGO_PROFILE }} s3://${{ secrets.GREPTIMEDB_RELEASE_BUCKET_NAME }}/releases/${TAG}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: ${{ secrets.AWS_CN_REGION }}
build-linux:
name: Build linux binary
strategy:
@@ -288,6 +303,21 @@ jobs:
name: ${{ matrix.file }}.sha256sum
path: target/${{ matrix.arch }}/${{ env.CARGO_PROFILE }}/${{ matrix.file }}.sha256sum
- name: Configure tag
shell: bash
if: github.event_name == 'push'
run: |
VERSION=${{ github.ref_name }}
echo "TAG=${VERSION:1}" >> $GITHUB_ENV
- name: Upload to S3
run: |
aws s3 sync target/${{ matrix.arch }}/${{ env.CARGO_PROFILE }} s3://${{ secrets.GREPTIMEDB_RELEASE_BUCKET_NAME }}/releases/${TAG}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: ${{ secrets.AWS_CN_REGION }}
docker:
name: Build docker image
needs: [build-linux, build-macos]

Cargo.lock generated
View File

@@ -199,7 +199,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arrow-flight",
"common-base",
@@ -841,7 +841,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arrow",
"clap 4.3.2",
@@ -1224,7 +1224,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"arc-swap",
@@ -1509,7 +1509,7 @@ checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "client"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"arrow-flight",
@@ -1535,7 +1535,7 @@ dependencies = [
"prost",
"rand",
"snafu",
"substrait 0.4.0",
"substrait 0.3.2",
"substrait 0.7.5",
"tokio",
"tokio-stream",
@@ -1572,7 +1572,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"anymap",
"build-data",
@@ -1602,7 +1602,7 @@ dependencies = [
"servers",
"session",
"snafu",
"substrait 0.4.0",
"substrait 0.3.2",
"temp-env",
"tikv-jemallocator",
"tokio",
@@ -1634,7 +1634,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"anymap",
"bitvec",
@@ -1648,7 +1648,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-trait",
"chrono",
@@ -1665,7 +1665,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arrow",
"arrow-schema",
@@ -1691,7 +1691,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"snafu",
"strum",
@@ -1699,7 +1699,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arc-swap",
"chrono-tz 0.6.3",
@@ -1722,7 +1722,7 @@ dependencies = [
[[package]]
name = "common-function-macro"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arc-swap",
"backtrace",
@@ -1738,7 +1738,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"arrow-flight",
@@ -1768,7 +1768,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-trait",
@@ -1787,7 +1787,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"common-error",
"snafu",
@@ -1800,9 +1800,10 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-stream",
"async-trait",
"chrono",
"common-catalog",
@@ -1811,6 +1812,8 @@ dependencies = [
"common-telemetry",
"common-time",
"datatypes",
"futures",
"prost",
"serde",
"serde_json",
"snafu",
@@ -1821,7 +1824,7 @@ dependencies = [
[[package]]
name = "common-pprof"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"common-error",
"pprof",
@@ -1832,7 +1835,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-stream",
"async-trait",
@@ -1854,7 +1857,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-trait",
"common-procedure",
@@ -1862,7 +1865,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-trait",
@@ -1882,7 +1885,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"common-error",
"datafusion",
@@ -1898,7 +1901,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-trait",
"common-error",
@@ -1914,7 +1917,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"backtrace",
"common-error",
@@ -1939,7 +1942,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"once_cell",
"rand",
@@ -1948,7 +1951,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"chrono",
"chrono-tz 0.8.2",
@@ -2588,7 +2591,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-compat",
@@ -2644,7 +2647,7 @@ dependencies = [
"sql",
"storage",
"store-api",
"substrait 0.4.0",
"substrait 0.3.2",
"table",
"table-procedure",
"tokio",
@@ -2658,7 +2661,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arrow",
"arrow-array",
@@ -3099,7 +3102,7 @@ dependencies = [
[[package]]
name = "file-table-engine"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-trait",
"common-catalog",
@@ -3208,7 +3211,7 @@ dependencies = [
[[package]]
name = "frontend"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-compat",
@@ -3263,7 +3266,7 @@ dependencies = [
"storage",
"store-api",
"strfmt",
"substrait 0.4.0",
"substrait 0.3.2",
"table",
"tokio",
"toml",
@@ -4106,7 +4109,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=7aeaeaba1e0ca6a5c736b6ab2eb63144ae3d284b#7aeaeaba1e0ca6a5c736b6ab2eb63144ae3d284b"
source = "git+https://github.com/WenyXu/greptime-proto.git?rev=1eda4691a5d2c8ffc463d48ca2317905ba7e4b2d#1eda4691a5d2c8ffc463d48ca2317905ba7e4b2d"
dependencies = [
"prost",
"serde",
@@ -4869,7 +4872,7 @@ checksum = "518ef76f2f87365916b142844c16d8fefd85039bc5699050210a7778ee1cd1de"
[[package]]
name = "log-store"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arc-swap",
"async-stream",
@@ -5131,7 +5134,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-trait",
@@ -5159,7 +5162,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"anymap",
"api",
@@ -5167,6 +5170,7 @@ dependencies = [
"async-trait",
"catalog",
"chrono",
"client",
"common-base",
"common-catalog",
"common-error",
@@ -5352,7 +5356,7 @@ dependencies = [
[[package]]
name = "mito"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"anymap",
"arc-swap",
@@ -5387,6 +5391,10 @@ dependencies = [
"tokio",
]
[[package]]
name = "mito2"
version = "0.3.2"
[[package]]
name = "moka"
version = "0.9.7"
@@ -5823,7 +5831,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"anyhow",
"async-trait",
@@ -6217,7 +6225,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-trait",
@@ -6804,7 +6812,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-recursion",
"async-trait",
@@ -7054,7 +7062,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"ahash 0.8.3",
"approx_eq",
@@ -7108,7 +7116,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.4.0",
"substrait 0.3.2",
"table",
"tokio",
"tokio-stream",
@@ -8284,7 +8292,7 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "script"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arrow",
"async-trait",
@@ -8539,7 +8547,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"aide",
"api",
@@ -8627,7 +8635,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arc-swap",
"common-catalog",
@@ -8902,7 +8910,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"common-base",
@@ -8934,8 +8942,9 @@ dependencies = [
[[package]]
name = "sqlness"
version = "0.4.3"
source = "git+https://github.com/CeresDB/sqlness.git?rev=a4663365795d2067eb53966c383e1bb0c89c7627#a4663365795d2067eb53966c383e1bb0c89c7627"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0860f149718809371602b42573693e1ed2b1d0aed35fe69e04e4e4e9918d81f7"
dependencies = [
"async-trait",
"derive_builder 0.11.2",
@@ -8948,7 +8957,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-trait",
"client",
@@ -9130,7 +9139,7 @@ dependencies = [
[[package]]
name = "storage"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"arc-swap",
"arrow",
@@ -9183,7 +9192,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-stream",
"async-trait",
@@ -9298,7 +9307,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-recursion",
"async-trait",
@@ -9453,7 +9462,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"anymap",
"async-trait",
@@ -9489,7 +9498,7 @@ dependencies = [
[[package]]
name = "table-procedure"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"async-trait",
"catalog",
@@ -9582,7 +9591,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.4.0"
version = "0.3.2"
dependencies = [
"api",
"async-trait",

View File

@@ -33,6 +33,7 @@ members = [
"src/meta-client",
"src/meta-srv",
"src/mito",
"src/mito2",
"src/object-store",
"src/partition",
"src/promql",
@@ -50,7 +51,7 @@ members = [
]
[workspace.package]
version = "0.4.0"
version = "0.3.2"
edition = "2021"
license = "Apache-2.0"
@@ -72,7 +73,7 @@ datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "63e52dde9e44cac4b1f6c6e6b6bf6368ba3bd323" }
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "7aeaeaba1e0ca6a5c736b6ab2eb63144ae3d284b" }
greptime-proto = { git = "https://github.com/WenyXu/greptime-proto.git", rev = "1eda4691a5d2c8ffc463d48ca2317905ba7e4b2d" }
itertools = "0.10"
parquet = "40.0"
paste = "1.0"

View File

@@ -1,7 +1,7 @@
[build]
pre-build = [
"dpkg --add-architecture $CROSS_DEB_ARCH",
"apt update && apt install -y unzip zlib1g-dev:$CROSS_DEB_ARCH",
"apt update && apt install -y unzip zlib1g-dev zlib1g-dev:$CROSS_DEB_ARCH",
"curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip && unzip protoc-3.15.8-linux-x86_64.zip -d /usr/",
"chmod a+x /usr/bin/protoc && chmod -R a+rx /usr/include/google",
]

View File

@@ -10,6 +10,8 @@ rpc_addr = "127.0.0.1:3001"
rpc_hostname = "127.0.0.1"
# The number of gRPC server worker threads, 8 by default.
rpc_runtime_size = 8
# Interval for sending heartbeat messages to the Metasrv in milliseconds, 5000 by default.
heartbeat_interval_millis = 5000
# Metasrv client options.
[meta_client_options]

View File

@@ -1,10 +1,15 @@
# Node running mode, see `standalone.example.toml`.
mode = "distributed"
# Interval for sending heartbeat messages to the Metasrv in milliseconds, 5000 by default.
heartbeat_interval_millis = 5000
# Interval for retrying to send heartbeat messages in milliseconds, 5000 by default.
retry_interval_millis = 5000
# HTTP server options, see `standalone.example.toml`.
[http_options]
addr = "127.0.0.1:4000"
timeout = "30s"
body_limit = "64MB"
# gRPC server options, see `standalone.example.toml`.
[grpc_options]

View File

@@ -9,6 +9,9 @@ enable_memory_catalog = false
addr = "127.0.0.1:4000"
# HTTP request timeout, 30s by default.
timeout = "30s"
# HTTP request body limit, 64MB by default.
# The following units are supported: B, KB, KiB, MB, MiB, GB, GiB, TB, TiB, PB, PiB.
body_limit = "64MB"
# gRPC server options.
[grpc_options]

View File

@@ -0,0 +1,29 @@
FROM centos:7
ENV LANG en_US.utf8
WORKDIR /greptimedb
RUN sed -e 's|^mirrorlist=|#mirrorlist=|g' \
-e 's|^#baseurl=http://mirror.centos.org/centos|baseurl=http://mirrors.tuna.tsinghua.edu.cn/centos|g' \
-i.bak \
/etc/yum.repos.d/CentOS-*.repo
# Install dependencies
RUN ulimit -n 1024000 && yum groupinstall -y 'Development Tools'
RUN yum install -y epel-release \
openssl \
openssl-devel \
centos-release-scl \
rh-python38 \
rh-python38-python-devel
# Install protoc
RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip
RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/
# Install Rust
SHELL ["/bin/bash", "-c"]
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y
ENV PATH /opt/rh/rh-python38/root/usr/bin:/usr/local/bin:/root/.cargo/bin/:$PATH
CMD ["cargo", "build", "--release"]

View File

@@ -243,6 +243,12 @@ pub enum Error {
#[snafu(display("A generic error has occurred, msg: {}", msg))]
Generic { msg: String, location: Location },
#[snafu(display("Table metadata manager error: {}", source))]
TableMetadataManager {
source: common_meta::error::Error,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -298,6 +304,7 @@ impl ErrorExt for Error {
Error::Unimplemented { .. } | Error::NotSupported { .. } => StatusCode::Unsupported,
Error::QueryAccessDenied { .. } => StatusCode::AccessDenied,
Error::Datafusion { .. } => StatusCode::EngineExecuteQuery,
Error::TableMetadataManager { source, .. } => source.status_code(),
}
}

View File

@@ -243,9 +243,12 @@ impl LocalCatalogManager {
info!("Registered schema: {:?}", s);
}
Entry::Table(t) => {
max_table_id = max_table_id.max(t.table_id);
if t.is_deleted {
continue;
}
self.open_and_register_table(&t).await?;
info!("Registered table: {:?}", t);
max_table_id = max_table_id.max(t.table_id);
}
}
}
@@ -602,6 +605,7 @@ mod tests {
table_name: "T1".to_string(),
table_id: 1,
engine: MITO_ENGINE.to_string(),
is_deleted: false,
}),
Entry::Catalog(CatalogEntry {
catalog_name: "C2".to_string(),
@@ -623,6 +627,7 @@ mod tests {
table_name: "T2".to_string(),
table_id: 2,
engine: MITO_ENGINE.to_string(),
is_deleted: false,
}),
];
let res = LocalCatalogManager::sort_entries(vec);

View File

@@ -12,18 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::fmt::Debug;
use std::pin::Pin;
use std::sync::Arc;
pub use client::{CachedMetaKvBackend, MetaKvBackend};
use futures::Stream;
use futures_util::StreamExt;
pub use manager::RemoteCatalogManager;
use crate::error::Error;
mod client;
mod manager;
@@ -31,59 +24,6 @@ mod manager;
pub mod mock;
pub mod region_alive_keeper;
#[derive(Debug, Clone)]
pub struct Kv(pub Vec<u8>, pub Vec<u8>);
pub type ValueIter<'a, E> = Pin<Box<dyn Stream<Item = Result<Kv, E>> + Send + 'a>>;
#[async_trait::async_trait]
pub trait KvBackend: Send + Sync {
fn range<'a, 'b>(&'a self, key: &[u8]) -> ValueIter<'b, Error>
where
'a: 'b;
async fn set(&self, key: &[u8], val: &[u8]) -> Result<(), Error>;
/// Compare and set value of key. `expect` is the expected value, if backend's current value associated
/// with key is the same as `expect`, the value will be updated to `val`.
///
/// - If the compare-and-set operation successfully updated value, this method will return an `Ok(Ok())`
/// - If associated value is not the same as `expect`, no value will be updated and an `Ok(Err(Vec<u8>))`
/// will be returned, the `Err(Vec<u8>)` indicates the current associated value of key.
/// - If any error happens during operation, an `Err(Error)` will be returned.
async fn compare_and_set(
&self,
key: &[u8],
expect: &[u8],
val: &[u8],
) -> Result<Result<(), Option<Vec<u8>>>, Error>;
async fn delete_range(&self, key: &[u8], end: &[u8]) -> Result<(), Error>;
async fn delete(&self, key: &[u8]) -> Result<(), Error> {
self.delete_range(key, &[]).await
}
/// Default get is implemented based on `range` method.
async fn get(&self, key: &[u8]) -> Result<Option<Kv>, Error> {
let mut iter = self.range(key);
while let Some(r) = iter.next().await {
let kv = r?;
if kv.0 == key {
return Ok(Some(kv));
}
}
return Ok(None);
}
/// MoveValue atomically renames the key to the given updated key.
async fn move_value(&self, from_key: &[u8], to_key: &[u8]) -> Result<(), Error>;
fn as_any(&self) -> &dyn Any;
}
pub type KvBackendRef = Arc<dyn KvBackend>;
#[async_trait::async_trait]
pub trait KvCacheInvalidator: Send + Sync {
async fn invalidate_key(&self, key: &[u8]);
@@ -93,14 +33,19 @@ pub type KvCacheInvalidatorRef = Arc<dyn KvCacheInvalidator>;
#[cfg(test)]
mod tests {
use async_stream::stream;
use std::any::Any;
use super::*;
use async_stream::stream;
use common_meta::kv_backend::{Kv, KvBackend, ValueIter};
use crate::error::Error;
struct MockKvBackend {}
#[async_trait::async_trait]
impl KvBackend for MockKvBackend {
type Error = Error;
fn range<'a, 'b>(&'a self, _key: &[u8]) -> ValueIter<'b, Error>
where
'a: 'b,

View File

@@ -18,24 +18,26 @@ use std::sync::Arc;
use std::time::Duration;
use async_stream::stream;
use common_error::prelude::BoxedError;
use common_meta::error::Error::{CacheNotGet, GetKvCache};
use common_meta::error::{CacheNotGetSnafu, Error, MetaSrvSnafu, Result};
use common_meta::kv_backend::{Kv, KvBackend, KvBackendRef, ValueIter};
use common_meta::rpc::store::{
CompareAndPutRequest, DeleteRangeRequest, MoveValueRequest, PutRequest, RangeRequest,
};
use common_telemetry::{info, timer};
use meta_client::client::MetaClient;
use moka::future::{Cache, CacheBuilder};
use snafu::ResultExt;
use snafu::{OptionExt, ResultExt};
use super::KvCacheInvalidator;
use crate::error::{Error, GenericSnafu, MetaSrvSnafu, Result};
use crate::metrics::{METRIC_CATALOG_KV_GET, METRIC_CATALOG_KV_REMOTE_GET};
use crate::remote::{Kv, KvBackend, KvBackendRef, ValueIter};
const CACHE_MAX_CAPACITY: u64 = 10000;
const CACHE_TTL_SECOND: u64 = 10 * 60;
const CACHE_TTI_SECOND: u64 = 5 * 60;
pub type CacheBackendRef = Arc<Cache<Vec<u8>, Option<Kv>>>;
pub type CacheBackendRef = Arc<Cache<Vec<u8>, Kv>>;
pub struct CachedMetaKvBackend {
kv_backend: KvBackendRef,
cache: CacheBackendRef,
@@ -43,6 +45,8 @@ pub struct CachedMetaKvBackend {
#[async_trait::async_trait]
impl KvBackend for CachedMetaKvBackend {
type Error = Error;
fn range<'a, 'b>(&'a self, key: &[u8]) -> ValueIter<'b, Error>
where
'a: 'b,
@@ -55,12 +59,26 @@ impl KvBackend for CachedMetaKvBackend {
let init = async {
let _timer = timer!(METRIC_CATALOG_KV_REMOTE_GET);
self.kv_backend.get(key).await
self.kv_backend.get(key).await.map(|val| {
val.with_context(|| CacheNotGetSnafu {
key: String::from_utf8_lossy(key),
})
})?
};
let schema_provider = self.cache.try_get_with_by_ref(key, init).await;
schema_provider.map_err(|e| GenericSnafu { msg: e.to_string() }.build())
// currently moka doesn't have `optionally_try_get_with_by_ref`
// TODO(fys): change to moka method when available
// https://github.com/moka-rs/moka/issues/254
match self.cache.try_get_with_by_ref(key, init).await {
Ok(val) => Ok(Some(val)),
Err(e) => match e.as_ref() {
CacheNotGet { .. } => Ok(None),
_ => Err(e),
},
}
.map_err(|e| GetKvCache {
err_msg: e.to_string(),
})
}
async fn set(&self, key: &[u8], val: &[u8]) -> Result<()> {
@@ -165,6 +183,8 @@ pub struct MetaKvBackend {
/// comparing to `Accessor`'s list and get method.
#[async_trait::async_trait]
impl KvBackend for MetaKvBackend {
type Error = Error;
fn range<'a, 'b>(&'a self, key: &[u8]) -> ValueIter<'b, Error>
where
'a: 'b,
@@ -175,6 +195,7 @@ impl KvBackend for MetaKvBackend {
.client
.range(RangeRequest::new().with_prefix(key))
.await
.map_err(BoxedError::new)
.context(MetaSrvSnafu)?;
let kvs = resp.take_kvs();
for mut kv in kvs.into_iter() {
@@ -188,6 +209,7 @@ impl KvBackend for MetaKvBackend {
.client
.range(RangeRequest::new().with_key(key))
.await
.map_err(BoxedError::new)
.context(MetaSrvSnafu)?;
Ok(response
.take_kvs()
@@ -199,13 +221,23 @@ impl KvBackend for MetaKvBackend {
let req = PutRequest::new()
.with_key(key.to_vec())
.with_value(val.to_vec());
let _ = self.client.put(req).await.context(MetaSrvSnafu)?;
let _ = self
.client
.put(req)
.await
.map_err(BoxedError::new)
.context(MetaSrvSnafu)?;
Ok(())
}
async fn delete_range(&self, key: &[u8], end: &[u8]) -> Result<()> {
let req = DeleteRangeRequest::new().with_range(key.to_vec(), end.to_vec());
let resp = self.client.delete_range(req).await.context(MetaSrvSnafu)?;
let resp = self
.client
.delete_range(req)
.await
.map_err(BoxedError::new)
.context(MetaSrvSnafu)?;
info!(
"Delete range, key: {}, end: {}, deleted: {}",
String::from_utf8_lossy(key),
@@ -230,6 +262,7 @@ impl KvBackend for MetaKvBackend {
.client
.compare_and_put(request)
.await
.map_err(BoxedError::new)
.context(MetaSrvSnafu)?;
if response.is_success() {
Ok(Ok(()))
@@ -240,7 +273,12 @@ impl KvBackend for MetaKvBackend {
async fn move_value(&self, from_key: &[u8], to_key: &[u8]) -> Result<()> {
let req = MoveValueRequest::new(from_key, to_key);
let _ = self.client.move_value(req).await.context(MetaSrvSnafu)?;
let _ = self
.client
.move_value(req)
.await
.map_err(BoxedError::new)
.context(MetaSrvSnafu)?;
Ok(())
}
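
The change above stops CachedMetaKvBackend from caching misses: instead of storing Option<Kv>, the loader now returns a CacheNotGet error on a miss, and the caller translates that error back into Ok(None) so the miss never enters the cache. A stripped-down sketch of the same pattern, assuming a plain moka cache of raw byte values and a local LoadError type in place of the real Kv and common_meta error types:

use std::sync::Arc;

use moka::future::Cache;

#[derive(Debug)]
enum LoadError {
    // A miss is reported as an error so that moka never caches it.
    NotFound,
}

// Stand-in for the remote meta client lookup.
async fn load_from_backend(_key: &[u8]) -> Option<Vec<u8>> {
    None
}

async fn cached_get(
    cache: &Cache<Vec<u8>, Vec<u8>>,
    key: &[u8],
) -> Result<Option<Vec<u8>>, Arc<LoadError>> {
    let init = async { load_from_backend(key).await.ok_or(LoadError::NotFound) };
    match cache.try_get_with_by_ref(key, init).await {
        Ok(val) => Ok(Some(val)),
        // The miss never entered the cache; map it back to None for the caller.
        Err(e) if matches!(e.as_ref(), LoadError::NotFound) => Ok(None),
        Err(e) => Err(e),
    }
}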

View File

@@ -21,6 +21,7 @@ use async_stream::stream;
use async_trait::async_trait;
use common_catalog::consts::{MAX_SYS_TABLE_ID, MITO_ENGINE};
use common_meta::ident::TableIdent;
use common_meta::kv_backend::{Kv, KvBackendRef};
use common_telemetry::{debug, error, info, warn};
use futures::Stream;
use futures_util::{StreamExt, TryStreamExt};
@@ -35,6 +36,7 @@ use tokio::sync::Mutex;
use crate::error::{
CatalogNotFoundSnafu, CreateTableSnafu, InvalidCatalogValueSnafu, OpenTableSnafu,
ParallelOpenTableSnafu, Result, SchemaNotFoundSnafu, TableEngineNotFoundSnafu,
TableMetadataManagerSnafu,
};
use crate::helper::{
build_catalog_prefix, build_schema_prefix, build_table_global_prefix,
@@ -42,7 +44,6 @@ use crate::helper::{
TableGlobalValue, TableRegionalKey, TableRegionalValue, CATALOG_KEY_PREFIX,
};
use crate::remote::region_alive_keeper::RegionAliveKeepers;
use crate::remote::{Kv, KvBackendRef};
use crate::{
handle_system_table_request, CatalogManager, DeregisterTableRequest, RegisterSchemaRequest,
RegisterSystemTableRequest, RegisterTableRequest, RenameTableRequest,
@@ -80,7 +81,7 @@ impl RemoteCatalogManager {
let mut catalogs = self.backend.range(catalog_range_prefix.as_bytes());
Box::pin(stream!({
while let Some(r) = catalogs.next().await {
let Kv(k, _) = r?;
let Kv(k, _) = r.context(TableMetadataManagerSnafu)?;
if !k.starts_with(catalog_range_prefix.as_bytes()) {
debug!("Ignoring non-catalog key: {}", String::from_utf8_lossy(&k));
continue;
@@ -134,7 +135,8 @@ impl RemoteCatalogManager {
.as_bytes()
.context(InvalidCatalogValueSnafu)?,
)
.await?;
.await
.context(TableMetadataManagerSnafu)?;
info!("Created schema '{schema_key}'");
let catalog_key = CatalogKey {
@@ -148,7 +150,8 @@ impl RemoteCatalogManager {
.as_bytes()
.context(InvalidCatalogValueSnafu)?,
)
.await?;
.await
.context(TableMetadataManagerSnafu)?;
info!("Created catalog '{catalog_key}");
Ok(())
}
@@ -316,7 +319,8 @@ impl RemoteCatalogManager {
table_key.as_bytes(),
&table_value.as_bytes().context(InvalidCatalogValueSnafu)?,
)
.await?;
.await
.context(TableMetadataManagerSnafu)?;
debug!(
"Successfully set catalog table entry, key: {}, table value: {:?}",
table_key, table_value
@@ -343,7 +347,8 @@ impl RemoteCatalogManager {
let engine_opt = self
.backend
.get(table_key.as_bytes())
.await?
.await
.context(TableMetadataManagerSnafu)?
.map(|Kv(_, v)| {
let TableRegionalValue {
table_id,
@@ -361,7 +366,10 @@ impl RemoteCatalogManager {
return Ok(None);
};
self.backend.delete(table_key.as_bytes()).await?;
self.backend
.delete(table_key.as_bytes())
.await
.context(TableMetadataManagerSnafu)?;
debug!(
"Successfully deleted catalog table entry, key: {}",
table_key
@@ -428,7 +436,7 @@ async fn iter_remote_schemas<'a>(
Box::pin(stream!({
while let Some(r) = schemas.next().await {
let Kv(k, _) = r?;
let Kv(k, _) = r.context(TableMetadataManagerSnafu)?;
if !k.starts_with(schema_prefix.as_bytes()) {
debug!("Ignoring non-schema key: {}", String::from_utf8_lossy(&k));
continue;
@@ -452,7 +460,7 @@ async fn iter_remote_tables<'a>(
let mut tables = backend.range(table_prefix.as_bytes());
Box::pin(stream!({
while let Some(r) = tables.next().await {
let Kv(k, v) = r?;
let Kv(k, v) = r.context(TableMetadataManagerSnafu)?;
if !k.starts_with(table_prefix.as_bytes()) {
debug!("Ignoring non-table prefix: {}", String::from_utf8_lossy(&k));
continue;
@@ -701,7 +709,8 @@ impl CatalogManager for RemoteCatalogManager {
.as_bytes()
.context(InvalidCatalogValueSnafu)?,
)
.await?;
.await
.context(TableMetadataManagerSnafu)?;
increment_gauge!(crate::metrics::METRIC_CATALOG_MANAGER_SCHEMA_COUNT, 1.0);
Ok(true)
@@ -720,7 +729,7 @@ impl CatalogManager for RemoteCatalogManager {
node_id: self.node_id,
}
.to_string();
let Some(Kv(_, value_bytes)) = self.backend.get(old_table_key.as_bytes()).await? else {
let Some(Kv(_, value_bytes)) = self.backend.get(old_table_key.as_bytes()).await.context(TableMetadataManagerSnafu)? else {
return Ok(false)
};
let new_table_key = TableRegionalKey {
@@ -731,10 +740,12 @@ impl CatalogManager for RemoteCatalogManager {
};
self.backend
.set(new_table_key.to_string().as_bytes(), &value_bytes)
.await?;
.await
.context(TableMetadataManagerSnafu)?;
self.backend
.delete(old_table_key.to_string().as_bytes())
.await?;
.await
.context(TableMetadataManagerSnafu)?;
Ok(true)
}
@@ -756,7 +767,12 @@ impl CatalogManager for RemoteCatalogManager {
let key = self
.build_schema_key(catalog.to_string(), schema.to_string())
.to_string();
Ok(self.backend.get(key.as_bytes()).await?.is_some())
Ok(self
.backend
.get(key.as_bytes())
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
async fn table(
@@ -778,7 +794,8 @@ impl CatalogManager for RemoteCatalogManager {
let table_opt = self
.backend
.get(key.as_bytes())
.await?
.await
.context(TableMetadataManagerSnafu)?
.map(|Kv(_, v)| {
let TableRegionalValue {
table_id,
@@ -821,7 +838,8 @@ impl CatalogManager for RemoteCatalogManager {
Ok(self
.backend
.get(key.to_string().as_bytes())
.await?
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
@@ -836,7 +854,12 @@ impl CatalogManager for RemoteCatalogManager {
}
.to_string();
Ok(self.backend.get(key.as_bytes()).await?.is_some())
Ok(self
.backend
.get(key.as_bytes())
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
async fn catalog_names(&self) -> Result<Vec<String>> {
@@ -905,7 +928,8 @@ impl CatalogManager for RemoteCatalogManager {
.as_bytes()
.context(InvalidCatalogValueSnafu)?,
)
.await?;
.await
.context(TableMetadataManagerSnafu)?;
increment_gauge!(crate::metrics::METRIC_CATALOG_MANAGER_CATALOG_COUNT, 1.0);
Ok(false)
}

View File

@@ -12,20 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, HashMap};
use std::fmt::{Display, Formatter};
use std::collections::HashMap;
use std::sync::{Arc, RwLock as StdRwLock};
use async_stream::stream;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_recordbatch::RecordBatch;
use common_telemetry::logging::info;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::vectors::StringVector;
use serde::Serializer;
use table::engine::{CloseTableResult, EngineContext, TableEngine};
use table::metadata::TableId;
use table::requests::{
@@ -33,135 +26,6 @@ use table::requests::{
};
use table::test_util::MemTable;
use table::TableRef;
use tokio::sync::RwLock;
use crate::error::Error;
use crate::helper::{CatalogKey, CatalogValue, SchemaKey, SchemaValue};
use crate::remote::{Kv, KvBackend, ValueIter};
pub struct MockKvBackend {
map: RwLock<BTreeMap<Vec<u8>, Vec<u8>>>,
}
impl Default for MockKvBackend {
fn default() -> Self {
let catalog_value = CatalogValue {}.as_bytes().unwrap();
let schema_value = SchemaValue {}.as_bytes().unwrap();
let default_catalog_key = CatalogKey {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
}
.to_string();
let default_schema_key = SchemaKey {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
}
.to_string();
let map = RwLock::new(BTreeMap::from([
// create default catalog and schema
(default_catalog_key.into(), catalog_value),
(default_schema_key.into(), schema_value),
]));
Self { map }
}
}
impl Display for MockKvBackend {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
futures::executor::block_on(async {
let map = self.map.read().await;
for (k, v) in map.iter() {
f.serialize_str(&String::from_utf8_lossy(k))?;
f.serialize_str(" -> ")?;
f.serialize_str(&String::from_utf8_lossy(v))?;
f.serialize_str("\n")?;
}
Ok(())
})
}
}
#[async_trait::async_trait]
impl KvBackend for MockKvBackend {
fn range<'a, 'b>(&'a self, key: &[u8]) -> ValueIter<'b, Error>
where
'a: 'b,
{
let prefix = key.to_vec();
let prefix_string = String::from_utf8_lossy(&prefix).to_string();
Box::pin(stream!({
let maps = self.map.read().await.clone();
for (k, v) in maps.range(prefix.clone()..) {
let key_string = String::from_utf8_lossy(k).to_string();
let matches = key_string.starts_with(&prefix_string);
if matches {
yield Ok(Kv(k.clone(), v.clone()))
} else {
info!("Stream finished");
return;
}
}
}))
}
async fn set(&self, key: &[u8], val: &[u8]) -> Result<(), Error> {
let mut map = self.map.write().await;
let _ = map.insert(key.to_vec(), val.to_vec());
Ok(())
}
async fn compare_and_set(
&self,
key: &[u8],
expect: &[u8],
val: &[u8],
) -> Result<Result<(), Option<Vec<u8>>>, Error> {
let mut map = self.map.write().await;
let existing = map.entry(key.to_vec());
match existing {
Entry::Vacant(e) => {
if expect.is_empty() {
let _ = e.insert(val.to_vec());
Ok(Ok(()))
} else {
Ok(Err(None))
}
}
Entry::Occupied(mut existing) => {
if existing.get() == expect {
let _ = existing.insert(val.to_vec());
Ok(Ok(()))
} else {
Ok(Err(Some(existing.get().clone())))
}
}
}
}
async fn delete_range(&self, key: &[u8], end: &[u8]) -> Result<(), Error> {
let mut map = self.map.write().await;
if end.is_empty() {
let _ = map.remove(key);
} else {
let start = key.to_vec();
let end = end.to_vec();
let range = start..end;
map.retain(|k, _| !range.contains(k));
}
Ok(())
}
async fn move_value(&self, _from_key: &[u8], _to_key: &[u8]) -> Result<(), Error> {
unimplemented!()
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[derive(Default)]
pub struct MockTableEngine {

View File

@@ -203,20 +203,32 @@ pub fn build_table_insert_request(
build_insert_request(
EntryType::Table,
entry_key.as_bytes(),
serde_json::to_string(&TableEntryValue { table_name, engine })
.unwrap()
.as_bytes(),
serde_json::to_string(&TableEntryValue {
table_name,
engine,
is_deleted: false,
})
.unwrap()
.as_bytes(),
)
}
pub(crate) fn build_table_deletion_request(
request: &DeregisterTableRequest,
table_id: TableId,
) -> DeleteRequest {
let table_key = format_table_entry_key(&request.catalog, &request.schema, table_id);
DeleteRequest {
key_column_values: build_primary_key_columns(EntryType::Table, table_key.as_bytes()),
}
) -> InsertRequest {
let entry_key = format_table_entry_key(&request.catalog, &request.schema, table_id);
build_insert_request(
EntryType::Table,
entry_key.as_bytes(),
serde_json::to_string(&TableEntryValue {
table_name: "".to_string(),
engine: "".to_string(),
is_deleted: true,
})
.unwrap()
.as_bytes(),
)
}
fn build_primary_key_columns(entry_type: EntryType, key: &[u8]) -> HashMap<String, VectorRef> {
@@ -335,6 +347,7 @@ pub fn decode_system_catalog(
table_name: table_meta.table_name,
table_id,
engine: table_meta.engine,
is_deleted: table_meta.is_deleted,
}))
}
}
@@ -391,6 +404,7 @@ pub struct TableEntry {
pub table_name: String,
pub table_id: TableId,
pub engine: String,
pub is_deleted: bool,
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
@@ -399,12 +413,19 @@ pub struct TableEntryValue {
#[serde(default = "mito_engine")]
pub engine: String,
#[serde(default = "not_deleted")]
pub is_deleted: bool,
}
fn mito_engine() -> String {
MITO_ENGINE.to_string()
}
fn not_deleted() -> bool {
false
}
#[cfg(test)]
mod tests {
use common_recordbatch::RecordBatches;
@@ -563,6 +584,7 @@ mod tests {
table_name: "my_table".to_string(),
table_id: 1,
engine: MITO_ENGINE.to_string(),
is_deleted: false,
});
assert_eq!(entry, expected);
@@ -574,11 +596,11 @@ mod tests {
},
1,
);
let result = catalog_table.delete(table_deletion).await.unwrap();
let result = catalog_table.insert(table_deletion).await.unwrap();
assert_eq!(result, 1);
let records = catalog_table.records().await.unwrap();
let batches = RecordBatches::try_collect(records).await.unwrap().take();
assert_eq!(batches.len(), 0);
assert_eq!(batches.len(), 1);
}
}

View File

@@ -69,7 +69,7 @@ impl SystemCatalog {
) -> CatalogResult<()> {
self.information_schema
.system
.delete(build_table_deletion_request(request, table_id))
.insert(build_table_deletion_request(request, table_id))
.await
.map(|x| {
if x != 1 {

View File

@@ -22,12 +22,14 @@ mod tests {
use std::time::Duration;
use catalog::helper::{CatalogKey, CatalogValue, SchemaKey, SchemaValue};
use catalog::remote::mock::{MockKvBackend, MockTableEngine};
use catalog::remote::mock::MockTableEngine;
use catalog::remote::region_alive_keeper::RegionAliveKeepers;
use catalog::remote::{CachedMetaKvBackend, KvBackend, KvBackendRef, RemoteCatalogManager};
use catalog::remote::{CachedMetaKvBackend, RemoteCatalogManager};
use catalog::{CatalogManager, RegisterSchemaRequest, RegisterTableRequest};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, MITO_ENGINE};
use common_meta::ident::TableIdent;
use common_meta::kv_backend::memory::MemoryKvBackend;
use common_meta::kv_backend::KvBackend;
use datatypes::schema::RawSchema;
use futures_util::StreamExt;
use table::engine::manager::{MemoryTableEngineManager, TableEngineManagerRef};
@@ -37,8 +39,6 @@ mod tests {
use tokio::time::Instant;
struct TestingComponents {
#[allow(dead_code)]
kv_backend: KvBackendRef,
catalog_manager: Arc<RemoteCatalogManager>,
table_engine_manager: TableEngineManagerRef,
region_alive_keepers: Arc<RegionAliveKeepers>,
@@ -53,7 +53,7 @@ mod tests {
#[tokio::test]
async fn test_backend() {
common_telemetry::init_default_ut_logging();
let backend = MockKvBackend::default();
let backend = MemoryKvBackend::default();
let default_catalog_key = CatalogKey {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
@@ -92,8 +92,7 @@ mod tests {
#[tokio::test]
async fn test_cached_backend() {
common_telemetry::init_default_ut_logging();
let backend = CachedMetaKvBackend::wrap(Arc::new(MockKvBackend::default()));
let backend = CachedMetaKvBackend::wrap(Arc::new(MemoryKvBackend::default()));
let default_catalog_key = CatalogKey {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
@@ -135,9 +134,11 @@ mod tests {
}
async fn prepare_components(node_id: u64) -> TestingComponents {
let cached_backend = Arc::new(CachedMetaKvBackend::wrap(
Arc::new(MockKvBackend::default()),
));
let backend = Arc::new(MemoryKvBackend::default());
backend.set(b"__c-greptime", b"").await.unwrap();
backend.set(b"__s-greptime-public", b"").await.unwrap();
let cached_backend = Arc::new(CachedMetaKvBackend::wrap(backend));
let table_engine = Arc::new(MockTableEngine::default());
let engine_manager = Arc::new(MemoryTableEngineManager::alias(
@@ -156,7 +157,6 @@ mod tests {
catalog_manager.start().await.unwrap();
TestingComponents {
kv_backend: cached_backend,
catalog_manager: Arc::new(catalog_manager),
table_engine_manager: engine_manager,
region_alive_keepers,

View File

@@ -10,7 +10,9 @@ name = "greptime"
path = "src/bin/greptime.rs"
[features]
default = ["metrics-process"]
tokio-console = ["common-telemetry/tokio-console"]
metrics-process = ["servers/metrics-process"]
[dependencies]
anymap = "1.0.0-beta.2"

View File

@@ -236,6 +236,7 @@ mod tests {
use std::io::Write;
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_test_util::temp_dir::create_named_temp_file;
use frontend::service_config::GrpcOptions;
use servers::auth::{Identity, Password, UserProviderRef};
@@ -260,6 +261,10 @@ mod tests {
command.load_options(TopLevelOptions::default()).unwrap() else { unreachable!() };
assert_eq!(opts.http_options.as_ref().unwrap().addr, "127.0.0.1:1234");
assert_eq!(
ReadableSize::mb(64),
opts.http_options.as_ref().unwrap().body_limit
);
assert_eq!(opts.mysql_options.as_ref().unwrap().addr, "127.0.0.1:5678");
assert_eq!(
opts.postgres_options.as_ref().unwrap().addr,
@@ -301,6 +306,7 @@ mod tests {
[http_options]
addr = "127.0.0.1:4000"
timeout = "30s"
body_limit = "2GB"
[logging]
level = "debug"
@@ -326,6 +332,11 @@ mod tests {
fe_opts.http_options.as_ref().unwrap().timeout
);
assert_eq!(
ReadableSize::gb(2),
fe_opts.http_options.as_ref().unwrap().body_limit
);
assert_eq!("debug", fe_opts.logging.level.as_ref().unwrap());
assert_eq!("/tmp/greptimedb/test/logs".to_string(), fe_opts.logging.dir);
}

View File

@@ -132,6 +132,7 @@ impl StandaloneOptions {
prom_options: self.prom_options,
meta_client_options: None,
logging: self.logging,
..Default::default()
}
}
@@ -341,6 +342,7 @@ mod tests {
use std::io::Write;
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_test_util::temp_dir::create_named_temp_file;
use servers::auth::{Identity, Password, UserProviderRef};
use servers::Mode;
@@ -408,6 +410,7 @@ mod tests {
[http_options]
addr = "127.0.0.1:4000"
timeout = "30s"
body_limit = "128MB"
[logging]
level = "debug"
@@ -433,6 +436,10 @@ mod tests {
Duration::from_secs(30),
fe_opts.http_options.as_ref().unwrap().timeout
);
assert_eq!(
ReadableSize::mb(128),
fe_opts.http_options.as_ref().unwrap().body_limit
);
assert_eq!(
"127.0.0.1:4001".to_string(),
fe_opts.grpc_options.unwrap().addr
@@ -559,6 +566,10 @@ mod tests {
opts.fe_opts.http_options.as_ref().unwrap().addr,
"127.0.0.1:14000"
);
assert_eq!(
ReadableSize::mb(64),
opts.fe_opts.http_options.as_ref().unwrap().body_limit
);
// Should be default value.
assert_eq!(

View File

@@ -6,12 +6,15 @@ license.workspace = true
[dependencies]
api = { path = "../../api" }
async-stream.workspace = true
async-trait.workspace = true
common-catalog = { path = "../catalog" }
common-error = { path = "../error" }
common-runtime = { path = "../runtime" }
common-telemetry = { path = "../telemetry" }
common-time = { path = "../time" }
futures.workspace = true
prost.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true

View File

@@ -55,6 +55,21 @@ pub enum Error {
#[snafu(display("Invalid protobuf message, err: {}", err_msg))]
InvalidProtoMsg { err_msg: String, location: Location },
#[snafu(display("Invalid table metadata, err: {}", err_msg))]
InvalidTableMetadata { err_msg: String, location: Location },
#[snafu(display("Failed to get kv cache, err: {}", err_msg))]
GetKvCache { err_msg: String },
#[snafu(display("Get null from cache, key: {}", key))]
CacheNotGet { key: String, location: Location },
#[snafu(display("Failed to request MetaSrv, source: {}", source))]
MetaSrv {
source: BoxedError,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -65,15 +80,18 @@ impl ErrorExt for Error {
match self {
IllegalServerState { .. } => StatusCode::Internal,
SerdeJson { .. } | RouteInfoCorrupted { .. } | InvalidProtoMsg { .. } => {
StatusCode::Unexpected
}
SerdeJson { .. }
| RouteInfoCorrupted { .. }
| InvalidProtoMsg { .. }
| InvalidTableMetadata { .. } => StatusCode::Unexpected,
SendMessage { .. } => StatusCode::Internal,
SendMessage { .. } | GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,
EncodeJson { .. } | DecodeJson { .. } | PayloadNotExist { .. } => {
StatusCode::Unexpected
}
MetaSrv { source, .. } => source.status_code(),
}
}

View File

@@ -12,16 +12,102 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! This mod defines all the keys used in the metadata store (Metasrv).
//! Specifically, there are these kinds of keys:
//!
//! 1. Table info key: `__table_info/{table_id}`
//! - The value is a [TableInfoValue] struct; it contains the whole table info (like column
//! schemas).
//! - This key is mainly used in constructing the table in Datanode and Frontend.
//!
//! 2. Table region key: `__table_region/{table_id}`
//! - The value is a [TableRegionValue] struct; it contains the region distribution of the
//! table in the Datanodes.
//!
//! All keys have related managers. The managers take care of the serialization and deserialization
//! of keys and values, and the interaction with the underlying KV store backend.
//!
//! To simplify the managers used in struct fields and function parameters, we define a "unify"
//! table metadata manager: [TableMetadataManager]. It contains all the managers defined above.
//! It's recommended to use only this manager.
pub mod table_info;
pub mod table_region;
mod table_route;
use std::sync::Arc;
use snafu::ResultExt;
use table_info::{TableInfoManager, TableInfoValue};
use table_region::{TableRegionManager, TableRegionValue};
use crate::error::{InvalidTableMetadataSnafu, Result, SerdeJsonSnafu};
pub use crate::key::table_route::{TableRouteKey, TABLE_ROUTE_PREFIX};
use crate::kv_backend::KvBackendRef;
pub const REMOVED_PREFIX: &str = "__removed";
const TABLE_INFO_KEY_PREFIX: &str = "__table_info";
const TABLE_REGION_KEY_PREFIX: &str = "__table_region";
pub fn to_removed_key(key: &str) -> String {
format!("{REMOVED_PREFIX}-{key}")
}
pub trait TableMetaKey {
fn as_raw_key(&self) -> Vec<u8>;
}
pub type TableMetadataManagerRef = Arc<TableMetadataManager>;
pub struct TableMetadataManager {
table_info_manager: TableInfoManager,
table_region_manager: TableRegionManager,
}
impl TableMetadataManager {
pub fn new(kv_backend: KvBackendRef) -> Self {
TableMetadataManager {
table_info_manager: TableInfoManager::new(kv_backend.clone()),
table_region_manager: TableRegionManager::new(kv_backend),
}
}
pub fn table_info_manager(&self) -> &TableInfoManager {
&self.table_info_manager
}
pub fn table_region_manager(&self) -> &TableRegionManager {
&self.table_region_manager
}
}
macro_rules! impl_table_meta_value {
( $($val_ty: ty), *) => {
$(
impl $val_ty {
pub fn try_from_raw_value(raw_value: Vec<u8>) -> Result<Self> {
let raw_value = String::from_utf8(raw_value).map_err(|e| {
InvalidTableMetadataSnafu { err_msg: e.to_string() }.build()
})?;
serde_json::from_str(&raw_value).context(SerdeJsonSnafu)
}
pub fn try_as_raw_value(&self) -> Result<Vec<u8>> {
serde_json::to_string(self)
.map(|x| x.into_bytes())
.context(SerdeJsonSnafu)
}
}
)*
}
}
impl_table_meta_value! {
TableInfoValue,
TableRegionValue
}
#[cfg(test)]
mod tests {
use crate::key::to_removed_key;

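The module documentation above introduces TableMetadataManager as the single entry point for table metadata. A minimal usage sketch, assuming only the APIs visible in this diff (TableMetadataManager::new, table_info_manager(), TableInfoManager::get) together with the in-memory backend used by the tests; the lookup and printed field are illustrative:

use std::sync::Arc;

use common_meta::key::TableMetadataManager;
use common_meta::kv_backend::memory::MemoryKvBackend;
use table::metadata::TableId;

async fn print_table_name(table_id: TableId) -> common_meta::error::Result<()> {
    // Any KvBackendRef works here; the in-memory backend keeps the sketch self-contained.
    let kv_backend = Arc::new(MemoryKvBackend::default());
    let manager = TableMetadataManager::new(kv_backend);

    // get() returns Ok(None) when no table info has been stored under this id.
    if let Some(value) = manager.table_info_manager().get(table_id).await? {
        println!("table name: {}", value.table_info.name);
    }
    Ok(())
}
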
View File

@@ -0,0 +1,230 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use table::metadata::{RawTableInfo, TableId};
use super::TABLE_INFO_KEY_PREFIX;
use crate::error::Result;
use crate::key::{to_removed_key, TableMetaKey};
use crate::kv_backend::KvBackendRef;
pub struct TableInfoKey {
table_id: TableId,
}
impl TableInfoKey {
pub fn new(table_id: TableId) -> Self {
Self { table_id }
}
}
impl TableMetaKey for TableInfoKey {
fn as_raw_key(&self) -> Vec<u8> {
format!("{}/{}", TABLE_INFO_KEY_PREFIX, self.table_id).into_bytes()
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct TableInfoValue {
pub table_info: RawTableInfo,
version: u64,
}
pub struct TableInfoManager {
kv_backend: KvBackendRef,
}
impl TableInfoManager {
pub fn new(kv_backend: KvBackendRef) -> Self {
Self { kv_backend }
}
pub async fn get(&self, table_id: TableId) -> Result<Option<TableInfoValue>> {
let key = TableInfoKey::new(table_id);
let raw_key = key.as_raw_key();
self.kv_backend
.get(&raw_key)
.await?
.map(|x| TableInfoValue::try_from_raw_value(x.1))
.transpose()
}
pub async fn compare_and_set(
&self,
table_id: TableId,
expect: Option<TableInfoValue>,
table_info: RawTableInfo,
) -> Result<std::result::Result<(), Option<Vec<u8>>>> {
let key = TableInfoKey::new(table_id);
let raw_key = key.as_raw_key();
let (expect, version) = if let Some(x) = expect {
(x.try_as_raw_value()?, x.version + 1)
} else {
(vec![], 0)
};
let value = TableInfoValue {
table_info,
version,
};
let raw_value = value.try_as_raw_value()?;
self.kv_backend
.compare_and_set(&raw_key, &expect, &raw_value)
.await
}
pub async fn remove(&self, table_id: TableId) -> Result<()> {
let key = TableInfoKey::new(table_id);
let removed_key = to_removed_key(&String::from_utf8_lossy(key.as_raw_key().as_slice()));
self.kv_backend
.move_value(&key.as_raw_key(), removed_key.as_bytes())
.await
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, RawSchema, Schema};
use table::metadata::{RawTableMeta, TableIdent, TableType};
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::KvBackend;
#[tokio::test]
async fn test_table_info_manager() {
let backend = Arc::new(MemoryKvBackend::default());
for i in 1..=3 {
let key = TableInfoKey::new(i).as_raw_key();
let val = TableInfoValue {
table_info: new_table_info(i),
version: 1,
}
.try_as_raw_value()
.unwrap();
backend.set(&key, &val).await.unwrap();
}
let manager = TableInfoManager::new(backend.clone());
let val = manager.get(1).await.unwrap().unwrap();
assert_eq!(
val,
TableInfoValue {
table_info: new_table_info(1),
version: 1,
}
);
assert!(manager.get(4).await.unwrap().is_none());
let table_info = new_table_info(4);
let result = manager
.compare_and_set(4, None, table_info.clone())
.await
.unwrap();
assert!(result.is_ok());
// test cas failed, the new table info is not set
let new_table_info = new_table_info(4);
let result = manager
.compare_and_set(4, None, new_table_info.clone())
.await
.unwrap();
let actual = TableInfoValue::try_from_raw_value(result.unwrap_err().unwrap()).unwrap();
assert_eq!(
actual,
TableInfoValue {
table_info: table_info.clone(),
version: 0,
}
);
// test cas success
let result = manager
.compare_and_set(4, Some(actual), new_table_info.clone())
.await
.unwrap();
assert!(result.is_ok());
assert!(manager.remove(4).await.is_ok());
let kv = backend
.get(b"__removed-__table_info/4")
.await
.unwrap()
.unwrap();
assert_eq!(b"__removed-__table_info/4", kv.0.as_slice());
let value = TableInfoValue::try_from_raw_value(kv.1).unwrap();
assert_eq!(value.table_info, new_table_info);
assert_eq!(value.version, 1);
}
#[test]
fn test_key_serde() {
let key = TableInfoKey::new(42);
let raw_key = key.as_raw_key();
assert_eq!(raw_key, b"__table_info/42");
}
#[test]
fn test_value_serde() {
let value = TableInfoValue {
table_info: new_table_info(42),
version: 1,
};
let serialized = value.try_as_raw_value().unwrap();
let deserialized = TableInfoValue::try_from_raw_value(serialized).unwrap();
assert_eq!(value, deserialized);
}
fn new_table_info(table_id: TableId) -> RawTableInfo {
let schema = Schema::new(vec![ColumnSchema::new(
"name",
ConcreteDataType::string_datatype(),
true,
)]);
let meta = RawTableMeta {
schema: RawSchema::from(&schema),
engine: "mito".to_string(),
created_on: chrono::DateTime::default(),
primary_key_indices: vec![0, 1],
next_column_id: 3,
engine_options: Default::default(),
value_indices: vec![2, 3],
options: Default::default(),
region_numbers: vec![1],
};
RawTableInfo {
ident: TableIdent {
table_id,
version: 1,
},
name: "table_1".to_string(),
desc: Some("blah".to_string()),
catalog_name: "catalog_1".to_string(),
schema_name: "schema_1".to_string(),
meta,
table_type: TableType::Base,
}
}
}

View File

@@ -0,0 +1,190 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use store_api::storage::RegionNumber;
use table::metadata::TableId;
use super::TABLE_REGION_KEY_PREFIX;
use crate::error::Result;
use crate::key::{to_removed_key, TableMetaKey};
use crate::kv_backend::KvBackendRef;
use crate::DatanodeId;
pub type RegionDistribution = BTreeMap<DatanodeId, Vec<RegionNumber>>;
pub struct TableRegionKey {
table_id: TableId,
}
impl TableRegionKey {
pub fn new(table_id: TableId) -> Self {
Self { table_id }
}
}
impl TableMetaKey for TableRegionKey {
fn as_raw_key(&self) -> Vec<u8> {
format!("{}/{}", TABLE_REGION_KEY_PREFIX, self.table_id).into_bytes()
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct TableRegionValue {
pub region_distribution: RegionDistribution,
version: u64,
}
pub struct TableRegionManager {
kv_backend: KvBackendRef,
}
impl TableRegionManager {
pub fn new(kv_backend: KvBackendRef) -> Self {
Self { kv_backend }
}
pub async fn get(&self, table_id: TableId) -> Result<Option<TableRegionValue>> {
let key = TableRegionKey::new(table_id);
let raw_key = key.as_raw_key();
self.kv_backend
.get(&raw_key)
.await?
.map(|x| TableRegionValue::try_from_raw_value(x.1))
.transpose()
}
pub async fn compare_and_set(
&self,
table_id: TableId,
expect: Option<TableRegionValue>,
region_distribution: RegionDistribution,
) -> Result<std::result::Result<(), Option<Vec<u8>>>> {
let key = TableRegionKey::new(table_id);
let raw_key = key.as_raw_key();
let (expect, version) = if let Some(x) = expect {
(x.try_as_raw_value()?, x.version + 1)
} else {
(vec![], 0)
};
let value = TableRegionValue {
region_distribution,
version,
};
let raw_value = value.try_as_raw_value()?;
self.kv_backend
.compare_and_set(&raw_key, &expect, &raw_value)
.await
}
pub async fn remove(&self, table_id: TableId) -> Result<()> {
let key = TableRegionKey::new(table_id);
let remove_key = to_removed_key(&String::from_utf8_lossy(key.as_raw_key().as_slice()));
self.kv_backend
.move_value(&key.as_raw_key(), remove_key.as_bytes())
.await
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::KvBackend;
#[tokio::test]
async fn test_table_region_manager() {
let backend = Arc::new(MemoryKvBackend::default());
let manager = TableRegionManager::new(backend.clone());
let region_distribution =
RegionDistribution::from([(1, vec![1, 2, 3]), (2, vec![4, 5, 6])]);
let result = manager
.compare_and_set(1, None, region_distribution.clone())
.await
.unwrap();
assert!(result.is_ok());
let new_region_distribution =
RegionDistribution::from([(1, vec![4, 5, 6]), (2, vec![1, 2, 3])]);
let curr = manager
.compare_and_set(1, None, new_region_distribution.clone())
.await
.unwrap()
.unwrap_err()
.unwrap();
let curr = TableRegionValue::try_from_raw_value(curr).unwrap();
assert_eq!(
curr,
TableRegionValue {
region_distribution,
version: 0
}
);
assert!(manager
.compare_and_set(1, Some(curr), new_region_distribution.clone())
.await
.unwrap()
.is_ok());
let value = manager.get(1).await.unwrap().unwrap();
assert_eq!(
value,
TableRegionValue {
region_distribution: new_region_distribution.clone(),
version: 1
}
);
assert!(manager.get(2).await.unwrap().is_none());
assert!(manager.remove(1).await.is_ok());
let kv = backend
.get(b"__removed-__table_region/1")
.await
.unwrap()
.unwrap();
assert_eq!(b"__removed-__table_region/1", kv.0.as_slice());
let value = TableRegionValue::try_from_raw_value(kv.1).unwrap();
assert_eq!(value.region_distribution, new_region_distribution);
assert_eq!(value.version, 1);
}
#[test]
fn test_serde() {
let key = TableRegionKey::new(1);
let raw_key = key.as_raw_key();
assert_eq!(raw_key, b"__table_region/1");
let value = TableRegionValue {
region_distribution: RegionDistribution::from([(1, vec![1, 2, 3]), (2, vec![4, 5, 6])]),
version: 0,
};
let literal = br#"{"region_distribution":{"1":[1,2,3],"2":[4,5,6]},"version":0}"#;
assert_eq!(value.try_as_raw_value().unwrap(), literal);
assert_eq!(
TableRegionValue::try_from_raw_value(literal.to_vec()).unwrap(),
value,
);
}
}
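As a usage note (not part of this commit): callers that may race on the same table would typically wrap `compare_and_set` in a read-retry loop, passing the value returned by `get` as the expected state. A minimal sketch, using only the types above and a hypothetical `move_regions` helper name:

    async fn move_regions(
        manager: &TableRegionManager,
        table_id: TableId,
        new_distribution: RegionDistribution,
    ) -> Result<()> {
        loop {
            // Read the current value (None if the key does not exist yet).
            let current = manager.get(table_id).await?;
            match manager
                .compare_and_set(table_id, current, new_distribution.clone())
                .await?
            {
                // CAS succeeded: the new distribution is stored with a bumped version.
                Ok(()) => return Ok(()),
                // Another writer updated the key concurrently; read again and retry.
                Err(_current_raw) => continue,
            }
        }
    }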

View File

@@ -0,0 +1,80 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod memory;
use std::any::Any;
use std::pin::Pin;
use std::sync::Arc;
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use futures::{Stream, StreamExt};
use crate::error::Error;
#[derive(Debug, Clone, PartialEq)]
pub struct Kv(pub Vec<u8>, pub Vec<u8>);
pub type ValueIter<'a, E> = Pin<Box<dyn Stream<Item = Result<Kv, E>> + Send + 'a>>;
pub type KvBackendRef = Arc<dyn KvBackend<Error = Error>>;
#[async_trait]
pub trait KvBackend: Send + Sync {
type Error: ErrorExt;
fn range<'a, 'b>(&'a self, key: &[u8]) -> ValueIter<'b, Self::Error>
where
'a: 'b;
async fn set(&self, key: &[u8], val: &[u8]) -> Result<(), Self::Error>;
/// Compares and sets the value of a key. `expect` is the expected current value; if the backend's
/// value associated with the key equals `expect`, the value is updated to `val`.
///
/// - If the compare-and-set operation successfully updates the value, this method returns `Ok(Ok(()))`.
/// - If the associated value differs from `expect`, nothing is updated and `Ok(Err(Option<Vec<u8>>))`
///   is returned; the inner `Option<Vec<u8>>` carries the key's current value, or `None` if the key
///   does not exist.
/// - If any error happens during the operation, an `Err(Error)` is returned.
async fn compare_and_set(
&self,
key: &[u8],
expect: &[u8],
val: &[u8],
) -> Result<Result<(), Option<Vec<u8>>>, Self::Error>;
async fn delete_range(&self, key: &[u8], end: &[u8]) -> Result<(), Self::Error>;
async fn delete(&self, key: &[u8]) -> Result<(), Self::Error> {
self.delete_range(key, &[]).await
}
/// The default `get` implementation is built on top of the `range` method.
async fn get(&self, key: &[u8]) -> Result<Option<Kv>, Self::Error> {
let mut iter = self.range(key);
while let Some(r) = iter.next().await {
let kv = r?;
if kv.0 == key {
return Ok(Some(kv));
}
}
return Ok(None);
}
/// Atomically renames `from_key` to the given `to_key`.
async fn move_value(&self, from_key: &[u8], to_key: &[u8]) -> Result<(), Self::Error>;
fn as_any(&self) -> &dyn Any;
}
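An illustrative sketch (not part of this commit): an empty `expect` is interpreted by the in-memory implementation below (and relied on by the table metadata managers above) as "only set when the key is absent", so a small generic helper over any `KvBackend` could look like the following; `put_if_absent` is a hypothetical name.

    async fn put_if_absent<B: KvBackend + ?Sized>(
        backend: &B,
        key: &[u8],
        val: &[u8],
    ) -> Result<bool, B::Error> {
        // Ok(Ok(())) means the key was absent and has now been set;
        // Ok(Err(_)) means some value already existed and is left untouched.
        Ok(backend.compare_and_set(key, &[], val).await?.is_ok())
    }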

View File

@@ -0,0 +1,197 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::fmt::{Display, Formatter};
use std::sync::RwLock;
use async_stream::stream;
use async_trait::async_trait;
use serde::Serializer;
use crate::error::Error;
use crate::kv_backend::{Kv, KvBackend, ValueIter};
pub struct MemoryKvBackend {
kvs: RwLock<BTreeMap<Vec<u8>, Vec<u8>>>,
}
impl Display for MemoryKvBackend {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
let kvs = self.kvs.read().unwrap();
for (k, v) in kvs.iter() {
f.serialize_str(&String::from_utf8_lossy(k))?;
f.serialize_str(" -> ")?;
f.serialize_str(&String::from_utf8_lossy(v))?;
f.serialize_str("\n")?;
}
Ok(())
}
}
impl Default for MemoryKvBackend {
fn default() -> Self {
Self {
kvs: RwLock::new(BTreeMap::new()),
}
}
}
#[async_trait]
impl KvBackend for MemoryKvBackend {
type Error = Error;
fn range<'a, 'b>(&'a self, prefix: &[u8]) -> ValueIter<'b, Error>
where
'a: 'b,
{
let kvs = self.kvs.read().unwrap();
let kvs = kvs.clone();
let prefix = prefix.to_vec();
Box::pin(stream!({
for (k, v) in kvs.range(prefix.clone()..) {
if !k.starts_with(&prefix) {
break;
}
yield Ok(Kv(k.clone(), v.clone()));
}
}))
}
async fn set(&self, key: &[u8], val: &[u8]) -> Result<(), Error> {
let mut kvs = self.kvs.write().unwrap();
let _ = kvs.insert(key.to_vec(), val.to_vec());
Ok(())
}
async fn compare_and_set(
&self,
key: &[u8],
expect: &[u8],
val: &[u8],
) -> Result<Result<(), Option<Vec<u8>>>, Error> {
let key = key.to_vec();
let val = val.to_vec();
let mut kvs = self.kvs.write().unwrap();
let existed = kvs.entry(key);
Ok(match existed {
Entry::Vacant(e) => {
if expect.is_empty() {
let _ = e.insert(val);
Ok(())
} else {
Err(None)
}
}
Entry::Occupied(mut existed) => {
if existed.get() == expect {
let _ = existed.insert(val);
Ok(())
} else {
Err(Some(existed.get().clone()))
}
}
})
}
async fn delete_range(&self, key: &[u8], end: &[u8]) -> Result<(), Error> {
let mut kvs = self.kvs.write().unwrap();
if end.is_empty() {
let _ = kvs.remove(key);
} else {
let start = key.to_vec();
let end = end.to_vec();
let range = start..end;
kvs.retain(|k, _| !range.contains(k));
}
Ok(())
}
async fn move_value(&self, from_key: &[u8], to_key: &[u8]) -> Result<(), Error> {
let mut kvs = self.kvs.write().unwrap();
if let Some(v) = kvs.remove(from_key) {
let _ = kvs.insert(to_key.to_vec(), v);
}
Ok(())
}
fn as_any(&self) -> &dyn Any {
self
}
}
#[cfg(test)]
mod tests {
use futures::TryStreamExt;
use super::*;
#[tokio::test]
async fn test_memory_kv_backend() {
let backend = MemoryKvBackend::default();
for i in 1..10 {
let key = format!("key{}", i);
let val = format!("val{}", i);
assert!(backend.set(key.as_bytes(), val.as_bytes()).await.is_ok());
}
let result = backend
.compare_and_set(b"hello", b"what", b"world")
.await
.unwrap();
assert!(result.unwrap_err().is_none());
let result = backend
.compare_and_set(b"hello", b"", b"world")
.await
.unwrap();
assert!(result.is_ok());
let result = backend
.compare_and_set(b"hello", b"world", b"greptime")
.await
.unwrap();
assert!(result.is_ok());
let result = backend
.compare_and_set(b"hello", b"world", b"what")
.await
.unwrap();
assert_eq!(result.unwrap_err().unwrap(), b"greptime");
assert!(backend.delete_range(b"key1", &[]).await.is_ok());
assert!(backend.delete_range(b"key3", b"key9").await.is_ok());
assert!(backend.move_value(b"key9", b"key10").await.is_ok());
assert_eq!(
backend.to_string(),
r#"hello -> greptime
key10 -> val9
key2 -> val2
"#
);
let range = backend.range(b"key").try_collect::<Vec<_>>().await.unwrap();
assert_eq!(range.len(), 2);
assert_eq!(range[0], Kv(b"key10".to_vec(), b"val9".to_vec()));
assert_eq!(range[1], Kv(b"key2".to_vec(), b"val2".to_vec()));
}
}
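A small illustrative helper (not in the commit), assuming the hypothetical name `keys_with_prefix`: since `range` is prefix-based and yields a fallible stream, listing the keys under a prefix is a plain `TryStreamExt` collection, just like the `try_collect` call in the test above.

    use futures::TryStreamExt;

    async fn keys_with_prefix<B: KvBackend + ?Sized>(
        backend: &B,
        prefix: &[u8],
    ) -> Result<Vec<Vec<u8>>, B::Error> {
        backend
            .range(prefix)
            // Keep only the key of each Kv(key, value) pair.
            .map_ok(|Kv(key, _value)| key)
            .try_collect()
            .await
    }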

View File

@@ -17,6 +17,7 @@ pub mod heartbeat;
pub mod ident;
pub mod instruction;
pub mod key;
pub mod kv_backend;
pub mod peer;
pub mod rpc;
pub mod table_name;

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod ddl;
pub mod lock;
pub mod router;
pub mod store;

View File

@@ -0,0 +1,217 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::result;
use api::v1::meta::submit_ddl_task_request::Task;
use api::v1::meta::{
CreateTableTask as PbCreateTableTask, Partition,
SubmitDdlTaskRequest as PbSubmitDdlTaskRequest,
SubmitDdlTaskResponse as PbSubmitDdlTaskResponse,
};
use api::v1::CreateTableExpr;
use prost::Message;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use table::engine::TableReference;
use table::metadata::{RawTableInfo, TableId};
use crate::error::{self, Result};
use crate::table_name::TableName;
#[derive(Debug)]
pub enum DdlTask {
CreateTable(CreateTableTask),
}
impl DdlTask {
pub fn new_create_table(
expr: CreateTableExpr,
partitions: Vec<Partition>,
table_info: RawTableInfo,
) -> Self {
DdlTask::CreateTable(CreateTableTask::new(expr, partitions, table_info))
}
}
impl TryFrom<Task> for DdlTask {
type Error = error::Error;
fn try_from(task: Task) -> Result<Self> {
match task {
Task::CreateTableTask(create_table) => {
Ok(DdlTask::CreateTable(create_table.try_into()?))
}
}
}
}
pub struct SubmitDdlTaskRequest {
pub task: DdlTask,
}
impl TryFrom<SubmitDdlTaskRequest> for PbSubmitDdlTaskRequest {
type Error = error::Error;
fn try_from(request: SubmitDdlTaskRequest) -> Result<Self> {
let task = match request.task {
DdlTask::CreateTable(task) => Task::CreateTableTask(PbCreateTableTask {
table_info: serde_json::to_vec(&task.table_info).context(error::SerdeJsonSnafu)?,
create_table: Some(task.create_table),
partitions: task.partitions,
}),
};
Ok(Self {
header: None,
task: Some(task),
})
}
}
pub struct SubmitDdlTaskResponse {
pub key: Vec<u8>,
pub table_id: TableId,
}
impl TryFrom<PbSubmitDdlTaskResponse> for SubmitDdlTaskResponse {
type Error = error::Error;
fn try_from(resp: PbSubmitDdlTaskResponse) -> Result<Self> {
let table_id = resp.table_id.context(error::InvalidProtoMsgSnafu {
err_msg: "expected table_id",
})?;
Ok(Self {
key: resp.key,
table_id: table_id.id,
})
}
}
#[derive(Debug, PartialEq)]
pub struct CreateTableTask {
pub create_table: CreateTableExpr,
pub partitions: Vec<Partition>,
pub table_info: RawTableInfo,
}
impl TryFrom<PbCreateTableTask> for CreateTableTask {
type Error = error::Error;
fn try_from(pb: PbCreateTableTask) -> Result<Self> {
let table_info = serde_json::from_slice(&pb.table_info).context(error::SerdeJsonSnafu)?;
Ok(CreateTableTask::new(
pb.create_table.context(error::InvalidProtoMsgSnafu {
err_msg: "expected create table",
})?,
pb.partitions,
table_info,
))
}
}
impl CreateTableTask {
pub fn new(
expr: CreateTableExpr,
partitions: Vec<Partition>,
table_info: RawTableInfo,
) -> CreateTableTask {
CreateTableTask {
create_table: expr,
partitions,
table_info,
}
}
pub fn table_name(&self) -> TableName {
let table = &self.create_table;
TableName {
catalog_name: table.catalog_name.to_string(),
schema_name: table.schema_name.to_string(),
table_name: table.table_name.to_string(),
}
}
pub fn table_ref(&self) -> TableReference {
let table = &self.create_table;
TableReference {
catalog: &table.catalog_name,
schema: &table.schema_name,
table: &table.table_name,
}
}
}
impl Serialize for CreateTableTask {
fn serialize<S>(&self, serializer: S) -> result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let table_info = serde_json::to_vec(&self.table_info)
.map_err(|err| serde::ser::Error::custom(err.to_string()))?;
let pb = PbCreateTableTask {
create_table: Some(self.create_table.clone()),
partitions: self.partitions.clone(),
table_info,
};
let buf = pb.encode_to_vec();
serializer.serialize_bytes(&buf)
}
}
impl<'de> Deserialize<'de> for CreateTableTask {
fn deserialize<D>(deserializer: D) -> result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let buf = Vec::<u8>::deserialize(deserializer)?;
let expr: PbCreateTableTask = PbCreateTableTask::decode(&*buf)
.map_err(|err| serde::de::Error::custom(err.to_string()))?;
let expr = CreateTableTask::try_from(expr)
.map_err(|err| serde::de::Error::custom(err.to_string()))?;
Ok(expr)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use api::v1::CreateTableExpr;
use datatypes::schema::SchemaBuilder;
use table::metadata::RawTableInfo;
use table::test_util::table_info::test_table_info;
use super::CreateTableTask;
#[test]
fn test_basic_ser_de_create_table_task() {
let schema = SchemaBuilder::default().build().unwrap();
let table_info = test_table_info(1025, "foo", "bar", "baz", Arc::new(schema));
let task = CreateTableTask::new(
CreateTableExpr::default(),
Vec::new(),
RawTableInfo::from(table_info),
);
let output = serde_json::to_vec(&task).unwrap();
let de = serde_json::from_slice(&output).unwrap();
assert_eq!(task, de);
}
}
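An illustrative sketch (not in this commit) of how the pieces above fit together: wrap a `CreateTableExpr` into a `DdlTask`, then let the `TryFrom` impl above turn the request into the protobuf message. `build_submit_request` is a hypothetical helper name.

    fn build_submit_request(
        expr: CreateTableExpr,
        partitions: Vec<Partition>,
        table_info: RawTableInfo,
    ) -> Result<PbSubmitDdlTaskRequest> {
        let task = DdlTask::new_create_table(expr, partitions, table_info);
        // Serializes the table info to JSON and fills the gRPC request message.
        PbSubmitDdlTaskRequest::try_from(SubmitDdlTaskRequest { task })
    }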

View File

@@ -16,6 +16,7 @@ use std::fmt::{Display, Formatter};
use api::v1::meta::TableName as PbTableName;
use serde::{Deserialize, Serialize};
use table::engine::TableReference;
#[derive(Debug, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)]
pub struct TableName {
@@ -46,6 +47,14 @@ impl TableName {
table_name: table_name.into(),
}
}
pub fn table_ref(&self) -> TableReference<'_> {
TableReference {
catalog: &self.catalog_name,
schema: &self.schema_name,
table: &self.table_name,
}
}
}
impl From<TableName> for PbTableName {

View File

@@ -14,6 +14,7 @@
use std::cmp::Ordering;
use crate::util::div_ceil;
use crate::Timestamp;
/// Unix timestamp in millisecond resolution.
@@ -80,11 +81,17 @@ impl PartialOrd<TimestampMillis> for i64 {
}
pub trait BucketAligned: Sized {
/// Returns the timestamp aligned by `bucket_duration` or `None` if underflow occurred.
/// Aligns the value by `bucket_duration` or `None` if underflow occurred.
///
/// # Panics
/// Panics if `bucket_duration <= 0`.
fn align_by_bucket(self, bucket_duration: i64) -> Option<Self>;
/// Aligns the value by `bucket_duration` to ceil or `None` if overflow occurred.
///
/// # Panics
/// Panics if `bucket_duration <= 0`.
fn align_to_ceil_by_bucket(self, bucket_duration: i64) -> Option<Self>;
}
impl BucketAligned for i64 {
@@ -93,6 +100,11 @@ impl BucketAligned for i64 {
self.checked_div_euclid(bucket_duration)
.and_then(|val| val.checked_mul(bucket_duration))
}
fn align_to_ceil_by_bucket(self, bucket_duration: i64) -> Option<Self> {
assert!(bucket_duration > 0, "{}", bucket_duration);
div_ceil(self, bucket_duration).checked_mul(bucket_duration)
}
}
impl BucketAligned for Timestamp {
@@ -103,6 +115,14 @@ impl BucketAligned for Timestamp {
.align_by_bucket(bucket_duration)
.map(|val| Timestamp::new(val, unit))
}
fn align_to_ceil_by_bucket(self, bucket_duration: i64) -> Option<Self> {
assert!(bucket_duration > 0, "{}", bucket_duration);
let unit = self.unit();
self.value()
.align_to_ceil_by_bucket(bucket_duration)
.map(|val| Timestamp::new(val, unit))
}
}
#[cfg(test)]
@@ -180,4 +200,31 @@ mod tests {
Timestamp::new_millisecond(i64::MIN).align_by_bucket(bucket)
);
}
#[test]
fn test_align_to_ceil() {
assert_eq!(None, i64::MAX.align_to_ceil_by_bucket(10));
assert_eq!(
Some(i64::MAX - (i64::MAX % 10)),
(i64::MAX - (i64::MAX % 10)).align_to_ceil_by_bucket(10)
);
assert_eq!(Some(i64::MAX), i64::MAX.align_to_ceil_by_bucket(1));
assert_eq!(Some(i64::MAX), i64::MAX.align_to_ceil_by_bucket(1));
assert_eq!(Some(i64::MAX), i64::MAX.align_to_ceil_by_bucket(i64::MAX));
assert_eq!(
Some(i64::MIN - (i64::MIN % 10)),
i64::MIN.align_to_ceil_by_bucket(10)
);
assert_eq!(Some(i64::MIN), i64::MIN.align_to_ceil_by_bucket(1));
assert_eq!(Some(3), 1i64.align_to_ceil_by_bucket(3));
assert_eq!(Some(3), 3i64.align_to_ceil_by_bucket(3));
assert_eq!(Some(6), 4i64.align_to_ceil_by_bucket(3));
assert_eq!(Some(0), 0i64.align_to_ceil_by_bucket(3));
assert_eq!(Some(0), (-1i64).align_to_ceil_by_bucket(3));
assert_eq!(Some(0), (-2i64).align_to_ceil_by_bucket(3));
assert_eq!(Some(-3), (-3i64).align_to_ceil_by_bucket(3));
assert_eq!(Some(-3), (-4i64).align_to_ceil_by_bucket(3));
}
}

View File

@@ -44,7 +44,7 @@ use query::query_engine::{QueryEngineFactory, QueryEngineRef};
use servers::Mode;
use session::context::QueryContext;
use snafu::prelude::*;
use storage::compaction::{CompactionHandler, CompactionSchedulerRef, SimplePicker};
use storage::compaction::{CompactionHandler, CompactionSchedulerRef};
use storage::config::EngineConfig as StorageEngineConfig;
use storage::scheduler::{LocalScheduler, SchedulerConfig};
use storage::EngineImpl;
@@ -395,9 +395,8 @@ impl Instance {
}
fn create_compaction_scheduler<S: LogStore>(opts: &DatanodeOptions) -> CompactionSchedulerRef<S> {
let picker = SimplePicker::default();
let config = SchedulerConfig::from(opts);
let handler = CompactionHandler { picker };
let handler = CompactionHandler::default();
let scheduler = LocalScheduler::new(config, handler);
Arc::new(scheduler)
}

View File

@@ -19,14 +19,14 @@ use std::sync::Arc;
use api::v1::CreateTableExpr;
use catalog::error::{
self as catalog_err, InternalSnafu, InvalidCatalogValueSnafu, InvalidSystemTableDefSnafu,
Result as CatalogResult, UnimplementedSnafu,
Result as CatalogResult, TableMetadataManagerSnafu, UnimplementedSnafu,
};
use catalog::helper::{
build_catalog_prefix, build_schema_prefix, build_table_global_prefix, CatalogKey, SchemaKey,
TableGlobalKey, TableGlobalValue,
};
use catalog::information_schema::InformationSchemaProvider;
use catalog::remote::{Kv, KvBackendRef, KvCacheInvalidatorRef};
use catalog::remote::KvCacheInvalidatorRef;
use catalog::{
CatalogManager, DeregisterTableRequest, RegisterSchemaRequest, RegisterSystemTableRequest,
RegisterTableRequest, RenameTableRequest,
@@ -34,6 +34,7 @@ use catalog::{
use client::client_manager::DatanodeClients;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME};
use common_error::prelude::BoxedError;
use common_meta::kv_backend::{Kv, KvBackendRef};
use common_meta::table_name::TableName;
use common_telemetry::warn;
use futures::StreamExt;
@@ -254,7 +255,7 @@ impl CatalogManager for FrontendCatalogManager {
let mut iter = self.backend.range(key.as_bytes());
let mut res = HashSet::new();
while let Some(r) = iter.next().await {
let Kv(k, _) = r?;
let Kv(k, _) = r.context(TableMetadataManagerSnafu)?;
let catalog_key = String::from_utf8_lossy(&k);
if let Ok(key) = CatalogKey::parse(catalog_key.as_ref()) {
let _ = res.insert(key.catalog_name);
@@ -270,7 +271,7 @@ impl CatalogManager for FrontendCatalogManager {
let mut iter = self.backend.range(key.as_bytes());
let mut res = HashSet::new();
while let Some(r) = iter.next().await {
let Kv(k, _) = r?;
let Kv(k, _) = r.context(TableMetadataManagerSnafu)?;
let key =
SchemaKey::parse(String::from_utf8_lossy(&k)).context(InvalidCatalogValueSnafu)?;
let _ = res.insert(key.schema_name);
@@ -287,7 +288,7 @@ impl CatalogManager for FrontendCatalogManager {
let iter = self.backend.range(key.as_bytes());
let result = iter
.map(|r| {
let Kv(k, _) = r?;
let Kv(k, _) = r.context(TableMetadataManagerSnafu)?;
let key = TableGlobalKey::parse(String::from_utf8_lossy(&k))
.context(InvalidCatalogValueSnafu)?;
Ok(key.table_name)
@@ -304,7 +305,12 @@ impl CatalogManager for FrontendCatalogManager {
}
.to_string();
Ok(self.backend.get(key.as_bytes()).await?.is_some())
Ok(self
.backend
.get(key.as_bytes())
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
async fn schema_exist(&self, catalog: &str, schema: &str) -> CatalogResult<bool> {
@@ -314,7 +320,12 @@ impl CatalogManager for FrontendCatalogManager {
}
.to_string();
Ok(self.backend().get(schema_key.as_bytes()).await?.is_some())
Ok(self
.backend()
.get(schema_key.as_bytes())
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
async fn table_exist(&self, catalog: &str, schema: &str, table: &str) -> CatalogResult<bool> {
@@ -326,7 +337,8 @@ impl CatalogManager for FrontendCatalogManager {
Ok(self
.backend()
.get(table_global_key.to_string().as_bytes())
.await?
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
@@ -362,7 +374,7 @@ impl CatalogManager for FrontendCatalogManager {
schema_name: schema.to_string(),
table_name: table_name.to_string(),
};
let Some(kv) = self.backend().get(table_global_key.to_string().as_bytes()).await? else {
let Some(kv) = self.backend().get(table_global_key.to_string().as_bytes()).await.context(TableMetadataManagerSnafu)? else {
return Ok(None);
};
let v = TableGlobalValue::from_bytes(kv.1).context(InvalidCatalogValueSnafu)?;

View File

@@ -565,6 +565,12 @@ pub enum Error {
value: String,
location: Location,
},
#[snafu(display("Table metadata manager error: {}", source))]
TableMetadataManager {
source: common_meta::error::Error,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -688,6 +694,7 @@ impl ErrorExt for Error {
Error::WriteParquet { source, .. } => source.status_code(),
Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,
Error::TableMetadataManager { source, .. } => source.status_code(),
}
}

View File

@@ -27,6 +27,8 @@ use crate::service_config::{
#[serde(default)]
pub struct FrontendOptions {
pub mode: Mode,
pub heartbeat_interval_millis: u64,
pub retry_interval_millis: u64,
pub http_options: Option<HttpOptions>,
pub grpc_options: Option<GrpcOptions>,
pub mysql_options: Option<MysqlOptions>,
@@ -43,6 +45,8 @@ impl Default for FrontendOptions {
fn default() -> Self {
Self {
mode: Mode::Standalone,
heartbeat_interval_millis: 5000,
retry_interval_millis: 5000,
http_options: Some(HttpOptions::default()),
grpc_options: Some(GrpcOptions::default()),
mysql_options: Some(MysqlOptions::default()),

View File

@@ -43,14 +43,14 @@ pub struct HeartbeatTask {
impl HeartbeatTask {
pub fn new(
meta_client: Arc<MetaClient>,
report_interval: u64,
retry_interval: u64,
heartbeat_interval_millis: u64,
retry_interval_millis: u64,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
) -> Self {
HeartbeatTask {
meta_client,
report_interval,
retry_interval,
report_interval: heartbeat_interval_millis,
retry_interval: retry_interval_millis,
resp_handler_executor,
}
}
@@ -92,7 +92,7 @@ impl HeartbeatTask {
Err(e) => {
error!(e; "Occur error while reading heartbeat response");
capture_self
.start_with_retry(Duration::from_secs(retry_interval))
.start_with_retry(Duration::from_millis(retry_interval))
.await;
break;
@@ -136,7 +136,7 @@ impl HeartbeatTask {
}
}
_ = &mut sleep => {
sleep.as_mut().reset(Instant::now() + Duration::from_secs(report_interval));
sleep.as_mut().reset(Instant::now() + Duration::from_millis(report_interval));
Some(HeartbeatRequest::default())
}
};

View File

@@ -47,7 +47,6 @@ use datanode::instance::sql::table_idents_to_full_name;
use datanode::instance::InstanceRef as DnInstanceRef;
use datatypes::schema::Schema;
use distributed::DistInstance;
use futures::future;
use meta_client::client::{MetaClient, MetaClientBuilder};
use meta_client::MetaClientOptions;
use partition::manager::PartitionRuleManager;
@@ -136,13 +135,14 @@ impl Instance {
let datanode_clients = Arc::new(DatanodeClients::default());
Self::try_new_distributed_with(meta_client, datanode_clients, plugins).await
Self::try_new_distributed_with(meta_client, datanode_clients, plugins, opts).await
}
pub async fn try_new_distributed_with(
meta_client: Arc<MetaClient>,
datanode_clients: Arc<DatanodeClients>,
plugins: Arc<Plugins>,
opts: &FrontendOptions,
) -> Result<Self> {
let meta_backend = Arc::new(CachedMetaKvBackend::new(meta_client.clone()));
let table_routes = Arc::new(TableRoutes::new(meta_client.clone()));
@@ -195,8 +195,8 @@ impl Instance {
let heartbeat_task = Some(HeartbeatTask::new(
meta_client,
5,
5,
opts.heartbeat_interval_millis,
opts.retry_interval_millis,
Arc::new(handlers_executor),
));
@@ -288,13 +288,10 @@ impl Instance {
requests: InsertRequests,
ctx: QueryContextRef,
) -> Result<Output> {
let _ = future::join_all(
requests
.inserts
.iter()
.map(|x| self.create_or_alter_table_on_demand(ctx.clone(), x)),
)
.await;
for req in requests.inserts.iter() {
self.create_or_alter_table_on_demand(ctx.clone(), req)
.await?;
}
let query = Request::Inserts(requests);
GrpcQueryHandler::do_query(&*self.grpc_query_handler, query, ctx).await

View File

@@ -658,7 +658,7 @@ impl GrpcQueryHandler for DistInstance {
match expr {
DdlExpr::CreateDatabase(expr) => self.handle_create_database(expr, ctx).await,
DdlExpr::CreateTable(mut expr) => {
let _ = self.create_table(&mut expr, None).await;
let _ = self.create_table(&mut expr, None).await?;
Ok(Output::AffectedRows(0))
}
DdlExpr::Alter(expr) => self.handle_alter_table(expr).await,

View File

@@ -179,10 +179,10 @@ mod tests {
use catalog::helper::{
CatalogKey, CatalogValue, SchemaKey, SchemaValue, TableGlobalKey, TableGlobalValue,
};
use catalog::remote::mock::MockKvBackend;
use catalog::remote::{KvBackend, KvBackendRef};
use client::client_manager::DatanodeClients;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_meta::kv_backend::memory::MemoryKvBackend;
use common_meta::kv_backend::{KvBackend, KvBackendRef};
use datatypes::prelude::{ConcreteDataType, VectorRef};
use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, Schema};
use datatypes::vectors::Int32Vector;
@@ -193,7 +193,7 @@ mod tests {
use crate::table::test::create_partition_rule_manager;
async fn prepare_mocked_backend() -> KvBackendRef {
let backend = Arc::new(MockKvBackend::default());
let backend = Arc::new(MemoryKvBackend::default());
let default_catalog = CatalogKey {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),

View File

@@ -55,7 +55,9 @@ use table::Table;
use tokio::sync::RwLock;
use crate::catalog::FrontendCatalogManager;
use crate::error::{self, FindDatanodeSnafu, FindTableRouteSnafu, Result};
use crate::error::{
self, FindDatanodeSnafu, FindTableRouteSnafu, Result, TableMetadataManagerSnafu,
};
use crate::instance::distributed::inserter::DistInserter;
use crate::table::delete::to_grpc_delete_request;
use crate::table::scan::{DatanodeInstance, TableScanPlan};
@@ -256,7 +258,7 @@ impl DistTable {
.backend()
.get(key.to_string().as_bytes())
.await
.context(error::CatalogSnafu)?;
.context(TableMetadataManagerSnafu)?;
Ok(if let Some(raw) = raw {
Some(TableGlobalValue::from_bytes(raw.1).context(error::CatalogEntrySerdeSnafu)?)
} else {
@@ -274,7 +276,7 @@ impl DistTable {
.backend()
.set(key.to_string().as_bytes(), &value)
.await
.context(error::CatalogSnafu)
.context(TableMetadataManagerSnafu)
}
async fn delete_table_global_value(&self, key: TableGlobalKey) -> Result<()> {
@@ -282,7 +284,7 @@ impl DistTable {
.backend()
.delete(key.to_string().as_bytes())
.await
.context(error::CatalogSnafu)
.context(TableMetadataManagerSnafu)
}
async fn move_table_route_value(
@@ -313,7 +315,7 @@ impl DistTable {
.backend()
.move_value(old_key.as_bytes(), new_key.as_bytes())
.await
.context(error::CatalogSnafu)?;
.context(TableMetadataManagerSnafu)?;
self.catalog_manager
.partition_manager()

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod ddl;
mod heartbeat;
mod load_balance;
mod lock;
@@ -29,6 +30,7 @@ use common_meta::rpc::store::{
RangeRequest, RangeResponse,
};
use common_telemetry::info;
use ddl::Client as DdlClient;
use heartbeat::Client as HeartbeatClient;
use lock::Client as LockClient;
use router::Client as RouterClient;
@@ -49,6 +51,7 @@ pub struct MetaClientBuilder {
enable_router: bool,
enable_store: bool,
enable_lock: bool,
enable_ddl: bool,
channel_manager: Option<ChannelManager>,
}
@@ -89,6 +92,13 @@ impl MetaClientBuilder {
}
}
pub fn enable_ddl(self) -> Self {
Self {
enable_ddl: true,
..self
}
}
pub fn channel_manager(self, channel_manager: ChannelManager) -> Self {
Self {
channel_manager: Some(channel_manager),
@@ -119,7 +129,10 @@ impl MetaClientBuilder {
client.store = Some(StoreClient::new(self.id, self.role, mgr.clone()));
}
if self.enable_lock {
client.lock = Some(LockClient::new(self.id, self.role, mgr));
client.lock = Some(LockClient::new(self.id, self.role, mgr.clone()));
}
if self.enable_ddl {
client.ddl = Some(DdlClient::new(self.id, self.role, mgr));
}
client
@@ -134,6 +147,7 @@ pub struct MetaClient {
router: Option<RouterClient>,
store: Option<StoreClient>,
lock: Option<LockClient>,
ddl: Option<DdlClient>,
}
impl MetaClient {

View File

@@ -0,0 +1,145 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use api::v1::meta::ddl_task_client::DdlTaskClient;
use api::v1::meta::{ErrorCode, Role, SubmitDdlTaskRequest, SubmitDdlTaskResponse};
use common_grpc::channel_manager::ChannelManager;
use snafu::{ensure, ResultExt};
use tokio::sync::RwLock;
use tonic::transport::Channel;
use crate::client::heartbeat::Inner as HeartbeatInner;
use crate::client::Id;
use crate::error;
use crate::error::Result;
#[derive(Clone, Debug)]
// TODO(weny): remove this in following PRs.
#[allow(unused)]
pub struct Client {
inner: Arc<RwLock<Inner>>,
}
// TODO(weny): remove this in following PRs.
#[allow(dead_code)]
impl Client {
pub fn new(id: Id, role: Role, channel_manager: ChannelManager) -> Self {
let inner = Arc::new(RwLock::new(Inner {
id,
role,
channel_manager: channel_manager.clone(),
heartbeat_inner: HeartbeatInner::new(id, role, channel_manager),
}));
Self { inner }
}
pub async fn start<U, A>(&mut self, urls: A) -> Result<()>
where
U: AsRef<str>,
A: AsRef<[U]>,
{
let mut inner = self.inner.write().await;
inner.start(urls).await
}
pub async fn is_started(&self) -> bool {
let inner = self.inner.read().await;
inner.is_started()
}
pub async fn submit_ddl_task(
&self,
req: SubmitDdlTaskRequest,
) -> Result<SubmitDdlTaskResponse> {
let mut inner = self.inner.write().await;
inner.submit_ddl_task(req).await
}
}
#[derive(Debug)]
// TODO(weny): remove this in following PRs.
#[allow(unused)]
struct Inner {
id: Id,
role: Role,
channel_manager: ChannelManager,
heartbeat_inner: HeartbeatInner,
}
impl Inner {
async fn start<U, A>(&mut self, urls: A) -> Result<()>
where
U: AsRef<str>,
A: AsRef<[U]>,
{
ensure!(
!self.is_started(),
error::IllegalGrpcClientStateSnafu {
err_msg: "Router client already started",
}
);
self.heartbeat_inner.start(urls).await?;
Ok(())
}
fn make_client(&self, addr: impl AsRef<str>) -> Result<DdlTaskClient<Channel>> {
let channel = self
.channel_manager
.get(addr)
.context(error::CreateChannelSnafu)?;
Ok(DdlTaskClient::new(channel))
}
#[inline]
fn is_started(&self) -> bool {
self.heartbeat_inner.is_started()
}
pub async fn submit_ddl_task(
&mut self,
mut req: SubmitDdlTaskRequest,
) -> Result<SubmitDdlTaskResponse> {
req.set_header(self.id, self.role);
loop {
if let Some(leader) = &self.heartbeat_inner.get_leader() {
let mut client = self.make_client(leader)?;
let res = client
.submit_ddl_task(req.clone())
.await
.context(error::TonicStatusSnafu)?;
let res = res.into_inner();
if let Some(header) = res.header.as_ref() {
if let Some(err) = header.error.as_ref() {
if err.code == ErrorCode::NotLeader as i32 {
self.heartbeat_inner.ask_leader().await?;
continue;
}
}
}
return Ok(res);
} else if let Err(err) = self.heartbeat_inner.ask_leader().await {
return Err(err);
}
}
}
}
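A minimal sketch (not part of this commit) of driving the new DDL client on its own; `id`, `role`, `channel_manager`, the metasrv address and the prebuilt `req` are assumptions that would come from the surrounding meta-client setup:

    async fn submit(
        id: Id,
        role: Role,
        channel_manager: ChannelManager,
        req: SubmitDdlTaskRequest,
    ) -> Result<SubmitDdlTaskResponse> {
        let mut client = Client::new(id, role, channel_manager);
        // start() seeds the underlying heartbeat inner so the client can ask for the current leader.
        client.start(&["127.0.0.1:3002"]).await?;
        client.submit_ddl_task(req).await
    }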

View File

@@ -133,7 +133,7 @@ impl Client {
}
#[derive(Debug)]
struct Inner {
pub(crate) struct Inner {
id: Id,
role: Role,
channel_manager: ChannelManager,
@@ -142,7 +142,16 @@ struct Inner {
}
impl Inner {
async fn start<U, A>(&mut self, urls: A) -> Result<()>
pub(crate) fn new(id: Id, role: Role, channel_manager: ChannelManager) -> Self {
Self {
id,
role,
channel_manager,
peers: HashSet::new(),
leader: None,
}
}
pub(crate) async fn start<U, A>(&mut self, urls: A) -> Result<()>
where
U: AsRef<str>,
A: AsRef<[U]>,
@@ -163,7 +172,11 @@ impl Inner {
Ok(())
}
async fn ask_leader(&mut self) -> Result<()> {
pub(crate) fn get_leader(&self) -> Option<String> {
self.leader.clone()
}
pub(crate) async fn ask_leader(&mut self) -> Result<()> {
ensure!(
self.is_started(),
error::IllegalGrpcClientStateSnafu {
@@ -242,7 +255,7 @@ impl Inner {
}
#[inline]
fn is_started(&self) -> bool {
pub(crate) fn is_started(&self) -> bool {
!self.peers.is_empty()
}
}

View File

@@ -13,6 +13,7 @@ api = { path = "../api" }
async-stream.workspace = true
async-trait = "0.1"
catalog = { path = "../catalog" }
client = { path = "../client" }
common-base = { path = "../common/base" }
common-catalog = { path = "../common/catalog" }
common-error = { path = "../common/error" }

110
src/meta-srv/src/ddl.rs Normal file
View File

@@ -0,0 +1,110 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use client::client_manager::DatanodeClients;
use common_meta::rpc::ddl::CreateTableTask;
use common_meta::rpc::router::TableRoute;
use common_procedure::{watcher, ProcedureId, ProcedureManagerRef, ProcedureWithId};
use snafu::ResultExt;
use crate::error::{self, Result};
use crate::procedure::create_table::CreateTableProcedure;
use crate::service::store::kv::KvStoreRef;
pub type DdlManagerRef = Arc<DdlManager>;
pub struct DdlManager {
procedure_manager: ProcedureManagerRef,
kv_store: KvStoreRef,
datanode_clients: Arc<DatanodeClients>,
}
// TODO(weny): remove in following PRs.
#[allow(unused)]
#[derive(Clone)]
pub(crate) struct DdlContext {
pub(crate) kv_store: KvStoreRef,
pub(crate) datanode_clients: Arc<DatanodeClients>,
}
impl DdlManager {
pub(crate) fn new(
procedure_manager: ProcedureManagerRef,
kv_store: KvStoreRef,
datanode_clients: Arc<DatanodeClients>,
) -> Self {
Self {
procedure_manager,
kv_store,
datanode_clients,
}
}
pub(crate) fn create_context(&self) -> DdlContext {
DdlContext {
kv_store: self.kv_store.clone(),
datanode_clients: self.datanode_clients.clone(),
}
}
pub(crate) fn try_start(&self) -> Result<()> {
let context = self.create_context();
self.procedure_manager
.register_loader(
CreateTableProcedure::TYPE_NAME,
Box::new(move |json| {
let context = context.clone();
CreateTableProcedure::from_json(json, context).map(|p| Box::new(p) as _)
}),
)
.context(error::RegisterProcedureLoaderSnafu {
type_name: CreateTableProcedure::TYPE_NAME,
})
}
pub async fn submit_create_table_task(
&self,
cluster_id: u64,
create_table_task: CreateTableTask,
table_route: TableRoute,
) -> Result<ProcedureId> {
let context = self.create_context();
let procedure =
CreateTableProcedure::new(cluster_id, create_table_task, table_route, context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
self.submit_procedure(procedure_with_id).await
}
async fn submit_procedure(&self, procedure_with_id: ProcedureWithId) -> Result<ProcedureId> {
let procedure_id = procedure_with_id.id;
let mut watcher = self
.procedure_manager
.submit(procedure_with_id)
.await
.context(error::SubmitProcedureSnafu)?;
watcher::wait(&mut watcher)
.await
.context(error::WaitProcedureSnafu)?;
Ok(procedure_id)
}
}
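An illustrative sketch (not in the commit) of the intended call path inside meta-srv, mirroring the builder wiring shown later: construct the manager, register the procedure loader via `try_start`, then submit a create-table task and wait for the returned procedure id. The function name and its parameters are assumptions.

    async fn create_table_via_ddl_manager(
        procedure_manager: ProcedureManagerRef,
        kv_store: KvStoreRef,
        cluster_id: u64,
        create_table_task: CreateTableTask,
        table_route: TableRoute,
    ) -> Result<ProcedureId> {
        let ddl_manager = DdlManager::new(
            procedure_manager,
            kv_store,
            Arc::new(DatanodeClients::default()),
        );
        // Registers the CreateTableProcedure loader so interrupted procedures can be recovered.
        ddl_manager.try_start()?;
        // Blocks (via watcher::wait) until the create-table procedure finishes.
        ddl_manager
            .submit_create_table_task(cluster_id, create_table_task, table_route)
            .await
    }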

View File

@@ -26,7 +26,7 @@ pub const LEASE_SECS: i64 = 5;
pub const KEEP_ALIVE_PERIOD_SECS: u64 = LEASE_SECS as u64 / 2;
pub const ELECTION_KEY: &str = "__meta_srv_election";
#[derive(Clone)]
#[derive(Debug, Clone)]
pub enum LeaderChangeMessage {
Elected(Arc<LeaderKey>),
StepDown(Arc<LeaderKey>),

View File

@@ -13,14 +13,54 @@
// limitations under the License.
use common_error::prelude::*;
use common_meta::peer::Peer;
use snafu::Location;
use tokio::sync::mpsc::error::SendError;
use tokio::sync::oneshot::error::TryRecvError;
use tonic::codegen::http;
use tonic::Code;
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum Error {
#[snafu(display("Failed to execute transaction: {}", msg))]
Txn { location: Location, msg: String },
#[snafu(display(
"Unexpected table_id changed, expected: {}, found: {}",
expected,
found,
))]
TableIdChanged {
location: Location,
expected: u64,
found: u64,
},
#[snafu(display("Failed to receive status, source: {}", source,))]
TryReceiveStatus {
location: Location,
source: TryRecvError,
},
#[snafu(display(
"Failed to request Datanode, expected: {}, but only {} available",
expected,
available
))]
NoEnoughAvailableDatanode {
location: Location,
expected: usize,
available: usize,
},
#[snafu(display("Failed to request Datanode {}, source: {}", peer, source))]
RequestDatanode {
location: Location,
peer: Peer,
source: client::Error,
},
#[snafu(display("Failed to send shutdown signal"))]
SendShutdownSignal { source: SendError<()> },
@@ -274,6 +314,18 @@ pub enum Error {
source: common_procedure::Error,
},
#[snafu(display("Failed to recover procedure, source: {source}"))]
WaitProcedure {
location: Location,
source: common_procedure::Error,
},
#[snafu(display("Failed to submit procedure, source: {source}"))]
SubmitProcedure {
location: Location,
source: common_procedure::Error,
},
#[snafu(display("Schema already exists, name: {schema_name}"))]
SchemaAlreadyExists {
schema_name: String,
@@ -413,7 +465,9 @@ impl ErrorExt for Error {
| Error::MailboxReceiver { .. }
| Error::RetryLater { .. }
| Error::StartGrpc { .. }
| Error::Combine { .. } => StatusCode::Internal,
| Error::Combine { .. }
| Error::NoEnoughAvailableDatanode { .. }
| Error::TryReceiveStatus { .. } => StatusCode::Internal,
Error::EmptyKey { .. }
| Error::MissingRequiredParameter { .. }
| Error::MissingRequestHeader { .. }
@@ -437,10 +491,15 @@ impl ErrorExt for Error {
| Error::InvalidUtf8Value { .. }
| Error::UnexpectedInstructionReply { .. }
| Error::EtcdTxnOpResponse { .. }
| Error::Unexpected { .. } => StatusCode::Unexpected,
| Error::Unexpected { .. }
| Error::Txn { .. }
| Error::TableIdChanged { .. } => StatusCode::Unexpected,
Error::TableNotFound { .. } => StatusCode::TableNotFound,
Error::RequestDatanode { source, .. } => source.status_code(),
Error::InvalidCatalogValue { source, .. } => source.status_code(),
Error::RecoverProcedure { source, .. } => source.status_code(),
Error::RecoverProcedure { source, .. }
| Error::SubmitProcedure { source, .. }
| Error::WaitProcedure { source, .. } => source.status_code(),
Error::ShutdownServer { source, .. } | Error::StartHttp { source, .. } => {
source.status_code()
}

View File

@@ -37,6 +37,7 @@ impl HeartbeatHandler for OnLeaderStartHandler {
if election.in_infancy() {
ctx.is_infancy = true;
ctx.reset_in_memory();
ctx.reset_leader_cached_kv_store();
}
}
Ok(())

View File

@@ -146,6 +146,7 @@ mod tests {
use crate::handler::{HeartbeatMailbox, Pushers};
use crate::keys::StatKey;
use crate::sequence::Sequence;
use crate::service::store::cached_kv::LeaderCachedKvStore;
use crate::service::store::ext::KvStoreExt;
use crate::service::store::memory::MemStore;
@@ -153,6 +154,8 @@ mod tests {
async fn test_handle_datanode_stats() {
let in_memory = Arc::new(MemStore::new());
let kv_store = Arc::new(MemStore::new());
let leader_cached_kv_store =
Arc::new(LeaderCachedKvStore::with_always_leader(kv_store.clone()));
let seq = Sequence::new("test_seq", 0, 10, kv_store.clone());
let mailbox = HeartbeatMailbox::create(Pushers::default(), seq);
let meta_peer_client = MetaPeerClientBuilder::default()
@@ -166,6 +169,7 @@ mod tests {
server_addr: "127.0.0.1:0000".to_string(),
in_memory,
kv_store,
leader_cached_kv_store,
meta_peer_client,
mailbox,
election: None,

View File

@@ -56,12 +56,15 @@ mod tests {
use crate::cluster::MetaPeerClientBuilder;
use crate::handler::{Context, HeartbeatMailbox, Pushers};
use crate::sequence::Sequence;
use crate::service::store::cached_kv::LeaderCachedKvStore;
use crate::service::store::memory::MemStore;
#[tokio::test]
async fn test_handle_heartbeat_resp_header() {
let in_memory = Arc::new(MemStore::new());
let kv_store = Arc::new(MemStore::new());
let leader_cached_kv_store =
Arc::new(LeaderCachedKvStore::with_always_leader(kv_store.clone()));
let seq = Sequence::new("test_seq", 0, 10, kv_store.clone());
let mailbox = HeartbeatMailbox::create(Pushers::default(), seq);
let meta_peer_client = MetaPeerClientBuilder::default()
@@ -75,6 +78,7 @@ mod tests {
server_addr: "127.0.0.1:0000".to_string(),
in_memory,
kv_store,
leader_cached_kv_store,
meta_peer_client,
mailbox,
election: None,

View File

@@ -17,6 +17,7 @@
pub mod bootstrap;
pub mod cluster;
pub mod ddl;
pub mod election;
pub mod error;
mod failure_detector;

View File

@@ -28,6 +28,7 @@ use snafu::ResultExt;
use tokio::sync::broadcast::error::RecvError;
use crate::cluster::MetaPeerClientRef;
use crate::ddl::DdlManagerRef;
use crate::election::{Election, LeaderChangeMessage};
use crate::error::{RecoverProcedureSnafu, Result};
use crate::handler::HeartbeatHandlerGroup;
@@ -75,6 +76,7 @@ pub struct Context {
pub server_addr: String,
pub in_memory: ResettableKvStoreRef,
pub kv_store: KvStoreRef,
pub leader_cached_kv_store: ResettableKvStoreRef,
pub meta_peer_client: MetaPeerClientRef,
pub mailbox: MailboxRef,
pub election: Option<ElectionRef>,
@@ -94,6 +96,10 @@ impl Context {
pub fn reset_in_memory(&self) {
self.in_memory.reset();
}
pub fn reset_leader_cached_kv_store(&self) {
self.leader_cached_kv_store.reset();
}
}
pub struct LeaderValue(pub String);
@@ -120,6 +126,7 @@ pub struct MetaSrv {
// store some data that will not be persisted.
in_memory: ResettableKvStoreRef,
kv_store: KvStoreRef,
leader_cached_kv_store: ResettableKvStoreRef,
table_id_sequence: SequenceRef,
meta_peer_client: MetaPeerClientRef,
selector: SelectorRef,
@@ -129,6 +136,7 @@ pub struct MetaSrv {
procedure_manager: ProcedureManagerRef,
metadata_service: MetadataServiceRef,
mailbox: MailboxRef,
ddl_manager: DdlManagerRef,
}
impl MetaSrv {
@@ -146,20 +154,30 @@ impl MetaSrv {
if let Some(election) = self.election() {
let procedure_manager = self.procedure_manager.clone();
let in_memory = self.in_memory.clone();
let leader_cached_kv_store = self.leader_cached_kv_store.clone();
let mut rx = election.subscribe_leader_change();
let _handle = common_runtime::spawn_bg(async move {
loop {
match rx.recv().await {
Ok(msg) => match msg {
LeaderChangeMessage::Elected(_) => {
if let Err(e) = procedure_manager.recover().await {
error!("Failed to recover procedures, error: {e}");
Ok(msg) => {
in_memory.reset();
leader_cached_kv_store.reset();
info!(
"Leader's cache has bean cleared on leader change: {:?}",
msg
);
match msg {
LeaderChangeMessage::Elected(_) => {
if let Err(e) = procedure_manager.recover().await {
error!("Failed to recover procedures, error: {e}");
}
}
LeaderChangeMessage::StepDown(leader) => {
error!("Leader :{:?} step down", leader);
}
}
LeaderChangeMessage::StepDown(leader) => {
error!("Leader :{:?} step down", leader);
}
},
}
Err(RecvError::Closed) => {
error!("Not expected, is leader election loop still running?");
break;
@@ -219,6 +237,11 @@ impl MetaSrv {
self.kv_store.clone()
}
#[inline]
pub fn leader_cached_kv_store(&self) -> ResettableKvStoreRef {
self.leader_cached_kv_store.clone()
}
#[inline]
pub fn meta_peer_client(&self) -> MetaPeerClientRef {
self.meta_peer_client.clone()
@@ -254,6 +277,11 @@ impl MetaSrv {
self.mailbox.clone()
}
#[inline]
pub fn ddl_manager(&self) -> &DdlManagerRef {
&self.ddl_manager
}
pub fn procedure_manager(&self) -> &ProcedureManagerRef {
&self.procedure_manager
}
@@ -263,6 +291,7 @@ impl MetaSrv {
let server_addr = self.options().server_addr.clone();
let in_memory = self.in_memory();
let kv_store = self.kv_store();
let leader_cached_kv_store = self.leader_cached_kv_store();
let meta_peer_client = self.meta_peer_client();
let mailbox = self.mailbox();
let election = self.election();
@@ -271,6 +300,7 @@ impl MetaSrv {
server_addr,
in_memory,
kv_store,
leader_cached_kv_store,
meta_peer_client,
mailbox,
election,

View File

@@ -15,9 +15,11 @@
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use client::client_manager::DatanodeClients;
use common_procedure::local::{LocalManager, ManagerConfig};
use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef};
use crate::ddl::DdlManager;
use crate::error::Result;
use crate::handler::mailbox_handler::MailboxHandler;
use crate::handler::region_lease_handler::RegionLeaseHandler;
@@ -36,6 +38,7 @@ use crate::procedure::region_failover::RegionFailoverManager;
use crate::procedure::state_store::MetaStateStore;
use crate::selector::lease_based::LeaseBasedSelector;
use crate::sequence::Sequence;
use crate::service::store::cached_kv::{CheckLeader, LeaderCachedKvStore};
use crate::service::store::kv::{KvStoreRef, ResettableKvStoreRef};
use crate::service::store::memory::MemStore;
@@ -131,6 +134,10 @@ impl MetaSrvBuilder {
let kv_store = kv_store.unwrap_or_else(|| Arc::new(MemStore::default()));
let in_memory = in_memory.unwrap_or_else(|| Arc::new(MemStore::default()));
let leader_cached_kv_store = Arc::new(LeaderCachedKvStore::new(
Arc::new(CheckLeaderByElection(election.clone())),
kv_store.clone(),
));
let meta_peer_client = meta_peer_client.unwrap_or_else(|| {
MetaPeerClientBuilder::default()
.election(election.clone())
@@ -146,6 +153,9 @@ impl MetaSrvBuilder {
let mailbox = HeartbeatMailbox::create(pushers.clone(), mailbox_sequence);
let state_store = Arc::new(MetaStateStore::new(kv_store.clone()));
let procedure_manager = Arc::new(LocalManager::new(ManagerConfig::default(), state_store));
let table_id_sequence = Arc::new(Sequence::new(TABLE_ID_SEQ, 1024, 10, kv_store.clone()));
let metadata_service = metadata_service
.unwrap_or_else(|| Arc::new(DefaultMetadataService::new(kv_store.clone())));
let lock = lock.unwrap_or_else(|| Arc::new(MemLock::default()));
let handler_group = match handler_group {
@@ -202,16 +212,21 @@ impl MetaSrvBuilder {
}
};
let table_id_sequence = Arc::new(Sequence::new(TABLE_ID_SEQ, 1024, 10, kv_store.clone()));
// TODO(weny): considers to modify the default config of procedure manager
let ddl_manager = Arc::new(DdlManager::new(
procedure_manager.clone(),
kv_store.clone(),
Arc::new(DatanodeClients::default()),
));
let metadata_service = metadata_service
.unwrap_or_else(|| Arc::new(DefaultMetadataService::new(kv_store.clone())));
let _ = ddl_manager.try_start();
Ok(MetaSrv {
started,
options,
in_memory,
kv_store,
leader_cached_kv_store,
meta_peer_client,
table_id_sequence,
selector,
@@ -221,6 +236,7 @@ impl MetaSrvBuilder {
procedure_manager,
metadata_service,
mailbox,
ddl_manager,
})
}
}
@@ -230,3 +246,13 @@ impl Default for MetaSrvBuilder {
Self::new()
}
}
struct CheckLeaderByElection(Option<ElectionRef>);
impl CheckLeader for CheckLeaderByElection {
fn check(&self) -> bool {
self.0
.as_ref()
.map_or(false, |election| election.is_leader())
}
}

View File

@@ -12,5 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod create_table;
pub mod region_failover;
pub(crate) mod state_store;

View File

@@ -0,0 +1,330 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::meta::TableRouteValue;
use async_trait::async_trait;
use catalog::helper::TableGlobalKey;
use client::Database;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_meta::key::TableRouteKey;
use common_meta::rpc::ddl::CreateTableTask;
use common_meta::rpc::router::TableRoute;
use common_meta::table_name::TableName;
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
use common_procedure::{
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, Status,
};
use futures::future::join_all;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use table::engine::TableReference;
use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::service::router::create_table_global_value;
use crate::service::store::txn::{Compare, CompareOp, Txn, TxnOp};
use crate::table_routes::get_table_global_value;
// TODO(weny): remove in following PRs.
#[allow(unused)]
pub struct CreateTableProcedure {
context: DdlContext,
creator: TableCreator,
}
// TODO(weny): remove in following PRs.
#[allow(dead_code)]
impl CreateTableProcedure {
pub(crate) const TYPE_NAME: &'static str = "metasrv-procedure::CreateTable";
pub(crate) fn new(
cluster_id: u64,
task: CreateTableTask,
table_route: TableRoute,
context: DdlContext,
) -> Self {
Self {
context,
creator: TableCreator::new(cluster_id, task, table_route),
}
}
pub(crate) fn from_json(json: &str, context: DdlContext) -> ProcedureResult<Self> {
let data = serde_json::from_str(json).context(FromJsonSnafu)?;
Ok(CreateTableProcedure {
context,
creator: TableCreator { data },
})
}
fn global_table_key(&self) -> TableGlobalKey {
let table_ref = self.creator.data.table_ref();
TableGlobalKey {
catalog_name: table_ref.catalog.to_string(),
schema_name: table_ref.schema.to_string(),
table_name: table_ref.table.to_string(),
}
}
fn table_name(&self) -> TableName {
self.creator.data.task.table_name()
}
/// Checks whether the table exists.
async fn on_prepare(&mut self) -> Result<Status> {
if (get_table_global_value(&self.context.kv_store, &self.global_table_key()).await?)
.is_some()
{
ensure!(
self.creator.data.task.create_table.create_if_not_exists,
error::TableAlreadyExistsSnafu {
table_name: self.creator.data.table_ref().to_string(),
}
);
return Ok(Status::Done);
}
self.creator.data.state = CreateTableState::DatanodeCreateTable;
Ok(Status::executing(true))
}
/// Registers the `TableRouteValue` and the `TableGlobalValue`.
async fn register_metadata(&self) -> Result<()> {
let table_name = self.table_name();
let table_id = self.creator.data.table_route.table.id;
let table_route_key = TableRouteKey::with_table_name(table_id, &table_name.clone().into())
.key()
.into_bytes();
let table_global_key = TableGlobalKey {
catalog_name: table_name.catalog_name.clone(),
schema_name: table_name.schema_name.clone(),
table_name: table_name.table_name.clone(),
}
.to_string()
.into_bytes();
let (peers, table_route) = self
.creator
.data
.table_route
.clone()
.try_into_raw()
.context(error::ConvertProtoDataSnafu)?;
let table_route_value = TableRouteValue {
peers,
table_route: Some(table_route),
};
let table_global_value = create_table_global_value(
&table_route_value,
self.creator.data.task.table_info.clone(),
)?
.as_bytes()
.context(error::InvalidCatalogValueSnafu)?;
let txn = Txn::new()
.when(vec![
Compare::with_not_exist_value(table_route_key.clone(), CompareOp::Equal),
Compare::with_not_exist_value(table_global_key.clone(), CompareOp::Equal),
])
.and_then(vec![
TxnOp::Put(table_route_key, table_route_value.into()),
TxnOp::Put(table_global_key, table_global_value),
]);
let resp = self.context.kv_store.txn(txn).await?;
ensure!(
resp.succeeded,
error::TxnSnafu {
msg: "table_route_key or table_global_key exists"
}
);
Ok(())
}
async fn on_create_metadata(&mut self) -> Result<Status> {
let kv_store = &self.context.kv_store;
let key = &self.global_table_key();
match get_table_global_value(kv_store, key).await? {
Some(table_global_value) => {
// The metasrv may have crashed right after the metadata was created.
// Recover the table_route from the kv store.
let table_id = table_global_value.table_id() as u64;
let expected = self.creator.data.table_route.table.id;
// Consider a sequence like: table A is created; another frontend tries to create table A and fails
// halfway; table A is renamed to B; the failed creation of A is then recovered.
// We must ensure the table_id hasn't changed in the meantime.
ensure!(
table_id == expected,
error::TableIdChangedSnafu {
expected,
found: table_id
}
);
}
None => {
// registers metadata
self.register_metadata().await?;
}
}
Ok(Status::Done)
}
async fn on_datanode_create_table(&mut self) -> Result<Status> {
let table_route = &self.creator.data.table_route;
let table_name = self.table_name();
let clients = self.context.datanode_clients.clone();
let leaders = table_route.find_leaders();
let mut joins = Vec::with_capacity(leaders.len());
for datanode in leaders {
let client = clients.get_client(&datanode).await;
let client = Database::new(&table_name.catalog_name, &table_name.schema_name, client);
let regions = table_route.find_leader_regions(&datanode);
let mut create_expr_for_region = self.creator.data.task.create_table.clone();
create_expr_for_region.region_numbers = regions;
joins.push(common_runtime::spawn_bg(async move {
if let Err(err) = client
.create(create_expr_for_region)
.await
.context(error::RequestDatanodeSnafu { peer: datanode })
{
// TODO(weny): add tests for `TableAlreadyExists`
if err.status_code() != StatusCode::TableAlreadyExists {
return Err(err);
}
}
Ok(())
}));
}
let _ = join_all(joins)
.await
.into_iter()
.map(|result| {
result.map_err(|err| {
error::RetryLaterSnafu {
reason: format!(
"Failed to execute create table on datanode, source: {}",
err
),
}
.build()
})
})
.collect::<Result<Vec<_>>>()?;
self.creator.data.state = CreateTableState::CreateMetadata;
Ok(Status::executing(true))
}
}
#[async_trait]
impl Procedure for CreateTableProcedure {
fn type_name(&self) -> &str {
Self::TYPE_NAME
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
let error_handler = |e| {
if matches!(e, error::Error::RetryLater { .. }) {
ProcedureError::retry_later(e)
} else {
ProcedureError::external(e)
}
};
match self.creator.data.state {
CreateTableState::Prepare => self.on_prepare().await.map_err(error_handler),
CreateTableState::DatanodeCreateTable => {
self.on_datanode_create_table().await.map_err(error_handler)
}
CreateTableState::CreateMetadata => {
self.on_create_metadata().await.map_err(error_handler)
}
}
}
fn dump(&self) -> ProcedureResult<String> {
serde_json::to_string(&self.creator.data).context(ToJsonSnafu)
}
fn lock_key(&self) -> LockKey {
let table_ref = &self.creator.data.table_ref();
let key = common_catalog::format_full_table_name(
table_ref.catalog,
table_ref.schema,
table_ref.table,
);
LockKey::single(key)
}
}
pub struct TableCreator {
data: CreateTableData,
}
impl TableCreator {
pub fn new(cluster_id: u64, task: CreateTableTask, table_route: TableRoute) -> Self {
Self {
data: CreateTableData {
state: CreateTableState::Prepare,
cluster_id,
task,
table_route,
},
}
}
}
#[derive(Debug, Serialize, Deserialize)]
enum CreateTableState {
/// Prepares to create the table
Prepare,
/// Datanode creates the table
DatanodeCreateTable,
/// Creates metadata
CreateMetadata,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct CreateTableData {
state: CreateTableState,
task: CreateTableTask,
table_route: TableRoute,
cluster_id: u64,
}
impl CreateTableData {
fn table_ref(&self) -> TableReference<'_> {
self.task.table_ref()
}
}

View File

@@ -19,6 +19,7 @@ use tonic::{Response, Status};
pub mod admin;
pub mod cluster;
pub mod ddl;
mod heartbeat;
pub mod lock;
pub mod mailbox;

View File

@@ -0,0 +1,187 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::meta::{
ddl_task_server, Partition, Region, RegionRoute, SubmitDdlTaskRequest, SubmitDdlTaskResponse,
Table, TableRoute,
};
use api::v1::TableId;
use common_meta::rpc::ddl::{CreateTableTask, DdlTask};
use common_meta::rpc::router;
use common_meta::table_name::TableName;
use common_telemetry::{info, warn};
use snafu::{OptionExt, ResultExt};
use table::metadata::RawTableInfo;
use tonic::{Request, Response};
use super::GrpcResult;
use crate::ddl::DdlManagerRef;
use crate::error::{self, Result};
use crate::metasrv::{MetaSrv, SelectorContext, SelectorRef};
use crate::sequence::SequenceRef;
#[async_trait::async_trait]
impl ddl_task_server::DdlTask for MetaSrv {
async fn submit_ddl_task(
&self,
request: Request<SubmitDdlTaskRequest>,
) -> GrpcResult<SubmitDdlTaskResponse> {
let SubmitDdlTaskRequest { header, task, .. } = request.into_inner();
let header = header.context(error::MissingRequestHeaderSnafu)?;
let task: DdlTask = task
.context(error::MissingRequiredParameterSnafu { param: "task" })?
.try_into()
.context(error::ConvertProtoDataSnafu)?;
let ctx = SelectorContext {
datanode_lease_secs: self.options().datanode_lease_secs,
server_addr: self.options().server_addr.clone(),
kv_store: self.kv_store(),
meta_peer_client: self.meta_peer_client(),
catalog: None,
schema: None,
table: None,
};
let resp = match task {
DdlTask::CreateTable(create_table_task) => {
handle_create_table_task(
header.cluster_id,
create_table_task,
ctx,
self.selector().clone(),
self.table_id_sequence().clone(),
self.ddl_manager().clone(),
)
.await?
}
};
Ok(Response::new(resp))
}
}
async fn handle_create_table_task(
cluster_id: u64,
mut create_table_task: CreateTableTask,
ctx: SelectorContext,
selector: SelectorRef,
table_id_sequence: SequenceRef,
ddl_manager: DdlManagerRef,
) -> Result<SubmitDdlTaskResponse> {
let table_name = create_table_task.table_name();
let ctx = SelectorContext {
datanode_lease_secs: ctx.datanode_lease_secs,
server_addr: ctx.server_addr,
kv_store: ctx.kv_store,
meta_peer_client: ctx.meta_peer_client,
catalog: Some(table_name.catalog_name.clone()),
schema: Some(table_name.schema_name.clone()),
table: Some(table_name.table_name.clone()),
};
let partitions = create_table_task
.partitions
.clone()
.into_iter()
.map(Into::into)
.collect();
let table_route = handle_create_table_route(
cluster_id,
table_name,
partitions,
&mut create_table_task.table_info,
ctx,
selector,
table_id_sequence,
)
.await?;
let table_id = table_route.table.id;
let id = ddl_manager
.submit_create_table_task(cluster_id, create_table_task, table_route)
.await?;
info!("Table: {table_id} created via procedure_id {id:?}");
Ok(SubmitDdlTaskResponse {
key: id.to_string().into(),
table_id: Some(TableId {
id: table_id as u32,
}),
..Default::default()
})
}
/// Pre-calculates the create table task's metadata.
async fn handle_create_table_route(
cluster_id: u64,
table_name: TableName,
partitions: Vec<Partition>,
table_info: &mut RawTableInfo,
ctx: SelectorContext,
selector: SelectorRef,
table_id_sequence: SequenceRef,
) -> Result<router::TableRoute> {
let mut peers = selector.select(cluster_id, &ctx).await?;
if peers.len() < partitions.len() {
warn!("Create table failed due to no enough available datanodes, table: {table_name:?}, partition number: {}, datanode number: {}", partitions.len(), peers.len());
return error::NoEnoughAvailableDatanodeSnafu {
expected: partitions.len(),
available: peers.len(),
}
.fail();
}
// We don't need to keep all peers; just truncate them to the number of partitions.
// If there are fewer peers than partitions, some peers serve multiple partitions.
peers.truncate(partitions.len());
let id = table_id_sequence.next().await?;
table_info.ident.table_id = id as u32;
let table = Table {
id,
table_name: Some(table_name.into()),
..Default::default()
};
let region_routes = partitions
.into_iter()
.enumerate()
.map(|(i, partition)| {
let region = Region {
id: i as u64,
partition: Some(partition),
..Default::default()
};
RegionRoute {
region: Some(region),
leader_peer_index: (i % peers.len()) as u64,
follower_peer_indexes: vec![], // follower_peers is not supported at the moment
}
})
.collect::<Vec<_>>();
let table_route = TableRoute {
table: Some(table),
region_routes,
};
router::TableRoute::try_from_raw(&peers, table_route).context(error::TableRouteConversionSnafu)
}
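The route construction above assigns each partition's leader via `leader_peer_index: (i % peers.len())`. A standalone worked example of that round-robin arithmetic, using a hypothetical helper (after `peers.truncate(partitions.len())` the two counts are equal, so in practice each partition maps to a distinct peer; the modulo only wraps when peers are reused):

// Hypothetical helper: map partition index i to a peer index, round-robin.
fn assign_leader_peers(num_partitions: usize, num_peers: usize) -> Vec<usize> {
    assert!(num_peers > 0, "at least one datanode peer is required");
    (0..num_partitions).map(|i| i % num_peers).collect()
}

fn main() {
    // Equal counts (the normal case after truncation): a 1:1 assignment.
    assert_eq!(assign_leader_peers(3, 3), vec![0, 1, 2]);
    // Fewer peers than partitions: peers are reused in round-robin fashion.
    assert_eq!(assign_leader_peers(4, 2), vec![0, 1, 0, 1]);
}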

View File

@@ -258,7 +258,7 @@ async fn handle_create(
})
}
fn create_table_global_value(
pub(crate) fn create_table_global_value(
table_route_value: &TableRouteValue,
table_info: RawTableInfo,
) -> Result<TableGlobalValue> {
@@ -349,7 +349,7 @@ async fn handle_delete(req: DeleteRequest, ctx: Context) -> Result<RouteResponse
})
}
fn fill_table_routes(
pub(crate) fn fill_table_routes(
tables: Vec<(TableGlobalValue, TableRouteValue)>,
) -> Result<(Vec<Peer>, Vec<TableRoute>)> {
let mut peer_dict = PeerDict::default();
@@ -407,7 +407,7 @@ async fn fetch_tables(
Ok(tables)
}
fn table_route_key(table_id: u64, t: &TableGlobalKey) -> TableRouteKey<'_> {
pub(crate) fn table_route_key(table_id: u64, t: &TableGlobalKey) -> TableRouteKey<'_> {
TableRouteKey {
table_id,
catalog_name: &t.catalog_name,

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod cached_kv;
pub mod etcd;
pub(crate) mod etcd_util;
pub mod ext;

View File

@@ -0,0 +1,474 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use api::v1::meta::{
BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest,
BatchPutResponse, CompareAndPutRequest, CompareAndPutResponse, DeleteRangeRequest,
DeleteRangeResponse, KeyValue, MoveValueRequest, MoveValueResponse, PutRequest, PutResponse,
RangeRequest, RangeResponse,
};
use crate::error::Result;
use crate::service::store::ext::KvStoreExt;
use crate::service::store::kv::{KvStore, KvStoreRef, ResettableKvStore, ResettableKvStoreRef};
use crate::service::store::memory::MemStore;
use crate::service::store::txn::{Txn, TxnOp, TxnRequest, TxnResponse, TxnService};
pub type CheckLeaderRef = Arc<dyn CheckLeader>;
pub trait CheckLeader: Sync + Send {
fn check(&self) -> bool;
}
struct AlwaysLeader;
impl CheckLeader for AlwaysLeader {
fn check(&self) -> bool {
true
}
}
/// A cache dedicated to the leader node, used to cache some metadata.
///
/// To use this cache, the following constraints must be followed:
/// 1. The leader node can create this metadata.
/// 2. The follower node can create this metadata. The leader node can lazily retrieve
/// the corresponding data through the cache loading mechanism.
/// 3. Only the leader node can update this metadata, as the cache cannot detect
/// modifications made to the data on the follower node.
/// 4. Only the leader node can delete this metadata for the same reason mentioned above.
pub struct LeaderCachedKvStore {
check_leader: CheckLeaderRef,
store: KvStoreRef,
cache: ResettableKvStoreRef,
version: AtomicUsize,
}
impl LeaderCachedKvStore {
pub fn new(check_leader: CheckLeaderRef, store: KvStoreRef) -> Self {
Self {
check_leader,
store,
cache: Arc::new(MemStore::new()),
version: AtomicUsize::new(0),
}
}
/// Creates a store with a leader checker that always returns `true`,
/// mainly used in test scenarios.
pub fn with_always_leader(store: KvStoreRef) -> Self {
Self::new(Arc::new(AlwaysLeader), store)
}
#[inline]
fn is_leader(&self) -> bool {
self.check_leader.check()
}
#[inline]
async fn invalid_key(&self, key: Vec<u8>) -> Result<()> {
let _ = self.cache.delete(key, false).await?;
Ok(())
}
#[inline]
async fn invalid_keys(&self, keys: Vec<Vec<u8>>) -> Result<()> {
let txn = Txn::new().and_then(keys.into_iter().map(TxnOp::Delete).collect::<Vec<_>>());
let _ = self.cache.txn(txn).await?;
Ok(())
}
#[inline]
fn get_version(&self) -> usize {
self.version.load(Ordering::Relaxed)
}
#[inline]
fn create_new_version(&self) -> usize {
self.version.fetch_add(1, Ordering::Relaxed) + 1
}
#[inline]
fn validate_version(&self, version: usize) -> bool {
version == self.version.load(Ordering::Relaxed)
}
}
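The version counter above implements an optimistic check: every write path bumps the version, and a read-through cache fill is discarded if the version moved while the remote read was in flight. A simplified standalone sketch of that pattern (hypothetical single-key types, not the project's code):

use std::collections::HashMap;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Mutex;

struct VersionedCache {
    version: AtomicUsize,
    cache: Mutex<HashMap<String, String>>,
}

impl VersionedCache {
    fn new() -> Self {
        Self {
            version: AtomicUsize::new(0),
            cache: Mutex::new(HashMap::new()),
        }
    }

    // Every write path bumps the version, like `create_new_version`.
    fn bump_version(&self) -> usize {
        self.version.fetch_add(1, Ordering::Relaxed) + 1
    }

    // Read-through fill, like the exact-key path in `range`: snapshot the
    // version, load from the backing store, cache the value, then drop the
    // entry if a concurrent write invalidated the snapshot.
    fn fill(&self, key: &str, load_from_store: impl FnOnce() -> String) -> String {
        let ver = self.version.load(Ordering::Relaxed);
        let value = load_from_store();
        let mut cache = self.cache.lock().unwrap();
        cache.insert(key.to_string(), value.clone());
        if ver != self.version.load(Ordering::Relaxed) {
            cache.remove(key); // possibly stale, serve from the store next time
        }
        value
    }
}

fn main() {
    let cache = VersionedCache::new();
    assert_eq!(cache.fill("table_route/1", || "route-v1".to_string()), "route-v1");
    // A concurrent write would bump the version and force later fills to re-validate.
    let _ = cache.bump_version();
}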
#[async_trait::async_trait]
impl KvStore for LeaderCachedKvStore {
async fn range(&self, req: RangeRequest) -> Result<RangeResponse> {
if !self.is_leader() {
return self.store.range(req).await;
}
// We can only cache for exact key queries (i.e. get requests)
// because we cannot confirm if a range response is complete.
if !req.range_end.is_empty() {
return self.store.range(req).await;
}
let res = self.cache.range(req.clone()).await?;
if !res.kvs.is_empty() {
return Ok(res);
}
let ver = self.get_version();
let res = self.store.range(req.clone()).await?;
if !res.kvs.is_empty() {
let KeyValue { key, value } = res.kvs[0].clone();
let put_req = PutRequest {
key: key.clone(),
value,
..Default::default()
};
let _ = self.cache.put(put_req).await?;
if !self.validate_version(ver) {
self.invalid_key(key).await?;
}
}
return Ok(res);
}
async fn put(&self, req: PutRequest) -> Result<PutResponse> {
if !self.is_leader() {
return self.store.put(req).await;
}
let ver = self.create_new_version();
let res = self.store.put(req.clone()).await?;
let _ = self.cache.put(req.clone()).await?;
if !self.validate_version(ver) {
self.invalid_key(req.key).await?;
}
Ok(res)
}
async fn batch_get(&self, req: BatchGetRequest) -> Result<BatchGetResponse> {
if !self.is_leader() {
return self.store.batch_get(req).await;
}
let cached_res = self.cache.batch_get(req.clone()).await?;
// The cache hit all keys
if cached_res.kvs.len() == req.keys.len() {
return Ok(cached_res);
}
let hit_keys = cached_res
.kvs
.iter()
.map(|kv| kv.key.clone())
.collect::<HashSet<_>>();
let missed_keys = req
.keys
.iter()
.filter(|key| !hit_keys.contains(*key))
.cloned()
.collect::<Vec<_>>();
let remote_req = BatchGetRequest {
keys: missed_keys,
..Default::default()
};
let ver = self.get_version();
let remote_res = self.store.batch_get(remote_req).await?;
let put_req = BatchPutRequest {
kvs: remote_res.kvs.clone(),
..Default::default()
};
let _ = self.cache.batch_put(put_req).await?;
if !self.validate_version(ver) {
let keys = remote_res
.kvs
.iter()
.map(|kv| kv.key.clone())
.collect::<Vec<_>>();
self.invalid_keys(keys).await?;
}
let mut merged_res = cached_res;
merged_res.kvs.extend(remote_res.kvs);
Ok(merged_res)
}
async fn batch_put(&self, req: BatchPutRequest) -> Result<BatchPutResponse> {
if !self.is_leader() {
return self.store.batch_put(req).await;
}
let ver = self.create_new_version();
let res = self.store.batch_put(req.clone()).await?;
let _ = self.cache.batch_put(req.clone()).await?;
if !self.validate_version(ver) {
let keys = req.kvs.into_iter().map(|kv| kv.key).collect::<Vec<_>>();
self.invalid_keys(keys).await?;
}
Ok(res)
}
async fn batch_delete(&self, req: BatchDeleteRequest) -> Result<BatchDeleteResponse> {
if !self.is_leader() {
return self.store.batch_delete(req).await;
}
let _ = self.create_new_version();
let res = self.store.batch_delete(req.clone()).await?;
let _ = self.cache.batch_delete(req).await?;
Ok(res)
}
async fn compare_and_put(&self, req: CompareAndPutRequest) -> Result<CompareAndPutResponse> {
if !self.is_leader() {
return self.store.compare_and_put(req).await;
}
let _ = self.create_new_version();
let key = req.key.clone();
let res = self.store.compare_and_put(req).await?;
// Delete the key in the cache.
//
// The cache cannot handle the CAS operation because it does not
// hold the full data set, so we need to delete the key.
self.invalid_key(key).await?;
Ok(res)
}
async fn delete_range(&self, req: DeleteRangeRequest) -> Result<DeleteRangeResponse> {
if !self.is_leader() {
return self.store.delete_range(req).await;
}
let _ = self.create_new_version();
let res = self.store.delete_range(req.clone()).await?;
let _ = self.cache.delete_range(req).await?;
Ok(res)
}
async fn move_value(&self, req: MoveValueRequest) -> Result<MoveValueResponse> {
if !self.is_leader() {
return self.store.move_value(req).await;
}
let _ = self.create_new_version();
let res = self.store.move_value(req.clone()).await?;
let MoveValueRequest {
from_key, to_key, ..
} = req;
// Delete both keys in the cache.
//
// The cache cannot handle the move operation because it does not
// hold the full data set, so we need to delete both keys.
self.invalid_keys(vec![from_key, to_key]).await?;
Ok(res)
}
}
#[async_trait::async_trait]
impl TxnService for LeaderCachedKvStore {
async fn txn(&self, txn: Txn) -> Result<TxnResponse> {
if !self.is_leader() {
return self.store.txn(txn).await;
}
let _ = self.create_new_version();
let res = self.store.txn(txn.clone()).await?;
let TxnRequest {
success, failure, ..
} = txn.into();
let mut all = success;
all.extend(failure);
// Delete all involved keys in the cache.
//
// The cache cannot handle the txn operation because it does not
// hold the full data set, so we need to delete every key the txn touched.
let mut keys = Vec::with_capacity(all.len());
for txn_op in all {
match txn_op {
TxnOp::Put(key, _) => {
keys.push(key);
}
TxnOp::Delete(key) => {
keys.push(key);
}
TxnOp::Get(_) => {}
}
}
self.invalid_keys(keys).await?;
Ok(res)
}
}
impl ResettableKvStore for LeaderCachedKvStore {
fn reset(&self) {
self.cache.reset()
}
}
#[cfg(test)]
mod tests {
use api::v1::meta::KeyValue;
use super::*;
use crate::service::store::memory::MemStore;
fn create_leader_cached_kv_store() -> LeaderCachedKvStore {
let store = Arc::new(MemStore::new());
LeaderCachedKvStore::with_always_leader(store)
}
#[tokio::test]
async fn test_get_put_delete() {
let cached_store = create_leader_cached_kv_store();
let inner_store = cached_store.store.clone();
let inner_cache = cached_store.cache.clone();
let key = "test_key".to_owned().into_bytes();
let value = "value".to_owned().into_bytes();
let put_req = PutRequest {
key: key.clone(),
value: value.clone(),
..Default::default()
};
let _ = inner_store.put(put_req).await.unwrap();
let cached_value = inner_cache.get(key.clone()).await.unwrap();
assert!(cached_value.is_none());
let cached_value = cached_store.get(key.clone()).await.unwrap().unwrap();
assert_eq!(cached_value.value, value);
let cached_value = inner_cache.get(key.clone()).await.unwrap().unwrap();
assert_eq!(cached_value.value, value);
let res = cached_store
.delete(key.clone(), true)
.await
.unwrap()
.unwrap();
assert_eq!(res.value, value);
let cached_value = inner_cache.get(key.clone()).await.unwrap();
assert!(cached_value.is_none());
}
#[tokio::test]
async fn test_batch_get_put_delete() {
let cached_store = create_leader_cached_kv_store();
let inner_store = cached_store.store.clone();
let inner_cache = cached_store.cache.clone();
let kvs = (1..3)
.map(|i| {
let key = format!("test_key_{}", i).into_bytes();
let value = format!("value_{}", i).into_bytes();
KeyValue { key, value }
})
.collect::<Vec<_>>();
let batch_put_req = BatchPutRequest {
kvs: kvs.clone(),
..Default::default()
};
let _ = inner_store.batch_put(batch_put_req).await.unwrap();
let keys = (1..5)
.map(|i| format!("test_key_{}", i).into_bytes())
.collect::<Vec<_>>();
let batch_get_req = BatchGetRequest {
keys,
..Default::default()
};
let cached_values = inner_cache.batch_get(batch_get_req.clone()).await.unwrap();
assert!(cached_values.kvs.is_empty());
let cached_values = cached_store.batch_get(batch_get_req.clone()).await.unwrap();
assert_eq!(cached_values.kvs.len(), 2);
let cached_values = inner_cache.batch_get(batch_get_req.clone()).await.unwrap();
assert_eq!(cached_values.kvs.len(), 2);
cached_store.reset();
let cached_values = inner_cache.batch_get(batch_get_req).await.unwrap();
assert!(cached_values.kvs.is_empty());
}
#[tokio::test]
async fn test_txn() {
let cached_store = create_leader_cached_kv_store();
let inner_cache = cached_store.cache.clone();
let kvs = (1..5)
.map(|i| {
let key = format!("test_key_{}", i).into_bytes();
let value = format!("value_{}", i).into_bytes();
KeyValue { key, value }
})
.collect::<Vec<_>>();
let batch_put_req = BatchPutRequest {
kvs: kvs.clone(),
..Default::default()
};
let _ = cached_store.batch_put(batch_put_req).await.unwrap();
let keys = (1..5)
.map(|i| format!("test_key_{}", i).into_bytes())
.collect::<Vec<_>>();
let batch_get_req = BatchGetRequest {
keys,
..Default::default()
};
let cached_values = inner_cache.batch_get(batch_get_req.clone()).await.unwrap();
assert_eq!(cached_values.kvs.len(), 4);
let put_ops = (1..5)
.map(|i| {
let key = format!("test_key_{}", i).into_bytes();
let value = format!("value_{}", i).into_bytes();
TxnOp::Put(key, value)
})
.collect::<Vec<_>>();
let txn = Txn::new().and_then(put_ops);
let _ = cached_store.txn(txn).await.unwrap();
let cached_values = inner_cache.batch_get(batch_get_req).await.unwrap();
assert!(cached_values.kvs.is_empty());
}
}

View File

@@ -33,8 +33,8 @@ use snafu::{ensure, OptionExt, ResultExt};
use storage::manifest::manifest_compress_type;
use store_api::storage::{
CloseOptions, ColumnDescriptorBuilder, ColumnFamilyDescriptor, ColumnFamilyDescriptorBuilder,
ColumnId, EngineContext as StorageEngineContext, OpenOptions, RegionNumber, RowKeyDescriptor,
RowKeyDescriptorBuilder, StorageEngine,
ColumnId, CompactionStrategy, EngineContext as StorageEngineContext, OpenOptions, RegionNumber,
RowKeyDescriptor, RowKeyDescriptorBuilder, StorageEngine,
};
use table::engine::{
region_name, table_dir, CloseTableResult, EngineContext, TableEngine, TableEngineProcedure,
@@ -417,6 +417,7 @@ impl<S: StorageEngine> MitoEngineInner<S> {
.await.map_err(BoxedError::new)
.context(table_error::TableOperationSnafu)? else { return Ok(None) };
let compaction_strategy = CompactionStrategy::from(&table_info.meta.options.extra_options);
let opts = OpenOptions {
parent_dir: table_dir.to_string(),
write_buffer_size: table_info
@@ -425,6 +426,7 @@ impl<S: StorageEngine> MitoEngineInner<S> {
.write_buffer_size
.map(|s| s.0 as usize),
ttl: table_info.meta.options.ttl,
compaction_strategy,
};
debug!(
@@ -501,6 +503,7 @@ impl<S: StorageEngine> MitoEngineInner<S> {
table: name,
};
let compaction_strategy = CompactionStrategy::from(&table_info.meta.options.extra_options);
let opts = OpenOptions {
parent_dir: table_dir.to_string(),
write_buffer_size: table_info
@@ -509,6 +512,7 @@ impl<S: StorageEngine> MitoEngineInner<S> {
.write_buffer_size
.map(|s| s.0 as usize),
ttl: table_info.meta.options.ttl,
compaction_strategy,
};
// TODO(weny): Returns an error earlier if the target region does not exist in the meta.

View File

@@ -24,8 +24,8 @@ use datatypes::schema::{Schema, SchemaRef};
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use store_api::storage::{
ColumnId, CreateOptions, EngineContext, OpenOptions, RegionDescriptorBuilder, RegionNumber,
StorageEngine,
ColumnId, CompactionStrategy, CreateOptions, EngineContext, OpenOptions,
RegionDescriptorBuilder, RegionNumber, StorageEngine,
};
use table::engine::{region_id, table_dir};
use table::metadata::{TableInfoBuilder, TableMetaBuilder, TableType};
@@ -232,15 +232,18 @@ impl<S: StorageEngine> TableCreator<S> {
let table_options = &self.data.request.table_options;
let write_buffer_size = table_options.write_buffer_size.map(|size| size.0 as usize);
let ttl = table_options.ttl;
let compaction_strategy = CompactionStrategy::from(&table_options.extra_options);
let open_opts = OpenOptions {
parent_dir: table_dir.to_string(),
write_buffer_size,
ttl,
compaction_strategy: compaction_strategy.clone(),
};
let create_opts = CreateOptions {
parent_dir: table_dir.to_string(),
write_buffer_size,
ttl,
compaction_strategy,
};
let primary_key_indices = &self.data.request.primary_key_indices;

View File

@@ -183,7 +183,6 @@ impl ErrorExt for Error {
| BuildTableMeta { .. }
| BuildTableInfo { .. }
| BuildRegionDescriptor { .. }
| TableExists { .. }
| ProjectedColumnNotFound { .. }
| InvalidPrimaryKey { .. }
| MissingTimestampIndex { .. }
@@ -191,6 +190,8 @@ impl ErrorExt for Error {
| InvalidRawSchema { .. }
| VersionChanged { .. } => StatusCode::InvalidArguments,
TableExists { .. } => StatusCode::TableAlreadyExists,
ConvertRaw { .. } => StatusCode::Unexpected,
ScanTableManifest { .. } | UpdateTableManifest { .. } => StatusCode::StorageUnavailable,

7
src/mito2/Cargo.toml Normal file
View File

@@ -0,0 +1,7 @@
[package]
name = "mito2"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]

9
src/mito2/README.md Normal file
View File

@@ -0,0 +1,9 @@
# Mito
Mito is GreptimeDB's default region engine.
## About Mito
The Alfa Romeo [MiTo](https://en.wikipedia.org/wiki/Alfa_Romeo_MiTo) is a front-wheel drive, three-door supermini designed by Centro Stile Alfa Romeo.
> "You can't be a true petrolhead until you've owned an Alfa Romeo."
> <div align="right">-- by Jeremy Clarkson</div>

17
src/mito2/src/engine.rs Normal file
View File

@@ -0,0 +1,17 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// Region engine implementation for timeseries data.
#[derive(Clone)]
pub struct MitoEngine {}

15
src/mito2/src/lib.rs Normal file
View File

@@ -0,0 +1,15 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod engine;

View File

@@ -5,9 +5,9 @@ edition.workspace = true
license.workspace = true
[features]
pprof = ["dep:common-pprof"]
mem-prof = ["dep:common-mem-prof"]
dashboard = []
mem-prof = ["dep:common-mem-prof"]
pprof = ["dep:common-pprof"]
[dependencies]
aide = { version = "0.9", features = ["axum"] }
@@ -48,7 +48,7 @@ influxdb_line_protocol = { git = "https://github.com/evenyag/influxdb_iox", bran
itertools.workspace = true
metrics.workspace = true
# metrics-process 1.0.10 depends on metrics-0.21 but opendal depends on metrics-0.20.1
metrics-process = "<1.0.10"
metrics-process = { version = "<1.0.10", optional = true }
mime_guess = "2.0"
num_cpus = "1.13"
once_cell = "1.16"

View File

@@ -34,11 +34,12 @@ use aide::openapi::{Info, OpenApi, Server as OpenAPIServer};
use async_trait::async_trait;
use axum::body::BoxBody;
use axum::error_handling::HandleErrorLayer;
use axum::extract::MatchedPath;
use axum::extract::{DefaultBodyLimit, MatchedPath};
use axum::http::Request;
use axum::middleware::{self, Next};
use axum::response::{Html, IntoResponse, Json};
use axum::{routing, BoxError, Extension, Router};
use common_base::readable_size::ReadableSize;
use common_error::prelude::ErrorExt;
use common_error::status_code::StatusCode;
use common_query::Output;
@@ -104,6 +105,8 @@ pub(crate) async fn query_context_from_db(
pub const HTTP_API_VERSION: &str = "v1";
pub const HTTP_API_PREFIX: &str = "/v1/";
/// Default http body limit (64M).
const DEFAULT_BODY_LIMIT: ReadableSize = ReadableSize::mb(64);
// TODO(fys): This is a temporary workaround, it will be improved later
pub static PUBLIC_APIS: [&str; 2] = ["/v1/influxdb/ping", "/v1/influxdb/health"];
@@ -133,6 +136,8 @@ pub struct HttpOptions {
#[serde(skip)]
pub disable_dashboard: bool,
pub body_limit: ReadableSize,
}
impl Default for HttpOptions {
@@ -141,6 +146,7 @@ impl Default for HttpOptions {
addr: "127.0.0.1:4000".to_string(),
timeout: Duration::from_secs(30),
disable_dashboard: false,
body_limit: DEFAULT_BODY_LIMIT,
}
}
}
@@ -544,6 +550,13 @@ impl HttpServer {
.layer(HandleErrorLayer::new(handle_error))
.layer(TraceLayer::new_for_http())
.layer(TimeoutLayer::new(self.options.timeout))
.layer(DefaultBodyLimit::max(
self.options
.body_limit
.0
.try_into()
.unwrap_or_else(|_| DEFAULT_BODY_LIMIT.as_bytes() as usize),
))
// custom layer
.layer(AsyncRequireAuthorizationLayer::new(
HttpAuth::<BoxBody>::new(self.user_provider.clone()),

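For reference, a minimal configuration sketch assuming the field and type names from the diff above (`HttpOptions::body_limit`, `ReadableSize::mb`); the crate path `servers::http` is an assumption here, and the remaining fields come from `Default`:

use common_base::readable_size::ReadableSize;
use servers::http::HttpOptions; // assumed module path

fn http_options_with_larger_body_limit() -> HttpOptions {
    HttpOptions {
        // Raise the request body limit from the 64 MB default to 128 MB;
        // the value is passed to `DefaultBodyLimit::max` when the router is built.
        body_limit: ReadableSize::mb(128),
        ..Default::default()
    }
}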
View File

@@ -27,7 +27,7 @@ use serde::{Deserialize, Serialize};
use session::context::UserInfo;
use crate::http::{ApiState, JsonResponse};
use crate::metrics::{JEMALLOC_COLLECTOR, PROCESS_COLLECTOR};
use crate::metrics::JEMALLOC_COLLECTOR;
use crate::metrics_handler::MetricsHandler;
#[derive(Debug, Default, Serialize, Deserialize, JsonSchema)]
@@ -137,7 +137,9 @@ pub async fn metrics(
Query(_params): Query<HashMap<String, String>>,
) -> String {
// Collect process metrics.
PROCESS_COLLECTOR.collect();
#[cfg(feature = "metrics-process")]
crate::metrics::PROCESS_COLLECTOR.collect();
if let Some(c) = JEMALLOC_COLLECTOR.as_ref() {
if let Err(e) = c.update() {
error!(e; "Failed to update jemalloc metrics");

View File

@@ -18,7 +18,6 @@ use std::time::Instant;
use common_telemetry::error;
use hyper::Body;
use metrics::gauge;
use metrics_process::Collector;
use once_cell::sync::Lazy;
use snafu::ResultExt;
use tikv_jemalloc_ctl::stats::{allocated_mib, resident_mib};
@@ -71,8 +70,9 @@ pub(crate) const METRIC_JEMALLOC_RESIDENT: &str = "sys.jemalloc.resident";
pub(crate) const METRIC_JEMALLOC_ALLOCATED: &str = "sys.jemalloc.allocated";
/// Prometheus style process metrics collector.
pub(crate) static PROCESS_COLLECTOR: Lazy<Collector> = Lazy::new(|| {
let collector = Collector::default();
#[cfg(feature = "metrics-process")]
pub(crate) static PROCESS_COLLECTOR: Lazy<metrics_process::Collector> = Lazy::new(|| {
let collector = metrics_process::Collector::default();
// Describe collector.
collector.describe();
collector

View File

@@ -344,10 +344,16 @@ impl<W: AsyncWrite + Send + Sync + Unpin> AsyncMysqlShim<W> for MysqlInstanceShi
async fn on_init<'a>(&'a mut self, database: &'a str, w: InitWriter<'a, W>) -> Result<()> {
let (catalog, schema) = crate::parse_catalog_and_schema_from_client_database_name(database);
ensure!(
self.query_handler.is_valid_schema(catalog, schema).await?,
error::DatabaseNotFoundSnafu { catalog, schema }
);
if !self.query_handler.is_valid_schema(catalog, schema).await? {
return w
.error(
ErrorKind::ER_WRONG_DB_NAME,
format!("Unknown database '{}'", database).as_bytes(),
)
.await
.map_err(|e| e.into());
}
let user_info = &self.session.user_info();

View File

@@ -17,16 +17,17 @@ use std::sync::Arc;
use async_trait::async_trait;
use common_query::logical_plan::Expr;
use common_recordbatch::OrderOption;
use common_telemetry::debug;
use common_telemetry::logging;
use common_time::range::TimestampRange;
use snafu::ResultExt;
use store_api::storage::{Chunk, ChunkReader, SchemaRef, SequenceNumber};
use store_api::storage::{Chunk, ChunkReader, RegionId, SchemaRef, SequenceNumber};
use table::predicate::{Predicate, TimeRangePredicateBuilder};
use crate::error::{self, Error, Result};
use crate::memtable::{IterContext, MemtableRef};
use crate::read::windowed::WindowedReader;
use crate::read::{Batch, BoxedBatchReader, DedupReader, MergeReaderBuilder};
use crate::read::{
Batch, BoxedBatchReader, ChainReader, DedupReader, MergeReaderBuilder, WindowedReader,
};
use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef};
use crate::sst::{AccessLayerRef, FileHandle, LevelMetas, ReadOptions};
use crate::window_infer::{PlainWindowInference, WindowInfer};
@@ -90,6 +91,7 @@ impl ChunkReaderImpl {
/// Builder to create a new [ChunkReaderImpl] from scan request.
pub struct ChunkReaderBuilder {
region_id: RegionId,
schema: RegionSchemaRef,
projection: Option<Vec<usize>>,
filters: Vec<Expr>,
@@ -98,11 +100,13 @@ pub struct ChunkReaderBuilder {
memtables: Vec<MemtableRef>,
files_to_read: Vec<FileHandle>,
output_ordering: Option<Vec<OrderOption>>,
use_chain_reader: bool,
}
impl ChunkReaderBuilder {
pub fn new(schema: RegionSchemaRef, sst_layer: AccessLayerRef) -> Self {
pub fn new(region_id: RegionId, schema: RegionSchemaRef, sst_layer: AccessLayerRef) -> Self {
ChunkReaderBuilder {
region_id,
schema,
projection: None,
filters: vec![],
@@ -111,6 +115,7 @@ impl ChunkReaderBuilder {
memtables: Vec::new(),
files_to_read: Vec::new(),
output_ordering: None,
use_chain_reader: false,
}
}
@@ -150,6 +155,15 @@ impl ChunkReaderBuilder {
self
}
/// Partitions files and memtables according to their time windows and scans the time windows
/// one by one.
///
/// Note that compaction should not enable this.
pub fn use_chain_reader(mut self, use_chain_reader: bool) -> Self {
self.use_chain_reader = use_chain_reader;
self
}
/// Picks all SSTs in all levels
pub fn pick_all_ssts(mut self, ssts: &LevelMetas) -> Result<Self> {
let files = ssts.levels().iter().flat_map(|level| level.files());
@@ -183,7 +197,12 @@ impl ChunkReaderBuilder {
if name != self.schema.timestamp_column_name() {
return None;
}
let memtable_stats = self.memtables.iter().map(|m| m.stats()).collect::<Vec<_>>();
let memtable_stats = self
.memtables
.iter()
.filter(|m| m.num_rows() > 0) // Skip empty memtables.
.map(|m| m.stats())
.collect::<Vec<_>>();
let files = self
.files_to_read
.iter()
@@ -238,15 +257,32 @@ impl ChunkReaderBuilder {
predicate,
time_range: *time_range,
};
let mut num_read_files = 0;
for file in &self.files_to_read {
if !Self::file_in_range(file, time_range) {
debug!("Skip file {:?}, predicate: {:?}", file, time_range);
logging::debug!(
"Skip region {} file {:?}, predicate: {:?}",
self.region_id,
file,
time_range
);
continue;
}
let reader = self.sst_layer.read_sst(file.clone(), &read_opts).await?;
reader_builder = reader_builder.push_batch_reader(reader);
num_read_files += 1;
}
logging::debug!(
"build reader done, region_id: {}, time_range: {:?}, total_files: {}, num_read_files: {}",
self.region_id,
time_range,
self.files_to_read.len(),
num_read_files,
);
let reader = reader_builder.build();
let reader = DedupReader::new(schema.clone(), reader);
Ok(Box::new(reader) as Box<_>)
@@ -266,6 +302,8 @@ impl ChunkReaderBuilder {
output_ordering = Some(ordering.clone());
self.build_windowed(&schema, &time_range_predicate, windows, ordering)
.await?
} else if self.use_chain_reader {
self.build_chained(&schema, &time_range_predicate).await?
} else {
self.build_reader(&schema, &time_range_predicate).await?
};
@@ -273,8 +311,41 @@ impl ChunkReaderBuilder {
Ok(ChunkReaderImpl::new(schema, reader, output_ordering))
}
async fn build_chained(
&self,
schema: &ProjectedSchemaRef,
time_range: &TimestampRange,
) -> Result<BoxedBatchReader> {
let windows = self.infer_window_for_chain_reader(time_range);
logging::debug!(
"Infer window for chain reader, region_id: {}, memtables: {}, files: {}, num_windows: {}",
self.region_id,
self.memtables.len(),
self.files_to_read.len(),
windows.len(),
);
let mut readers = Vec::with_capacity(windows.len());
for window in &windows {
let time_range = time_range.and(window);
let reader = self.build_reader(schema, &time_range).await?;
readers.push(reader);
}
logging::debug!(
"Build chain reader, region_id: {}, time_range: {:?}, num_readers: {}",
self.region_id,
time_range,
readers.len(),
);
let chain_reader = ChainReader::new(schema.clone(), readers);
Ok(Box::new(chain_reader) as Box<_>)
}
/// Build time range predicate from schema and filters.
pub fn build_time_range_predicate(&self) -> TimestampRange {
fn build_time_range_predicate(&self) -> TimestampRange {
let Some(ts_col) = self.schema.user_schema().timestamp_column() else { return TimestampRange::min_to_max() };
let unit = ts_col
.data_type
@@ -294,4 +365,87 @@ impl ChunkReaderBuilder {
let file_ts_range = TimestampRange::new_inclusive(Some(start), Some(end));
file_ts_range.intersects(predicate)
}
/// Returns the time range of memtables to read.
fn compute_memtable_range(&self) -> Option<TimestampRange> {
let (min_timestamp, max_timestamp) = self
.memtables
.iter()
.filter(|m| m.num_rows() > 0) // Skip empty memtables.
.map(|m| {
let stats = m.stats();
(stats.min_timestamp, stats.max_timestamp)
})
.reduce(|acc, e| (acc.0.min(e.0), acc.1.max(e.1)))?;
logging::debug!(
"Compute memtable range, region_id: {}, min: {:?}, max: {:?}",
self.region_id,
min_timestamp,
max_timestamp,
);
Some(TimestampRange::new_inclusive(
Some(min_timestamp),
Some(max_timestamp),
))
}
/// Infer time window for chain reader according to the time range of memtables and files.
fn infer_window_for_chain_reader(&self, time_range: &TimestampRange) -> Vec<TimestampRange> {
let mut memtable_range = self.compute_memtable_range();
// file ranges: (start, end)
let mut file_ranges = Vec::with_capacity(self.files_to_read.len());
for file in &self.files_to_read {
if !Self::file_in_range(file, time_range) || file.time_range().is_none() {
continue;
}
// Safety: we have skipped files whose range is `None`.
let range = file.time_range().unwrap();
// Filter by memtable's time range.
if let Some(mem_range) = &mut memtable_range {
let file_range = TimestampRange::new_inclusive(Some(range.0), Some(range.1));
if mem_range.intersects(&file_range) {
// If the range of the SST intersects with the range of the
// memtable, we merge it into the memtable's range.
*mem_range = mem_range.or(&file_range);
continue;
}
}
file_ranges.push((range.0, range.1));
}
if file_ranges.is_empty() {
return memtable_range.map(|range| vec![range]).unwrap_or_default();
}
// Sort by start times.
file_ranges.sort_unstable_by(|left, right| left.0.cmp(&right.0));
// Compute ranges for all SSTs.
let mut time_ranges = Vec::with_capacity(file_ranges.len() + 1);
// Safety: file_ranges is not empty.
let mut prev =
TimestampRange::new_inclusive(Some(file_ranges[0].0), Some(file_ranges[0].1));
for file_range in &file_ranges[1..] {
let current = TimestampRange::new_inclusive(Some(file_range.0), Some(file_range.1));
if prev.intersects(&current) {
prev = prev.or(&current);
} else {
time_ranges.push(prev);
prev = current;
}
}
time_ranges.push(prev);
if let Some(mem_range) = memtable_range {
time_ranges.push(mem_range);
// We have pushed the memtable range, resort the array.
time_ranges.sort_unstable_by(|left, right| left.start().cmp(right.start()));
}
time_ranges
}
}
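The window inference above boils down to merging overlapping inclusive time ranges into disjoint windows (plus the special handling of the memtable range). A standalone sketch of just the merging step, over plain `(start, end)` second pairs rather than the project's `TimestampRange` type:

// Sort inclusive ranges by start, merge overlapping ones, and keep the rest
// disjoint so a chain reader could scan them one window at a time.
fn merge_ranges(mut ranges: Vec<(i64, i64)>) -> Vec<(i64, i64)> {
    ranges.sort_unstable_by_key(|r| r.0);
    let mut merged: Vec<(i64, i64)> = Vec::with_capacity(ranges.len());
    for (start, end) in ranges {
        match merged.last_mut() {
            // Overlapping (inclusive) with the previous window: widen it.
            Some(prev) if start <= prev.1 => prev.1 = prev.1.max(end),
            // Disjoint: start a new window.
            _ => merged.push((start, end)),
        }
    }
    merged
}

fn main() {
    // Two overlapping SST ranges collapse into one window, the third stays separate.
    assert_eq!(
        merge_ranges(vec![(0, 10), (5, 20), (30, 40)]),
        vec![(0, 20), (30, 40)]
    );
}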

View File

@@ -15,17 +15,182 @@
pub mod noop;
mod picker;
mod scheduler;
mod strategy;
mod task;
mod twcs;
mod writer;
use std::sync::Arc;
pub use picker::{Picker, PickerContext, SimplePicker};
use common_telemetry::tracing::log::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
pub use picker::{LeveledTimeWindowPicker, Picker, PickerContext};
pub use scheduler::{CompactionHandler, CompactionRequestImpl};
use store_api::logstore::LogStore;
use store_api::storage::CompactionStrategy;
pub use task::{CompactionTask, CompactionTaskImpl};
pub use twcs::TwcsPicker;
use crate::scheduler::Scheduler;
use crate::sst::FileHandle;
pub type CompactionPickerRef<S> =
Arc<dyn Picker<Request = CompactionRequestImpl<S>, Task = CompactionTaskImpl<S>> + Send + Sync>;
pub type CompactionSchedulerRef<S> =
Arc<dyn Scheduler<Request = CompactionRequestImpl<S>> + Send + Sync>;
/// Infers a suitable time bucket duration.
/// Currently it simply finds the max and min timestamps across all SSTs in the level and fits
/// the time span into a time bucket.
pub(crate) fn infer_time_bucket<'a>(files: impl Iterator<Item = &'a FileHandle>) -> i64 {
let mut max_ts = Timestamp::new(i64::MIN, TimeUnit::Second);
let mut min_ts = Timestamp::new(i64::MAX, TimeUnit::Second);
for f in files {
if let Some((start, end)) = f.time_range() {
min_ts = min_ts.min(*start);
max_ts = max_ts.max(*end);
} else {
// we don't expect an SST file without time range,
// it's either a bug or data corruption.
warn!("Found SST file without time range metadata: {f:?}");
}
}
// Safety: converting any timestamp into seconds will not cause overflow.
let min_sec = min_ts.convert_to(TimeUnit::Second).unwrap().value();
let max_sec = max_ts.convert_to(TimeUnit::Second).unwrap().value();
max_sec
.checked_sub(min_sec)
.map(|span| TIME_BUCKETS.fit_time_bucket(span)) // return the max bucket on subtraction overflow.
.unwrap_or_else(|| TIME_BUCKETS.max()) // safety: TIME_BUCKETS cannot be empty.
}
pub(crate) struct TimeBuckets([i64; 7]);
impl TimeBuckets {
/// Fits a given time span into a time bucket by finding the minimum bucket that can cover the span.
/// Returns the max bucket if no such bucket can be found.
fn fit_time_bucket(&self, span_sec: i64) -> i64 {
assert!(span_sec >= 0);
match self.0.binary_search(&span_sec) {
Ok(idx) => self.0[idx],
Err(idx) => {
if idx < self.0.len() {
self.0[idx]
} else {
self.0.last().copied().unwrap()
}
}
}
}
#[cfg(test)]
fn get(&self, idx: usize) -> i64 {
self.0[idx]
}
fn max(&self) -> i64 {
self.0.last().copied().unwrap()
}
}
/// A set of predefined time buckets.
pub(crate) const TIME_BUCKETS: TimeBuckets = TimeBuckets([
60 * 60, // one hour
2 * 60 * 60, // two hours
12 * 60 * 60, // twelve hours
24 * 60 * 60, // one day
7 * 24 * 60 * 60, // one week
365 * 24 * 60 * 60, // one year
10 * 365 * 24 * 60 * 60, // ten years
]);
pub fn compaction_strategy_to_picker<S: LogStore>(
strategy: &CompactionStrategy,
) -> CompactionPickerRef<S> {
match strategy {
CompactionStrategy::LeveledTimeWindow => {
Arc::new(LeveledTimeWindowPicker::default()) as Arc<_>
}
CompactionStrategy::Twcs(twcs_opts) => Arc::new(TwcsPicker::new(
twcs_opts.max_active_window_files,
twcs_opts.max_inactive_window_files,
twcs_opts.time_window_seconds,
)) as Arc<_>,
}
}
#[cfg(test)]
mod tests {
use common_time::Timestamp;
use super::*;
use crate::file_purger::noop::new_noop_file_purger;
use crate::sst::{FileHandle, FileId, FileMeta, Level};
/// Test util to create file handles.
pub fn new_file_handle(
file_id: FileId,
start_ts_millis: i64,
end_ts_millis: i64,
level: Level,
) -> FileHandle {
let file_purger = new_noop_file_purger();
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
FileHandle::new(
FileMeta {
region_id: 0,
file_id,
time_range: Some((
Timestamp::new_millisecond(start_ts_millis),
Timestamp::new_millisecond(end_ts_millis),
)),
level,
file_size: 0,
},
layer,
file_purger,
)
}
#[test]
fn test_time_bucket() {
assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(1));
assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(60 * 60));
assert_eq!(
TIME_BUCKETS.get(1),
TIME_BUCKETS.fit_time_bucket(60 * 60 + 1)
);
assert_eq!(
TIME_BUCKETS.get(2),
TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2) - 1)
);
assert_eq!(
TIME_BUCKETS.get(2),
TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2))
);
assert_eq!(
TIME_BUCKETS.get(3),
TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(3) - 1)
);
assert_eq!(TIME_BUCKETS.get(6), TIME_BUCKETS.fit_time_bucket(i64::MAX));
}
#[test]
fn test_infer_time_buckets() {
assert_eq!(
TIME_BUCKETS.get(0),
infer_time_bucket(
[
new_file_handle(FileId::random(), 0, TIME_BUCKETS.get(0) * 1000 - 1, 0),
new_file_handle(FileId::random(), 1, 10_000, 0)
]
.iter()
)
);
}
}

View File

@@ -12,30 +12,49 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::Duration;
use common_telemetry::tracing::log::warn;
use common_telemetry::{debug, error, info};
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use snafu::ResultExt;
use store_api::logstore::LogStore;
use crate::compaction::infer_time_bucket;
use crate::compaction::scheduler::CompactionRequestImpl;
use crate::compaction::strategy::{SimpleTimeWindowStrategy, StrategyRef};
use crate::compaction::task::{CompactionTask, CompactionTaskImpl};
use crate::error::TtlCalculationSnafu;
use crate::compaction::task::{CompactionOutput, CompactionTask, CompactionTaskImpl};
use crate::error::{Result, TtlCalculationSnafu};
use crate::scheduler::Request;
use crate::sst::{FileHandle, Level};
use crate::version::LevelMetasRef;
use crate::sst::{FileHandle, LevelMeta};
/// Picker picks input SST files and builds the compaction task.
/// Different compaction strategy may implement different pickers.
pub trait Picker: Send + 'static {
pub trait Picker: Debug + Send + 'static {
type Request: Request;
type Task: CompactionTask;
fn pick(&self, req: &Self::Request) -> crate::error::Result<Option<Self::Task>>;
fn pick(&self, req: &Self::Request) -> Result<Option<Self::Task>>;
}
pub(crate) fn get_expired_ssts(
levels: &[LevelMeta],
ttl: Option<Duration>,
now: Timestamp,
) -> Result<Vec<FileHandle>> {
let Some(ttl) = ttl else { return Ok(vec![]); };
let expire_time = now.sub_duration(ttl).context(TtlCalculationSnafu)?;
let expired_ssts = levels
.iter()
.flat_map(|l| l.get_expired_files(&expire_time).into_iter())
.collect();
Ok(expired_ssts)
}
pub struct PickerContext {
@@ -54,56 +73,40 @@ impl PickerContext {
}
}
/// L0 -> L1 compaction based on time windows.
pub struct SimplePicker<S> {
strategy: StrategyRef,
/// `LeveledTimeWindowPicker` only handles level 0 to level 1 compaction in a time-window tiered
/// manner. It picks all SSTs in level 0 and writes rows in these SSTs to a new file partitioned
/// by an inferred time bucket in level 1.
pub struct LeveledTimeWindowPicker<S> {
_phantom_data: PhantomData<S>,
}
impl<S> Default for SimplePicker<S> {
fn default() -> Self {
Self::new(Arc::new(SimpleTimeWindowStrategy {}))
impl<S> Debug for LeveledTimeWindowPicker<S> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "LeveledTimeWindowPicker{{..}}")
}
}
impl<S> SimplePicker<S> {
pub fn new(strategy: StrategyRef) -> Self {
impl<S> Default for LeveledTimeWindowPicker<S> {
fn default() -> Self {
Self::new()
}
}
impl<S> LeveledTimeWindowPicker<S> {
pub fn new() -> Self {
Self {
strategy,
_phantom_data: Default::default(),
}
}
fn get_expired_ssts(
&self,
levels: &LevelMetasRef,
ttl: Option<Duration>,
) -> crate::error::Result<Vec<FileHandle>> {
let Some(ttl) = ttl else { return Ok(vec![]); };
let expire_time = Timestamp::current_millis()
.sub_duration(ttl)
.context(TtlCalculationSnafu)?;
let mut expired_ssts = vec![];
for level in 0..levels.level_num() {
expired_ssts.extend(levels.level(level as Level).get_expired_files(&expire_time));
}
Ok(expired_ssts)
}
}
impl<S: LogStore> Picker for SimplePicker<S> {
impl<S: LogStore> Picker for LeveledTimeWindowPicker<S> {
type Request = CompactionRequestImpl<S>;
type Task = CompactionTaskImpl<S>;
fn pick(
&self,
req: &CompactionRequestImpl<S>,
) -> crate::error::Result<Option<CompactionTaskImpl<S>>> {
fn pick(&self, req: &CompactionRequestImpl<S>) -> Result<Option<CompactionTaskImpl<S>>> {
let levels = &req.levels();
let expired_ssts = self
.get_expired_ssts(levels, req.ttl)
let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())
.map_err(|e| {
error!(e;"Failed to get region expired SST files, region: {}, ttl: {:?}", req.region_id, req.ttl);
e
@@ -121,12 +124,16 @@ impl<S: LogStore> Picker for SimplePicker<S> {
let ctx = &PickerContext::with(req.compaction_time_window);
let mut outputs = vec![];
for level_num in 0..levels.level_num() {
let level = levels.level(level_num as u8);
let (compaction_time_window, outputs) = self.strategy.pick(ctx, level);
let compaction_time_window = Self::pick_level(ctx, level, &mut outputs);
if outputs.is_empty() {
debug!("No SST file can be compacted at level {}", level_num);
debug!(
"No SST file can be compacted at level {}, path: {:?}",
level_num, req.sst_layer
);
continue;
}
@@ -151,3 +158,272 @@ impl<S: LogStore> Picker for SimplePicker<S> {
Ok(None)
}
}
impl<S> LeveledTimeWindowPicker<S> {
fn pick_level(
ctx: &PickerContext,
level: &LevelMeta,
results: &mut Vec<CompactionOutput>,
) -> Option<i64> {
// This picker only handles level 0 to level 1 compaction.
if level.level() != 0 {
return None;
}
let files = find_compactable_files(level);
debug!("Compactable files found: {:?}", files);
if files.is_empty() {
return None;
}
let time_window = ctx.compaction_time_window().unwrap_or_else(|| {
let inferred = infer_time_bucket(files.iter());
debug!(
"Compaction window is not present, inferring from files: {:?}",
inferred
);
inferred
});
let buckets = calculate_time_buckets(time_window, &files);
debug!("File bucket:{}, file groups: {:?}", time_window, buckets);
results.extend(buckets.into_iter().map(|(bound, files)| CompactionOutput {
output_level: 1,
time_window_bound: bound,
time_window_sec: time_window,
inputs: files,
// A strict window is used here because rows in one file
// may get compacted to multiple destinations.
strict_window: true,
}));
Some(time_window)
}
}
/// Finds files that can be compacted in the given level.
/// Currently these are the files that are not already under compaction.
#[inline]
fn find_compactable_files(level: &LevelMeta) -> Vec<FileHandle> {
level.files().filter(|f| !f.compacting()).cloned().collect()
}
/// Calculates buckets for files. If a file does not contain a time range in its metadata, it will be
/// assigned to a special bucket `i64::MAX` (normally no timestamp can be aligned to this bucket)
/// so that all files without timestamps can be compacted together.
fn calculate_time_buckets(bucket_sec: i64, files: &[FileHandle]) -> HashMap<i64, Vec<FileHandle>> {
let mut buckets = HashMap::new();
for file in files {
if let Some((start, end)) = file.time_range() {
let bounds = file_time_bucket_span(
start.convert_to(TimeUnit::Second).unwrap().value(),
end.convert_to(TimeUnit::Second).unwrap().value(),
bucket_sec,
);
for bound in bounds {
buckets
.entry(bound)
.or_insert_with(Vec::new)
.push(file.clone());
}
} else {
warn!("Found corrupted SST without timestamp bounds: {:?}", file);
}
}
buckets
}
/// Calculates the bucket bounds spanned by the given start and end timestamps.
fn file_time_bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec<i64> {
assert!(start_sec <= end_sec);
// If the timestamp falls within `[i64::MIN, i64::MIN.align_by_bucket(bucket)]`, it cannot
// be aligned to a valid i64 bound, so we simply return `i64::MIN` rather than underflowing.
let mut start_aligned = start_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
let end_aligned = end_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
let mut res = Vec::with_capacity(((end_aligned - start_aligned) / bucket_sec + 1) as usize);
while start_aligned < end_aligned {
res.push(start_aligned);
start_aligned += bucket_sec;
}
res.push(end_aligned);
res
}
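The helper above relies on `align_by_bucket` to floor-align timestamps to their bucket's lower bound (the real helper returns an `Option` to guard against overflow near `i64::MIN`, which is why the code falls back to `i64::MIN`). A standalone sketch of the same arithmetic with plain integers, omitting the overflow guard:

// Floor-align a second-precision timestamp to its bucket's lower bound.
fn align_by_bucket(ts_sec: i64, bucket_sec: i64) -> i64 {
    // Floor division keeps negative timestamps in the correct bucket,
    // e.g. -1 with a 10s bucket aligns to -10, not 0.
    ts_sec.div_euclid(bucket_sec) * bucket_sec
}

// Enumerate the bucket lower bounds a [start, end] span touches.
fn bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec<i64> {
    assert!(start_sec <= end_sec && bucket_sec > 0);
    let mut bound = align_by_bucket(start_sec, bucket_sec);
    let end_bound = align_by_bucket(end_sec, bucket_sec);
    let mut bounds = Vec::new();
    while bound < end_bound {
        bounds.push(bound);
        bound += bucket_sec;
    }
    bounds.push(end_bound);
    bounds
}

fn main() {
    // Mirrors `test_time_bucket_span`: a file covering [1, 10] with a 10s
    // bucket spans the buckets starting at 0 and 10.
    assert_eq!(bucket_span(1, 10, 10), vec![0, 10]);
    assert_eq!(bucket_span(-10, -1, 10), vec![-10]);
}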
#[cfg(test)]
mod tests {
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use super::*;
use crate::compaction::tests::new_file_handle;
use crate::compaction::TIME_BUCKETS;
use crate::file_purger::noop::new_noop_file_purger;
use crate::sst::{FileId, Level, LevelMetas};
#[test]
fn test_time_bucket_span() {
assert_eq!(vec![0], file_time_bucket_span(1, 9, 10));
assert_eq!(vec![0, 10], file_time_bucket_span(1, 10, 10));
assert_eq!(vec![-10], file_time_bucket_span(-10, -1, 10));
assert_eq!(vec![-10, 0], file_time_bucket_span(-10, 0, 10));
}
#[test]
fn test_time_bucket_span_large() {
assert_eq!(
vec![
(i64::MAX - 10).align_by_bucket(10).unwrap(),
i64::MAX.align_by_bucket(10).unwrap(),
],
file_time_bucket_span(i64::MAX - 10, i64::MAX, 10)
);
// magic hmmm?
for bucket in 1..100 {
assert_eq!(
vec![
i64::MIN,
(i64::MIN + bucket).align_by_bucket(bucket).unwrap()
],
file_time_bucket_span(i64::MIN, i64::MIN + bucket, bucket)
);
}
}
fn new_file_handles(input: &[(FileId, i64, i64)]) -> Vec<FileHandle> {
input
.iter()
.map(|(file_id, start, end)| new_file_handle(*file_id, *start, *end, 0))
.collect()
}
fn check_bucket_calculation(
bucket_sec: i64,
files: Vec<FileHandle>,
expected: &[(i64, &[FileId])],
) {
let res = calculate_time_buckets(bucket_sec, &files);
let expected = expected
.iter()
.map(|(bucket, file_ids)| (*bucket, file_ids.iter().copied().collect::<HashSet<_>>()))
.collect::<HashMap<_, _>>();
for (bucket, file_ids) in expected {
let actual = res
.get(&bucket)
.unwrap()
.iter()
.map(|f| f.file_id())
.collect();
assert_eq!(
file_ids, actual,
"bucket: {bucket}, expected: {file_ids:?}, actual: {actual:?}",
);
}
}
#[test]
fn test_calculate_time_buckets() {
let file_id_a = FileId::random();
let file_id_b = FileId::random();
// simple case: files with disjoint time ranges
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 9000), (file_id_b, 10000, 19000)]),
&[(0, &[file_id_a]), (10, &[file_id_b])],
);
// files across buckets
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 10001), (file_id_b, 10000, 19000)]),
&[(0, &[file_id_a]), (10, &[file_id_a, file_id_b])],
);
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 10000)]),
&[(0, &[file_id_a]), (10, &[file_id_a])],
);
// file with a large time range
let file_id_array = &[file_id_a];
let expected = (0..(TIME_BUCKETS.get(4) / TIME_BUCKETS.get(0)))
.map(|b| (b * TIME_BUCKETS.get(0), file_id_array as _))
.collect::<Vec<_>>();
check_bucket_calculation(
TIME_BUCKETS.get(0),
new_file_handles(&[(file_id_a, 0, TIME_BUCKETS.get(4) * 1000)]),
&expected,
);
}
struct TtlTester {
files: Vec<(FileId, i64, i64, Level)>,
ttl: Option<Duration>,
expired: Vec<usize>,
now: Timestamp,
}
impl TtlTester {
fn check(&self) {
let expected_expired = self
.expired
.iter()
.map(|idx| self.files[*idx].0)
.collect::<HashSet<_>>();
let file_purger = new_noop_file_purger();
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
let file_handles = self
.files
.iter()
.map(|(file_id, start_ts, end_ts, level)| {
new_file_handle(*file_id, *start_ts, *end_ts, *level).meta()
})
.collect::<Vec<_>>();
let levels = LevelMetas::new(layer, file_purger).merge(
file_handles.into_iter(),
vec![].into_iter(),
None,
);
let expired = get_expired_ssts(levels.levels(), self.ttl, self.now)
.unwrap()
.into_iter()
.map(|f| f.file_id())
.collect::<HashSet<_>>();
assert_eq!(expected_expired, expired);
}
}
#[test]
fn test_find_expired_ssts() {
TtlTester {
files: vec![
(FileId::random(), 8000, 9000, 0),
(FileId::random(), 10000, 11000, 0),
(FileId::random(), 8000, 11000, 1),
(FileId::random(), 2000, 3000, 1),
],
ttl: Some(Duration::from_secs(1)),
expired: vec![3],
now: Timestamp::new_second(10),
}
.check();
TtlTester {
files: vec![
(FileId::random(), 8000, 8999, 0),
(FileId::random(), 10000, 11000, 0),
(FileId::random(), 8000, 11000, 1),
(FileId::random(), 2000, 3000, 1),
],
ttl: Some(Duration::from_secs(1)),
expired: vec![0, 3],
now: Timestamp::new_second(10),
}
.check();
}
}

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::marker::PhantomData;
use std::sync::Arc;
use std::time::Duration;
@@ -22,8 +23,8 @@ use store_api::storage::RegionId;
use tokio::sync::oneshot::Sender;
use tokio::sync::Notify;
use crate::compaction::picker::Picker;
use crate::compaction::task::CompactionTask;
use crate::compaction::CompactionPickerRef;
use crate::error::Result;
use crate::manifest::region::RegionManifest;
use crate::region::{RegionWriterRef, SharedDataRef};
@@ -63,7 +64,7 @@ pub struct CompactionRequestImpl<S: LogStore> {
pub compaction_time_window: Option<i64>,
/// Compaction result sender.
pub sender: Option<Sender<Result<()>>>,
pub picker: CompactionPickerRef<S>,
pub sst_write_buffer_size: ReadableSize,
}
@@ -79,18 +80,40 @@ impl<S: LogStore> CompactionRequestImpl<S> {
}
}
pub struct CompactionHandler<P> {
pub picker: P,
pub struct CompactionHandler<S: LogStore> {
_phantom_data: PhantomData<S>,
#[cfg(test)]
pub pending_tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
}
impl<S: LogStore> Default for CompactionHandler<S> {
fn default() -> Self {
Self {
_phantom_data: Default::default(),
#[cfg(test)]
pending_tasks: Arc::new(Default::default()),
}
}
}
impl<S: LogStore> CompactionHandler<S> {
#[cfg(test)]
pub fn new_with_pending_tasks(
tasks: Arc<tokio::sync::RwLock<Vec<tokio::task::JoinHandle<()>>>>,
) -> Self {
Self {
_phantom_data: Default::default(),
pending_tasks: tasks,
}
}
}
#[async_trait::async_trait]
impl<P> Handler for CompactionHandler<P>
impl<S> Handler for CompactionHandler<S>
where
P: Picker + Send + Sync,
S: LogStore,
{
type Request = P::Request;
type Request = CompactionRequestImpl<S>;
async fn handle_request(
&self,
@@ -99,7 +122,7 @@ where
finish_notifier: Arc<Notify>,
) -> Result<()> {
let region_id = req.key();
let Some(task) = self.picker.pick(&req)? else {
let Some(task) = req.picker.pick(&req)? else {
info!("No file needs compaction in region: {:?}", region_id);
req.complete(Ok(()));
return Ok(());

View File

@@ -1,327 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use common_telemetry::{debug, warn};
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use crate::compaction::picker::PickerContext;
use crate::compaction::task::CompactionOutput;
use crate::sst::{FileHandle, LevelMeta};
/// Compaction strategy that defines which SSTs need to be compacted at given level.
pub trait Strategy {
fn pick(&self, ctx: &PickerContext, level: &LevelMeta) -> (Option<i64>, Vec<CompactionOutput>);
}
pub type StrategyRef = Arc<dyn Strategy + Send + Sync>;
/// SimpleTimeWindowStrategy only handles level 0 to level 1 compaction in a time-window tiered
/// manner. It picks all SSTs in level 0 and writes rows in these SSTs to a new file partitioned
/// by an inferred time bucket in level 1.
pub struct SimpleTimeWindowStrategy {}
impl Strategy for SimpleTimeWindowStrategy {
fn pick(&self, ctx: &PickerContext, level: &LevelMeta) -> (Option<i64>, Vec<CompactionOutput>) {
// SimpleTimeWindowStrategy only handles level 0 to level 1 compaction.
if level.level() != 0 {
return (None, vec![]);
}
let files = find_compactable_files(level);
debug!("Compactable files found: {:?}", files);
if files.is_empty() {
return (None, vec![]);
}
let time_window = ctx.compaction_time_window().unwrap_or_else(|| {
let inferred = infer_time_bucket(&files);
debug!(
"Compaction window is not present, inferring from files: {:?}",
inferred
);
inferred
});
let buckets = calculate_time_buckets(time_window, &files);
debug!("File bucket:{}, file groups: {:?}", time_window, buckets);
(
Some(time_window),
buckets
.into_iter()
.map(|(bound, files)| CompactionOutput {
output_level: 1,
bucket_bound: bound,
bucket: time_window,
inputs: files,
})
.collect(),
)
}
}
/// Finds files that can be compacted in the given level.
/// Currently these are the files that are not already under compaction.
#[inline]
fn find_compactable_files(level: &LevelMeta) -> Vec<FileHandle> {
level.files().filter(|f| !f.compacting()).cloned().collect()
}
/// Calculates buckets for files. If a file does not contain a time range in metadata, it will be
/// assigned to a special bucket `i64::MAX` (normally no timestamp can be aligned to this bucket)
/// so that all files without timestamp can be compacted together.
fn calculate_time_buckets(bucket_sec: i64, files: &[FileHandle]) -> HashMap<i64, Vec<FileHandle>> {
let mut buckets = HashMap::new();
for file in files {
if let Some((start, end)) = file.time_range() {
let bounds = file_time_bucket_span(
start.convert_to(TimeUnit::Second).unwrap().value(),
end.convert_to(TimeUnit::Second).unwrap().value(),
bucket_sec,
);
for bound in bounds {
buckets
.entry(bound)
.or_insert_with(Vec::new)
.push(file.clone());
}
} else {
warn!("Found corrupted SST without timestamp bounds: {:?}", file);
}
}
buckets
}
/// Calculates timestamp span between start and end timestamp.
fn file_time_bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec<i64> {
assert!(start_sec <= end_sec);
// if timestamp is between `[i64::MIN, i64::MIN.align_by_bucket(bucket)]`, which cannot
// be aligned to a valid i64 bound, simply return `i64::MIN` rather than just underflow.
let mut start_aligned = start_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
let end_aligned = end_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN);
let mut res = Vec::with_capacity(((end_aligned - start_aligned) / bucket_sec + 1) as usize);
while start_aligned < end_aligned {
res.push(start_aligned);
start_aligned += bucket_sec;
}
res.push(end_aligned);
res
}
/// Infers the suitable time bucket duration.
/// Now it simply finds the max and min timestamps across all SSTs in the level and fits the time
/// span into a time bucket.
fn infer_time_bucket(files: &[FileHandle]) -> i64 {
let mut max_ts = &Timestamp::new(i64::MIN, TimeUnit::Second);
let mut min_ts = &Timestamp::new(i64::MAX, TimeUnit::Second);
for f in files {
if let Some((start, end)) = f.time_range() {
min_ts = min_ts.min(start);
max_ts = max_ts.max(end);
} else {
// we don't expect an SST file without time range,
// it's either a bug or data corruption.
warn!("Found SST file without time range metadata: {f:?}");
}
}
// safety: converting any timestamp to seconds will not overflow.
let min_sec = min_ts.convert_to(TimeUnit::Second).unwrap().value();
let max_sec = max_ts.convert_to(TimeUnit::Second).unwrap().value();
max_sec
.checked_sub(min_sec)
.map(fit_time_bucket) // return the max bucket on subtraction overflow.
.unwrap_or_else(|| *TIME_BUCKETS.last().unwrap()) // safety: TIME_BUCKETS cannot be empty.
}
/// A set of predefined time buckets.
const TIME_BUCKETS: [i64; 7] = [
60 * 60, // one hour
2 * 60 * 60, // two hours
12 * 60 * 60, // twelve hours
24 * 60 * 60, // one day
7 * 24 * 60 * 60, // one week
365 * 24 * 60 * 60, // one year
10 * 365 * 24 * 60 * 60, // ten years
];
/// Fits a given time span into a time bucket by finding the minimum bucket that can cover the span.
/// Returns the max bucket if no such bucket can be found.
fn fit_time_bucket(span_sec: i64) -> i64 {
assert!(span_sec >= 0);
for b in TIME_BUCKETS {
if b >= span_sec {
return b;
}
}
*TIME_BUCKETS.last().unwrap()
}
#[cfg(test)]
mod tests {
use std::collections::{HashMap, HashSet};
use super::*;
use crate::file_purger::noop::new_noop_file_purger;
use crate::sst::{FileId, FileMeta};
#[test]
fn test_time_bucket_span() {
assert_eq!(vec![0], file_time_bucket_span(1, 9, 10));
assert_eq!(vec![0, 10], file_time_bucket_span(1, 10, 10));
assert_eq!(vec![-10], file_time_bucket_span(-10, -1, 10));
assert_eq!(vec![-10, 0], file_time_bucket_span(-10, 0, 10));
}
#[test]
fn test_time_bucket_span_large() {
assert_eq!(
vec![
(i64::MAX - 10).align_by_bucket(10).unwrap(),
i64::MAX.align_by_bucket(10).unwrap(),
],
file_time_bucket_span(i64::MAX - 10, i64::MAX, 10)
);
// magic hmmm?
for bucket in 1..100 {
assert_eq!(
vec![
i64::MIN,
(i64::MIN + bucket).align_by_bucket(bucket).unwrap()
],
file_time_bucket_span(i64::MIN, i64::MIN + bucket, bucket)
);
}
}
#[test]
fn test_time_bucket() {
assert_eq!(TIME_BUCKETS[0], fit_time_bucket(1));
assert_eq!(TIME_BUCKETS[0], fit_time_bucket(60 * 60));
assert_eq!(TIME_BUCKETS[1], fit_time_bucket(60 * 60 + 1));
assert_eq!(TIME_BUCKETS[2], fit_time_bucket(TIME_BUCKETS[2] - 1));
assert_eq!(TIME_BUCKETS[2], fit_time_bucket(TIME_BUCKETS[2]));
assert_eq!(TIME_BUCKETS[3], fit_time_bucket(TIME_BUCKETS[3] - 1));
assert_eq!(TIME_BUCKETS[6], fit_time_bucket(i64::MAX));
}
#[test]
fn test_infer_time_buckets() {
assert_eq!(
TIME_BUCKETS[0],
infer_time_bucket(&[
new_file_handle(FileId::random(), 0, TIME_BUCKETS[0] * 1000 - 1),
new_file_handle(FileId::random(), 1, 10_000)
])
);
}
fn new_file_handle(file_id: FileId, start_ts_millis: i64, end_ts_millis: i64) -> FileHandle {
let file_purger = new_noop_file_purger();
let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {});
FileHandle::new(
FileMeta {
region_id: 0,
file_id,
time_range: Some((
Timestamp::new_millisecond(start_ts_millis),
Timestamp::new_millisecond(end_ts_millis),
)),
level: 0,
file_size: 0,
},
layer,
file_purger,
)
}
fn new_file_handles(input: &[(FileId, i64, i64)]) -> Vec<FileHandle> {
input
.iter()
.map(|(file_id, start, end)| new_file_handle(*file_id, *start, *end))
.collect()
}
fn check_bucket_calculation(
bucket_sec: i64,
files: Vec<FileHandle>,
expected: &[(i64, &[FileId])],
) {
let res = calculate_time_buckets(bucket_sec, &files);
let expected = expected
.iter()
.map(|(bucket, file_ids)| (*bucket, file_ids.iter().copied().collect::<HashSet<_>>()))
.collect::<HashMap<_, _>>();
for (bucket, file_ids) in expected {
let actual = res
.get(&bucket)
.unwrap()
.iter()
.map(|f| f.file_id())
.collect();
assert_eq!(
file_ids, actual,
"bucket: {bucket}, expected: {file_ids:?}, actual: {actual:?}",
);
}
}
#[test]
fn test_calculate_time_buckets() {
let file_id_a = FileId::random();
let file_id_b = FileId::random();
// simple case: files with disjoint time ranges
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 9000), (file_id_b, 10000, 19000)]),
&[(0, &[file_id_a]), (10, &[file_id_b])],
);
// files across buckets
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 10001), (file_id_b, 10000, 19000)]),
&[(0, &[file_id_a]), (10, &[file_id_a, file_id_b])],
);
check_bucket_calculation(
10,
new_file_handles(&[(file_id_a, 0, 10000)]),
&[(0, &[file_id_a]), (10, &[file_id_a])],
);
// file with a large time range
let file_id_array = &[file_id_a];
let expected = (0..(TIME_BUCKETS[4] / TIME_BUCKETS[0]))
.map(|b| (b * TIME_BUCKETS[0], file_id_array as _))
.collect::<Vec<_>>();
check_bucket_calculation(
TIME_BUCKETS[0],
new_file_handles(&[(file_id_a, 0, TIME_BUCKETS[4] * 1000)]),
&expected,
);
}
}

View File

@@ -169,13 +169,15 @@ impl<S: LogStore> CompactionTask for CompactionTaskImpl<S> {
#[derive(Debug)]
pub struct CompactionOutput {
/// Compaction output file level.
pub(crate) output_level: Level,
/// The left bound of time bucket.
pub(crate) bucket_bound: i64,
/// Bucket duration in seconds.
pub(crate) bucket: i64,
pub output_level: Level,
/// The left bound of time window.
pub time_window_bound: i64,
/// Time window size in seconds.
pub time_window_sec: i64,
/// Compaction input files.
pub(crate) inputs: Vec<FileHandle>,
pub inputs: Vec<FileHandle>,
/// If the compaction output is strictly windowed.
pub strict_window: bool,
}
impl CompactionOutput {
@@ -186,12 +188,21 @@ impl CompactionOutput {
sst_layer: AccessLayerRef,
sst_write_buffer_size: ReadableSize,
) -> Result<Option<FileMeta>> {
let time_range = if self.strict_window {
(
Some(self.time_window_bound),
Some(self.time_window_bound + self.time_window_sec),
)
} else {
(None, None)
};
let reader = build_sst_reader(
region_id,
schema,
sst_layer.clone(),
&self.inputs,
self.bucket_bound,
self.bucket_bound + self.bucket,
time_range,
)
.await?;
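
The new `strict_window` flag decides whether an output's rows are clipped to its own window. A minimal free-standing sketch of that mapping (the real code computes this inline inside `CompactionOutput`; the helper name below is hypothetical):

fn output_time_range(
    strict_window: bool,
    time_window_bound: i64,
    time_window_sec: i64,
) -> (Option<i64>, Option<i64>) {
    // Strict outputs read only [bound, bound + window); others read everything.
    if strict_window {
        (
            Some(time_window_bound),
            Some(time_window_bound + time_window_sec),
        )
    } else {
        (None, None)
    }
}

fn main() {
    assert_eq!((Some(3600), Some(7200)), output_time_range(true, 3600, 3600));
    assert_eq!((None, None), output_time_range(false, 3600, 3600));
}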

View File

@@ -0,0 +1,398 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Time-window compaction strategy
use std::collections::BTreeMap;
use std::fmt::{Debug, Formatter};
use std::marker::PhantomData;
use common_telemetry::tracing::warn;
use common_telemetry::{debug, info};
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use store_api::logstore::LogStore;
use crate::compaction::picker::get_expired_ssts;
use crate::compaction::task::CompactionOutput;
use crate::compaction::{infer_time_bucket, CompactionRequestImpl, CompactionTaskImpl, Picker};
use crate::sst::{FileHandle, LevelMeta};
/// `TwcsPicker` picks files whose max timestamps fall in the same time window as compaction
/// candidates.
pub struct TwcsPicker<S> {
max_active_window_files: usize,
max_inactive_window_files: usize,
time_window_seconds: Option<i64>,
_phantom_data: PhantomData<S>,
}
impl<S> Debug for TwcsPicker<S> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TwcsPicker")
.field("max_active_window_files", &self.max_active_window_files)
.field("max_inactive_window_files", &self.max_inactive_window_files)
.finish()
}
}
impl<S> TwcsPicker<S> {
pub fn new(
max_active_window_files: usize,
max_inactive_window_files: usize,
time_window_seconds: Option<i64>,
) -> Self {
Self {
max_inactive_window_files,
max_active_window_files,
_phantom_data: Default::default(),
time_window_seconds,
}
}
/// Builds compaction output from files.
/// For the active writing window, we allow at most `max_active_window_files` files to alleviate
/// fragmentation. For inactive windows, we allow at most `max_inactive_window_files` files per window.
fn build_output(
&self,
time_windows: &BTreeMap<i64, Vec<FileHandle>>,
active_window: Option<i64>,
window_size: i64,
) -> Vec<CompactionOutput> {
let mut output = vec![];
for (window, files) in time_windows {
if let Some(active_window) = active_window && *window == active_window {
if files.len() > self.max_active_window_files {
output.push(CompactionOutput {
output_level: 1, // we only have two levels and always compact to l1
time_window_bound: *window,
time_window_sec: window_size,
inputs: files.clone(),
// Strict window is not needed since we always compact many files to one
// single file in TWCS.
strict_window: false,
});
} else {
debug!("Active window not present or no enough files in active window {:?}", active_window);
}
} else {
// not active writing window
if files.len() > self.max_inactive_window_files {
output.push(CompactionOutput {
output_level: 1,
time_window_bound: *window,
time_window_sec: window_size,
inputs: files.clone(),
strict_window: false,
});
}
}
}
output
}
}
impl<S: LogStore> Picker for TwcsPicker<S> {
type Request = CompactionRequestImpl<S>;
type Task = CompactionTaskImpl<S>;
fn pick(&self, req: &Self::Request) -> crate::error::Result<Option<Self::Task>> {
let levels = req.levels();
let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())?;
if !expired_ssts.is_empty() {
info!(
"Expired SSTs in region {}: {:?}",
req.region_id, expired_ssts
);
// here we mark expired SSTs as compacting to avoid them being picked.
expired_ssts.iter().for_each(|f| f.mark_compacting(true));
}
let time_window_size = req
.compaction_time_window
.or(self.time_window_seconds)
.unwrap_or_else(|| {
let inferred = infer_time_bucket(req.levels().level(0).files());
info!(
"Compaction window for region {} is not present, inferring from files: {:?}",
req.region_id, inferred
);
inferred
});
// Find active window from files in level 0.
let active_window =
find_latest_window_in_seconds(levels.level(0).files(), time_window_size);
let windows = assign_to_windows(
levels.levels().iter().flat_map(LevelMeta::files),
time_window_size,
);
let outputs = self.build_output(&windows, active_window, time_window_size);
let task = CompactionTaskImpl {
schema: req.schema(),
sst_layer: req.sst_layer.clone(),
outputs,
writer: req.writer.clone(),
shared_data: req.shared.clone(),
wal: req.wal.clone(),
manifest: req.manifest.clone(),
expired_ssts,
sst_write_buffer_size: req.sst_write_buffer_size,
compaction_time_window: Some(time_window_size),
};
Ok(Some(task))
}
}
/// Assigns files to windows with predefined window size (in seconds) by their max timestamps.
fn assign_to_windows<'a>(
files: impl Iterator<Item = &'a FileHandle>,
time_window_size: i64,
) -> BTreeMap<i64, Vec<FileHandle>> {
let mut windows: BTreeMap<i64, Vec<FileHandle>> = BTreeMap::new();
// Iterate over all files and assign them to time windows according to their max timestamps
for file in files {
if let Some((_, end)) = file.time_range() {
let time_window = end
.convert_to(TimeUnit::Second)
.unwrap()
.value()
.align_to_ceil_by_bucket(time_window_size)
.unwrap_or(i64::MIN);
windows.entry(time_window).or_default().push(file.clone());
} else {
warn!("Unexpected file w/o timestamp: {:?}", file.file_id());
}
}
windows
}
/// Finds the latest active writing window among all files.
/// Returns `None` when there are no files or all files are corrupted.
fn find_latest_window_in_seconds<'a>(
files: impl Iterator<Item = &'a FileHandle>,
time_window_size: i64,
) -> Option<i64> {
let mut latest_timestamp = None;
for f in files {
if let Some((_, end)) = f.time_range() {
if let Some(latest) = latest_timestamp && end > latest {
latest_timestamp = Some(end);
} else {
latest_timestamp = Some(end);
}
} else {
warn!("Cannot find timestamp range of file: {}", f.file_id());
}
}
latest_timestamp
.and_then(|ts| ts.convert_to_ceil(TimeUnit::Second))
.and_then(|ts| ts.value().align_to_ceil_by_bucket(time_window_size))
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use log_store::NoopLogStore;
use super::*;
use crate::compaction::tests::new_file_handle;
use crate::sst::{FileId, Level};
#[test]
fn test_get_latest_window_in_seconds() {
assert_eq!(
Some(1),
find_latest_window_in_seconds([new_file_handle(FileId::random(), 0, 999, 0)].iter(), 1)
);
assert_eq!(
Some(1),
find_latest_window_in_seconds(
[new_file_handle(FileId::random(), 0, 1000, 0)].iter(),
1
)
);
assert_eq!(
Some(-9223372036854000),
find_latest_window_in_seconds(
[new_file_handle(FileId::random(), i64::MIN, i64::MIN + 1, 0)].iter(),
3600,
)
);
assert_eq!(
(i64::MAX / 10000000 + 1) * 10000,
find_latest_window_in_seconds(
[new_file_handle(FileId::random(), i64::MIN, i64::MAX, 0)].iter(),
10000,
)
.unwrap()
);
}
#[test]
fn test_assign_to_windows() {
let windows = assign_to_windows(
[
new_file_handle(FileId::random(), 0, 999, 0),
new_file_handle(FileId::random(), 0, 999, 0),
new_file_handle(FileId::random(), 0, 999, 0),
new_file_handle(FileId::random(), 0, 999, 0),
new_file_handle(FileId::random(), 0, 999, 0),
]
.iter(),
3,
);
assert_eq!(5, windows.get(&0).unwrap().len());
let files = [FileId::random(); 3];
let windows = assign_to_windows(
[
new_file_handle(files[0], -2000, -3, 0),
new_file_handle(files[1], 0, 2999, 0),
new_file_handle(files[2], 50, 10001, 0),
]
.iter(),
3,
);
assert_eq!(files[0], windows.get(&0).unwrap().get(0).unwrap().file_id());
assert_eq!(files[1], windows.get(&3).unwrap().get(0).unwrap().file_id());
assert_eq!(
files[2],
windows.get(&12).unwrap().get(0).unwrap().file_id()
);
}
struct CompactionPickerTestCase {
window_size: i64,
input_files: Vec<FileHandle>,
expected_outputs: Vec<ExpectedOutput>,
}
impl CompactionPickerTestCase {
fn check(&self) {
let windows = assign_to_windows(self.input_files.iter(), self.window_size);
let active_window =
find_latest_window_in_seconds(self.input_files.iter(), self.window_size);
let output = TwcsPicker::<NoopLogStore>::new(4, 1, None).build_output(
&windows,
active_window,
self.window_size,
);
let output = output
.iter()
.map(|o| {
let input_file_ids =
o.inputs.iter().map(|f| f.file_id()).collect::<HashSet<_>>();
(
input_file_ids,
o.output_level,
o.time_window_sec,
o.time_window_bound,
o.strict_window,
)
})
.collect::<Vec<_>>();
let expected = self
.expected_outputs
.iter()
.map(|o| {
let input_file_ids = o
.input_files
.iter()
.map(|idx| self.input_files[*idx].file_id())
.collect::<HashSet<_>>();
(
input_file_ids,
o.output_level,
o.time_window_sec,
o.time_window_bound,
o.strict_window,
)
})
.collect::<Vec<_>>();
assert_eq!(expected, output);
}
}
struct ExpectedOutput {
input_files: Vec<usize>,
output_level: Level,
time_window_sec: i64,
time_window_bound: i64,
strict_window: bool,
}
#[test]
fn test_build_twcs_output() {
let file_ids = (0..4).map(|_| FileId::random()).collect::<Vec<_>>();
CompactionPickerTestCase {
window_size: 3,
input_files: [
new_file_handle(file_ids[0], -2000, -3, 0),
new_file_handle(file_ids[1], -3000, -100, 0),
new_file_handle(file_ids[2], 0, 2999, 0), //active windows
new_file_handle(file_ids[3], 50, 2998, 0), //active windows
]
.to_vec(),
expected_outputs: vec![ExpectedOutput {
input_files: vec![0, 1],
output_level: 1,
time_window_sec: 3,
time_window_bound: 0,
strict_window: false,
}],
}
.check();
let file_ids = (0..6).map(|_| FileId::random()).collect::<Vec<_>>();
CompactionPickerTestCase {
window_size: 3,
input_files: [
new_file_handle(file_ids[0], -2000, -3, 0),
new_file_handle(file_ids[1], -3000, -100, 0),
new_file_handle(file_ids[2], 0, 2999, 0),
new_file_handle(file_ids[3], 50, 2998, 0),
new_file_handle(file_ids[4], 11, 2990, 0),
new_file_handle(file_ids[5], 50, 4998, 0),
]
.to_vec(),
expected_outputs: vec![
ExpectedOutput {
input_files: vec![0, 1],
output_level: 1,
time_window_sec: 3,
time_window_bound: 0,
strict_window: false,
},
ExpectedOutput {
input_files: vec![2, 3, 4],
output_level: 1,
time_window_sec: 3,
time_window_bound: 3,
strict_window: false,
},
],
}
.check();
}
}
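
The window assignment above boils down to ceil-aligning each file's max timestamp, converted to seconds, to a multiple of the window size. Below is a minimal standalone sketch of that bucketing math; the `align_to_ceil` helper is a hypothetical stand-in for `BucketAligned::align_to_ceil_by_bucket` and omits the overflow handling the real picker has.

// Ceil-align a timestamp (in seconds) to a window boundary: the smallest
// multiple of `window_sec` that is >= `value_sec`.
fn align_to_ceil(value_sec: i64, window_sec: i64) -> Option<i64> {
    assert!(window_sec > 0);
    let rem = value_sec.rem_euclid(window_sec);
    if rem == 0 {
        Some(value_sec)
    } else {
        value_sec.checked_add(window_sec - rem)
    }
}

fn main() {
    // Mirrors the expectations in `test_assign_to_windows` (window size = 3s).
    assert_eq!(Some(0), align_to_ceil(-1, 3)); // file ending just before 0 s -> window 0
    assert_eq!(Some(3), align_to_ceil(2, 3)); // file ending at 2999 ms -> window 3
    assert_eq!(Some(12), align_to_ceil(10, 3)); // file ending at 10001 ms -> window 12
    println!("files map to the expected windows");
}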

View File

@@ -16,6 +16,7 @@ use common_query::logical_plan::{DfExpr, Expr};
use common_time::timestamp::TimeUnit;
use datafusion_expr::Operator;
use datatypes::value::timestamp_to_scalar_value;
use store_api::storage::RegionId;
use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl};
use crate::error;
@@ -24,11 +25,11 @@ use crate::sst::{AccessLayerRef, FileHandle};
/// Builds an SST reader that only reads rows within the given time range.
pub(crate) async fn build_sst_reader(
region_id: RegionId,
schema: RegionSchemaRef,
sst_layer: AccessLayerRef,
files: &[FileHandle],
lower_sec_inclusive: i64,
upper_sec_exclusive: i64,
time_range: (Option<i64>, Option<i64>),
) -> error::Result<ChunkReaderImpl> {
// TODO(hl): Schemas in different SSTs may differ, thus we should infer
// timestamp column name from Parquet metadata.
@@ -38,17 +39,12 @@ pub(crate) async fn build_sst_reader(
let ts_col_unit = ts_col.data_type.as_timestamp().unwrap().unit();
let ts_col_name = ts_col.name.clone();
ChunkReaderBuilder::new(schema, sst_layer)
ChunkReaderBuilder::new(region_id, schema, sst_layer)
.pick_ssts(files)
.filters(
build_time_range_filter(
lower_sec_inclusive,
upper_sec_exclusive,
&ts_col_name,
ts_col_unit,
)
.into_iter()
.collect(),
build_time_range_filter(time_range, &ts_col_name, ts_col_unit)
.into_iter()
.collect(),
)
.build()
.await
@@ -57,21 +53,22 @@ pub(crate) async fn build_sst_reader(
/// Builds a time range filter expr from the lower (inclusive) and upper (exclusive) bounds.
/// Returns `None` if time range overflows.
fn build_time_range_filter(
low_sec: i64,
high_sec: i64,
time_range: (Option<i64>, Option<i64>),
ts_col_name: &str,
ts_col_unit: TimeUnit,
) -> Option<Expr> {
debug_assert!(low_sec <= high_sec);
let (low_ts_inclusive, high_ts_exclusive) = time_range;
let ts_col = DfExpr::Column(datafusion_common::Column::from_name(ts_col_name));
// Converting seconds to any timestamp unit won't lose precision;
// we only need to handle overflow here.
let low_ts = common_time::Timestamp::new_second(low_sec)
.convert_to(ts_col_unit)
let low_ts = low_ts_inclusive
.map(common_time::Timestamp::new_second)
.and_then(|ts| ts.convert_to(ts_col_unit))
.map(|ts| ts.value());
let high_ts = common_time::Timestamp::new_second(high_sec)
.convert_to(ts_col_unit)
let high_ts = high_ts_exclusive
.map(common_time::Timestamp::new_second)
.and_then(|ts| ts.convert_to(ts_col_unit))
.map(|ts| ts.value());
let expr = match (low_ts, high_ts) {
@@ -139,6 +136,8 @@ mod tests {
use crate::sst::{self, FileId, FileMeta, FsAccessLayer, Source, SstInfo, WriteOptions};
use crate::test_util::descriptor_util::RegionDescBuilder;
const REGION_ID: RegionId = 1;
fn schema_for_test() -> RegionSchemaRef {
// Just build a region desc and use its columns metadata.
let desc = RegionDescBuilder::new("test")
@@ -277,7 +276,9 @@ mod tests {
handle
}
// The region id is only used to build the reader, we don't check its content.
async fn check_reads(
region_id: RegionId,
schema: RegionSchemaRef,
sst_layer: AccessLayerRef,
files: &[FileHandle],
@@ -286,11 +287,11 @@ mod tests {
expect: &[i64],
) {
let mut reader = build_sst_reader(
region_id,
schema,
sst_layer,
files,
lower_sec_inclusive,
upper_sec_exclusive,
(Some(lower_sec_inclusive), Some(upper_sec_exclusive)),
)
.await
.unwrap();
@@ -352,6 +353,7 @@ mod tests {
let files = vec![file1, file2];
// read from two sst files with time range filter,
check_reads(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&files,
@@ -361,7 +363,7 @@ mod tests {
)
.await;
check_reads(schema, sst_layer, &files, 1, 2, &[1000]).await;
check_reads(REGION_ID, schema, sst_layer, &files, 1, 2, &[1000]).await;
}
async fn read_file(
@@ -370,9 +372,15 @@ mod tests {
sst_layer: AccessLayerRef,
) -> Vec<i64> {
let mut timestamps = vec![];
let mut reader = build_sst_reader(schema, sst_layer, files, i64::MIN, i64::MAX)
.await
.unwrap();
let mut reader = build_sst_reader(
REGION_ID,
schema,
sst_layer,
files,
(Some(i64::MIN), Some(i64::MAX)),
)
.await
.unwrap();
while let Some(chunk) = reader.next_chunk().await.unwrap() {
let ts = chunk.columns[0]
.as_any()
@@ -434,15 +442,33 @@ mod tests {
let sst_layer = Arc::new(FsAccessLayer::new("./", object_store.clone()));
let input_files = vec![file2, file1];
let reader1 = build_sst_reader(schema.clone(), sst_layer.clone(), &input_files, 0, 3)
.await
.unwrap();
let reader2 = build_sst_reader(schema.clone(), sst_layer.clone(), &input_files, 3, 6)
.await
.unwrap();
let reader3 = build_sst_reader(schema.clone(), sst_layer.clone(), &input_files, 6, 10)
.await
.unwrap();
let reader1 = build_sst_reader(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&input_files,
(Some(0), Some(3)),
)
.await
.unwrap();
let reader2 = build_sst_reader(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&input_files,
(Some(3), Some(6)),
)
.await
.unwrap();
let reader3 = build_sst_reader(
REGION_ID,
schema.clone(),
sst_layer.clone(),
&input_files,
(Some(6), Some(10)),
)
.await
.unwrap();
let opts = WriteOptions {
sst_write_buffer_size: ReadableSize::mb(8),
@@ -525,7 +551,12 @@ mod tests {
#[test]
fn test_build_time_range_filter() {
assert!(build_time_range_filter(i64::MIN, i64::MAX, "ts", TimeUnit::Nanosecond).is_none());
assert!(build_time_range_filter(
(Some(i64::MIN), Some(i64::MAX)),
"ts",
TimeUnit::Nanosecond
)
.is_none());
assert_eq!(
Expr::from(datafusion_expr::binary_expr(
@@ -533,10 +564,10 @@ mod tests {
Operator::Lt,
datafusion_expr::lit(timestamp_to_scalar_value(
TimeUnit::Nanosecond,
Some(TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64)
))
Some(TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64),
)),
)),
build_time_range_filter(i64::MIN, 1, "ts", TimeUnit::Nanosecond).unwrap()
build_time_range_filter((Some(i64::MIN), Some(1)), "ts", TimeUnit::Nanosecond).unwrap()
);
assert_eq!(
@@ -547,10 +578,10 @@ mod tests {
TimeUnit::Nanosecond,
Some(
2 * TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64
)
))
),
)),
)),
build_time_range_filter(2, i64::MAX, "ts", TimeUnit::Nanosecond).unwrap()
build_time_range_filter((Some(2), Some(i64::MAX)), "ts", TimeUnit::Nanosecond).unwrap()
);
}
}
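
Switching from two required second bounds to an `(Option<i64>, Option<i64>)` pair keeps the same half-open semantics while letting either side be unbounded, which is what non-strict-window compaction outputs need. A small standalone sketch of that contract follows; the real code turns the bounds into DataFusion filter expressions rather than evaluating a predicate directly.

// Half-open range check mirroring the `time_range` argument of `build_sst_reader`:
// the lower bound is inclusive, the upper bound exclusive, and `None` means
// unbounded on that side. All values are in seconds.
fn in_time_range(ts_sec: i64, time_range: (Option<i64>, Option<i64>)) -> bool {
    let (low_inclusive, high_exclusive) = time_range;
    low_inclusive.map_or(true, |low| ts_sec >= low)
        && high_exclusive.map_or(true, |high| ts_sec < high)
}

fn main() {
    assert!(in_time_range(5, (Some(0), Some(10))));
    assert!(!in_time_range(10, (Some(0), Some(10)))); // upper bound is exclusive
    assert!(in_time_range(i64::MIN, (None, None))); // unbounded on both sides
    println!("time range semantics hold");
}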

View File

@@ -23,8 +23,8 @@ use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::manifest::Manifest;
use store_api::storage::{
CloseContext, CloseOptions, CreateOptions, EngineContext, OpenOptions, Region,
RegionDescriptor, StorageEngine,
CloseContext, CloseOptions, CompactionStrategy, CreateOptions, EngineContext, OpenOptions,
Region, RegionDescriptor, StorageEngine,
};
use crate::compaction::CompactionSchedulerRef;
@@ -395,6 +395,7 @@ impl<S: LogStore> EngineInner<S> {
name,
&self.config,
opts.ttl,
opts.compaction_strategy.clone(),
)
.await?;
@@ -440,6 +441,7 @@ impl<S: LogStore> EngineInner<S> {
&region_name,
&self.config,
opts.ttl,
opts.compaction_strategy.clone(),
)
.await?;
@@ -471,6 +473,7 @@ impl<S: LogStore> EngineInner<S> {
region_name: &str,
config: &EngineConfig,
region_ttl: Option<Duration>,
compaction_strategy: CompactionStrategy,
) -> Result<StoreConfig<S>> {
let parent_dir = util::normalize_dir(parent_dir);
@@ -503,6 +506,7 @@ impl<S: LogStore> EngineInner<S> {
ttl,
write_buffer_size: write_buffer_size
.unwrap_or(self.config.region_write_buffer_size.as_bytes() as usize),
compaction_strategy,
})
}

View File

@@ -25,7 +25,7 @@ use store_api::storage::{RegionId, SequenceNumber};
use tokio::sync::oneshot::{Receiver, Sender};
use tokio::sync::{oneshot, Notify};
use crate::compaction::{CompactionRequestImpl, CompactionSchedulerRef};
use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
use crate::config::EngineConfig;
use crate::engine::RegionMap;
use crate::error::{
@@ -109,6 +109,7 @@ pub struct FlushRegionRequest<S: LogStore> {
pub ttl: Option<Duration>,
/// Time window for compaction.
pub compaction_time_window: Option<i64>,
pub compaction_picker: CompactionPickerRef<S>,
}
impl<S: LogStore> FlushRegionRequest<S> {
@@ -146,6 +147,7 @@ impl<S: LogStore> From<&FlushRegionRequest<S>> for CompactionRequestImpl<S> {
ttl: req.ttl,
compaction_time_window: req.compaction_time_window,
sender: None,
picker: req.compaction_picker.clone(),
sst_write_buffer_size: req.engine_config.sst_write_buffer_size,
}
}

View File

@@ -75,9 +75,7 @@ pub trait Memtable: Send + Sync + fmt::Debug {
/// Iterates the memtable.
fn iter(&self, ctx: IterContext) -> Result<BoxedBatchIterator>;
/// Returns the estimated bytes allocated by this memtable from heap. Result
/// of this method may be larger than the estimated based on [`num_rows`] because
/// of the implementor's pre-alloc behavior.
/// Returns the number of rows in the memtable.
fn num_rows(&self) -> usize;
/// Returns stats of this memtable.

View File

@@ -14,9 +14,10 @@
//! Common structs and utilities for read.
mod chain;
mod dedup;
mod merge;
pub(crate) mod windowed;
mod windowed;
use std::cmp::Ordering;
@@ -25,11 +26,13 @@ use common_base::BitVec;
use datatypes::data_type::DataType;
use datatypes::prelude::ConcreteDataType;
use datatypes::vectors::{BooleanVector, MutableVector, VectorRef};
pub use dedup::DedupReader;
pub use merge::{MergeReader, MergeReaderBuilder};
use snafu::{ensure, ResultExt};
use crate::error::{self, Result};
pub use crate::read::chain::ChainReader;
pub use crate::read::dedup::DedupReader;
pub use crate::read::merge::{MergeReader, MergeReaderBuilder};
pub use crate::read::windowed::WindowedReader;
/// Storage internal representation of a batch of rows.
// Now the structure of `Batch` is still unstable, all pub fields may be changed.

View File

@@ -0,0 +1,124 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::error::Result;
use crate::read::{Batch, BatchReader};
use crate::schema::ProjectedSchemaRef;
/// A reader that simply chains the outputs of the input readers.
pub struct ChainReader<R> {
/// Schema to read
pub schema: ProjectedSchemaRef,
/// Each reader reads a slice of a time window
pub readers: Vec<R>,
}
impl<R> ChainReader<R> {
/// Returns a new [ChainReader] with specific input `readers`.
pub fn new(schema: ProjectedSchemaRef, mut readers: Vec<R>) -> Self {
// Reverse readers since we iter them backward.
readers.reverse();
Self { schema, readers }
}
}
#[async_trait::async_trait]
impl<R> BatchReader for ChainReader<R>
where
R: BatchReader,
{
async fn next_batch(&mut self) -> Result<Option<Batch>> {
while let Some(reader) = self.readers.last_mut() {
if let Some(batch) = reader.next_batch().await? {
return Ok(Some(batch));
} else {
// Remove the exhausted reader.
self.readers.pop();
}
}
Ok(None)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::test_util::read_util::{self, Batches, VecBatchReader};
fn build_chain_reader(sources: &[Batches]) -> ChainReader<VecBatchReader> {
let schema = read_util::new_projected_schema();
let readers = sources
.iter()
.map(|source| read_util::build_vec_reader(source))
.collect();
ChainReader::new(schema, readers)
}
async fn check_chain_reader_result(
mut reader: ChainReader<VecBatchReader>,
input: &[Batches<'_>],
) {
let expect: Vec<_> = input
.iter()
.flat_map(|v| v.iter())
.flat_map(|v| v.iter().copied())
.collect();
let result = read_util::collect_kv_batch(&mut reader).await;
assert_eq!(expect, result);
// Calling next_batch() again is allowed.
assert!(reader.next_batch().await.unwrap().is_none());
}
#[tokio::test]
async fn test_chain_empty() {
let mut reader = build_chain_reader(&[]);
assert!(reader.next_batch().await.unwrap().is_none());
// Calling next_batch() again is allowed.
assert!(reader.next_batch().await.unwrap().is_none());
}
#[tokio::test]
async fn test_chain_one() {
let input: &[Batches] = &[&[
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (4, Some(4))],
&[(5, Some(5))],
]];
let reader = build_chain_reader(input);
check_chain_reader_result(reader, input).await;
}
#[tokio::test]
async fn test_chain_multi() {
let input: &[Batches] = &[
&[
&[(1, Some(1)), (2, Some(2))],
&[(3, Some(3)), (4, Some(4))],
&[(5, Some(5))],
],
&[&[(6, Some(3)), (7, Some(4)), (8, Some(8))], &[(9, Some(9))]],
&[&[(10, Some(10)), (11, Some(11))], &[(12, Some(12))]],
];
let reader = build_chain_reader(input);
check_chain_reader_result(reader, input).await;
}
}

View File

@@ -608,7 +608,7 @@ mod tests {
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
use super::*;
use crate::test_util::read_util;
use crate::test_util::read_util::{self, Batches};
#[tokio::test]
async fn test_merge_reader_empty() {
@@ -653,8 +653,6 @@ mod tests {
assert!(output.contains("pos: 1"));
}
type Batches<'a> = &'a [&'a [(i64, Option<i64>)]];
fn build_merge_reader(sources: &[Batches], num_iter: usize, batch_size: usize) -> MergeReader {
let schema = read_util::new_projected_schema();
let mut builder =

View File

@@ -32,11 +32,13 @@ use store_api::manifest::{
self, Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator,
};
use store_api::storage::{
AlterRequest, CloseContext, FlushContext, FlushReason, OpenOptions, ReadContext, Region,
RegionId, SequenceNumber, WriteContext, WriteResponse,
AlterRequest, CloseContext, CompactionStrategy, FlushContext, FlushReason, OpenOptions,
ReadContext, Region, RegionId, SequenceNumber, WriteContext, WriteResponse,
};
use crate::compaction::CompactionSchedulerRef;
use crate::compaction::{
compaction_strategy_to_picker, CompactionPickerRef, CompactionSchedulerRef,
};
use crate::config::EngineConfig;
use crate::error::{self, Error, Result};
use crate::file_purger::FilePurgerRef;
@@ -164,6 +166,7 @@ pub struct StoreConfig<S: LogStore> {
pub file_purger: FilePurgerRef,
pub ttl: Option<Duration>,
pub write_buffer_size: usize,
pub compaction_strategy: CompactionStrategy,
}
pub type RecoveredMetadata = (SequenceNumber, (ManifestVersion, RawRegionMetadata));
@@ -252,6 +255,7 @@ impl<S: LogStore> RegionImpl<S> {
flush_strategy: store_config.flush_strategy,
flush_scheduler: store_config.flush_scheduler,
compaction_scheduler: store_config.compaction_scheduler,
compaction_picker: compaction_strategy_to_picker(&store_config.compaction_strategy),
sst_layer: store_config.sst_layer,
manifest: store_config.manifest,
});
@@ -336,6 +340,8 @@ impl<S: LogStore> RegionImpl<S> {
store_config.ttl,
store_config.write_buffer_size,
));
let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy);
let writer_ctx = WriterContext {
shared: &shared,
flush_strategy: &store_config.flush_strategy,
@@ -345,6 +351,7 @@ impl<S: LogStore> RegionImpl<S> {
wal: &wal,
writer: &writer,
manifest: &store_config.manifest,
compaction_picker: compaction_picker.clone(),
};
// Replay all unflushed data.
writer
@@ -364,6 +371,7 @@ impl<S: LogStore> RegionImpl<S> {
flush_strategy: store_config.flush_strategy,
flush_scheduler: store_config.flush_scheduler,
compaction_scheduler: store_config.compaction_scheduler,
compaction_picker,
sst_layer: store_config.sst_layer,
manifest: store_config.manifest,
});
@@ -586,6 +594,7 @@ impl<S: LogStore> RegionImpl<S> {
wal: &inner.wal,
writer: &inner.writer,
manifest: &inner.manifest,
compaction_picker: inner.compaction_picker.clone(),
};
inner.writer.replay(recovered_metadata, writer_ctx).await
@@ -642,6 +651,7 @@ struct RegionInner<S: LogStore> {
flush_strategy: FlushStrategyRef,
flush_scheduler: FlushSchedulerRef<S>,
compaction_scheduler: CompactionSchedulerRef<S>,
compaction_picker: CompactionPickerRef<S>,
sst_layer: AccessLayerRef,
manifest: RegionManifest,
}
@@ -685,6 +695,7 @@ impl<S: LogStore> RegionInner<S> {
wal: &self.wal,
writer: &self.writer,
manifest: &self.manifest,
compaction_picker: self.compaction_picker.clone(),
};
// The writer would also try to compat the schema of write batch if it finds out the
// schema version of request is less than current schema version.
@@ -746,6 +757,7 @@ impl<S: LogStore> RegionInner<S> {
wal: &self.wal,
writer: &self.writer,
manifest: &self.manifest,
compaction_picker: self.compaction_picker.clone(),
};
self.writer.flush(writer_ctx, ctx).await
}
@@ -761,6 +773,7 @@ impl<S: LogStore> RegionInner<S> {
wal: &self.wal,
writer: &self.writer,
manifest: &self.manifest,
compaction_picker: self.compaction_picker.clone(),
};
self.writer.compact(writer_ctx, ctx).await
}

View File

@@ -559,6 +559,7 @@ async fn create_store_config(region_name: &str, root: &str) -> StoreConfig<NoopL
file_purger,
ttl: None,
write_buffer_size: ReadableSize::mb(32).0 as usize,
compaction_strategy: Default::default(),
}
}

View File

@@ -26,7 +26,7 @@ use object_store::ObjectStore;
use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region};
use tokio::sync::{Notify, RwLock};
use crate::compaction::{CompactionHandler, SimplePicker};
use crate::compaction::CompactionHandler;
use crate::config::EngineConfig;
use crate::error::Result;
use crate::file_purger::{FilePurgeHandler, FilePurgeRequest};
@@ -93,13 +93,8 @@ async fn create_region_for_compaction<
store_config.engine_config = Arc::new(engine_config);
store_config.flush_strategy = flush_strategy;
let picker = SimplePicker::default();
let pending_compaction_tasks = Arc::new(RwLock::new(vec![]));
let handler = CompactionHandler {
picker,
#[cfg(test)]
pending_tasks: pending_compaction_tasks.clone(),
};
let handler = CompactionHandler::new_with_pending_tasks(pending_compaction_tasks.clone());
let config = SchedulerConfig::default();
// Overwrite test compaction scheduler and file purger.
store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));
@@ -262,12 +257,7 @@ impl CompactionTester {
store_config.engine_config = Arc::new(self.engine_config.clone());
store_config.flush_strategy = self.flush_strategy.clone();
let picker = SimplePicker::default();
let handler = CompactionHandler {
picker,
#[cfg(test)]
pending_tasks: Arc::new(Default::default()),
};
let handler = CompactionHandler::new_with_pending_tasks(Arc::new(Default::default()));
let config = SchedulerConfig::default();
// Overwrite test compaction scheduler and file purger.
store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler));

View File

@@ -252,7 +252,7 @@ async fn test_flush_empty() {
}
#[tokio::test]
async fn test_read_after_flush() {
async fn test_read_after_flush_across_window() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("read-flush");
@@ -289,6 +289,44 @@ async fn test_read_after_flush() {
assert_eq!(expect, output);
}
#[tokio::test]
async fn test_read_after_flush_same_window() {
common_telemetry::init_default_ut_logging();
let dir = create_temp_dir("read-flush");
let store_dir = dir.path().to_str().unwrap();
let flush_switch = Arc::new(FlushSwitch::default());
let tester = FlushTester::new(store_dir, flush_switch.clone()).await;
// Put elements so we have content to flush.
tester.put(&[(1000, Some(100))]).await;
tester.put(&[(2000, Some(200))]).await;
// Flush.
tester.flush(None).await;
// Put element again.
tester.put(&[(1003, Some(300))]).await;
let expect = vec![
(1000, Some(100.to_string())),
(1003, Some(300.to_string())),
(2000, Some(200.to_string())),
];
let output = tester.full_scan().await;
assert_eq!(expect, output);
// Reopen
let mut tester = tester;
tester.reopen().await;
// Scan after reopen.
let output = tester.full_scan().await;
assert_eq!(expect, output);
}
#[tokio::test]
async fn test_merge_read_after_flush() {
let dir = create_temp_dir("merge-read-flush");

View File

@@ -27,7 +27,7 @@ use store_api::storage::{
};
use tokio::sync::{oneshot, Mutex};
use crate::compaction::{CompactionRequestImpl, CompactionSchedulerRef};
use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef};
use crate::config::EngineConfig;
use crate::error::{self, Result};
use crate::flush::{
@@ -412,6 +412,7 @@ pub struct WriterContext<'a, S: LogStore> {
pub wal: &'a Wal<S>,
pub writer: &'a RegionWriterRef,
pub manifest: &'a RegionManifest,
pub compaction_picker: CompactionPickerRef<S>,
}
impl<'a, S: LogStore> WriterContext<'a, S> {
@@ -779,6 +780,7 @@ impl WriterInner {
engine_config: self.engine_config.clone(),
ttl: self.ttl,
compaction_time_window: current_version.ssts().compaction_time_window(),
compaction_picker: ctx.compaction_picker.clone(),
};
let flush_handle = ctx
@@ -816,6 +818,7 @@ impl WriterInner {
ttl: self.ttl,
compaction_time_window,
sender: None,
picker: writer_ctx.compaction_picker.clone(),
sst_write_buffer_size,
};

View File

@@ -53,15 +53,19 @@ impl Snapshot for SnapshotImpl {
let mutables = memtable_version.mutable_memtable();
let immutables = memtable_version.immutable_memtables();
let mut builder =
ChunkReaderBuilder::new(self.version.schema().clone(), self.sst_layer.clone())
.reserve_num_memtables(memtable_version.num_memtables())
.projection(request.projection)
.filters(request.filters)
.batch_size(ctx.batch_size)
.output_ordering(request.output_ordering)
.visible_sequence(visible_sequence)
.pick_memtables(mutables.clone());
let mut builder = ChunkReaderBuilder::new(
self.version.metadata().id(),
self.version.schema().clone(),
self.sst_layer.clone(),
)
.reserve_num_memtables(memtable_version.num_memtables())
.projection(request.projection)
.filters(request.filters)
.batch_size(ctx.batch_size)
.output_ordering(request.output_ordering)
.visible_sequence(visible_sequence)
.pick_memtables(mutables.clone())
.use_chain_reader(true);
for memtable in immutables {
builder = builder.pick_memtables(memtable.clone());

View File

@@ -125,6 +125,7 @@ pub async fn new_store_config_with_object_store(
file_purger,
ttl: None,
write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE.as_bytes() as usize,
compaction_strategy: Default::default(),
},
regions,
)

View File

@@ -92,6 +92,8 @@ pub async fn collect_kv_batch(reader: &mut dyn BatchReader) -> Vec<(i64, Option<
result
}
pub type Batches<'a> = &'a [&'a [(i64, Option<i64>)]];
/// A test reader that pops batches from a `Vec`.
pub struct VecBatchReader {
schema: ProjectedSchemaRef,

View File

@@ -1,3 +1,4 @@
#![feature(let_chains)]
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");

View File

@@ -32,7 +32,10 @@ pub use datatypes::schema::{
pub use self::chunk::{Chunk, ChunkReader};
pub use self::descriptors::*;
pub use self::engine::{CloseOptions, CreateOptions, EngineContext, OpenOptions, StorageEngine};
pub use self::engine::{
CloseOptions, CompactionStrategy, CreateOptions, EngineContext, OpenOptions, StorageEngine,
TwcsOptions,
};
pub use self::metadata::RegionMeta;
pub use self::region::{CloseContext, FlushContext, FlushReason, Region, RegionStat, WriteContext};
pub use self::requests::{

View File

@@ -18,6 +18,7 @@
//! a [`StorageEngine`] instance manages a bunch of storage unit called [`Region`], which holds
//! chunks of rows, support operations like PUT/DELETE/SCAN.
use std::collections::HashMap;
use std::time::Duration;
use async_trait::async_trait;
@@ -26,6 +27,13 @@ use common_error::ext::ErrorExt;
use crate::storage::descriptors::RegionDescriptor;
use crate::storage::region::Region;
const COMPACTION_STRATEGY_KEY: &str = "compaction";
const COMPACTION_STRATEGY_LEVELED_TIME_WINDOW_VALUE: &str = "LTW";
const COMPACTION_STRATEGY_TWCS_VALUE: &str = "TWCS";
const TWCS_MAX_ACTIVE_WINDOW_FILES_KEY: &str = "compaction.twcs.max_active_window_files";
const TWCS_TIME_WINDOW_SECONDS_KEY: &str = "compaction.twcs.time_window_seconds";
const TWCS_MAX_INACTIVE_WINDOW_FILES_KEY: &str = "compaction.twcs.max_inactive_window_files";
/// Storage engine provides primitive operations to store and access data.
#[async_trait]
pub trait StorageEngine: Send + Sync + Clone + 'static {
@@ -92,6 +100,8 @@ pub struct CreateOptions {
pub write_buffer_size: Option<usize>,
/// Region SST files TTL
pub ttl: Option<Duration>,
/// Compaction strategy
pub compaction_strategy: CompactionStrategy,
}
/// Options to open a region.
@@ -103,6 +113,8 @@ pub struct OpenOptions {
pub write_buffer_size: Option<usize>,
/// Region SST files TTL
pub ttl: Option<Duration>,
/// Compaction strategy
pub compaction_strategy: CompactionStrategy,
}
/// Options to close a region.
@@ -111,3 +123,70 @@ pub struct CloseOptions {
/// Flush region
pub flush: bool,
}
/// Options for compactions
#[derive(Debug, Clone, Default)]
pub enum CompactionStrategy {
/// Leveled time window compaction strategy
#[default]
LeveledTimeWindow,
/// TWCS
Twcs(TwcsOptions),
}
/// TWCS compaction options.
#[derive(Debug, Clone)]
pub struct TwcsOptions {
/// Max num of files that can be kept in active writing time window.
pub max_active_window_files: usize,
/// Max num of files that can be kept in inactive time window.
pub max_inactive_window_files: usize,
/// Compaction time window defined when creating tables.
pub time_window_seconds: Option<i64>,
}
impl Default for TwcsOptions {
fn default() -> Self {
Self {
max_active_window_files: 4,
max_inactive_window_files: 1,
time_window_seconds: None,
}
}
}
impl From<&HashMap<String, String>> for CompactionStrategy {
fn from(opts: &HashMap<String, String>) -> Self {
let Some(strategy_name) = opts.get(COMPACTION_STRATEGY_KEY) else { return CompactionStrategy::default() };
if strategy_name.eq_ignore_ascii_case(COMPACTION_STRATEGY_LEVELED_TIME_WINDOW_VALUE) {
CompactionStrategy::LeveledTimeWindow
} else if strategy_name.eq_ignore_ascii_case(COMPACTION_STRATEGY_TWCS_VALUE) {
let mut twcs_opts = TwcsOptions::default();
if let Some(max_active_window_files) = opts
.get(TWCS_MAX_ACTIVE_WINDOW_FILES_KEY)
.and_then(|num| num.parse::<usize>().ok())
{
twcs_opts.max_active_window_files = max_active_window_files;
}
if let Some(max_inactive_window_files) = opts
.get(TWCS_MAX_INACTIVE_WINDOW_FILES_KEY)
.and_then(|num| num.parse::<usize>().ok())
{
twcs_opts.max_inactive_window_files = max_inactive_window_files;
}
if let Some(time_window) = opts
.get(TWCS_TIME_WINDOW_SECONDS_KEY)
.and_then(|num| num.parse::<i64>().ok()) && time_window > 0
{
twcs_opts.time_window_seconds = Some(time_window);
}
CompactionStrategy::Twcs(twcs_opts)
} else {
// unrecognized compaction strategy
CompactionStrategy::default()
}
}
}
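
With the option keys above, a table's string options are translated into a strategy through the `From<&HashMap<String, String>>` impl. A usage sketch, assuming the `store_api` crate is available as a dependency so `CompactionStrategy` can be imported:

use std::collections::HashMap;

use store_api::storage::CompactionStrategy;

fn main() {
    let mut opts = HashMap::new();
    // Select TWCS and override two of its defaults; missing or malformed
    // values fall back to `TwcsOptions::default()`.
    opts.insert("compaction".to_string(), "TWCS".to_string());
    opts.insert(
        "compaction.twcs.max_active_window_files".to_string(),
        "8".to_string(),
    );
    opts.insert(
        "compaction.twcs.time_window_seconds".to_string(),
        "3600".to_string(),
    );

    match CompactionStrategy::from(&opts) {
        CompactionStrategy::Twcs(twcs) => {
            assert_eq!(8, twcs.max_active_window_files);
            assert_eq!(1, twcs.max_inactive_window_files); // default kept
            assert_eq!(Some(3600), twcs.time_window_seconds);
        }
        CompactionStrategy::LeveledTimeWindow => unreachable!("TWCS was requested"),
    }
}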

View File

@@ -28,7 +28,7 @@ use table::engine::{EngineContext, TableEngineProcedureRef, TableEngineRef, Tabl
use table::requests::{CreateTableRequest, OpenTableRequest};
use crate::error::{
AccessCatalogSnafu, DeserializeProcedureSnafu, SchemaNotFoundSnafu, SerializeProcedureSnafu,
AccessCatalogSnafu, DeserializeProcedureSnafu, SerializeProcedureSnafu, TableExistsSnafu,
};
/// Procedure to create a table.
@@ -132,23 +132,24 @@ impl CreateTableProcedure {
}
async fn on_prepare(&mut self) -> Result<Status> {
if !self
let table_exists = self
.catalog_manager
.schema_exist(
.table_exist(
&self.data.request.catalog_name,
&self.data.request.schema_name,
&self.data.request.table_name,
)
.await
.context(AccessCatalogSnafu)?
{
logging::error!(
"Failed to create table {}, schema not found",
self.data.table_ref(),
);
return SchemaNotFoundSnafu {
name: &self.data.request.schema_name,
}
.fail()?;
.context(AccessCatalogSnafu)?;
if table_exists {
return if self.data.request.create_if_not_exists {
Ok(Status::Done)
} else {
TableExistsSnafu {
name: &self.data.request.table_name,
}
.fail()?
};
}
self.data.state = CreateTableState::EngineCreateTable;
@@ -168,8 +169,9 @@ impl CreateTableProcedure {
// do this check as we might not submitted the subprocedure yet when the manager
// recover this procedure from procedure store.
logging::info!(
"On engine create table {}, subprocedure not found, sub_id: {}",
"On engine create table {}, table_id: {}, subprocedure not found, sub_id: {}",
self.data.request.table_name,
self.data.request.id,
sub_id
);
@@ -195,8 +197,9 @@ impl CreateTableProcedure {
}),
ProcedureState::Done => {
logging::info!(
"On engine create table {}, done, sub_id: {}",
"On engine create table {}, table_id: {}, done, sub_id: {}",
self.data.request.table_name,
self.data.request.id,
sub_id
);
// The sub procedure is done, we can execute next step.

View File

@@ -23,7 +23,7 @@ use common_procedure::{
};
use common_telemetry::logging;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use snafu::{ensure, ResultExt};
use table::engine::{EngineContext, TableEngineProcedureRef, TableReference};
use table::requests::DropTableRequest;
@@ -122,18 +122,21 @@ impl DropTableProcedure {
async fn on_prepare(&mut self) -> Result<Status> {
let request = &self.data.request;
// Ensure the table exists.
let _ = self
let table_exists = self
.catalog_manager
.table(
.table_exist(
&request.catalog_name,
&request.schema_name,
&request.table_name,
)
.await
.context(AccessCatalogSnafu)?
.context(TableNotFoundSnafu {
.context(AccessCatalogSnafu)?;
ensure!(
table_exists,
TableNotFoundSnafu {
name: &request.table_name,
})?;
}
);
self.data.state = DropTableState::RemoveFromCatalog;

Some files were not shown because too many files have changed in this diff.