Compare commits

...

32 Commits

Author SHA1 Message Date
Ruihang Xia
2b2fd80bf4 feat: return new added columns in region server's extension response (#3533)
* feat: adapt the new proto response

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update interfaces

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* write columns to extension

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* use physical column's schema

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sort logical columns by name

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* format code

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* return physical table's column

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update src/common/meta/src/datanode_manager.rs

Co-authored-by: JeremyHi <jiachun_feng@proton.me>

* implement sort column logic

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* proxy create table procedure to create logical table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add unit test for sort_columns

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness cases

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: JeremyHi <jiachun_feng@proton.me>
2024-03-23 09:31:16 +00:00
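
The commit trail above mentions sorting the logical columns by name before returning them through the region server's extension response. A minimal, self-contained sketch of that sorting step (the `ColumnDef` struct and its fields are hypothetical stand-ins, not the actual greptime-proto types):

```rust
/// Hypothetical, simplified stand-in for a column definition; the real type
/// lives in greptime-proto and carries more fields.
#[derive(Debug, Clone)]
struct ColumnDef {
    name: String,
    datatype: String,
}

/// Sort logical columns by name so the extension response is deterministic,
/// mirroring the "sort logical columns by name" step listed above.
fn sort_columns(columns: &mut [ColumnDef]) {
    columns.sort_by(|a, b| a.name.cmp(&b.name));
}

fn main() {
    let mut cols = vec![
        ColumnDef { name: "host".into(), datatype: "string".into() },
        ColumnDef { name: "cpu".into(), datatype: "float64".into() },
    ];
    sort_columns(&mut cols);
    assert_eq!(cols[0].name, "cpu"); // "cpu" sorts before "host"
}
```
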
x³u³
24886b9530 test: add a parameter type mismatch test case to sql integration test (#3568) 2024-03-22 17:43:20 +00:00
tison
8345f1753c chore: avoid confusing TryFrom (#3565)
Signed-off-by: tison <wander4096@gmail.com>
2024-03-22 11:16:36 +00:00
tison
3420a010e6 refactor: reduce one clone by carefully passing the ready boundary (#3543)
* refactor: reduce one clone by carefully passing the ready boundary

Signed-off-by: tison <wander4096@gmail.com>

* defensive handle None

Signed-off-by: tison <wander4096@gmail.com>

* tidy code a bit

Signed-off-by: tison <wander4096@gmail.com>

* expect batch exists

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-22 04:46:17 +00:00
discord9
9f020aa414 fix(flow): Arrange get range with batch unaligned (#3552)
* fix: Arrange get range with batch unaligned

* chore: per review

* refactor: sort at apply_updates
2024-03-22 04:08:37 +00:00
tison
c9ac72e7f8 ci: use a PAT to list all writers (#3559)
Signed-off-by: tison <wander4096@gmail.com>
2024-03-21 20:25:01 -07:00
Lei, HUANG
86fb9d8ac7 refactor: remove redundant PromStoreProtocolHandler::write (#3553)
refactor: remove redundant PromStoreProtocolHandler::write API and rename PromStoreProtocolHandler::write_fast to write
2024-03-22 02:09:00 +00:00
Lei, HUANG
1f0fc40287 fix: performance degradation caused by config change (#3556) 2024-03-21 12:23:52 +00:00
tison
8b7a5aaa4a refactor: handle error for http format (#3548)
* refactor: handle error for http format

Signed-off-by: tison <wander4096@gmail.com>

* finish format handling

Signed-off-by: tison <wander4096@gmail.com>

* simplify auth error

Signed-off-by: tison <wander4096@gmail.com>

* fix

Signed-off-by: tison <wander4096@gmail.com>

* clippy format

Signed-off-by: tison <wander4096@gmail.com>

* no longer set greptime-db-format on influxdb error

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-21 07:29:11 +00:00
Weny Xu
856a4e1e4f refactor: refactor CacheInvalidator (#3550)
* refactor: refactor InvalidateCache Instruction

* refactor: refactor CacheInvalidator
2024-03-20 10:18:28 +00:00
Yingwen
39b69f1e3b refactor!: Renames the new memtable to PartitionTreeMemtable (#3547)
* refactor: rename mod merge_tree to partition_tree

* refactor: rename merge_tree

* refactor: change merge tree comment

* refactor: rename merge tree struct

* refactor: memtable options
2024-03-20 06:40:41 +00:00
tison
bbcdb28b7c chore: fix comment in fetch-dashboard-assets.sh (#3546) 2024-03-20 06:18:14 +00:00
YCCD
6377982501 feat: Able to pretty print sql query result in http output (#3539)
* feat: Able to pretty print sql query result in http output

* fix: add some tests

* fix: add some space, delete fn into_payload, and impl Display for TableResponse
2024-03-20 03:25:17 +00:00
Lei, HUANG
ddbcff68dd feat: support append-only mode in time-series memtable (#3540)
* feat: support append-only mode in time-series memtable

* fix: rename sort_and_dedup to sort
2024-03-19 20:37:54 +00:00
WU Jingdi
5b315c2d40 feat: support multi params in promql range function macro (#3464)
feat: support multi params in promql range function
2024-03-19 20:36:51 +00:00
Ruihang Xia
9816d2a08b fix: clone data instead of moving it - homemade future is dangerous (#3542)
* fix: clone data instead of moving it - homemade future is dangerous

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add comment

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-19 13:46:55 +00:00
Ning Sun
a99d6eb3f9 feat: update pgwire to 0.20 for improved performance (#3538) 2024-03-19 10:11:05 +00:00
discord9
2c115bc22a feat(flow): shared in-memory state for dataflow operator (#3508)
* feat: Arrangement shared state

* feat: arrange&tests

* docs: detailed&tests for get

* chore: license

* refactor: opt out ts expr&tests: internal ts

* docs: remove some TODOs

* feat: use smallvec size of 2

* refactor: per review

* chore: per review

* chore: per review

* chore: remove redundant clone

* feat: return max expire time&docs: more explain cur expire config
2024-03-19 10:03:05 +00:00
Yingwen
641592644d feat: support per table memtable options (#3524)
* feat: add memtable builder to region

* refactor: rename memtable_builder in worker to default_memtable_builder

* fix: return error instead of using default compaction options

Support deserializing memtable and compaction options from the option
map

* feat: optional memtable options

* feat: add MemtableBuilderProvider to create builders

* feat: change default memtable and skip deserializing dedup

* chore: update test and comment

* chore: test invalid type

* feat: metric engine use new memtable manually

* feat: expose more memtable configs

* feat: add memtable options to valid option list

* test: add test

* test: sqlness test

* chore: serde workspace

* chore: remove comments
2024-03-19 08:50:10 +00:00
Weny Xu
fa0f3555d4 refactor: introduce the DropTableExecutor (#3534)
* refactor: introduce the DropTableExecutor

* fix: register the dropping regions

* test: add tests for on_prepare

* chore: add TODO comment
2024-03-19 08:29:12 +00:00
ZonaHe
3cad844acd feat: update dashboard to v0.4.9 (#3531)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2024-03-19 03:18:42 +00:00
JeremyHi
cf25cf984b chore: avoid unnecessary cloning (#3537) 2024-03-18 13:24:13 +00:00
shuiyisong
3acd5bfad0 chore: http header with metrics (#3536)
* chore: bring write cost to output

* chore: add write cost to greptimev1result

* chore: add metrics to influxdb write resp header

* chore: add metrics to prom store

* chore: add metrics to otlp

* chore: add debug log

* fix: prom remote read with output

* fix: prom queries don't output metrics header

* chore: extract header value

* chore: refactor code

* chore: fix cr issue
2024-03-18 11:21:19 +00:00
Weny Xu
343525dab8 refactor: remove removed-prefixed keys (#3535) 2024-03-18 11:07:30 +00:00
tison
0afac58e4d feat(metasrv): implement maintenance (#3527)
* feat(metasrv): implement maintenance

Signed-off-by: tison <wander4096@gmail.com>

* fixup and test

Signed-off-by: tison <wander4096@gmail.com>

* Add coauthors

Co-authored-by: Yingwen <realevenyag@gmail.com>
Co-authored-by: xifyang <595482900@qq.com>

* tidy code

Signed-off-by: tison <wander4096@gmail.com>

* Apply suggestions from code review

Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>

* always read kv_backend maintenance state

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
Co-authored-by: xifyang <595482900@qq.com>
Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>
2024-03-18 09:41:14 +00:00
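
The maintenance switch added in #3527 is persisted in the kv backend and re-read on every check ("always read kv_backend maintenance state"). A minimal, self-contained sketch of that pattern, with a plain in-memory map standing in for the real kv backend and a hypothetical key name:

```rust
use std::collections::HashMap;

/// Stand-in for the metasrv kv backend; the real one is a distributed store.
struct KvBackend {
    data: HashMap<String, Vec<u8>>,
}

impl KvBackend {
    fn get(&self, key: &str) -> Option<&Vec<u8>> {
        self.data.get(key)
    }

    fn put(&mut self, key: &str, value: Vec<u8>) {
        self.data.insert(key.to_string(), value);
    }
}

/// Hypothetical key name; the actual key used by metasrv is not shown in this diff.
const MAINTENANCE_KEY: &str = "__maintenance";

/// Read the maintenance state from the backend on every call instead of caching
/// it, mirroring "always read kv_backend maintenance state" above.
fn under_maintenance(kv: &KvBackend) -> bool {
    kv.get(MAINTENANCE_KEY).is_some()
}

fn main() {
    let mut kv = KvBackend { data: HashMap::new() };
    assert!(!under_maintenance(&kv));

    // While maintenance is on, automatic actions (e.g. region failover)
    // could be paused until the flag is removed.
    kv.put(MAINTENANCE_KEY, Vec::new());
    assert!(under_maintenance(&kv));
}
```
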
tison
393ea44de0 docs: improve fn comments (#3526)
Signed-off-by: tison <wander4096@gmail.com>
2024-03-18 03:18:01 +00:00
tison
44731fd653 docs: readme style and project status (#3528)
* docs: readme style

Signed-off-by: tison <wander4096@gmail.com>

* more opening

Signed-off-by: tison <wander4096@gmail.com>

* Project Status

Signed-off-by: tison <wander4096@gmail.com>

* tidy

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-18 03:03:25 +00:00
tison
d36a5a74d3 ci: unassign issues stale 14 days ago (#3529)
This closes https://github.com/GreptimeTeam/greptimedb/issues/3525.
2024-03-18 03:01:10 +00:00
Yingwen
74862f8c3f feat(mito): Checks whether a region should flush periodically (#3459)
* feat: handle flush periodically

* chore: call periodical method in loop

* feat: check periodical tasks on channel timeout

* refactor: use time provider to get time

Mock a time provider to test auto flush

* chore: fix typos

* refactor: rename mock time provider

* style: fix clippy

* chore: address comment
2024-03-15 06:41:28 +00:00
Weny Xu
a52aedec5b feat: implement the drop database parser (#3521)
* refactor: refactor drop table parser

* feat: implement drop database parser

* fix: canonicalize name of create database

* test: update sqlness result

* Update src/operator/src/statement.rs

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>

---------

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-15 06:15:18 +00:00
tison
b6fac619a6 docs: revise README file (#3522)
* docs: revise README file

Signed-off-by: tison <wander4096@gmail.com>

* build prerequisite

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-15 04:22:35 +00:00
Weny Xu
a29e7ebb7d feat: acquire all locks in procedure (#3514)
* feat: acquire catalog and schema lock in region failover

* chore: remove unused code

* feat!: acquire catalog and schema lock in region migration

* feat: acquire catalog and schema lock in create table
2024-03-14 11:41:23 +00:00
184 changed files with 4214 additions and 1508 deletions

21
.github/workflows/unassign.yml vendored Normal file
View File

@@ -0,0 +1,21 @@
name: Auto Unassign
on:
schedule:
- cron: '4 2 * * *'
workflow_dispatch:
permissions:
contents: read
issues: write
pull-requests: write
jobs:
auto-unassign:
name: Auto Unassign
runs-on: ubuntu-latest
steps:
- name: Auto Unassign
uses: tisonspieces/auto-unassign@main
with:
token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
repository: ${{ github.repository }}

15
Cargo.lock generated
View File

@@ -863,6 +863,12 @@ version = "0.21.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
[[package]]
name = "base64"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51"
[[package]]
name = "base64ct"
version = "1.6.0"
@@ -3482,6 +3488,7 @@ dependencies = [
"serde_json",
"servers",
"session",
"smallvec",
"snafu",
"tokio",
"tonic 0.10.2",
@@ -3863,7 +3870,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=96f1f0404f421ee560a4310c73c5071e49168168#96f1f0404f421ee560a4310c73c5071e49168168"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=349cb385583697f41010dabeb3c106d58f9599b4#349cb385583697f41010dabeb3c106d58f9599b4"
dependencies = [
"prost 0.12.3",
"serde",
@@ -6696,12 +6703,12 @@ dependencies = [
[[package]]
name = "pgwire"
version = "0.19.1"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17780c93587822c191c3f4d43fa5f6bc6df1e51b9f58a0be0cd1b7fd6e80d9e6"
checksum = "c00492c52bb65e0421211b7f4c5d9de7586e53786a3b244efb00f74851206bf6"
dependencies = [
"async-trait",
"base64 0.21.5",
"base64 0.22.0",
"bytes",
"chrono",
"derive-new 0.6.0",

View File

@@ -103,7 +103,7 @@ etcd-client = "0.12"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "96f1f0404f421ee560a4310c73c5071e49168168" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "349cb385583697f41010dabeb3c106d58f9599b4" }
humantime-serde = "1.1"
itertools = "0.10"
lazy_static = "1.4"

219
README.md
View File

@@ -6,145 +6,154 @@
</picture>
</p>
<h1 align="center">Cloud-scale, Fast and Efficient Time Series Database</h1>
<div align="center">
<h3 align="center">
The next-generation hybrid time-series/analytics processing database in the cloud
</h3>
<a href="https://greptime.com/product/cloud">GreptimeCloud</a> |
<a href="https://docs.greptime.com/">User guide</a> |
<a href="https://greptimedb.rs/">API Docs</a> |
<a href="https://github.com/GreptimeTeam/greptimedb/issues/3412">Roadmap 2024</a>
</h4>
<p align="center">
<a href="https://codecov.io/gh/GrepTimeTeam/greptimedb"><img src="https://codecov.io/gh/GrepTimeTeam/greptimedb/branch/main/graph/badge.svg?token=FITFDI3J3C"></img></a>
&nbsp;
<a href="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml"><img src="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml/badge.svg" alt="CI"></img></a>
&nbsp;
<a href="https://github.com/greptimeTeam/greptimedb/blob/main/LICENSE"><img src="https://img.shields.io/github/license/greptimeTeam/greptimedb"></a>
</p>
<a href="https://github.com/GreptimeTeam/greptimedb/releases/latest">
<img src="https://img.shields.io/github/v/release/GreptimeTeam/greptimedb.svg" alt="Version"/>
</a>
<a href="https://github.com/GreptimeTeam/greptimedb/releases/latest">
<img src="https://img.shields.io/github/release-date/GreptimeTeam/greptimedb.svg" alt="Releases"/>
</a>
<a href="https://hub.docker.com/r/greptime/greptimedb/">
<img src="https://img.shields.io/docker/pulls/greptime/greptimedb.svg" alt="Docker Pulls"/>
</a>
<a href="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml">
<img src="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml/badge.svg" alt="GitHub Actions"/>
</a>
<a href="https://codecov.io/gh/GrepTimeTeam/greptimedb">
<img src="https://codecov.io/gh/GrepTimeTeam/greptimedb/branch/main/graph/badge.svg?token=FITFDI3J3C" alt="Codecov"/>
</a>
<a href="https://github.com/greptimeTeam/greptimedb/blob/main/LICENSE">
<img src="https://img.shields.io/github/license/greptimeTeam/greptimedb" alt="License"/>
</a>
<p align="center">
<a href="https://twitter.com/greptime"><img src="https://img.shields.io/badge/twitter-follow_us-1d9bf0.svg"></a>
&nbsp;
<a href="https://www.linkedin.com/company/greptime/"><img src="https://img.shields.io/badge/linkedin-connect_with_us-0a66c2.svg"></a>
&nbsp;
<a href="https://greptime.com/slack"><img src="https://img.shields.io/badge/slack-GreptimeDB-0abd59?logo=slack" alt="slack" /></a>
</p>
<br/>
## What is GreptimeDB
<a href="https://greptime.com/slack">
<img src="https://img.shields.io/badge/slack-GreptimeDB-0abd59?logo=slack&style=for-the-badge" alt="Slack"/>
</a>
<a href="https://twitter.com/greptime">
<img src="https://img.shields.io/badge/twitter-follow_us-1d9bf0.svg?style=for-the-badge" alt="Twitter"/>
</a>
<a href="https://www.linkedin.com/company/greptime/">
<img src="https://img.shields.io/badge/linkedin-connect_with_us-0a66c2.svg?style=for-the-badge" alt="LinkedIn"/>
</a>
</div>
GreptimeDB is an open-source time-series database focusing on efficiency, scalability, and analytical capabilities.
It's designed to work on infrastructure of the cloud era, and users benefit from its elasticity and commodity storage.
## Introduction
Our core developers have been building time-series data platforms for years. Based on their best-practices, GreptimeDB is born to give you:
**GreptimeDB** is an open-source time-series database focusing on efficiency, scalability, and analytical capabilities.
Designed to work on infrastructure of the cloud era, GreptimeDB benefits users with its elasticity and commodity storage, offering a fast and cost-effective **alternative to InfluxDB** and a **long-term storage for Prometheus**.
- Optimized columnar layout for handling time-series data; compacted, compressed, and stored on various storage backends, particularly cloud object storage with 50x cost efficiency.
- Fully open-source distributed cluster architecture that harnesses the power of cloud-native elastic computing resources.
- Seamless scalability from a standalone binary at edge to a robust, highly available distributed cluster in cloud, with a transparent experience for both developers and administrators.
- Native SQL and PromQL for queries, and Python scripting to facilitate complex analytical tasks.
- Flexible indexing capabilities and distributed, parallel-processing query engine, tackling high cardinality issues down.
- Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc.
## Why GreptimeDB
## Quick Start
Our core developers have been building time-series data platforms for years. Based on our best-practices, GreptimeDB is born to give you:
### [GreptimePlay](https://greptime.com/playground)
* **Easy horizontal scaling**
Seamless scalability from a standalone binary at edge to a robust, highly available distributed cluster in cloud, with a transparent experience for both developers and administrators.
* **Analyzing time-series data**
Query your time-series data with SQL and PromQL. Use Python scripts to facilitate complex analytical tasks.
* **Cloud-native distributed database**
Fully open-source distributed cluster architecture that harnesses the power of cloud-native elastic computing resources.
* **Performance and Cost-effective**
Flexible indexing capabilities and distributed, parallel-processing query engine, tackling high cardinality issues down. Optimized columnar layout for handling time-series data; compacted, compressed, and stored on various storage backends, particularly cloud object storage with 50x cost efficiency.
* **Compatible with InfluxDB, Prometheus and more protocols**
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/clients/overview).
## Try GreptimeDB
### 1. [GreptimePlay](https://greptime.com/playground)
Try out the features of GreptimeDB right from your browser.
### Build
### 2. [GreptimeCloud](https://console.greptime.cloud/)
#### Build from Source
Start instantly with a free cluster.
To compile GreptimeDB from source, you'll need:
### 3. Docker Image
- C/C++ Toolchain: provides basic tools for compiling and linking. This is
available as `build-essential` on ubuntu and similar name on other platforms.
- Rust: the easiest way to install Rust is to use
[`rustup`](https://rustup.rs/), which will check our `rust-toolchain` file and
install correct Rust version for you.
- Protobuf: `protoc` is required for compiling `.proto` files. `protobuf` is
available from major package manager on macos and linux distributions. You can
find an installation instructions [here](https://grpc.io/docs/protoc-installation/).
**Note that `protoc` version needs to be >= 3.15** because we have used the `optional`
keyword. You can check it with `protoc --version`.
- python3-dev or python3-devel(Optional feature, only needed if you want to run scripts
in CPython, and also need to enable `pyo3_backend` feature when compiling(by `cargo run -F pyo3_backend` or add `pyo3_backend` to src/script/Cargo.toml 's `features.default` like `default = ["python", "pyo3_backend]`)): this install a Python shared library required for running Python
scripting engine(In CPython Mode). This is available as `python3-dev` on
ubuntu, you can install it with `sudo apt install python3-dev`, or
`python3-devel` on RPM based distributions (e.g. Fedora, Red Hat, SuSE). Mac's
`Python3` package should have this shared library by default. More detail for compiling with PyO3 can be found in [PyO3](https://pyo3.rs/v0.18.1/building_and_distribution#configuring-the-python-version)'s documentation.
To install GreptimeDB locally, the recommended way is via Docker:
#### Build with Docker
A docker image with necessary dependencies is provided:
```
docker build --network host -f docker/Dockerfile -t greptimedb .
```shell
docker pull greptime/greptimedb
```
### Run
Start GreptimeDB from source code, in standalone mode:
Start a GreptimeDB container with:
```shell
docker run --rm --name greptime --net=host greptime/greptimedb standalone start
```
Read more about [Installation](https://docs.greptime.com/getting-started/installation/overview) on docs.
## Getting Started
* [Quickstart](https://docs.greptime.com/getting-started/quick-start/overview)
* [Write Data](https://docs.greptime.com/user-guide/clients/overview)
* [Query Data](https://docs.greptime.com/user-guide/query-data/overview)
* [Operations](https://docs.greptime.com/user-guide/operations/overview)
## Build
Check the prerequisite:
* [Rust toolchain](https://www.rust-lang.org/tools/install) (nightly)
* [Protobuf compiler](https://grpc.io/docs/protoc-installation/) (>= 3.15)
* Python toolchain (optional): Required only if built with PyO3 backend. More detail for compiling with PyO3 can be found in its [documentation](https://pyo3.rs/v0.18.1/building_and_distribution#configuring-the-python-version).
Build GreptimeDB binary:
```shell
make
```
Run a standalone server:
```shell
cargo run -- standalone start
```
Or if you built from docker:
```
docker run -p 4002:4002 -v "$(pwd):/tmp/greptimedb" greptime/greptimedb standalone start
```
Please see the online document site for more installation options and [operations info](https://docs.greptime.com/user-guide/operations/overview).
### Get started
Read the [complete getting started guide](https://docs.greptime.com/getting-started/overview) on our [official document site](https://docs.greptime.com/).
To write and query data, GreptimeDB is compatible with multiple [protocols and clients](https://docs.greptime.com/user-guide/clients/overview).
## Resources
### Installation
- [Pre-built Binaries](https://greptime.com/download):
For Linux and macOS, you can easily download pre-built binaries including official releases and nightly builds that are ready to use.
In most cases, downloading the version without PyO3 is sufficient. However, if you plan to run scripts in CPython (and use Python packages like NumPy and Pandas), you will need to download the version with PyO3 and install a Python with the same version as the Python in the PyO3 version.
We recommend using virtualenv for the installation process to manage multiple Python versions.
- [Docker Images](https://hub.docker.com/r/greptime/greptimedb)(**recommended**): pre-built
Docker images, this is the easiest way to try GreptimeDB. By default it runs CPython script with `pyo3_backend` enabled.
- [`gtctl`](https://github.com/GreptimeTeam/gtctl): the command-line tool for
Kubernetes deployment
### Documentation
- GreptimeDB [User Guide](https://docs.greptime.com/user-guide/concepts/overview)
- GreptimeDB [Developer
Guide](https://docs.greptime.com/developer-guide/overview.html)
- GreptimeDB [internal code document](https://greptimedb.rs)
## Extension
### Dashboard
- [The dashboard UI for GreptimeDB](https://github.com/GreptimeTeam/dashboard)
### SDK
- [GreptimeDB C++ Client](https://github.com/GreptimeTeam/greptimedb-client-cpp)
- [GreptimeDB Erlang Client](https://github.com/GreptimeTeam/greptimedb-client-erl)
- [GreptimeDB Go Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-go)
- [GreptimeDB Java Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-java)
- [GreptimeDB Python Client](https://github.com/GreptimeTeam/greptimedb-client-py) (WIP)
- [GreptimeDB Rust Client](https://github.com/GreptimeTeam/greptimedb-client-rust)
- [GreptimeDB JavaScript Client](https://github.com/GreptimeTeam/greptime-js-sdk)
- [GreptimeDB C++ Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-cpp)
- [GreptimeDB Erlang Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-erl)
- [GreptimeDB Rust Ingester](https://github.com/GreptimeTeam/greptimedb-ingester-rust)
- [GreptimeDB JavaScript Ingester](https://github.com/GreptimeTeam/greptime-ingester-js)
### Grafana Dashboard
Our official Grafana dashboard is available at [grafana](./grafana/README.md) directory.
Our official Grafana dashboard is available at [grafana](grafana/README.md) directory.
## Project Status
This project is in its early stage and under heavy development. We move fast and
break things. Benchmark on development branch may not represent its potential
performance. We release pre-built binaries constantly for functional
evaluation. Do not use it in production at the moment.
For future plans, check out [GreptimeDB roadmap](https://github.com/GreptimeTeam/greptimedb/issues/669).
The current version has not yet reached General Availability version standards.
In line with our Greptime 2024 Roadmap, we plan to achieve a production-level
version with the update to v1.0 in August. [[Join Force]](https://github.com/GreptimeTeam/greptimedb/issues/3412)
## Community
@@ -154,12 +163,12 @@ and what went wrong. If you have any questions or if you would like to get invol
community, please check out:
- GreptimeDB Community on [Slack](https://greptime.com/slack)
- GreptimeDB GitHub [Discussions](https://github.com/GreptimeTeam/greptimedb/discussions)
- Greptime official [Website](https://greptime.com)
- GreptimeDB [GitHub Discussions forum](https://github.com/GreptimeTeam/greptimedb/discussions)
- Greptime official [website](https://greptime.com)
In addition, you may:
- View our official [Blog](https://greptime.com/blogs/index)
- View our official [Blog](https://greptime.com/blogs/)
- Connect us with [Linkedin](https://www.linkedin.com/company/greptime/)
- Follow us on [Twitter](https://twitter.com/greptime)
@@ -170,7 +179,7 @@ open contributions and allowing you to use the software however you want.
## Contributing
Please refer to [contribution guidelines](CONTRIBUTING.md) for more information.
Please refer to [contribution guidelines](CONTRIBUTING.md) and [internal concepts docs](https://docs.greptime.com/contributor-guide/overview.html) for more information.
## Acknowledgement

View File

@@ -140,9 +140,9 @@ intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "partition_tree": partition tree memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
type = "partition_tree"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.

View File

@@ -246,9 +246,9 @@ intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "partition_tree": partition tree memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
type = "partition_tree"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.

View File

@@ -27,7 +27,7 @@ function retry_fetch() {
echo "Failed to download $url"
echo "You may try to set http_proxy and https_proxy environment variables."
if [[ -z "$GITHUB_PROXY_URL" ]]; then
echo "You may try to set GITHUB_PROXY_URL=http://mirror.ghproxy.com/"
echo "You may try to set GITHUB_PROXY_URL=http://mirror.ghproxy.com/https://github.com/"
fi
exit 1
}
@@ -39,7 +39,7 @@ function retry_fetch() {
retry_fetch "${GITHUB_URL}/GreptimeTeam/dashboard/releases/download/${RELEASE_VERSION}/sha256.txt" sha256.txt
# Download the tar file containing the built dashboard assets.
retry_fetch "${GITHUB_URL}/GreptimeTeam/dashboard/releases/download/$RELEASE_VERSION/build.tar.gz" build.tar.gz
retry_fetch "${GITHUB_URL}/GreptimeTeam/dashboard/releases/download/${RELEASE_VERSION}/build.tar.gz" build.tar.gz
# Verify the checksums match; exit if they don't.
case "$(uname -s)" in

View File

@@ -707,7 +707,6 @@ pub fn pb_values_to_vector_ref(data_type: &ConcreteDataType, values: Values) ->
}
pub fn pb_values_to_values(data_type: &ConcreteDataType, values: Values) -> Vec<Value> {
// TODO(fys): use macros to optimize code
match data_type {
ConcreteDataType::Int64(_) => values
.i64_values

View File

@@ -40,7 +40,7 @@ pub fn user_provider_from_option(opt: &String) -> Result<UserProviderRef> {
match name {
STATIC_USER_PROVIDER => {
let provider =
StaticUserProvider::try_from(content).map(|p| Arc::new(p) as UserProviderRef)?;
StaticUserProvider::new(content).map(|p| Arc::new(p) as UserProviderRef)?;
Ok(provider)
}
_ => InvalidConfigSnafu {

View File

@@ -23,7 +23,7 @@ use secrecy::ExposeSecret;
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{
Error, IllegalParamSnafu, InvalidConfigSnafu, IoSnafu, Result, UnsupportedPasswordTypeSnafu,
IllegalParamSnafu, InvalidConfigSnafu, IoSnafu, Result, UnsupportedPasswordTypeSnafu,
UserNotFoundSnafu, UserPasswordMismatchSnafu,
};
use crate::user_info::DefaultUserInfo;
@@ -31,10 +31,12 @@ use crate::{auth_mysql, Identity, Password, UserInfoRef, UserProvider};
pub(crate) const STATIC_USER_PROVIDER: &str = "static_user_provider";
impl TryFrom<&str> for StaticUserProvider {
type Error = Error;
pub(crate) struct StaticUserProvider {
users: HashMap<String, Vec<u8>>,
}
fn try_from(value: &str) -> Result<Self> {
impl StaticUserProvider {
pub(crate) fn new(value: &str) -> Result<Self> {
let (mode, content) = value.split_once(':').context(InvalidConfigSnafu {
value: value.to_string(),
msg: "StaticUserProviderOption must be in format `<option>:<value>`",
@@ -83,15 +85,11 @@ impl TryFrom<&str> for StaticUserProvider {
value: mode.to_string(),
msg: "StaticUserProviderOption must be in format `file:<path>` or `cmd:<values>`",
}
.fail(),
.fail(),
};
}
}
pub(crate) struct StaticUserProvider {
users: HashMap<String, Vec<u8>>,
}
#[async_trait]
impl UserProvider for StaticUserProvider {
fn name(&self) -> &str {
@@ -181,7 +179,7 @@ pub mod test {
#[tokio::test]
async fn test_authorize() {
let user_info = DefaultUserInfo::with_name("root");
let provider = StaticUserProvider::try_from("cmd:root=123456,admin=654321").unwrap();
let provider = StaticUserProvider::new("cmd:root=123456,admin=654321").unwrap();
provider
.authorize("catalog", "schema", &user_info)
.await
@@ -190,7 +188,7 @@ pub mod test {
#[tokio::test]
async fn test_inline_provider() {
let provider = StaticUserProvider::try_from("cmd:root=123456,admin=654321").unwrap();
let provider = StaticUserProvider::new("cmd:root=123456,admin=654321").unwrap();
test_authenticate(&provider, "root", "123456").await;
test_authenticate(&provider, "admin", "654321").await;
}
@@ -214,7 +212,7 @@ admin=654321",
}
let param = format!("file:{file_path}");
let provider = StaticUserProvider::try_from(param.as_str()).unwrap();
let provider = StaticUserProvider::new(param.as_str()).unwrap();
test_authenticate(&provider, "root", "123456").await;
test_authenticate(&provider, "admin", "654321").await;
}

View File

@@ -25,13 +25,13 @@ use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_meta::cache_invalidator::{CacheInvalidator, CacheInvalidatorRef, Context};
use common_meta::error::Result as MetaResult;
use common_meta::instruction::CacheIdent;
use common_meta::key::catalog_name::CatalogNameKey;
use common_meta::key::schema_name::SchemaNameKey;
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_name::TableNameKey;
use common_meta::key::{TableMetadataManager, TableMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
use common_meta::table_name::TableName;
use futures_util::stream::BoxStream;
use futures_util::{StreamExt, TryStreamExt};
use moka::future::{Cache as AsyncCache, CacheBuilder};
@@ -39,7 +39,6 @@ use moka::sync::Cache;
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
use snafu::prelude::*;
use table::dist_table::DistTable;
use table::metadata::TableId;
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
use table::TableRef;
@@ -79,24 +78,18 @@ fn make_table(table_info_value: TableInfoValue) -> CatalogResult<TableRef> {
#[async_trait::async_trait]
impl CacheInvalidator for KvBackendCatalogManager {
async fn invalidate_table_id(&self, ctx: &Context, table_id: TableId) -> MetaResult<()> {
self.cache_invalidator
.invalidate_table_id(ctx, table_id)
.await
}
async fn invalidate_table_name(&self, ctx: &Context, table_name: TableName) -> MetaResult<()> {
let table_cache_key = format_full_table_name(
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
);
self.cache_invalidator
.invalidate_table_name(ctx, table_name)
.await?;
self.table_cache.invalidate(&table_cache_key).await;
Ok(())
async fn invalidate(&self, ctx: &Context, caches: Vec<CacheIdent>) -> MetaResult<()> {
for cache in &caches {
if let CacheIdent::TableName(table_name) = cache {
let table_cache_key = format_full_table_name(
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
);
self.table_cache.invalidate(&table_cache_key).await;
}
}
self.cache_invalidator.invalidate(ctx, caches).await
}
}

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use api::v1::region::{QueryRequest, RegionRequest, RegionResponse};
use api::v1::region::{QueryRequest, RegionRequest};
use api::v1::ResponseHeader;
use arc_swap::ArcSwapOption;
use arrow_flight::Ticket;
@@ -23,7 +23,7 @@ use async_trait::async_trait;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_grpc::flight::{FlightDecoder, FlightMessage};
use common_meta::datanode_manager::{AffectedRows, Datanode};
use common_meta::datanode_manager::{Datanode, HandleResponse};
use common_meta::error::{self as meta_error, Result as MetaResult};
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
@@ -46,7 +46,7 @@ pub struct RegionRequester {
#[async_trait]
impl Datanode for RegionRequester {
async fn handle(&self, request: RegionRequest) -> MetaResult<AffectedRows> {
async fn handle(&self, request: RegionRequest) -> MetaResult<HandleResponse> {
self.handle_inner(request).await.map_err(|err| {
if err.should_retry() {
meta_error::Error::RetryLater {
@@ -165,7 +165,7 @@ impl RegionRequester {
Ok(Box::pin(record_batch_stream))
}
async fn handle_inner(&self, request: RegionRequest) -> Result<AffectedRows> {
async fn handle_inner(&self, request: RegionRequest) -> Result<HandleResponse> {
let request_type = request
.body
.as_ref()
@@ -178,10 +178,7 @@ impl RegionRequester {
let mut client = self.client.raw_region_client()?;
let RegionResponse {
header,
affected_rows,
} = client
let response = client
.handle(request)
.await
.map_err(|e| {
@@ -195,19 +192,20 @@ impl RegionRequester {
})?
.into_inner();
check_response_header(header)?;
check_response_header(&response.header)?;
Ok(affected_rows as _)
Ok(HandleResponse::from_region_response(response))
}
pub async fn handle(&self, request: RegionRequest) -> Result<AffectedRows> {
pub async fn handle(&self, request: RegionRequest) -> Result<HandleResponse> {
self.handle_inner(request).await
}
}
pub fn check_response_header(header: Option<ResponseHeader>) -> Result<()> {
pub fn check_response_header(header: &Option<ResponseHeader>) -> Result<()> {
let status = header
.and_then(|header| header.status)
.as_ref()
.and_then(|header| header.status.as_ref())
.context(IllegalDatabaseResponseSnafu {
err_msg: "either response header or status is missing",
})?;
@@ -221,7 +219,7 @@ pub fn check_response_header(header: Option<ResponseHeader>) -> Result<()> {
})?;
ServerSnafu {
code,
msg: status.err_msg,
msg: status.err_msg.clone(),
}
.fail()
}
@@ -236,19 +234,19 @@ mod test {
#[test]
fn test_check_response_header() {
let result = check_response_header(None);
let result = check_response_header(&None);
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader { status: None }));
let result = check_response_header(&Some(ResponseHeader { status: None }));
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader {
let result = check_response_header(&Some(ResponseHeader {
status: Some(PbStatus {
status_code: StatusCode::Success as u32,
err_msg: String::default(),
@@ -256,7 +254,7 @@ mod test {
}));
assert!(result.is_ok());
let result = check_response_header(Some(ResponseHeader {
let result = check_response_header(&Some(ResponseHeader {
status: Some(PbStatus {
status_code: u32::MAX,
err_msg: String::default(),
@@ -267,7 +265,7 @@ mod test {
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader {
let result = check_response_header(&Some(ResponseHeader {
status: Some(PbStatus {
status_code: StatusCode::Internal as u32,
err_msg: "blabla".to_string(),

View File

@@ -18,6 +18,7 @@ use async_trait::async_trait;
use common_base::AffectedRows;
use common_meta::rpc::procedure::{MigrateRegionRequest, ProcedureStateResponse};
use common_query::error::Result;
use common_query::Output;
use session::context::QueryContextRef;
use store_api::storage::RegionId;
use table::requests::{CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest};
@@ -26,7 +27,7 @@ use table::requests::{CompactTableRequest, DeleteRequest, FlushTableRequest, Ins
#[async_trait]
pub trait TableMutationHandler: Send + Sync {
/// Inserts rows into the table.
async fn insert(&self, request: InsertRequest, ctx: QueryContextRef) -> Result<AffectedRows>;
async fn insert(&self, request: InsertRequest, ctx: QueryContextRef) -> Result<Output>;
/// Delete rows from the table.
async fn delete(&self, request: DeleteRequest, ctx: QueryContextRef) -> Result<AffectedRows>;

View File

@@ -35,6 +35,7 @@ impl FunctionState {
use common_base::AffectedRows;
use common_meta::rpc::procedure::{MigrateRegionRequest, ProcedureStateResponse};
use common_query::error::Result;
use common_query::Output;
use session::context::QueryContextRef;
use store_api::storage::RegionId;
use table::requests::{
@@ -70,8 +71,8 @@ impl FunctionState {
&self,
_request: InsertRequest,
_ctx: QueryContextRef,
) -> Result<AffectedRows> {
Ok(ROWS)
) -> Result<Output> {
Ok(Output::new_with_affected_rows(ROWS))
}
async fn delete(

View File

@@ -56,6 +56,18 @@ pub(crate) fn process_range_fn(args: TokenStream, input: TokenStream) -> TokenSt
} = &sig;
let arg_types = ok!(extract_input_types(inputs));
// with format like Float64Array
let array_types = arg_types
.iter()
.map(|ty| {
if let Type::Reference(TypeReference { elem, .. }) = ty {
elem.as_ref().clone()
} else {
ty.clone()
}
})
.collect::<Vec<_>>();
// build the struct and its impl block
// only do this when `display_name` is specified
if let Ok(display_name) = get_ident(&arg_map, "display_name", arg_span) {
@@ -64,6 +76,8 @@ pub(crate) fn process_range_fn(args: TokenStream, input: TokenStream) -> TokenSt
vis,
ok!(get_ident(&arg_map, "name", arg_span)),
display_name,
array_types,
ok!(get_ident(&arg_map, "ret", arg_span)),
);
result.extend(struct_code);
}
@@ -90,6 +104,8 @@ fn build_struct(
vis: Visibility,
name: Ident,
display_name_ident: Ident,
array_types: Vec<Type>,
return_array_type: Ident,
) -> TokenStream {
let display_name = display_name_ident.to_string();
quote! {
@@ -114,18 +130,12 @@ fn build_struct(
}
}
// TODO(ruihang): this should be parameterized
// time index column and value column
fn input_type() -> Vec<DataType> {
vec![
RangeArray::convert_data_type(DataType::Timestamp(TimeUnit::Millisecond, None)),
RangeArray::convert_data_type(DataType::Float64),
]
vec![#( RangeArray::convert_data_type(#array_types::new_null(0).data_type().clone()), )*]
}
// TODO(ruihang): this should be parameterized
fn return_type() -> DataType {
DataType::Float64
#return_array_type::new_null(0).data_type().clone()
}
}
}
@@ -160,6 +170,7 @@ fn build_calc_fn(
.map(|name| Ident::new(&format!("{}_range_array", name), name.span()))
.collect::<Vec<_>>();
let first_range_array_name = range_array_names.first().unwrap().clone();
let first_param_name = param_names.first().unwrap().clone();
quote! {
impl #name {
@@ -168,13 +179,29 @@ fn build_calc_fn(
#( let #range_array_names = RangeArray::try_new(extract_array(&input[#param_numbers])?.to_data().into())?; )*
// TODO(ruihang): add ensure!()
// check arrays len
{
let len_first = #first_range_array_name.len();
#(
if len_first != #range_array_names.len() {
return Err(DataFusionError::Execution(format!("RangeArray have different lengths in PromQL function {}: array1={}, array2={}", #name::name(), len_first, #range_array_names.len())));
}
)*
}
let mut result_array = Vec::new();
for index in 0..#first_range_array_name.len(){
#( let #param_names = #range_array_names.get(index).unwrap().as_any().downcast_ref::<#unref_param_types>().unwrap().clone(); )*
// TODO(ruihang): add ensure!() to check length
// check element len
{
let len_first = #first_param_name.len();
#(
if len_first != #param_names.len() {
return Err(DataFusionError::Execution(format!("RangeArray's element {} have different lengths in PromQL function {}: array1={}, array2={}", index, #name::name(), len_first, #param_names.len())));
}
)*
}
let result = #fn_name(#( &#param_names, )*);
result_array.push(result);

View File

@@ -14,14 +14,12 @@
use std::sync::Arc;
use table::metadata::TableId;
use crate::error::Result;
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoKey;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteKey;
use crate::key::TableMetaKey;
use crate::table_name::TableName;
/// KvBackend cache invalidator
#[async_trait::async_trait]
@@ -46,10 +44,7 @@ pub struct Context {
#[async_trait::async_trait]
pub trait CacheInvalidator: Send + Sync {
// Invalidates table cache
async fn invalidate_table_id(&self, ctx: &Context, table_id: TableId) -> Result<()>;
async fn invalidate_table_name(&self, ctx: &Context, table_name: TableName) -> Result<()>;
async fn invalidate(&self, ctx: &Context, caches: Vec<CacheIdent>) -> Result<()>;
}
pub type CacheInvalidatorRef = Arc<dyn CacheInvalidator>;
@@ -58,11 +53,7 @@ pub struct DummyCacheInvalidator;
#[async_trait::async_trait]
impl CacheInvalidator for DummyCacheInvalidator {
async fn invalidate_table_id(&self, _ctx: &Context, _table_id: TableId) -> Result<()> {
Ok(())
}
async fn invalidate_table_name(&self, _ctx: &Context, _table_name: TableName) -> Result<()> {
async fn invalidate(&self, _ctx: &Context, _caches: Vec<CacheIdent>) -> Result<()> {
Ok(())
}
}
@@ -72,21 +63,22 @@ impl<T> CacheInvalidator for T
where
T: KvCacheInvalidator,
{
async fn invalidate_table_name(&self, _ctx: &Context, table_name: TableName) -> Result<()> {
let key: TableNameKey = (&table_name).into();
self.invalidate_key(&key.as_raw_key()).await;
Ok(())
}
async fn invalidate_table_id(&self, _ctx: &Context, table_id: TableId) -> Result<()> {
let key = TableInfoKey::new(table_id);
self.invalidate_key(&key.as_raw_key()).await;
let key = &TableRouteKey { table_id };
self.invalidate_key(&key.as_raw_key()).await;
async fn invalidate(&self, _ctx: &Context, caches: Vec<CacheIdent>) -> Result<()> {
for cache in caches {
match cache {
CacheIdent::TableId(table_id) => {
let key = TableInfoKey::new(table_id);
self.invalidate_key(&key.as_raw_key()).await;
let key = &TableRouteKey { table_id };
self.invalidate_key(&key.as_raw_key()).await;
}
CacheIdent::TableName(table_name) => {
let key: TableNameKey = (&table_name).into();
self.invalidate_key(&key.as_raw_key()).await
}
}
}
Ok(())
}
}

View File

@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use api::v1::region::{QueryRequest, RegionRequest};
use api::v1::region::{QueryRequest, RegionRequest, RegionResponse};
pub use common_base::AffectedRows;
use common_recordbatch::SendableRecordBatchStream;
@@ -25,7 +26,7 @@ use crate::peer::Peer;
#[async_trait::async_trait]
pub trait Datanode: Send + Sync {
/// Handles DML, and DDL requests.
async fn handle(&self, request: RegionRequest) -> Result<AffectedRows>;
async fn handle(&self, request: RegionRequest) -> Result<HandleResponse>;
/// Handles query requests
async fn handle_query(&self, request: QueryRequest) -> Result<SendableRecordBatchStream>;
@@ -41,3 +42,27 @@ pub trait DatanodeManager: Send + Sync {
}
pub type DatanodeManagerRef = Arc<dyn DatanodeManager>;
/// This result struct is derived from [RegionResponse]
#[derive(Debug)]
pub struct HandleResponse {
pub affected_rows: AffectedRows,
pub extension: HashMap<String, Vec<u8>>,
}
impl HandleResponse {
pub fn from_region_response(region_response: RegionResponse) -> Self {
Self {
affected_rows: region_response.affected_rows as _,
extension: region_response.extension,
}
}
/// Creates one response without extension
pub fn new(affected_rows: AffectedRows) -> Self {
Self {
affected_rows,
extension: Default::default(),
}
}
}
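
A quick sketch of how a caller might consume the new response type; the struct is re-declared here in simplified form so the example stands alone, and the extension key name is purely illustrative:

```rust
use std::collections::HashMap;

/// Simplified local copy of the HandleResponse shown above.
#[derive(Debug)]
struct HandleResponse {
    affected_rows: usize,
    extension: HashMap<String, Vec<u8>>,
}

fn main() {
    // A region server can attach extra payloads (for example the newly added
    // column metadata from #3533) under keys of its choosing; the key below
    // is a hypothetical placeholder.
    let mut extension = HashMap::new();
    extension.insert("new_columns".to_string(), b"<encoded column defs>".to_vec());

    let resp = HandleResponse { affected_rows: 42, extension };
    println!("affected rows: {}", resp.affected_rows);
    if let Some(bytes) = resp.extension.get("new_columns") {
        println!("extension payload: {} bytes", bytes.len());
    }
}
```
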

View File

@@ -43,6 +43,7 @@ use crate::cache_invalidator::Context;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::DdlContext;
use crate::error::{self, ConvertAlterTableRequestSnafu, Error, InvalidProtoMsgSnafu, Result};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::DeserializedValueWithBytes;
@@ -333,11 +334,17 @@ impl AlterTableProcedure {
if matches!(alter_kind, Kind::RenameTable { .. }) {
cache_invalidator
.invalidate_table_name(&Context::default(), self.data.table_ref().into())
.invalidate(
&Context::default(),
vec![CacheIdent::TableName(self.data.table_ref().into())],
)
.await?;
} else {
cache_invalidator
.invalidate_table_id(&Context::default(), self.data.table_id())
.invalidate(
&Context::default(),
vec![CacheIdent::TableId(self.data.table_id())],
)
.await?;
};

View File

@@ -36,7 +36,7 @@ use crate::ddl::DdlContext;
use crate::error::{Result, TableAlreadyExistsSnafu};
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::lock_key::{TableLock, TableNameLock};
use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock};
use crate::peer::Peer;
use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::{find_leader_regions, find_leaders, RegionRoute};
@@ -70,6 +70,7 @@ impl CreateLogicalTablesProcedure {
/// - Checks whether physical table exists.
/// - Checks whether logical tables exist.
/// - Allocates the table ids.
/// - Modify tasks to sort logical columns on their names.
///
/// Abort(non-retry):
/// - The physical table does not exist.
@@ -130,7 +131,7 @@ impl CreateLogicalTablesProcedure {
));
}
// Allocates table ids
// Allocates table ids and sort columns on their names.
for (task, table_id) in tasks.iter_mut().zip(already_exists_tables_ids.iter()) {
let table_id = if let Some(table_id) = table_id {
*table_id
@@ -141,6 +142,11 @@ impl CreateLogicalTablesProcedure {
.await?
};
task.set_table_id(table_id);
// sort columns in task
task.sort_columns();
common_telemetry::info!("[DEBUG] sorted task {:?}", task);
}
self.creator
@@ -307,8 +313,15 @@ impl Procedure for CreateLogicalTablesProcedure {
}
fn lock_key(&self) -> LockKey {
let mut lock_key = Vec::with_capacity(1 + self.creator.data.tasks.len());
// CatalogLock, SchemaLock,
// TableLock
// TableNameLock(s)
let mut lock_key = Vec::with_capacity(2 + 1 + self.creator.data.tasks.len());
let table_ref = self.creator.data.tasks[0].table_ref();
lock_key.push(CatalogLock::Read(table_ref.catalog).into());
lock_key.push(SchemaLock::read(table_ref.catalog, table_ref.schema).into());
lock_key.push(TableLock::Write(self.creator.data.physical_table_id()).into());
for task in &self.creator.data.tasks {
lock_key.push(
TableNameLock::new(

View File

@@ -38,7 +38,7 @@ use crate::ddl::{DdlContext, TableMetadata, TableMetadataAllocatorContext};
use crate::error::{self, Result, TableRouteNotFoundSnafu};
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::lock_key::TableNameLock;
use crate::lock_key::{CatalogLock, SchemaLock, TableNameLock};
use crate::region_keeper::OperatingRegionGuard;
use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::{
@@ -343,11 +343,11 @@ impl Procedure for CreateTableProcedure {
fn lock_key(&self) -> LockKey {
let table_ref = &self.creator.data.table_ref();
LockKey::single(TableNameLock::new(
table_ref.catalog,
table_ref.schema,
table_ref.table,
))
LockKey::new(vec![
CatalogLock::Read(table_ref.catalog).into(),
SchemaLock::read(table_ref.catalog, table_ref.schema).into(),
TableNameLock::new(table_ref.catalog, table_ref.schema, table_ref.table).into(),
])
}
}

View File

@@ -12,42 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::region::{
region_request, DropRequest as PbDropRegionRequest, RegionRequest, RegionRequestHeader,
};
pub mod executor;
use async_trait::async_trait;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_procedure::error::{FromJsonSnafu, ToJsonSnafu};
use common_procedure::{
Context as ProcedureContext, LockKey, Procedure, Result as ProcedureResult, Status,
};
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::{debug, info};
use futures::future::join_all;
use common_telemetry::info;
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use snafu::{OptionExt, ResultExt};
use strum::AsRefStr;
use table::metadata::{RawTableInfo, TableId};
use table::table_reference::TableReference;
use self::executor::DropTableExecutor;
use super::utils::handle_retry_error;
use crate::cache_invalidator::Context;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::lock_key::{CatalogLock, SchemaLock, TableLock};
use crate::metrics;
use crate::region_keeper::OperatingRegionGuard;
use crate::rpc::ddl::DropTableTask;
use crate::rpc::router::{
find_leader_regions, find_leaders, operating_leader_regions, RegionRoute,
};
use crate::rpc::router::{operating_leader_regions, RegionRoute};
pub struct DropTableProcedure {
/// The context of procedure runtime.
@@ -58,7 +48,6 @@ pub struct DropTableProcedure {
pub dropping_regions: Vec<OperatingRegionGuard>,
}
#[allow(dead_code)]
impl DropTableProcedure {
pub const TYPE_NAME: &'static str = "metasrv-procedure::DropTable";
@@ -85,31 +74,10 @@ impl DropTableProcedure {
})
}
async fn on_prepare(&mut self) -> Result<Status> {
let table_ref = &self.data.table_ref();
let exist = self
.context
.table_metadata_manager
.table_name_manager()
.exists(TableNameKey::new(
table_ref.catalog,
table_ref.schema,
table_ref.table,
))
.await?;
if !exist && self.data.task.drop_if_exists {
async fn on_prepare<'a>(&mut self, executor: &DropTableExecutor) -> Result<Status> {
if executor.on_prepare(&self.context).await?.stop() {
return Ok(Status::done());
}
ensure!(
exist,
error::TableNotFoundSnafu {
table_name: table_ref.to_string()
}
);
self.data.state = DropTableState::RemoveMetadata;
Ok(Status::executing(true))
@@ -144,98 +112,38 @@ impl DropTableProcedure {
}
/// Removes the table metadata.
async fn on_remove_metadata(&mut self) -> Result<Status> {
async fn on_remove_metadata(&mut self, executor: &DropTableExecutor) -> Result<Status> {
self.register_dropping_regions()?;
// NOTES: If the meta server is crashed after the `RemoveMetadata`,
// Corresponding regions of this table on the Datanode will be closed automatically.
// Then any future dropping operation will fail.
// TODO(weny): Considers introducing a RegionStatus to indicate the region is dropping.
let table_metadata_manager = &self.context.table_metadata_manager;
let table_info_value = &self.data.table_info_value;
let table_route_value = &self.data.table_route_value;
let table_id = self.data.table_id();
table_metadata_manager
.delete_table_metadata(table_info_value, table_route_value)
executor
.on_remove_metadata(
&self.context,
&self.data.table_info_value,
&self.data.table_route_value,
)
.await?;
info!("Deleted table metadata for table {table_id}");
self.data.state = DropTableState::InvalidateTableCache;
Ok(Status::executing(true))
}
/// Broadcasts invalidate table cache instruction.
async fn on_broadcast(&mut self) -> Result<Status> {
let ctx = Context {
subject: Some("Invalidate table cache by dropping table".to_string()),
};
let cache_invalidator = &self.context.cache_invalidator;
cache_invalidator
.invalidate_table_name(&ctx, self.data.table_ref().into())
.await?;
cache_invalidator
.invalidate_table_id(&ctx, self.data.table_id())
.await?;
async fn on_broadcast(&mut self, executor: &DropTableExecutor) -> Result<Status> {
executor.invalidate_table_cache(&self.context).await?;
self.data.state = DropTableState::DatanodeDropRegions;
Ok(Status::executing(true))
}
pub async fn on_datanode_drop_regions(&self) -> Result<Status> {
let table_id = self.data.table_id();
let region_routes = &self.data.region_routes()?;
let leaders = find_leaders(region_routes);
let mut drop_region_tasks = Vec::with_capacity(leaders.len());
for datanode in leaders {
let requester = self.context.datanode_manager.datanode(&datanode).await;
let regions = find_leader_regions(region_routes, &datanode);
let region_ids = regions
.iter()
.map(|region_number| RegionId::new(table_id, *region_number))
.collect::<Vec<_>>();
for region_id in region_ids {
debug!("Dropping region {region_id} on Datanode {datanode:?}");
let request = RegionRequest {
header: Some(RegionRequestHeader {
tracing_context: TracingContext::from_current_span().to_w3c(),
..Default::default()
}),
body: Some(region_request::Body::Drop(PbDropRegionRequest {
region_id: region_id.as_u64(),
})),
};
let datanode = datanode.clone();
let requester = requester.clone();
drop_region_tasks.push(async move {
if let Err(err) = requester.handle(request).await {
if err.status_code() != StatusCode::RegionNotFound {
return Err(add_peer_context_if_needed(datanode)(err));
}
}
Ok(())
});
}
}
join_all(drop_region_tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
pub async fn on_datanode_drop_regions(&self, executor: &DropTableExecutor) -> Result<Status> {
executor
.on_drop_regions(&self.context, &self.data.table_route_value)
.await?;
Ok(Status::done())
}
}
@@ -247,17 +155,21 @@ impl Procedure for DropTableProcedure {
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
let executor = DropTableExecutor::new(
self.data.task.table_name(),
self.data.table_id(),
self.data.task.drop_if_exists,
);
let state = &self.data.state;
let _timer = metrics::METRIC_META_PROCEDURE_DROP_TABLE
.with_label_values(&[state.as_ref()])
.start_timer();
match self.data.state {
DropTableState::Prepare => self.on_prepare().await,
DropTableState::RemoveMetadata => self.on_remove_metadata().await,
DropTableState::InvalidateTableCache => self.on_broadcast().await,
DropTableState::DatanodeDropRegions => self.on_datanode_drop_regions().await,
DropTableState::Prepare => self.on_prepare(&executor).await,
DropTableState::RemoveMetadata => self.on_remove_metadata(&executor).await,
DropTableState::InvalidateTableCache => self.on_broadcast(&executor).await,
DropTableState::DatanodeDropRegions => self.on_datanode_drop_regions(&executor).await,
}
.map_err(handle_retry_error)
}

View File

@@ -0,0 +1,280 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::region::{
region_request, DropRequest as PbDropRegionRequest, RegionRequest, RegionRequestHeader,
};
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_telemetry::debug;
use common_telemetry::tracing_context::TracingContext;
use futures::future::join_all;
use snafu::ensure;
use store_api::storage::RegionId;
use table::metadata::TableId;
use crate::cache_invalidator::Context;
use crate::ddl::utils::add_peer_context_if_needed;
use crate::ddl::DdlContext;
use crate::error::{self, Result};
use crate::instruction::CacheIdent;
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::key::DeserializedValueWithBytes;
use crate::rpc::router::{find_leader_regions, find_leaders};
use crate::table_name::TableName;
/// [Control] indicates to the caller whether to proceed to the next step.
#[derive(Debug)]
pub enum Control<T> {
Continue(T),
Stop,
}
impl<T> Control<T> {
/// Returns true if it's [Control::Stop].
pub fn stop(&self) -> bool {
matches!(self, Control::Stop)
}
}
impl DropTableExecutor {
/// Returns the [DropTableExecutor].
pub fn new(table: TableName, table_id: TableId, drop_if_exists: bool) -> Self {
Self {
table,
table_id,
drop_if_exists,
}
}
}
/// [DropTableExecutor] performs:
/// - Drops the metadata of the table.
/// - Invalidates the cache on the Frontend nodes.
/// - Drops the regions on the Datanode nodes.
pub struct DropTableExecutor {
table: TableName,
table_id: TableId,
drop_if_exists: bool,
}
impl DropTableExecutor {
/// Checks whether the table exists.
/// - Returns early if the table does not exist and `drop_if_exists` is `true`.
/// - Returns an error if the table does not exist and `drop_if_exists` is `false`.
pub async fn on_prepare(&self, ctx: &DdlContext) -> Result<Control<()>> {
let table_ref = self.table.table_ref();
let exist = ctx
.table_metadata_manager
.table_name_manager()
.exists(TableNameKey::new(
table_ref.catalog,
table_ref.schema,
table_ref.table,
))
.await?;
if !exist && self.drop_if_exists {
return Ok(Control::Stop);
}
ensure!(
exist,
error::TableNotFoundSnafu {
table_name: table_ref.to_string()
}
);
Ok(Control::Continue(()))
}
/// Removes the table metadata.
pub async fn on_remove_metadata(
&self,
ctx: &DdlContext,
table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
table_route_value: &DeserializedValueWithBytes<TableRouteValue>,
) -> Result<()> {
ctx.table_metadata_manager
.delete_table_metadata(table_info_value, table_route_value)
.await
}
/// Invalidates frontend caches
pub async fn invalidate_table_cache(&self, ctx: &DdlContext) -> Result<()> {
let cache_invalidator = &ctx.cache_invalidator;
let ctx = Context {
subject: Some("Invalidate table cache by dropping table".to_string()),
};
cache_invalidator
.invalidate(
&ctx,
vec![
CacheIdent::TableName(self.table.table_ref().into()),
CacheIdent::TableId(self.table_id),
],
)
.await?;
Ok(())
}
/// Drops the regions on the datanodes.
pub async fn on_drop_regions(
&self,
ctx: &DdlContext,
table_route_value: &DeserializedValueWithBytes<TableRouteValue>,
) -> Result<()> {
// The `table_route_value` is always the physical table route.
let region_routes = table_route_value.region_routes()?;
let leaders = find_leaders(region_routes);
let mut drop_region_tasks = Vec::with_capacity(leaders.len());
let table_id = self.table_id;
for datanode in leaders {
let requester = ctx.datanode_manager.datanode(&datanode).await;
let regions = find_leader_regions(region_routes, &datanode);
let region_ids = regions
.iter()
.map(|region_number| RegionId::new(table_id, *region_number))
.collect::<Vec<_>>();
for region_id in region_ids {
debug!("Dropping region {region_id} on Datanode {datanode:?}");
let request = RegionRequest {
header: Some(RegionRequestHeader {
tracing_context: TracingContext::from_current_span().to_w3c(),
..Default::default()
}),
body: Some(region_request::Body::Drop(PbDropRegionRequest {
region_id: region_id.as_u64(),
})),
};
let datanode = datanode.clone();
let requester = requester.clone();
drop_region_tasks.push(async move {
if let Err(err) = requester.handle(request).await {
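// A missing region is treated as already dropped: tolerating RegionNotFound keeps retried drops idempotent.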
if err.status_code() != StatusCode::RegionNotFound {
return Err(add_peer_context_if_needed(datanode)(err));
}
}
Ok(())
});
}
}
join_all(drop_region_tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::collections::HashMap;
use std::sync::Arc;
use api::v1::{ColumnDataType, SemanticType};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use table::metadata::RawTableInfo;
use super::*;
use crate::ddl::test_util::create_table::build_raw_table_info_from_expr;
use crate::ddl::test_util::{TestColumnDefBuilder, TestCreateTableExprBuilder};
use crate::table_name::TableName;
use crate::test_util::{new_ddl_context, MockDatanodeManager};
fn test_create_raw_table_info(name: &str) -> RawTableInfo {
let create_table = TestCreateTableExprBuilder::default()
.column_defs([
TestColumnDefBuilder::default()
.name("ts")
.data_type(ColumnDataType::TimestampMillisecond)
.semantic_type(SemanticType::Timestamp)
.build()
.unwrap()
.into(),
TestColumnDefBuilder::default()
.name("host")
.data_type(ColumnDataType::String)
.semantic_type(SemanticType::Tag)
.build()
.unwrap()
.into(),
TestColumnDefBuilder::default()
.name("cpu")
.data_type(ColumnDataType::Float64)
.semantic_type(SemanticType::Field)
.build()
.unwrap()
.into(),
])
.time_index("ts")
.primary_keys(["host".into()])
.table_name(name)
.build()
.unwrap()
.into();
build_raw_table_info_from_expr(&create_table)
}
#[tokio::test]
async fn test_on_prepare() {
// Drops a non-existent table with `drop_if_exists` enabled
let datanode_manager = Arc::new(MockDatanodeManager::new(()));
let ctx = new_ddl_context(datanode_manager);
let executor = DropTableExecutor::new(
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_table"),
1024,
true,
);
let ctrl = executor.on_prepare(&ctx).await.unwrap();
assert!(ctrl.stop());
// Drops a non-existent table
let executor = DropTableExecutor::new(
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_table"),
1024,
false,
);
let err = executor.on_prepare(&ctx).await.unwrap_err();
assert_matches!(err, error::Error::TableNotFound { .. });
// Drops an existing table
let executor = DropTableExecutor::new(
TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_table"),
1024,
false,
);
let raw_table_info = test_create_raw_table_info("my_table");
ctx.table_metadata_manager
.create_table_metadata(
raw_table_info,
TableRouteValue::physical(vec![]),
HashMap::new(),
)
.await
.unwrap();
let ctrl = executor.on_prepare(&ctx).await.unwrap();
assert!(!ctrl.stop());
}
}
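
The Control value returned by on_prepare lets the procedure stop early when the table is already gone. A minimal, self-contained sketch of how a caller might branch on it; the types below are simplified stand-ins for illustration, not the actual procedure code:

#[derive(Debug)]
enum Control<T> {
    Continue(T),
    Stop,
}

impl<T> Control<T> {
    fn stop(&self) -> bool {
        matches!(self, Control::Stop)
    }
}

// Hypothetical caller: when the executor reports Stop (the table is missing and
// `drop_if_exists` is true), the procedure finishes without further steps.
fn next_step(ctrl: Control<()>) -> &'static str {
    if ctrl.stop() {
        "done"
    } else {
        "remove-metadata"
    }
}

fn main() {
    assert_eq!(next_step(Control::Stop), "done");
    assert_eq!(next_step(Control::Continue(())), "remove-metadata");
}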


@@ -28,6 +28,7 @@ use common_telemetry::debug;
use store_api::storage::RegionId;
use table::metadata::RawTableInfo;
use crate::datanode_manager::HandleResponse;
use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure;
use crate::ddl::test_util::create_table::build_raw_table_info_from_expr;
use crate::ddl::test_util::{TestColumnDefBuilder, TestCreateTableExprBuilder};
@@ -36,7 +37,7 @@ use crate::error::{Error, Result};
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
use crate::rpc::ddl::CreateTableTask;
use crate::test_util::{new_ddl_context, AffectedRows, MockDatanodeHandler, MockDatanodeManager};
use crate::test_util::{new_ddl_context, MockDatanodeHandler, MockDatanodeManager};
// Note: this code may duplicate code elsewhere.
// However, this is by design; it keeps the tests easy to modify or extend.
@@ -332,9 +333,9 @@ pub struct NaiveDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for NaiveDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<AffectedRows> {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning Ok(0) for request: {request:?}, peer: {peer:?}");
Ok(0)
Ok(HandleResponse::new(0))
}
async fn handle_query(


@@ -26,6 +26,7 @@ use common_procedure_test::MockContextProvider;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use crate::datanode_manager::HandleResponse;
use crate::ddl::create_table::CreateTableProcedure;
use crate::ddl::test_util::create_table::build_raw_table_info_from_expr;
use crate::ddl::test_util::{TestColumnDefBuilder, TestCreateTableExprBuilder};
@@ -34,11 +35,11 @@ use crate::error::{Error, Result};
use crate::key::table_route::TableRouteValue;
use crate::peer::Peer;
use crate::rpc::ddl::CreateTableTask;
use crate::test_util::{new_ddl_context, AffectedRows, MockDatanodeHandler, MockDatanodeManager};
use crate::test_util::{new_ddl_context, MockDatanodeHandler, MockDatanodeManager};
#[async_trait::async_trait]
impl MockDatanodeHandler for () {
async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> Result<AffectedRows> {
async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> Result<HandleResponse> {
unreachable!()
}
@@ -176,7 +177,7 @@ pub struct RetryErrorDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for RetryErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<AffectedRows> {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning retry later for request: {request:?}, peer: {peer:?}");
Err(Error::RetryLater {
source: BoxedError::new(
@@ -220,7 +221,7 @@ pub struct UnexpectedErrorDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for UnexpectedErrorDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<AffectedRows> {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning mock error for request: {request:?}, peer: {peer:?}");
error::UnexpectedSnafu {
err_msg: "mock error",
@@ -260,9 +261,9 @@ pub struct NaiveDatanodeHandler;
#[async_trait::async_trait]
impl MockDatanodeHandler for NaiveDatanodeHandler {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<AffectedRows> {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse> {
debug!("Returning Ok(0) for request: {request:?}, peer: {peer:?}");
Ok(0)
Ok(HandleResponse::new(0))
}
async fn handle_query(


@@ -124,7 +124,7 @@ impl OpenRegion {
}
/// The instruction of downgrading leader region.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct DowngradeRegion {
/// The [RegionId].
pub region_id: RegionId,
@@ -137,7 +137,7 @@ impl Display for DowngradeRegion {
}
/// Upgrades a follower region to leader region.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct UpgradeRegion {
/// The [RegionId].
pub region_id: RegionId,
@@ -151,7 +151,14 @@ pub struct UpgradeRegion {
pub wait_for_replay_timeout: Option<Duration>,
}
#[derive(Debug, Clone, Serialize, Deserialize, Display)]
#[derive(Debug, Clone, Serialize, Deserialize, Display, PartialEq, Eq)]
/// The identifier of cache.
pub enum CacheIdent {
TableId(TableId),
TableName(TableName),
}
#[derive(Debug, Clone, Serialize, Deserialize, Display, PartialEq)]
pub enum Instruction {
/// Opens a region.
///
@@ -165,10 +172,8 @@ pub enum Instruction {
UpgradeRegion(UpgradeRegion),
/// Downgrades a region.
DowngradeRegion(DowngradeRegion),
/// Invalidates a specified table cache.
InvalidateTableIdCache(TableId),
/// Invalidates a specified table name index cache.
InvalidateTableNameCache(TableName),
/// Invalidates a batch of caches.
InvalidateCaches(Vec<CacheIdent>),
}
/// The reply of [UpgradeRegion].

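
The refactor above folds the two single-purpose invalidation variants into one batched InvalidateCaches instruction. A standalone sketch with simplified stand-in types (the real CacheIdent, TableName, and Instruction live in the meta crate; TableName is a struct, not a string) showing how both the id- and name-keyed caches can now be invalidated in one instruction:

type TableId = u32;

#[derive(Debug)]
enum CacheIdent {
    TableId(TableId),
    TableName(String), // stand-in; the real variant carries a TableName struct
}

#[derive(Debug)]
enum Instruction {
    InvalidateCaches(Vec<CacheIdent>),
}

fn main() {
    // One heartbeat instruction invalidates both caches at once.
    let instruction = Instruction::InvalidateCaches(vec![
        CacheIdent::TableId(1024),
        CacheIdent::TableName("greptime.public.my_table".to_string()),
    ]);
    println!("{instruction:?}");
}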

@@ -90,13 +90,13 @@ use crate::kv_backend::KvBackendRef;
use crate::rpc::router::{region_distribution, RegionRoute, RegionStatus};
use crate::DatanodeId;
pub const REMOVED_PREFIX: &str = "__removed";
pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.]*";
pub const MAINTENANCE_KEY: &str = "maintenance";
const DATANODE_TABLE_KEY_PREFIX: &str = "__dn_table";
const TABLE_REGION_KEY_PREFIX: &str = "__table_region";
pub const REMOVED_PREFIX: &str = "__removed";
pub const TABLE_INFO_KEY_PREFIX: &str = "__table_info";
pub const TABLE_NAME_KEY_PREFIX: &str = "__table_name";
pub const CATALOG_NAME_KEY_PREFIX: &str = "__catalog_name";
@@ -140,10 +140,6 @@ lazy_static! {
.unwrap();
}
pub fn to_removed_key(key: &str) -> String {
format!("{REMOVED_PREFIX}-{key}")
}
pub trait TableMetaKey {
fn as_raw_key(&self) -> Vec<u8>;
}
@@ -565,14 +561,10 @@ impl TableMetadataManager {
&table_info.name,
);
let delete_table_name_txn = self
.table_name_manager()
.build_delete_txn(&table_name, table_id)?;
let delete_table_name_txn = self.table_name_manager().build_delete_txn(&table_name)?;
// Deletes table info.
let delete_table_info_txn = self
.table_info_manager()
.build_delete_txn(table_id, table_info_value)?;
let delete_table_info_txn = self.table_info_manager().build_delete_txn(table_id)?;
// Deletes datanode table key value pairs.
let distribution = region_distribution(table_route_value.region_routes()?);
@@ -584,7 +576,7 @@ impl TableMetadataManager {
let delete_table_route_txn = self
.table_route_manager()
.table_route_storage()
.build_delete_txn(table_id, table_route_value)?;
.build_delete_txn(table_id)?;
let txn = Txn::merge_all(vec![
delete_table_name_txn,
@@ -871,7 +863,7 @@ mod tests {
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::table_route::TableRouteValue;
use crate::key::{to_removed_key, DeserializedValueWithBytes, TableMetadataManager};
use crate::key::{DeserializedValueWithBytes, TableMetadataManager};
use crate::kv_backend::memory::MemoryKvBackend;
use crate::peer::Peer;
use crate::rpc::router::{region_distribution, Region, RegionRoute, RegionStatus};
@@ -904,13 +896,6 @@ mod tests {
assert_eq!(decoded.bytes, expected);
}
#[test]
fn test_to_removed_key() {
let key = "test_key";
let removed = "__removed-test_key";
assert_eq!(removed, to_removed_key(key));
}
fn new_test_region_route() -> RegionRoute {
new_region_route(1, 2)
}
@@ -1148,24 +1133,20 @@ mod tests {
.unwrap()
.is_empty());
// Checks removed values
let removed_table_info = table_metadata_manager
let table_info = table_metadata_manager
.table_info_manager()
.get_removed(table_id)
.get(table_id)
.await
.unwrap()
.unwrap()
.into_inner();
assert_eq!(removed_table_info.table_info, table_info);
.unwrap();
assert!(table_info.is_none());
let removed_table_route = table_metadata_manager
let table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_raw_removed(table_id)
.get(table_id)
.await
.unwrap()
.unwrap()
.into_inner();
assert_eq!(removed_table_route.region_routes().unwrap(), region_routes);
.unwrap();
assert!(table_route.is_none());
}
#[tokio::test]


@@ -20,7 +20,7 @@ use table::table_reference::TableReference;
use super::{txn_helper, DeserializedValueWithBytes, TableMetaValue, TABLE_INFO_KEY_PREFIX};
use crate::error::Result;
use crate::key::{to_removed_key, TableMetaKey};
use crate::key::TableMetaKey;
use crate::kv_backend::txn::{Txn, TxnOp, TxnOpResponse};
use crate::kv_backend::KvBackendRef;
use crate::rpc::store::BatchGetRequest;
@@ -157,38 +157,15 @@ impl TableInfoManager {
}
/// Builds a delete table info transaction.
pub(crate) fn build_delete_txn(
&self,
table_id: TableId,
table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
) -> Result<Txn> {
pub(crate) fn build_delete_txn(&self, table_id: TableId) -> Result<Txn> {
let key = TableInfoKey::new(table_id);
let raw_key = key.as_raw_key();
let raw_value = table_info_value.get_raw_bytes();
let removed_key = to_removed_key(&String::from_utf8_lossy(&raw_key));
let txn = Txn::new().and_then(vec![
TxnOp::Delete(raw_key),
TxnOp::Put(removed_key.into_bytes(), raw_value),
]);
let txn = Txn::new().and_then(vec![TxnOp::Delete(raw_key)]);
Ok(txn)
}
#[cfg(test)]
pub async fn get_removed(
&self,
table_id: TableId,
) -> Result<Option<DeserializedValueWithBytes<TableInfoValue>>> {
let key = TableInfoKey::new(table_id).to_string();
let removed_key = to_removed_key(&key).into_bytes();
self.kv_backend
.get(&removed_key)
.await?
.map(|x| DeserializedValueWithBytes::from_inner_slice(&x.value))
.transpose()
}
pub async fn get(
&self,
table_id: TableId,


@@ -22,7 +22,7 @@ use table::metadata::TableId;
use super::{TableMetaValue, TABLE_NAME_KEY_PATTERN, TABLE_NAME_KEY_PREFIX};
use crate::error::{Error, InvalidTableMetadataSnafu, Result};
use crate::key::{to_removed_key, TableMetaKey};
use crate::key::TableMetaKey;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
@@ -195,20 +195,9 @@ impl TableNameManager {
}
/// Builds a delete table name transaction. It is only executed when the primary key comparison succeeds.
pub(crate) fn build_delete_txn(
&self,
key: &TableNameKey<'_>,
table_id: TableId,
) -> Result<Txn> {
pub(crate) fn build_delete_txn(&self, key: &TableNameKey<'_>) -> Result<Txn> {
let raw_key = key.as_raw_key();
let value = TableNameValue::new(table_id);
let raw_value = value.try_as_raw_value()?;
let removed_key = to_removed_key(&String::from_utf8_lossy(&raw_key));
let txn = Txn::new().and_then(vec![
TxnOp::Delete(raw_key),
TxnOp::Put(removed_key.into_bytes(), raw_value),
]);
let txn = Txn::new().and_then(vec![TxnOp::Delete(raw_key)]);
Ok(txn)
}


@@ -25,7 +25,7 @@ use crate::error::{
self, MetadataCorruptionSnafu, Result, SerdeJsonSnafu, TableRouteNotFoundSnafu,
UnexpectedLogicalRouteTableSnafu,
};
use crate::key::{to_removed_key, RegionDistribution, TableMetaKey, TABLE_ROUTE_PREFIX};
use crate::key::{RegionDistribution, TableMetaKey, TABLE_ROUTE_PREFIX};
use crate::kv_backend::txn::{Txn, TxnOp, TxnOpResponse};
use crate::kv_backend::KvBackendRef;
use crate::rpc::router::{region_distribution, RegionRoute};
@@ -485,38 +485,15 @@ impl TableRouteStorage {
/// Builds a delete table route transaction,
/// it expects the remote value to equal the `table_route_value`.
pub(crate) fn build_delete_txn(
&self,
table_id: TableId,
table_route_value: &DeserializedValueWithBytes<TableRouteValue>,
) -> Result<Txn> {
pub(crate) fn build_delete_txn(&self, table_id: TableId) -> Result<Txn> {
let key = TableRouteKey::new(table_id);
let raw_key = key.as_raw_key();
let raw_value = table_route_value.get_raw_bytes();
let removed_key = to_removed_key(&String::from_utf8_lossy(&raw_key));
let txn = Txn::new().and_then(vec![
TxnOp::Delete(raw_key),
TxnOp::Put(removed_key.into_bytes(), raw_value),
]);
let txn = Txn::new().and_then(vec![TxnOp::Delete(raw_key)]);
Ok(txn)
}
#[cfg(test)]
pub async fn get_raw_removed(
&self,
table_id: TableId,
) -> Result<Option<DeserializedValueWithBytes<TableRouteValue>>> {
let key = TableRouteKey::new(table_id).to_string();
let removed_key = to_removed_key(&key).into_bytes();
self.kv_backend
.get(&removed_key)
.await?
.map(|x| DeserializedValueWithBytes::from_inner_slice(&x.value))
.transpose()
}
/// Returns the [`TableRouteValue`].
pub async fn get(&self, table_id: TableId) -> Result<Option<TableRouteValue>> {
let key = TableRouteKey::new(table_id);


@@ -22,7 +22,7 @@ use api::v1::meta::{
DropTableTask as PbDropTableTask, DropTableTasks as PbDropTableTasks, Partition, ProcedureId,
TruncateTableTask as PbTruncateTableTask,
};
use api::v1::{AlterExpr, CreateTableExpr, DropTableExpr, TruncateTableExpr};
use api::v1::{AlterExpr, CreateTableExpr, DropTableExpr, SemanticType, TruncateTableExpr};
use base64::engine::general_purpose;
use base64::Engine as _;
use prost::Message;
@@ -368,6 +368,44 @@ impl CreateTableTask {
pub fn set_table_id(&mut self, table_id: TableId) {
self.table_info.ident.table_id = table_id;
}
/// Sort the columns in [CreateTableExpr] and [RawTableInfo].
///
/// This function won't do any check or verification. Caller should
/// ensure this task is valid.
pub fn sort_columns(&mut self) {
// sort create table expr
// sort column_defs by name
self.create_table
.column_defs
.sort_unstable_by(|a, b| a.name.cmp(&b.name));
// compute new indices of sorted columns
// this part won't do any check or verification.
let mut primary_key_indices = Vec::with_capacity(self.create_table.primary_keys.len());
let mut value_indices =
Vec::with_capacity(self.create_table.column_defs.len() - primary_key_indices.len() - 1);
let mut timestamp_index = None;
for (index, col) in self.create_table.column_defs.iter().enumerate() {
if self.create_table.primary_keys.contains(&col.name) {
primary_key_indices.push(index);
} else if col.semantic_type == SemanticType::Timestamp as i32 {
timestamp_index = Some(index);
} else {
value_indices.push(index);
}
}
// overwrite table info
self.table_info
.meta
.schema
.column_schemas
.sort_unstable_by(|a, b| a.name.cmp(&b.name));
self.table_info.meta.schema.timestamp_index = timestamp_index;
self.table_info.meta.primary_key_indices = primary_key_indices;
self.table_info.meta.value_indices = value_indices;
}
}
impl Serialize for CreateTableTask {
@@ -555,9 +593,11 @@ impl TryFrom<TruncateTableTask> for PbTruncateTableTask {
mod tests {
use std::sync::Arc;
use api::v1::{AlterExpr, CreateTableExpr};
use datatypes::schema::SchemaBuilder;
use table::metadata::RawTableInfo;
use api::v1::{AlterExpr, ColumnDef, CreateTableExpr, SemanticType};
use datatypes::schema::{ColumnSchema, RawSchema, SchemaBuilder};
use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
use store_api::storage::ConcreteDataType;
use table::metadata::{RawTableInfo, RawTableMeta, TableType};
use table::test_util::table_info::test_table_info;
use super::{AlterTableTask, CreateTableTask};
@@ -589,4 +629,108 @@ mod tests {
let de = serde_json::from_slice(&output).unwrap();
assert_eq!(task, de);
}
#[test]
fn test_sort_columns() {
// construct RawSchema
let raw_schema = RawSchema {
column_schemas: vec![
ColumnSchema::new(
"column3".to_string(),
ConcreteDataType::string_datatype(),
true,
),
ColumnSchema::new(
"column1".to_string(),
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
ColumnSchema::new(
"column2".to_string(),
ConcreteDataType::float64_datatype(),
true,
),
],
timestamp_index: Some(1),
version: 0,
};
// construct RawTableMeta
let raw_table_meta = RawTableMeta {
schema: raw_schema,
primary_key_indices: vec![0],
value_indices: vec![2],
engine: METRIC_ENGINE_NAME.to_string(),
next_column_id: 0,
region_numbers: vec![0],
options: Default::default(),
created_on: Default::default(),
partition_key_indices: Default::default(),
};
// construct RawTableInfo
let raw_table_info = RawTableInfo {
ident: Default::default(),
meta: raw_table_meta,
name: Default::default(),
desc: Default::default(),
catalog_name: Default::default(),
schema_name: Default::default(),
table_type: TableType::Base,
};
// construct create table expr
let create_table_expr = CreateTableExpr {
column_defs: vec![
ColumnDef {
name: "column3".to_string(),
semantic_type: SemanticType::Tag as i32,
..Default::default()
},
ColumnDef {
name: "column1".to_string(),
semantic_type: SemanticType::Timestamp as i32,
..Default::default()
},
ColumnDef {
name: "column2".to_string(),
semantic_type: SemanticType::Field as i32,
..Default::default()
},
],
primary_keys: vec!["column3".to_string()],
..Default::default()
};
let mut create_table_task =
CreateTableTask::new(create_table_expr, Vec::new(), raw_table_info);
// Call the sort_columns method
create_table_task.sort_columns();
// Assert that the columns are sorted correctly
assert_eq!(
create_table_task.create_table.column_defs[0].name,
"column1".to_string()
);
assert_eq!(
create_table_task.create_table.column_defs[1].name,
"column2".to_string()
);
assert_eq!(
create_table_task.create_table.column_defs[2].name,
"column3".to_string()
);
// Assert that the table_info is updated correctly
assert_eq!(
create_table_task.table_info.meta.schema.timestamp_index,
Some(0)
);
assert_eq!(
create_table_task.table_info.meta.primary_key_indices,
vec![2]
);
assert_eq!(create_table_task.table_info.meta.value_indices, vec![1]);
}
}


@@ -19,7 +19,9 @@ pub use common_base::AffectedRows;
use common_recordbatch::SendableRecordBatchStream;
use crate::cache_invalidator::DummyCacheInvalidator;
use crate::datanode_manager::{Datanode, DatanodeManager, DatanodeManagerRef, DatanodeRef};
use crate::datanode_manager::{
Datanode, DatanodeManager, DatanodeManagerRef, DatanodeRef, HandleResponse,
};
use crate::ddl::table_meta::TableMetadataAllocator;
use crate::ddl::DdlContext;
use crate::error::Result;
@@ -32,7 +34,7 @@ use crate::wal_options_allocator::WalOptionsAllocator;
#[async_trait::async_trait]
pub trait MockDatanodeHandler: Sync + Send + Clone {
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<AffectedRows>;
async fn handle(&self, peer: &Peer, request: RegionRequest) -> Result<HandleResponse>;
async fn handle_query(
&self,
@@ -62,7 +64,7 @@ struct MockDatanode<T> {
#[async_trait::async_trait]
impl<T: MockDatanodeHandler> Datanode for MockDatanode<T> {
async fn handle(&self, request: RegionRequest) -> Result<AffectedRows> {
async fn handle(&self, request: RegionRequest) -> Result<HandleResponse> {
self.handler.handle(&self.peer, request).await
}


@@ -16,4 +16,5 @@
pub const GREPTIME_EXEC_PREFIX: &str = "greptime_exec_";
/// Execution cost metrics key
pub const GREPTIME_EXEC_COST: &str = "greptime_exec_cost";
pub const GREPTIME_EXEC_READ_COST: &str = "greptime_exec_read_cost";
pub const GREPTIME_EXEC_WRITE_COST: &str = "greptime_exec_write_cost";


@@ -17,4 +17,4 @@
/// since `plugins` crate is at the top depending on crates like `frontend` and `datanode`
mod consts;
pub use consts::{GREPTIME_EXEC_COST, GREPTIME_EXEC_PREFIX};
pub use consts::{GREPTIME_EXEC_PREFIX, GREPTIME_EXEC_READ_COST, GREPTIME_EXEC_WRITE_COST};


@@ -40,7 +40,7 @@ pub struct Output {
/// Original Output struct
/// carrying result data to response/client/user interface
pub enum OutputData {
AffectedRows(usize),
AffectedRows(OutputRows),
RecordBatches(RecordBatches),
Stream(SendableRecordBatchStream),
}
@@ -50,11 +50,11 @@ pub enum OutputData {
pub struct OutputMeta {
/// May exist for query output. One can retrieve execution metrics from this plan.
pub plan: Option<Arc<dyn PhysicalPlan>>,
pub cost: usize,
pub cost: OutputCost,
}
impl Output {
pub fn new_with_affected_rows(affected_rows: usize) -> Self {
pub fn new_with_affected_rows(affected_rows: OutputRows) -> Self {
Self {
data: OutputData::AffectedRows(affected_rows),
meta: Default::default(),
@@ -78,6 +78,13 @@ impl Output {
pub fn new(data: OutputData, meta: OutputMeta) -> Self {
Self { data, meta }
}
pub fn extract_rows_and_cost(&self) -> (OutputRows, OutputCost) {
match self.data {
OutputData::AffectedRows(rows) => (rows, self.meta.cost),
_ => (0, self.meta.cost),
}
}
}
impl Debug for OutputData {
@@ -133,3 +140,6 @@ impl From<&AddColumnLocation> for Location {
}
}
}
pub type OutputRows = usize;
pub type OutputCost = usize;
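
A self-contained sketch of the extract_rows_and_cost contract above, using simplified stand-in types (the real OutputData also has RecordBatches and Stream variants):

type OutputRows = usize;
type OutputCost = usize;

enum OutputData {
    AffectedRows(OutputRows),
    Other, // stands in for RecordBatches / Stream
}

struct OutputMeta {
    cost: OutputCost,
}

struct Output {
    data: OutputData,
    meta: OutputMeta,
}

impl Output {
    // Only AffectedRows contributes rows; every output still reports its cost.
    fn extract_rows_and_cost(&self) -> (OutputRows, OutputCost) {
        match self.data {
            OutputData::AffectedRows(rows) => (rows, self.meta.cost),
            _ => (0, self.meta.cost),
        }
    }
}

fn main() {
    let write = Output {
        data: OutputData::AffectedRows(42),
        meta: OutputMeta { cost: 7 },
    };
    assert_eq!(write.extract_rows_and_cost(), (42, 7));

    let query = Output {
        data: OutputData::Other,
        meta: OutputMeta { cost: 13 },
    };
    assert_eq!(query.extract_rows_and_cost(), (0, 13));
}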


@@ -81,9 +81,7 @@ impl RegionHeartbeatResponseHandler {
Instruction::UpgradeRegion(upgrade_region) => Ok(Box::new(move |handler_context| {
handler_context.handle_upgrade_region_instruction(upgrade_region)
})),
Instruction::InvalidateTableIdCache(_) | Instruction::InvalidateTableNameCache(_) => {
InvalidHeartbeatResponseSnafu.fail()
}
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
}
}
}


@@ -25,6 +25,7 @@ use async_trait::async_trait;
use bytes::Bytes;
use common_error::ext::BoxedError;
use common_error::status_code::StatusCode;
use common_meta::datanode_manager::HandleResponse;
use common_query::logical_plan::Expr;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::{DfPhysicalPlan, OutputData};
@@ -128,7 +129,7 @@ impl RegionServer {
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<AffectedRows> {
) -> Result<HandleResponse> {
self.inner.handle_request(region_id, request).await
}
@@ -267,11 +268,10 @@ impl RegionServerHandler for RegionServer {
results
};
// merge results by simply sum up affected rows.
// only insert/delete will have multiple results.
// merge results by summing up affected rows and merging extensions.
let mut affected_rows = 0;
for result in results {
affected_rows += result;
affected_rows += result.affected_rows;
}
Ok(RegionResponse {
@@ -282,6 +282,7 @@ impl RegionServerHandler for RegionServer {
}),
}),
affected_rows: affected_rows as _,
extension: Default::default(),
})
}
}
@@ -462,7 +463,7 @@ impl RegionServerInner {
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<AffectedRows> {
) -> Result<HandleResponse> {
let request_type = request.request_type();
let _timer = crate::metrics::HANDLE_REGION_REQUEST_ELAPSED
.with_label_values(&[request_type])
@@ -487,7 +488,7 @@ impl RegionServerInner {
let engine = match self.get_engine(region_id, &region_change)? {
CurrentEngine::Engine(engine) => engine,
CurrentEngine::EarlyReturn(rows) => return Ok(rows),
CurrentEngine::EarlyReturn(rows) => return Ok(HandleResponse::new(rows)),
};
// Sets corresponding region status to registering/deregistering before the operation.
@@ -502,7 +503,10 @@ impl RegionServerInner {
// Sets corresponding region status to ready.
self.set_region_status_ready(region_id, engine, region_change)
.await?;
Ok(result)
Ok(HandleResponse {
affected_rows: result.affected_rows,
extension: result.extension,
})
}
Err(err) => {
// Removes the region status if the operation fails.
@@ -645,6 +649,7 @@ impl RegionServerInner {
.decode(Bytes::from(plan), catalog_list, "", "")
.await
.context(DecodeLogicalPlanSnafu)?;
let result = self
.query_engine
.execute(logical_plan.into(), ctx)
@@ -916,11 +921,11 @@ mod tests {
RegionEngineWithStatus::Registering(engine.clone()),
);
let affected_rows = mock_region_server
let response = mock_region_server
.handle_request(region_id, RegionRequest::Create(create_req))
.await
.unwrap();
assert_eq!(affected_rows, 0);
assert_eq!(response.affected_rows, 0);
let status = mock_region_server
.inner
@@ -931,7 +936,7 @@ mod tests {
assert!(matches!(status, RegionEngineWithStatus::Registering(_)));
let affected_rows = mock_region_server
let response = mock_region_server
.handle_request(
region_id,
RegionRequest::Open(RegionOpenRequest {
@@ -943,7 +948,7 @@ mod tests {
)
.await
.unwrap();
assert_eq!(affected_rows, 0);
assert_eq!(response.affected_rows, 0);
let status = mock_region_server
.inner
@@ -971,11 +976,11 @@ mod tests {
RegionEngineWithStatus::Deregistering(engine.clone()),
);
let affected_rows = mock_region_server
let response = mock_region_server
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.unwrap();
assert_eq!(affected_rows, 0);
assert_eq!(response.affected_rows, 0);
let status = mock_region_server
.inner
@@ -990,11 +995,11 @@ mod tests {
RegionEngineWithStatus::Deregistering(engine.clone()),
);
let affected_rows = mock_region_server
let response = mock_region_server
.handle_request(region_id, RegionRequest::Close(RegionCloseRequest {}))
.await
.unwrap();
assert_eq!(affected_rows, 0);
assert_eq!(response.affected_rows, 0);
let status = mock_region_server
.inner

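
The RegionServerHandler change above merges per-region results by summing affected rows and merging extensions into a single response. A standalone sketch with a simplified stand-in for HandleResponse (the field types here are assumptions for illustration):

use std::collections::HashMap;

struct HandleResponse {
    affected_rows: usize,
    extension: HashMap<String, Vec<u8>>,
}

fn merge(results: Vec<HandleResponse>) -> HandleResponse {
    let mut affected_rows = 0;
    let mut extension = HashMap::new();
    for result in results {
        affected_rows += result.affected_rows;
        // Later entries overwrite earlier ones on key collision.
        extension.extend(result.extension);
    }
    HandleResponse {
        affected_rows,
        extension,
    }
}

fn main() {
    let merged = merge(vec![
        HandleResponse {
            affected_rows: 2,
            extension: HashMap::new(),
        },
        HandleResponse {
            affected_rows: 3,
            extension: HashMap::new(),
        },
    ]);
    assert_eq!(merged.affected_rows, 5);
    assert!(merged.extension.is_empty());
}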

@@ -31,7 +31,7 @@ use query::query_engine::DescribeResult;
use query::{QueryEngine, QueryEngineContext};
use session::context::QueryContextRef;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{RegionEngine, RegionRole, SetReadonlyResponse};
use store_api::region_engine::{RegionEngine, RegionHandleResult, RegionRole, SetReadonlyResponse};
use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::storage::{RegionId, ScanRequest};
use table::TableRef;
@@ -166,16 +166,18 @@ impl RegionEngine for MockRegionEngine {
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<AffectedRows, BoxedError> {
) -> Result<RegionHandleResult, BoxedError> {
if let Some(delay) = self.handle_request_delay {
tokio::time::sleep(delay).await;
}
if let Some(mock_fn) = &self.handle_request_mock_fn {
return mock_fn(region_id, request).map_err(BoxedError::new);
return mock_fn(region_id, request)
.map_err(BoxedError::new)
.map(RegionHandleResult::new);
};
let _ = self.sender.send((region_id, request)).await;
Ok(0)
Ok(RegionHandleResult::new(0))
}
async fn handle_query(


@@ -143,11 +143,22 @@ impl ColumnSchema {
}
/// Sets the nullability of the column to `true`.
/// Similar to [set_nullable] but takes ownership and returns an owned value.
///
/// [set_nullable]: Self::set_nullable
pub fn with_nullable_set(mut self) -> Self {
self.is_nullable = true;
self
}
/// Sets the nullability of the column to `true`.
/// Similar to [with_nullable_set] but does not take ownership.
///
/// [with_nullable_set]: Self::with_nullable_set
pub fn set_nullable(&mut self) {
self.is_nullable = true;
}
/// Creates a new [`ColumnSchema`] with given metadata.
pub fn with_metadata(mut self, metadata: Metadata) -> Self {
self.metadata = metadata;

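
The two new setters above differ only in ownership: with_nullable_set is builder-style while set_nullable mutates in place. A minimal stand-in sketch (not the real datatypes ColumnSchema) contrasting the two:

struct ColumnSchema {
    name: String,
    is_nullable: bool,
}

impl ColumnSchema {
    fn new(name: &str, is_nullable: bool) -> Self {
        Self {
            name: name.to_string(),
            is_nullable,
        }
    }

    // Builder style: takes ownership and returns the modified schema.
    fn with_nullable_set(mut self) -> Self {
        self.is_nullable = true;
        self
    }

    // In-place style: mutates an existing schema without moving it.
    fn set_nullable(&mut self) {
        self.is_nullable = true;
    }
}

fn main() {
    let owned = ColumnSchema::new("host", false).with_nullable_set();
    assert!(owned.is_nullable);

    let mut in_place = ColumnSchema::new("cpu", false);
    in_place.set_nullable();
    assert!(in_place.is_nullable);
    assert_eq!(in_place.name, "cpu");
}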

@@ -24,7 +24,7 @@ use common_telemetry::{error, info};
use object_store::ObjectStore;
use snafu::{ensure, OptionExt};
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{RegionEngine, RegionRole, SetReadonlyResponse};
use store_api::region_engine::{RegionEngine, RegionHandleResult, RegionRole, SetReadonlyResponse};
use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
RegionRequest,
@@ -60,7 +60,7 @@ impl RegionEngine for FileRegionEngine {
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<AffectedRows, BoxedError> {
) -> Result<RegionHandleResult, BoxedError> {
self.inner
.handle_request(region_id, request)
.await
@@ -154,8 +154,8 @@ impl EngineInner {
&self,
region_id: RegionId,
request: RegionRequest,
) -> EngineResult<AffectedRows> {
match request {
) -> EngineResult<RegionHandleResult> {
let result = match request {
RegionRequest::Create(req) => self.handle_create(region_id, req).await,
RegionRequest::Drop(req) => self.handle_drop(region_id, req).await,
RegionRequest::Open(req) => self.handle_open(region_id, req).await,
@@ -164,7 +164,8 @@ impl EngineInner {
operation: request.to_string(),
}
.fail(),
}
};
result.map(RegionHandleResult::new)
}
async fn stop(&self) -> EngineResult<()> {


@@ -25,6 +25,7 @@ num-traits = "0.2"
serde.workspace = true
servers.workspace = true
session.workspace = true
smallvec.workspace = true
snafu.workspace = true
tokio.workspace = true
tonic.workspace = true


@@ -19,3 +19,4 @@ mod adapter;
mod expr;
mod plan;
mod repr;
mod utils;


@@ -31,7 +31,7 @@ pub(crate) use relation::{RelationDesc, RelationType};
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use crate::expr::error::{CastValueSnafu, EvalError};
use crate::expr::error::{CastValueSnafu, EvalError, InvalidArgumentSnafu};
/// System-wide record count difference type. Useful for capturing data changes
///
@@ -39,17 +39,32 @@ use crate::expr::error::{CastValueSnafu, EvalError};
/// and +/-n means insert/remove multiple duplicate records.
pub type Diff = i64;
/// System-wide default timestamp type
/// System-wide default timestamp type, in milliseconds
pub type Timestamp = i64;
/// System-wide default duration type, in milliseconds
pub type Duration = i64;
/// Default type for a repr of changes to a collection.
pub type DiffRow = (Row, Timestamp, Diff);
pub type KeyValDiffRow = ((Row, Row), Timestamp, Diff);
/// Converts a value that is an `i64` or a time type (Date/DateTime/Timestamp) into the internal timestamp type
pub fn value_to_internal_ts(value: Value) -> Result<Timestamp, EvalError> {
let is_supported_time_type = |arg: &Value| {
let ty = arg.data_type();
matches!(
ty,
ConcreteDataType::Date(..)
| ConcreteDataType::DateTime(..)
| ConcreteDataType::Timestamp(..)
)
};
match value {
Value::DateTime(ts) => Ok(ts.val()),
arg => {
Value::Int64(ts) => Ok(ts),
arg if is_supported_time_type(&arg) => {
let arg_ty = arg.data_type();
let res = cast(arg, &ConcreteDataType::datetime_datatype()).context({
CastValueSnafu {
@@ -63,6 +78,10 @@ pub fn value_to_internal_ts(value: Value) -> Result<Timestamp, EvalError> {
unreachable!()
}
}
_ => InvalidArgumentSnafu {
reason: format!("Expect a time type or i64, got {:?}", value.data_type()),
}
.fail(),
}
}
@@ -145,24 +164,58 @@ impl From<Row> for ProtoRow {
ProtoRow { values }
}
}
#[cfg(test)]
mod test {
use common_time::{Date, DateTime};
#[test]
fn test_row() {
let row = Row::empty();
let row_1 = Row::new(vec![]);
assert_eq!(row, row_1);
let mut row_2 = Row::new(vec![Value::Int32(1), Value::Int32(2)]);
assert_eq!(row_2.get(0), Some(&Value::Int32(1)));
row_2.clear();
assert_eq!(row_2.get(0), None);
row_2
.packer()
.extend(vec![Value::Int32(1), Value::Int32(2)]);
assert_eq!(row_2.get(0), Some(&Value::Int32(1)));
row_2.extend(vec![Value::Int32(1), Value::Int32(2)]);
assert_eq!(row_2.len(), 4);
let row_3 = Row::pack(row_2.into_iter());
assert_eq!(row_3.len(), 4);
let row_4 = Row::pack(row_3.iter().cloned());
assert_eq!(row_3, row_4);
use super::*;
#[test]
fn test_row() {
let row = Row::empty();
let row_1 = Row::new(vec![]);
assert_eq!(row, row_1);
let mut row_2 = Row::new(vec![Value::Int32(1), Value::Int32(2)]);
assert_eq!(row_2.get(0), Some(&Value::Int32(1)));
row_2.clear();
assert_eq!(row_2.get(0), None);
row_2
.packer()
.extend(vec![Value::Int32(1), Value::Int32(2)]);
assert_eq!(row_2.get(0), Some(&Value::Int32(1)));
row_2.extend(vec![Value::Int32(1), Value::Int32(2)]);
assert_eq!(row_2.len(), 4);
let row_3 = Row::pack(row_2.into_iter());
assert_eq!(row_3.len(), 4);
let row_4 = Row::pack(row_3.iter().cloned());
assert_eq!(row_3, row_4);
}
#[test]
fn test_cast_to_internal_ts() {
{
let a = Value::from(1i32);
let b = Value::from(1i64);
let c = Value::DateTime(DateTime::new(1i64));
let d = Value::from(1.0);
assert!(value_to_internal_ts(a).is_err());
assert_eq!(value_to_internal_ts(b).unwrap(), 1i64);
assert_eq!(value_to_internal_ts(c).unwrap(), 1i64);
assert!(value_to_internal_ts(d).is_err());
}
{
// time related type
let a = Value::Date(Date::new(1));
assert_eq!(value_to_internal_ts(a).unwrap(), 86400 * 1000i64);
let b = Value::Timestamp(common_time::Timestamp::new_second(1));
assert_eq!(value_to_internal_ts(b).unwrap(), 1000i64);
let c = Value::Time(common_time::time::Time::new_second(1));
assert!(matches!(
value_to_internal_ts(c),
Err(EvalError::InvalidArgument { .. })
));
}
}
}

src/flow/src/utils.rs (new file, 784 lines)

@@ -0,0 +1,784 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, BTreeSet};
use std::ops::Bound;
use std::sync::Arc;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use smallvec::{smallvec, SmallVec};
use tokio::sync::{Mutex, RwLock};
use crate::expr::error::InternalSnafu;
use crate::expr::{EvalError, ScalarExpr};
use crate::repr::{value_to_internal_ts, Diff, DiffRow, Duration, KeyValDiffRow, Row, Timestamp};
pub type Batch = BTreeMap<Row, SmallVec<[DiffRow; 2]>>;
pub type Spine = BTreeMap<Timestamp, Batch>;
/// Determines when a key should expire according to the event timestamp contained in the key.
/// If a key is expired, any future updates to it should be ignored.
/// Note that a key expires by its event timestamp (contained in the key), not by the time it was inserted (the system timestamp).
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct KeyExpiryManager {
/// a map from event timestamp to key, used for expiring keys
event_ts_to_key: BTreeMap<Timestamp, BTreeSet<Row>>,
/// duration after which a key is considered expired, and will be removed from state
key_expiration_duration: Option<Duration>,
/// used to extract the event timestamp from a key row
event_timestamp_from_row: Option<ScalarExpr>,
}
impl KeyExpiryManager {
/// Extracts the event timestamp from a key row.
///
/// Returns `None` if no expire state is set.
pub fn extract_event_ts(&self, row: &Row) -> Result<Option<Timestamp>, EvalError> {
let ts = self
.event_timestamp_from_row
.as_ref()
.map(|e| e.eval(&row.inner))
.transpose()?
.map(value_to_internal_ts)
.transpose()?;
Ok(ts)
}
/// Returns the expiration threshold for time `now`, computed as `now - expiration_duration`; keys older than this are considered expired.
pub fn compute_expiration_timestamp(&self, now: Timestamp) -> Option<Timestamp> {
self.key_expiration_duration.map(|d| now - d)
}
/// Updates the event-timestamp-to-key mapping.
///
/// If the given key is already expired at `now` (i.e. its event timestamp is less than `now - expiration_duration`), returns how long it has been expired;
/// if it is not expired, returns `None`.
pub fn update_event_ts(
&mut self,
now: Timestamp,
row: &Row,
) -> Result<Option<Duration>, EvalError> {
let ts = if let Some(event_ts) = self.extract_event_ts(row)? {
let ret = self.compute_expiration_timestamp(now).and_then(|e| {
if e > event_ts {
// return how much time it's expired
Some(e - event_ts)
} else {
None
}
});
if let Some(expire_by) = ret {
return Ok(Some(expire_by));
}
event_ts
} else {
return Ok(None);
};
self.event_ts_to_key
.entry(ts)
.or_default()
.insert(row.clone());
Ok(None)
}
}
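// Illustrative example of the expiry arithmetic above (not part of the original file):
// with `key_expiration_duration = Some(10)` and `now = 15`, the expiration timestamp is
// `15 - 10 = 5`; a key whose event timestamp is 3 is already late by `5 - 3 = 2`, so
// `update_event_ts` returns `Ok(Some(2))`, while a key with event timestamp 7 is kept
// and `Ok(None)` is returned.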
/// A shared key-value state used by various operators
/// in dataflow execution.
///
/// For example, an MFP operator with a temporal filter needs to store its future output so that it can insert a record now and delete it later.
/// To get all the updates needed within a time span, use [`get_updates_in_range`].
///
/// A reduce operator needs the full state of its output, so that it can query (and modify by calling [`apply_updates`])
/// the existing state; it also needs a way to expire keys. To get a key's current value, use [`get`] with the time being `now`.
/// The pipeline looks like:
/// `mfp operator -> arrange(store futures only, no expire) -> reduce operator <-> arrange(full, with key expiring time) -> output`
///
/// Note the two-way arrow between the reduce operator and its arrangement: the reduce operator both queries and updates the existing state.
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct Arrangement {
/// All pending updates waiting to be applied,
/// arranged as time -> (key -> (new_val, diff)).
/// Updates whose time is greater than the previous key but less than or equal to the current key
/// are categorized under the current key.
///
/// That is: `last key < update time <= current key`,
/// and times before the first key are simply categorized under the first key.
/// The first key is always `now`, which holds the consolidated updates from the past and represents the current state of the arrangement.
///
/// Note that for a given time and key there may be several updates, and they should be applied in order.
/// For the consolidated batch (i.e. the batch representing now), there should be only one update per key with `diff == 1`.
///
/// Since a key usually gets updated by a delete followed by an insert, a small vec of size 2 makes sense.
/// TODO: batch size balancing?
spine: Spine,
/// If set to false, the current (consolidated) value of the arrangement is not maintained, which is useful for cases like `map -> arrange -> reduce`.
full_arrangement: bool,
/// Flag marking whether this arrangement has been written to; an unwritten arrangement can still be cloned and shared.
is_written: bool,
/// Manages the expire state of the arrangement.
expire_state: Option<KeyExpiryManager>,
/// The time when the last compaction happened, also known as the current time.
last_compaction_time: Option<Timestamp>,
}
impl Arrangement {
pub fn new() -> Self {
Self {
spine: Default::default(),
full_arrangement: false,
is_written: false,
expire_state: None,
last_compaction_time: None,
}
}
/// Applies updates into the spine; all updates should have timestamps larger than the spine's first key.
///
/// Returns the maximum lateness (how long past expiration) among the updates if any key is already expired.
pub fn apply_updates(
&mut self,
now: Timestamp,
updates: Vec<KeyValDiffRow>,
) -> Result<Option<Duration>, EvalError> {
let mut max_late_by: Option<Duration> = None;
if !self.is_written {
self.is_written = true;
}
for ((key, val), ts, diff) in updates {
// keep rows with expired event timestamp from being updated
if let Some(s) = &mut self.expire_state {
if let Some(late_by) = s.update_event_ts(now, &key)? {
max_late_by = Some(max_late_by.map_or(late_by, |v| v.max(late_by)));
continue;
}
}
// the first batch with key that's greater or equal to ts
let batch = if let Some((_, batch)) = self.spine.range_mut(ts..).next() {
batch
} else {
// if no batch with `batch key >= ts`, then create a new batch with key being `ts`
self.spine.entry(ts).or_default()
};
{
let key_updates = batch.entry(key).or_insert(smallvec![]);
key_updates.push((val, ts, diff));
// a stable sort keeps updates in insertion order
// without changing the order of updates within the same tick
key_updates.sort_by_key(|r| r.1);
}
}
Ok(max_late_by)
}
/// Finds the time of the next update in the future,
/// that is, the next update with `timestamp > now`.
pub fn get_next_update_time(&self, now: &Timestamp) -> Option<Timestamp> {
// iter over batches that only have updates of `timestamp>now` and find the first non empty batch, then get the minimum timestamp in that batch
let next_batches = self.spine.range((Bound::Excluded(now), Bound::Unbounded));
for (_ts, batch) in next_batches {
let min_ts = batch
.iter()
.flat_map(|(_k, v)| v.iter().map(|(_, ts, _)| *ts))
.min();
if let Some(min_ts) = min_ts {
return Some(min_ts);
} else {
continue;
}
}
// all batches are empty, return None
None
}
/// get the last compaction time
pub fn get_compaction(&self) -> Option<Timestamp> {
self.last_compaction_time
}
/// Splits the spine at `now` and returns the part that is at or before `now` (inclusive).
fn split_lte(&mut self, now: &Timestamp) -> Spine {
let mut before = self.spine.split_off(&(now + 1));
std::mem::swap(&mut before, &mut self.spine);
// if before's last key == now, then all the keys we needed are found
if before
.last_key_value()
.map(|(k, _v)| *k == *now)
.unwrap_or(false)
{
return before;
}
// also need to move all keys from the first batch in spine with timestamp<=now to before
// we know that all remaining keys to be split off are last key < key <= now, we will make them into a new batch
if let Some(mut first_batch) = self.spine.first_entry() {
let mut new_batch: Batch = Default::default();
// remove all keys with val of empty vec
first_batch.get_mut().retain(|key, updates| {
// remove keys <= now from updates
updates.retain(|(val, ts, diff)| {
if *ts <= *now {
new_batch.entry(key.clone()).or_insert(smallvec![]).push((
val.clone(),
*ts,
*diff,
));
}
*ts > *now
});
!updates.is_empty()
});
before.entry(*now).or_default().extend(new_batch);
}
before
}
/// Advances time to `now` and consolidates all older updates (`now` included) into the first key.
///
/// Returns the maximum lateness (how long past expiration) among the updates if any key is already expired.
pub fn set_compaction(&mut self, now: Timestamp) -> Result<Option<Duration>, EvalError> {
let mut max_late_by: Option<Duration> = None;
let should_compact = self.split_lte(&now);
self.last_compaction_time = Some(now);
// if a full arrangement is not needed, we can just discard everything before and including now
if !self.full_arrangement {
return Ok(None);
}
// else we update them into current key value pairs
let mut compacted_batch: BTreeMap<Row, SmallVec<[DiffRow; 2]>> = Default::default();
for (_, batch) in should_compact {
for (key, updates) in batch {
if let Some(s) = &mut self.expire_state {
if let Some(late_by) = s.update_event_ts(now, &key)? {
max_late_by = Some(max_late_by.map_or(late_by, |v| v.max(late_by)));
continue;
}
}
// if diff cancel out each other, then remove the key
let mut old_row: Option<DiffRow> =
compacted_batch.get(&key).and_then(|v| v.first()).cloned();
for new_row in updates {
old_row = compact_diff_row(old_row, &new_row);
}
if let Some(compacted_update) = old_row {
compacted_batch.insert(key, smallvec![compacted_update]);
} else {
compacted_batch.remove(&key);
}
}
}
// insert the compacted batch into spine with key being `now`
self.spine.insert(now, compacted_batch);
Ok(max_late_by)
}
/// get the updates of the arrangement from the given range of time
pub fn get_updates_in_range<R: std::ops::RangeBounds<Timestamp> + Clone>(
&self,
range: R,
) -> Vec<KeyValDiffRow> {
let mut result = vec![];
// three parts:
// 1. the starting batch, whose key >= range.start but which may contain updates that are not in range
// 2. the batches with keys fully inside the range
// 3. the last batch, whose first key > range.end but which may still contain updates that are in range
let mut is_first = true;
for (_ts, batch) in self.spine.range(range.clone()) {
if is_first {
for (key, updates) in batch {
let iter = updates
.iter()
.filter(|(_val, ts, _diff)| range.contains(ts))
.map(|(val, ts, diff)| ((key.clone(), val.clone()), *ts, *diff));
result.extend(iter);
}
is_first = false;
} else {
for (key, updates) in batch.clone() {
result.extend(
updates
.iter()
.map(|(val, ts, diff)| ((key.clone(), val.clone()), *ts, *diff)),
);
}
}
}
// deal with the range boundaries (start and end included):
// for the next batch with upper_bound >= range.end
// we still need to search it for updates within the range
let neg_bound = match range.end_bound() {
Bound::Included(b) => {
// if boundary is aligned, the last batch in range actually cover the full range
// then there will be no further keys we need in the next batch
if self.spine.contains_key(b) {
return result;
}
Bound::Excluded(*b)
}
Bound::Excluded(b) => Bound::Included(*b),
Bound::Unbounded => return result,
};
let search_range = (neg_bound, Bound::Unbounded);
if let Some(last_batch) = self.spine.range(search_range).next() {
for (key, updates) in last_batch.1 {
let iter = updates
.iter()
.filter(|(_val, ts, _diff)| range.contains(ts))
.map(|(val, ts, diff)| ((key.clone(), val.clone()), *ts, *diff));
result.extend(iter);
}
};
result
}
/// Expires keys as of `now` that are older than the expiration time; intended to reduce memory usage and limit how late data may arrive.
pub fn trunc_expired(&mut self, now: Timestamp) {
if let Some(s) = &mut self.expire_state {
let expire_time = if let Some(t) = s.compute_expiration_timestamp(now) {
t
} else {
// never expire
return;
};
// find all keys smaller than or equal expire_time and silently remove them
let mut after = s.event_ts_to_key.split_off(&(expire_time + 1));
std::mem::swap(&mut s.event_ts_to_key, &mut after);
let before = after;
for key in before.into_iter().flat_map(|i| i.1.into_iter()) {
for (_ts, batch) in self.spine.iter_mut() {
batch.remove(&key);
}
}
}
}
/// Gets the current value of a key,
/// useful for querying existing keys (i.e. reduce and join operators need to query existing state).
pub fn get(&self, now: Timestamp, key: &Row) -> Option<(Row, Timestamp, Diff)> {
if self.full_arrangement
&& self
.spine
.first_key_value()
.map(|(ts, _)| *ts >= now)
.unwrap_or(false)
{
self.spine
.first_key_value()
.and_then(|(_ts, batch)| batch.get(key).and_then(|v| v.first()).cloned())
} else {
// check keys <= now to know current value
let mut final_val = None;
let with_extra_batch = {
let unaligned = self.spine.range(..=now);
if unaligned
.clone()
.last()
.map(|(ts, _)| *ts == now)
.unwrap_or(false)
{
// this extra chain is there just to make type the same
unaligned.chain(None)
} else {
// if the last key is not equal to now, then we need to include the next batch
// because we know last batch key < now < next batch key
// therefore next batch may contain updates that we want
unaligned.chain(
self.spine
.range((Bound::Excluded(now), Bound::Unbounded))
.next(),
)
}
};
for (ts, batch) in with_extra_batch {
if let Some(new_rows) = batch.get(key).map(|v| v.iter()) {
if *ts <= now {
for new_row in new_rows {
final_val = compact_diff_row(final_val, new_row);
}
} else {
for new_row in new_rows.filter(|new_row| new_row.1 <= now) {
final_val = compact_diff_row(final_val, new_row);
}
}
}
}
final_val
}
}
}
fn compact_diff_row(old_row: Option<DiffRow>, new_row: &DiffRow) -> Option<DiffRow> {
let (val, ts, diff) = new_row;
match (old_row, diff) {
(Some((row, _old_ts, old_diff)), diff) if row == *val && old_diff + diff == 0 => {
// the key is deleted now
None
}
(Some((row, _old_ts, old_diff)), diff) if row == *val && old_diff + diff != 0 => {
Some((row, *ts, old_diff + *diff))
}
// if the old value does not equal the new value, simply consider it overwritten; each key can only have one value,
// so it makes sense to just replace the old value with the new one
_ => Some((val.clone(), *ts, *diff)),
}
}
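// Worked example for compact_diff_row (not part of the original file): starting from
// `Some((row_a, 1, 1))`, applying `(row_a, 2, -1)` cancels the diffs (1 + -1 == 0) and
// yields `None`, i.e. the key is deleted; applying `(row_a, 2, 1)` instead yields
// `Some((row_a, 2, 2))`; and applying a different value `(row_b, 2, 1)` overwrites the
// old value, yielding `Some((row_b, 2, 1))`.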
/// A handle to the inner Arrangement; it can be cloned and shared, and is useful for querying the inner state.
#[derive(Debug)]
pub struct ArrangeHandler {
inner: Arc<RwLock<Arrangement>>,
}
impl ArrangeHandler {
pub fn from(arr: Arrangement) -> Self {
Self {
inner: Arc::new(RwLock::new(arr)),
}
}
pub fn write(&self) -> tokio::sync::RwLockWriteGuard<'_, Arrangement> {
self.inner.blocking_write()
}
pub fn read(&self) -> tokio::sync::RwLockReadGuard<'_, Arrangement> {
self.inner.blocking_read()
}
/// clone the handler, but only keep the future updates
pub fn clone_future_only(&self) -> Option<Self> {
if self.read().is_written {
return None;
}
Some(Self {
inner: self.inner.clone(),
})
}
/// Clones the handle and keeps all updates.
/// Cloning is disallowed after the arrangement has been written to,
/// because that would lose the data written before the clone.
pub fn clone_full_arrange(&self) -> Option<Self> {
if self.read().is_written {
return None;
}
let mut arr = self.write();
arr.full_arrangement = true;
drop(arr);
Some(Self {
inner: self.inner.clone(),
})
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_future_get() {
// test if apply only future updates, whether get(future_time) can operate correctly
let arr = Arrangement::new();
let arr = ArrangeHandler::from(arr);
{
let mut arr = arr.write();
let key = Row::new(vec![1.into()]);
let updates: Vec<KeyValDiffRow> = vec![
((key.clone(), Row::new(vec![2.into()])), 1, 1),
((key.clone(), Row::new(vec![3.into()])), 2, 1),
((key.clone(), Row::new(vec![4.into()])), 3, 1),
];
// all updates above are future updates
arr.apply_updates(0, updates).unwrap();
assert_eq!(arr.get(1, &key), Some((Row::new(vec![2.into()]), 1, 1)));
assert_eq!(arr.get(2, &key), Some((Row::new(vec![3.into()]), 2, 1)));
assert_eq!(arr.get(3, &key), Some((Row::new(vec![4.into()]), 3, 1)));
}
}
#[test]
fn only_save_future_updates() {
// the mfp operator's temporal filter needs to record future updates so that it can delete them on time
// i.e. insert a record now, delete this record 5 minutes later
// it only needs to keep future updates (if the downstream doesn't need a full arrangement, that is)
let arr = Arrangement::new();
let arr = ArrangeHandler::from(arr);
let arr1 = arr.clone_full_arrange();
assert!(arr1.is_some());
let arr2 = arr.clone_future_only();
assert!(arr2.is_some());
{
let mut arr = arr.write();
let updates: Vec<KeyValDiffRow> = vec![
((Row::new(vec![1.into()]), Row::new(vec![2.into()])), 1, 1),
((Row::new(vec![2.into()]), Row::new(vec![3.into()])), 2, 1),
((Row::new(vec![3.into()]), Row::new(vec![4.into()])), 3, 1),
];
// all updates above are future updates
arr.apply_updates(0, updates).unwrap();
assert_eq!(
arr.get_updates_in_range(1..=1),
vec![((Row::new(vec![1.into()]), Row::new(vec![2.into()])), 1, 1)]
);
assert_eq!(arr.spine.len(), 3);
arr.set_compaction(1).unwrap();
assert_eq!(arr.spine.len(), 3);
}
let arr2 = arr.clone_full_arrange();
assert!(arr2.is_none());
{
let mut arr = arr.write();
assert_eq!(arr.spine.len(), 3);
arr.set_compaction(2).unwrap();
assert_eq!(arr.spine.len(), 2);
}
}
#[test]
fn test_reduce_expire_keys() {
let mut arr = Arrangement::new();
let expire_state = KeyExpiryManager {
event_ts_to_key: Default::default(),
key_expiration_duration: Some(10),
event_timestamp_from_row: Some(ScalarExpr::Column(0)),
};
let expire_state = Some(expire_state);
arr.expire_state = expire_state;
arr.full_arrangement = true;
let arr = ArrangeHandler::from(arr);
let now = 0;
let key = Row::new(vec![1i64.into()]);
let updates: Vec<KeyValDiffRow> = vec![
(
(Row::new(vec![1i64.into()]), Row::new(vec![2.into()])),
1,
1,
),
(
(Row::new(vec![2i64.into()]), Row::new(vec![3.into()])),
2,
1,
),
(
(Row::new(vec![3i64.into()]), Row::new(vec![4.into()])),
3,
1,
),
];
{
let mut arr = arr.write();
arr.apply_updates(now, updates.clone()).unwrap();
// repeat the same updates means having multiple updates for the same key
arr.apply_updates(now, updates).unwrap();
assert_eq!(
arr.get_updates_in_range(1..=1),
vec![
((key.clone(), Row::new(vec![2.into()])), 1, 1),
((key.clone(), Row::new(vec![2.into()])), 1, 1)
]
);
assert_eq!(arr.spine.len(), 3);
arr.set_compaction(1).unwrap();
assert_eq!(arr.spine.len(), 3);
}
{
let mut arr = arr.write();
assert_eq!(arr.spine.len(), 3);
assert_eq!(arr.get(10, &key), Some((Row::new(vec![2.into()]), 1, 2)));
arr.trunc_expired(10);
assert_eq!(arr.spine.len(), 3);
arr.trunc_expired(11);
assert_eq!(arr.get(11, &key), None);
assert_eq!(arr.spine.len(), 3);
assert_eq!(arr.expire_state.as_ref().unwrap().event_ts_to_key.len(), 2);
arr.trunc_expired(12);
assert_eq!(arr.spine.len(), 3);
assert_eq!(arr.expire_state.as_ref().unwrap().event_ts_to_key.len(), 1);
}
}
#[test]
fn test_apply_expired_keys() {
// apply updates that include an expired key
let mut arr = Arrangement::new();
let expire_state = KeyExpiryManager {
event_ts_to_key: Default::default(),
key_expiration_duration: Some(10),
event_timestamp_from_row: Some(ScalarExpr::Column(0)),
};
let expire_state = Some(expire_state);
arr.expire_state = expire_state;
let arr = ArrangeHandler::from(arr);
let updates: Vec<KeyValDiffRow> = vec![
(
(Row::new(vec![1i64.into()]), Row::new(vec![2.into()])),
1,
1,
),
(
(Row::new(vec![2i64.into()]), Row::new(vec![3.into()])),
2,
1,
),
(
(Row::new(vec![3i64.into()]), Row::new(vec![4.into()])),
3,
1,
),
(
(Row::new(vec![3i64.into()]), Row::new(vec![4.into()])),
3,
1,
),
(
(Row::new(vec![1i64.into()]), Row::new(vec![42.into()])),
10,
1,
),
];
{
let mut arr = arr.write();
arr.apply_updates(11, updates).unwrap();
assert_eq!(
arr.get(11, &Row::new(vec![1i64.into()])),
Some((Row::new(vec![42.into()]), 10, 1))
);
arr.trunc_expired(12);
assert_eq!(arr.get(12, &Row::new(vec![1i64.into()])), None);
}
}
/// test that when split_lte is given ranges not aligned with batch boundaries,
/// it still retrieves all updates in the range, including updates in the batches
/// near the boundary of the input range
#[test]
fn test_split_off() {
let mut arr = Arrangement::new();
// manually create batches ..=1 and 2..=3
arr.spine.insert(1, Default::default());
arr.spine.insert(3, Default::default());
arr.apply_updates(
2,
vec![((Row::new(vec![1.into()]), Row::new(vec![2.into()])), 2, 1)],
)
.unwrap();
// the update falls into the range 2..=3
let mut arr1 = arr.clone();
{
assert_eq!(arr.get_next_update_time(&1), Some(2));
// split is expected to take batch ..=1 and create a new batch 2..=2 (which contains the update)
let split = &arr.split_lte(&2);
assert_eq!(split.len(), 2);
assert_eq!(split[&2].len(), 1);
let _ = &arr.split_lte(&3);
assert_eq!(arr.get_next_update_time(&1), None);
}
{
// taking all updates with timestamp <= 1 yields no updates
let split = &arr1.split_lte(&1);
assert_eq!(split.len(), 1);
}
}
/// test that getting a range not aligned with batch boundaries
/// still returns the correct result
#[test]
fn test_get_by_range() {
let mut arr = Arrangement::new();
// will form three batches: {2: [2, 1], 4: [4, 3], 6: [6, 5]}
// TODO(discord9): manually set batch
let updates: Vec<KeyValDiffRow> = vec![
((Row::new(vec![1i64.into()]), Row::empty()), 2, 1),
((Row::new(vec![1i64.into()]), Row::empty()), 1, 1),
((Row::new(vec![2i64.into()]), Row::empty()), 4, 1),
((Row::new(vec![3i64.into()]), Row::empty()), 3, 1),
((Row::new(vec![3i64.into()]), Row::empty()), 6, 1),
((Row::new(vec![1i64.into()]), Row::empty()), 5, 1),
];
arr.apply_updates(0, updates).unwrap();
assert_eq!(
arr.get_updates_in_range(2..=5),
vec![
((Row::new(vec![1i64.into()]), Row::empty()), 2, 1),
((Row::new(vec![2i64.into()]), Row::empty()), 4, 1),
((Row::new(vec![3i64.into()]), Row::empty()), 3, 1),
((Row::new(vec![1i64.into()]), Row::empty()), 5, 1),
]
);
}
/// test that a get over a range unaligned with the batch boundary
/// returns the correct result
#[test]
fn test_get_unaligned() {
let mut arr = Arrangement::new();
// will form three batches: {2: [2, 1], 4: [4, 3], 6: [6, 5]}
// TODO(discord9): manually set batch
let key = Row::new(vec![1i64.into()]);
let updates: Vec<KeyValDiffRow> = vec![
((key.clone(), Row::new(vec![1i64.into()])), 2, 1),
((key.clone(), Row::new(vec![2i64.into()])), 1, 1),
((key.clone(), Row::new(vec![3i64.into()])), 4, 1),
((key.clone(), Row::new(vec![4i64.into()])), 3, 1),
((key.clone(), Row::new(vec![5i64.into()])), 6, 1),
((key.clone(), Row::new(vec![6i64.into()])), 5, 1),
];
arr.apply_updates(0, updates).unwrap();
// aligned with batch boundary
assert_eq!(arr.get(2, &key), Some((Row::new(vec![1i64.into()]), 2, 1)));
// unaligned with batch boundary
assert_eq!(arr.get(3, &key), Some((Row::new(vec![4i64.into()]), 3, 1)));
}
/// test that out-of-order updates are sorted correctly
#[test]
fn test_out_of_order_apply_updates() {
let mut arr = Arrangement::new();
let key = Row::new(vec![1i64.into()]);
let updates: Vec<KeyValDiffRow> = vec![
((key.clone(), Row::new(vec![5i64.into()])), 6, 1),
((key.clone(), Row::new(vec![2i64.into()])), 2, -1),
((key.clone(), Row::new(vec![1i64.into()])), 2, 1),
((key.clone(), Row::new(vec![2i64.into()])), 1, 1),
((key.clone(), Row::new(vec![3i64.into()])), 4, 1),
((key.clone(), Row::new(vec![4i64.into()])), 3, 1),
((key.clone(), Row::new(vec![6i64.into()])), 5, 1),
];
arr.apply_updates(0, updates.clone()).unwrap();
let sorted = updates.iter().sorted_by_key(|r| r.1).cloned().collect_vec();
assert_eq!(arr.get_updates_in_range(1..7), sorted);
}
}
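Taken together, the tests above suggest the intended calling pattern: take any full or future-only clones of the handler before the arrangement is written to, then apply updates through one writer and read them back through the shared inner state. The following is a minimal sketch along those lines, reusing only the types and methods shown in this file; it is illustrative rather than a definitive usage of the flow crate.

// Sketch: clones must be taken while the arrangement is still unwritten,
// otherwise `clone_full_arrange` returns `None`.
let writer = ArrangeHandler::from(Arrangement::new());
let full_reader = writer.clone_full_arrange().expect("not written yet");
let future_reader = writer.clone_future_only().expect("not written yet");

// Apply updates through the writer ...
let updates: Vec<KeyValDiffRow> =
    vec![((Row::new(vec![1.into()]), Row::new(vec![2.into()])), 1, 1)];
writer.write().apply_updates(0, updates).unwrap();

// ... and read them back through any clone, since all handles share the same inner state.
assert_eq!(full_reader.write().get_updates_in_range(1..=1).len(), 1);
let _ = future_reader;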

View File

@@ -20,7 +20,6 @@ use common_meta::heartbeat::handler::{
};
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_telemetry::error;
use futures::future::Either;
#[derive(Clone)]
pub struct InvalidateTableCacheHandler {
@@ -32,8 +31,7 @@ impl HeartbeatResponseHandler for InvalidateTableCacheHandler {
fn is_acceptable(&self, ctx: &HeartbeatResponseHandlerContext) -> bool {
matches!(
ctx.incoming_message.as_ref(),
Some((_, Instruction::InvalidateTableIdCache { .. }))
| Some((_, Instruction::InvalidateTableNameCache { .. }))
Some((_, Instruction::InvalidateCaches(_)))
)
}
@@ -42,22 +40,11 @@ impl HeartbeatResponseHandler for InvalidateTableCacheHandler {
let cache_invalidator = self.cache_invalidator.clone();
let (meta, invalidator) = match ctx.incoming_message.take() {
Some((meta, Instruction::InvalidateTableIdCache(table_id))) => (
meta,
Either::Left(async move {
cache_invalidator
.invalidate_table_id(&Context::default(), table_id)
.await
}),
),
Some((meta, Instruction::InvalidateTableNameCache(table_name))) => (
meta,
Either::Right(async move {
cache_invalidator
.invalidate_table_name(&Context::default(), table_name)
.await
}),
),
Some((meta, Instruction::InvalidateCaches(caches))) => (meta, async move {
cache_invalidator
.invalidate(&Context::default(), caches)
.await
}),
_ => unreachable!("InvalidateTableCacheHandler: should be guarded by 'is_acceptable'"),
};

View File

@@ -22,7 +22,7 @@ use common_meta::heartbeat::handler::{
HandlerGroupExecutor, HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutor,
};
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MessageMeta};
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_meta::instruction::{CacheIdent, Instruction, InstructionReply, SimpleReply};
use common_meta::key::table_info::TableInfoKey;
use common_meta::key::TableMetaKey;
use partition::manager::TableRouteCacheInvalidator;
@@ -74,7 +74,7 @@ async fn test_invalidate_table_cache_handler() {
handle_instruction(
executor.clone(),
mailbox.clone(),
Instruction::InvalidateTableIdCache(table_id),
Instruction::InvalidateCaches(vec![CacheIdent::TableId(table_id)]),
)
.await;
@@ -90,7 +90,12 @@ async fn test_invalidate_table_cache_handler() {
.contains_key(&table_info_key.as_raw_key()));
// removes an invalid key
handle_instruction(executor, mailbox, Instruction::InvalidateTableIdCache(0)).await;
handle_instruction(
executor,
mailbox,
Instruction::InvalidateCaches(vec![CacheIdent::TableId(0)]),
)
.await;
let (_, reply) = rx.recv().await.unwrap();
assert_matches!(

View File

@@ -473,7 +473,8 @@ pub fn check_permission(
// These are executed by query engine, and will be checked there.
Statement::Query(_) | Statement::Explain(_) | Statement::Tql(_) | Statement::Delete(_) => {}
// database ops won't be checked
Statement::CreateDatabase(_) | Statement::ShowDatabases(_) => {}
Statement::CreateDatabase(_) | Statement::ShowDatabases(_) | Statement::DropDatabase(_) => {
}
// show create table and alter are not supported yet
Statement::ShowCreateTable(_) | Statement::CreateExternalTable(_) | Statement::Alter(_) => {
}

View File

@@ -14,6 +14,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::Output;
use common_error::ext::BoxedError;
use servers::error::{AuthSnafu, Error};
use servers::influxdb::InfluxdbRequest;
@@ -30,7 +31,7 @@ impl InfluxdbLineProtocolHandler for Instance {
&self,
request: InfluxdbRequest,
ctx: QueryContextRef,
) -> servers::error::Result<()> {
) -> servers::error::Result<Output> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
@@ -41,11 +42,9 @@ impl InfluxdbLineProtocolHandler for Instance {
interceptor_ref.pre_execute(&request.lines, ctx.clone())?;
let requests = request.try_into()?;
let _ = self
.handle_row_inserts(requests, ctx)
self.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(servers::error::ExecuteGrpcQuerySnafu)?;
Ok(())
.context(servers::error::ExecuteGrpcQuerySnafu)
}
}

View File

@@ -14,14 +14,11 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::Output;
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
use servers::otlp;
@@ -40,7 +37,7 @@ impl OpenTelemetryProtocolHandler for Instance {
&self,
request: ExportMetricsServiceRequest,
ctx: QueryContextRef,
) -> ServerResult<ExportMetricsServiceResponse> {
) -> ServerResult<Output> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
@@ -53,19 +50,12 @@ impl OpenTelemetryProtocolHandler for Instance {
interceptor_ref.pre_execute(ctx.clone())?;
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request)?;
let _ = self
.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
OTLP_METRICS_ROWS.inc_by(rows as u64);
let resp = ExportMetricsServiceResponse {
// TODO(sunng87): add support for partial_success in future patch
partial_success: None,
};
Ok(resp)
self.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)
}
#[tracing::instrument(skip_all)]
@@ -73,7 +63,7 @@ impl OpenTelemetryProtocolHandler for Instance {
&self,
request: ExportTraceServiceRequest,
ctx: QueryContextRef,
) -> ServerResult<ExportTraceServiceResponse> {
) -> ServerResult<Output> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
@@ -95,18 +85,11 @@ impl OpenTelemetryProtocolHandler for Instance {
let (requests, rows) = otlp::trace::to_grpc_insert_requests(table_name, spans)?;
let _ = self
.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
OTLP_TRACES_ROWS.inc_by(rows as u64);
let resp = ExportTraceServiceResponse {
// TODO(fys): add support for partial_success in future patch
partial_success: None,
};
Ok(resp)
self.handle_row_inserts(requests, ctx)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)
}
}

View File

@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use api::prom_store::remote::read_request::ResponseType;
use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, WriteRequest};
use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse};
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
@@ -30,6 +31,7 @@ use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prost::Message;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::http::header::{collect_plan_metrics, CONTENT_ENCODING_SNAPPY, CONTENT_TYPE_PROTOBUF};
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
use servers::interceptor::{PromStoreProtocolInterceptor, PromStoreProtocolInterceptorRef};
use servers::prom_store::{self, Metrics};
@@ -44,7 +46,6 @@ use crate::error::{
TableNotFoundSnafu,
};
use crate::instance::Instance;
use crate::metrics::PROM_STORE_REMOTE_WRITE_SAMPLES;
const SAMPLES_RESPONSE_TYPE: i32 = ResponseType::Samples as i32;
@@ -161,74 +162,34 @@ impl Instance {
#[async_trait]
impl PromStoreProtocolHandler for Instance {
async fn write(
&self,
request: WriteRequest,
ctx: QueryContextRef,
with_metric_engine: bool,
) -> ServerResult<()> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_write(&request, ctx.clone())?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(&request)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
.unwrap_or(GREPTIME_PHYSICAL_TABLE)
.to_string();
let _ = self
.handle_metric_row_inserts(requests, ctx.clone(), physical_table.to_string())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
} else {
let _ = self
.handle_row_inserts(requests, ctx.clone())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
}
PROM_STORE_REMOTE_WRITE_SAMPLES.inc_by(samples as u64);
Ok(())
}
async fn write_fast(
&self,
request: RowInsertRequests,
ctx: QueryContextRef,
with_metric_engine: bool,
) -> ServerResult<()> {
) -> ServerResult<Output> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
if with_metric_engine {
let output = if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
.unwrap_or(GREPTIME_PHYSICAL_TABLE)
.to_string();
let _ = self
.handle_metric_row_inserts(request, ctx.clone(), physical_table.to_string())
self.handle_metric_row_inserts(request, ctx.clone(), physical_table.to_string())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
.context(error::ExecuteGrpcQuerySnafu)?
} else {
let _ = self
.handle_row_inserts(request, ctx.clone())
self.handle_row_inserts(request, ctx.clone())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
}
Ok(())
.context(error::ExecuteGrpcQuerySnafu)?
};
Ok(output)
}
async fn read(
@@ -254,18 +215,29 @@ impl PromStoreProtocolHandler for Instance {
match response_type {
ResponseType::Samples => {
let mut query_results = Vec::with_capacity(results.len());
let mut map = HashMap::new();
for (table_name, output) in results {
let plan = output.meta.plan.clone();
query_results.push(to_query_result(&table_name, output).await?);
if let Some(ref plan) = plan {
collect_plan_metrics(plan.clone(), &mut [&mut map]);
}
}
let response = ReadResponse {
results: query_results,
};
let resp_metrics = map
.into_iter()
.map(|(k, v)| (k, v.into()))
.collect::<HashMap<_, _>>();
// TODO(dennis): may consume too much memory, add flow control
Ok(PromStoreResponse {
content_type: "application/x-protobuf".to_string(),
content_encoding: "snappy".to_string(),
content_type: CONTENT_TYPE_PROTOBUF.clone(),
content_encoding: CONTENT_ENCODING_SNAPPY.clone(),
resp_metrics,
body: prom_store::snappy_compress(&response.encode_to_vec())?,
})
}
@@ -306,31 +278,20 @@ impl ExportMetricHandler {
impl PromStoreProtocolHandler for ExportMetricHandler {
async fn write(
&self,
request: WriteRequest,
request: RowInsertRequests,
ctx: QueryContextRef,
_: bool,
) -> ServerResult<()> {
let (requests, _) = prom_store::to_grpc_row_insert_requests(&request)?;
) -> ServerResult<Output> {
self.inserter
.handle_metric_row_inserts(
requests,
request,
ctx,
&self.statement_executor,
GREPTIME_PHYSICAL_TABLE.to_string(),
)
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
Ok(())
}
async fn write_fast(
&self,
_request: RowInsertRequests,
_ctx: QueryContextRef,
_with_metric_engine: bool,
) -> ServerResult<()> {
unimplemented!()
.context(error::ExecuteGrpcQuerySnafu)
}
async fn read(

View File

@@ -18,7 +18,7 @@ use api::v1::region::{QueryRequest, RegionRequest, RegionResponse};
use async_trait::async_trait;
use client::region::check_response_header;
use common_error::ext::BoxedError;
use common_meta::datanode_manager::{AffectedRows, Datanode, DatanodeManager, DatanodeRef};
use common_meta::datanode_manager::{Datanode, DatanodeManager, DatanodeRef, HandleResponse};
use common_meta::error::{self as meta_error, Result as MetaResult};
use common_meta::peer::Peer;
use common_recordbatch::SendableRecordBatchStream;
@@ -63,7 +63,7 @@ impl RegionInvoker {
#[async_trait]
impl Datanode for RegionInvoker {
async fn handle(&self, request: RegionRequest) -> MetaResult<AffectedRows> {
async fn handle(&self, request: RegionRequest) -> MetaResult<HandleResponse> {
let span = request
.header
.as_ref()
@@ -76,10 +76,10 @@ impl Datanode for RegionInvoker {
.await
.map_err(BoxedError::new)
.context(meta_error::ExternalSnafu)?;
check_response_header(response.header)
check_response_header(&response.header)
.map_err(BoxedError::new)
.context(meta_error::ExternalSnafu)?;
Ok(response.affected_rows as _)
Ok(HandleResponse::from_region_response(response))
}
async fn handle_query(&self, request: QueryRequest) -> MetaResult<SendableRecordBatchStream> {

View File

@@ -41,13 +41,6 @@ lazy_static! {
.with_label_values(&["insert"]);
pub static ref EXECUTE_SCRIPT_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["execute"]);
/// The samples count of Prometheus remote write.
pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounter = register_int_counter!(
"greptime_frontend_prometheus_remote_write_samples",
"frontend prometheus remote write samples"
)
.unwrap();
pub static ref OTLP_METRICS_ROWS: IntCounter = register_int_counter!(
"greptime_frontend_otlp_metrics_rows",
"frontend otlp metrics rows"

View File

@@ -17,10 +17,8 @@ use async_trait::async_trait;
use common_error::ext::BoxedError;
use common_meta::cache_invalidator::{CacheInvalidator, Context};
use common_meta::error::{self as meta_error, Result as MetaResult};
use common_meta::instruction::Instruction;
use common_meta::table_name::TableName;
use common_meta::instruction::{CacheIdent, Instruction};
use snafu::ResultExt;
use table::metadata::TableId;
use crate::metasrv::MetasrvInfo;
use crate::service::mailbox::{BroadcastChannel, MailboxRef};
@@ -65,13 +63,8 @@ impl MetasrvCacheInvalidator {
#[async_trait]
impl CacheInvalidator for MetasrvCacheInvalidator {
async fn invalidate_table_id(&self, ctx: &Context, table_id: TableId) -> MetaResult<()> {
let instruction = Instruction::InvalidateTableIdCache(table_id);
self.broadcast(ctx, instruction).await
}
async fn invalidate_table_name(&self, ctx: &Context, table_name: TableName) -> MetaResult<()> {
let instruction = Instruction::InvalidateTableNameCache(table_name);
async fn invalidate(&self, ctx: &Context, caches: Vec<CacheIdent>) -> MetaResult<()> {
let instruction = Instruction::InvalidateCaches(caches);
self.broadcast(ctx, instruction).await
}
}
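Since the per-kind invalidation methods are collapsed into a single `invalidate` call, callers can now batch several cache identifiers into one broadcast. A hedged sketch of such a caller follows; only the `CacheIdent::TableId` variant appears in this diff, so no other variants are used here.

use common_meta::cache_invalidator::{CacheInvalidator, Context};
use common_meta::error::Result as MetaResult;
use common_meta::instruction::CacheIdent;
use table::metadata::TableId;

// Sketch: invalidate the caches of several tables with a single instruction.
async fn invalidate_table_ids(
    invalidator: &dyn CacheInvalidator,
    table_ids: Vec<TableId>,
) -> MetaResult<()> {
    let caches = table_ids.into_iter().map(CacheIdent::TableId).collect();
    invalidator.invalidate(&Context::default(), caches).await
}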

View File

@@ -301,6 +301,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to parse bool: {}", err_msg))]
ParseBool {
err_msg: String,
#[snafu(source)]
error: std::str::ParseBoolError,
location: Location,
},
#[snafu(display("Invalid arguments: {}", err_msg))]
InvalidArguments { err_msg: String, location: Location },
@@ -709,6 +717,7 @@ impl ErrorExt for Error {
| Error::InvalidStatKey { .. }
| Error::InvalidInactiveRegionKey { .. }
| Error::ParseNum { .. }
| Error::ParseBool { .. }
| Error::ParseAddr { .. }
| Error::ParseDuration { .. }
| Error::UnsupportedSelectorType { .. }

View File

@@ -107,6 +107,9 @@ impl HeartbeatHandler for RegionFailureHandler {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use common_meta::key::MAINTENANCE_KEY;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;
@@ -163,4 +166,37 @@ mod tests {
let dump = handler.failure_detect_runner.dump().await;
assert_eq!(dump.iter().collect::<Vec<_>>().len(), 0);
}
#[tokio::test(flavor = "multi_thread")]
async fn test_maintenance_mode() {
let region_failover_manager = create_region_failover_manager();
let kv_backend = region_failover_manager.create_context().kv_backend.clone();
let _handler = RegionFailureHandler::try_new(
None,
region_failover_manager.clone(),
PhiAccrualFailureDetectorOptions::default(),
)
.await
.unwrap();
let kv_req = common_meta::rpc::store::PutRequest {
key: Vec::from(MAINTENANCE_KEY),
value: vec![],
prev_kv: false,
};
let _ = kv_backend.put(kv_req.clone()).await.unwrap();
assert_matches!(
region_failover_manager.is_maintenance_mode().await,
Ok(true)
);
let _ = kv_backend
.delete(MAINTENANCE_KEY.as_bytes(), false)
.await
.unwrap();
assert_matches!(
region_failover_manager.is_maintenance_mode().await,
Ok(false)
);
}
}

View File

@@ -140,40 +140,59 @@ impl FailureDetectRunner {
let election = self.election.clone();
let region_failover_manager = self.region_failover_manager.clone();
let runner_handle = common_runtime::spawn_bg(async move {
async fn maybe_region_failover(
failure_detectors: &Arc<FailureDetectorContainer>,
region_failover_manager: &Arc<RegionFailoverManager>,
) {
match region_failover_manager.is_maintenance_mode().await {
Ok(false) => {}
Ok(true) => {
info!("Maintenance mode is enabled, skip failover");
return;
}
Err(err) => {
error!(err; "Failed to check maintenance mode");
return;
}
}
let failed_regions = failure_detectors
.iter()
.filter_map(|e| {
// Intentionally not place `current_time_millis()` out of the iteration.
// The failure detection determination should happen "just in time",
// i.e., failed or not has to be compared with the most recent "now".
// Besides, it might reduce the false positive of failure detection,
// because during the iteration, heartbeats are coming in as usual,
// and the `phi`s are still updating.
if !e.failure_detector().is_available(current_time_millis()) {
Some(e.region_ident().clone())
} else {
None
}
})
.collect::<Vec<RegionIdent>>();
for r in failed_regions {
if let Err(e) = region_failover_manager.do_region_failover(&r).await {
error!(e; "Failed to do region failover for {r}");
} else {
// Now that we know the region is starting to do failover, remove it
// from the failure detectors, avoiding the failover procedure to be
// triggered again.
// If the region is back alive (the failover procedure runs successfully),
// it will be added back to the failure detectors again.
failure_detectors.remove(&r);
}
}
}
loop {
let start = Instant::now();
let is_leader = election.as_ref().map(|x| x.is_leader()).unwrap_or(true);
if is_leader {
let failed_regions = failure_detectors
.iter()
.filter_map(|e| {
// Intentionally not place `current_time_millis()` out of the iteration.
// The failure detection determination should be happened "just in time",
// i.e., failed or not has to be compared with the most recent "now".
// Besides, it might reduce the false positive of failure detection,
// because during the iteration, heartbeats are coming in as usual,
// and the `phi`s are still updating.
if !e.failure_detector().is_available(current_time_millis()) {
Some(e.region_ident().clone())
} else {
None
}
})
.collect::<Vec<RegionIdent>>();
for r in failed_regions {
if let Err(e) = region_failover_manager.do_region_failover(&r).await {
error!(e; "Failed to do region failover for {r}");
} else {
// Now that we know the region is starting to do failover, remove it
// from the failure detectors, avoiding the failover procedure to be
// triggered again.
// If the region is back alive (the failover procedure runs successfully),
// it will be added back to the failure detectors again.
failure_detectors.remove(&r);
}
}
maybe_region_failover(&failure_detectors, &region_failover_manager).await;
}
let elapsed = Instant::now().duration_since(start);

View File

@@ -43,7 +43,7 @@ use tokio::sync::broadcast::error::RecvError;
use crate::cluster::MetaPeerClientRef;
use crate::election::{Election, LeaderChangeMessage};
use crate::error::{
self, InitMetadataSnafu, Result, StartProcedureManagerSnafu, StartTelemetryTaskSnafu,
InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu, StartTelemetryTaskSnafu,
StopProcedureManagerSnafu,
};
use crate::failure_detector::PhiAccrualFailureDetectorOptions;
@@ -357,7 +357,7 @@ impl MetaSrv {
self.leader_cached_kv_backend
.load()
.await
.context(error::KvBackendSnafu)?;
.context(KvBackendSnafu)?;
self.procedure_manager
.start()
.await

View File

@@ -260,6 +260,7 @@ impl MetaSrvBuilder {
let region_failover_manager = Arc::new(RegionFailoverManager::new(
distributed_time_constants::REGION_LEASE_SECS,
in_memory.clone(),
kv_backend.clone(),
mailbox.clone(),
procedure_manager.clone(),
(selector.clone(), selector_ctx.clone()),

View File

@@ -26,9 +26,10 @@ use std::time::Duration;
use async_trait::async_trait;
use common_meta::key::datanode_table::DatanodeTableKey;
use common_meta::key::TableMetadataManagerRef;
use common_meta::kv_backend::ResettableKvBackendRef;
use common_meta::lock_key::{RegionLock, TableLock};
use common_meta::key::{TableMetadataManagerRef, MAINTENANCE_KEY};
use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef};
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock};
use common_meta::table_name::TableName;
use common_meta::{ClusterId, RegionIdent};
use common_procedure::error::{
Error as ProcedureError, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu,
@@ -44,7 +45,9 @@ use snafu::ResultExt;
use store_api::storage::{RegionId, RegionNumber};
use table::metadata::TableId;
use crate::error::{RegisterProcedureLoaderSnafu, Result, TableMetadataManagerSnafu};
use crate::error::{
self, KvBackendSnafu, RegisterProcedureLoaderSnafu, Result, TableMetadataManagerSnafu,
};
use crate::lock::DistLockRef;
use crate::metasrv::{SelectorContext, SelectorRef};
use crate::service::mailbox::MailboxRef;
@@ -72,6 +75,7 @@ impl From<RegionIdent> for RegionFailoverKey {
pub(crate) struct RegionFailoverManager {
region_lease_secs: u64,
in_memory: ResettableKvBackendRef,
kv_backend: KvBackendRef,
mailbox: MailboxRef,
procedure_manager: ProcedureManagerRef,
selector: SelectorRef,
@@ -93,9 +97,11 @@ impl Drop for FailoverProcedureGuard {
}
impl RegionFailoverManager {
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
region_lease_secs: u64,
in_memory: ResettableKvBackendRef,
kv_backend: KvBackendRef,
mailbox: MailboxRef,
procedure_manager: ProcedureManagerRef,
(selector, selector_ctx): (SelectorRef, SelectorContext),
@@ -105,6 +111,7 @@ impl RegionFailoverManager {
Self {
region_lease_secs,
in_memory,
kv_backend,
mailbox,
procedure_manager,
selector,
@@ -119,6 +126,7 @@ impl RegionFailoverManager {
RegionFailoverContext {
region_lease_secs: self.region_lease_secs,
in_memory: self.in_memory.clone(),
kv_backend: self.kv_backend.clone(),
mailbox: self.mailbox.clone(),
selector: self.selector.clone(),
selector_ctx: self.selector_ctx.clone(),
@@ -158,13 +166,27 @@ impl RegionFailoverManager {
}
}
pub(crate) async fn is_maintenance_mode(&self) -> Result<bool> {
self.kv_backend
.exists(MAINTENANCE_KEY.as_bytes())
.await
.context(KvBackendSnafu)
}
pub(crate) async fn do_region_failover(&self, failed_region: &RegionIdent) -> Result<()> {
let Some(guard) = self.insert_running_procedures(failed_region) else {
warn!("Region failover procedure for region {failed_region} is already running!");
return Ok(());
};
if !self.table_exists(failed_region).await? {
let table_info = self
.table_metadata_manager
.table_info_manager()
.get(failed_region.table_id)
.await
.context(error::TableMetadataManagerSnafu)?;
if table_info.is_none() {
// The table could be dropped before the failure detector knows it. Then the region
// failover is not needed.
// Or the table could be renamed. But we will have a new region ident to detect failure.
@@ -178,7 +200,15 @@ impl RegionFailoverManager {
}
let context = self.create_context();
let procedure = RegionFailoverProcedure::new(failed_region.clone(), context);
// Safety: Check before.
let table_info = table_info.unwrap();
let TableName {
catalog_name,
schema_name,
..
} = table_info.table_name();
let procedure =
RegionFailoverProcedure::new(catalog_name, schema_name, failed_region.clone(), context);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
let procedure_id = procedure_with_id.id;
info!("Starting region failover procedure {procedure_id} for region {failed_region:?}");
@@ -206,16 +236,6 @@ impl RegionFailoverManager {
Ok(())
}
async fn table_exists(&self, failed_region: &RegionIdent) -> Result<bool> {
Ok(self
.table_metadata_manager
.table_route_manager()
.get_region_distribution(failed_region.table_id)
.await
.context(TableMetadataManagerSnafu)?
.is_some())
}
async fn failed_region_exists(&self, failed_region: &RegionIdent) -> Result<bool> {
let table_id = failed_region.table_id;
let datanode_id = failed_region.datanode_id;
@@ -238,10 +258,17 @@ impl RegionFailoverManager {
}
}
#[derive(Serialize, Deserialize, Debug)]
struct LockMeta {
catalog: String,
schema: String,
}
/// A "Node" in the state machine of region failover procedure.
/// Contains the current state and the data.
#[derive(Serialize, Deserialize, Debug)]
struct Node {
lock_meta: LockMeta,
failed_region: RegionIdent,
state: Box<dyn State>,
}
@@ -251,6 +278,7 @@ struct Node {
pub struct RegionFailoverContext {
pub region_lease_secs: u64,
pub in_memory: ResettableKvBackendRef,
pub kv_backend: KvBackendRef,
pub mailbox: MailboxRef,
pub selector: SelectorRef,
pub selector_ctx: SelectorContext,
@@ -330,9 +358,15 @@ pub struct RegionFailoverProcedure {
impl RegionFailoverProcedure {
const TYPE_NAME: &'static str = "metasrv-procedure::RegionFailover";
pub fn new(failed_region: RegionIdent, context: RegionFailoverContext) -> Self {
pub fn new(
catalog: String,
schema: String,
failed_region: RegionIdent,
context: RegionFailoverContext,
) -> Self {
let state = RegionFailoverStart::new();
let node = Node {
lock_meta: LockMeta { catalog, schema },
failed_region,
state: Box::new(state),
};
@@ -372,8 +406,9 @@ impl Procedure for RegionFailoverProcedure {
fn lock_key(&self) -> LockKey {
let region_ident = &self.node.failed_region;
// TODO(weny): acquires the catalog, schema read locks.
let lock_key = vec![
CatalogLock::Read(&self.node.lock_meta.catalog).into(),
SchemaLock::read(&self.node.lock_meta.catalog, &self.node.lock_meta.schema).into(),
TableLock::Read(region_ident.table_id).into(),
RegionLock::Write(RegionId::new(
region_ident.table_id,
@@ -549,6 +584,7 @@ mod tests {
context: RegionFailoverContext {
region_lease_secs: 10,
in_memory,
kv_backend,
mailbox,
selector,
selector_ctx,
@@ -568,6 +604,8 @@ mod tests {
let failed_region = env.failed_region(1).await;
let mut procedure = Box::new(RegionFailoverProcedure::new(
"greptime".into(),
"public".into(),
failed_region.clone(),
env.context.clone(),
)) as BoxedProcedure;
@@ -671,7 +709,7 @@ mod tests {
assert_eq!(
procedure.dump().unwrap(),
r#"{"failed_region":{"cluster_id":0,"datanode_id":1,"table_id":1,"region_number":1,"engine":"mito2"},"state":{"region_failover_state":"RegionFailoverEnd"}}"#
r#"{"lock_meta":{"catalog":"greptime","schema":"public"},"failed_region":{"cluster_id":0,"datanode_id":1,"table_id":1,"region_number":1,"engine":"mito2"},"state":{"region_failover_state":"RegionFailoverEnd"}}"#
);
// Verifies that the failed region (region 1) is moved from failed datanode (datanode 1) to the candidate datanode.
@@ -700,6 +738,10 @@ mod tests {
let state = RegionFailoverStart::new();
let node = Node {
lock_meta: LockMeta {
catalog: "greptime".into(),
schema: "public".into(),
},
failed_region,
state: Box::new(state),
};
@@ -711,12 +753,12 @@ mod tests {
let s = procedure.dump().unwrap();
assert_eq!(
s,
r#"{"failed_region":{"cluster_id":0,"datanode_id":1,"table_id":1,"region_number":1,"engine":"mito2"},"state":{"region_failover_state":"RegionFailoverStart","failover_candidate":null}}"#
r#"{"lock_meta":{"catalog":"greptime","schema":"public"},"failed_region":{"cluster_id":0,"datanode_id":1,"table_id":1,"region_number":1,"engine":"mito2"},"state":{"region_failover_state":"RegionFailoverStart","failover_candidate":null}}"#,
);
let n: Node = serde_json::from_str(&s).unwrap();
assert_eq!(
format!("{n:?}"),
r#"Node { failed_region: RegionIdent { cluster_id: 0, datanode_id: 1, table_id: 1, region_number: 1, engine: "mito2" }, state: RegionFailoverStart { failover_candidate: None } }"#
r#"Node { lock_meta: LockMeta { catalog: "greptime", schema: "public" }, failed_region: RegionIdent { cluster_id: 0, datanode_id: 1, table_id: 1, region_number: 1, engine: "mito2" }, state: RegionFailoverStart { failover_candidate: None } }"#,
);
}
@@ -765,6 +807,10 @@ mod tests {
let state = RegionFailoverStart::new();
let node = Node {
lock_meta: LockMeta {
catalog: "greptime".into(),
schema: "public".into(),
},
failed_region,
state: Box::new(state),
};

View File

@@ -14,7 +14,7 @@
use api::v1::meta::MailboxMessage;
use async_trait::async_trait;
use common_meta::instruction::Instruction;
use common_meta::instruction::{CacheIdent, Instruction};
use common_meta::RegionIdent;
use common_telemetry::info;
use serde::{Deserialize, Serialize};
@@ -35,7 +35,7 @@ impl InvalidateCache {
ctx: &RegionFailoverContext,
table_id: TableId,
) -> Result<()> {
let instruction = Instruction::InvalidateTableIdCache(table_id);
let instruction = Instruction::InvalidateCaches(vec![CacheIdent::TableId(table_id)]);
let msg = &MailboxMessage::json_message(
"Invalidate Table Cache",
@@ -133,7 +133,10 @@ mod tests {
assert_eq!(
received.payload,
Some(Payload::Json(
serde_json::to_string(&Instruction::InvalidateTableIdCache(table_id)).unwrap(),
serde_json::to_string(&Instruction::InvalidateCaches(vec![
CacheIdent::TableId(table_id)
]))
.unwrap(),
))
);
}

View File

@@ -13,8 +13,6 @@
// limitations under the License.
pub(crate) mod downgrade_leader_region;
// TODO(weny): remove it.
#[allow(dead_code)]
pub(crate) mod manager;
pub(crate) mod migration_abort;
pub(crate) mod migration_end;
@@ -31,12 +29,12 @@ use std::time::Duration;
use api::v1::meta::MailboxMessage;
use common_error::ext::BoxedError;
use common_meta::instruction::Instruction;
use common_meta::instruction::{CacheIdent, Instruction};
use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue};
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_route::TableRouteValue;
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::lock_key::{RegionLock, TableLock};
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock};
use common_meta::peer::Peer;
use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard};
use common_meta::ClusterId;
@@ -61,6 +59,10 @@ use crate::service::mailbox::{BroadcastChannel, MailboxRef};
/// **Notes: Stores with too large data in the context might incur replication overhead.**
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistentContext {
/// The table catalog.
catalog: String,
/// The table schema.
schema: String,
/// The Id of the cluster.
cluster_id: ClusterId,
/// The [Peer] of migration source.
@@ -81,8 +83,9 @@ fn default_replay_timeout() -> Duration {
impl PersistentContext {
pub fn lock_key(&self) -> Vec<StringKey> {
let region_id = self.region_id;
// TODO(weny): acquires the catalog, schema read locks.
let lock_key = vec![
CatalogLock::Read(&self.catalog).into(),
SchemaLock::read(&self.catalog, &self.schema).into(),
TableLock::Read(region_id.table_id()).into(),
RegionLock::Write(region_id).into(),
];
@@ -185,8 +188,6 @@ impl ContextFactory for DefaultContextFactory {
}
}
// TODO(weny): remove it.
#[allow(dead_code)]
/// The context of procedure execution.
pub struct Context {
persistent_ctx: PersistentContext,
@@ -320,7 +321,7 @@ impl Context {
/// Broadcasts the invalidate table cache message.
pub async fn invalidate_table_cache(&self) -> Result<()> {
let table_id = self.region_id().table_id();
let instruction = Instruction::InvalidateTableIdCache(table_id);
let instruction = Instruction::InvalidateCaches(vec![CacheIdent::TableId(table_id)]);
let msg = &MailboxMessage::json_message(
"Invalidate Table Cache",
@@ -368,7 +369,6 @@ pub struct RegionMigrationProcedure {
context: Context,
}
// TODO(weny): remove it.
#[allow(dead_code)]
impl RegionMigrationProcedure {
const TYPE_NAME: &'static str = "metasrv-procedure::RegionMigration";
@@ -487,8 +487,7 @@ mod tests {
let procedure = RegionMigrationProcedure::new(persistent_context, context);
let serialized = procedure.dump().unwrap();
let expected = r#"{"persistent_ctx":{"cluster_id":0,"from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105,"replay_timeout":"1s"},"state":{"region_migration_state":"RegionMigrationStart"}}"#;
let expected = r#"{"persistent_ctx":{"catalog":"greptime","schema":"public","cluster_id":0,"from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105,"replay_timeout":"1s"},"state":{"region_migration_state":"RegionMigrationStart"}}"#;
assert_eq!(expected, serialized);
}
@@ -496,7 +495,7 @@ mod tests {
fn test_backward_compatibility() {
let persistent_ctx = test_util::new_persistent_context(1, 2, RegionId::new(1024, 1));
// NOTES: Changes it will break backward compatibility.
let serialized = r#"{"cluster_id":0,"from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105}"#;
let serialized = r#"{"catalog":"greptime","schema":"public","cluster_id":0,"from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105}"#;
let deserialized: PersistentContext = serde_json::from_str(serialized).unwrap();
assert_eq!(persistent_ctx, deserialized);
@@ -583,7 +582,10 @@ mod tests {
let msg = resp.mailbox_message.unwrap();
let instruction = HeartbeatMailbox::json_instruction(&msg).unwrap();
assert_matches!(instruction, Instruction::InvalidateTableIdCache(1024));
assert_eq!(
instruction,
Instruction::InvalidateCaches(vec![CacheIdent::TableId(1024)])
);
}
fn procedure_flow_steps(from_peer_id: u64, to_peer_id: u64) -> Vec<Step> {

View File

@@ -226,6 +226,8 @@ mod tests {
fn new_persistent_context() -> PersistentContext {
PersistentContext {
catalog: "greptime".into(),
schema: "public".into(),
from_peer: Peer::empty(1),
to_peer: Peer::empty(2),
region_id: RegionId::new(1024, 1),

View File

@@ -18,9 +18,11 @@ use std::fmt::Display;
use std::sync::{Arc, RwLock};
use std::time::Duration;
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_route::TableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::RegionRoute;
use common_meta::table_name::TableName;
use common_meta::ClusterId;
use common_procedure::{watcher, ProcedureId, ProcedureManagerRef, ProcedureWithId};
use common_telemetry::{error, info};
@@ -93,26 +95,6 @@ impl Display for RegionMigrationProcedureTask {
}
}
impl From<RegionMigrationProcedureTask> for PersistentContext {
fn from(
RegionMigrationProcedureTask {
cluster_id,
region_id,
from_peer,
to_peer,
replay_timeout,
}: RegionMigrationProcedureTask,
) -> Self {
PersistentContext {
cluster_id,
from_peer,
to_peer,
region_id,
replay_timeout,
}
}
}
impl RegionMigrationManager {
/// Returns new [RegionMigrationManager]
pub(crate) fn new(
@@ -188,6 +170,22 @@ impl RegionMigrationManager {
Ok(table_route)
}
async fn retrieve_table_info(&self, region_id: RegionId) -> Result<TableInfoValue> {
let table_info = self
.context_factory
.table_metadata_manager
.table_info_manager()
.get(region_id.table_id())
.await
.context(error::TableMetadataManagerSnafu)?
.context(error::TableInfoNotFoundSnafu {
table_id: region_id.table_id(),
})?
.into_inner();
Ok(table_info)
}
/// Verifies the type of region migration table route.
fn verify_table_route(
&self,
@@ -279,8 +277,31 @@ impl RegionMigrationManager {
self.verify_region_leader_peer(&region_route, &task)?;
let procedure =
RegionMigrationProcedure::new(task.clone().into(), self.context_factory.clone());
let table_info = self.retrieve_table_info(region_id).await?;
let TableName {
catalog_name,
schema_name,
..
} = table_info.table_name();
let RegionMigrationProcedureTask {
cluster_id,
region_id,
from_peer,
to_peer,
replay_timeout,
} = task.clone();
let procedure = RegionMigrationProcedure::new(
PersistentContext {
catalog: catalog_name,
schema: schema_name,
cluster_id,
region_id,
from_peer,
to_peer,
replay_timeout,
},
self.context_factory.clone(),
);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
let procedure_id = procedure_with_id.id;
info!("Starting region migration procedure {procedure_id} for {task}");

View File

@@ -278,6 +278,8 @@ pub fn send_mock_reply(
/// Generates a [PersistentContext].
pub fn new_persistent_context(from: u64, to: u64, region_id: RegionId) -> PersistentContext {
PersistentContext {
catalog: "greptime".into(),
schema: "public".into(),
from_peer: Peer::empty(from),
to_peer: Peer::empty(to),
region_id,
@@ -297,16 +299,6 @@ pub(crate) struct ProcedureMigrationTestSuite {
pub(crate) type BeforeTest =
Arc<dyn Fn(&mut ProcedureMigrationTestSuite) -> BoxFuture<'_, ()> + Send + Sync>;
/// Custom assertion.
pub(crate) type CustomAssertion = Arc<
dyn Fn(
&mut ProcedureMigrationTestSuite,
Result<(Box<dyn State>, Status)>,
) -> BoxFuture<'_, Result<()>>
+ Send
+ Sync,
>;
/// State assertion function.
pub(crate) type StateAssertion = Arc<dyn Fn(&dyn State) + Send + Sync>;
@@ -316,14 +308,11 @@ pub(crate) type StatusAssertion = Arc<dyn Fn(Status) + Send + Sync>;
/// Error assertion function.
pub(crate) type ErrorAssertion = Arc<dyn Fn(Error) + Send + Sync>;
// TODO(weny): Remove it.
#[allow(dead_code)]
/// The type of assertion.
#[derive(Clone)]
pub(crate) enum Assertion {
Simple(StateAssertion, StatusAssertion),
Error(ErrorAssertion),
Custom(CustomAssertion),
}
impl Assertion {
@@ -384,9 +373,6 @@ impl ProcedureMigrationTestSuite {
let error = result.unwrap_err();
error_assert(error);
}
Assertion::Custom(assert_fn) => {
assert_fn(self, result).await?;
}
}
Ok(())

View File

@@ -232,6 +232,8 @@ mod tests {
fn new_persistent_context() -> PersistentContext {
PersistentContext {
catalog: "greptime".into(),
schema: "public".into(),
from_peer: Peer::empty(1),
to_peer: Peer::empty(2),
region_id: RegionId::new(1024, 1),

View File

@@ -30,6 +30,7 @@ use common_meta::datanode_manager::DatanodeManagerRef;
use common_meta::ddl::alter_table::AlterTableProcedure;
use common_meta::ddl::create_logical_tables::{CreateLogicalTablesProcedure, CreateTablesState};
use common_meta::ddl::create_table::*;
use common_meta::ddl::drop_table::executor::DropTableExecutor;
use common_meta::ddl::drop_table::DropTableProcedure;
use common_meta::ddl::test_util::create_table::build_raw_table_info_from_expr;
use common_meta::ddl::test_util::{TestColumnDefBuilder, TestCreateTableExprBuilder};
@@ -38,6 +39,7 @@ use common_meta::key::table_route::TableRouteValue;
use common_meta::key::DeserializedValueWithBytes;
use common_meta::rpc::ddl::{AlterTableTask, CreateTableTask, DropTableTask};
use common_meta::rpc::router::{find_leaders, RegionRoute};
use common_meta::table_name::TableName;
use common_procedure::Status;
use store_api::storage::RegionId;
@@ -322,7 +324,11 @@ async fn test_on_datanode_drop_regions() {
table_id: 42,
drop_if_exists: false,
};
let drop_table_executor = DropTableExecutor::new(
TableName::new("my_catalog", "my_schema", "my_table"),
42,
false,
);
let (region_server, mut rx) = EchoRegionServer::new();
let region_routes = test_data::new_region_routes();
let datanode_manager = new_datanode_manager(&region_server, &region_routes).await;
@@ -357,7 +363,10 @@ async fn test_on_datanode_drop_regions() {
}
});
let status = procedure.on_datanode_drop_regions().await.unwrap();
let status = procedure
.on_datanode_drop_regions(&drop_table_executor)
.await
.unwrap();
assert!(status.is_done());
handle.await.unwrap();

View File

@@ -93,6 +93,7 @@ pub mod mock {
}),
}),
affected_rows: 0,
extension: Default::default(),
})
}
}

View File

@@ -15,10 +15,9 @@
mod health;
mod heartbeat;
mod leader;
mod maintenance;
mod meta;
// TODO(weny): removes it.
mod node_lease;
#[allow(dead_code)]
mod region_migration;
mod route;
mod util;
@@ -99,6 +98,13 @@ pub fn make_admin_service(meta_srv: MetaSrv) -> Admin {
};
let router = router.route("/region-migration", handler);
let handler = maintenance::MaintenanceHandler {
kv_backend: meta_srv.kv_backend().clone(),
};
let router = router
.route("/maintenance", handler.clone())
.route("/maintenance/set", handler);
let router = Router::nest("/admin", router);
Admin::new(router)

View File

@@ -0,0 +1,103 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use common_meta::key::MAINTENANCE_KEY;
use common_meta::kv_backend::KvBackendRef;
use common_meta::rpc::store::PutRequest;
use snafu::{OptionExt, ResultExt};
use tonic::codegen::http;
use tonic::codegen::http::Response;
use crate::error::{
InvalidHttpBodySnafu, KvBackendSnafu, MissingRequiredParameterSnafu, ParseBoolSnafu,
};
use crate::service::admin::HttpHandler;
#[derive(Clone)]
pub struct MaintenanceHandler {
pub kv_backend: KvBackendRef,
}
impl MaintenanceHandler {
async fn get_maintenance(&self) -> crate::Result<Response<String>> {
let enabled = self
.kv_backend
.exists(MAINTENANCE_KEY.as_bytes())
.await
.context(KvBackendSnafu)?;
let response = if enabled {
"Maintenance mode is enabled"
} else {
"Maintenance mode is disabled"
};
http::Response::builder()
.status(http::StatusCode::OK)
.body(response.into())
.context(InvalidHttpBodySnafu)
}
async fn set_maintenance(
&self,
params: &HashMap<String, String>,
) -> crate::Result<Response<String>> {
let enable = params
.get("enable")
.map(|v| v.parse::<bool>())
.context(MissingRequiredParameterSnafu { param: "enable" })?
.context(ParseBoolSnafu {
err_msg: "'enable' must be 'true' or 'false'",
})?;
let response = if enable {
let req = PutRequest {
key: Vec::from(MAINTENANCE_KEY),
value: vec![],
prev_kv: false,
};
self.kv_backend
.put(req.clone())
.await
.context(KvBackendSnafu)?;
"Maintenance mode enabled"
} else {
self.kv_backend
.delete(MAINTENANCE_KEY.as_bytes(), false)
.await
.context(KvBackendSnafu)?;
"Maintenance mode disabled"
};
http::Response::builder()
.status(http::StatusCode::OK)
.body(response.into())
.context(InvalidHttpBodySnafu)
}
}
#[async_trait::async_trait]
impl HttpHandler for MaintenanceHandler {
async fn handle(
&self,
path: &str,
params: &HashMap<String, String>,
) -> crate::Result<Response<String>> {
if path.ends_with("/set") {
self.set_maintenance(params).await
} else {
self.get_maintenance().await
}
}
}
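As a rough end-to-end illustration, the handler can be driven directly against an in-memory backend, without going through the HTTP router. This is a sketch only: `MemoryKvBackend` and its import path are assumptions, and `MaintenanceHandler` plus the `HttpHandler` trait are taken to be in scope.

use std::collections::HashMap;
use std::sync::Arc;

use common_meta::kv_backend::memory::MemoryKvBackend; // assumed in-memory KvBackend

async fn demo_toggle_maintenance() {
    let handler = MaintenanceHandler {
        kv_backend: Arc::new(MemoryKvBackend::new()),
    };

    // Equivalent to GET /admin/maintenance/set?enable=true on the admin API.
    let params = HashMap::from([("enable".to_string(), "true".to_string())]);
    let resp = handler.handle("/maintenance/set", &params).await.unwrap();
    assert_eq!(resp.body(), "Maintenance mode enabled");

    // Equivalent to GET /admin/maintenance.
    let resp = handler.handle("/maintenance", &HashMap::new()).await.unwrap();
    assert_eq!(resp.body(), "Maintenance mode is enabled");
}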

View File

@@ -86,6 +86,7 @@ pub(crate) fn create_region_failover_manager() -> Arc<RegionFailoverManager> {
Arc::new(RegionFailoverManager::new(
10,
in_memory,
kv_backend.clone(),
mailbox,
procedure_manager,
(selector, selector_ctx),

View File

@@ -58,18 +58,19 @@ impl DataRegion {
/// Invoker don't need to set up or verify the column id. This method will adjust
/// it using underlying schema.
///
/// This method will also set the nullable marker to true.
/// This method will also set the nullable marker to true. All of those changes are applied
/// to `columns` in-place.
pub async fn add_columns(
&self,
region_id: RegionId,
columns: Vec<ColumnMetadata>,
columns: &mut [ColumnMetadata],
) -> Result<()> {
let region_id = utils::to_data_region_id(region_id);
let mut retries = 0;
// submit alter request
while retries < MAX_RETRIES {
let request = self.assemble_alter_request(region_id, &columns).await?;
let request = self.assemble_alter_request(region_id, columns).await?;
let _timer = MITO_DDL_DURATION.start_timer();
@@ -90,10 +91,12 @@ impl DataRegion {
Ok(())
}
/// Generate a wrapped [RegionAlterRequest] with the given [ColumnMetadata].
/// This method will modify `columns` in-place.
async fn assemble_alter_request(
&self,
region_id: RegionId,
columns: &[ColumnMetadata],
columns: &mut [ColumnMetadata],
) -> Result<RegionRequest> {
// retrieve underlying version
let region_metadata = self
@@ -118,15 +121,14 @@ impl DataRegion {
.unwrap_or(0);
// overwrite semantic type
let columns = columns
.iter()
let new_columns = columns
.iter_mut()
.enumerate()
.map(|(delta, c)| {
let mut c = c.clone();
if c.semantic_type == SemanticType::Tag {
if !c.column_schema.data_type.is_string() {
return ColumnTypeMismatchSnafu {
column_type: c.column_schema.data_type,
column_type: c.column_schema.data_type.clone(),
}
.fail();
}
@@ -138,11 +140,10 @@ impl DataRegion {
};
c.column_id = new_column_id_start + delta as u32;
c.column_schema = c.column_schema.with_nullable_set();
c.column_schema.set_nullable();
Ok(AddColumn {
column_metadata: c,
column_metadata: c.clone(),
location: None,
})
})
@@ -151,7 +152,9 @@ impl DataRegion {
// assemble alter request
let alter_request = RegionRequest::Alter(RegionAlterRequest {
schema_version: version,
kind: AlterKind::AddColumns { columns },
kind: AlterKind::AddColumns {
columns: new_columns,
},
});
Ok(alter_request)
@@ -167,6 +170,7 @@ impl DataRegion {
.handle_request(region_id, RegionRequest::Put(request))
.await
.context(MitoWriteOperationSnafu)
.map(|result| result.affected_rows)
}
pub async fn physical_columns(
@@ -205,7 +209,7 @@ mod test {
// TestEnv will create a logical region which changes the version to 1.
assert_eq!(current_version, 1);
let new_columns = vec![
let mut new_columns = vec![
ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Tag,
@@ -226,7 +230,7 @@ mod test {
},
];
env.data_region()
.add_columns(env.default_physical_region_id(), new_columns)
.add_columns(env.default_physical_region_id(), &mut new_columns)
.await
.unwrap();
@@ -258,14 +262,14 @@ mod test {
let env = TestEnv::new().await;
env.init_metric_region().await;
let new_columns = vec![ColumnMetadata {
let mut new_columns = vec![ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new("tag2", ConcreteDataType::int64_datatype(), false),
}];
let result = env
.data_region()
.add_columns(env.default_physical_region_id(), new_columns)
.add_columns(env.default_physical_region_id(), &mut new_columns)
.await;
assert!(result.is_err());
}

View File

@@ -24,6 +24,7 @@ mod region_metadata;
mod state;
use std::any::Any;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use async_trait::async_trait;
@@ -33,13 +34,13 @@ use common_recordbatch::SendableRecordBatchStream;
use mito2::engine::MitoEngine;
use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
use store_api::region_engine::{RegionEngine, RegionRole, SetReadonlyResponse};
use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::region_engine::{RegionEngine, RegionHandleResult, RegionRole, SetReadonlyResponse};
use store_api::region_request::RegionRequest;
use store_api::storage::{RegionId, ScanRequest};
use self::state::MetricEngineState;
use crate::data_region::DataRegion;
use crate::error::Result;
use crate::error::{Result, UnsupportedRegionRequestSnafu};
use crate::metadata_region::MetadataRegion;
use crate::utils;
@@ -121,23 +122,39 @@ impl RegionEngine for MetricEngine {
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<AffectedRows, BoxedError> {
) -> Result<RegionHandleResult, BoxedError> {
let mut extension_return_value = HashMap::new();
let result = match request {
RegionRequest::Put(put) => self.inner.put_region(region_id, put).await,
RegionRequest::Delete(_) => todo!(),
RegionRequest::Create(create) => self.inner.create_region(region_id, create).await,
RegionRequest::Create(create) => {
self.inner
.create_region(region_id, create, &mut extension_return_value)
.await
}
RegionRequest::Drop(drop) => self.inner.drop_region(region_id, drop).await,
RegionRequest::Open(open) => self.inner.open_region(region_id, open).await,
RegionRequest::Close(close) => self.inner.close_region(region_id, close).await,
RegionRequest::Alter(alter) => self.inner.alter_region(region_id, alter).await,
RegionRequest::Flush(_) => todo!(),
RegionRequest::Compact(_) => todo!(),
RegionRequest::Truncate(_) => todo!(),
RegionRequest::Alter(alter) => {
self.inner
.alter_region(region_id, alter, &mut extension_return_value)
.await
}
RegionRequest::Delete(_)
| RegionRequest::Flush(_)
| RegionRequest::Compact(_)
| RegionRequest::Truncate(_) => UnsupportedRegionRequestSnafu { request }.fail(),
// It always returns Ok(0); all data is the latest.
RegionRequest::Catchup(_) => Ok(0),
};
result.map_err(BoxedError::new)
// TODO: pass extension
result
.map_err(BoxedError::new)
.map(|rows| RegionHandleResult {
affected_rows: rows,
extension: extension_return_value,
})
}
/// Handles substrait query and return a stream of record batches

View File

@@ -12,13 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use common_telemetry::{error, info};
use snafu::OptionExt;
use snafu::{OptionExt, ResultExt};
use store_api::metadata::ColumnMetadata;
use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY;
use store_api::region_request::{AffectedRows, AlterKind, RegionAlterRequest};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{ForbiddenPhysicalAlterSnafu, LogicalRegionNotFoundSnafu, Result};
use crate::error::{
ForbiddenPhysicalAlterSnafu, LogicalRegionNotFoundSnafu, Result, SerializeColumnMetadataSnafu,
};
use crate::metrics::FORBIDDEN_OPERATION_COUNT;
use crate::utils::{to_data_region_id, to_metadata_region_id};
@@ -28,23 +34,39 @@ impl MetricEngineInner {
&self,
region_id: RegionId,
request: RegionAlterRequest,
extension_return_value: &mut HashMap<String, Vec<u8>>,
) -> Result<AffectedRows> {
let is_altering_physical_region = self.is_physical_region(region_id);
let result = if is_altering_physical_region {
self.alter_physical_region(region_id, request).await
} else {
self.alter_logical_region(region_id, request).await
let physical_region_id = self.alter_logical_region(region_id, request).await?;
// Add the physical table's columns to the extension map.
// It's ok to overwrite an existing key, as the schema that arrives later is more up-to-date
let physical_columns = self
.data_region
.physical_columns(physical_region_id)
.await?;
extension_return_value.insert(
ALTER_PHYSICAL_EXTENSION_KEY.to_string(),
ColumnMetadata::encode_list(&physical_columns)
.context(SerializeColumnMetadataSnafu)?,
);
Ok(())
};
result.map(|_| 0)
}
/// Return the physical region id behind this logical region
async fn alter_logical_region(
&self,
region_id: RegionId,
request: RegionAlterRequest,
) -> Result<()> {
) -> Result<RegionId> {
let physical_region_id = {
let state = &self.state.read().unwrap();
state.get_physical_region_id(region_id).with_context(|| {
@@ -55,7 +77,7 @@ impl MetricEngineInner {
// only handle adding column
let AlterKind::AddColumns { columns } = request.kind else {
return Ok(());
return Ok(physical_region_id);
};
let metadata_region_id = to_metadata_region_id(physical_region_id);
@@ -92,7 +114,7 @@ impl MetricEngineInner {
.await?;
}
Ok(())
Ok(physical_region_id)
}
async fn alter_physical_region(
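For reference, a consumer of the extension map returned above might recover the physical columns roughly as in the sketch below. `decode_list` is assumed here as the counterpart of the `encode_list` call used in this diff and may have a different name or signature in the actual codebase.

use std::collections::HashMap;

use store_api::metadata::ColumnMetadata;
use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY;

// Sketch: pull the physical table's column metadata back out of the extension
// map carried by the region server's response.
fn physical_columns_from_extension(
    extensions: &HashMap<String, Vec<u8>>,
) -> Option<Vec<ColumnMetadata>> {
    extensions
        .get(ALTER_PHYSICAL_EXTENSION_KEY)
        // `decode_list` is an assumed inverse of `encode_list`.
        .and_then(|bytes| ColumnMetadata::decode_list(bytes).ok())
}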

View File

@@ -15,6 +15,7 @@
use std::collections::{HashMap, HashSet};
use api::v1::SemanticType;
use common_error::ext::BoxedError;
use common_telemetry::info;
use common_time::Timestamp;
use datatypes::data_type::ConcreteDataType;
@@ -25,22 +26,26 @@ use object_store::util::join_dir;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::metadata::ColumnMetadata;
use store_api::metric_engine_consts::{
DATA_REGION_SUBDIR, DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME,
LOGICAL_TABLE_METADATA_KEY, METADATA_REGION_SUBDIR, METADATA_SCHEMA_KEY_COLUMN_INDEX,
METADATA_SCHEMA_KEY_COLUMN_NAME, METADATA_SCHEMA_TIMESTAMP_COLUMN_INDEX,
METADATA_SCHEMA_TIMESTAMP_COLUMN_NAME, METADATA_SCHEMA_VALUE_COLUMN_INDEX,
METADATA_SCHEMA_VALUE_COLUMN_NAME, PHYSICAL_TABLE_METADATA_KEY,
ALTER_PHYSICAL_EXTENSION_KEY, DATA_REGION_SUBDIR, DATA_SCHEMA_TABLE_ID_COLUMN_NAME,
DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY, METADATA_REGION_SUBDIR,
METADATA_SCHEMA_KEY_COLUMN_INDEX, METADATA_SCHEMA_KEY_COLUMN_NAME,
METADATA_SCHEMA_TIMESTAMP_COLUMN_INDEX, METADATA_SCHEMA_TIMESTAMP_COLUMN_NAME,
METADATA_SCHEMA_VALUE_COLUMN_INDEX, METADATA_SCHEMA_VALUE_COLUMN_NAME,
PHYSICAL_TABLE_METADATA_KEY,
};
use store_api::region_engine::RegionEngine;
use store_api::region_request::{AffectedRows, RegionCreateRequest, RegionRequest};
use store_api::storage::consts::ReservedColumnId;
use store_api::storage::RegionId;
use crate::engine::options::set_index_options_for_data_region;
use crate::engine::options::{
set_index_options_for_data_region, set_memtable_options_for_data_region,
};
use crate::engine::MetricEngineInner;
use crate::error::{
ConflictRegionOptionSnafu, CreateMitoRegionSnafu, InternalColumnOccupiedSnafu,
MissingRegionOptionSnafu, ParseRegionIdSnafu, PhysicalRegionNotFoundSnafu, Result,
ColumnNotFoundSnafu, ConflictRegionOptionSnafu, CreateMitoRegionSnafu,
InternalColumnOccupiedSnafu, MissingRegionOptionSnafu, MitoReadOperationSnafu,
ParseRegionIdSnafu, PhysicalRegionNotFoundSnafu, Result, SerializeColumnMetadataSnafu,
};
use crate::metrics::{LOGICAL_REGION_COUNT, PHYSICAL_COLUMN_COUNT, PHYSICAL_REGION_COUNT};
use crate::utils::{to_data_region_id, to_metadata_region_id};
@@ -51,13 +56,28 @@ impl MetricEngineInner {
&self,
region_id: RegionId,
request: RegionCreateRequest,
extension_return_value: &mut HashMap<String, Vec<u8>>,
) -> Result<AffectedRows> {
Self::verify_region_create_request(&request)?;
let result = if request.options.contains_key(PHYSICAL_TABLE_METADATA_KEY) {
self.create_physical_region(region_id, request).await
} else if request.options.contains_key(LOGICAL_TABLE_METADATA_KEY) {
self.create_logical_region(region_id, request).await
let physical_region_id = self.create_logical_region(region_id, request).await?;
// Add the physical table's columns to the extension map.
// It's ok to overwrite the existing key, as the later-arriving schema is more up-to-date
let physical_columns = self
.data_region
.physical_columns(physical_region_id)
.await?;
extension_return_value.insert(
ALTER_PHYSICAL_EXTENSION_KEY.to_string(),
ColumnMetadata::encode_list(&physical_columns)
.context(SerializeColumnMetadataSnafu)?,
);
Ok(())
} else {
MissingRegionOptionSnafu {}.fail()
};
@@ -124,11 +144,16 @@ impl MetricEngineInner {
/// This method will alter the data region to add columns if necessary.
///
/// If the logical region to create already exists, this method will do nothing.
///
/// `alter_request` is a hashmap that stores the alter requests that were executed
/// on the physical region.
///
/// Return the physical region id of this logical region
async fn create_logical_region(
&self,
logical_region_id: RegionId,
request: RegionCreateRequest,
) -> Result<()> {
) -> Result<RegionId> {
// transform IDs
let physical_region_id_raw = request
.options
@@ -149,11 +174,12 @@ impl MetricEngineInner {
.await?
{
info!("Create a existing logical region {logical_region_id}. Skipped");
return Ok(());
return Ok(data_region_id);
}
// find new columns to add
let mut new_columns = vec![];
let mut existing_columns = vec![];
{
let state = &self.state.read().unwrap();
let physical_columns =
@@ -166,6 +192,8 @@ impl MetricEngineInner {
for col in &request.column_metadatas {
if !physical_columns.contains(&col.column_schema.name) {
new_columns.push(col.clone());
} else {
existing_columns.push(col.column_schema.name.clone());
}
}
}
@@ -186,9 +214,28 @@ impl MetricEngineInner {
self.metadata_region
.add_logical_region(metadata_region_id, logical_region_id)
.await?;
for col in &request.column_metadatas {
// Register existing physical columns with this new logical region.
let physical_schema = self
.data_region
.physical_columns(data_region_id)
.await
.map_err(BoxedError::new)
.context(MitoReadOperationSnafu)?;
let physical_schema_map = physical_schema
.into_iter()
.map(|metadata| (metadata.column_schema.name.clone(), metadata))
.collect::<HashMap<_, _>>();
for col in &existing_columns {
let column_metadata = physical_schema_map
.get(col)
.with_context(|| ColumnNotFoundSnafu {
name: col,
region_id: physical_region_id,
})?
.clone();
self.metadata_region
.add_column(metadata_region_id, logical_region_id, col)
.add_column(metadata_region_id, logical_region_id, &column_metadata)
.await?;
}
@@ -201,19 +248,21 @@ impl MetricEngineInner {
info!("Created new logical region {logical_region_id} on physical region {data_region_id}");
LOGICAL_REGION_COUNT.inc();
Ok(())
Ok(data_region_id)
}
/// Execute corresponding alter requests on the mito region. Newly added columns' [ColumnMetadata] will be
/// cloned into `added_columns`.
pub(crate) async fn add_columns_to_physical_data_region(
&self,
data_region_id: RegionId,
metadata_region_id: RegionId,
logical_region_id: RegionId,
new_columns: Vec<ColumnMetadata>,
mut new_columns: Vec<ColumnMetadata>,
) -> Result<()> {
// alter data region
self.data_region
.add_columns(data_region_id, new_columns.clone())
.add_columns(data_region_id, &mut new_columns)
.await?;
// register columns to metadata region
@@ -360,13 +409,13 @@ impl MetricEngineInner {
// concat region dir
data_region_request.region_dir = join_dir(&request.region_dir, DATA_REGION_SUBDIR);
// convert semantic type
// change nullability for tag columns
data_region_request
.column_metadatas
.iter_mut()
.for_each(|metadata| {
if metadata.semantic_type == SemanticType::Tag {
metadata.semantic_type = SemanticType::Field;
metadata.column_schema.set_nullable();
}
});
@@ -380,6 +429,9 @@ impl MetricEngineInner {
// set index options
set_index_options_for_data_region(&mut data_region_request.options);
// Set memtable options.
set_memtable_options_for_data_region(&mut data_region_request.options);
data_region_request
}
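One behavioral change above is easy to miss: tag columns of the physical data region are no longer downgraded to fields; they keep SemanticType::Tag and are merely made nullable. A small self-contained before/after sketch follows, where the enum and struct are simplified stand-ins for the real api/datatypes types.

#[derive(Clone, Copy, PartialEq, Debug)]
enum SemanticType { Tag, Field }

#[derive(Debug)]
struct Column { semantic_type: SemanticType, nullable: bool }

// Old conversion: rewrite tag columns into field columns.
fn old_adjust(cols: &mut [Column]) {
    for c in cols.iter_mut() {
        if c.semantic_type == SemanticType::Tag {
            c.semantic_type = SemanticType::Field;
        }
    }
}

// New conversion: keep the Tag semantic type and only relax nullability.
fn new_adjust(cols: &mut [Column]) {
    for c in cols.iter_mut() {
        if c.semantic_type == SemanticType::Tag {
            c.nullable = true;
        }
    }
}

fn main() {
    let mut before = [Column { semantic_type: SemanticType::Tag, nullable: false }];
    old_adjust(&mut before);
    assert_eq!(before[0].semantic_type, SemanticType::Field);

    let mut after = [Column { semantic_type: SemanticType::Tag, nullable: false }];
    new_adjust(&mut after);
    assert_eq!(after[0].semantic_type, SemanticType::Tag);
    assert!(after[0].nullable);
}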


@@ -42,3 +42,8 @@ pub fn set_index_options_for_data_region(options: &mut HashMap<String, String>)
SEG_ROW_COUNT_FOR_DATA_REGION.to_string(),
);
}
/// Set memtable options for the data region.
pub fn set_memtable_options_for_data_region(options: &mut HashMap<String, String>) {
options.insert("memtable.type".to_string(), "partition_tree".to_string());
}
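The helper is small enough to exercise on its own; here is a runnable sketch in which the function body is copied from the diff above while the surrounding engine wiring is omitted.

use std::collections::HashMap;

// Pin the metric engine's data region to the partition tree memtable.
fn set_memtable_options_for_data_region(options: &mut HashMap<String, String>) {
    options.insert("memtable.type".to_string(), "partition_tree".to_string());
}

fn main() {
    let mut options = HashMap::new();
    set_memtable_options_for_data_region(&mut options);
    // The engine applies this while assembling the data region's create request,
    // right after set_index_options_for_data_region.
    assert_eq!(options["memtable.type"], "partition_tree");
}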


@@ -215,12 +215,12 @@ mod tests {
// write data
let logical_region_id = env.default_logical_region_id();
let count = env
let result = env
.metric()
.handle_request(logical_region_id, request)
.await
.unwrap();
assert_eq!(count, 5);
assert_eq!(result.affected_rows, 5);
// read data from physical region
let physical_region_id = env.default_physical_region_id();
@@ -287,11 +287,11 @@ mod tests {
});
// write data
let count = engine
let result = engine
.handle_request(logical_region_id, request)
.await
.unwrap();
assert_eq!(100, count);
assert_eq!(100, result.affected_rows);
}
#[tokio::test]


@@ -143,6 +143,7 @@ impl MetricEngineInner {
self.default_projection(physical_region_id, logical_region_id)
.await?
};
request.projection = Some(physical_projection);
// add table filter
@@ -186,6 +187,7 @@ impl MetricEngineInner {
.get_metadata(data_region_id)
.await
.context(MitoReadOperationSnafu)?;
for name in projected_logical_names {
// Safety: logical columns are a strict subset of physical columns
physical_projection.push(physical_metadata.column_index_by_name(&name).unwrap());
@@ -301,7 +303,7 @@ mod test {
.await
.unwrap();
assert_eq!(scan_req.projection.unwrap(), vec![0, 1, 4, 8, 9, 10, 11]);
assert_eq!(scan_req.projection.unwrap(), vec![11, 10, 9, 8, 0, 1, 4]);
assert_eq!(scan_req.filters.len(), 1);
assert_eq!(
scan_req.filters[0],
@@ -318,6 +320,6 @@ mod test {
.transform_request(physical_region_id, logical_region_id, scan_req)
.await
.unwrap();
assert_eq!(scan_req.projection.unwrap(), vec![0, 1, 4, 8, 9, 10, 11]);
assert_eq!(scan_req.projection.unwrap(), vec![11, 10, 9, 8, 0, 1, 4]);
}
}


@@ -39,7 +39,8 @@ impl MetricEngineInner {
.collect::<Vec<_>>();
// sort columns on column id to ensure the order
logical_column_metadata.sort_unstable_by_key(|col| col.column_id);
logical_column_metadata
.sort_unstable_by(|c1, c2| c1.column_schema.name.cmp(&c2.column_schema.name));
Ok(logical_column_metadata)
}
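A tiny self-contained illustration of the ordering change: logical columns are now sorted by column name rather than by column id, so the result no longer depends on how ids were assigned (the tuples below stand in for ColumnMetadata entries; this is presumably also why the expected projection order changes in the read-path test above).

fn main() {
    // (column name, column id) pairs standing in for ColumnMetadata entries.
    let mut columns = vec![("value", 11u32), ("host", 10), ("ts", 0)];

    // Old behavior: order follows the column id.
    columns.sort_unstable_by_key(|c| c.1);
    assert_eq!(columns.iter().map(|c| c.0).collect::<Vec<_>>(), ["ts", "host", "value"]);

    // New behavior: order follows the column name.
    columns.sort_unstable_by(|a, b| a.0.cmp(b.0));
    assert_eq!(columns.iter().map(|c| c.0).collect::<Vec<_>>(), ["host", "ts", "value"]);
}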


@@ -19,6 +19,7 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use datatypes::prelude::ConcreteDataType;
use snafu::{Location, Snafu};
use store_api::region_request::RegionRequest;
use store_api::storage::RegionId;
#[derive(Snafu)]
@@ -71,6 +72,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to serialize column metadata"))]
SerializeColumnMetadata {
#[snafu(source)]
error: serde_json::Error,
location: Location,
},
#[snafu(display("Failed to decode base64 column value"))]
DecodeColumnValue {
#[snafu(source)]
@@ -155,6 +163,12 @@ pub enum Error {
region_id: RegionId,
location: Location,
},
#[snafu(display("Unsupported region request: {}", request))]
UnsupportedRegionRequest {
request: RegionRequest,
location: Location,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -170,11 +184,14 @@ impl ErrorExt for Error {
| ColumnTypeMismatch { .. }
| PhysicalRegionBusy { .. } => StatusCode::InvalidArguments,
ForbiddenPhysicalAlter { .. } => StatusCode::Unsupported,
ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => {
StatusCode::Unsupported
}
MissingInternalColumn { .. }
| DeserializeSemanticType { .. }
| DeserializeColumnMetadata { .. }
| SerializeColumnMetadata { .. }
| DecodeColumnValue { .. }
| ParseRegionId { .. }
| InvalidMetadata { .. } => StatusCode::Unexpected,


@@ -167,7 +167,7 @@ impl MetadataRegion {
// TODO(ruihang): avoid using `get_all`
/// Get all the columns of a given logical region.
/// Return a list of (column_name, semantic_type).
/// Return a list of (column_name, column_metadata).
pub async fn logical_columns(
&self,
physical_region_id: RegionId,


@@ -56,8 +56,9 @@ pin-project.workspace = true
prometheus.workspace = true
prost.workspace = true
puffin.workspace = true
rand.workspace = true
regex = "1.5"
serde = { version = "1.0", features = ["derive"] }
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
smallvec.workspace = true
@@ -75,7 +76,6 @@ common-procedure-test.workspace = true
common-test-util.workspace = true
criterion = "0.4"
log-store.workspace = true
rand.workspace = true
toml.workspace = true
[[bench]]


@@ -21,7 +21,7 @@ use datafusion_common::Column;
use datafusion_expr::{lit, Expr};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{KeyValues, Memtable};
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
@@ -41,9 +41,9 @@ fn write_rows(c: &mut Criterion) {
// Note that this test only generates one time series.
let mut group = c.benchmark_group("write");
group.bench_function("merge_tree", |b| {
group.bench_function("partition_tree", |b| {
let memtable =
MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
PartitionTreeMemtable::new(1, metadata.clone(), None, &PartitionTreeConfig::default());
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
@@ -51,7 +51,7 @@ fn write_rows(c: &mut Criterion) {
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true);
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
@@ -63,14 +63,14 @@ fn write_rows(c: &mut Criterion) {
/// Scans all rows.
fn full_scan(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let config = PartitionTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("full_scan");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
group.bench_function("partition_tree", |b| {
let memtable = PartitionTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
@@ -83,7 +83,7 @@ fn full_scan(c: &mut Criterion) {
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
@@ -100,14 +100,14 @@ fn full_scan(c: &mut Criterion) {
/// Filters 1 host.
fn filter_1_host(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let config = PartitionTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("filter_1_host");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
group.bench_function("partition_tree", |b| {
let memtable = PartitionTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
@@ -121,7 +121,7 @@ fn filter_1_host(c: &mut Criterion) {
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}


@@ -328,14 +328,14 @@ mod tests {
fn test_deserialize_config() {
let s = r#"
[memtable]
type = "experimental"
type = "partition_tree"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MitoConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(config) = &config.memtable else {
let MemtableConfig::PartitionTree(config) = &config.memtable else {
unreachable!()
};
assert_eq!(1024, config.data_freeze_threshold);


@@ -57,7 +57,7 @@ use object_store::manager::ObjectStoreManagerRef;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::logstore::LogStore;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{RegionEngine, RegionRole, SetReadonlyResponse};
use store_api::region_engine::{RegionEngine, RegionHandleResult, RegionRole, SetReadonlyResponse};
use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::storage::{RegionId, ScanRequest};
use tokio::sync::oneshot;
@@ -290,10 +290,11 @@ impl RegionEngine for MitoEngine {
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<AffectedRows, BoxedError> {
) -> Result<RegionHandleResult, BoxedError> {
self.inner
.handle_request(region_id, request)
.await
.map(RegionHandleResult::new)
.map_err(BoxedError::new)
}
@@ -373,6 +374,7 @@ impl MitoEngine {
object_store_manager: ObjectStoreManagerRef,
write_buffer_manager: Option<crate::flush::WriteBufferManagerRef>,
listener: Option<crate::engine::listener::EventListenerRef>,
time_provider: crate::time_provider::TimeProviderRef,
) -> Result<MitoEngine> {
config.sanitize(data_home)?;
@@ -385,6 +387,7 @@ impl MitoEngine {
object_store_manager,
write_buffer_manager,
listener,
time_provider,
)
.await?,
config,


@@ -111,7 +111,7 @@ async fn test_region_replay() {
let engine = env.reopen_engine(engine, MitoConfig::default()).await;
let rows = engine
let result = engine
.handle_request(
region_id,
RegionRequest::Open(RegionOpenRequest {
@@ -123,7 +123,7 @@ async fn test_region_replay() {
)
.await
.unwrap();
assert_eq!(0, rows);
assert_eq!(0, result.affected_rows);
let request = ScanRequest::default();
let stream = engine.handle_query(region_id, request).await.unwrap();


@@ -42,7 +42,7 @@ async fn put_and_flush(
};
put_rows(engine, region_id, rows).await;
let rows = engine
let result = engine
.handle_request(
region_id,
RegionRequest::Flush(RegionFlushRequest {
@@ -51,7 +51,7 @@ async fn put_and_flush(
)
.await
.unwrap();
assert_eq!(0, rows);
assert_eq!(0, result.affected_rows);
}
async fn delete_and_flush(
@@ -66,16 +66,16 @@ async fn delete_and_flush(
rows: build_rows_for_key("a", rows.start, rows.end, 0),
};
let rows_affected = engine
let result = engine
.handle_request(
region_id,
RegionRequest::Delete(RegionDeleteRequest { rows }),
)
.await
.unwrap();
assert_eq!(row_cnt, rows_affected);
assert_eq!(row_cnt, result.affected_rows);
let rows = engine
let result = engine
.handle_request(
region_id,
RegionRequest::Flush(RegionFlushRequest {
@@ -84,7 +84,7 @@ async fn delete_and_flush(
)
.await
.unwrap();
assert_eq!(0, rows);
assert_eq!(0, result.affected_rows);
}
async fn collect_stream_ts(stream: SendableRecordBatchStream) -> Vec<i64> {
@@ -127,11 +127,11 @@ async fn test_compaction_region() {
delete_and_flush(&engine, region_id, &column_schemas, 15..30).await;
put_and_flush(&engine, region_id, &column_schemas, 15..25).await;
let output = engine
let result = engine
.handle_request(region_id, RegionRequest::Compact(RegionCompactRequest {}))
.await
.unwrap();
assert_eq!(output, 0);
assert_eq!(result.affected_rows, 0);
let scanner = engine.scanner(region_id, ScanRequest::default()).unwrap();
assert_eq!(


@@ -14,12 +14,15 @@
use std::time::Duration;
use api::v1::Rows;
use common_recordbatch::RecordBatches;
use store_api::region_engine::RegionEngine;
use store_api::region_request::{RegionCloseRequest, RegionRequest};
use store_api::storage::RegionId;
use store_api::storage::{RegionId, ScanRequest};
use crate::config::MitoConfig;
use crate::test_util::{CreateRequestBuilder, TestEnv};
use crate::region::options::MemtableOptions;
use crate::test_util::{build_rows, put_rows, rows_schema, CreateRequestBuilder, TestEnv};
#[tokio::test]
async fn test_engine_create_new_region() {
@@ -198,3 +201,45 @@ async fn test_engine_create_with_custom_store() {
.await
.unwrap());
}
#[tokio::test]
async fn test_engine_create_with_memtable_opts() {
let mut env = TestEnv::new();
let engine = env.create_engine(MitoConfig::default()).await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new()
.insert_option("memtable.type", "partition_tree")
.insert_option("memtable.partition_tree.index_max_keys_per_shard", "2")
.build();
let column_schemas = rows_schema(&request);
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
let region = engine.get_region(region_id).unwrap();
let Some(MemtableOptions::PartitionTree(memtable_opts)) = &region.version().options.memtable
else {
unreachable!();
};
assert_eq!(2, memtable_opts.index_max_keys_per_shard);
let rows = Rows {
schema: column_schemas,
rows: build_rows(0, 3),
};
put_rows(&engine, region_id, rows).await;
let request = ScanRequest::default();
let stream = engine.handle_query(region_id, request).await.unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();
let expected = "\
+-------+---------+---------------------+
| tag_0 | field_0 | ts |
+-------+---------+---------------------+
| 0 | 0.0 | 1970-01-01T00:00:00 |
| 1 | 1.0 | 1970-01-01T00:00:01 |
| 2 | 2.0 | 1970-01-01T00:00:02 |
+-------+---------+---------------------+";
assert_eq!(expected, batches.pretty_print().unwrap());
}


@@ -14,10 +14,13 @@
//! Flush tests for mito engine.
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use api::v1::Rows;
use common_recordbatch::RecordBatches;
use common_time::util::current_time_millis;
use store_api::region_engine::RegionEngine;
use store_api::region_request::RegionRequest;
use store_api::storage::{RegionId, ScanRequest};
@@ -28,6 +31,8 @@ use crate::test_util::{
build_rows, build_rows_for_key, flush_region, put_rows, reopen_region, rows_schema,
CreateRequestBuilder, MockWriteBufferManager, TestEnv,
};
use crate::time_provider::TimeProvider;
use crate::worker::MAX_INITIAL_CHECK_DELAY_SECS;
#[tokio::test]
async fn test_manual_flush() {
@@ -272,3 +277,101 @@ async fn test_flush_reopen_region() {
assert_eq!(2, version_data.last_entry_id);
assert_eq!(5, version_data.committed_sequence);
}
#[derive(Debug)]
struct MockTimeProvider {
now: AtomicI64,
elapsed: AtomicI64,
}
impl TimeProvider for MockTimeProvider {
fn current_time_millis(&self) -> i64 {
self.now.load(Ordering::Relaxed)
}
fn elapsed_since(&self, _current_millis: i64) -> i64 {
self.elapsed.load(Ordering::Relaxed)
}
fn wait_duration(&self, _duration: Duration) -> Duration {
Duration::from_millis(20)
}
}
impl MockTimeProvider {
fn new(now: i64) -> Self {
Self {
now: AtomicI64::new(now),
elapsed: AtomicI64::new(0),
}
}
fn set_now(&self, now: i64) {
self.now.store(now, Ordering::Relaxed);
}
fn set_elapsed(&self, elapsed: i64) {
self.elapsed.store(elapsed, Ordering::Relaxed);
}
}
#[tokio::test]
async fn test_auto_flush_engine() {
let mut env = TestEnv::new();
let write_buffer_manager = Arc::new(MockWriteBufferManager::default());
let listener = Arc::new(FlushListener::default());
let now = current_time_millis();
let time_provider = Arc::new(MockTimeProvider::new(now));
let engine = env
.create_engine_with_time(
MitoConfig {
auto_flush_interval: Duration::from_secs(60 * 5),
..Default::default()
},
Some(write_buffer_manager.clone()),
Some(listener.clone()),
time_provider.clone(),
)
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
let column_schemas = rows_schema(&request);
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
// Prepares rows for flush.
let rows = Rows {
schema: column_schemas.clone(),
rows: build_rows_for_key("a", 0, 2, 0),
};
put_rows(&engine, region_id, rows).await;
// Sets current time to now + auto_flush_interval * 2.
time_provider.set_now(now + (60 * 5 * 2) * 1000);
// Sets elapsed time to MAX_INITIAL_CHECK_DELAY_SECS + 1.
time_provider.set_elapsed((MAX_INITIAL_CHECK_DELAY_SECS as i64 + 1) * 1000);
// Wait until flush is finished.
tokio::time::timeout(Duration::from_secs(3), listener.wait())
.await
.unwrap();
let request = ScanRequest::default();
let scanner = engine.scanner(region_id, request).unwrap();
assert_eq!(0, scanner.num_memtables());
assert_eq!(1, scanner.num_files());
let stream = scanner.scan().await.unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();
let expected = "\
+-------+---------+---------------------+
| tag_0 | field_0 | ts |
+-------+---------+---------------------+
| a | 0.0 | 1970-01-01T00:00:00 |
| a | 1.0 | 1970-01-01T00:00:01 |
+-------+---------+---------------------+";
assert_eq!(expected, batches.pretty_print().unwrap());
}


@@ -572,6 +572,9 @@ pub enum Error {
#[snafu(source)]
error: parquet::errors::ParquetError,
},
#[snafu(display("Invalid region options, {}", reason))]
InvalidRegionOptions { reason: String, location: Location },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -621,7 +624,8 @@ impl ErrorExt for Error {
| FillDefault { .. }
| ConvertColumnDataType { .. }
| ColumnNotFound { .. }
| InvalidMetadata { .. } => StatusCode::InvalidArguments,
| InvalidMetadata { .. }
| InvalidRegionOptions { .. } => StatusCode::InvalidArguments,
InvalidRegionRequestSchemaVersion { .. } => StatusCode::RequestOutdated,


@@ -176,6 +176,8 @@ pub enum FlushReason {
Manual,
/// Flush to alter table.
Alter,
/// Flush periodically.
Periodically,
}
impl FlushReason {
@@ -432,18 +434,19 @@ impl FlushScheduler {
) -> Result<()> {
debug_assert_eq!(region_id, task.region_id);
FLUSH_REQUESTS_TOTAL
.with_label_values(&[task.reason.as_str()])
.inc();
let version = version_control.current().version;
if version.memtables.mutable.is_empty() && version.memtables.immutables().is_empty() {
if version.memtables.is_empty() {
debug_assert!(!self.region_status.contains_key(&region_id));
// The region has nothing to flush.
task.on_success();
return Ok(());
}
// Don't increase the counter if a region has nothing to flush.
FLUSH_REQUESTS_TOTAL
.with_label_values(&[task.reason.as_str()])
.inc();
// Add this region to status map.
let flush_status = self
.region_status


@@ -40,6 +40,7 @@ pub mod request;
pub mod row_converter;
pub(crate) mod schedule;
pub mod sst;
mod time_provider;
pub mod wal;
mod worker;


@@ -28,12 +28,14 @@ use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
use crate::memtable::key_values::KeyValue;
pub use crate::memtable::key_values::KeyValues;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtableBuilder};
use crate::memtable::time_series::TimeSeriesMemtableBuilder;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
use crate::region::options::MemtableOptions;
pub mod key_values;
pub mod merge_tree;
pub mod partition_tree;
pub mod time_partition;
pub mod time_series;
pub(crate) mod version;
@@ -47,13 +49,13 @@ pub type MemtableId = u32;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MemtableConfig {
Experimental(MergeTreeConfig),
PartitionTree(PartitionTreeConfig),
TimeSeries,
}
impl Default for MemtableConfig {
fn default() -> Self {
Self::Experimental(MergeTreeConfig::default())
Self::TimeSeries
}
}
@@ -206,6 +208,48 @@ impl Drop for AllocTracker {
}
}
/// Provider of memtable builders for regions.
#[derive(Clone)]
pub(crate) struct MemtableBuilderProvider {
write_buffer_manager: Option<WriteBufferManagerRef>,
default_memtable_builder: MemtableBuilderRef,
}
impl MemtableBuilderProvider {
pub(crate) fn new(
write_buffer_manager: Option<WriteBufferManagerRef>,
default_memtable_builder: MemtableBuilderRef,
) -> Self {
Self {
write_buffer_manager,
default_memtable_builder,
}
}
pub(crate) fn builder_for_options(
&self,
options: Option<&MemtableOptions>,
) -> MemtableBuilderRef {
match options {
Some(MemtableOptions::TimeSeries) => Arc::new(TimeSeriesMemtableBuilder::new(
self.write_buffer_manager.clone(),
)),
Some(MemtableOptions::PartitionTree(opts)) => {
Arc::new(PartitionTreeMemtableBuilder::new(
PartitionTreeConfig {
index_max_keys_per_shard: opts.index_max_keys_per_shard,
data_freeze_threshold: opts.data_freeze_threshold,
fork_dictionary_bytes: opts.fork_dictionary_bytes,
..Default::default()
},
self.write_buffer_manager.clone(),
))
}
None => self.default_memtable_builder.clone(),
}
}
}
#[cfg(test)]
mod tests {
use common_base::readable_size::ReadableSize;
@@ -216,20 +260,20 @@ mod tests {
#[test]
fn test_deserialize_memtable_config() {
let s = r#"
type = "experimental"
type = "partition_tree"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MemtableConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(merge_tree) = config else {
let MemtableConfig::PartitionTree(memtable_config) = config else {
unreachable!()
};
assert!(merge_tree.dedup);
assert_eq!(8192, merge_tree.index_max_keys_per_shard);
assert_eq!(1024, merge_tree.data_freeze_threshold);
assert_eq!(ReadableSize::mb(512), merge_tree.fork_dictionary_bytes);
assert!(memtable_config.dedup);
assert_eq!(8192, memtable_config.index_max_keys_per_shard);
assert_eq!(1024, memtable_config.data_freeze_threshold);
assert_eq!(ReadableSize::mb(512), memtable_config.fork_dictionary_bytes);
}
#[test]
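Below is a minimal, self-contained sketch of the option-to-builder dispatch added above in MemtableBuilderProvider::builder_for_options. The enums are simplified stand-ins: the real builders carry write buffer managers and the full PartitionTreeConfig, and the engine-wide fallback is whatever the default builder was constructed from (TimeSeries, per the MemtableConfig default change above).

#[derive(Debug, PartialEq)]
enum MemtableKind {
    TimeSeries,
    PartitionTree { index_max_keys_per_shard: usize },
    Default,
}

// Per-region options as parsed from the "memtable.*" region options.
enum MemtableOptions {
    TimeSeries,
    PartitionTree { index_max_keys_per_shard: usize },
}

fn builder_for_options(options: Option<&MemtableOptions>) -> MemtableKind {
    match options {
        Some(MemtableOptions::TimeSeries) => MemtableKind::TimeSeries,
        Some(MemtableOptions::PartitionTree { index_max_keys_per_shard }) => {
            MemtableKind::PartitionTree { index_max_keys_per_shard: *index_max_keys_per_shard }
        }
        // No per-region option: fall back to the engine-wide default builder.
        None => MemtableKind::Default,
    }
}

fn main() {
    let opts = MemtableOptions::PartitionTree { index_max_keys_per_shard: 2 };
    assert_eq!(
        builder_for_options(Some(&opts)),
        MemtableKind::PartitionTree { index_max_keys_per_shard: 2 }
    );
    assert_eq!(builder_for_options(None), MemtableKind::Default);
}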


@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Memtable implementation based on a merge tree.
//! Memtable implementation based on a partition tree.
pub(crate) mod data;
mod dedup;
@@ -37,15 +37,17 @@ use table::predicate::Predicate;
use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
use crate::memtable::key_values::KeyValue;
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::tree::MergeTree;
use crate::memtable::partition_tree::metrics::WriteMetrics;
use crate::memtable::partition_tree::tree::PartitionTree;
use crate::memtable::{
AllocTracker, BoxedBatchIterator, KeyValues, Memtable, MemtableBuilder, MemtableId,
MemtableRef, MemtableStats,
};
/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
const DICTIONARY_SIZE_FACTOR: u64 = 8;
pub(crate) const DICTIONARY_SIZE_FACTOR: u64 = 8;
pub(crate) const DEFAULT_MAX_KEYS_PER_SHARD: usize = 8192;
pub(crate) const DEFAULT_FREEZE_THRESHOLD: usize = 131072;
/// Id of a shard, only unique inside a partition.
type ShardId = u32;
@@ -59,23 +61,30 @@ struct PkId {
pk_index: PkIndex,
}
/// Config for the merge tree memtable.
// TODO(yingwen): `fork_dictionary_bytes` is a per-region option; if we have multiple partition tree
// memtables then we will use a lot of memory. We should find a better way to control the
// dictionary size.
/// Config for the partition tree memtable.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct MergeTreeConfig {
pub struct PartitionTreeConfig {
/// Max keys in an index shard.
pub index_max_keys_per_shard: usize,
/// Number of rows to freeze a data part.
pub data_freeze_threshold: usize,
/// Whether to delete duplicate rows.
///
/// Skips deserializing as it should be determined by whether the
/// table is append only.
#[serde(skip_deserializing)]
pub dedup: bool,
/// Total bytes of dictionary to keep in fork.
pub fork_dictionary_bytes: ReadableSize,
}
impl Default for MergeTreeConfig {
impl Default for PartitionTreeConfig {
fn default() -> Self {
let mut fork_dictionary_bytes = ReadableSize::gb(1);
let mut fork_dictionary_bytes = ReadableSize::mb(512);
if let Some(sys_memory) = common_config::utils::get_sys_total_memory() {
let adjust_dictionary_bytes =
std::cmp::min(sys_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
@@ -93,24 +102,24 @@ impl Default for MergeTreeConfig {
}
}
/// Memtable based on a merge tree.
pub struct MergeTreeMemtable {
/// Memtable based on a partition tree.
pub struct PartitionTreeMemtable {
id: MemtableId,
tree: MergeTree,
tree: PartitionTree,
alloc_tracker: AllocTracker,
max_timestamp: AtomicI64,
min_timestamp: AtomicI64,
}
impl fmt::Debug for MergeTreeMemtable {
impl fmt::Debug for PartitionTreeMemtable {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("MergeTreeMemtable")
f.debug_struct("PartitionTreeMemtable")
.field("id", &self.id)
.finish()
}
}
impl Memtable for MergeTreeMemtable {
impl Memtable for PartitionTreeMemtable {
fn id(&self) -> MemtableId {
self.id
}
@@ -188,29 +197,29 @@ impl Memtable for MergeTreeMemtable {
fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
let tree = self.tree.fork(metadata.clone());
let memtable = MergeTreeMemtable::with_tree(id, tree);
let memtable = PartitionTreeMemtable::with_tree(id, tree);
Arc::new(memtable)
}
}
impl MergeTreeMemtable {
impl PartitionTreeMemtable {
/// Returns a new memtable.
pub fn new(
id: MemtableId,
metadata: RegionMetadataRef,
write_buffer_manager: Option<WriteBufferManagerRef>,
config: &MergeTreeConfig,
config: &PartitionTreeConfig,
) -> Self {
Self::with_tree(
id,
MergeTree::new(metadata, config, write_buffer_manager.clone()),
PartitionTree::new(metadata, config, write_buffer_manager.clone()),
)
}
/// Creates a mutable memtable from the tree.
///
/// It also adds the bytes used by shared parts (e.g. index) to the memory usage.
fn with_tree(id: MemtableId, tree: MergeTree) -> Self {
fn with_tree(id: MemtableId, tree: PartitionTree) -> Self {
let alloc_tracker = AllocTracker::new(tree.write_buffer_manager());
Self {
@@ -269,17 +278,17 @@ impl MergeTreeMemtable {
}
}
/// Builder to build a [MergeTreeMemtable].
/// Builder to build a [PartitionTreeMemtable].
#[derive(Debug, Default)]
pub struct MergeTreeMemtableBuilder {
config: MergeTreeConfig,
pub struct PartitionTreeMemtableBuilder {
config: PartitionTreeConfig,
write_buffer_manager: Option<WriteBufferManagerRef>,
}
impl MergeTreeMemtableBuilder {
impl PartitionTreeMemtableBuilder {
/// Creates a new builder with specific `write_buffer_manager`.
pub fn new(
config: MergeTreeConfig,
config: PartitionTreeConfig,
write_buffer_manager: Option<WriteBufferManagerRef>,
) -> Self {
Self {
@@ -289,9 +298,9 @@ impl MergeTreeMemtableBuilder {
}
}
impl MemtableBuilder for MergeTreeMemtableBuilder {
impl MemtableBuilder for PartitionTreeMemtableBuilder {
fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
Arc::new(MergeTreeMemtable::new(
Arc::new(PartitionTreeMemtable::new(
id,
metadata.clone(),
self.write_buffer_manager.clone(),
@@ -326,7 +335,8 @@ mod tests {
let timestamps = (0..100).collect::<Vec<_>>();
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
let memtable = MergeTreeMemtable::new(1, metadata, None, &MergeTreeConfig::default());
let memtable =
PartitionTreeMemtable::new(1, metadata, None, &PartitionTreeConfig::default());
memtable.write(&kvs).unwrap();
let expected_ts = kvs
@@ -362,7 +372,7 @@ mod tests {
memtable_util::metadata_with_primary_key(vec![], false)
};
let memtable =
MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
PartitionTreeMemtable::new(1, metadata.clone(), None, &PartitionTreeConfig::default());
let kvs = memtable_util::build_key_values(
&metadata,
@@ -421,8 +431,8 @@ mod tests {
memtable_util::metadata_with_primary_key(vec![], false)
};
// Try to build a memtable via the builder.
let memtable =
MergeTreeMemtableBuilder::new(MergeTreeConfig::default(), None).build(1, &metadata);
let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
.build(1, &metadata);
let expect = (0..100).collect::<Vec<_>>();
let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
@@ -457,11 +467,11 @@ mod tests {
fn write_iter_multi_keys(max_keys: usize, freeze_threshold: usize) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let memtable = MergeTreeMemtable::new(
let memtable = PartitionTreeMemtable::new(
1,
metadata.clone(),
None,
&MergeTreeConfig {
&PartitionTreeConfig {
index_max_keys_per_shard: max_keys,
data_freeze_threshold: freeze_threshold,
..Default::default()
@@ -506,8 +516,8 @@ mod tests {
fn test_memtable_filter() {
let metadata = memtable_util::metadata_with_primary_key(vec![0, 1], false);
// Try to build a memtable via the builder.
let memtable = MergeTreeMemtableBuilder::new(
MergeTreeConfig {
let memtable = PartitionTreeMemtableBuilder::new(
PartitionTreeConfig {
index_max_keys_per_shard: 40,
..Default::default()
},
@@ -539,4 +549,17 @@ mod tests {
assert_eq!(timestamps, read);
}
}
#[test]
fn test_deserialize_config() {
let config = PartitionTreeConfig {
dedup: false,
..Default::default()
};
// Creates a json with dedup = false.
let json = serde_json::to_string(&config).unwrap();
let config: PartitionTreeConfig = serde_json::from_str(&json).unwrap();
assert!(config.dedup);
assert_eq!(PartitionTreeConfig::default(), config);
}
}


@@ -45,9 +45,11 @@ use store_api::storage::consts::{OP_TYPE_COLUMN_NAME, SEQUENCE_COLUMN_NAME};
use crate::error;
use crate::error::Result;
use crate::memtable::key_values::KeyValue;
use crate::memtable::merge_tree::merger::{DataBatchKey, DataNode, DataSource, Merger};
use crate::memtable::merge_tree::PkIndex;
use crate::metrics::{MERGE_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED, MERGE_TREE_READ_STAGE_ELAPSED};
use crate::memtable::partition_tree::merger::{DataBatchKey, DataNode, DataSource, Merger};
use crate::memtable::partition_tree::PkIndex;
use crate::metrics::{
PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED, PARTITION_TREE_READ_STAGE_ELAPSED,
};
const PK_INDEX_COLUMN_NAME: &str = "__pk_index";
@@ -255,7 +257,7 @@ impl DataBuffer {
/// Builds a lazily initialized data buffer reader from [DataBuffer]
pub fn read(&self) -> Result<DataBufferReaderBuilder> {
let _timer = MERGE_TREE_READ_STAGE_ELAPSED
let _timer = PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["read_data_buffer"])
.start_timer();
@@ -523,7 +525,7 @@ pub(crate) struct DataBufferReader {
impl Drop for DataBufferReader {
fn drop(&mut self) {
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["read_data_buffer"])
.observe(self.elapsed_time.as_secs_f64())
}
@@ -780,7 +782,7 @@ impl<'a> DataPartEncoder<'a> {
let mut bytes = Vec::with_capacity(1024);
let rb = {
let _timer = MERGE_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED
let _timer = PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED
.with_label_values(&["drain_data_buffer_to_batch"])
.start_timer();
drain_data_buffer_to_record_batches(
@@ -793,7 +795,7 @@ impl<'a> DataPartEncoder<'a> {
};
{
let _timer = MERGE_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED
let _timer = PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED
.with_label_values(&["encode"])
.start_timer();
let mut writer =
@@ -837,7 +839,7 @@ pub struct DataPartReader {
impl Drop for DataPartReader {
fn drop(&mut self) {
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["read_data_part"])
.observe(self.elapsed.as_secs_f64());
}
@@ -973,7 +975,7 @@ impl DataParts {
/// The returned iterator yields a record batch of one primary key at a time.
/// The order of yielding primary keys is determined by provided weights.
pub fn read(&self) -> Result<DataPartsReaderBuilder> {
let _timer = MERGE_TREE_READ_STAGE_ELAPSED
let _timer = PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["build_data_parts_reader"])
.start_timer();
@@ -1030,7 +1032,7 @@ pub struct DataPartsReader {
impl Drop for DataPartsReader {
fn drop(&mut self) {
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["read_data_parts"])
.observe(self.elapsed.as_secs_f64())
}


@@ -15,9 +15,9 @@
use std::ops::Range;
use crate::error::Result;
use crate::memtable::merge_tree::data::DataBatch;
use crate::memtable::merge_tree::shard::DataBatchSource;
use crate::memtable::merge_tree::PkId;
use crate::memtable::partition_tree::data::DataBatch;
use crate::memtable::partition_tree::shard::DataBatchSource;
use crate::memtable::partition_tree::PkId;
/// A reader that dedups sorted batches from a merger.
pub struct DedupReader<T> {
@@ -112,7 +112,7 @@ mod tests {
use store_api::metadata::RegionMetadataRef;
use super::*;
use crate::memtable::merge_tree::data::{DataBuffer, DataParts, DataPartsReader};
use crate::memtable::partition_tree::data::{DataBuffer, DataParts, DataPartsReader};
use crate::test_util::memtable_util::{
extract_data_batch, metadata_for_test, write_rows_to_buffer,
};


@@ -19,8 +19,8 @@ use std::sync::Arc;
use datatypes::arrow::array::{Array, ArrayBuilder, BinaryArray, BinaryBuilder};
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::PkIndex;
use crate::memtable::partition_tree::metrics::WriteMetrics;
use crate::memtable::partition_tree::PkIndex;
use crate::metrics::MEMTABLE_DICT_BYTES;
/// Maximum keys in a [DictBlock].


@@ -18,8 +18,8 @@ use std::fmt::Debug;
use std::ops::Range;
use crate::error::Result;
use crate::memtable::merge_tree::data::{DataBatch, DataBufferReader, DataPartReader};
use crate::memtable::merge_tree::PkIndex;
use crate::memtable::partition_tree::data::{DataBatch, DataBufferReader, DataPartReader};
use crate::memtable::partition_tree::PkIndex;
/// Nodes of merger's heap.
pub trait Node: Ord {
@@ -297,7 +297,7 @@ mod tests {
use store_api::metadata::RegionMetadataRef;
use super::*;
use crate::memtable::merge_tree::data::{timestamp_array_to_i64_slice, DataBuffer};
use crate::memtable::partition_tree::data::{timestamp_array_to_i64_slice, DataBuffer};
use crate::test_util::memtable_util::{build_key_values_with_ts_seq_values, metadata_for_test};
fn write_rows_to_buffer(


@@ -14,7 +14,7 @@
//! Internal metrics of the memtable.
/// Metrics of writing the merge tree.
/// Metrics of writing the partition tree.
pub struct WriteMetrics {
/// Size allocated by keys.
pub key_bytes: usize,


@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Partition of a merge tree.
//! Partition of a partition tree.
//!
//! We only support partitioning the tree by pre-defined internal columns.
@@ -28,15 +28,15 @@ use store_api::storage::ColumnId;
use crate::error::Result;
use crate::memtable::key_values::KeyValue;
use crate::memtable::merge_tree::data::{DataBatch, DataParts, DATA_INIT_CAP};
use crate::memtable::merge_tree::dedup::DedupReader;
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::shard::{
use crate::memtable::partition_tree::data::{DataBatch, DataParts, DATA_INIT_CAP};
use crate::memtable::partition_tree::dedup::DedupReader;
use crate::memtable::partition_tree::metrics::WriteMetrics;
use crate::memtable::partition_tree::shard::{
BoxedDataBatchSource, Shard, ShardMerger, ShardNode, ShardSource,
};
use crate::memtable::merge_tree::shard_builder::ShardBuilder;
use crate::memtable::merge_tree::{MergeTreeConfig, PkId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
use crate::memtable::partition_tree::shard_builder::ShardBuilder;
use crate::memtable::partition_tree::{PartitionTreeConfig, PkId};
use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
use crate::read::{Batch, BatchBuilder};
use crate::row_converter::{McmpRowCodec, RowCodec};
@@ -54,7 +54,7 @@ pub type PartitionRef = Arc<Partition>;
impl Partition {
/// Creates a new partition.
pub fn new(metadata: RegionMetadataRef, config: &MergeTreeConfig) -> Self {
pub fn new(metadata: RegionMetadataRef, config: &PartitionTreeConfig) -> Self {
Partition {
inner: RwLock::new(Inner::new(metadata, config)),
dedup: config.dedup,
@@ -193,7 +193,7 @@ impl Partition {
/// Forks the partition.
///
/// Must freeze the partition before fork.
pub fn fork(&self, metadata: &RegionMetadataRef, config: &MergeTreeConfig) -> Partition {
pub fn fork(&self, metadata: &RegionMetadataRef, config: &PartitionTreeConfig) -> Partition {
let (shards, shard_builder) = {
let inner = self.inner.read().unwrap();
debug_assert!(inner.shard_builder.is_empty());
@@ -437,11 +437,11 @@ pub(crate) struct ReadPartitionContext {
impl Drop for ReadPartitionContext {
fn drop(&mut self) {
let partition_read_source = self.metrics.read_source.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_read_source"])
.observe(partition_read_source);
let partition_data_batch_to_batch = self.metrics.data_batch_to_batch.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_data_batch_to_batch"])
.observe(partition_data_batch_to_batch);
@@ -558,7 +558,7 @@ struct Inner {
}
impl Inner {
fn new(metadata: RegionMetadataRef, config: &MergeTreeConfig) -> Self {
fn new(metadata: RegionMetadataRef, config: &PartitionTreeConfig) -> Self {
let (shards, current_shard_id) = if metadata.primary_key.is_empty() {
let data_parts = DataParts::new(metadata.clone(), DATA_INIT_CAP, config.dedup);
(


@@ -21,15 +21,15 @@ use store_api::metadata::RegionMetadataRef;
use crate::error::Result;
use crate::memtable::key_values::KeyValue;
use crate::memtable::merge_tree::data::{
use crate::memtable::partition_tree::data::{
DataBatch, DataParts, DataPartsReader, DataPartsReaderBuilder, DATA_INIT_CAP,
};
use crate::memtable::merge_tree::dict::KeyDictRef;
use crate::memtable::merge_tree::merger::{Merger, Node};
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard_builder::ShardBuilderReader;
use crate::memtable::merge_tree::{PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
use crate::memtable::partition_tree::dict::KeyDictRef;
use crate::memtable::partition_tree::merger::{Merger, Node};
use crate::memtable::partition_tree::partition::PrimaryKeyFilter;
use crate::memtable::partition_tree::shard_builder::ShardBuilderReader;
use crate::memtable::partition_tree::{PkId, PkIndex, ShardId};
use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
/// Shard stores data related to the same key dictionary.
pub struct Shard {
@@ -257,7 +257,7 @@ impl ShardReader {
impl Drop for ShardReader {
fn drop(&mut self) {
let shard_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_prune_pk"])
.observe(shard_prune_pk);
if self.keys_before_pruning > 0 {
@@ -427,10 +427,10 @@ mod tests {
use std::sync::Arc;
use super::*;
use crate::memtable::merge_tree::data::timestamp_array_to_i64_slice;
use crate::memtable::merge_tree::dict::KeyDictBuilder;
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::PkIndex;
use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice;
use crate::memtable::partition_tree::dict::KeyDictBuilder;
use crate::memtable::partition_tree::metrics::WriteMetrics;
use crate::memtable::partition_tree::PkIndex;
use crate::memtable::KeyValues;
use crate::test_util::memtable_util::{
build_key_values_with_ts_seq_values, encode_keys, metadata_for_test,


@@ -22,15 +22,15 @@ use store_api::metadata::RegionMetadataRef;
use crate::error::Result;
use crate::memtable::key_values::KeyValue;
use crate::memtable::merge_tree::data::{
use crate::memtable::partition_tree::data::{
DataBatch, DataBuffer, DataBufferReader, DataBufferReaderBuilder, DataParts, DATA_INIT_CAP,
};
use crate::memtable::merge_tree::dict::{DictBuilderReader, KeyDictBuilder};
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard::Shard;
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
use crate::memtable::partition_tree::dict::{DictBuilderReader, KeyDictBuilder};
use crate::memtable::partition_tree::metrics::WriteMetrics;
use crate::memtable::partition_tree::partition::PrimaryKeyFilter;
use crate::memtable::partition_tree::shard::Shard;
use crate::memtable::partition_tree::{PartitionTreeConfig, PkId, PkIndex, ShardId};
use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
/// Builder to write keys and data to a shard whose key dictionary
/// is still active.
@@ -50,7 +50,7 @@ impl ShardBuilder {
/// Returns a new builder.
pub fn new(
metadata: RegionMetadataRef,
config: &MergeTreeConfig,
config: &PartitionTreeConfig,
shard_id: ShardId,
) -> ShardBuilder {
ShardBuilder {
@@ -150,14 +150,14 @@ impl ShardBuilder {
/// Scans the shard builder.
pub fn read(&self, pk_weights_buffer: &mut Vec<u16>) -> Result<ShardBuilderReaderBuilder> {
let dict_reader = {
let _timer = MERGE_TREE_READ_STAGE_ELAPSED
let _timer = PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_builder_read_pk"])
.start_timer();
self.dict_builder.read()
};
{
let _timer = MERGE_TREE_READ_STAGE_ELAPSED
let _timer = PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["sort_pk"])
.start_timer();
dict_reader.pk_weights_to_sort_data(pk_weights_buffer);
@@ -296,7 +296,7 @@ impl ShardBuilderReader {
impl Drop for ShardBuilderReader {
fn drop(&mut self) {
let shard_builder_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
PARTITION_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_builder_prune_pk"])
.observe(shard_builder_prune_pk);
if self.keys_before_pruning > 0 {
@@ -315,8 +315,8 @@ impl Drop for ShardBuilderReader {
mod tests {
use super::*;
use crate::memtable::merge_tree::data::timestamp_array_to_i64_slice;
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice;
use crate::memtable::partition_tree::metrics::WriteMetrics;
use crate::memtable::KeyValues;
use crate::test_util::memtable_util::{
build_key_values_with_ts_seq_values, encode_key_by_kv, metadata_for_test,
@@ -355,7 +355,7 @@ mod tests {
fn test_write_shard_builder() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let config = MergeTreeConfig::default();
let config = PartitionTreeConfig::default();
let mut shard_builder = ShardBuilder::new(metadata.clone(), &config, 1);
let mut metrics = WriteMetrics::default();
assert!(shard_builder
@@ -382,7 +382,7 @@ mod tests {
fn test_write_read_shard_builder() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let config = MergeTreeConfig::default();
let config = PartitionTreeConfig::default();
let mut shard_builder = ShardBuilder::new(metadata.clone(), &config, 1);
let mut metrics = WriteMetrics::default();

Some files were not shown because too many files have changed in this diff.