Compare commits

..

54 Commits

Author SHA1 Message Date
Ning Sun
2bbc4bc4bc fix: correct signature of current_schemas function (#7233) 2025-11-17 01:42:09 +00:00
Alan Tang
b1525e566b chore: fix SQLness test for COPY command from CSV file (#7235)
chore: fix SQLness test for COPY command from CSV file

Signed-off-by: StandingMan <jmtangcs@gmail.com>
2025-11-16 07:08:13 +00:00
Yingwen
df954b47d5 fix: clone the page before putting into the index cache (#7229)
* fix: clone the page before putting into the index cache

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix warnings

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-15 17:52:32 +00:00
liyang
acfd674332 ci: update helm-charts and homebrew-greptime pull request reviewer (#7232)
* ci: update helm-charts and homebrew-greptime pull request reviewer

Signed-off-by: liyang <daviderli614@gmail.com>

* add reviewer

Signed-off-by: liyang <daviderli614@gmail.com>

---------

Signed-off-by: liyang <daviderli614@gmail.com>
2025-11-15 17:51:28 +00:00
shuiyisong
e7928aaeee chore: add tls-watch option in cmd (#7226)
* chore: add tls-watch cmd option

* chore: add watch tls option to standalone and fe cmd

* chore: fix clippy

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: address CR comment

Co-authored-by: Yingwen <realevenyag@gmail.com>

* chore: address CR issue

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-11-14 09:58:52 +00:00
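
For the `tls-watch` change above, a minimal sketch of how such a flag could be declared with clap's derive API; the struct, field names, and flag wiring here are hypothetical illustrations, not taken from the GreptimeDB source:

```rust
use clap::Parser;

/// Hypothetical subset of a server command's TLS options.
#[derive(Parser, Debug)]
struct TlsOptions {
    /// Path to the TLS certificate file.
    #[arg(long = "tls-cert-path")]
    tls_cert_path: Option<String>,

    /// Path to the TLS private key file.
    #[arg(long = "tls-key-path")]
    tls_key_path: Option<String>,

    /// Reload certificates automatically when the files change on disk.
    #[arg(long = "tls-watch")]
    tls_watch: bool,
}

fn main() {
    let opts = TlsOptions::parse();
    if opts.tls_watch {
        // A real implementation would spawn a file watcher and rebuild the
        // TLS acceptor whenever the cert/key files change.
        println!("TLS certificate watching enabled");
    }
}
```
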
Weny Xu
d5f52013ec feat: introduce batch region migration (#7176)
* feat: introduce batch region migration

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: try fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix clippy

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix get table route

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: avoid cloning vec

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-11-14 08:15:18 +00:00
Weny Xu
c1e762960a fix: obtain system time after fetching lease values (#7223)
* fix: acquire system time inside closure

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-11-14 06:53:15 +00:00
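
The fix title describes a general pattern: read the wall clock only after the lease values have been fetched, so fetch latency is not counted against the lease. A minimal illustration of the idea; `Lease`, `fetch_leases`, and the timeout value are invented for this sketch:

```rust
use std::time::{Duration, SystemTime};

struct Lease {
    // Wall-clock time at which the lease was last renewed.
    last_heartbeat: SystemTime,
}

const LEASE_TIMEOUT: Duration = Duration::from_secs(30);

fn is_alive(lease: &Lease, now: SystemTime) -> bool {
    now.duration_since(lease.last_heartbeat)
        .map(|elapsed| elapsed < LEASE_TIMEOUT)
        .unwrap_or(true)
}

fn alive_leases(fetch_leases: impl FnOnce() -> Vec<Lease>) -> Vec<Lease> {
    let leases = fetch_leases();
    // Take the timestamp *after* the (possibly slow) fetch completes, so the
    // time spent fetching is not mistaken for missed heartbeats.
    let now = SystemTime::now();
    leases.into_iter().filter(|l| is_alive(l, now)).collect()
}
```
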
Yingwen
7cc0439cc9 feat: load latest index file first (#7221)
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-13 08:56:44 +00:00
shuiyisong
6eb7efcb76 chore: add debug log on receiving logs (#7211)
* chore: add debug log on receiving logs

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: add debug log on receiving logs

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-11-13 07:15:26 +00:00
dennis zhuang
5d0e94bfa8 docs: update project status and tweak readme (#7216)
* docs: update project status and tweak readme

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: style

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: minor change

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* feat: add grafana datasource plugin project link

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* feat: adds scenarios

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: apply suggestions

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-11-12 15:06:56 +00:00
shuiyisong
e842d401fb chore: allow unlimited return if timerange is applied (#7222)
Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-11-12 10:00:11 +00:00
discord9
8153068b89 chore: bump main branch version to 1.0.0-beta.1 (#7191)
* chore: bump main branch version to 1.0.0-beta.1

Signed-off-by: discord9 <discord9@163.com>

* rename beta.1 to beta1

Signed-off-by: discord9 <discord9@163.com>

* again

Signed-off-by: discord9 <discord9@163.com>

* test: correct redact version

Signed-off-by: discord9 <discord9@163.com>

* chore

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-11-11 14:52:03 +00:00
Yingwen
bb6a3a2ff3 feat: support altering sst format for a table (#7206)
* refactor: remove memtable_builder from MitoRegion

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: add alter format

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support changing the format and memtable

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support changing sst format via table options

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: set scanner and memtable builder with correct format

Signed-off-by: evenyag <realevenyag@gmail.com>

* style: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: fix incorrect metadata in version after alter

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add sqlness test

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: replace region_id in sqlness result

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: create correct memtable when setting sst_format explicitly

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: sqlness alter_format test set sst_format to primary_key

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remove verbose log

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-11 13:19:00 +00:00
Weny Xu
49c6812e98 fix: deregister failure detectors on rollback and improve timeout handling (#7212)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-11-11 09:44:27 +00:00
Yingwen
24671b60b4 feat: tracks index files in another cache and preloads them (#7181)
* feat: divide parquet and puffin index

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: download index files when we open the region

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: use different label for parquet/puffin

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: control parallelism and cache size by env

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: change gauge to counter

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: correct file type labels in file cache

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: move env to config and change cache ratio to percent

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: checks capacity before download and refine metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: change open to return MitoRegionRef

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: extract download to FileCache

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: run load cache task in write cache

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: check region state before downloading files

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: update config docs and test

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: use file id from index_file_id to compute puffin key

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: skip loading cache in some states

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-11 08:37:32 +00:00
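
One bullet above ("checks capacity before download") describes a simple guard before preloading index files into the local cache. A rough sketch of that idea, with all types and fields invented for illustration:

```rust
/// Hypothetical view of a local file cache with a fixed byte budget.
struct FileCache {
    capacity_bytes: u64,
    used_bytes: u64,
}

impl FileCache {
    fn remaining(&self) -> u64 {
        self.capacity_bytes.saturating_sub(self.used_bytes)
    }

    /// Only download an index file into the cache if it still fits;
    /// otherwise skip preloading and fall back to remote reads.
    fn maybe_preload(&mut self, file_size: u64, download: impl FnOnce()) {
        if file_size <= self.remaining() {
            download();
            self.used_bytes += file_size;
        }
    }
}
```
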
jeremyhi
c7fded29ee feat: query mem limiter (#7078)
* feat: query mem limiter

* feat: config docs

* feat: frontend query limit config

* fix: unused imports

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: add metrics for query memory tracker

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: right position for tracker

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: avoid race condition

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: soft and hard limit

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: docs

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: when soft_limit == 0

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: upgrade limit algorithm

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: remove batch window

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: batch mem size

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: refine limit algorithm

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: get sys mem

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: minor change

* feat: up tracker to the top stream

* feat: estimated_size for batch

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: minor refactor

* feat: scan_memory_limit connect to max_concurrent_queries

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: make callback clearer

* feat: add unlimited enum

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: by review comment

* chore: comment on recursion_limit

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: refactor and put permit into RegionScanExec

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: multiple lazy static blocks

* chore: minor change

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

---------

Signed-off-by: jeremyhi <fengjiachun@gmail.com>
2025-11-11 07:47:55 +00:00
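
The commit messages above mention soft and hard memory limits for queries. As a rough, hypothetical sketch of a two-threshold tracker in that spirit (not the actual GreptimeDB implementation):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

enum Admission {
    /// Below the soft limit: admit the batch as usual.
    Admit,
    /// Between the soft and hard limits: admit, but callers may throttle.
    Throttle,
    /// At or above the hard limit: reject the allocation.
    Reject,
}

struct QueryMemTracker {
    used: AtomicU64,
    soft_limit: u64,
    hard_limit: u64,
}

impl QueryMemTracker {
    fn try_track(&self, batch_bytes: u64) -> Admission {
        let new_used = self.used.fetch_add(batch_bytes, Ordering::Relaxed) + batch_bytes;
        if new_used >= self.hard_limit {
            // Roll back the reservation and reject.
            self.used.fetch_sub(batch_bytes, Ordering::Relaxed);
            Admission::Reject
        } else if new_used >= self.soft_limit {
            Admission::Throttle
        } else {
            Admission::Admit
        }
    }

    fn release(&self, batch_bytes: u64) {
        self.used.fetch_sub(batch_bytes, Ordering::Relaxed);
    }
}
```
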
Ruihang Xia
afa8684ebd feat: report scanner metrics (#7200)
* feat: report scanner metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update src/mito2/src/read/scan_util.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-11-11 07:40:08 +00:00
Weny Xu
47937961f6 feat(metric)!: enable sparse primary key encoding by default (#7195)
* feat(metric): enable sparse primary key encoding by default

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update config.md

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix sqlness

Signed-off-by: WenyXu <wenymedia@gmail.com>

* Update src/mito-codec/src/key_values.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

* feat: only allow setting primary key encoding for metric engine

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support deleting rows from logical region instead of physical region

This keeps the behavior the same as put. It's easier to support sparse
encoding for deleting logical regions. Now the metric engine doesn't
support deleting rows from the physical region directly.

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: update sqlness

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remove unused error

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-11-11 06:33:51 +00:00
Lei, HUANG
182cce4cc2 fix(mito): allow region edit in writable state (#7201)
* fix/region-expire-state:
 Refactor region state handling in compaction task and manifest updates

 - Introduce a variable to hold the current region state for clarity in compaction task updates.
 - Add an expected_region_state field to RegionEditResult to manage region state expectations during manifest handling.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/region-expire-state:
 Refactor region state handling in compaction task

 - Replace direct assignment of `RegionLeaderState::Writable` with dynamic state retrieval and conditional check for leader state.
 - Modify `RegionEditResult` to include a flag `update_region_state` instead of `expected_region_state` to indicate if the region state should be updated to writable.
 - Adjust handling of `RegionEditResult` in `handle_manifest` to conditionally update region state based on the new flag.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-11-11 06:16:23 +00:00
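
The second commit message above describes replacing an expected-state field with a boolean flag on the edit result. A schematic of that shape, with types simplified and invented for this sketch:

```rust
enum RegionLeaderState {
    Writable,
    Editing,
    Dropping,
}

/// Result of applying a region edit to the manifest (simplified).
struct RegionEditResult {
    success: bool,
    /// Whether the worker should flip the region back to `Writable`
    /// after the manifest update, instead of asserting a fixed
    /// expected state up front.
    update_region_state: bool,
}

fn handle_edit_result(state: &mut RegionLeaderState, result: &RegionEditResult) {
    if result.success && result.update_region_state {
        *state = RegionLeaderState::Writable;
    }
}
```
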
Weny Xu
ac0e95c193 fix: correct leader state reset and region migration locking consistency (#7199)
* fix(meta): remove table route cache in region migration ctx

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fix clippy

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix campaign reset not clearing leader states

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: gracefully handle region lease renewal errors

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-11-11 01:19:26 +00:00
Lei, HUANG
f567dcef86 feat: allow fuzz input override through env var (#7208)
* feat/allow-fuzz-input-override:
 Add environment override for fuzzing parameters and seed values

 - Implement `get_fuzz_override` function to read override values from environment variables for fuzzing parameters.
 - Allow overriding `SEED`, `ACTIONS`, `ROWS`, `TABLES`, `COLUMNS`, `INSERTS`, and `PARTITIONS` in various fuzzing targets.
 - Introduce new constants `GT_FUZZ_INPUT_MAX_PARTITIONS` and `FUZZ_OVERRIDE_PREFIX`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-fuzz-input-override: Remove GT_FUZZ_INPUT_MAX_PARTITIONS constant and usage from fuzzing utils and tests

 • Deleted the GT_FUZZ_INPUT_MAX_PARTITIONS constant from fuzzing utility functions.
 • Updated FuzzInput struct in fuzz_migrate_mito_regions.rs to use a hardcoded range instead of get_gt_fuzz_input_max_partitions for determining the number of partitions.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/allow-fuzz-input-override:
 Improve fuzzing documentation with environment variable overrides

 Enhanced the fuzzing instructions in the README to include guidance on how to override fuzz input using environment variables, providing an example for better clarity.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-11-10 14:02:23 +00:00
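
The commit adds an environment-variable override for generated fuzz inputs (a `FUZZ_OVERRIDE_PREFIX` constant plus parameter names such as `SEED` or `ROWS`). A small sketch of how such an override helper could look; the prefix value and the function body below are illustrations, not the project's code:

```rust
use std::env;
use std::str::FromStr;

/// Hypothetical prefix; the commit introduces a `FUZZ_OVERRIDE_PREFIX` constant.
const FUZZ_OVERRIDE_PREFIX: &str = "GT_FUZZ_INPUT_";

/// Returns the override for a fuzz parameter (e.g. "SEED", "ROWS") if the
/// corresponding environment variable is set and parses, otherwise `None`.
fn get_fuzz_override<T: FromStr>(name: &str) -> Option<T> {
    env::var(format!("{FUZZ_OVERRIDE_PREFIX}{name}"))
        .ok()
        .and_then(|v| v.parse::<T>().ok())
}

fn main() {
    // Fall back to the fuzzer-generated value when no override is set.
    let rows: usize = get_fuzz_override("ROWS").unwrap_or(1024);
    let seed: u64 = get_fuzz_override("SEED").unwrap_or_else(default_seed);
    println!("rows = {rows}, seed = {seed}");
}

fn default_seed() -> u64 {
    // Stand-in for whatever seed the fuzz harness would otherwise use.
    42
}
```
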
Ruihang Xia
30192d9802 feat: disable default compression for __op_type column (#7196)
* feat: disable default compression for `__op_type` column

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* revert unrelated code

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-11-10 07:59:25 +00:00
Ning Sun
62d109c1f4 fix: allow case-insensitive timezone settings (#7207) 2025-11-08 15:56:27 +00:00
Alan Tang
910a383420 feat(expr): support avg functions on vector (#7146)
* feat(expr): support vec_elem_avg function

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

* feat: support vec_avg function

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

* test: add more query test for avg aggregator

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

* fix: fix the merge batch mode

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

* refactor: use sum and count as state for avg function

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

* refactor: refactor merge batch mode for avg function

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

* feat: add additional vector restrictions for validation

Signed-off-by: Alan Tang <jmtangcs@gmail.com>

---------

Signed-off-by: Alan Tang <jmtangcs@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2025-11-07 13:42:14 +00:00
Weny Xu
af6bbacc8c fix: add serde defaults for MetasrvNodeInfo (#7204)
* fix: add serde defaults for `MetasrvNodeInfo`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fmt

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-11-07 09:50:09 +00:00
Yingwen
7616ffcb35 test: only set ttl to forever in fuzz alter test (#7202)
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-07 07:32:53 +00:00
shuiyisong
a3dbd029c5 chore: remove ttl option if present in trace meta table (#7197)
* chore: remove ttl option if present in trace meta table

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-11-06 11:51:45 +00:00
Yingwen
9caeae391e chore: print root cause in opendal logging interceptor (#7183)
* chore: print root cause in opendal

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: extract a function root_source() to get the cause

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-06 08:48:59 +00:00
fys
35951afff9 chore: remove unnecessary code related to triggers (#7192)
* chore: remove unused triggers memory tables

* fix: cargo clippy

* fix: sqlness
2025-11-06 08:09:14 +00:00
Ruihang Xia
a049b68c26 feat: import backup data from local files (#7180)
* feat: import backup data from local files

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add unit tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-11-06 07:33:33 +00:00
Lei, HUANG
c2ff563ac6 fix(mito): avoid shortcut in picking multi window files (#7174)
* fix/pick-continue:
 ### Add Tests for TWCS Compaction Logic

 - **`twcs.rs`**:
   - Modified the logic in `TwcsPicker` to handle cases with zero runs by using `continue` instead of `return`.
  - Added two new test cases: `test_build_output_multiple_windows_with_zero_runs` and `test_build_output_single_window_zero_runs` to verify the behavior of the compaction logic when there are zero runs in the windows.

 - **`memtable_util.rs`**:
   - Removed unused import `PredicateGroup`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix: clippy

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/pick-continue:
 ### Commit Message

 Enhance Compaction Process with Expired SST Handling and Testing

 - **`compactor.rs`**:
   - Introduced handling for expired SSTs by updating the manifest immediately upon task completion.
   - Added new test cases to verify the handling of expired SSTs and manifest updates.

 - **`task.rs`**:
   - Implemented `remove_expired` function to handle expired SSTs by updating the manifest and notifying the region worker loop.
   - Refactored `handle_compaction` to `handle_expiration_and_compaction` to integrate expired SST removal before merging inputs.
   - Added logging and error handling for expired SST removal process.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/progressive-compaction:
 **Enhance Compaction Task Error Handling**

 - Updated `task.rs` to conditionally execute the removal of expired SST files only when they exist, improving error handling and performance.
 - Added a check for non-empty `expired_ssts` before initiating the removal process, ensuring unnecessary operations are avoided.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/progressive-compaction:
 ### Refactor `DefaultCompactor` to Extract `merge_single_output` Method

 - **File**: `src/mito2/src/compaction/compactor.rs`
   - Extracted the logic for merging a single compaction output into SST files into a new method `merge_single_output` within the `DefaultCompactor` struct.
   - Simplified the `merge_ssts` method by utilizing the new `merge_single_output` method, reducing code duplication and improving maintainability.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* refactor/progressive-compaction:
 ### Add Max Background Compaction Tasks Configuration

 - **`compaction.rs`**: Added `max_background_compactions` to the compaction scheduler to limit background tasks.
 - **`compaction/compactor.rs`**: Removed immediate manifest update logic after task completion.
 - **`compaction/picker.rs`**: Introduced `max_background_tasks` parameter in `new_picker` to control task limits.
 - **`compaction/twcs.rs`**: Updated `TwcsPicker` to include `max_background_tasks` and truncate inputs exceeding this limit. Added related test cases to ensure functionality.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/pick-continue:
 ### Improve Error Handling and Task Management in Compaction

 - **`task.rs`**: Enhanced error handling in `remove_expired` function by logging errors without halting the compaction process. Removed the return of `Result` type and added detailed logging for various failure scenarios.
 - **`twcs.rs`**: Adjusted task management logic by removing input truncation based on `max_background_tasks` and instead discarding remaining tasks if the output size exceeds the limit. This ensures better control over task execution and resource management.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/pick-continue:
 ### Add Unit Tests for Compaction Task and TWCS Picker

 - **`task.rs`**: Added unit tests to verify the behavior of `PickerOutput` with and without expired SSTs.
 - **`twcs.rs`**: Introduced tests for `TwcsPicker` to ensure correct handling of `max_background_tasks` during compaction, including scenarios with and without task truncation.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/pick-continue:
 **Improve Error Handling and Notification in Compaction Task**

 - **File:** `task.rs`
   - Changed log level from `warn` to `error` for manifest update failures to enhance error visibility.
   - Refactored the notification mechanism for expired file removal by using `BackgroundNotify::RegionEdit` with `RegionEditResult` to streamline the process.
   - Simplified error handling by consolidating match cases into a single `if let Err` block for better readability and maintainability.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-11-06 06:27:17 +00:00
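
The core of the first change above is replacing an early `return` with `continue`, so a time window with zero sorted runs no longer stops the picker from examining the remaining windows. Schematically, with placeholder types rather than mito2's real ones:

```rust
struct Window {
    /// Sorted runs of SST file ids in this time window (placeholder type).
    runs: Vec<Vec<u64>>,
}

struct CompactionOutput {
    inputs: Vec<u64>,
}

fn build_outputs(windows: &[Window]) -> Vec<CompactionOutput> {
    let mut outputs = Vec::new();
    for window in windows {
        if window.runs.is_empty() {
            // Previously an early `return` here dropped all later windows;
            // `continue` just skips the empty window and keeps going.
            continue;
        }
        outputs.push(CompactionOutput {
            inputs: window.runs.iter().flatten().copied().collect(),
        });
    }
    outputs
}
```
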
Yingwen
82812ff19e test: add a unit test to scan data from memtable in append mode (#7193)
* test: add tests for scanning append mode before flush

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: extract a function maybe_dedup_one

Signed-off-by: evenyag <realevenyag@gmail.com>

* ci: add flat format to docs.yml so we can make it required later

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-06 06:11:58 +00:00
Ning Sun
4a77167138 chore: update readme (#7187) 2025-11-06 03:21:01 +00:00
Lei, HUANG
934df46f53 fix(mito): append mode in flat format not working (#7186)
* mito2: add unit test for flat single-range append_mode dedup behavior

Verify memtable_flat_sources skips dedup when append_mode is true and
performs dedup otherwise for single-range flat memtables, preventing
regressions in the new append_mode path.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix/flat-source-merge:
 ### Improve Column Metadata Extraction Logic

 - **File**: `src/common/meta/src/ddl/utils.rs`
  - Modified the `extract_column_metadatas` function to use `swap_remove` for extracting the first schema and decode column metadata for comparison instead of raw bytes. This ensures that the extension map is considered during verification, enhancing the robustness of metadata consistency checks across datanodes.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-11-06 03:19:39 +00:00
Ning Sun
fb92e4d0b2 feat: add greptime's arrow json extension type (#7168)
* feat: add arrow json extension type

* feat: add json structure settings to extension type

* refactor: store json structure settings as extension metadata

* chore: make binary an acceptable type for extension
2025-11-05 18:34:57 +00:00
Yingwen
0939dc1d32 test: run sqlness for flat format (#7178)
* test: support flat format in sqlness

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: replace region stats test result with NUM

Signed-off-by: evenyag <realevenyag@gmail.com>

* ci: add flat format to sqlness ci

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-05 11:23:12 +00:00
shuiyisong
50c9600ef8 fix: stabilize test results (#7182)
* fix: stabilize test results

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-11-05 09:19:23 +00:00
Lei, HUANG
abcfbd7f41 chore(metrics): add region server requests failures count metrics (#7173)
* chore/add-region-insert-failure-metric: Add metric for failed insert requests to region server in datanode module

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* chore/add-region-insert-failure-metric:
 Add metric for tracking failed region server requests

 - Introduce a new metric `REGION_SERVER_REQUEST_FAILURE_COUNT` to count failed region server requests.
 - Update `REGION_SERVER_INSERT_FAIL_COUNT` metric description for consistency.
 - Implement error handling in `RegionServerHandler` to increment the new failure metric on request errors.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-11-05 07:23:40 +00:00
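
A hedged sketch of what registering such a failure counter typically looks like with the `prometheus` and `lazy_static` crates; the metric name and label below are illustrative, not copied from the datanode code:

```rust
use lazy_static::lazy_static;
use prometheus::{register_int_counter_vec, IntCounterVec};

lazy_static! {
    /// Counts failed region server requests, labeled by request type.
    static ref REGION_SERVER_REQUEST_FAILURE_COUNT: IntCounterVec =
        register_int_counter_vec!(
            "greptime_region_server_request_failure_count",
            "Number of failed region server requests",
            &["request_type"]
        )
        .unwrap();
}

fn record_failure(request_type: &str) {
    REGION_SERVER_REQUEST_FAILURE_COUNT
        .with_label_values(&[request_type])
        .inc();
}
```
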
Ruihang Xia
aac3ede261 feat: allow creating logical table with the same partition rule as the physical table's (#7177)
* feat: allow creating logical table with the same partition rule as the physical table's

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix errors

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-11-05 06:37:17 +00:00
Yingwen
3001c2d719 feat: BulkMemtable stores small fragments in another buffer (#7164)
* feat: buffer small parts in bulk memtable

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: use assert_eq instead of assert

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: collect bulk memtable scan metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: report metrics early

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-05 06:35:32 +00:00
shuiyisong
6caff50d01 chore: improve search traces and jaeger resp (#7166)
* chore: add jaeger field in trace query

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update search v1 with tags

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: update col matching using col names

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: minify code with macro

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: change macro to inline function

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix filter with tags & add test

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-11-04 05:49:08 +00:00
ZonaHe
421f4eec05 feat: update dashboard to v0.11.7 (#7170)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
Co-authored-by: Ning Sun <sunng@protonmail.com>
2025-11-04 02:52:26 +00:00
Yingwen
d944e5c6b8 test: add sqlness for delete and filter (#7171)
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-11-04 02:13:47 +00:00
fys
013d61acbb chore(deps): remove sqlx pg feature in greptimedb build (#7172)
* chore(deps): remove sqlx pg feature in greptimedb build

* fix: ci
2025-11-03 18:49:00 +00:00
LFC
b7e834ab92 refactor: convert to influxdb values directly from arrow (#7163)
* refactor: convert to influxdb values directly from arrow

Signed-off-by: luofucong <luofc@foxmail.com>

* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
2025-11-03 07:52:37 +00:00
LFC
5eab9a1be3 feat: json vector builder (#7151)
* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

Update src/datatypes/src/vectors/json/builder.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

feat: json vector builder

Signed-off-by: luofucong <luofc@foxmail.com>

* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
2025-11-03 06:06:54 +00:00
Weny Xu
9de680f456 refactor: add support for batch region upgrade operations part2 (#7160)
* add tests for metric engines

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: catchup in background

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: replace sequential catchup with batch processing

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* remove single catchup

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: remove unused error

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: refine catchup tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-11-03 06:01:38 +00:00
Ning Sun
5deaaa59ec chore: fix typo (#7169) 2025-11-03 02:22:34 +00:00
dennis zhuang
61724386ef fix: potential failure in tests (#7167)
* fix: potential failure in the test_index_build_type_compact test

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: relax timestamp checking in test_timestamp_default_now

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-10-31 22:08:59 +00:00
Weny Xu
6960a0183a refactor: add support for batch region upgrade operations part1 (#7155)
* refactor: convert UpgradeRegion instruction to batch operation

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: introduce `handle_batch_catchup_requests` fn for mito engine

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: introduce `handle_batch_catchup_requests` fn for metric engine

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: suggestion and add ser/de tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add comments

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-10-31 03:08:38 +00:00
Sicong Hu
30894d7599 feat(mito): Optimize async index building with priority-based batching (#7034)
* feat: add priority-based batching to IndexBuildScheduler

Signed-off-by: SNC123 <sinhco@outlook.com>

* fix: clean old puffin-related cache

Signed-off-by: SNC123 <sinhco@outlook.com>

* test: add test for IndexBuildScheduler

Signed-off-by: SNC123 <sinhco@outlook.com>

* feat: different index file id for read and async write

Signed-off-by: SNC123 <sinhco@outlook.com>

* feat: different index file id for delete

Signed-off-by: SNC123 <sinhco@outlook.com>

* chore: clippy

Signed-off-by: SNC123 <sinhco@outlook.com>

* fix: apply suggestions

Signed-off-by: SNC123 <sinhco@outlook.com>

* fix: apply comments

Signed-off-by: SNC123 <sinhco@outlook.com>

* combine files and index files

Signed-off-by: SNC123 <sinhco@outlook.com>

* feat: add index_file_id into ManifestSstEntry

Signed-off-by: SNC123 <sinhco@outlook.com>

* Update src/mito2/src/gc.rs

Signed-off-by: SNC123 <sinhco@outlook.com>

* resolve conflicts

Signed-off-by: SNC123 <sinhco@outlook.com>

* fix: sqlness

Signed-off-by: SNC123 <sinhco@outlook.com>

* chore: fmt

Signed-off-by: SNC123 <sinhco@outlook.com>

---------

Signed-off-by: SNC123 <sinhco@outlook.com>
2025-10-31 02:13:17 +00:00
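
A rough illustration of priority-based batching for background index build tasks: a max-heap keyed by priority, drained in fixed-size batches. All names, fields, and the priority rule are invented for this sketch:

```rust
use std::collections::BinaryHeap;

#[derive(PartialEq, Eq, PartialOrd, Ord)]
enum IndexBuildPriority {
    Low,
    High,
}

#[derive(PartialEq, Eq, PartialOrd, Ord)]
struct IndexBuildTask {
    // Compared first by the derived ordering, so the max-heap pops
    // high-priority tasks before low-priority ones.
    priority: IndexBuildPriority,
    file_id: u64,
}

struct IndexBuildScheduler {
    queue: BinaryHeap<IndexBuildTask>,
    batch_size: usize,
}

impl IndexBuildScheduler {
    /// Drain up to `batch_size` of the highest-priority pending tasks.
    fn next_batch(&mut self) -> Vec<IndexBuildTask> {
        let mut batch = Vec::with_capacity(self.batch_size);
        while batch.len() < self.batch_size {
            match self.queue.pop() {
                Some(task) => batch.push(task),
                None => break,
            }
        }
        batch
    }
}
```
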
Yingwen
acf38a7091 fix: avoid filtering rows with delete op by fields under merge mode (#7154)
* chore: clear allow dead_code for flat format

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: pass exprs to build appliers

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: split field filters and index appliers

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support skip filtering fields in RowGroupPruningStats

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add PreFilterMode to config whether to skip filtering fields

Adds the PreFilterMode to the RangeBase and sets it in
ParquetReaderBuilder

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support skipping fields in prune reader

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support pre filter mode in bulk memtable

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: pass PreFilterMode to memtable

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: test mito filter delete

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix compiler errors

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remove commented code

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: move predicate and sequence to RangesOptions

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

* ci: skip cargo gc

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix cargo build warning

Signed-off-by: evenyag <realevenyag@gmail.com>

* Revert "ci: skip cargo gc"

This reverts commit 1ec9594a6d.

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-10-30 12:14:45 +00:00
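
The series above introduces a `PreFilterMode` that controls whether field-column predicates are applied before merge/dedup, where they could otherwise drop a delete op that should mask older rows. A simplified sketch of that decision, with invented types:

```rust
/// Whether predicates on field columns may be applied before rows
/// from different sources are merged (simplified).
enum PreFilterMode {
    /// Apply every pushed-down filter, including those on field columns.
    All,
    /// Skip field filters; they are re-applied after merge/dedup so a
    /// delete op is not filtered out before it can mask older rows.
    SkipFields,
}

struct ColumnFilter {
    on_field_column: bool,
    // predicate details elided
}

fn filters_to_apply(mode: &PreFilterMode, filters: &[ColumnFilter]) -> Vec<usize> {
    filters
        .iter()
        .enumerate()
        .filter(|(_, f)| match mode {
            PreFilterMode::All => true,
            PreFilterMode::SkipFields => !f.on_field_column,
        })
        .map(|(i, _)| i)
        .collect()
}
```
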
LFC
109b70750a refactor: convert to prometheus values directly from arrow (#7153)
* refactor: convert to prometheus values directly from arrow

Signed-off-by: luofucong <luofc@foxmail.com>

* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
2025-10-30 10:24:12 +00:00
shuiyisong
ee5b7ff3c8 chore: unify initialization of channel manager (#7159)
* chore: unify initialization of channel manager and extract loading tls

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* chore: fix cr issue

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
2025-10-30 04:26:02 +00:00
281 changed files with 12531 additions and 3638 deletions

View File

@@ -39,8 +39,11 @@ update_helm_charts_version() {
--body "This PR updates the GreptimeDB version." \
--base main \
--head $BRANCH_NAME \
--reviewer zyy17 \
--reviewer daviderli614
--reviewer sunng87 \
--reviewer daviderli614 \
--reviewer killme2008 \
--reviewer evenyag \
--reviewer fengjiachun
}
update_helm_charts_version

View File

@@ -35,8 +35,11 @@ update_homebrew_greptime_version() {
--body "This PR updates the GreptimeDB version." \
--base main \
--head $BRANCH_NAME \
--reviewer zyy17 \
--reviewer daviderli614
--reviewer sunng87 \
--reviewer daviderli614 \
--reviewer killme2008 \
--reviewer evenyag \
--reviewer fengjiachun
}
update_homebrew_greptime_version

View File

@@ -613,6 +613,9 @@ jobs:
- name: "MySQL Kvbackend"
opts: "--setup-mysql"
kafka: false
- name: "Flat format"
opts: "--enable-flat-format"
kafka: false
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -808,7 +811,7 @@ jobs:
- name: Setup external services
working-directory: tests-integration/fixtures
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend
env:

View File

@@ -92,5 +92,6 @@ jobs:
mode:
- name: "Basic"
- name: "Remote WAL"
- name: "Flat format"
steps:
- run: 'echo "No action required"'

Cargo.lock (generated)
View File

@@ -212,8 +212,9 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"arrow-schema",
"common-base",
"common-decimal",
"common-error",
@@ -732,7 +733,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -1336,13 +1337,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.10.1"
source = "git+https://github.com/discord9/bytes?rev=1572ab22c3cbad0e9b6681d1f68eca4139322a2a#1572ab22c3cbad0e9b6681d1f68eca4139322a2a"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
dependencies = [
"backtrace",
"crossbeam-channel",
"inferno 0.12.2",
"papaya",
"quanta",
"serde",
]
@@ -1386,7 +1383,7 @@ dependencies = [
[[package]]
name = "cache"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"catalog",
"common-error",
@@ -1421,7 +1418,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arrow",
@@ -1633,6 +1630,7 @@ dependencies = [
"chrono",
"chrono-tz-build",
"phf 0.11.3",
"uncased",
]
[[package]]
@@ -1643,6 +1641,8 @@ checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402"
dependencies = [
"parse-zoneinfo",
"phf_codegen 0.11.3",
"phf_shared 0.11.3",
"uncased",
]
[[package]]
@@ -1763,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
[[package]]
name = "cli"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-stream",
"async-trait",
@@ -1816,7 +1816,7 @@ dependencies = [
[[package]]
name = "client"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arc-swap",
@@ -1848,8 +1848,8 @@ dependencies = [
"serde_json",
"snafu 0.8.6",
"store-api",
"substrait 0.18.0",
"substrait 0.37.3",
"substrait 1.0.0-beta.1",
"tokio",
"tokio-stream",
"tonic 0.13.1",
@@ -1889,7 +1889,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"auth",
@@ -2012,7 +2012,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"anymap2",
"async-trait",
@@ -2036,14 +2036,14 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"const_format",
]
[[package]]
name = "common-config"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-base",
"common-error",
@@ -2067,7 +2067,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"arrow",
"arrow-schema",
@@ -2102,7 +2102,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"bigdecimal 0.4.8",
"common-error",
@@ -2115,7 +2115,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-macro",
"http 1.3.1",
@@ -2126,7 +2126,7 @@ dependencies = [
[[package]]
name = "common-event-recorder"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -2148,7 +2148,7 @@ dependencies = [
[[package]]
name = "common-frontend"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -2170,7 +2170,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"api",
@@ -2229,7 +2229,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"common-runtime",
@@ -2246,7 +2246,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arrow-flight",
@@ -2279,7 +2279,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"common-base",
@@ -2299,7 +2299,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"greptime-proto",
"once_cell",
@@ -2310,7 +2310,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"anyhow",
"common-error",
@@ -2326,7 +2326,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"anymap2",
"api",
@@ -2398,7 +2398,7 @@ dependencies = [
[[package]]
name = "common-options"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-grpc",
"humantime-serde",
@@ -2407,11 +2407,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.18.0"
version = "1.0.0-beta.1"
[[package]]
name = "common-pprof"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-error",
"common-macro",
@@ -2423,7 +2423,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-stream",
@@ -2452,7 +2452,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"common-procedure",
@@ -2462,7 +2462,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -2488,7 +2488,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"arc-swap",
"common-base",
@@ -2512,7 +2512,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"clap 4.5.40",
@@ -2541,7 +2541,7 @@ dependencies = [
[[package]]
name = "common-session"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"serde",
"strum 0.27.1",
@@ -2549,7 +2549,7 @@ dependencies = [
[[package]]
name = "common-sql"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-base",
"common-decimal",
@@ -2567,7 +2567,7 @@ dependencies = [
[[package]]
name = "common-stat"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-base",
"common-runtime",
@@ -2582,7 +2582,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"backtrace",
"common-base",
@@ -2611,7 +2611,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"client",
"common-grpc",
@@ -2624,7 +2624,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"arrow",
"chrono",
@@ -2642,7 +2642,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"build-data",
"cargo-manifest",
@@ -2653,7 +2653,7 @@ dependencies = [
[[package]]
name = "common-wal"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-base",
"common-error",
@@ -2676,7 +2676,7 @@ dependencies = [
[[package]]
name = "common-workload"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"common-telemetry",
"serde",
@@ -3913,7 +3913,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arrow-flight",
@@ -3977,7 +3977,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"arrow",
"arrow-array",
@@ -4649,7 +4649,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "file-engine"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -4781,7 +4781,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
[[package]]
name = "flow"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arrow",
@@ -4850,7 +4850,7 @@ dependencies = [
"sql",
"store-api",
"strum 0.27.1",
"substrait 0.18.0",
"substrait 1.0.0-beta.1",
"table",
"tokio",
"tonic 0.13.1",
@@ -4905,7 +4905,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619"
[[package]]
name = "frontend"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arc-swap",
@@ -6116,7 +6116,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -7045,7 +7045,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "log-query"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"chrono",
"common-error",
@@ -7057,7 +7057,7 @@ dependencies = [
[[package]]
name = "log-store"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-stream",
"async-trait",
@@ -7364,7 +7364,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -7392,7 +7392,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -7490,7 +7490,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"aquamarine",
@@ -7508,6 +7508,7 @@ dependencies = [
"common-telemetry",
"common-test-util",
"common-time",
"common-wal",
"datafusion",
"datatypes",
"futures-util",
@@ -7584,7 +7585,7 @@ dependencies = [
[[package]]
name = "mito-codec"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"bytes",
@@ -7609,7 +7610,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"aquamarine",
@@ -8347,7 +8348,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"anyhow",
"bytes",
@@ -8632,7 +8633,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"api",
@@ -8690,7 +8691,7 @@ dependencies = [
"sql",
"sqlparser",
"store-api",
"substrait 0.18.0",
"substrait 1.0.0-beta.1",
"table",
"tokio",
"tokio-util",
@@ -8866,16 +8867,6 @@ dependencies = [
"unicode-width 0.1.14",
]
[[package]]
name = "papaya"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
dependencies = [
"equivalent",
"seize",
]
[[package]]
name = "parking"
version = "2.2.1"
@@ -8986,7 +8977,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -9285,6 +9276,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher",
"uncased",
]
[[package]]
@@ -9330,7 +9322,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"api",
@@ -9486,7 +9478,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"auth",
"clap 4.5.40",
@@ -9786,7 +9778,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"async-trait",
@@ -10069,7 +10061,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-compression 0.4.19",
"async-trait",
@@ -10109,24 +10101,9 @@ dependencies = [
"variadics",
]
[[package]]
name = "quanta"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
dependencies = [
"crossbeam-utils",
"libc",
"once_cell",
"raw-cpuid",
"wasi 0.11.1+wasi-snapshot-preview1",
"web-sys",
"winapi",
]
[[package]]
name = "query"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"api",
@@ -10150,6 +10127,7 @@ dependencies = [
"common-query",
"common-recordbatch",
"common-runtime",
"common-stat",
"common-telemetry",
"common-time",
"datafusion",
@@ -10192,7 +10170,7 @@ dependencies = [
"sql",
"sqlparser",
"store-api",
"substrait 0.18.0",
"substrait 1.0.0-beta.1",
"table",
"tokio",
"tokio-stream",
@@ -10423,15 +10401,6 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "raw-cpuid"
version = "11.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
dependencies = [
"bitflags 2.9.1",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
@@ -11372,16 +11341,6 @@ dependencies = [
"libc",
]
[[package]]
name = "seize"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b55fb86dfd3a2f5f76ea78310a88f96c4ea21a3031f8d212443d56123fd0521"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "semver"
version = "1.0.26"
@@ -11547,7 +11506,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"api",
@@ -11673,7 +11632,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"ahash 0.8.12",
"api",
@@ -12007,7 +11966,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arrow-buffer",
@@ -12067,7 +12026,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"clap 4.5.40",
@@ -12344,7 +12303,7 @@ dependencies = [
[[package]]
name = "standalone"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"catalog",
@@ -12385,7 +12344,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "store-api"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"aquamarine",
@@ -12550,28 +12509,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "substrait"
version = "0.18.0"
dependencies = [
"async-trait",
"bytes",
"common-error",
"common-function",
"common-macro",
"common-telemetry",
"datafusion",
"datafusion-common",
"datafusion-expr",
"datafusion-substrait",
"datatypes",
"promql",
"prost 0.13.5",
"snafu 0.8.6",
"substrait 0.37.3",
"tokio",
]
[[package]]
name = "substrait"
version = "0.37.3"
@@ -12618,6 +12555,28 @@ dependencies = [
"walkdir",
]
[[package]]
name = "substrait"
version = "1.0.0-beta.1"
dependencies = [
"async-trait",
"bytes",
"common-error",
"common-function",
"common-macro",
"common-telemetry",
"datafusion",
"datafusion-common",
"datafusion-expr",
"datafusion-substrait",
"datatypes",
"promql",
"prost 0.13.5",
"snafu 0.8.6",
"substrait 0.37.3",
"tokio",
]
[[package]]
name = "subtle"
version = "2.6.1"
@@ -12721,7 +12680,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"async-trait",
@@ -12990,7 +12949,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
[[package]]
name = "tests-fuzz"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"arbitrary",
"async-trait",
@@ -13034,7 +12993,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.18.0"
version = "1.0.0-beta.1"
dependencies = [
"api",
"arrow-flight",
@@ -13108,7 +13067,7 @@ dependencies = [
"sqlx",
"standalone",
"store-api",
"substrait 0.18.0",
"substrait 1.0.0-beta.1",
"table",
"tempfile",
"time",
@@ -14018,6 +13977,15 @@ dependencies = [
"serde",
]
[[package]]
name = "uncased"
version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697"
dependencies = [
"version_check",
]
[[package]]
name = "unescaper"
version = "0.1.6"

View File

@@ -74,7 +74,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.18.0"
version = "1.0.0-beta.1"
edition = "2024"
license = "Apache-2.0"
@@ -118,7 +118,7 @@ bitflags = "2.4.1"
bytemuck = "1.12"
bytes = { version = "1.7", features = ["serde"] }
chrono = { version = "0.4", features = ["serde"] }
chrono-tz = "0.10.1"
chrono-tz = { version = "0.10.1", features = ["case-insensitive"] }
clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
const_format = "0.2"
@@ -219,12 +219,7 @@ similar-asserts = "1.6.0"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.8"
sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] }
sqlx = { version = "0.8", features = [
"runtime-tokio-rustls",
"mysql",
"postgres",
"chrono",
] }
sqlx = { version = "0.8", default-features = false, features = ["any", "macros", "json", "runtime-tokio-rustls"] }
strum = { version = "0.27", features = ["derive"] }
sysinfo = "0.33"
tempfile = "3"
@@ -333,7 +328,6 @@ datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git"
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
bytes = { git = "https://github.com/discord9/bytes", rev = "1572ab22c3cbad0e9b6681d1f68eca4139322a2a" }
[profile.release]
debug = 1
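
The `chrono-tz` change in this Cargo.toml hunk enables its `case-insensitive` feature (which pulls in `uncased`, visible in the Cargo.lock hunk above), matching the "allow case-insensitive timezone settings" fix in the commit list. A minimal usage sketch, assuming the feature exposes `Tz::from_str_insensitive` as documented:

```rust
use chrono_tz::Tz;

fn parse_timezone(name: &str) -> Option<Tz> {
    // With the "case-insensitive" feature, chrono-tz can resolve names
    // such as "asia/shanghai" or "utc" regardless of letter case.
    Tz::from_str_insensitive(name).ok()
}

fn main() {
    assert_eq!(parse_timezone("asia/shanghai"), Some(Tz::Asia__Shanghai));
    assert_eq!(parse_timezone("utc"), Some(Tz::UTC));
}
```
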

View File

@@ -12,8 +12,7 @@
<div align="center">
<h3 align="center">
<a href="https://greptime.com/product/cloud">GreptimeCloud</a> |
<a href="https://docs.greptime.com/">User Guide</a> |
<a href="https://docs.greptime.com/user-guide/overview/">User Guide</a> |
<a href="https://greptimedb.rs/">API Docs</a> |
<a href="https://github.com/GreptimeTeam/greptimedb/issues/5446">Roadmap 2025</a>
</h4>
@@ -67,17 +66,24 @@
## Introduction
**GreptimeDB** is an open-source, cloud-native database purpose-built for the unified collection and analysis of observability data (metrics, logs, and traces). Whether you're operating on the edge, in the cloud, or across hybrid environments, GreptimeDB empowers real-time insights at massive scale — all in one system.
**GreptimeDB** is an open-source, cloud-native database that unifies metrics, logs, and traces, enabling real-time observability at any scale — across edge, cloud, and hybrid environments.
## Features
| Feature | Description |
| --------- | ----------- |
| [Unified Observability Data](https://docs.greptime.com/user-guide/concepts/why-greptimedb) | Store metrics, logs, and traces as timestamped, contextual wide events. Query via [SQL](https://docs.greptime.com/user-guide/query-data/sql), [PromQL](https://docs.greptime.com/user-guide/query-data/promql), and [streaming](https://docs.greptime.com/user-guide/flow-computation/overview). |
| [High Performance & Cost Effective](https://docs.greptime.com/user-guide/manage-data/data-index) | Written in Rust, with a distributed query engine, [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index), and optimized columnar storage, delivering sub-second responses at PB scale. |
| [Cloud-Native Architecture](https://docs.greptime.com/user-guide/concepts/architecture) | Designed for [Kubernetes](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management), with compute/storage separation, native object storage (AWS S3, Azure Blob, etc.) and seamless cross-cloud access. |
| [Developer-Friendly](https://docs.greptime.com/user-guide/protocols/overview) | Access via SQL/PromQL interfaces, REST API, MySQL/PostgreSQL protocols, and popular ingestion [protocols](https://docs.greptime.com/user-guide/protocols/overview). |
| [Flexible Deployment](https://docs.greptime.com/user-guide/deployments-administration/overview) | Deploy anywhere: edge (including ARM/[Android](https://docs.greptime.com/user-guide/deployments-administration/run-on-android)) or cloud, with unified APIs and efficient data sync. |
| [All-in-One Observability](https://docs.greptime.com/user-guide/concepts/why-greptimedb) | OpenTelemetry-native platform unifying metrics, logs, and traces. Query via [SQL](https://docs.greptime.com/user-guide/query-data/sql), [PromQL](https://docs.greptime.com/user-guide/query-data/promql), and [Flow](https://docs.greptime.com/user-guide/flow-computation/overview). |
| [High Performance](https://docs.greptime.com/user-guide/manage-data/data-index) | Written in Rust with [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index) (inverted, fulltext, skipping, vector), delivering sub-second responses at PB scale. |
| [Cost Efficiency](https://docs.greptime.com/user-guide/concepts/architecture) | 50x lower operational and storage costs with compute-storage separation and native object storage (S3, Azure Blob, etc.). |
| [Cloud-Native & Scalable](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management) | Purpose-built for [Kubernetes](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management) with unlimited cross-cloud scaling, handling hundreds of thousands of concurrent requests. |
| [Developer-Friendly](https://docs.greptime.com/user-guide/protocols/overview) | SQL/PromQL interfaces, built-in web dashboard, REST API, MySQL/PostgreSQL protocol compatibility, and native [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) support. |
| [Flexible Deployment](https://docs.greptime.com/user-guide/deployments-administration/overview) | Deploy anywhere from ARM-based edge devices (including [Android](https://docs.greptime.com/user-guide/deployments-administration/run-on-android)) to cloud, with unified APIs and efficient data sync. |
**Perfect for:**
- Unified observability stack replacing Prometheus + Loki + Tempo
- Large-scale metrics with high cardinality (millions to billions of time series)
- Large-scale observability platform requiring cost efficiency and scalability
- IoT and edge computing with resource and bandwidth constraints
Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb) and [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).
@@ -86,10 +92,10 @@ Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why
| Feature | GreptimeDB | Traditional TSDB | Log Stores |
|----------------------------------|-----------------------|--------------------|-----------------|
| Data Types | Metrics, Logs, Traces | Metrics only | Logs only |
| Query Language | SQL, PromQL, Streaming| Custom/PromQL | Custom/DSL |
| Query Language | SQL, PromQL | Custom/PromQL | Custom/DSL |
| Deployment | Edge + Cloud | Cloud/On-prem | Mostly central |
| Indexing & Performance | PB-Scale, Sub-second | Varies | Varies |
| Integration | REST, SQL, Common protocols | Varies | Varies |
| Integration | REST API, SQL, Common protocols | Varies | Varies |
**Performance:**
* [GreptimeDB tops JSONBench's billion-record cold run test!](https://greptime.com/blogs/2025-03-18-jsonbench-greptimedb-performance)
@@ -99,22 +105,18 @@ Read [more benchmark reports](https://docs.greptime.com/user-guide/concepts/feat
## Architecture
* Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document.
* [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB:
GreptimeDB can run in two modes:
* **Standalone Mode** - Single binary for development and small deployments
* **Distributed Mode** - Separate components for production scale:
- Frontend: Query processing and protocol handling
- Datanode: Data storage and retrieval
- Metasrv: Metadata management and coordination
Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB:
<img alt="GreptimeDB System Overview" src="docs/architecture.png">
## Try GreptimeDB
### 1. [Live Demo](https://greptime.com/playground)
Experience GreptimeDB directly in your browser.
### 2. [GreptimeCloud](https://console.greptime.cloud/)
Start instantly with a free cluster.
### 3. Docker (Local Quickstart)
```shell
docker pull greptime/greptimedb
```
@@ -130,7 +132,8 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \
--postgres-addr 0.0.0.0:4003
```
Dashboard: [http://localhost:4000/dashboard](http://localhost:4000/dashboard)
[Full Install Guide](https://docs.greptime.com/getting-started/installation/overview)
Read more in the [full Install Guide](https://docs.greptime.com/getting-started/installation/overview).
**Troubleshooting:**
* Cannot connect to the database? Ensure that ports `4000`, `4001`, `4002`, and `4003` are not blocked by a firewall or used by other services.
@@ -159,21 +162,26 @@ cargo run -- standalone start
## Tools & Extensions
- **Kubernetes:** [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator)
- **Helm Charts:** [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts)
- **Dashboard:** [Web UI](https://github.com/GreptimeTeam/dashboard)
- **SDKs/Ingester:** [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust), [JS](https://github.com/GreptimeTeam/greptimedb-ingester-js)
- **Grafana**: [Official Dashboard](https://github.com/GreptimeTeam/greptimedb/blob/main/grafana/README.md)
- **Kubernetes**: [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator)
- **Helm Charts**: [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts)
- **Dashboard**: [Web UI](https://github.com/GreptimeTeam/dashboard)
- **gRPC Ingester**: [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust)
- **Grafana Data Source**: [GreptimeDB Grafana data source plugin](https://github.com/GreptimeTeam/greptimedb-grafana-datasource)
- **Grafana Dashboard**: [Official Dashboard for monitoring](https://github.com/GreptimeTeam/greptimedb/blob/main/grafana/README.md)
## Project Status
> **Status:** Beta.
> **GA (v1.0):** Targeted for mid 2025.
> **Status:** Beta — marching toward v1.0 GA!
> **GA (v1.0):** January 10, 2026
- Being used in production by early adopters
- Deployed in production by open-source projects and commercial users
- Stable, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version))
- Suitable for evaluation and pilot deployments
GreptimeDB v1.0 represents a major milestone toward maturity — marking stable APIs, production readiness, and proven performance.
**Roadmap:** Beta1 (Nov 10) → Beta2 (Nov 24) → RC1 (Dec 8) → GA (Jan 10, 2026). Please read [v1.0 highlights and release plan](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) for details.
For production use, we recommend using the latest stable release.
[![Star History Chart](https://api.star-history.com/svg?repos=GreptimeTeam/GreptimeDB&type=Date)](https://www.star-history.com/#GreptimeTeam/GreptimeDB&Date)
@@ -214,5 +222,5 @@ Special thanks to all contributors! See [AUTHORS.md](https://github.com/Greptime
- Uses [Apache Arrow™](https://arrow.apache.org/) (memory model)
- [Apache Parquet™](https://parquet.apache.org/) (file storage)
- [Apache Arrow DataFusion™](https://arrow.apache.org/datafusion/) (query engine)
- [Apache DataFusion™](https://arrow.apache.org/datafusion/) (query engine)
- [Apache OpenDAL™](https://opendal.apache.org/) (data access abstraction)


@@ -16,7 +16,7 @@
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
| `runtime` | -- | -- | The runtime options. |
@@ -104,6 +104,7 @@
| `flow.num_workers` | Integer | `0` | The number of flow workers in flownode.<br/>Not setting (or setting to 0) this value will use the number of CPU cores divided by 2. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -151,10 +152,13 @@
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
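
The tiered allocation described in the `max_concurrent_queries` and `region_engine.mito.scan_memory_limit` entries above can be illustrated with a small, self-contained sketch. This is not the engine implementation; the 8 GiB limit and the query count are assumptions chosen only to make the arithmetic concrete.

```rust
/// Illustrative sketch of the tiered scan-memory budgets documented above.
/// Not the actual engine code; all numbers are assumptions for the example.
fn scan_memory_tiers(scan_memory_limit: u64, max_concurrent_queries: u64) -> (u64, u64, u64) {
    // Privileged queries may use the full scan_memory_limit; the rest are
    // capped at 70% of it.
    let privileged_budget = scan_memory_limit;
    let standard_budget = scan_memory_limit * 70 / 100;
    // If max_concurrent_queries is set, 70% of the slots are privileged;
    // if it is 0 (unlimited), the first 20 queries are treated as privileged.
    let privileged_slots = if max_concurrent_queries > 0 {
        max_concurrent_queries * 70 / 100
    } else {
        20
    };
    (privileged_slots, privileged_budget, standard_budget)
}

fn main() {
    let limit = 8 * 1024 * 1024 * 1024u64; // assume scan_memory_limit resolves to 8 GiB
    let (slots, full, standard) = scan_memory_tiers(limit, 10); // assume max_concurrent_queries = 10
    // With these assumptions: 7 queries may use the full 8 GiB budget, and the
    // remaining queries are capped at roughly 5.6 GiB each.
    println!("privileged slots: {slots}, full budget: {full} B, standard budget: {standard} B");
}
```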
@@ -188,7 +192,7 @@
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. |
| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -308,6 +312,7 @@
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `query.allow_query_fallback` | Bool | `false` | Whether to allow query fallback when push down optimize fails.<br/>Default to false, meaning when push down optimize failed, return error msg |
| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans (only applies to datanodes). |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.connect_timeout` | String | `10s` | -- |
@@ -446,7 +451,7 @@
| `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.<br/>It will block the datanode start if it can't receive leases in the heartbeat from metasrv. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
@@ -500,6 +505,7 @@
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system<br/>can still successfully replay memtable data without throwing an<br/>out-of-range error.<br/>However, enabling this option might lead to unexpected data loss,<br/>as the system will skip over missing entries instead of treating<br/>them as critical errors. |
| `query` | -- | -- | The query engine options. |
| `query.parallelism` | Integer | `0` | Parallelism of the query engine.<br/>Default to 0, which means the number of CPU cores. |
| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
@@ -549,10 +555,13 @@
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
| `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. |
| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).<br/>When enabled, index files are loaded into the write cache during region initialization,<br/>which can improve query performance at the cost of longer startup times. |
| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).<br/>The remaining capacity is used for data (parquet) files.<br/>Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,<br/>1GiB is reserved for index files and 4GiB for data files. |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
@@ -586,7 +595,7 @@
| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.<br/>Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. |
| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |
@@ -673,5 +682,6 @@
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
| `query` | -- | -- | -- |
| `query.parallelism` | Integer | `1` | Parallelism of the query engine for query sent by flownode.<br/>Default to 1, so it won't use too much cpu or memory |
| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).<br/>Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit (unbounded, default behavior).<br/>When this limit is reached, queries will fail with ResourceExhausted error.<br/>NOTE: This does NOT limit memory used by table scans. |
| `memory` | -- | -- | The memory options. |
| `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.<br/>When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable<br/>is set to "prof:true,prof_active:false". The official image adds this env variable.<br/>Default is true. |
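
As a rough illustration of how the `query.memory_pool_size` values documented above map to a byte budget, here is a minimal sketch assuming 16 GiB of total system memory. The parsing here only handles the forms shown in the table ("50%", "4GB", "0") and is not the configuration code used by GreptimeDB.

```rust
/// Minimal sketch of resolving a documented `memory_pool_size` value to bytes.
/// Handles only the illustrative forms "<N>%", "<N>GB" and "0"; the real
/// configuration parser is more general.
fn resolve_pool_size(setting: &str, total_bytes: u64) -> Option<u64> {
    if setting == "0" {
        // "0" disables the limit (unbounded pool).
        return None;
    }
    if let Some(pct) = setting.strip_suffix('%') {
        let pct: u64 = pct.trim().parse().ok()?;
        return Some(total_bytes * pct / 100);
    }
    // Treat "GB" as a binary unit (GiB), matching the size parsing used elsewhere in this change.
    setting
        .strip_suffix("GB")
        .and_then(|n| n.trim().parse::<u64>().ok())
        .map(|n| n * 1024 * 1024 * 1024)
}

fn main() {
    let total = 16u64 * 1024 * 1024 * 1024; // assumed total system memory: 16 GiB
    assert_eq!(resolve_pool_size("50%", total), Some(8 * 1024 * 1024 * 1024));
    assert_eq!(resolve_pool_size("4GB", total), Some(4 * 1024 * 1024 * 1024));
    assert_eq!(resolve_pool_size("0", total), None); // unbounded
}
```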


@@ -18,6 +18,9 @@ init_regions_in_background = false
init_regions_parallelism = 16
## The maximum concurrent queries allowed to be executed. Zero means unlimited.
## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
## When set, 70% of queries get privileged memory access (full scan_memory_limit).
## The remaining 30% get standard tier access (70% of scan_memory_limit).
max_concurrent_queries = 0
## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -261,6 +264,13 @@ overwrite_entry_start_id = false
## Default to 0, which means the number of CPU cores.
parallelism = 0
## Memory pool size for query execution operators (aggregation, sorting, join).
## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit (unbounded, default behavior).
## When this limit is reached, queries will fail with ResourceExhausted error.
## NOTE: This does NOT limit memory used by table scans.
memory_pool_size = "50%"
## The data storage options.
[storage]
## The working home directory.
@@ -489,6 +499,17 @@ write_cache_size = "5GiB"
## @toml2docs:none-default
write_cache_ttl = "8h"
## Preload index (puffin) files into cache on region open (default: true).
## When enabled, index files are loaded into the write cache during region initialization,
## which can improve query performance at the cost of longer startup times.
preload_index_cache = true
## Percentage of write cache capacity allocated for index (puffin) files (default: 20).
## The remaining capacity is used for data (parquet) files.
## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
## 1GiB is reserved for index files and 4GiB for data files.
index_cache_percent = 20
## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
@@ -501,6 +522,14 @@ max_concurrent_scan_files = 384
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false
## Memory limit for table scans across all queries.
## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit.
## NOTE: Works with max_concurrent_queries for tiered memory allocation.
## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
scan_memory_limit = "50%"
## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m"
@@ -640,8 +669,8 @@ fork_dictionary_bytes = "1GiB"
[[region_engine]]
## Metric engine options.
[region_engine.metric]
## Whether to enable the experimental sparse primary key encoding.
experimental_sparse_primary_key_encoding = false
## Whether to use sparse primary key encoding.
sparse_primary_key_encoding = true
## The logging options.
[logging]


@@ -158,6 +158,13 @@ default_ratio = 1.0
## Default to 1, so it won't use too much cpu or memory
parallelism = 1
## Memory pool size for query execution operators (aggregation, sorting, join).
## Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit (unbounded, default behavior).
## When this limit is reached, queries will fail with ResourceExhausted error.
## NOTE: This does NOT limit memory used by table scans.
memory_pool_size = "50%"
## The memory options.
[memory]
## Whether to enable heap profiling activation during startup.


@@ -256,6 +256,13 @@ parallelism = 0
## Default to false, meaning when push down optimize failed, return error msg
allow_query_fallback = false
## Memory pool size for query execution operators (aggregation, sorting, join).
## Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").
## Setting it to 0 disables the limit (unbounded, default behavior).
## When this limit is reached, queries will fail with ResourceExhausted error.
## NOTE: This does NOT limit memory used by table scans (only applies to datanodes).
memory_pool_size = "50%"
## Datanode options.
[datanode]
## Datanode client options.


@@ -14,6 +14,9 @@ init_regions_in_background = false
init_regions_parallelism = 16
## The maximum concurrent queries allowed to be executed. Zero means unlimited.
## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
## When set, 70% of queries get privileged memory access (full scan_memory_limit).
## The remaining 30% get standard tier access (70% of scan_memory_limit).
max_concurrent_queries = 0
## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -365,6 +368,13 @@ max_running_procedures = 128
## Default to 0, which means the number of CPU cores.
parallelism = 0
## Memory pool size for query execution operators (aggregation, sorting, join).
## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit (unbounded, default behavior).
## When this limit is reached, queries will fail with ResourceExhausted error.
## NOTE: This does NOT limit memory used by table scans.
memory_pool_size = "50%"
## The data storage options.
[storage]
## The working home directory.
@@ -580,6 +590,17 @@ write_cache_size = "5GiB"
## @toml2docs:none-default
write_cache_ttl = "8h"
## Preload index (puffin) files into cache on region open (default: true).
## When enabled, index files are loaded into the write cache during region initialization,
## which can improve query performance at the cost of longer startup times.
preload_index_cache = true
## Percentage of write cache capacity allocated for index (puffin) files (default: 20).
## The remaining capacity is used for data (parquet) files.
## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
## 1GiB is reserved for index files and 4GiB for data files.
index_cache_percent = 20
## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
@@ -592,6 +613,14 @@ max_concurrent_scan_files = 384
## Whether to allow stale WAL entries read during replay.
allow_stale_entries = false
## Memory limit for table scans across all queries.
## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit.
## NOTE: Works with max_concurrent_queries for tiered memory allocation.
## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
scan_memory_limit = "50%"
## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m"
@@ -731,8 +760,8 @@ fork_dictionary_bytes = "1GiB"
[[region_engine]]
## Metric engine options.
[region_engine.metric]
## Whether to enable the experimental sparse primary key encoding.
experimental_sparse_primary_key_encoding = false
## Whether to use sparse primary key encoding.
sparse_primary_key_encoding = true
## The logging options.
[logging]


@@ -92,9 +92,6 @@ curl -X POST localhost:4000/debug/prof/mem > greptime.hprof
curl -X POST "localhost:4000/debug/prof/mem?output=flamegraph" > greptime.svg
# or output pprof format
curl -X POST "localhost:4000/debug/prof/mem?output=proto" > greptime.pprof
curl -X POST "localhost:4000/debug/prof/bytes" > greptime.svg
```
You can periodically dump profiling data and compare them to find the delta memory usage.


@@ -8,6 +8,7 @@ license.workspace = true
workspace = true
[dependencies]
arrow-schema.workspace = true
common-base.workspace = true
common-decimal.workspace = true
common-error.workspace = true


@@ -14,10 +14,11 @@
use std::collections::HashMap;
use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
use datatypes::schema::{
COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType,
};
use greptime_proto::v1::{
Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -68,8 +69,14 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned());
}
if let Some(settings) = options.options.get(JSON_STRUCTURE_SETTINGS_KEY) {
metadata.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
if let Some(extension_name) = options.options.get(EXTENSION_TYPE_NAME_KEY) {
metadata.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone());
}
if let Some(extension_metadata) = options.options.get(EXTENSION_TYPE_METADATA_KEY) {
metadata.insert(
EXTENSION_TYPE_METADATA_KEY.to_string(),
extension_metadata.clone(),
);
}
}
@@ -142,10 +149,16 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
.options
.insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone());
}
if let Some(settings) = column_schema.metadata().get(JSON_STRUCTURE_SETTINGS_KEY) {
if let Some(extension_name) = column_schema.metadata().get(EXTENSION_TYPE_NAME_KEY) {
options
.options
.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone());
}
if let Some(extension_metadata) = column_schema.metadata().get(EXTENSION_TYPE_METADATA_KEY) {
options.options.insert(
EXTENSION_TYPE_METADATA_KEY.to_string(),
extension_metadata.clone(),
);
}
(!options.options.is_empty()).then_some(options)


@@ -97,7 +97,6 @@ lazy_static! {
ROUTINES,
SCHEMA_PRIVILEGES,
TABLE_PRIVILEGES,
TRIGGERS,
GLOBAL_STATUS,
SESSION_STATUS,
PARTITIONS,
@@ -207,7 +206,6 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
ROUTINES => setup_memory_table!(ROUTINES),
SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES),
TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES),
TRIGGERS => setup_memory_table!(TRIGGERS),
GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS),
SESSION_STATUS => setup_memory_table!(SESSION_STATUS),
KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new(


@@ -15,8 +15,7 @@
use std::sync::Arc;
use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::schema::{Schema, SchemaRef};
use datatypes::vectors::{Int64Vector, StringVector, VectorRef};
use crate::system_schema::information_schema::table_names::*;
@@ -366,16 +365,6 @@ pub(super) fn get_schema_columns(table_name: &str) -> (SchemaRef, Vec<VectorRef>
vec![],
),
TRIGGERS => (
vec![
string_column("TRIGGER_NAME"),
ColumnSchema::new("trigger_id", ConcreteDataType::uint64_datatype(), false),
string_column("TRIGGER_DEFINITION"),
ColumnSchema::new("flownode_id", ConcreteDataType::uint64_datatype(), true),
],
vec![],
),
// TODO: Considering store internal metrics in `global_status` and
// `session_status` tables.
GLOBAL_STATUS => (


@@ -23,6 +23,8 @@ use crate::Tool;
use crate::data::export::ExportCommand;
use crate::data::import::ImportCommand;
pub(crate) const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";
/// Command for data operations including exporting data from and importing data into GreptimeDB.
#[derive(Subcommand)]
pub enum DataCommand {


@@ -30,7 +30,7 @@ use snafu::{OptionExt, ResultExt};
use tokio::sync::Semaphore;
use tokio::time::Instant;
use crate::data::default_database;
use crate::data::{COPY_PATH_PLACEHOLDER, default_database};
use crate::database::{DatabaseClient, parse_proxy_opts};
use crate::error::{
EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu,
@@ -668,10 +668,26 @@ impl Export {
);
// Create copy_from.sql file
let copy_database_from_sql = format!(
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#,
export_self.catalog, schema, path, with_options_clone, connection_part
);
let copy_database_from_sql = {
let command_without_connection = format!(
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({});"#,
export_self.catalog, schema, COPY_PATH_PLACEHOLDER, with_options_clone
);
if connection_part.is_empty() {
command_without_connection
} else {
let command_with_connection = format!(
r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#,
export_self.catalog, schema, path, with_options_clone, connection_part
);
format!(
"-- {}\n{}",
command_with_connection, command_without_connection
)
}
};
let copy_from_path = export_self.get_file_path(&schema, "copy_from.sql");
export_self


@@ -21,13 +21,13 @@ use clap::{Parser, ValueEnum};
use common_catalog::consts::DEFAULT_SCHEMA_NAME;
use common_error::ext::BoxedError;
use common_telemetry::{error, info, warn};
use snafu::{OptionExt, ResultExt};
use snafu::{OptionExt, ResultExt, ensure};
use tokio::sync::Semaphore;
use tokio::time::Instant;
use crate::data::default_database;
use crate::data::{COPY_PATH_PLACEHOLDER, default_database};
use crate::database::{DatabaseClient, parse_proxy_opts};
use crate::error::{Error, FileIoSnafu, Result, SchemaNotFoundSnafu};
use crate::error::{Error, FileIoSnafu, InvalidArgumentsSnafu, Result, SchemaNotFoundSnafu};
use crate::{Tool, database};
#[derive(Debug, Default, Clone, ValueEnum)]
@@ -148,12 +148,15 @@ impl Import {
let _permit = semaphore_moved.acquire().await.unwrap();
let database_input_dir = self.catalog_path().join(&schema);
let sql_file = database_input_dir.join(filename);
let sql = tokio::fs::read_to_string(sql_file)
let mut sql = tokio::fs::read_to_string(sql_file)
.await
.context(FileIoSnafu)?;
if sql.is_empty() {
if sql.trim().is_empty() {
info!("Empty `{filename}` {database_input_dir:?}");
} else {
if filename == "copy_from.sql" {
sql = self.rewrite_copy_database_sql(&schema, &sql)?;
}
let db = exec_db.unwrap_or(&schema);
self.database_client.sql(&sql, db).await?;
info!("Imported `{filename}` for database {schema}");
@@ -226,6 +229,57 @@ impl Import {
}
Ok(db_names)
}
fn rewrite_copy_database_sql(&self, schema: &str, sql: &str) -> Result<String> {
let target_location = self.build_copy_database_location(schema);
let escaped_location = target_location.replace('\'', "''");
let mut first_stmt_checked = false;
for line in sql.lines() {
let trimmed = line.trim_start();
if trimmed.is_empty() || trimmed.starts_with("--") {
continue;
}
ensure!(
trimmed.starts_with("COPY DATABASE"),
InvalidArgumentsSnafu {
msg: "Expected COPY DATABASE statement at start of copy_from.sql"
}
);
first_stmt_checked = true;
break;
}
ensure!(
first_stmt_checked,
InvalidArgumentsSnafu {
msg: "COPY DATABASE statement not found in copy_from.sql"
}
);
ensure!(
sql.contains(COPY_PATH_PLACEHOLDER),
InvalidArgumentsSnafu {
msg: format!(
"Placeholder `{}` not found in COPY DATABASE statement",
COPY_PATH_PLACEHOLDER
)
}
);
Ok(sql.replacen(COPY_PATH_PLACEHOLDER, &escaped_location, 1))
}
fn build_copy_database_location(&self, schema: &str) -> String {
let mut path = self.catalog_path();
path.push(schema);
let mut path_str = path.to_string_lossy().into_owned();
if !path_str.ends_with('/') {
path_str.push('/');
}
path_str
}
}
#[async_trait]
@@ -241,3 +295,52 @@ impl Tool for Import {
}
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::*;
fn build_import(input_dir: &str) -> Import {
Import {
catalog: "catalog".to_string(),
schema: None,
database_client: DatabaseClient::new(
"127.0.0.1:4000".to_string(),
"catalog".to_string(),
None,
Duration::from_secs(0),
None,
),
input_dir: input_dir.to_string(),
parallelism: 1,
target: ImportTarget::Data,
}
}
#[test]
fn rewrite_copy_database_sql_replaces_placeholder() {
let import = build_import("/tmp/export-path");
let comment = "-- COPY DATABASE \"catalog\".\"schema\" FROM 's3://bucket/demo/' WITH (format = 'parquet') CONNECTION (region = 'us-west-2')";
let sql = format!(
"{comment}\nCOPY DATABASE \"catalog\".\"schema\" FROM '{}' WITH (format = 'parquet');",
COPY_PATH_PLACEHOLDER
);
let rewritten = import.rewrite_copy_database_sql("schema", &sql).unwrap();
let expected_location = import.build_copy_database_location("schema");
let escaped = expected_location.replace('\'', "''");
assert!(rewritten.starts_with(comment));
assert!(rewritten.contains(&format!("FROM '{escaped}'")));
assert!(!rewritten.contains(COPY_PATH_PLACEHOLDER));
}
#[test]
fn rewrite_copy_database_sql_requires_placeholder() {
let import = build_import("/tmp/export-path");
let sql = "COPY DATABASE \"catalog\".\"schema\" FROM '/tmp/export-path/catalog/schema/' WITH (format = 'parquet');";
assert!(import.rewrite_copy_database_sql("schema", sql).is_err());
}
}


@@ -20,7 +20,9 @@ use api::v1::health_check_client::HealthCheckClient;
use api::v1::prometheus_gateway_client::PrometheusGatewayClient;
use api::v1::region::region_client::RegionClient as PbRegionClient;
use arrow_flight::flight_service_client::FlightServiceClient;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption};
use common_grpc::channel_manager::{
ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config,
};
use parking_lot::RwLock;
use snafu::{OptionExt, ResultExt};
use tonic::codec::CompressionEncoding;
@@ -94,8 +96,9 @@ impl Client {
A: AsRef<[U]>,
{
let channel_config = ChannelConfig::default().client_tls_config(client_tls);
let channel_manager = ChannelManager::with_tls_config(channel_config)
let tls_config = load_tls_config(channel_config.client_tls.as_ref())
.context(error::CreateTlsChannelSnafu)?;
let channel_manager = ChannelManager::with_config(channel_config, tls_config);
Ok(Self::with_manager_and_urls(channel_manager, urls))
}


@@ -74,7 +74,7 @@ impl FlownodeManager for NodeClients {
impl NodeClients {
pub fn new(config: ChannelConfig) -> Self {
Self {
channel_manager: ChannelManager::with_config(config),
channel_manager: ChannelManager::with_config(config, None),
clients: CacheBuilder::new(1024)
.time_to_live(Duration::from_secs(30 * 60))
.time_to_idle(Duration::from_secs(5 * 60))


@@ -162,6 +162,7 @@ impl ObjbenchCommand {
file_size,
available_indexes: Default::default(),
index_file_size: 0,
index_file_id: None,
num_rows,
num_row_groups,
sequence: None,


@@ -177,6 +177,8 @@ pub struct StartCommand {
#[clap(long)]
tls_key_path: Option<String>,
#[clap(long)]
tls_watch: bool,
#[clap(long)]
user_provider: Option<String>,
#[clap(long)]
disable_dashboard: Option<bool>,
@@ -230,6 +232,7 @@ impl StartCommand {
self.tls_mode.clone(),
self.tls_cert_path.clone(),
self.tls_key_path.clone(),
self.tls_watch,
);
if let Some(addr) = &self.http_addr {


@@ -228,6 +228,8 @@ pub struct StartCommand {
#[clap(long)]
tls_key_path: Option<String>,
#[clap(long)]
tls_watch: bool,
#[clap(long)]
user_provider: Option<String>,
#[clap(long, default_value = "GREPTIMEDB_STANDALONE")]
pub env_prefix: String,
@@ -277,6 +279,7 @@ impl StartCommand {
self.tls_mode.clone(),
self.tls_cert_path.clone(),
self.tls_key_path.clone(),
self.tls_watch,
);
if let Some(addr) = &self.http_addr {
@@ -769,6 +772,9 @@ mod tests {
fn test_load_log_options_from_cli() {
let cmd = StartCommand {
user_provider: Some("static_user_provider:cmd:test=test".to_string()),
mysql_addr: Some("127.0.0.1:4002".to_string()),
postgres_addr: Some("127.0.0.1:4003".to_string()),
tls_watch: true,
..Default::default()
};
@@ -785,6 +791,8 @@ mod tests {
assert_eq!("./greptimedb_data/test/logs", opts.logging.dir);
assert_eq!("debug", opts.logging.level.unwrap());
assert!(opts.mysql.tls.watch);
assert!(opts.postgres.tls.watch);
}
#[test]


@@ -15,6 +15,7 @@
use std::time::Duration;
use cmd::options::GreptimeOptions;
use common_base::memory_limit::MemoryLimit;
use common_config::{Configurable, DEFAULT_DATA_HOME};
use common_options::datanode::{ClientOptions, DatanodeClientOptions};
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, DEFAULT_OTLP_HTTP_ENDPOINT, LoggingOptions};
@@ -74,14 +75,19 @@ fn test_load_datanode_example_config() {
RegionEngineConfig::Mito(MitoConfig {
auto_flush_interval: Duration::from_secs(3600),
write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_memory_limit: MemoryLimit::Percentage(50),
..Default::default()
}),
RegionEngineConfig::File(FileEngineConfig {}),
RegionEngineConfig::Metric(MetricEngineConfig {
experimental_sparse_primary_key_encoding: false,
sparse_primary_key_encoding: true,
flush_metadata_region_interval: Duration::from_secs(30),
}),
],
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
logging: LoggingOptions {
level: Some("info".to_string()),
dir: format!("{}/{}", DEFAULT_DATA_HOME, DEFAULT_LOGGING_DIR),
@@ -155,6 +161,10 @@ fn test_load_frontend_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default()
},
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
..Default::default()
},
..Default::default()
@@ -242,6 +252,7 @@ fn test_load_flownode_example_config() {
query: QueryOptions {
parallelism: 1,
allow_query_fallback: false,
memory_pool_size: MemoryLimit::Percentage(50),
},
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
@@ -286,11 +297,12 @@ fn test_load_standalone_example_config() {
RegionEngineConfig::Mito(MitoConfig {
auto_flush_interval: Duration::from_secs(3600),
write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)),
scan_memory_limit: MemoryLimit::Percentage(50),
..Default::default()
}),
RegionEngineConfig::File(FileEngineConfig {}),
RegionEngineConfig::Metric(MetricEngineConfig {
experimental_sparse_primary_key_encoding: false,
sparse_primary_key_encoding: true,
flush_metadata_region_interval: Duration::from_secs(30),
}),
],
@@ -314,7 +326,10 @@ fn test_load_standalone_example_config() {
cors_allowed_origins: vec!["https://example.com".to_string()],
..Default::default()
},
query: QueryOptions {
memory_pool_size: MemoryLimit::Percentage(50),
..Default::default()
},
..Default::default()
},
..Default::default()


@@ -15,6 +15,7 @@
pub mod bit_vec;
pub mod bytes;
pub mod cancellation;
pub mod memory_limit;
pub mod plugins;
pub mod range_read;
#[allow(clippy::all)]


@@ -0,0 +1,265 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use std::str::FromStr;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::readable_size::ReadableSize;
/// Memory limit configuration that supports both absolute size and percentage.
///
/// Examples:
/// - Absolute size: "2GB", "4GiB", "512MB"
/// - Percentage: "50%", "75%"
/// - Unlimited: "unlimited", "0"
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum MemoryLimit {
/// Absolute memory size.
Size(ReadableSize),
/// Percentage of total system memory (0-100).
Percentage(u8),
/// No memory limit.
#[default]
Unlimited,
}
impl MemoryLimit {
/// Resolve the memory limit to bytes based on total system memory.
/// Returns 0 if the limit is unlimited.
pub fn resolve(&self, total_memory_bytes: u64) -> u64 {
match self {
MemoryLimit::Size(size) => size.as_bytes(),
MemoryLimit::Percentage(pct) => total_memory_bytes * (*pct as u64) / 100,
MemoryLimit::Unlimited => 0,
}
}
/// Returns true if this limit is unlimited.
pub fn is_unlimited(&self) -> bool {
match self {
MemoryLimit::Size(size) => size.as_bytes() == 0,
MemoryLimit::Percentage(pct) => *pct == 0,
MemoryLimit::Unlimited => true,
}
}
}
impl FromStr for MemoryLimit {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
if s.eq_ignore_ascii_case("unlimited") {
return Ok(MemoryLimit::Unlimited);
}
if let Some(pct_str) = s.strip_suffix('%') {
let pct = pct_str
.trim()
.parse::<u8>()
.map_err(|e| format!("invalid percentage value '{}': {}", pct_str, e))?;
if pct > 100 {
return Err(format!("percentage must be between 0 and 100, got {}", pct));
}
if pct == 0 {
Ok(MemoryLimit::Unlimited)
} else {
Ok(MemoryLimit::Percentage(pct))
}
} else {
let size = ReadableSize::from_str(s)?;
if size.as_bytes() == 0 {
Ok(MemoryLimit::Unlimited)
} else {
Ok(MemoryLimit::Size(size))
}
}
}
}
impl Display for MemoryLimit {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemoryLimit::Size(size) => write!(f, "{}", size),
MemoryLimit::Percentage(pct) => write!(f, "{}%", pct),
MemoryLimit::Unlimited => write!(f, "unlimited"),
}
}
}
impl Serialize for MemoryLimit {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&self.to_string())
}
}
impl<'de> Deserialize<'de> for MemoryLimit {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
MemoryLimit::from_str(&s).map_err(serde::de::Error::custom)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_absolute_size() {
assert_eq!(
"2GB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024))
);
assert_eq!(
"512MB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Size(ReadableSize(512 * 1024 * 1024))
);
assert_eq!("0".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
}
#[test]
fn test_parse_percentage() {
assert_eq!(
"50%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(50)
);
assert_eq!(
"75%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(75)
);
assert_eq!("0%".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
}
#[test]
fn test_parse_invalid() {
assert!("150%".parse::<MemoryLimit>().is_err());
assert!("-10%".parse::<MemoryLimit>().is_err());
assert!("invalid".parse::<MemoryLimit>().is_err());
}
#[test]
fn test_resolve() {
let total = 8 * 1024 * 1024 * 1024; // 8GB
assert_eq!(
MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024)).resolve(total),
2 * 1024 * 1024 * 1024
);
assert_eq!(
MemoryLimit::Percentage(50).resolve(total),
4 * 1024 * 1024 * 1024
);
assert_eq!(MemoryLimit::Unlimited.resolve(total), 0);
}
#[test]
fn test_is_unlimited() {
assert!(MemoryLimit::Unlimited.is_unlimited());
assert!(!MemoryLimit::Size(ReadableSize(1024)).is_unlimited());
assert!(!MemoryLimit::Percentage(50).is_unlimited());
assert!(!MemoryLimit::Percentage(1).is_unlimited());
// Defensive: these states shouldn't exist via public API, but check anyway
assert!(MemoryLimit::Size(ReadableSize(0)).is_unlimited());
assert!(MemoryLimit::Percentage(0).is_unlimited());
}
#[test]
fn test_parse_100_percent() {
assert_eq!(
"100%".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Percentage(100)
);
}
#[test]
fn test_display_percentage() {
assert_eq!(MemoryLimit::Percentage(20).to_string(), "20%");
assert_eq!(MemoryLimit::Percentage(50).to_string(), "50%");
assert_eq!(MemoryLimit::Percentage(100).to_string(), "100%");
}
#[test]
fn test_parse_unlimited() {
assert_eq!(
"unlimited".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
assert_eq!(
"UNLIMITED".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
assert_eq!(
"Unlimited".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
}
#[test]
fn test_display_unlimited() {
assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited");
}
#[test]
fn test_parse_display_roundtrip() {
let cases = vec![
"50%",
"100%",
"1%",
"2GB",
"512MB",
"unlimited",
"UNLIMITED",
"0", // normalized to unlimited
"0%", // normalized to unlimited
];
for input in cases {
let parsed = input.parse::<MemoryLimit>().unwrap();
let displayed = parsed.to_string();
let reparsed = displayed.parse::<MemoryLimit>().unwrap();
assert_eq!(
parsed, reparsed,
"round-trip failed: '{}' -> '{}' -> '{:?}'",
input, displayed, reparsed
);
}
}
#[test]
fn test_zero_normalization() {
// All forms of zero should normalize to Unlimited
assert_eq!("0".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!("0%".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!("0B".parse::<MemoryLimit>().unwrap(), MemoryLimit::Unlimited);
assert_eq!(
"0KB".parse::<MemoryLimit>().unwrap(),
MemoryLimit::Unlimited
);
// Unlimited always displays as "unlimited"
assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited");
}
}
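
A quick usage sketch for the new `MemoryLimit` type. It assumes the type above is in scope (the module is exported from `common_base` in this change, so `use common_base::memory_limit::MemoryLimit;` would be the expected path):

```rust
use std::str::FromStr;

use common_base::memory_limit::MemoryLimit; // path as exported in this change

fn main() {
    let total = 8u64 * 1024 * 1024 * 1024; // assume 8 GiB of system memory

    // Percentages resolve against total system memory.
    let half = MemoryLimit::from_str("50%").unwrap();
    assert_eq!(half.resolve(total), 4 * 1024 * 1024 * 1024);

    // Absolute sizes resolve to themselves (binary units, per ReadableSize).
    let two_gib = MemoryLimit::from_str("2GB").unwrap();
    assert_eq!(two_gib.resolve(total), 2 * 1024 * 1024 * 1024);

    // "0", "0%" and "unlimited" all normalize to the unlimited variant,
    // which resolves to 0 and displays as "unlimited".
    let unlimited = MemoryLimit::from_str("0").unwrap();
    assert!(unlimited.is_unlimited());
    assert_eq!(unlimited.to_string(), "unlimited");
}
```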


@@ -45,3 +45,19 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap {
header.insert(GREPTIME_DB_HEADER_ERROR_MSG, msg);
header
}
/// Returns the external root cause of the source error (exclude the current error).
pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> {
// There is some divergence about the behavior of the `sources()` API
// in https://github.com/rust-lang/rust/issues/58520
// So this function iterates the sources manually.
let mut root = err.source();
while let Some(r) = root {
if let Some(s) = r.source() {
root = Some(s);
} else {
break;
}
}
root
}
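
A self-contained sketch of how `root_source` walks an error chain. The `Inner`/`Outer` types are hypothetical, and the helper body mirrors the function above so the example compiles on its own:

```rust
use std::error::Error;
use std::fmt;

// Hypothetical nested errors, purely for illustration.
#[derive(Debug)]
struct Inner;
#[derive(Debug)]
struct Outer(Inner);

impl fmt::Display for Inner {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "inner")
    }
}
impl fmt::Display for Outer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "outer")
    }
}
impl Error for Inner {}
impl Error for Outer {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}

// Mirrors the `root_source` helper above: follow `source()` to the deepest error.
fn root_source(err: &dyn Error) -> Option<&dyn Error> {
    let mut root = err.source();
    while let Some(r) = root {
        match r.source() {
            Some(s) => root = Some(s),
            None => break,
        }
    }
    root
}

fn main() {
    let err = Outer(Inner);
    // The root cause excludes `err` itself and is the deepest source.
    assert_eq!(root_source(&err).unwrap().to_string(), "inner");
}
```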


@@ -104,7 +104,7 @@ impl MetaClientSelector {
let cfg = ChannelConfig::new()
.connect_timeout(Duration::from_secs(30))
.timeout(Duration::from_secs(30));
let channel_manager = ChannelManager::with_config(cfg);
let channel_manager = ChannelManager::with_config(cfg, None);
Self {
meta_client,
channel_manager,


@@ -12,10 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::aggrs::vector::avg::VectorAvg;
use crate::aggrs::vector::product::VectorProduct;
use crate::aggrs::vector::sum::VectorSum;
use crate::function_registry::FunctionRegistry;
mod avg;
mod product;
mod sum;
@@ -25,5 +27,6 @@ impl VectorFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_aggr(VectorSum::uadf_impl());
registry.register_aggr(VectorProduct::uadf_impl());
registry.register_aggr(VectorAvg::uadf_impl());
}
}


@@ -0,0 +1,270 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::borrow::Cow;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, AsArray, BinaryArray, LargeStringArray, StringArray};
use arrow::compute::sum;
use arrow::datatypes::UInt64Type;
use arrow_schema::{DataType, Field};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{
Accumulator, AggregateUDF, Signature, SimpleAggregateUDF, TypeSignature, Volatility,
};
use datafusion_functions_aggregate_common::accumulator::AccumulatorArgs;
use nalgebra::{Const, DVector, DVectorView, Dyn, OVector};
use crate::scalars::vector::impl_conv::{
binlit_as_veclit, parse_veclit_from_strlit, veclit_to_binlit,
};
/// The accumulator for the `vec_avg` aggregate function.
#[derive(Debug, Default)]
pub struct VectorAvg {
sum: Option<OVector<f32, Dyn>>,
count: u64,
}
impl VectorAvg {
/// Create a new `AggregateUDF` for the `vec_avg` aggregate function.
pub fn uadf_impl() -> AggregateUDF {
let signature = Signature::one_of(
vec![
TypeSignature::Exact(vec![DataType::Utf8]),
TypeSignature::Exact(vec![DataType::LargeUtf8]),
TypeSignature::Exact(vec![DataType::Binary]),
],
Volatility::Immutable,
);
let udaf = SimpleAggregateUDF::new_with_signature(
"vec_avg",
signature,
DataType::Binary,
Arc::new(Self::accumulator),
vec![
Arc::new(Field::new("sum", DataType::Binary, true)),
Arc::new(Field::new("count", DataType::UInt64, true)),
],
);
AggregateUDF::from(udaf)
}
fn accumulator(args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
if args.schema.fields().len() != 1 {
return Err(datafusion_common::DataFusionError::Internal(format!(
"expect creating `VEC_AVG` with only one input field, actual {}",
args.schema.fields().len()
)));
}
let t = args.schema.field(0).data_type();
if !matches!(t, DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary) {
return Err(datafusion_common::DataFusionError::Internal(format!(
"unexpected input datatype {t} when creating `VEC_AVG`"
)));
}
Ok(Box::new(VectorAvg::default()))
}
fn inner(&mut self, len: usize) -> &mut OVector<f32, Dyn> {
self.sum
.get_or_insert_with(|| OVector::zeros_generic(Dyn(len), Const::<1>))
}
fn update(&mut self, values: &[ArrayRef], is_update: bool) -> Result<()> {
if values.is_empty() {
return Ok(());
};
let vectors = match values[0].data_type() {
DataType::Utf8 => {
let arr: &StringArray = values[0].as_string();
arr.iter()
.filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into)))
.map(|x| x.map(Cow::Owned))
.collect::<Result<Vec<_>>>()?
}
DataType::LargeUtf8 => {
let arr: &LargeStringArray = values[0].as_string();
arr.iter()
.filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into)))
.map(|x: Result<Vec<f32>>| x.map(Cow::Owned))
.collect::<Result<Vec<_>>>()?
}
DataType::Binary => {
let arr: &BinaryArray = values[0].as_binary();
arr.iter()
.filter_map(|x| x.map(|b| binlit_as_veclit(b).map_err(Into::into)))
.collect::<Result<Vec<_>>>()?
}
_ => {
return Err(datafusion_common::DataFusionError::NotImplemented(format!(
"unsupported data type {} for `VEC_AVG`",
values[0].data_type()
)));
}
};
if vectors.is_empty() {
return Ok(());
}
let len = if is_update {
vectors.len() as u64
} else {
sum(values[1].as_primitive::<UInt64Type>()).unwrap_or_default()
};
let dims = vectors[0].len();
let mut sum = DVector::zeros(dims);
for v in vectors {
if v.len() != dims {
return Err(datafusion_common::DataFusionError::Execution(
"vectors length not match: VEC_AVG".to_string(),
));
}
let v_view = DVectorView::from_slice(&v, dims);
sum += &v_view;
}
*self.inner(dims) += sum;
self.count += len;
Ok(())
}
}
impl Accumulator for VectorAvg {
fn state(&mut self) -> Result<Vec<ScalarValue>> {
let vector = match &self.sum {
None => ScalarValue::Binary(None),
Some(sum) => ScalarValue::Binary(Some(veclit_to_binlit(sum.as_slice()))),
};
Ok(vec![vector, ScalarValue::from(self.count)])
}
fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
self.update(values, true)
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
self.update(states, false)
}
fn evaluate(&mut self) -> Result<ScalarValue> {
match &self.sum {
None => Ok(ScalarValue::Binary(None)),
Some(sum) => Ok(ScalarValue::Binary(Some(veclit_to_binlit(
(sum / self.count as f32).as_slice(),
)))),
}
}
fn size(&self) -> usize {
size_of_val(self)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::StringArray;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{ConstantVector, StringVector, Vector};
use super::*;
#[test]
fn test_update_batch() {
// test update with an empty batch, expect no state change
let mut vec_avg = VectorAvg::default();
vec_avg.update_batch(&[]).unwrap();
assert!(vec_avg.sum.is_none());
assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap());
// test update with non-null values
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[2.5, 3.5, 4.5]))),
vec_avg.evaluate().unwrap()
);
// test update one null value
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![Option::<String>::None]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap());
// test update with a batch containing no null values
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))),
vec_avg.evaluate().unwrap()
);
// test update with batches containing null values
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
None,
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))),
vec_avg.evaluate().unwrap()
);
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![Arc::new(StringArray::from(vec![
None,
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
]))];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[5.5, 6.5, 7.5]))),
vec_avg.evaluate().unwrap()
);
// test update with constant vector
let mut vec_avg = VectorAvg::default();
let v: Vec<ArrayRef> = vec![
Arc::new(ConstantVector::new(
Arc::new(StringVector::from_vec(vec!["[1.0,2.0,3.0]".to_string()])),
4,
))
.to_arrow_array(),
];
vec_avg.update_batch(&v).unwrap();
assert_eq!(
ScalarValue::Binary(Some(veclit_to_binlit(&[1.0, 2.0, 3.0]))),
vec_avg.evaluate().unwrap()
);
}
}
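To make the folding logic above easier to follow, here is a minimal standalone sketch of what `vec_avg` computes — an element-wise sum of equal-length vectors divided by the row count — written in plain Rust with `Vec<f32>` instead of the nalgebra and DataFusion types used by the real accumulator; the function name and shape are illustrative only.

// Element-wise average of equal-length f32 vectors; mirrors the dimension
// check and the sum/count folding in `VectorAvg::update` and `evaluate`.
fn vec_avg(rows: &[Vec<f32>]) -> Option<Vec<f32>> {
    let dims = rows.first()?.len();
    let mut sum = vec![0.0f32; dims];
    for row in rows {
        if row.len() != dims {
            // Same condition that `VectorAvg::update` reports as an error.
            return None;
        }
        for (s, v) in sum.iter_mut().zip(row) {
            *s += v;
        }
    }
    let count = rows.len() as f32;
    Some(sum.into_iter().map(|s| s / count).collect())
}

fn main() {
    let rows = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
    assert_eq!(vec_avg(&rows), Some(vec![2.5, 3.5, 4.5]));
}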

View File

@@ -14,6 +14,7 @@
mod convert;
mod distance;
mod elem_avg;
mod elem_product;
mod elem_sum;
pub mod impl_conv;
@@ -64,6 +65,7 @@ impl VectorFunction {
registry.register_scalar(vector_subvector::VectorSubvectorFunction::default());
registry.register_scalar(elem_sum::ElemSumFunction::default());
registry.register_scalar(elem_product::ElemProductFunction::default());
registry.register_scalar(elem_avg::ElemAvgFunction::default());
}
}

View File

@@ -0,0 +1,128 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::Display;
use datafusion::arrow::datatypes::DataType;
use datafusion::logical_expr::ColumnarValue;
use datafusion_common::ScalarValue;
use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS};
use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use nalgebra::DVectorView;
use crate::function::Function;
use crate::scalars::vector::{VectorCalculator, impl_conv};
const NAME: &str = "vec_elem_avg";
#[derive(Debug, Clone)]
pub(crate) struct ElemAvgFunction {
signature: Signature,
}
impl Default for ElemAvgFunction {
fn default() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Uniform(1, STRINGS.to_vec()),
TypeSignature::Uniform(1, BINARYS.to_vec()),
TypeSignature::Uniform(1, vec![DataType::BinaryView]),
],
Volatility::Immutable,
),
}
}
}
impl Function for ElemAvgFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Float32)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let body = |v0: &ScalarValue| -> datafusion_common::Result<ScalarValue> {
let v0 =
impl_conv::as_veclit(v0)?.map(|v0| DVectorView::from_slice(&v0, v0.len()).mean());
Ok(ScalarValue::Float32(v0))
};
let calculator = VectorCalculator {
name: self.name(),
func: body,
};
calculator.invoke_with_single_argument(args)
}
}
impl Display for ElemAvgFunction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::StringViewArray;
use arrow_schema::Field;
use datafusion::arrow::array::{Array, AsArray};
use datafusion::arrow::datatypes::Float32Type;
use datafusion_common::config::ConfigOptions;
use super::*;
#[test]
fn test_elem_avg() {
let func = ElemAvgFunction::default();
let input = Arc::new(StringViewArray::from(vec![
Some("[1.0,2.0,3.0]".to_string()),
Some("[4.0,5.0,6.0]".to_string()),
Some("[7.0,8.0,9.0]".to_string()),
None,
]));
let result = func
.invoke_with_args(ScalarFunctionArgs {
args: vec![ColumnarValue::Array(input.clone())],
arg_fields: vec![],
number_rows: input.len(),
return_field: Arc::new(Field::new("x", DataType::Float32, true)),
config_options: Arc::new(ConfigOptions::new()),
})
.and_then(|v| ColumnarValue::values_to_arrays(&[v]))
.map(|mut a| a.remove(0))
.unwrap();
let result = result.as_primitive::<Float32Type>();
assert_eq!(result.len(), 4);
assert_eq!(result.value(0), 2.0);
assert_eq!(result.value(1), 5.0);
assert_eq!(result.value(2), 8.0);
assert!(result.is_null(3));
}
}
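For comparison with the scalar function above, this is a minimal standalone sketch of what `vec_elem_avg` computes: parse a vector literal and return the mean of its elements. It deliberately avoids the DataFusion `ScalarFunctionArgs` plumbing and the project's `impl_conv` helpers; the ad-hoc string parsing below is illustrative only.

// Mean of the elements of a vector literal such as "[1.0,2.0,3.0]".
fn elem_avg(literal: &str) -> Option<f32> {
    let elems: Vec<f32> = literal
        .trim()
        .trim_start_matches('[')
        .trim_end_matches(']')
        .split(',')
        .map(|s| s.trim().parse::<f32>())
        .collect::<Result<_, _>>()
        .ok()?;
    if elems.is_empty() {
        return None;
    }
    Some(elems.iter().sum::<f32>() / elems.len() as f32)
}

fn main() {
    assert_eq!(elem_avg("[1.0,2.0,3.0]"), Some(2.0));
    assert_eq!(elem_avg("[4.0,5.0,6.0]"), Some(5.0));
}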

View File

@@ -23,9 +23,10 @@ use datafusion::arrow::array::{ArrayRef, StringArray, as_boolean_array};
use datafusion::catalog::TableFunction;
use datafusion::common::ScalarValue;
use datafusion::common::utils::SingleRowListArrayBuilder;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use datafusion_pg_catalog::pg_catalog::{self, PgCatalogStaticTables};
use datatypes::arrow::datatypes::{DataType, Field};
use derive_more::derive::Display;
use version::PGVersionFunction;
use crate::function::{Function, find_function_context};
@@ -38,7 +39,6 @@ const SESSION_USER_FUNCTION_NAME: &str = "session_user";
const CURRENT_DATABASE_FUNCTION_NAME: &str = "current_database";
define_nullary_udf!(CurrentSchemaFunction);
define_nullary_udf!(CurrentSchemasFunction);
define_nullary_udf!(SessionUserFunction);
define_nullary_udf!(CurrentDatabaseFunction);
@@ -118,6 +118,23 @@ impl Function for SessionUserFunction {
}
}
#[derive(Display, Debug)]
#[display("{}", self.name())]
pub(super) struct CurrentSchemasFunction {
signature: Signature,
}
impl CurrentSchemasFunction {
pub fn new() -> Self {
Self {
signature: Signature::new(
TypeSignature::Exact(vec![DataType::Boolean]),
Volatility::Stable,
),
}
}
}
impl Function for CurrentSchemasFunction {
fn name(&self) -> &str {
CURRENT_SCHEMAS_FUNCTION_NAME
@@ -125,9 +142,9 @@ impl Function for CurrentSchemasFunction {
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::List(Arc::new(Field::new(
"x",
DataType::Utf8View,
false,
"item",
DataType::Utf8,
true,
))))
}
@@ -168,7 +185,7 @@ impl PGCatalogFunction {
registry.register_scalar(PGVersionFunction::default());
registry.register_scalar(CurrentSchemaFunction::default());
registry.register_scalar(CurrentSchemasFunction::default());
registry.register_scalar(CurrentSchemasFunction::new());
registry.register_scalar(SessionUserFunction::default());
registry.register_scalar(CurrentDatabaseFunction::default());
registry.register(pg_catalog::format_type::create_format_type_udf());

View File

@@ -22,14 +22,14 @@ use dashmap::DashMap;
use dashmap::mapref::entry::Entry;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
use tokio_util::sync::CancellationToken;
use tonic::transport::{
Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri,
};
use tower::Service;
use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, InvalidTlsConfigSnafu, Result};
use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, Result};
const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60;
pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10;
@@ -91,57 +91,18 @@ impl ChannelManager {
Default::default()
}
pub fn with_config(config: ChannelConfig) -> Self {
let inner = Inner::with_config(config);
/// Unified `with_config` constructor that supports an optional TLS config.
/// Use [`load_tls_config`] to load the TLS config from the file system.
pub fn with_config(config: ChannelConfig, tls_config: Option<ClientTlsConfig>) -> Self {
let mut inner = Inner::with_config(config.clone());
if let Some(tls_config) = tls_config {
inner.client_tls_config = Some(tls_config);
}
Self {
inner: Arc::new(inner),
}
}
/// Read tls cert and key files and create a ChannelManager with TLS config.
pub fn with_tls_config(config: ChannelConfig) -> Result<Self> {
let mut inner = Inner::with_config(config.clone());
// setup tls
let path_config = config.client_tls.context(InvalidTlsConfigSnafu {
msg: "no config input",
})?;
if !path_config.enabled {
// if TLS not enabled, just ignore other tls config
// and not set `client_tls_config` hence not use TLS
return Ok(Self {
inner: Arc::new(inner),
});
}
let mut tls_config = ClientTlsConfig::new();
if let Some(server_ca) = path_config.server_ca_cert_path {
let server_root_ca_cert =
std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?;
let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert);
tls_config = tls_config.ca_certificate(server_root_ca_cert);
}
if let (Some(client_cert_path), Some(client_key_path)) =
(&path_config.client_cert_path, &path_config.client_key_path)
{
let client_cert =
std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?;
let client_key =
std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
tls_config = tls_config.identity(client_identity);
}
inner.client_tls_config = Some(tls_config);
Ok(Self {
inner: Arc::new(inner),
})
}
pub fn config(&self) -> &ChannelConfig {
&self.inner.config
}
@@ -287,6 +248,34 @@ impl ChannelManager {
}
}
pub fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result<Option<ClientTlsConfig>> {
let path_config = match tls_option {
Some(path_config) if path_config.enabled => path_config,
_ => return Ok(None),
};
let mut tls_config = ClientTlsConfig::new();
if let Some(server_ca) = &path_config.server_ca_cert_path {
let server_root_ca_cert =
std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?;
let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert);
tls_config = tls_config.ca_certificate(server_root_ca_cert);
}
if let (Some(client_cert_path), Some(client_key_path)) =
(&path_config.client_cert_path, &path_config.client_key_path)
{
let client_cert =
std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?;
let client_key =
std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
tls_config = tls_config.identity(client_identity);
}
Ok(Some(tls_config))
}
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ClientTlsOption {
/// Whether to enable TLS for client.
@@ -659,7 +648,7 @@ mod tests {
.http2_adaptive_window(true)
.tcp_keepalive(Duration::from_secs(2))
.tcp_nodelay(true);
let mgr = ChannelManager::with_config(config);
let mgr = ChannelManager::with_config(config, None);
let res = mgr.build_endpoint("test_addr");

View File

@@ -12,14 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption};
use common_grpc::channel_manager::{
ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config,
};
#[tokio::test]
async fn test_mtls_config() {
// test no config
let config = ChannelConfig::new();
let re = ChannelManager::with_tls_config(config);
assert!(re.is_err());
let re = load_tls_config(config.client_tls.as_ref());
assert!(re.is_ok());
assert!(re.unwrap().is_none());
// test wrong file
let config = ChannelConfig::new().client_tls_config(ClientTlsOption {
@@ -29,7 +32,7 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/wrong_client.key".to_string()),
});
let re = ChannelManager::with_tls_config(config);
let re = load_tls_config(config.client_tls.as_ref());
assert!(re.is_err());
// test corrupted file content
@@ -40,7 +43,9 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/corrupted".to_string()),
});
let re = ChannelManager::with_tls_config(config).unwrap();
let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap();
let re = ChannelManager::with_config(config, tls_config);
let re = re.get("127.0.0.1:0");
assert!(re.is_err());
@@ -52,7 +57,8 @@ async fn test_mtls_config() {
client_key_path: Some("tests/tls/client.key".to_string()),
});
let re = ChannelManager::with_tls_config(config).unwrap();
let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap();
let re = ChannelManager::with_config(config, tls_config);
let re = re.get("127.0.0.1:0");
let _ = re.unwrap();
}

View File

@@ -77,7 +77,10 @@ serde_json.workspace = true
serde_with.workspace = true
session.workspace = true
snafu.workspace = true
sqlx = { workspace = true, optional = true }
sqlx = { workspace = true, features = [
"mysql",
"chrono",
], optional = true }
store-api.workspace = true
strum.workspace = true
table = { workspace = true, features = ["testing"] }

View File

@@ -442,7 +442,7 @@ pub fn extract_column_metadatas(
results: &mut [RegionResponse],
key: &str,
) -> Result<Option<Vec<ColumnMetadata>>> {
let schemas = results
let mut schemas = results
.iter_mut()
.map(|r| r.extensions.remove(key))
.collect::<Vec<_>>();
@@ -454,20 +454,24 @@ pub fn extract_column_metadatas(
// Verify all the physical schemas are the same
// Safety: previous check ensures this vec is not empty
let first = schemas.first().unwrap();
ensure!(
schemas.iter().all(|x| x == first),
MetadataCorruptionSnafu {
err_msg: "The table column metadata schemas from datanodes are not the same."
}
);
let first_column_metadatas = schemas
.swap_remove(0)
.map(|first_bytes| ColumnMetadata::decode_list(&first_bytes).context(DecodeJsonSnafu))
.transpose()?;
if let Some(first) = first {
let column_metadatas = ColumnMetadata::decode_list(first).context(DecodeJsonSnafu)?;
Ok(Some(column_metadatas))
} else {
Ok(None)
for s in schemas {
// Compare the decoded column metadata instead of the raw bytes because the metadata contains an extension map.
let column_metadata = s
.map(|bytes| ColumnMetadata::decode_list(&bytes).context(DecodeJsonSnafu))
.transpose()?;
ensure!(
column_metadata == first_column_metadatas,
MetadataCorruptionSnafu {
err_msg: "The table column metadata schemas from datanodes are not the same."
}
);
}
Ok(first_column_metadatas)
}
#[cfg(test)]
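The comment in the hunk above notes that decoded column metadata is compared instead of raw bytes. A small hedged sketch of why that distinction matters, assuming a `serde_json` dependency and hypothetical JSON payloads: two encodings can be byte-unequal yet decode to equal values.

// Hypothetical payloads: the same metadata serialized with different key order.
use std::collections::BTreeMap;

fn main() {
    let a = br#"{"name":"ts","id":1}"#;
    let b = br#"{"id":1,"name":"ts"}"#;
    // Raw bytes differ even though the logical content is identical.
    assert_ne!(a.as_slice(), b.as_slice());
    // Decoding first makes the comparison order-insensitive, which is why the
    // hunk above compares decoded `ColumnMetadata` lists rather than bytes.
    let da: BTreeMap<String, serde_json::Value> = serde_json::from_slice(a).unwrap();
    let db: BTreeMap<String, serde_json::Value> = serde_json::from_slice(b).unwrap();
    assert_eq!(da, db);
}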

View File

@@ -250,7 +250,7 @@ pub struct UpgradeRegion {
/// `None` stands for no wait,
/// it's helpful to verify whether the leader region is ready.
#[serde(with = "humantime_serde")]
pub replay_timeout: Option<Duration>,
pub replay_timeout: Duration,
/// The hint for replaying memtable.
#[serde(default)]
pub location_id: Option<u64>,
@@ -507,13 +507,14 @@ pub enum Instruction {
/// Closes regions.
#[serde(deserialize_with = "single_or_multiple_from", alias = "CloseRegion")]
CloseRegions(Vec<RegionIdent>),
/// Upgrades a region.
UpgradeRegion(UpgradeRegion),
/// Upgrades regions.
#[serde(deserialize_with = "single_or_multiple_from", alias = "UpgradeRegion")]
UpgradeRegions(Vec<UpgradeRegion>),
#[serde(
deserialize_with = "single_or_multiple_from",
alias = "DowngradeRegion"
)]
/// Downgrades a region.
/// Downgrades regions.
DowngradeRegions(Vec<DowngradeRegion>),
/// Invalidates batch cache.
InvalidateCaches(Vec<CacheIdent>),
@@ -559,9 +560,9 @@ impl Instruction {
}
/// Converts the instruction into a [UpgradeRegion].
pub fn into_upgrade_regions(self) -> Option<UpgradeRegion> {
pub fn into_upgrade_regions(self) -> Option<Vec<UpgradeRegion>> {
match self {
Self::UpgradeRegion(upgrade_region) => Some(upgrade_region),
Self::UpgradeRegions(upgrade_region) => Some(upgrade_region),
_ => None,
}
}
@@ -584,6 +585,10 @@ impl Instruction {
/// The reply of [UpgradeRegion].
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionReply {
/// The [RegionId].
/// For compatibility, it is defaulted to [RegionId::new(0, 0)].
#[serde(default)]
pub region_id: RegionId,
/// Returns true if `last_entry_id` has been replayed to the latest.
pub ready: bool,
/// Indicates whether the region exists.
@@ -635,6 +640,39 @@ where
})
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionsReply {
pub replies: Vec<UpgradeRegionReply>,
}
impl UpgradeRegionsReply {
pub fn new(replies: Vec<UpgradeRegionReply>) -> Self {
Self { replies }
}
pub fn single(reply: UpgradeRegionReply) -> Self {
Self::new(vec![reply])
}
}
#[derive(Deserialize)]
#[serde(untagged)]
enum UpgradeRegionsCompat {
Single(UpgradeRegionReply),
Multiple(UpgradeRegionsReply),
}
fn upgrade_regions_compat_from<'de, D>(deserializer: D) -> Result<UpgradeRegionsReply, D::Error>
where
D: Deserializer<'de>,
{
let helper = UpgradeRegionsCompat::deserialize(deserializer)?;
Ok(match helper {
UpgradeRegionsCompat::Single(x) => UpgradeRegionsReply::new(vec![x]),
UpgradeRegionsCompat::Multiple(reply) => reply,
})
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum InstructionReply {
@@ -642,7 +680,11 @@ pub enum InstructionReply {
OpenRegions(SimpleReply),
#[serde(alias = "close_region")]
CloseRegions(SimpleReply),
UpgradeRegion(UpgradeRegionReply),
#[serde(
deserialize_with = "upgrade_regions_compat_from",
alias = "upgrade_region"
)]
UpgradeRegions(UpgradeRegionsReply),
#[serde(
alias = "downgrade_region",
deserialize_with = "downgrade_regions_compat_from"
@@ -658,9 +700,11 @@ impl Display for InstructionReply {
match self {
Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply),
Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply),
Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply),
Self::UpgradeRegions(reply) => {
write!(f, "InstructionReply::UpgradeRegions({:?})", reply.replies)
}
Self::DowngradeRegions(reply) => {
write!(f, "InstructionReply::DowngradeRegions({:?})", reply)
write!(f, "InstructionReply::DowngradeRegions({:?})", reply.replies)
}
Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply),
Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply),
@@ -685,9 +729,9 @@ impl InstructionReply {
}
}
pub fn expect_upgrade_region_reply(self) -> UpgradeRegionReply {
pub fn expect_upgrade_regions_reply(self) -> Vec<UpgradeRegionReply> {
match self {
Self::UpgradeRegion(reply) => reply,
Self::UpgradeRegions(reply) => reply.replies,
_ => panic!("Expected UpgradeRegion reply"),
}
}
@@ -749,25 +793,58 @@ mod tests {
serialized
);
let downgrade_region = InstructionReply::DowngradeRegions(DowngradeRegionsReply::single(
DowngradeRegionReply {
let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
replay_timeout: Duration::from_millis(1000),
location_id: None,
replay_entry_id: None,
metadata_replay_entry_id: None,
}]);
let serialized = serde_json::to_string(&upgrade_region).unwrap();
assert_eq!(
r#"{"UpgradeRegions":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null}]}"#,
serialized
);
}
#[test]
fn test_serialize_instruction_reply() {
let downgrade_region_reply = InstructionReply::DowngradeRegions(
DowngradeRegionsReply::single(DowngradeRegionReply {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: None,
},
));
}),
);
let serialized = serde_json::to_string(&downgrade_region).unwrap();
let serialized = serde_json::to_string(&downgrade_region_reply).unwrap();
assert_eq!(
r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#,
serialized
)
);
let upgrade_region_reply =
InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(1024, 1),
ready: true,
exists: true,
error: None,
}));
let serialized = serde_json::to_string(&upgrade_region_reply).unwrap();
assert_eq!(
r#"{"type":"upgrade_regions","replies":[{"region_id":4398046511105,"ready":true,"exists":true,"error":null}]}"#,
serialized
);
}
#[test]
fn test_deserialize_instruction() {
// legacy open region instruction
let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#;
let open_region_instruction: Instruction =
serde_json::from_str(open_region_instruction).unwrap();
@@ -785,6 +862,7 @@ mod tests {
)]);
assert_eq!(open_region_instruction, open_region);
// legacy close region instruction
let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#;
let close_region_instruction: Instruction =
serde_json::from_str(close_region_instruction).unwrap();
@@ -796,6 +874,7 @@ mod tests {
}]);
assert_eq!(close_region_instruction, close_region);
// legacy downgrade region instruction
let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#;
let downgrade_region_instruction: Instruction =
serde_json::from_str(downgrade_region_instruction).unwrap();
@@ -805,6 +884,25 @@ mod tests {
}]);
assert_eq!(downgrade_region_instruction, downgrade_region);
// legacy upgrade region instruction
let upgrade_region_instruction = r#"{"UpgradeRegion":{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null,"replay_entry_id":null,"metadata_replay_entry_id":null}}"#;
let upgrade_region_instruction: Instruction =
serde_json::from_str(upgrade_region_instruction).unwrap();
let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
replay_timeout: Duration::from_millis(1000),
location_id: None,
replay_entry_id: None,
metadata_replay_entry_id: None,
}]);
assert_eq!(upgrade_region_instruction, upgrade_region);
}
#[test]
fn test_deserialize_instruction_reply() {
// legacy close region reply
let close_region_instruction_reply =
r#"{"result":true,"error":null,"type":"close_region"}"#;
let close_region_instruction_reply: InstructionReply =
@@ -815,6 +913,7 @@ mod tests {
});
assert_eq!(close_region_instruction_reply, close_region_reply);
// legacy open region reply
let open_region_instruction_reply = r#"{"result":true,"error":null,"type":"open_region"}"#;
let open_region_instruction_reply: InstructionReply =
serde_json::from_str(open_region_instruction_reply).unwrap();
@@ -824,6 +923,7 @@ mod tests {
});
assert_eq!(open_region_instruction_reply, open_region_reply);
// legacy downgrade region reply
let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#;
let downgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(downgrade_region_instruction_reply).unwrap();
@@ -837,6 +937,19 @@ mod tests {
}),
);
assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply);
// legacy upgrade region reply
let upgrade_region_instruction_reply = r#"{"region_id":4398046511105,"ready":true,"exists":true,"error":null,"type":"upgrade_region"}"#;
let upgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(upgrade_region_instruction_reply).unwrap();
let upgrade_region_reply =
InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(1024, 1),
ready: true,
exists: true,
error: None,
}));
assert_eq!(upgrade_region_instruction_reply, upgrade_region_reply);
}
#[derive(Debug, Clone, Serialize, Deserialize)]
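The `UpgradeRegionsCompat` helper above follows the same single-or-multiple pattern used elsewhere in this file. A self-contained sketch of that pattern, assuming `serde`, `serde_json`, and illustrative type names, shows how an untagged enum normalizes both the legacy single form and the new batched form into a `Vec`.

use serde::{Deserialize, Deserializer};

#[derive(Debug, Deserialize, PartialEq)]
struct Item {
    id: u64,
}

// Untagged enum: serde tries `Single` first, then `Multiple`.
#[derive(Deserialize)]
#[serde(untagged)]
enum SingleOrMultiple {
    Single(Item),
    Multiple(Vec<Item>),
}

// Normalizes either form into a `Vec`, like `single_or_multiple_from` above.
fn single_or_multiple<'de, D>(deserializer: D) -> Result<Vec<Item>, D::Error>
where
    D: Deserializer<'de>,
{
    Ok(match SingleOrMultiple::deserialize(deserializer)? {
        SingleOrMultiple::Single(item) => vec![item],
        SingleOrMultiple::Multiple(items) => items,
    })
}

#[derive(Debug, Deserialize, PartialEq)]
struct Wrapper {
    #[serde(deserialize_with = "single_or_multiple")]
    items: Vec<Item>,
}

fn main() {
    // Legacy single form and new batched form both deserialize into a Vec.
    let legacy: Wrapper = serde_json::from_str(r#"{"items":{"id":1}}"#).unwrap();
    let batched: Wrapper = serde_json::from_str(r#"{"items":[{"id":1},{"id":2}]}"#).unwrap();
    assert_eq!(legacy.items, vec![Item { id: 1 }]);
    assert_eq!(batched.items.len(), 2);
}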

View File

@@ -164,6 +164,25 @@ impl DatanodeTableManager {
.transpose()
}
pub async fn batch_get(
&self,
keys: &[DatanodeTableKey],
) -> Result<HashMap<DatanodeTableKey, DatanodeTableValue>> {
let req = BatchGetRequest::default().with_keys(keys.iter().map(|k| k.to_bytes()).collect());
let resp = self.kv_backend.batch_get(req).await?;
let values = resp
.kvs
.into_iter()
.map(|kv| {
Ok((
DatanodeTableKey::from_bytes(&kv.key)?,
DatanodeTableValue::try_from_raw_value(&kv.value)?,
))
})
.collect::<Result<HashMap<_, _>>>()?;
Ok(values)
}
pub fn tables(
&self,
datanode_id: DatanodeId,

View File

@@ -661,13 +661,32 @@ impl TableRouteStorage {
/// Returns a batch of [`TableRouteValue`] that respects the order of `table_ids`.
pub async fn batch_get(&self, table_ids: &[TableId]) -> Result<Vec<Option<TableRouteValue>>> {
let mut table_routes = self.batch_get_inner(table_ids).await?;
self.remap_routes_addresses(&mut table_routes).await?;
let raw_table_routes = self.batch_get_inner(table_ids).await?;
Ok(table_routes)
Ok(raw_table_routes
.into_iter()
.map(|v| v.map(|x| x.inner))
.collect())
}
async fn batch_get_inner(&self, table_ids: &[TableId]) -> Result<Vec<Option<TableRouteValue>>> {
/// Returns batch of [`TableRouteValue`] wrapped with [`DeserializedValueWithBytes`].
///
/// The return value is a vector of [`Option<DeserializedValueWithBytes<TableRouteValue>>`].
/// Note: This method remaps the addresses of the table routes, but does not update their raw byte representations.
pub async fn batch_get_with_raw_bytes(
&self,
table_ids: &[TableId],
) -> Result<Vec<Option<DeserializedValueWithBytes<TableRouteValue>>>> {
let mut raw_table_routes = self.batch_get_inner(table_ids).await?;
self.remap_routes_addresses(&mut raw_table_routes).await?;
Ok(raw_table_routes)
}
async fn batch_get_inner(
&self,
table_ids: &[TableId],
) -> Result<Vec<Option<DeserializedValueWithBytes<TableRouteValue>>>> {
let keys = table_ids
.iter()
.map(|id| TableRouteKey::new(*id).to_bytes())
@@ -685,7 +704,7 @@ impl TableRouteStorage {
keys.into_iter()
.map(|key| {
if let Some(value) = kvs.get(&key) {
Ok(Some(TableRouteValue::try_from_raw_value(value)?))
Ok(Some(DeserializedValueWithBytes::from_inner_slice(value)?))
} else {
Ok(None)
}
@@ -695,14 +714,14 @@ impl TableRouteStorage {
async fn remap_routes_addresses(
&self,
table_routes: &mut [Option<TableRouteValue>],
table_routes: &mut [Option<DeserializedValueWithBytes<TableRouteValue>>],
) -> Result<()> {
let keys = table_routes
.iter()
.flat_map(|table_route| {
table_route
.as_ref()
.map(extract_address_keys)
.map(|x| extract_address_keys(&x.inner))
.unwrap_or_default()
})
.collect::<HashSet<_>>()

View File

@@ -33,7 +33,7 @@ use crate::rpc::store::{
// The TopicRegionKey is a key for the topic-region mapping in the kvbackend.
// The layout of the key is `__topic_region/{topic_name}/{region_id}`.
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TopicRegionKey<'a> {
pub region_id: RegionId,
pub topic: &'a str,

View File

@@ -26,7 +26,6 @@ use datatypes::arrow::datatypes::{
Int32Type, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType,
};
use datatypes::schema::SchemaRef;
fn prepare_record_batch(rows: usize) -> RecordBatch {
let schema = Schema::new(vec![
@@ -56,14 +55,6 @@ fn prepare_record_batch(rows: usize) -> RecordBatch {
RecordBatch::try_new(Arc::new(schema), columns).unwrap()
}
fn iter_by_greptimedb_values(schema: SchemaRef, record_batch: RecordBatch) {
let record_batch =
common_recordbatch::RecordBatch::try_from_df_record_batch(schema, record_batch).unwrap();
for row in record_batch.rows() {
black_box(row);
}
}
fn iter_by_loop_rows_and_columns(record_batch: RecordBatch) {
for i in 0..record_batch.num_rows() {
for column in record_batch.columns() {
@@ -125,19 +116,6 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("iter_record_batch");
for rows in [1usize, 10, 100, 1_000, 10_000] {
group.bench_with_input(
BenchmarkId::new("by_greptimedb_values", rows),
&rows,
|b, rows| {
let record_batch = prepare_record_batch(*rows);
let schema =
Arc::new(datatypes::schema::Schema::try_from(record_batch.schema()).unwrap());
b.iter(|| {
iter_by_greptimedb_values(schema.clone(), record_batch.clone());
})
},
);
group.bench_with_input(
BenchmarkId::new("by_loop_rows_and_columns", rows),
&rows,

View File

@@ -193,6 +193,13 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Exceeded memory limit: {}", msg))]
ExceedMemoryLimit {
msg: String,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
@@ -229,6 +236,8 @@ impl ErrorExt for Error {
Error::StreamTimeout { .. } => StatusCode::Cancelled,
Error::StreamCancelled { .. } => StatusCode::Cancelled,
Error::ExceedMemoryLimit { .. } => StatusCode::RuntimeResourcesExhausted,
}
}

View File

@@ -21,11 +21,14 @@ pub mod filter;
mod recordbatch;
pub mod util;
use std::fmt;
use std::pin::Pin;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use adapter::RecordBatchMetrics;
use arc_swap::ArcSwapOption;
use common_base::readable_size::ReadableSize;
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::arrow::compute::SortOptions;
pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
@@ -406,6 +409,399 @@ impl<S: Stream<Item = Result<RecordBatch>> + Unpin> Stream for RecordBatchStream
}
}
/// Memory permit for a stream, providing privileged access or rate limiting.
///
/// The permit tracks whether this stream has privileged Top-K status.
/// When dropped, it automatically releases any privileged slot it holds.
pub struct MemoryPermit {
tracker: QueryMemoryTracker,
is_privileged: AtomicBool,
}
impl MemoryPermit {
/// Check if this permit currently has privileged status.
pub fn is_privileged(&self) -> bool {
self.is_privileged.load(Ordering::Acquire)
}
/// Ensure this permit has privileged status by acquiring a slot if available.
/// Returns true if privileged (either already privileged or just acquired privilege).
fn ensure_privileged(&self) -> bool {
if self.is_privileged.load(Ordering::Acquire) {
return true;
}
// Try to claim a privileged slot
self.tracker
.privileged_count
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| {
if count < self.tracker.privileged_slots {
Some(count + 1)
} else {
None
}
})
.map(|_| {
self.is_privileged.store(true, Ordering::Release);
true
})
.unwrap_or(false)
}
/// Track additional memory usage with this permit.
/// Returns error if limit is exceeded.
///
/// # Arguments
/// * `additional` - Additional memory size to track in bytes
/// * `stream_tracked` - Total memory already tracked by this stream
///
/// # Behavior
/// - Privileged streams: Can push global memory usage up to full limit
/// - Standard-tier streams: Can push global memory usage up to limit * standard_tier_memory_fraction (default: 0.7)
/// - Standard-tier streams automatically attempt to acquire privilege if slots become available
/// - The configured limit is absolute hard limit - no stream can exceed it
pub fn track(&self, additional: usize, stream_tracked: usize) -> Result<()> {
// Ensure privileged status if possible
let is_privileged = self.ensure_privileged();
self.tracker
.track_internal(additional, is_privileged, stream_tracked)
}
/// Release tracked memory.
///
/// # Arguments
/// * `amount` - Amount of memory to release in bytes
pub fn release(&self, amount: usize) {
self.tracker.release(amount);
}
}
impl Drop for MemoryPermit {
fn drop(&mut self) {
// Release privileged slot if we had one
if self.is_privileged.load(Ordering::Acquire) {
self.tracker
.privileged_count
.fetch_sub(1, Ordering::Release);
}
}
}
/// Memory tracker for RecordBatch streams. Clone to share the same limit across queries.
///
/// Implements a two-tier memory allocation strategy:
/// - **Privileged tier**: First N streams (default: 20) can use up to the full memory limit
/// - **Standard tier**: Remaining streams are restricted to a fraction of the limit (default: 70%)
/// - Privilege is granted on a first-come-first-served basis
/// - The configured limit is an absolute hard cap - no stream can exceed it
#[derive(Clone)]
pub struct QueryMemoryTracker {
current: Arc<AtomicUsize>,
limit: usize,
standard_tier_memory_fraction: f64,
privileged_count: Arc<AtomicUsize>,
privileged_slots: usize,
on_update: Option<Arc<dyn Fn(usize) + Send + Sync>>,
on_reject: Option<Arc<dyn Fn() + Send + Sync>>,
}
impl fmt::Debug for QueryMemoryTracker {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("QueryMemoryTracker")
.field("current", &self.current.load(Ordering::Acquire))
.field("limit", &self.limit)
.field(
"standard_tier_memory_fraction",
&self.standard_tier_memory_fraction,
)
.field(
"privileged_count",
&self.privileged_count.load(Ordering::Acquire),
)
.field("privileged_slots", &self.privileged_slots)
.field("on_update", &self.on_update.is_some())
.field("on_reject", &self.on_reject.is_some())
.finish()
}
}
impl QueryMemoryTracker {
// Default privileged slots when max_concurrent_queries is 0.
const DEFAULT_PRIVILEGED_SLOTS: usize = 20;
// Ratio for the privileged tier: 70% of queries get privileged access, and the standard tier may use 70% of the memory limit.
const DEFAULT_PRIVILEGED_TIER_RATIO: f64 = 0.7;
/// Create a new memory tracker with the given limit and max_concurrent_queries.
/// Calculates privileged slots as 70% of max_concurrent_queries (or 20 if max_concurrent_queries is 0).
///
/// # Arguments
/// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited.
/// * `max_concurrent_queries` - Maximum number of concurrent queries (0 = unlimited).
pub fn new(limit: usize, max_concurrent_queries: usize) -> Self {
let privileged_slots = Self::calculate_privileged_slots(max_concurrent_queries);
Self::with_privileged_slots(limit, privileged_slots)
}
/// Create a new memory tracker with custom privileged slots limit.
pub fn with_privileged_slots(limit: usize, privileged_slots: usize) -> Self {
Self::with_config(limit, privileged_slots, Self::DEFAULT_PRIVILEGED_TIER_RATIO)
}
/// Create a new memory tracker with full configuration.
///
/// # Arguments
/// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited.
/// * `privileged_slots` - Maximum number of streams that can get privileged status.
/// * `standard_tier_memory_fraction` - Memory fraction for standard-tier streams (range: [0.0, 1.0]).
///
/// # Panics
/// Panics if `standard_tier_memory_fraction` is not in the range [0.0, 1.0].
pub fn with_config(
limit: usize,
privileged_slots: usize,
standard_tier_memory_fraction: f64,
) -> Self {
assert!(
(0.0..=1.0).contains(&standard_tier_memory_fraction),
"standard_tier_memory_fraction must be in [0.0, 1.0], got {}",
standard_tier_memory_fraction
);
Self {
current: Arc::new(AtomicUsize::new(0)),
limit,
standard_tier_memory_fraction,
privileged_count: Arc::new(AtomicUsize::new(0)),
privileged_slots,
on_update: None,
on_reject: None,
}
}
/// Register a new permit for memory tracking.
/// The first `privileged_slots` permits get privileged status automatically.
/// The returned permit can be shared across multiple streams of the same query.
pub fn register_permit(&self) -> MemoryPermit {
// Try to claim a privileged slot
let is_privileged = self
.privileged_count
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| {
if count < self.privileged_slots {
Some(count + 1)
} else {
None
}
})
.is_ok();
MemoryPermit {
tracker: self.clone(),
is_privileged: AtomicBool::new(is_privileged),
}
}
/// Set a callback to be called whenever the usage changes successfully.
/// The callback receives the new total usage in bytes.
///
/// # Note
/// The callback is called after both successful `track()` and `release()` operations.
/// It is called even when `limit == 0` (unlimited mode) to track actual usage.
pub fn with_on_update<F>(mut self, on_update: F) -> Self
where
F: Fn(usize) + Send + Sync + 'static,
{
self.on_update = Some(Arc::new(on_update));
self
}
/// Set a callback to be called when memory allocation is rejected.
///
/// # Note
/// This is only called when `track()` fails due to exceeding the limit.
/// It is never called when `limit == 0` (unlimited mode).
pub fn with_on_reject<F>(mut self, on_reject: F) -> Self
where
F: Fn() + Send + Sync + 'static,
{
self.on_reject = Some(Arc::new(on_reject));
self
}
/// Get the current memory usage in bytes.
pub fn current(&self) -> usize {
self.current.load(Ordering::Acquire)
}
fn calculate_privileged_slots(max_concurrent_queries: usize) -> usize {
if max_concurrent_queries == 0 {
Self::DEFAULT_PRIVILEGED_SLOTS
} else {
((max_concurrent_queries as f64 * Self::DEFAULT_PRIVILEGED_TIER_RATIO) as usize).max(1)
}
}
/// Internal method to track additional memory usage.
///
/// Called by `MemoryPermit::track()`. Use `MemoryPermit::track()` instead of calling this directly.
fn track_internal(
&self,
additional: usize,
is_privileged: bool,
stream_tracked: usize,
) -> Result<()> {
// Calculate effective global limit based on stream privilege
// Privileged streams: can push global usage up to full limit
// Standard-tier streams: can only push global usage up to fraction of limit
let effective_limit = if is_privileged {
self.limit
} else {
(self.limit as f64 * self.standard_tier_memory_fraction) as usize
};
let mut new_total = 0;
let result = self
.current
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
new_total = current.saturating_add(additional);
if self.limit == 0 {
// Unlimited mode
return Some(new_total);
}
// Check if new global total exceeds effective limit
// The configured limit is absolute hard limit - no stream can exceed it
if new_total <= effective_limit {
Some(new_total)
} else {
None
}
});
match result {
Ok(_) => {
if let Some(callback) = &self.on_update {
callback(new_total);
}
Ok(())
}
Err(current) => {
if let Some(callback) = &self.on_reject {
callback();
}
let msg = format!(
"{} requested, {} used globally ({}%), {} used by this stream (privileged: {}), effective limit: {} ({}%), hard limit: {}",
ReadableSize(additional as u64),
ReadableSize(current as u64),
if self.limit > 0 {
current * 100 / self.limit
} else {
0
},
ReadableSize(stream_tracked as u64),
is_privileged,
ReadableSize(effective_limit as u64),
if self.limit > 0 {
effective_limit * 100 / self.limit
} else {
0
},
ReadableSize(self.limit as u64)
);
error::ExceedMemoryLimitSnafu { msg }.fail()
}
}
}
/// Release tracked memory.
///
/// # Arguments
/// * `amount` - Amount of memory to release in bytes
pub fn release(&self, amount: usize) {
if let Ok(old_value) =
self.current
.fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
Some(current.saturating_sub(amount))
})
&& let Some(callback) = &self.on_update
{
callback(old_value.saturating_sub(amount));
}
}
}
/// A wrapper stream that tracks memory usage of RecordBatches.
pub struct MemoryTrackedStream {
inner: SendableRecordBatchStream,
permit: Arc<MemoryPermit>,
// Total tracked size, released when stream drops.
total_tracked: usize,
}
impl MemoryTrackedStream {
pub fn new(inner: SendableRecordBatchStream, permit: Arc<MemoryPermit>) -> Self {
Self {
inner,
permit,
total_tracked: 0,
}
}
}
impl Stream for MemoryTrackedStream {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match Pin::new(&mut self.inner).poll_next(cx) {
Poll::Ready(Some(Ok(batch))) => {
let additional = batch
.columns()
.iter()
.map(|c| c.memory_size())
.sum::<usize>();
if let Err(e) = self.permit.track(additional, self.total_tracked) {
return Poll::Ready(Some(Err(e)));
}
self.total_tracked += additional;
Poll::Ready(Some(Ok(batch)))
}
Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))),
Poll::Ready(None) => Poll::Ready(None),
Poll::Pending => Poll::Pending,
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl Drop for MemoryTrackedStream {
fn drop(&mut self) {
if self.total_tracked > 0 {
self.permit.release(self.total_tracked);
}
}
}
impl RecordBatchStream for MemoryTrackedStream {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
self.inner.output_ordering()
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
self.inner.metrics()
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
@@ -496,4 +892,157 @@ mod tests {
assert_eq!(collected[0], batch1);
assert_eq!(collected[1], batch2);
}
#[test]
fn test_query_memory_tracker_basic() {
let tracker = Arc::new(QueryMemoryTracker::new(1000, 0));
// Register first stream - should get privileged status
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Privileged stream can use up to limit
assert!(permit1.track(500, 0).is_ok());
assert_eq!(tracker.current(), 500);
// Register second stream - also privileged
let permit2 = tracker.register_permit();
assert!(permit2.is_privileged());
// Can add more but cannot exceed hard limit (1000)
assert!(permit2.track(400, 0).is_ok());
assert_eq!(tracker.current(), 900);
permit1.release(500);
permit2.release(400);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_privileged_limit() {
// Privileged slots = 2 for easy testing
// Limit: 1000, standard-tier fraction: 0.7 (default)
// Privileged can push global to 1000, standard-tier can push global to 700
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 2));
// First 2 streams are privileged
let permit1 = tracker.register_permit();
let permit2 = tracker.register_permit();
assert!(permit1.is_privileged());
assert!(permit2.is_privileged());
// Third stream is standard-tier (not privileged)
let permit3 = tracker.register_permit();
assert!(!permit3.is_privileged());
// Privileged stream uses some memory
assert!(permit1.track(300, 0).is_ok());
assert_eq!(tracker.current(), 300);
// Standard-tier can add up to 400 (total becomes 700, its effective limit)
assert!(permit3.track(400, 0).is_ok());
assert_eq!(tracker.current(), 700);
// Standard-tier stream cannot push global beyond 700
let err = permit3.track(100, 400).unwrap_err();
let err_msg = err.to_string();
assert!(err_msg.contains("400B used by this stream"));
assert!(err_msg.contains("effective limit: 700B (70%)"));
assert!(err_msg.contains("700B used globally (70%)"));
assert_eq!(tracker.current(), 700);
permit1.release(300);
permit3.release(400);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_promotion() {
// Privileged slots = 1 for easy testing
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1));
// First stream is privileged
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Second stream is standard-tier (can only use 500)
let permit2 = tracker.register_permit();
assert!(!permit2.is_privileged());
// Standard-tier can only track 500
assert!(permit2.track(400, 0).is_ok());
assert_eq!(tracker.current(), 400);
// Drop first permit to release privileged slot
drop(permit1);
// Second stream can now be promoted and use more memory
assert!(permit2.track(500, 400).is_ok());
assert!(permit2.is_privileged());
assert_eq!(tracker.current(), 900);
permit2.release(900);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_privileged_hard_limit() {
// Test that the configured limit is absolute hard limit for all streams
// Privileged: can use full limit (1000)
// Standard-tier: can use 0.7x limit (700 with defaults)
let tracker = Arc::new(QueryMemoryTracker::new(1000, 0));
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
// Privileged can use up to full limit (1000)
assert!(permit1.track(900, 0).is_ok());
assert_eq!(tracker.current(), 900);
// Privileged cannot exceed hard limit (1000)
assert!(permit1.track(200, 900).is_err());
assert_eq!(tracker.current(), 900);
// Can add within hard limit
assert!(permit1.track(100, 900).is_ok());
assert_eq!(tracker.current(), 1000);
// Cannot exceed even by 1 byte
assert!(permit1.track(1, 1000).is_err());
assert_eq!(tracker.current(), 1000);
permit1.release(1000);
assert_eq!(tracker.current(), 0);
}
#[test]
fn test_query_memory_tracker_standard_tier_fraction() {
// Test standard-tier streams use fraction of limit
// Limit: 1000, default fraction: 0.7, so standard-tier can use 700
let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1));
let permit1 = tracker.register_permit();
assert!(permit1.is_privileged());
let permit2 = tracker.register_permit();
assert!(!permit2.is_privileged());
// Standard-tier can use up to 700 (1000 * 0.7 default)
assert!(permit2.track(600, 0).is_ok());
assert_eq!(tracker.current(), 600);
// Cannot exceed standard-tier limit (700)
assert!(permit2.track(200, 600).is_err());
assert_eq!(tracker.current(), 600);
// Can add within standard-tier limit
assert!(permit2.track(100, 600).is_ok());
assert_eq!(tracker.current(), 700);
// Cannot exceed standard-tier limit
assert!(permit2.track(1, 700).is_err());
assert_eq!(tracker.current(), 700);
permit2.release(700);
assert_eq!(tracker.current(), 0);
}
}
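A minimal standalone sketch of the two-tier admission rule documented above — privileged callers may push shared usage up to the full limit, standard-tier callers only up to `limit * fraction` — using a plain atomic counter instead of the `QueryMemoryTracker`/`MemoryPermit` machinery; names are illustrative.

use std::sync::atomic::{AtomicUsize, Ordering};

/// Two-tier admission check over a shared usage counter.
struct TwoTierLimiter {
    current: AtomicUsize,
    limit: usize,
    standard_fraction: f64,
}

impl TwoTierLimiter {
    fn try_track(&self, additional: usize, privileged: bool) -> bool {
        // Privileged callers see the hard limit; others a fraction of it.
        let effective = if privileged {
            self.limit
        } else {
            (self.limit as f64 * self.standard_fraction) as usize
        };
        self.current
            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |cur| {
                let next = cur.saturating_add(additional);
                (next <= effective).then_some(next)
            })
            .is_ok()
    }
}

fn main() {
    let limiter = TwoTierLimiter {
        current: AtomicUsize::new(0),
        limit: 1000,
        standard_fraction: 0.7,
    };
    assert!(limiter.try_track(700, false)); // standard tier may reach 700
    assert!(!limiter.try_track(1, false)); // but not exceed it
    assert!(limiter.try_track(300, true)); // privileged may reach the hard limit
}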

View File

@@ -23,7 +23,6 @@ use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef a
use datatypes::arrow::array::RecordBatchOptions;
use datatypes::prelude::DataType;
use datatypes::schema::SchemaRef;
use datatypes::value::Value;
use datatypes::vectors::{Helper, VectorRef};
use serde::ser::{Error, SerializeStruct};
use serde::{Serialize, Serializer};
@@ -194,11 +193,6 @@ impl RecordBatch {
self.df_record_batch.num_rows()
}
/// Create an iterator to traverse the data by row
pub fn rows(&self) -> RecordBatchRowIterator<'_> {
RecordBatchRowIterator::new(self)
}
pub fn column_vectors(
&self,
table_name: &str,
@@ -277,44 +271,6 @@ impl Serialize for RecordBatch {
}
}
pub struct RecordBatchRowIterator<'a> {
record_batch: &'a RecordBatch,
rows: usize,
columns: usize,
row_cursor: usize,
}
impl<'a> RecordBatchRowIterator<'a> {
fn new(record_batch: &'a RecordBatch) -> RecordBatchRowIterator<'a> {
RecordBatchRowIterator {
record_batch,
rows: record_batch.df_record_batch.num_rows(),
columns: record_batch.df_record_batch.num_columns(),
row_cursor: 0,
}
}
}
impl Iterator for RecordBatchRowIterator<'_> {
type Item = Vec<Value>;
fn next(&mut self) -> Option<Self::Item> {
if self.row_cursor == self.rows {
None
} else {
let mut row = Vec::with_capacity(self.columns);
for col in 0..self.columns {
let column = self.record_batch.column(col);
row.push(column.get(self.row_cursor));
}
self.row_cursor += 1;
Some(row)
}
}
}
/// Merges multiple record batches into a single one.
pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> {
let batches_len = batches.len();
@@ -349,7 +305,9 @@ pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Resul
mod tests {
use std::sync::Arc;
use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
use datatypes::arrow::array::{AsArray, UInt32Array};
use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, UInt32Type};
use datatypes::arrow_array::StringArray;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::vectors::{StringVector, UInt32Vector};
@@ -407,64 +365,6 @@ mod tests {
);
}
#[test]
fn test_record_batch_visitor() {
let column_schemas = vec![
ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false),
ColumnSchema::new("strings", ConcreteDataType::string_datatype(), true),
];
let schema = Arc::new(Schema::new(column_schemas));
let columns: Vec<VectorRef> = vec![
Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])),
Arc::new(StringVector::from(vec![
None,
Some("hello"),
Some("greptime"),
None,
])),
];
let recordbatch = RecordBatch::new(schema, columns).unwrap();
let mut record_batch_iter = recordbatch.rows();
assert_eq!(
vec![Value::UInt32(1), Value::Null],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(2), Value::String("hello".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(3), Value::String("greptime".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(4), Value::Null],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert!(record_batch_iter.next().is_none());
}
#[test]
fn test_record_batch_slice() {
let column_schemas = vec![
@@ -483,26 +383,16 @@ mod tests {
];
let recordbatch = RecordBatch::new(schema, columns).unwrap();
let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice");
let mut record_batch_iter = recordbatch.rows();
assert_eq!(
vec![Value::UInt32(2), Value::String("hello".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
assert_eq!(
vec![Value::UInt32(3), Value::String("greptime".into())],
record_batch_iter
.next()
.unwrap()
.into_iter()
.collect::<Vec<Value>>()
);
let expected = &UInt32Array::from_iter_values([2u32, 3]);
let array = recordbatch.column(0).to_arrow_array();
let actual = array.as_primitive::<UInt32Type>();
assert_eq!(expected, actual);
assert!(record_batch_iter.next().is_none());
let expected = &StringArray::from(vec!["hello", "greptime"]);
let array = recordbatch.column(1).to_arrow_array();
let actual = array.as_string::<i32>();
assert_eq!(expected, actual);
assert!(recordbatch.slice(1, 5).is_err());
}

View File

@@ -13,7 +13,6 @@
// limitations under the License.
use std::fmt::Display;
use std::str::FromStr;
use chrono::{FixedOffset, TimeZone};
use chrono_tz::{OffsetComponents, Tz};
@@ -102,7 +101,7 @@ impl Timezone {
.parse::<u32>()
.context(ParseOffsetStrSnafu { raw: tz_string })?;
Self::hours_mins_opt(hrs, mins)
} else if let Ok(tz) = Tz::from_str(tz_string) {
} else if let Ok(tz) = Tz::from_str_insensitive(tz_string) {
Ok(Self::Named(tz))
} else {
ParseTimezoneNameSnafu { raw: tz_string }.fail()
@@ -203,6 +202,10 @@ mod tests {
Timezone::Named(Tz::Asia__Shanghai),
Timezone::from_tz_string("Asia/Shanghai").unwrap()
);
assert_eq!(
Timezone::Named(Tz::Asia__Shanghai),
Timezone::from_tz_string("Asia/ShangHai").unwrap()
);
assert_eq!(
Timezone::Named(Tz::UTC),
Timezone::from_tz_string("UTC").unwrap()

View File

@@ -522,6 +522,7 @@ impl DatanodeBuilder {
file_ref_manager,
partition_expr_fetcher.clone(),
plugins,
opts.max_concurrent_queries,
);
#[cfg(feature = "enterprise")]
@@ -564,6 +565,7 @@ impl DatanodeBuilder {
file_ref_manager,
partition_expr_fetcher,
plugins,
opts.max_concurrent_queries,
);
#[cfg(feature = "enterprise")]
@@ -585,6 +587,7 @@ impl DatanodeBuilder {
file_ref_manager,
partition_expr_fetcher.clone(),
plugins,
opts.max_concurrent_queries,
);
#[cfg(feature = "enterprise")]

View File

@@ -44,7 +44,6 @@ use crate::region_server::RegionServer;
#[derive(Clone)]
pub struct RegionHeartbeatResponseHandler {
region_server: RegionServer,
catchup_tasks: TaskTracker<()>,
downgrade_tasks: TaskTracker<()>,
flush_tasks: TaskTracker<()>,
open_region_parallelism: usize,
@@ -64,7 +63,6 @@ pub trait InstructionHandler: Send + Sync {
#[derive(Clone)]
pub struct HandlerContext {
region_server: RegionServer,
catchup_tasks: TaskTracker<()>,
downgrade_tasks: TaskTracker<()>,
flush_tasks: TaskTracker<()>,
gc_tasks: TaskTracker<GcReport>,
@@ -75,7 +73,6 @@ impl HandlerContext {
pub fn new_for_test(region_server: RegionServer) -> Self {
Self {
region_server,
catchup_tasks: TaskTracker::new(),
downgrade_tasks: TaskTracker::new(),
flush_tasks: TaskTracker::new(),
gc_tasks: TaskTracker::new(),
@@ -88,7 +85,6 @@ impl RegionHeartbeatResponseHandler {
pub fn new(region_server: RegionServer) -> Self {
Self {
region_server,
catchup_tasks: TaskTracker::new(),
downgrade_tasks: TaskTracker::new(),
flush_tasks: TaskTracker::new(),
// Default to half of the number of CPUs.
@@ -114,7 +110,12 @@ impl RegionHeartbeatResponseHandler {
)),
Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())),
Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())),
Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler.into())),
Instruction::UpgradeRegions(_) => Ok(Box::new(
UpgradeRegionsHandler {
upgrade_region_parallelism: self.open_region_parallelism,
}
.into(),
)),
Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())),
Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())),
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
@@ -194,7 +195,7 @@ dispatch_instr!(
OpenRegions => OpenRegions,
FlushRegions => FlushRegions,
DowngradeRegions => DowngradeRegions,
UpgradeRegion => UpgradeRegions,
UpgradeRegions => UpgradeRegions,
GetFileRefs => GetFileRefs,
GcRegions => GcRegions,
);
@@ -216,7 +217,6 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
let mailbox = ctx.mailbox.clone();
let region_server = self.region_server.clone();
let catchup_tasks = self.catchup_tasks.clone();
let downgrade_tasks = self.downgrade_tasks.clone();
let flush_tasks = self.flush_tasks.clone();
let gc_tasks = self.gc_tasks.clone();
@@ -226,7 +226,6 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
.handle(
&HandlerContext {
region_server,
catchup_tasks,
downgrade_tasks,
flush_tasks,
gc_tasks,
@@ -334,10 +333,10 @@ mod tests {
);
// Upgrade region
let instruction = Instruction::UpgradeRegion(UpgradeRegion {
let instruction = Instruction::UpgradeRegions(vec![UpgradeRegion {
region_id,
..Default::default()
});
}]);
assert!(
heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction)))
);

View File

@@ -12,125 +12,209 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_telemetry::{info, warn};
use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint};
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_meta::instruction::{
InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply,
};
use common_telemetry::{debug, info, warn};
use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint};
use store_api::storage::RegionId;
use crate::error::Result;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
use crate::heartbeat::task_tracker::WaitResult;
#[derive(Debug, Clone, Copy, Default)]
pub struct UpgradeRegionsHandler;
pub struct UpgradeRegionsHandler {
pub upgrade_region_parallelism: usize,
}
#[cfg(test)]
impl UpgradeRegionsHandler {
fn new_test() -> UpgradeRegionsHandler {
UpgradeRegionsHandler {
upgrade_region_parallelism: 8,
}
}
}
impl UpgradeRegionsHandler {
fn convert_responses_to_replies(
responses: Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>>,
catchup_regions: &[RegionId],
) -> Vec<UpgradeRegionReply> {
match responses {
Ok(responses) => responses
.into_iter()
.map(|(region_id, result)| match result {
Ok(()) => UpgradeRegionReply {
region_id,
ready: true,
exists: true,
error: None,
},
Err(err) => {
if err.status_code() == StatusCode::RegionNotFound {
UpgradeRegionReply {
region_id,
ready: false,
exists: false,
error: Some(format!("{err:?}")),
}
} else {
UpgradeRegionReply {
region_id,
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}
}
}
})
.collect::<Vec<_>>(),
Err(err) => catchup_regions
.iter()
.map(|region_id| UpgradeRegionReply {
region_id: *region_id,
ready: false,
exists: true,
error: Some(format!("{err:?}")),
})
.collect::<Vec<_>>(),
}
}
}
impl UpgradeRegionsHandler {
// Handles the upgrade regions instruction.
//
// Returns a batch of upgrade region replies; the order of the replies is not guaranteed.
async fn handle_upgrade_regions(
&self,
ctx: &HandlerContext,
upgrade_regions: Vec<UpgradeRegion>,
) -> Vec<UpgradeRegionReply> {
let num_upgrade_regions = upgrade_regions.len();
let mut replies = Vec::with_capacity(num_upgrade_regions);
let mut catchup_requests = Vec::with_capacity(num_upgrade_regions);
let mut catchup_regions = Vec::with_capacity(num_upgrade_regions);
let mut timeout = None;
for upgrade_region in upgrade_regions {
let Some(writable) = ctx.region_server.is_region_leader(upgrade_region.region_id)
else {
// Region is not found.
debug!("Region {} is not found", upgrade_region.region_id);
replies.push(UpgradeRegionReply {
region_id: upgrade_region.region_id,
ready: false,
exists: false,
error: None,
});
continue;
};
// Ignores the catchup requests for writable regions.
if writable {
warn!(
"Region {} is writable, ignores the catchup request",
upgrade_region.region_id
);
replies.push(UpgradeRegionReply {
region_id: upgrade_region.region_id,
ready: true,
exists: true,
error: None,
});
} else {
let UpgradeRegion {
last_entry_id,
metadata_last_entry_id,
location_id,
replay_entry_id,
metadata_replay_entry_id,
replay_timeout,
..
} = upgrade_region;
match timeout {
Some(timeout) => {
debug_assert_eq!(timeout, replay_timeout);
}
None => {
// TODO(weny): require the replay_timeout.
timeout = Some(replay_timeout);
}
}
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
catchup_regions.push(upgrade_region.region_id);
catchup_requests.push((
upgrade_region.region_id,
RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
},
));
}
}
let Some(timeout) = timeout else {
// No replay timeout, so we don't need to catch up the regions.
info!("All regions are writable, no need to catchup");
debug_assert_eq!(replies.len(), num_upgrade_regions);
return replies;
};
match tokio::time::timeout(
timeout,
ctx.region_server
.handle_batch_catchup_requests(self.upgrade_region_parallelism, catchup_requests),
)
.await
{
Ok(responses) => {
replies.extend(
Self::convert_responses_to_replies(responses, &catchup_regions).into_iter(),
);
}
Err(_) => {
replies.extend(catchup_regions.iter().map(|region_id| UpgradeRegionReply {
region_id: *region_id,
ready: false,
exists: true,
error: None,
}));
}
}
replies
}
}
#[async_trait::async_trait]
impl InstructionHandler for UpgradeRegionsHandler {
type Instruction = UpgradeRegion;
type Instruction = Vec<UpgradeRegion>;
async fn handle(
&self,
ctx: &HandlerContext,
UpgradeRegion {
region_id,
last_entry_id,
metadata_last_entry_id,
replay_timeout,
location_id,
replay_entry_id,
metadata_replay_entry_id,
}: UpgradeRegion,
upgrade_regions: Self::Instruction,
) -> Option<InstructionReply> {
let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: false,
error: None,
}));
};
let replies = self.handle_upgrade_regions(ctx, upgrade_regions).await;
if writable {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}));
}
let region_server_moved = ctx.region_server.clone();
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
// The catchup task is almost zero cost if the inside region is writable.
// Therefore, it always registers a new catchup task.
let register_result = ctx
.catchup_tasks
.try_register(
region_id,
Box::pin(async move {
info!(
"Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"
);
region_server_moved
.handle_request(
region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another catchup task is running for the region: {region_id}");
}
// Returns immediately
let Some(replay_timeout) = replay_timeout else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
}));
};
// We don't care that it returns a newly registered or running task.
let mut watcher = register_result.into_watcher();
let result = ctx.catchup_tasks.wait(&mut watcher, replay_timeout).await;
match result {
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
})),
WaitResult::Finish(Ok(_)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}))
}
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}))
}
}
Some(InstructionReply::UpgradeRegions(UpgradeRegionsReply::new(
replies,
)))
}
}
@@ -142,7 +226,6 @@ mod tests {
use mito2::engine::MITO_ENGINE_NAME;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;
use tokio::time::Instant;
use crate::error;
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
@@ -158,21 +241,30 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server);
let region_id = RegionId::new(1024, 1);
let waits = vec![None, Some(Duration::from_millis(100u64))];
for replay_timeout in waits {
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
let region_id2 = RegionId::new(1024, 2);
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
vec![
UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
},
)
.await;
UpgradeRegion {
region_id: region_id2,
replay_timeout,
..Default::default()
},
],
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
let replies = &reply.unwrap().expect_upgrade_regions_reply();
assert_eq!(replies[0].region_id, region_id);
assert_eq!(replies[1].region_id, region_id2);
for reply in replies {
assert!(!reply.exists);
assert!(reply.error.is_none());
}
@@ -182,6 +274,7 @@ mod tests {
async fn test_region_writable() {
let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1);
let region_id2 = RegionId::new(1024, 2);
let (mock_engine, _) =
MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| {
@@ -191,25 +284,32 @@ mod tests {
unreachable!();
}));
});
mock_region_server.register_test_region(region_id, mock_engine);
mock_region_server.register_test_region(region_id, mock_engine.clone());
mock_region_server.register_test_region(region_id2, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server);
let waits = vec![None, Some(Duration::from_millis(100u64))];
for replay_timeout in waits {
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
vec![
UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
},
)
.await;
UpgradeRegion {
region_id: region_id2,
replay_timeout,
..Default::default()
},
],
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
let replies = &reply.unwrap().expect_upgrade_regions_reply();
assert_eq!(replies[0].region_id, region_id);
assert_eq!(replies[1].region_id, region_id2);
for reply in replies {
assert!(reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
@@ -232,30 +332,27 @@ mod tests {
mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server);
let replay_timeout = Duration::from_millis(100u64);
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
vec![UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}],
)
.await;
let waits = vec![None, Some(Duration::from_millis(100u64))];
for replay_timeout in waits {
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
},
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0];
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none(), "error: {:?}", reply.error);
}
#[tokio::test]
async fn test_region_not_ready_with_retry() {
common_telemetry::init_default_ut_logging();
let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1);
@@ -264,58 +361,48 @@ mod tests {
// Region is not ready.
region_engine.mock_role = Some(Some(RegionRole::Follower));
region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0)));
// Note: Don't change.
region_engine.handle_request_delay = Some(Duration::from_millis(300));
});
mock_region_server.register_test_region(region_id, mock_engine);
let waits = vec![
Some(Duration::from_millis(100u64)),
Some(Duration::from_millis(100u64)),
];
let waits = vec![Duration::from_millis(100u64), Duration::from_millis(100u64)];
let handler_context = HandlerContext::new_for_test(mock_region_server);
for replay_timeout in waits {
let reply = UpgradeRegionsHandler
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
UpgradeRegion {
vec![UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
},
}],
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0];
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
assert!(reply.error.is_none(), "error: {:?}", reply.error);
}
let timer = Instant::now();
let reply = UpgradeRegionsHandler
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
UpgradeRegion {
vec![UpgradeRegion {
region_id,
replay_timeout: Some(Duration::from_millis(500)),
replay_timeout: Duration::from_millis(500),
..Default::default()
},
}],
)
.await;
// Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
let reply = reply.unwrap().expect_upgrade_region_reply();
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0];
assert!(reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
assert!(reply.error.is_none(), "error: {:?}", reply.error);
}
#[tokio::test]
async fn test_region_error() {
common_telemetry::init_default_ut_logging();
let mock_region_server = mock_region_server();
let region_id = RegionId::new(1024, 1);
@@ -335,38 +422,37 @@ mod tests {
mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server);
let reply = UpgradeRegionsHandler
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
UpgradeRegion {
vec![UpgradeRegion {
region_id,
..Default::default()
},
}],
)
.await;
// It didn't wait for the handler to return, so it has no idea about the error yet.
let reply = reply.unwrap().expect_upgrade_region_reply();
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0];
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
let reply = UpgradeRegionsHandler
let reply = UpgradeRegionsHandler::new_test()
.handle(
&handler_context,
UpgradeRegion {
vec![UpgradeRegion {
region_id,
replay_timeout: Some(Duration::from_millis(200)),
replay_timeout: Duration::from_millis(200),
..Default::default()
},
}],
)
.await;
let reply = reply.unwrap().expect_upgrade_region_reply();
let reply = &reply.unwrap().expect_upgrade_regions_reply()[0];
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_some());
assert!(reply.error.unwrap().contains("mock_error"));
assert!(reply.error.as_ref().unwrap().contains("mock_error"));
}
}

View File
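convert_responses_to_replies above maps each per-region result into a reply, treating a RegionNotFound status as exists: false and any other error as exists: true with the error message attached. A self-contained sketch of that classification rule, with simplified stand-ins for the status code, error, and reply types:

// Sketch: classify per-region catchup results into upgrade replies.
enum StatusCode {
    RegionNotFound,
    Internal,
}

#[derive(Debug)]
struct Reply {
    region_id: u64,
    ready: bool,
    exists: bool,
    error: Option<String>,
}

fn to_reply(region_id: u64, result: Result<(), (StatusCode, String)>) -> Reply {
    match result {
        Ok(()) => Reply { region_id, ready: true, exists: true, error: None },
        // The region does not exist on this datanode.
        Err((StatusCode::RegionNotFound, msg)) => {
            Reply { region_id, ready: false, exists: false, error: Some(msg) }
        }
        // The region exists but catchup failed for another reason.
        Err((_, msg)) => Reply { region_id, ready: false, exists: true, error: Some(msg) },
    }
}

fn main() {
    let replies = vec![
        to_reply(1, Ok(())),
        to_reply(2, Err((StatusCode::RegionNotFound, "not found".into()))),
        to_reply(3, Err((StatusCode::Internal, "wal replay failed".into()))),
    ];
    for reply in &replies {
        println!("{:?}", reply);
    }
}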

@@ -75,4 +75,20 @@ lazy_static! {
&[RESULT_TYPE]
)
.unwrap();
/// Total count of failed region server requests.
pub static ref REGION_SERVER_REQUEST_FAILURE_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_region_request_fail_count",
"failed region server requests count",
&[REGION_REQUEST_TYPE]
)
.unwrap();
/// Total count of failed insert requests to region server.
pub static ref REGION_SERVER_INSERT_FAIL_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_region_failed_insert_count",
"failed region server insert requests count",
&[REGION_REQUEST_TYPE]
)
.unwrap();
}

View File
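The two counters added above follow the prometheus-crate pattern already used in this file: register a labelled IntCounterVec once, then bump it with with_label_values(...).inc() on the failure path (as the region_server.rs hunk below does). A hedged sketch of that pattern outside the project, assuming the prometheus and lazy_static crates:

// Sketch: a labelled failure counter, registered once and bumped per request type.
use lazy_static::lazy_static;
use prometheus::{IntCounterVec, register_int_counter_vec};

lazy_static! {
    static ref REQUEST_FAIL_COUNT: IntCounterVec = register_int_counter_vec!(
        "example_request_fail_count",
        "failed requests count",
        &["request_type"]
    )
    .unwrap();
}

fn record_failure(request_type: &str) {
    REQUEST_FAIL_COUNT.with_label_values(&[request_type]).inc();
}

fn main() {
    record_failure("inserts");
    println!("failures: {}", REQUEST_FAIL_COUNT.with_label_values(&["inserts"]).get());
}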

@@ -66,7 +66,8 @@ use store_api::region_engine::{
SettableRegionRoleState,
};
use store_api::region_request::{
AffectedRows, BatchRegionDdlRequest, RegionCloseRequest, RegionOpenRequest, RegionRequest,
AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest,
RegionOpenRequest, RegionRequest,
};
use store_api::storage::RegionId;
use tokio::sync::{Semaphore, SemaphorePermit};
@@ -191,6 +192,17 @@ impl RegionServer {
.await
}
#[tracing::instrument(skip_all)]
pub async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
self.inner
.handle_batch_catchup_requests(parallelism, requests)
.await
}
#[tracing::instrument(skip_all, fields(request_type = request.request_type()))]
pub async fn handle_request(
&self,
@@ -399,6 +411,14 @@ impl RegionServer {
#[cfg(test)]
/// Registers a region for test purposes.
pub(crate) fn register_test_region(&self, region_id: RegionId, engine: RegionEngineRef) {
{
let mut engines = self.inner.engines.write().unwrap();
if !engines.contains_key(engine.name()) {
debug!("Registering test engine: {}", engine.name());
engines.insert(engine.name().to_string(), engine.clone());
}
}
self.inner
.region_map
.insert(region_id, RegionEngineWithStatus::Ready(engine));
@@ -580,6 +600,8 @@ impl RegionServer {
#[async_trait]
impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponseV1> {
let failed_requests_cnt = crate::metrics::REGION_SERVER_REQUEST_FAILURE_COUNT
.with_label_values(&[request.as_ref()]);
let response = match &request {
region_request::Body::Creates(_)
| region_request::Body::Drops(_)
@@ -597,6 +619,9 @@ impl RegionServerHandler for RegionServer {
_ => self.handle_requests_in_serial(request).await,
}
.map_err(BoxedError::new)
.inspect_err(|_| {
failed_requests_cnt.inc();
})
.context(ExecuteGrpcRequestSnafu)?;
Ok(RegionResponseV1 {
@@ -972,6 +997,116 @@ impl RegionServerInner {
.collect::<Vec<_>>())
}
pub async fn handle_batch_catchup_requests_inner(
&self,
engine: RegionEngineRef,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
for (region_id, _) in &requests {
self.set_region_status_not_ready(*region_id, &engine, &RegionChange::Catchup);
}
let region_ids = requests
.iter()
.map(|(region_id, _)| *region_id)
.collect::<Vec<_>>();
let mut responses = Vec::with_capacity(requests.len());
match engine
.handle_batch_catchup_requests(parallelism, requests)
.await
{
Ok(results) => {
for (region_id, result) in results {
match result {
Ok(_) => {
if let Err(e) = self
.set_region_status_ready(
region_id,
engine.clone(),
RegionChange::Catchup,
)
.await
{
error!(e; "Failed to set region to ready: {}", region_id);
responses.push((region_id, Err(BoxedError::new(e))));
} else {
responses.push((region_id, Ok(())));
}
}
Err(e) => {
self.unset_region_status(region_id, &engine, RegionChange::Catchup);
error!(e; "Failed to catchup region: {}", region_id);
responses.push((region_id, Err(e)));
}
}
}
}
Err(e) => {
for region_id in region_ids {
self.unset_region_status(region_id, &engine, RegionChange::Catchup);
}
error!(e; "Failed to catchup batch regions");
return error::UnexpectedSnafu {
violated: format!("Failed to catchup batch regions: {:?}", e),
}
.fail();
}
}
Ok(responses)
}
pub async fn handle_batch_catchup_requests(
&self,
parallelism: usize,
requests: Vec<(RegionId, RegionCatchupRequest)>,
) -> Result<Vec<(RegionId, std::result::Result<(), BoxedError>)>> {
let mut engine_grouped_requests: HashMap<String, Vec<_>> = HashMap::new();
let mut responses = Vec::with_capacity(requests.len());
for (region_id, request) in requests {
if let Ok(engine) = self.get_engine(region_id, &RegionChange::Catchup) {
match engine {
CurrentEngine::Engine(engine) => {
engine_grouped_requests
.entry(engine.name().to_string())
.or_default()
.push((region_id, request));
}
CurrentEngine::EarlyReturn(_) => {
return error::UnexpectedSnafu {
violated: format!("Unexpected engine type for region {}", region_id),
}
.fail();
}
}
} else {
responses.push((
region_id,
Err(BoxedError::new(
error::RegionNotFoundSnafu { region_id }.build(),
)),
));
}
}
for (engine, requests) in engine_grouped_requests {
let engine = self
.engines
.read()
.unwrap()
.get(&engine)
.with_context(|| RegionEngineNotFoundSnafu { name: &engine })?
.clone();
responses.extend(
self.handle_batch_catchup_requests_inner(engine, parallelism, requests)
.await?,
);
}
Ok(responses)
}
// Handle requests in batch.
//
// limitation: all create requests must be in the same engine.
@@ -1100,6 +1235,11 @@ impl RegionServerInner {
})
}
Err(err) => {
if matches!(region_change, RegionChange::Ingest) {
crate::metrics::REGION_SERVER_INSERT_FAIL_COUNT
.with_label_values(&[request_type])
.inc();
}
// Removes the region status if the operation fails.
self.unset_region_status(region_id, &engine, region_change);
Err(err)

View File
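handle_batch_catchup_requests above first groups the incoming (RegionId, request) pairs by engine name, answers missing regions immediately with a RegionNotFound error, and only then fans each group out to its engine. A simplified, self-contained sketch of the grouping step, with illustrative types:

use std::collections::HashMap;

// Sketch: split batch requests into per-engine groups plus immediate
// "region not found" responses, mirroring the grouping in the hunk above.
fn group_by_engine(
    requests: Vec<(u64, &'static str)>,      // (region_id, request payload)
    region_to_engine: &HashMap<u64, String>, // region registry
) -> (HashMap<String, Vec<(u64, &'static str)>>, Vec<(u64, String)>) {
    let mut grouped: HashMap<String, Vec<_>> = HashMap::new();
    let mut failed = Vec::new();
    for (region_id, request) in requests {
        match region_to_engine.get(&region_id) {
            Some(engine) => grouped.entry(engine.clone()).or_default().push((region_id, request)),
            None => failed.push((region_id, format!("region {region_id} not found"))),
        }
    }
    (grouped, failed)
}

fn main() {
    let registry = HashMap::from([(1u64, "mito".to_string()), (2, "mito".to_string())]);
    let (grouped, failed) =
        group_by_engine(vec![(1, "catchup"), (2, "catchup"), (9, "catchup")], &registry);
    println!("grouped: {grouped:?}");
    println!("failed: {failed:?}");
}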

@@ -277,6 +277,10 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Null(NullType))
}
pub(crate) fn is_struct(&self) -> bool {
matches!(self, ConcreteDataType::Struct(_))
}
/// Try to cast the type as a [`ListType`].
pub fn as_list(&self) -> Option<&ListType> {
match self {

View File

@@ -266,6 +266,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to parse or serialize arrow metadata"))]
ArrowMetadata {
#[snafu(source)]
error: arrow::error::ArrowError,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
@@ -307,7 +315,8 @@ impl ErrorExt for Error {
| ConvertArrowArrayToScalars { .. }
| ConvertScalarToArrowArray { .. }
| ParseExtendedType { .. }
| InconsistentStructFieldsAndItems { .. } => StatusCode::Internal,
| InconsistentStructFieldsAndItems { .. }
| ArrowMetadata { .. } => StatusCode::Internal,
}
}

View File

@@ -0,0 +1,15 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod json;

View File

@@ -0,0 +1,104 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType};
use serde::{Deserialize, Serialize};
use crate::json::JsonStructureSettings;
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct JsonMetadata {
/// Indicates how the JSON is handled when stored in the underlying data type.
///
/// This field can be `None` when the data is converted to a fully structured in-memory form.
pub json_structure_settings: Option<JsonStructureSettings>,
}
#[derive(Debug, Clone)]
pub struct JsonExtensionType(Arc<JsonMetadata>);
impl JsonExtensionType {
pub fn new(metadata: Arc<JsonMetadata>) -> Self {
JsonExtensionType(metadata)
}
}
impl ExtensionType for JsonExtensionType {
const NAME: &'static str = "greptime.json";
type Metadata = Arc<JsonMetadata>;
fn metadata(&self) -> &Self::Metadata {
&self.0
}
fn serialize_metadata(&self) -> Option<String> {
serde_json::to_string(self.metadata()).ok()
}
fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
if let Some(metadata) = metadata {
let metadata = serde_json::from_str(metadata).map_err(|e| {
ArrowError::ParseError(format!("Failed to deserialize JSON metadata: {}", e))
})?;
Ok(Arc::new(metadata))
} else {
Ok(Arc::new(JsonMetadata::default()))
}
}
fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
match data_type {
// object
DataType::Struct(_)
// array
| DataType::List(_)
| DataType::ListView(_)
| DataType::LargeList(_)
| DataType::LargeListView(_)
// string
| DataType::Utf8
| DataType::Utf8View
| DataType::LargeUtf8
// number
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Float32
| DataType::Float64
// boolean
| DataType::Boolean
// null
| DataType::Null
// legacy json type
| DataType::Binary => Ok(()),
dt => Err(ArrowError::SchemaError(format!(
"Unexpected data type {dt}"
))),
}
}
fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
let json = Self(metadata);
json.supports_data_type(data_type)?;
Ok(json)
}
}

View File
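The extension type above serializes its metadata with serde_json and falls back to a default when the Arrow field carries no metadata string. The roundtrip at its core can be sketched with serde alone; the struct below follows the hunk's field name but is a simplified stand-in, not the project's code, and assumes the serde and serde_json crates:

// Sketch: serialize/deserialize the extension metadata, defaulting when absent.
use serde::{Deserialize, Serialize};

#[derive(Debug, Default, Serialize, Deserialize, PartialEq)]
struct JsonMetadata {
    json_structure_settings: Option<String>, // simplified stand-in for JsonStructureSettings
}

fn serialize_metadata(meta: &JsonMetadata) -> Option<String> {
    serde_json::to_string(meta).ok()
}

fn deserialize_metadata(metadata: Option<&str>) -> Result<JsonMetadata, serde_json::Error> {
    match metadata {
        Some(s) => serde_json::from_str(s),
        // No metadata on the field: fall back to the default settings.
        None => Ok(JsonMetadata::default()),
    }
}

fn main() -> Result<(), serde_json::Error> {
    let meta = JsonMetadata { json_structure_settings: Some("structured".to_string()) };
    let s = serialize_metadata(&meta).expect("serializable");
    assert_eq!(deserialize_metadata(Some(&s))?, meta);
    assert_eq!(deserialize_metadata(None)?, JsonMetadata::default());
    println!("roundtrip ok: {s}");
    Ok(())
}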

@@ -13,11 +13,13 @@
// limitations under the License.
#![feature(assert_matches)]
#![feature(box_patterns)]
pub mod arrow_array;
pub mod data_type;
pub mod duration;
pub mod error;
pub mod extension;
pub mod interval;
pub mod json;
pub mod macros;

View File

@@ -32,9 +32,8 @@ pub use crate::schema::column_schema::{
COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY,
JSON_STRUCTURE_SETTINGS_KEY, Metadata, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType, TIME_INDEX_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;

View File

@@ -17,13 +17,17 @@ use std::fmt;
use std::str::FromStr;
use arrow::datatypes::Field;
use arrow_schema::extension::{
EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY, ExtensionType,
};
use serde::{Deserialize, Serialize};
use snafu::{ResultExt, ensure};
use sqlparser_derive::{Visit, VisitMut};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result};
use crate::json::JsonStructureSettings;
use crate::error::{
self, ArrowMetadataSnafu, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result,
};
use crate::schema::TYPE_KEY;
use crate::schema::constraint::ColumnDefaultConstraint;
use crate::value::Value;
@@ -42,7 +46,6 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
/// Key used to store skip options in arrow field's metadata.
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
pub const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings";
/// Keys used in fulltext options
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -394,18 +397,38 @@ impl ColumnSchema {
Ok(())
}
pub fn json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
self.metadata
.get(JSON_STRUCTURE_SETTINGS_KEY)
.map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json }))
.transpose()
pub fn extension_type<E>(&self) -> Result<Option<E>>
where
E: ExtensionType,
{
let extension_type_name = self.metadata.get(EXTENSION_TYPE_NAME_KEY);
if extension_type_name.map(|s| s.as_str()) == Some(E::NAME) {
let extension_metadata = self.metadata.get(EXTENSION_TYPE_METADATA_KEY);
let extension_metadata =
E::deserialize_metadata(extension_metadata.map(|s| s.as_str()))
.context(ArrowMetadataSnafu)?;
let extension = E::try_new(&self.data_type.as_arrow_type(), extension_metadata)
.context(ArrowMetadataSnafu)?;
Ok(Some(extension))
} else {
Ok(None)
}
}
pub fn with_json_structure_settings(&mut self, settings: &JsonStructureSettings) -> Result<()> {
self.metadata.insert(
JSON_STRUCTURE_SETTINGS_KEY.to_string(),
serde_json::to_string(settings).context(error::SerializeSnafu)?,
);
pub fn with_extension_type<E>(&mut self, extension_type: &E) -> Result<()>
where
E: ExtensionType,
{
self.metadata
.insert(EXTENSION_TYPE_NAME_KEY.to_string(), E::NAME.to_string());
if let Some(extension_metadata) = extension_type.serialize_metadata() {
self.metadata
.insert(EXTENSION_TYPE_METADATA_KEY.to_string(), extension_metadata);
}
Ok(())
}
}

View File
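with_extension_type and extension_type above replace the bespoke greptime:json:structure_settings key with Arrow's standard extension-name and extension-metadata entries in the column metadata map. A minimal sketch of that read/write pattern against a plain metadata map; the key strings below are placeholders, not Arrow's actual constants:

use std::collections::HashMap;

// Placeholder keys; the real code uses arrow-schema's
// EXTENSION_TYPE_NAME_KEY / EXTENSION_TYPE_METADATA_KEY constants.
const EXT_NAME_KEY: &str = "extension:name";
const EXT_METADATA_KEY: &str = "extension:metadata";

fn with_extension_type(metadata: &mut HashMap<String, String>, name: &str, meta: Option<String>) {
    metadata.insert(EXT_NAME_KEY.to_string(), name.to_string());
    if let Some(m) = meta {
        metadata.insert(EXT_METADATA_KEY.to_string(), m);
    }
}

fn extension_metadata<'a>(
    metadata: &'a HashMap<String, String>,
    expected_name: &str,
) -> Option<&'a str> {
    // Only treat the column as this extension type when the stored name matches.
    (metadata.get(EXT_NAME_KEY).map(String::as_str) == Some(expected_name))
        .then(|| metadata.get(EXT_METADATA_KEY).map(String::as_str))
        .flatten()
}

fn main() {
    let mut md = HashMap::new();
    with_extension_type(&mut md, "greptime.json", Some("{}".to_string()));
    assert_eq!(extension_metadata(&md, "greptime.json"), Some("{}"));
    assert_eq!(extension_metadata(&md, "other.ext"), None);
    println!("extension metadata lookup ok");
}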

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use std::str::FromStr;
use std::sync::Arc;
@@ -31,9 +31,12 @@ use crate::scalars::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId;
use crate::types::{ListType, StructField, StructType};
use crate::value::Value;
use crate::vectors::json::builder::JsonVectorBuilder;
use crate::vectors::{BinaryVectorBuilder, MutableVector};
pub const JSON_TYPE_NAME: &str = "Json";
const JSON_PLAIN_FIELD_NAME: &str = "__plain__";
const JSON_PLAIN_FIELD_METADATA_KEY: &str = "is_plain_json";
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)]
pub enum JsonFormat {
@@ -54,28 +57,46 @@ impl JsonType {
Self { format }
}
// TODO(LFC): remove "allow unused"
#[allow(unused)]
pub(crate) fn empty() -> Self {
Self {
format: JsonFormat::Native(Box::new(ConcreteDataType::null_datatype())),
}
}
/// Make json type a struct type, by:
/// - if the json is an object, its entries are mapped to struct fields, obviously;
/// - if not (the json is one of bool, number, string or array), make it a special field called
/// "__plain" in a struct with only that field.
/// [JSON_PLAIN_FIELD_NAME] with metadata [JSON_PLAIN_FIELD_METADATA_KEY] = `"true"` in a
/// struct with only that field.
pub(crate) fn as_struct_type(&self) -> StructType {
match &self.format {
JsonFormat::Jsonb => StructType::default(),
JsonFormat::Native(inner) => match inner.as_ref() {
ConcreteDataType::Struct(t) => t.clone(),
x => StructType::new(Arc::new(vec![StructField::new(
"__plain".to_string(),
x.clone(),
true,
)])),
x => {
let mut field =
StructField::new(JSON_PLAIN_FIELD_NAME.to_string(), x.clone(), true);
field.insert_metadata(JSON_PLAIN_FIELD_METADATA_KEY, true);
StructType::new(Arc::new(vec![field]))
}
},
}
}
// TODO(LFC): remove "allow unused"
#[allow(unused)]
/// Check if this json type is the special "plain" one.
/// See [JsonType::as_struct_type].
pub(crate) fn is_plain_json(&self) -> bool {
let JsonFormat::Native(box ConcreteDataType::Struct(t)) = &self.format else {
return true;
};
let fields = t.fields();
let Some((single, [])) = fields.split_first() else {
return false;
};
single.name() == JSON_PLAIN_FIELD_NAME
&& single.metadata(JSON_PLAIN_FIELD_METADATA_KEY) == Some("true")
}
/// Try to merge this json type with others, error on datatype conflict.
pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> {
match (&self.format, &other.format) {
@@ -91,6 +112,47 @@ impl JsonType {
.fail(),
}
}
pub(crate) fn is_mergeable(&self, other: &JsonType) -> bool {
match (&self.format, &other.format) {
(JsonFormat::Jsonb, JsonFormat::Jsonb) => true,
(JsonFormat::Native(this), JsonFormat::Native(that)) => {
is_mergeable(this.as_ref(), that.as_ref())
}
_ => false,
}
}
}
fn is_mergeable(this: &ConcreteDataType, that: &ConcreteDataType) -> bool {
fn is_mergeable_struct(this: &StructType, that: &StructType) -> bool {
let this_fields = this.fields();
let this_fields = this_fields
.iter()
.map(|x| (x.name(), x))
.collect::<HashMap<_, _>>();
for that_field in that.fields().iter() {
if let Some(this_field) = this_fields.get(that_field.name())
&& !is_mergeable(this_field.data_type(), that_field.data_type())
{
return false;
}
}
true
}
match (this, that) {
(this, that) if this == that => true,
(ConcreteDataType::List(this), ConcreteDataType::List(that)) => {
is_mergeable(this.item_type(), that.item_type())
}
(ConcreteDataType::Struct(this), ConcreteDataType::Struct(that)) => {
is_mergeable_struct(this, that)
}
(ConcreteDataType::Null(_), _) | (_, ConcreteDataType::Null(_)) => true,
_ => false,
}
}
fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result<ConcreteDataType> {
@@ -166,7 +228,10 @@ impl DataType for JsonType {
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BinaryVectorBuilder::with_capacity(capacity))
match self.format {
JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)),
JsonFormat::Native(_) => Box::new(JsonVectorBuilder::with_capacity(capacity)),
}
}
fn try_cast(&self, from: Value) -> Option<Value> {
@@ -226,10 +291,12 @@ mod tests {
let result = json_type.merge(other);
match (result, expected) {
(Ok(()), Ok(expected)) => {
assert_eq!(json_type.name(), expected)
assert_eq!(json_type.name(), expected);
assert!(json_type.is_mergeable(other));
}
(Err(err), Err(expected)) => {
assert_eq!(err.to_string(), expected)
assert_eq!(err.to_string(), expected);
assert!(!json_type.is_mergeable(other));
}
_ => unreachable!(),
}

View File
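is_mergeable above mirrors merge: two native JSON types are compatible when equal types, null with anything, element-wise compatible lists, and structs whose shared field names have compatible types all hold recursively. A simplified sketch of that rule on a toy type enum, not the project's ConcreteDataType:

use std::collections::HashMap;

// Toy model of the recursive mergeability check in the hunk above.
#[derive(Clone, PartialEq, Debug)]
enum Ty {
    Null,
    Int,
    Str,
    List(Box<Ty>),
    Struct(Vec<(String, Ty)>),
}

fn is_mergeable(this: &Ty, that: &Ty) -> bool {
    match (this, that) {
        (a, b) if a == b => true,
        // Null merges with anything.
        (Ty::Null, _) | (_, Ty::Null) => true,
        (Ty::List(a), Ty::List(b)) => is_mergeable(a, b),
        (Ty::Struct(a), Ty::Struct(b)) => {
            let a_fields: HashMap<_, _> = a.iter().map(|(n, t)| (n.as_str(), t)).collect();
            // Fields present on both sides must be mergeable; extra fields are fine.
            b.iter().all(|(name, b_ty)| {
                a_fields
                    .get(name.as_str())
                    .map_or(true, |a_ty| is_mergeable(a_ty, b_ty))
            })
        }
        _ => false,
    }
}

fn main() {
    let a = Ty::Struct(vec![("s".into(), Ty::Str), ("n".into(), Ty::Int)]);
    let b = Ty::Struct(vec![("s".into(), Ty::Str), ("extra".into(), Ty::List(Box::new(Ty::Int)))]);
    let c = Ty::Struct(vec![("s".into(), Ty::Int)]);
    assert!(is_mergeable(&a, &b));
    assert!(!is_mergeable(&a, &c));
    println!("mergeability checks pass");
}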

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use std::sync::Arc;
use arrow::datatypes::{DataType as ArrowDataType, Field};
@@ -46,6 +47,15 @@ impl TryFrom<&Fields> for StructType {
}
}
impl<const N: usize> From<[StructField; N]> for StructType {
fn from(value: [StructField; N]) -> Self {
let value: Box<[StructField]> = Box::new(value);
Self {
fields: Arc::new(value.into_vec()),
}
}
}
impl DataType for StructType {
fn name(&self) -> String {
format!(
@@ -108,6 +118,7 @@ pub struct StructField {
name: String,
data_type: ConcreteDataType,
nullable: bool,
metadata: BTreeMap<String, String>,
}
impl StructField {
@@ -116,6 +127,7 @@ impl StructField {
name,
data_type,
nullable,
metadata: BTreeMap::new(),
}
}
@@ -135,11 +147,25 @@ impl StructField {
self.nullable
}
pub(crate) fn insert_metadata(&mut self, key: impl ToString, value: impl ToString) {
self.metadata.insert(key.to_string(), value.to_string());
}
pub(crate) fn metadata(&self, key: &str) -> Option<&str> {
self.metadata.get(key).map(String::as_str)
}
pub fn to_df_field(&self) -> Field {
let metadata = self
.metadata
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
Field::new(
self.name.clone(),
self.data_type.as_arrow_type(),
self.nullable,
)
.with_metadata(metadata)
}
}

View File
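The new metadata map on StructField is carried through to_df_field onto the Arrow Field, which is how the is_plain_json marker survives conversion to Arrow. A small sketch of that conversion, assuming the arrow crate's Field::with_metadata as used above:

use std::collections::{BTreeMap, HashMap};

use arrow::datatypes::{DataType, Field};

// Sketch: attach per-field metadata when converting to an Arrow field.
fn to_arrow_field(name: &str, nullable: bool, metadata: &BTreeMap<String, String>) -> Field {
    let metadata: HashMap<String, String> =
        metadata.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
    Field::new(name, DataType::Utf8, nullable).with_metadata(metadata)
}

fn main() {
    let mut md = BTreeMap::new();
    md.insert("is_plain_json".to_string(), "true".to_string());
    let field = to_arrow_field("__plain__", true, &md);
    assert_eq!(field.metadata().get("is_plain_json").map(String::as_str), Some("true"));
    println!("{field:?}");
}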

@@ -873,6 +873,12 @@ impl From<&[u8]> for Value {
}
}
impl From<()> for Value {
fn from(_: ()) -> Self {
Value::Null
}
}
impl TryFrom<Value> for serde_json::Value {
type Error = serde_json::Error;

View File

@@ -35,6 +35,7 @@ mod duration;
mod eq;
mod helper;
mod interval;
pub(crate) mod json;
mod list;
mod null;
pub(crate) mod operations;

View File

@@ -464,6 +464,14 @@ impl Helper {
}
}
#[cfg(test)]
pub(crate) fn pretty_print(vector: VectorRef) -> String {
let array = vector.to_arrow_array();
arrow::util::pretty::pretty_format_columns(&vector.vector_type_name(), &[array])
.map(|x| x.to_string())
.unwrap_or_else(|e| e.to_string())
}
#[cfg(test)]
mod tests {
use arrow::array::{

View File

@@ -0,0 +1,15 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) mod builder;

View File

@@ -0,0 +1,485 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{Result, TryFromValueSnafu, UnsupportedOperationSnafu};
use crate::prelude::{ValueRef, Vector, VectorRef};
use crate::types::JsonType;
use crate::value::StructValueRef;
use crate::vectors::{MutableVector, StructVectorBuilder};
struct JsonStructsBuilder {
json_type: JsonType,
inner: StructVectorBuilder,
}
impl JsonStructsBuilder {
fn new(json_type: JsonType, capacity: usize) -> Self {
let struct_type = json_type.as_struct_type();
let inner = StructVectorBuilder::with_type_and_capacity(struct_type, capacity);
Self { json_type, inner }
}
fn len(&self) -> usize {
self.inner.len()
}
fn push(&mut self, value: &ValueRef) -> Result<()> {
if self.json_type.is_plain_json() {
let value = ValueRef::Struct(StructValueRef::RefList {
val: vec![value.clone()],
fields: self.json_type.as_struct_type(),
});
self.inner.try_push_value_ref(&value)
} else {
self.inner.try_push_value_ref(value)
}
}
/// Try to merge (and consume the data of) another json vector builder into this one.
/// Note that the other builder's json type must be able to be merged with this one's
/// (this one's json type has all the fields of the other's, and no datatypes conflict).
/// Normally this is guaranteed, as long as json values are pushed through [JsonVectorBuilder].
fn try_merge(&mut self, other: &mut JsonStructsBuilder) -> Result<()> {
debug_assert!(self.json_type.is_mergeable(&other.json_type));
fn helper(this: &mut StructVectorBuilder, that: &mut StructVectorBuilder) -> Result<()> {
let that_len = that.len();
if let Some(x) = that.mut_null_buffer().finish() {
this.mut_null_buffer().append_buffer(&x)
} else {
this.mut_null_buffer().append_n_non_nulls(that_len);
}
let that_fields = that.struct_type().fields();
let mut that_builders = that_fields
.iter()
.zip(that.mut_value_builders().iter_mut())
.map(|(field, builder)| (field.name(), builder))
.collect::<HashMap<_, _>>();
for (field, this_builder) in this
.struct_type()
.fields()
.iter()
.zip(this.mut_value_builders().iter_mut())
{
if let Some(that_builder) = that_builders.get_mut(field.name()) {
if field.data_type().is_struct() {
let this = this_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: a struct datatype field must correspond to a struct vector builder.
.unwrap();
let that = that_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: the other builder's field with the same name must have the same datatype,
// ensured because the two json types are mergeable.
.unwrap();
helper(this, that)?;
} else {
let vector = that_builder.to_vector();
this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?;
}
} else {
this_builder.push_nulls(that_len);
}
}
Ok(())
}
helper(&mut self.inner, &mut other.inner)
}
/// Same as [JsonStructsBuilder::try_merge], but does not consume the other builder's data.
fn try_merge_cloned(&mut self, other: &JsonStructsBuilder) -> Result<()> {
debug_assert!(self.json_type.is_mergeable(&other.json_type));
fn helper(this: &mut StructVectorBuilder, that: &StructVectorBuilder) -> Result<()> {
let that_len = that.len();
if let Some(x) = that.null_buffer().finish_cloned() {
this.mut_null_buffer().append_buffer(&x)
} else {
this.mut_null_buffer().append_n_non_nulls(that_len);
}
let that_fields = that.struct_type().fields();
let that_builders = that_fields
.iter()
.zip(that.value_builders().iter())
.map(|(field, builder)| (field.name(), builder))
.collect::<HashMap<_, _>>();
for (field, this_builder) in this
.struct_type()
.fields()
.iter()
.zip(this.mut_value_builders().iter_mut())
{
if let Some(that_builder) = that_builders.get(field.name()) {
if field.data_type().is_struct() {
let this = this_builder
.as_mut_any()
.downcast_mut::<StructVectorBuilder>()
// Safety: a struct datatype field must correspond to a struct vector builder.
.unwrap();
let that = that_builder
.as_any()
.downcast_ref::<StructVectorBuilder>()
// Safety: the other builder's field with the same name must have the same datatype,
// ensured because the two json types are mergeable.
.unwrap();
helper(this, that)?;
} else {
let vector = that_builder.to_vector_cloned();
this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?;
}
} else {
this_builder.push_nulls(that_len);
}
}
Ok(())
}
helper(&mut self.inner, &other.inner)
}
}
/// The vector builder for json type values.
///
/// Json types are dynamic, to some degree (as long as they can be merged into each other). So are
/// json values. Json values are physically stored in struct vectors, which require the types of
/// struct values to be fixed inside a certain struct vector. So, to resolve the "dynamic" vs "fixed"
/// datatype problem, each type of json value gets its own struct vector builder in this builder.
/// Once a value of a new json type is pushed into this builder, a new "child" builder is created for it.
///
/// Given the "mixed" nature of the values stored in this builder, to produce the json vector, a
/// "merge" operation is performed. The "merge" is to iterate over all the "child" builders, and fill
/// nulls for missing json fields. The final vector's json type is fixed to be the "merge" of all
/// pushed json types.
pub(crate) struct JsonVectorBuilder {
merged_type: JsonType,
capacity: usize,
builders: Vec<JsonStructsBuilder>,
}
impl JsonVectorBuilder {
pub(crate) fn with_capacity(capacity: usize) -> Self {
Self {
merged_type: JsonType::empty(),
capacity,
builders: vec![],
}
}
fn try_create_new_builder(&mut self, json_type: &JsonType) -> Result<&mut JsonStructsBuilder> {
self.merged_type.merge(json_type)?;
let builder = JsonStructsBuilder::new(json_type.clone(), self.capacity);
self.builders.push(builder);
let len = self.builders.len();
Ok(&mut self.builders[len - 1])
}
}
impl MutableVector for JsonVectorBuilder {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::Json(self.merged_type.clone())
}
fn len(&self) -> usize {
self.builders.iter().map(|x| x.len()).sum()
}
fn as_any(&self) -> &dyn Any {
self
}
fn as_mut_any(&mut self) -> &mut dyn Any {
self
}
fn to_vector(&mut self) -> VectorRef {
// Fast path:
if self.builders.len() == 1 {
return self.builders[0].inner.to_vector();
}
let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity);
for builder in self.builders.iter_mut() {
unified_jsons
.try_merge(builder)
// Safety: the "unified_jsons" has the merged json type from all the builders,
// so it should merge them without errors.
.unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}"));
}
unified_jsons.inner.to_vector()
}
fn to_vector_cloned(&self) -> VectorRef {
// Fast path:
if self.builders.len() == 1 {
return self.builders[0].inner.to_vector_cloned();
}
let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity);
for builder in self.builders.iter() {
unified_jsons
.try_merge_cloned(builder)
// Safety: the "unified_jsons" has the merged json type from all the builders,
// so it should merge them without errors.
.unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}"));
}
unified_jsons.inner.to_vector_cloned()
}
fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> {
let data_type = value.data_type();
let json_type = data_type.as_json().with_context(|| TryFromValueSnafu {
reason: format!("expected json value, got {value:?}"),
})?;
let builder = match self.builders.last_mut() {
Some(last) => {
if &last.json_type != json_type {
self.try_create_new_builder(json_type)?
} else {
last
}
}
None => self.try_create_new_builder(json_type)?,
};
let ValueRef::Json(value) = value else {
// Safety: a value of json datatype must be a `ValueRef::Json`.
unreachable!()
};
builder.push(value)
}
fn push_null(&mut self) {
let null_json_value = ValueRef::Json(Box::new(ValueRef::Null));
self.try_push_value_ref(&null_json_value)
// Safety: learning from the method "try_push_value_ref", a null json value should be
// always able to push into any json vectors.
.unwrap_or_else(|e| {
panic!("failed to push null json value: {null_json_value:?}, error: {e}")
});
}
fn extend_slice_of(&mut self, _: &dyn Vector, _: usize, _: usize) -> Result<()> {
UnsupportedOperationSnafu {
op: "extend_slice_of",
vector_type: "JsonVector",
}
.fail()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::data_type::DataType;
use crate::json::JsonStructureSettings;
use crate::vectors::helper::pretty_print;
fn push(json: &str, builder: &mut JsonVectorBuilder, expected: std::result::Result<(), &str>) {
let settings = JsonStructureSettings::Structured(None);
let json: serde_json::Value = serde_json::from_str(json).unwrap();
let value = settings.encode(json).unwrap();
let value = value.as_value_ref();
let result = builder.try_push_value_ref(&value);
match (result, expected) {
(Ok(()), Ok(())) => (),
(Err(e), Err(expected)) => assert_eq!(e.to_string(), expected),
_ => unreachable!(),
}
}
#[test]
fn test_push_plain_jsons() -> Result<()> {
let jsons = vec!["1", "2", r#""s""#, "[true]"];
let results = vec![
Ok(()),
Ok(()),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: String",
),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: List<Boolean>",
),
];
let mut builder = JsonVectorBuilder::with_capacity(1);
for (json, result) in jsons.into_iter().zip(results.into_iter()) {
push(json, &mut builder, result);
}
let vector = builder.to_vector();
let expected = r#"
+----------------+
| StructVector |
+----------------+
| {__plain__: 1} |
| {__plain__: 2} |
+----------------+"#;
assert_eq!(pretty_print(vector), expected.trim());
Ok(())
}
#[test]
fn test_push_json_objects() -> Result<()> {
let jsons = vec![
r#"{
"s": "a",
"list": [1, 2, 3]
}"#,
r#"{
"list": [4],
"s": "b"
}"#,
r#"{
"s": "c",
"float": 0.9
}"#,
r#"{
"float": 0.8,
"s": "d"
}"#,
r#"{
"float": 0.7,
"int": -1
}"#,
r#"{
"int": 0,
"float": 0.6
}"#,
r#"{
"int": 1,
"object": {"hello": "world", "timestamp": 1761523200000}
}"#,
r#"{
"object": {"hello": "greptime", "timestamp": 1761523201000},
"int": 2
}"#,
r#"{
"object": {"timestamp": 1761523202000},
"nested": {"a": {"b": {"b": {"a": "abba"}}}}
}"#,
r#"{
"nested": {"a": {"b": {"a": {"b": "abab"}}}},
"object": {"timestamp": 1761523203000}
}"#,
];
let mut builder = JsonVectorBuilder::with_capacity(1);
for json in jsons {
push(json, &mut builder, Ok(()));
}
assert_eq!(builder.len(), 10);
// test children builders:
assert_eq!(builder.builders.len(), 6);
let expect_types = [
r#"Json<Struct<"list": List<Int64>, "s": String>>"#,
r#"Json<Struct<"float": Float64, "s": String>>"#,
r#"Json<Struct<"float": Float64, "int": Int64>>"#,
r#"Json<Struct<"int": Int64, "object": Struct<"hello": String, "timestamp": Int64>>>"#,
r#"Json<Struct<"nested": Struct<"a": Struct<"b": Struct<"b": Struct<"a": String>>>>, "object": Struct<"timestamp": Int64>>>"#,
r#"Json<Struct<"nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>>>>, "object": Struct<"timestamp": Int64>>>"#,
];
let expect_vectors = [
r#"
+-------------------------+
| StructVector |
+-------------------------+
| {list: [1, 2, 3], s: a} |
| {list: [4], s: b} |
+-------------------------+"#,
r#"
+--------------------+
| StructVector |
+--------------------+
| {float: 0.9, s: c} |
| {float: 0.8, s: d} |
+--------------------+"#,
r#"
+-----------------------+
| StructVector |
+-----------------------+
| {float: 0.7, int: -1} |
| {float: 0.6, int: 0} |
+-----------------------+"#,
r#"
+---------------------------------------------------------------+
| StructVector |
+---------------------------------------------------------------+
| {int: 1, object: {hello: world, timestamp: 1761523200000}} |
| {int: 2, object: {hello: greptime, timestamp: 1761523201000}} |
+---------------------------------------------------------------+"#,
r#"
+------------------------------------------------------------------------+
| StructVector |
+------------------------------------------------------------------------+
| {nested: {a: {b: {b: {a: abba}}}}, object: {timestamp: 1761523202000}} |
+------------------------------------------------------------------------+"#,
r#"
+------------------------------------------------------------------------+
| StructVector |
+------------------------------------------------------------------------+
| {nested: {a: {b: {a: {b: abab}}}}, object: {timestamp: 1761523203000}} |
+------------------------------------------------------------------------+"#,
];
for (builder, (expect_type, expect_vector)) in builder
.builders
.iter()
.zip(expect_types.into_iter().zip(expect_vectors.into_iter()))
{
assert_eq!(builder.json_type.name(), expect_type);
let vector = builder.inner.to_vector_cloned();
assert_eq!(pretty_print(vector), expect_vector.trim());
}
// test final merged json type:
let expected = r#"Json<Struct<"float": Float64, "int": Int64, "list": List<Int64>, "nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>, "b": Struct<"a": String>>>>, "object": Struct<"hello": String, "timestamp": Int64>, "s": String>>"#;
assert_eq!(builder.data_type().to_string(), expected);
// test final produced vector:
let expected = r#"
+-------------------------------------------------------------------------------------------------------------------+
| StructVector |
+-------------------------------------------------------------------------------------------------------------------+
| {float: , int: , list: [1, 2, 3], nested: , object: , s: a} |
| {float: , int: , list: [4], nested: , object: , s: b} |
| {float: 0.9, int: , list: , nested: , object: , s: c} |
| {float: 0.8, int: , list: , nested: , object: , s: d} |
| {float: 0.7, int: -1, list: , nested: , object: , s: } |
| {float: 0.6, int: 0, list: , nested: , object: , s: } |
| {float: , int: 1, list: , nested: , object: {hello: world, timestamp: 1761523200000}, s: } |
| {float: , int: 2, list: , nested: , object: {hello: greptime, timestamp: 1761523201000}, s: } |
| {float: , int: , list: , nested: {a: {b: {a: , b: {a: abba}}}}, object: {hello: , timestamp: 1761523202000}, s: } |
| {float: , int: , list: , nested: {a: {b: {a: {b: abab}, b: }}}, object: {hello: , timestamp: 1761523203000}, s: } |
+-------------------------------------------------------------------------------------------------------------------+"#;
let vector = builder.to_vector_cloned();
assert_eq!(pretty_print(vector), expected.trim());
let vector = builder.to_vector();
assert_eq!(pretty_print(vector), expected.trim());
Ok(())
}
}

View File
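The builder above keeps one child struct builder per distinct JSON shape and only unifies them when producing the vector, padding fields a shape never had with nulls. The merge-with-null-fill step can be sketched on a simplified row model, using plain maps instead of vector builders:

use std::collections::{BTreeMap, BTreeSet};

// Sketch: each child batch holds rows of one JSON shape; merging produces rows
// over the union of field names, with None for fields a shape never had.
type Row = BTreeMap<String, Option<i64>>;

fn merge_children(children: Vec<Vec<Row>>) -> Vec<Row> {
    // The "merged type" is the union of all field names seen in any child.
    let all_fields: BTreeSet<String> = children
        .iter()
        .flatten()
        .flat_map(|row| row.keys().cloned())
        .collect();
    let mut merged = Vec::new();
    for rows in children {
        for row in rows {
            let mut unified = Row::new();
            for field in &all_fields {
                // Missing fields are filled with null, as the real builder does.
                unified.insert(field.clone(), row.get(field).cloned().flatten());
            }
            merged.push(unified);
        }
    }
    merged
}

fn main() {
    let child_a = vec![BTreeMap::from([("int".to_string(), Some(1))])];
    let child_b = vec![BTreeMap::from([("float".to_string(), Some(9))])];
    for row in merge_children(vec![child_a, child_b]) {
        println!("{row:?}");
    }
}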

@@ -323,6 +323,26 @@ impl StructVectorBuilder {
}
self.null_buffer.append_null();
}
pub(crate) fn struct_type(&self) -> &StructType {
&self.fields
}
pub(crate) fn value_builders(&self) -> &[Box<dyn MutableVector>] {
&self.value_builders
}
pub(crate) fn mut_value_builders(&mut self) -> &mut [Box<dyn MutableVector>] {
&mut self.value_builders
}
pub(crate) fn null_buffer(&self) -> &NullBufferBuilder {
&self.null_buffer
}
pub(crate) fn mut_null_buffer(&mut self) -> &mut NullBufferBuilder {
&mut self.null_buffer
}
}
impl MutableVector for StructVectorBuilder {

View File

@@ -21,6 +21,7 @@ use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime};
use api::v1::{RowDeleteRequest, RowDeleteRequests, RowInsertRequest, RowInsertRequests};
use common_base::memory_limit::MemoryLimit;
use common_config::Configurable;
use common_error::ext::BoxedError;
use common_meta::key::TableMetadataManagerRef;
@@ -132,6 +133,7 @@ impl Default for FlownodeOptions {
query: QueryOptions {
parallelism: 1,
allow_query_fallback: false,
memory_pool_size: MemoryLimit::default(),
},
user_provider: None,
memory: MemoryOptions::default(),

View File

@@ -23,7 +23,7 @@ use api::v1::query_request::Query;
use api::v1::{CreateTableExpr, QueryRequest};
use client::{Client, Database};
use common_error::ext::{BoxedError, ErrorExt};
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_tls_config};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest;
@@ -123,12 +123,10 @@ impl FrontendClient {
let cfg = ChannelConfig::new()
.connect_timeout(batch_opts.grpc_conn_timeout)
.timeout(batch_opts.query_timeout);
if let Some(tls) = &batch_opts.frontend_tls {
let cfg = cfg.client_tls_config(tls.clone());
ChannelManager::with_tls_config(cfg).context(InvalidClientConfigSnafu)?
} else {
ChannelManager::with_config(cfg)
}
let tls_config = load_tls_config(batch_opts.frontend_tls.as_ref())
.context(InvalidClientConfigSnafu)?;
ChannelManager::with_config(cfg, tls_config)
},
auth,
query,

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use async_trait::async_trait;
@@ -28,6 +28,7 @@ use common_function::scalars::udf::create_udf;
use common_query::{Output, OutputData};
use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::util;
use common_telemetry::warn;
use datafusion::dataframe::DataFrame;
use datafusion::execution::SessionStateBuilder;
use datafusion::execution::context::SessionContext;
@@ -42,8 +43,9 @@ use servers::error::{
};
use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams};
use servers::otlp::trace::{
DURATION_NANO_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_KIND_COLUMN,
SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
DURATION_NANO_COLUMN, KEY_OTEL_STATUS_ERROR_KEY, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN,
SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_ERROR,
TIMESTAMP_COLUMN, TRACE_ID_COLUMN,
};
use servers::query_handler::JaegerQueryHandler;
use session::context::QueryContextRef;
@@ -126,6 +128,7 @@ impl JaegerQueryHandler for Instance {
trace_id: &str,
start_time: Option<i64>,
end_time: Option<i64>,
limit: Option<usize>,
) -> ServerResult<Output> {
// It's equivalent to the following SQL query:
//
@@ -153,6 +156,13 @@ impl JaegerQueryHandler for Instance {
filters.push(col(TIMESTAMP_COLUMN).lt_eq(lit_timestamp_nano(end_time)));
}
let limit = if start_time.is_some() && end_time.is_some() {
// allow an unbounded limit if a time range is specified
limit
} else {
limit.or(Some(DEFAULT_LIMIT))
};
Ok(query_trace_table(
ctx,
self.catalog_manager(),
@@ -160,7 +170,7 @@ impl JaegerQueryHandler for Instance {
selects,
filters,
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
Some(DEFAULT_LIMIT),
limit,
None,
vec![],
)
@@ -263,7 +273,7 @@ impl JaegerQueryHandler for Instance {
self.query_engine(),
vec![wildcard()],
filters,
vec![],
vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order.
None,
None,
vec![],
@@ -322,6 +332,7 @@ async fn query_trace_table(
})?;
let is_data_model_v1 = table
.clone()
.table_info()
.meta
.options
@@ -330,6 +341,14 @@ async fn query_trace_table(
.map(|s| s.as_str())
== Some(TABLE_DATA_MODEL_TRACE_V1);
// collect to set
let col_names = table
.table_info()
.meta
.field_column_names()
.map(|s| format!("\"{}\"", s))
.collect::<HashSet<String>>();
let df_context = create_df_context(query_engine)?;
let dataframe = df_context
@@ -342,7 +361,7 @@ async fn query_trace_table(
let dataframe = filters
.into_iter()
.chain(tags.map_or(Ok(vec![]), |t| {
tags_filters(&dataframe, t, is_data_model_v1)
tags_filters(&dataframe, t, is_data_model_v1, &col_names)
})?)
.try_fold(dataframe, |df, expr| {
df.filter(expr).context(DataFusionSnafu)
@@ -472,23 +491,73 @@ fn json_tag_filters(
Ok(filters)
}
fn flatten_tag_filters(tags: HashMap<String, JsonValue>) -> ServerResult<Vec<Expr>> {
/// Helper function to check if span_key or resource_key exists in col_names and create an expression.
/// If neither exists, logs a warning and returns None.
#[inline]
fn check_col_and_build_expr<F>(
span_key: String,
resource_key: String,
key: &str,
col_names: &HashSet<String>,
expr_builder: F,
) -> Option<Expr>
where
F: FnOnce(String) -> Expr,
{
if col_names.contains(&span_key) {
return Some(expr_builder(span_key));
}
if col_names.contains(&resource_key) {
return Some(expr_builder(resource_key));
}
warn!("tag key {} not found in table columns", key);
None
}
fn flatten_tag_filters(
tags: HashMap<String, JsonValue>,
col_names: &HashSet<String>,
) -> ServerResult<Vec<Expr>> {
let filters = tags
.into_iter()
.filter_map(|(key, value)| {
let key = format!("\"span_attributes.{}\"", key);
if key == KEY_OTEL_STATUS_ERROR_KEY && value == JsonValue::Bool(true) {
return Some(col(SPAN_STATUS_CODE).eq(lit(SPAN_STATUS_ERROR)));
}
// TODO(shuiyisong): add more precise mapping from key to col name
let span_key = format!("\"span_attributes.{}\"", key);
let resource_key = format!("\"resource_attributes.{}\"", key);
match value {
JsonValue::String(value) => Some(col(key).eq(lit(value))),
JsonValue::String(value) => {
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
}
JsonValue::Number(value) => {
if value.is_f64() {
// safe to unwrap as checked previously
Some(col(key).eq(lit(value.as_f64().unwrap())))
let value = value.as_f64().unwrap();
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
} else {
Some(col(key).eq(lit(value.as_i64().unwrap())))
let value = value.as_i64().unwrap();
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
}
}
JsonValue::Bool(value) => Some(col(key).eq(lit(value))),
JsonValue::Null => Some(col(key).is_null()),
JsonValue::Bool(value) => {
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).eq(lit(value))
})
}
JsonValue::Null => {
check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| {
col(k).is_null()
})
}
// not supported at the moment
JsonValue::Array(_value) => None,
JsonValue::Object(_value) => None,
@@ -502,9 +571,10 @@ fn tags_filters(
dataframe: &DataFrame,
tags: HashMap<String, JsonValue>,
is_data_model_v1: bool,
col_names: &HashSet<String>,
) -> ServerResult<Vec<Expr>> {
if is_data_model_v1 {
flatten_tag_filters(tags)
flatten_tag_filters(tags, col_names)
} else {
json_tag_filters(dataframe, tags)
}

View File
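check_col_and_build_expr above tries the span_attributes.<key> column first, falls back to resource_attributes.<key>, and drops the filter with a warning when neither column exists. A standalone sketch of that lookup order, returning the chosen column name instead of a DataFusion Expr:

use std::collections::HashSet;

// Sketch: pick the first existing column for a tag key, or skip the filter.
fn pick_tag_column(key: &str, col_names: &HashSet<String>) -> Option<String> {
    let span_key = format!("\"span_attributes.{key}\"");
    let resource_key = format!("\"resource_attributes.{key}\"");
    if col_names.contains(&span_key) {
        return Some(span_key);
    }
    if col_names.contains(&resource_key) {
        return Some(resource_key);
    }
    // The real code logs a warning here and produces no filter expression.
    None
}

fn main() {
    let cols: HashSet<String> =
        ["\"span_attributes.http.method\"", "\"resource_attributes.host.name\""]
            .into_iter()
            .map(String::from)
            .collect();
    assert_eq!(
        pick_tag_column("http.method", &cols).as_deref(),
        Some("\"span_attributes.http.method\"")
    );
    assert_eq!(
        pick_tag_column("host.name", &cols).as_deref(),
        Some("\"resource_attributes.host.name\"")
    );
    assert_eq!(pick_tag_column("missing", &cols), None);
    println!("tag column fallback ok");
}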

@@ -36,7 +36,7 @@ async fn run() {
.timeout(Duration::from_secs(3))
.connect_timeout(Duration::from_secs(5))
.tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config);
let channel_manager = ChannelManager::with_config(config, None);
let mut meta_client = MetaClientBuilder::datanode_default_options(id)
.channel_manager(channel_manager)
.build();

View File

@@ -101,7 +101,7 @@ pub async fn create_meta_client(
if let MetaClientType::Frontend = client_type {
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config));
builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config, None));
if let Some(plugins) = plugins {
let region_follower = plugins.get::<RegionFollowerClientRef>();
if let Some(region_follower) = region_follower {
@@ -112,8 +112,8 @@ pub async fn create_meta_client(
}
builder = builder
.channel_manager(ChannelManager::with_config(base_config))
.heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config));
.channel_manager(ChannelManager::with_config(base_config, None))
.heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config, None));
let mut meta_client = builder.build();

View File

@@ -72,7 +72,10 @@ serde.workspace = true
serde_json.workspace = true
servers.workspace = true
snafu.workspace = true
sqlx = { workspace = true, optional = true }
sqlx = { workspace = true, features = [
"mysql",
"chrono",
], optional = true }
store-api.workspace = true
strum.workspace = true
table.workspace = true

View File

@@ -26,6 +26,7 @@ use common_meta::distributed_time_constants::{
use common_meta::error::Result;
use common_meta::peer::{Peer, PeerDiscovery, PeerResolver};
use common_meta::{DatanodeId, FlownodeId};
use common_time::util::DefaultSystemTimer;
use snafu::ResultExt;
use crate::cluster::MetaPeerClient;
@@ -35,6 +36,7 @@ use crate::discovery::lease::{LeaseValueAccessor, LeaseValueType};
impl PeerDiscovery for MetaPeerClient {
async fn active_frontends(&self) -> Result<Vec<Peer>> {
utils::alive_frontends(
&DefaultSystemTimer,
self,
Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS),
)
@@ -47,20 +49,30 @@ impl PeerDiscovery for MetaPeerClient {
&self,
filter: Option<for<'a> fn(&'a NodeWorkloads) -> bool>,
) -> Result<Vec<Peer>> {
utils::alive_datanodes(self, Duration::from_secs(DATANODE_LEASE_SECS), filter)
.await
.map_err(BoxedError::new)
.context(common_meta::error::ExternalSnafu)
utils::alive_datanodes(
&DefaultSystemTimer,
self,
Duration::from_secs(DATANODE_LEASE_SECS),
filter,
)
.await
.map_err(BoxedError::new)
.context(common_meta::error::ExternalSnafu)
}
async fn active_flownodes(
&self,
filter: Option<for<'a> fn(&'a NodeWorkloads) -> bool>,
) -> Result<Vec<Peer>> {
utils::alive_flownodes(self, Duration::from_secs(FLOWNODE_LEASE_SECS), filter)
.await
.map_err(BoxedError::new)
.context(common_meta::error::ExternalSnafu)
utils::alive_flownodes(
&DefaultSystemTimer,
self,
Duration::from_secs(FLOWNODE_LEASE_SECS),
filter,
)
.await
.map_err(BoxedError::new)
.context(common_meta::error::ExternalSnafu)
}
}

View File

@@ -95,20 +95,22 @@ impl LeaseValueAccessor for MetaPeerClient {
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};
use std::time::Duration;
use api::v1::meta::DatanodeWorkloads;
use api::v1::meta::heartbeat_request::NodeWorkloads;
use api::v1::meta::{DatanodeWorkloads, FlownodeWorkloads};
use common_meta::cluster::{FrontendStatus, NodeInfo, NodeInfoKey, NodeStatus, Role};
use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS;
use common_meta::kv_backend::ResettableKvBackendRef;
use common_meta::peer::{Peer, PeerDiscovery};
use common_meta::rpc::store::PutRequest;
use common_time::util::current_time_millis;
use common_time::util::{DefaultSystemTimer, SystemTimer, current_time_millis};
use common_workload::DatanodeWorkloadType;
use crate::discovery::utils::{self, accept_ingest_workload};
use crate::key::{DatanodeLeaseKey, LeaseValue};
use crate::key::{DatanodeLeaseKey, FlownodeLeaseKey, LeaseValue};
use crate::test_util::create_meta_peer_client;
async fn put_lease_value(
@@ -126,17 +128,47 @@ mod tests {
.unwrap();
}
async fn put_flownode_lease_value(
kv_backend: &ResettableKvBackendRef,
key: FlownodeLeaseKey,
value: LeaseValue,
) {
kv_backend
.put(PutRequest {
key: key.try_into().unwrap(),
value: value.try_into().unwrap(),
prev_kv: false,
})
.await
.unwrap();
}
struct MockTimer {
current: Arc<AtomicI64>,
}
impl SystemTimer for MockTimer {
fn current_time_millis(&self) -> i64 {
self.current.fetch_add(1, Ordering::Relaxed)
}
fn current_time_rfc3339(&self) -> String {
unimplemented!()
}
}
#[tokio::test]
async fn test_alive_datanodes() {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = DefaultSystemTimer;
// put a stale lease value for node 1
let key = DatanodeLeaseKey { node_id: 1 };
let value = LeaseValue {
// 20s ago
timestamp_millis: current_time_millis() - lease_secs * 2 * 1000,
timestamp_millis: timer.current_time_millis() - lease_secs * 2 * 1000,
node_addr: "127.0.0.1:20201".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![DatanodeWorkloadType::Hybrid as i32],
@@ -147,7 +179,7 @@ mod tests {
// put a fresh lease value for node 2
let key = DatanodeLeaseKey { node_id: 2 };
let value = LeaseValue {
timestamp_millis: current_time_millis(),
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20202".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![DatanodeWorkloadType::Hybrid as i32],
@@ -155,6 +187,37 @@ mod tests {
};
put_lease_value(&in_memory, key.clone(), value.clone()).await;
let peers = utils::alive_datanodes(
&timer,
client.as_ref(),
Duration::from_secs(lease_secs as u64),
None,
)
.await
.unwrap();
assert_eq!(peers.len(), 1);
assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]);
}
#[tokio::test]
async fn test_alive_datanodes_with_timer() {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = MockTimer {
current: Arc::new(AtomicI64::new(current_time_millis())),
};
let key = DatanodeLeaseKey { node_id: 2 };
let value = LeaseValue {
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20202".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![DatanodeWorkloadType::Hybrid as i32],
}),
};
put_lease_value(&in_memory, key.clone(), value.clone()).await;
let peers = utils::alive_datanodes(
&timer,
client.as_ref(),
Duration::from_secs(lease_secs as u64),
None,
@@ -170,12 +233,13 @@ mod tests {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = DefaultSystemTimer;
// put a lease value for node 1 without mode info
let key = DatanodeLeaseKey { node_id: 1 };
let value = LeaseValue {
// 20s ago
timestamp_millis: current_time_millis() - 20 * 1000,
timestamp_millis: timer.current_time_millis() - 20 * 1000,
node_addr: "127.0.0.1:20201".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![DatanodeWorkloadType::Hybrid as i32],
@@ -186,7 +250,7 @@ mod tests {
// put a lease value for node 2 with mode info
let key = DatanodeLeaseKey { node_id: 2 };
let value = LeaseValue {
timestamp_millis: current_time_millis(),
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20202".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![DatanodeWorkloadType::Hybrid as i32],
@@ -197,7 +261,7 @@ mod tests {
// put a lease value for node 3 with mode info
let key = DatanodeLeaseKey { node_id: 3 };
let value = LeaseValue {
timestamp_millis: current_time_millis(),
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20203".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![i32::MAX],
@@ -208,7 +272,7 @@ mod tests {
// put a lease value for node 4 with mode info
let key = DatanodeLeaseKey { node_id: 4 };
let value = LeaseValue {
timestamp_millis: current_time_millis(),
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20204".to_string(),
workloads: NodeWorkloads::Datanode(DatanodeWorkloads {
types: vec![i32::MAX],
@@ -217,6 +281,7 @@ mod tests {
put_lease_value(&in_memory, key, value).await;
let peers = utils::alive_datanodes(
&timer,
client.as_ref(),
Duration::from_secs(lease_secs),
Some(accept_ingest_workload),
@@ -227,18 +292,84 @@ mod tests {
assert!(peers.contains(&Peer::new(2, "127.0.0.1:20202".to_string())));
}
#[tokio::test]
async fn test_alive_flownodes() {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = DefaultSystemTimer;
// put a stale lease value for node 1
let key = FlownodeLeaseKey { node_id: 1 };
let value = LeaseValue {
// 20s ago
timestamp_millis: timer.current_time_millis() - lease_secs * 2 * 1000,
node_addr: "127.0.0.1:20201".to_string(),
workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }),
};
put_flownode_lease_value(&in_memory, key, value).await;
// put a fresh lease value for node 2
let key = FlownodeLeaseKey { node_id: 2 };
let value = LeaseValue {
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20202".to_string(),
workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }),
};
put_flownode_lease_value(&in_memory, key.clone(), value.clone()).await;
let peers = utils::alive_flownodes(
&timer,
client.as_ref(),
Duration::from_secs(lease_secs as u64),
None,
)
.await
.unwrap();
assert_eq!(peers.len(), 1);
assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]);
}
#[tokio::test]
async fn test_alive_flownodes_with_timer() {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = MockTimer {
current: Arc::new(AtomicI64::new(current_time_millis())),
};
let key = FlownodeLeaseKey { node_id: 2 };
let value = LeaseValue {
timestamp_millis: timer.current_time_millis(),
node_addr: "127.0.0.1:20202".to_string(),
workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }),
};
put_flownode_lease_value(&in_memory, key.clone(), value.clone()).await;
let peers = utils::alive_flownodes(
&timer,
client.as_ref(),
Duration::from_secs(lease_secs as u64),
None,
)
.await
.unwrap();
assert_eq!(peers.len(), 1);
assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]);
}
#[tokio::test]
async fn test_lookup_frontends() {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = DefaultSystemTimer;
let active_frontend_node = NodeInfo {
peer: Peer {
id: 0,
addr: "127.0.0.1:20201".to_string(),
},
last_activity_ts: current_time_millis(),
last_activity_ts: timer.current_time_millis(),
status: NodeStatus::Frontend(FrontendStatus {}),
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
@@ -266,7 +397,7 @@ mod tests {
id: 1,
addr: "127.0.0.1:20201".to_string(),
},
last_activity_ts: current_time_millis() - 20 * 1000,
last_activity_ts: timer.current_time_millis() - 20 * 1000,
status: NodeStatus::Frontend(FrontendStatus {}),
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
@@ -287,9 +418,52 @@ mod tests {
.await
.unwrap();
let peers = utils::alive_frontends(client.as_ref(), Duration::from_secs(lease_secs))
let peers =
utils::alive_frontends(&timer, client.as_ref(), Duration::from_secs(lease_secs))
.await
.unwrap();
assert_eq!(peers.len(), 1);
assert_eq!(peers[0].id, 0);
}
#[tokio::test]
async fn test_lookup_frontends_with_timer() {
let client = create_meta_peer_client();
let in_memory = client.memory_backend();
let lease_secs = 10;
let timer = MockTimer {
current: Arc::new(AtomicI64::new(current_time_millis())),
};
let active_frontend_node = NodeInfo {
peer: Peer {
id: 0,
addr: "127.0.0.1:20201".to_string(),
},
last_activity_ts: timer.current_time_millis(),
status: NodeStatus::Frontend(FrontendStatus {}),
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: current_time_millis() as u64,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
let key_prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend);
in_memory
.put(PutRequest {
key: format!("{}{}", key_prefix, "0").into(),
value: active_frontend_node.try_into().unwrap(),
prev_kv: false,
})
.await
.unwrap();
let peers =
utils::alive_frontends(&timer, client.as_ref(), Duration::from_secs(lease_secs))
.await
.unwrap();
assert_eq!(peers.len(), 1);
assert_eq!(peers[0].id, 0);
}

View File

@@ -19,7 +19,7 @@ use common_meta::DatanodeId;
use common_meta::cluster::NodeInfo;
use common_meta::kv_backend::KvBackendRef;
use common_meta::peer::Peer;
use common_time::util::{DefaultSystemTimer, SystemTimer};
use common_time::util::SystemTimer;
use common_workload::DatanodeWorkloadType;
use snafu::ResultExt;
@@ -49,16 +49,9 @@ pub trait LastActiveTs {
/// Builds a filter closure that checks whether a [`LastActiveTs`] item
/// is still within the specified active duration, relative to the
/// current time provided by the given [`SystemTimer`].
///
/// The returned closure uses the timestamp at the time of building,
/// so the "now" reference point is fixed when this function is called.
pub fn build_active_filter<T: LastActiveTs>(
timer: impl SystemTimer,
active_duration: Duration,
) -> impl Fn(&T) -> bool {
let now = timer.current_time_millis();
let active_duration = active_duration.as_millis() as u64;
move |item: &T| {
pub fn build_active_filter<T: LastActiveTs>(active_duration: Duration) -> impl Fn(i64, &T) -> bool {
move |now: i64, item: &T| {
let active_duration = active_duration.as_millis() as u64;
let elapsed = now.saturating_sub(item.last_active_ts()) as u64;
elapsed < active_duration
}
@@ -66,18 +59,19 @@ pub fn build_active_filter<T: LastActiveTs>(
/// Returns the alive datanodes.
pub async fn alive_datanodes(
timer: &impl SystemTimer,
accessor: &impl LeaseValueAccessor,
active_duration: Duration,
condition: Option<fn(&NodeWorkloads) -> bool>,
) -> Result<Vec<Peer>> {
let active_filter = build_active_filter(DefaultSystemTimer, active_duration);
let active_filter = build_active_filter(active_duration);
let condition = condition.unwrap_or(|_| true);
Ok(accessor
.lease_values(LeaseValueType::Datanode)
.await?
let lease_values = accessor.lease_values(LeaseValueType::Datanode).await?;
let now = timer.current_time_millis();
Ok(lease_values
.into_iter()
.filter_map(|(peer_id, lease_value)| {
if active_filter(&lease_value) && condition(&lease_value.workloads) {
if active_filter(now, &lease_value) && condition(&lease_value.workloads) {
Some(Peer::new(peer_id, lease_value.node_addr))
} else {
None
@@ -88,18 +82,19 @@ pub async fn alive_datanodes(
/// Returns the alive flownodes.
pub async fn alive_flownodes(
timer: &impl SystemTimer,
accessor: &impl LeaseValueAccessor,
active_duration: Duration,
condition: Option<fn(&NodeWorkloads) -> bool>,
) -> Result<Vec<Peer>> {
let active_filter = build_active_filter(DefaultSystemTimer, active_duration);
let active_filter = build_active_filter(active_duration);
let condition = condition.unwrap_or(|_| true);
Ok(accessor
.lease_values(LeaseValueType::Flownode)
.await?
let lease_values = accessor.lease_values(LeaseValueType::Flownode).await?;
let now = timer.current_time_millis();
Ok(lease_values
.into_iter()
.filter_map(|(peer_id, lease_value)| {
if active_filter(&lease_value) && condition(&lease_value.workloads) {
if active_filter(now, &lease_value) && condition(&lease_value.workloads) {
Some(Peer::new(peer_id, lease_value.node_addr))
} else {
None
@@ -110,16 +105,17 @@ pub async fn alive_flownodes(
/// Returns the alive frontends.
pub async fn alive_frontends(
timer: &impl SystemTimer,
lister: &impl NodeInfoAccessor,
active_duration: Duration,
) -> Result<Vec<Peer>> {
let active_filter = build_active_filter(DefaultSystemTimer, active_duration);
Ok(lister
.node_infos(NodeInfoType::Frontend)
.await?
let active_filter = build_active_filter(active_duration);
let node_infos = lister.node_infos(NodeInfoType::Frontend).await?;
let now = timer.current_time_millis();
Ok(node_infos
.into_iter()
.filter_map(|(_, node_info)| {
if active_filter(&node_info) {
if active_filter(now, &node_info) {
Some(node_info.peer)
} else {
None
@@ -130,15 +126,18 @@ pub async fn alive_frontends(
/// Returns the alive datanode peer.
pub async fn alive_datanode(
timer: &impl SystemTimer,
lister: &impl LeaseValueAccessor,
peer_id: u64,
active_duration: Duration,
) -> Result<Option<Peer>> {
let active_filter = build_active_filter(DefaultSystemTimer, active_duration);
let v = lister
let active_filter = build_active_filter(active_duration);
let lease_value = lister
.lease_value(LeaseValueType::Datanode, peer_id)
.await?
.filter(|(_, lease)| active_filter(lease))
.await?;
let now = timer.current_time_millis();
let v = lease_value
.filter(|(_, lease)| active_filter(now, lease))
.map(|(peer_id, lease)| Peer::new(peer_id, lease.node_addr));
Ok(v)
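
The net effect of this refactor is that `build_active_filter` no longer captures `now` when the closure is built; callers read the clock once, after the lease values have been fetched, and pass that timestamp into every check. A rough self-contained sketch of the new shape (the `Lease` struct here is a hypothetical stand-in for the real lease and node-info values):

```rust
use std::time::Duration;

/// Hypothetical stand-in for the lease / node-info values filtered in the real code.
struct Lease {
    last_active_ts_millis: i64,
}

/// Mirrors the new signature: the duration is fixed up front, but `now` is
/// supplied on every call instead of being captured at build time.
fn build_active_filter(active_duration: Duration) -> impl Fn(i64, &Lease) -> bool {
    let active_millis = active_duration.as_millis() as u64;
    move |now: i64, lease: &Lease| {
        let elapsed = now.saturating_sub(lease.last_active_ts_millis) as u64;
        elapsed < active_millis
    }
}

fn main() {
    let filter = build_active_filter(Duration::from_secs(10));
    let now = 1_000_000_i64;
    let fresh = Lease { last_active_ts_millis: now - 2_000 };
    let stale = Lease { last_active_ts_millis: now - 20_000 };
    assert!(filter(now, &fresh));
    assert!(!filter(now, &stale));
    // Reading the clock only after the lease values are loaded (and passing it
    // in here) avoids misclassifying leases written while the fetch was running.
}
```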

View File

@@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, warn};
use common_telemetry::{error, info, warn};
use common_time::Timestamp;
use snafu::{OptionExt, ResultExt, ensure};
use sqlx::mysql::{MySqlArguments, MySqlRow};
@@ -645,6 +645,13 @@ impl Election for MySqlElection {
}
async fn reset_campaign(&self) {
info!("Resetting campaign");
if self.is_leader.load(Ordering::Relaxed) {
if let Err(err) = self.step_down_without_lock().await {
error!(err; "Failed to step down without lock");
}
info!("Step down without lock successfully, due to reset campaign");
}
if let Err(err) = self.client.lock().await.reset_client().await {
error!(err; "Failed to reset client");
}

View File

@@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, warn};
use common_telemetry::{error, info, warn};
use common_time::Timestamp;
use deadpool_postgres::{Manager, Pool};
use snafu::{OptionExt, ResultExt, ensure};
@@ -477,6 +477,13 @@ impl Election for PgElection {
}
async fn reset_campaign(&self) {
info!("Resetting campaign");
if self.is_leader.load(Ordering::Relaxed) {
if let Err(err) = self.step_down_without_lock().await {
error!(err; "Failed to step down without lock");
}
info!("Step down without lock successfully, due to reset campaign");
}
if let Err(err) = self.pg_client.write().await.reset_client().await {
error!(err; "Failed to reset client");
}
@@ -774,16 +781,12 @@ impl PgElection {
key: key.clone(),
..Default::default()
};
if self
.is_leader
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
.is_ok()
&& let Err(e) = self
.leader_watcher
.send(LeaderChangeMessage::StepDown(Arc::new(leader_key)))
{
error!(e; "Failed to send leader change message");
}
send_leader_change_and_set_flags(
&self.is_leader,
&self.leader_infancy,
&self.leader_watcher,
LeaderChangeMessage::StepDown(Arc::new(leader_key)),
);
Ok(())
}
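
The inline compare-exchange-and-notify block removed above is folded into a shared `send_leader_change_and_set_flags` helper. Its body is not part of this diff, so the following is only a guess at what such a helper plausibly does, reconstructed from the removed code and the arguments it now receives; the infancy-flag handling and the channel type in particular are assumptions:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::mpsc::Sender;

/// Placeholder for the crate's real LeaderChangeMessage / leader key types.
#[derive(Debug)]
enum LeaderChangeMessage {
    StepDown(Arc<String>),
}

/// Hypothetical reconstruction, pieced together from the removed inline block
/// and the call-site arguments; the real implementation may differ in detail.
fn send_leader_change_and_set_flags(
    is_leader: &AtomicBool,
    leader_infancy: &AtomicBool,
    leader_watcher: &Sender<LeaderChangeMessage>,
    msg: LeaderChangeMessage,
) {
    // Assumption: stepping down also clears the "infancy" flag so a later
    // re-election starts its grace period from scratch.
    leader_infancy.store(false, Ordering::Release);
    if is_leader
        .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
        .is_ok()
        && leader_watcher.send(msg).is_err()
    {
        eprintln!("Failed to send leader change message");
    }
}

fn main() {
    let (tx, rx) = std::sync::mpsc::channel();
    let is_leader = AtomicBool::new(true);
    let leader_infancy = AtomicBool::new(true);
    send_leader_change_and_set_flags(
        &is_leader,
        &leader_infancy,
        &tx,
        LeaderChangeMessage::StepDown(Arc::new("election-key".to_string())),
    );
    assert!(!is_leader.load(Ordering::Acquire));
    assert!(matches!(rx.recv().unwrap(), LeaderChangeMessage::StepDown(_)));
}
```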

View File

@@ -62,10 +62,12 @@ pub(crate) struct RegionMigrationEvent {
impl RegionMigrationEvent {
pub fn from_persistent_ctx(ctx: &PersistentContext) -> Self {
// FIXME(weny): handle multiple region ids.
let region_id = ctx.region_ids[0];
Self {
region_id: ctx.region_id,
table_id: ctx.region_id.table_id(),
region_number: ctx.region_id.region_number(),
region_id,
table_id: region_id.table_id(),
region_number: region_id.region_number(),
trigger_reason: ctx.trigger_reason,
src_node_id: ctx.from_peer.id,
src_peer_addr: ctx.from_peer.addr.clone(),

View File

@@ -19,6 +19,7 @@ use api::v1::meta::{HeartbeatRequest, RegionLease, Role};
use async_trait::async_trait;
use common_meta::key::TableMetadataManagerRef;
use common_meta::region_keeper::MemoryRegionKeeperRef;
use common_telemetry::error;
use store_api::region_engine::GrantedRegion;
use store_api::storage::RegionId;
@@ -83,36 +84,44 @@ impl HeartbeatHandler for RegionLeaseHandler {
let regions = stat.regions();
let datanode_id = stat.id;
let RenewRegionLeasesResponse {
non_exists,
renewed,
} = self
match self
.region_lease_keeper
.renew_region_leases(datanode_id, &regions)
.await?;
.await
{
Ok(RenewRegionLeasesResponse {
non_exists,
renewed,
}) => {
let renewed = if let Some(renewer) = &self.customized_region_lease_renewer {
renewer
.renew(ctx, renewed)
.into_iter()
.map(|region| region.into())
.collect()
} else {
renewed
.into_iter()
.map(|(region_id, region_lease_info)| {
GrantedRegion::new(region_id, region_lease_info.role).into()
})
.collect::<Vec<_>>()
};
let renewed = if let Some(renewer) = &self.customized_region_lease_renewer {
renewer
.renew(ctx, renewed)
.into_iter()
.map(|region| region.into())
.collect()
} else {
renewed
.into_iter()
.map(|(region_id, region_lease_info)| {
GrantedRegion::new(region_id, region_lease_info.role).into()
})
.collect::<Vec<_>>()
};
acc.region_lease = Some(RegionLease {
regions: renewed,
duration_since_epoch: req.duration_since_epoch,
lease_seconds: self.region_lease_seconds,
closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(),
});
acc.inactive_region_ids = non_exists;
acc.region_lease = Some(RegionLease {
regions: renewed,
duration_since_epoch: req.duration_since_epoch,
lease_seconds: self.region_lease_seconds,
closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(),
});
acc.inactive_region_ids = non_exists;
}
Err(e) => {
error!(e; "Failed to renew region leases for datanode: {datanode_id:?}, regions: {:?}", regions);
// If we throw error here, the datanode will be marked as failure by region failure handler.
// So we only log the error and continue.
}
}
Ok(HandleControl::Continue)
}

View File

@@ -49,6 +49,7 @@ use common_procedure::options::ProcedureConfig;
use common_stat::ResourceStatRef;
use common_telemetry::logging::{LoggingOptions, TracingOptions};
use common_telemetry::{error, info, warn};
use common_time::util::DefaultSystemTimer;
use common_wal::config::MetasrvWalConfig;
use serde::{Deserialize, Serialize};
use servers::export_metrics::ExportMetricsOption;
@@ -375,12 +376,14 @@ pub struct MetasrvNodeInfo {
// The node total cpu millicores
#[serde(default)]
pub total_cpu_millicores: i64,
#[serde(default)]
// The node total memory bytes
#[serde(default)]
pub total_memory_bytes: i64,
/// The node cpu usage millicores
#[serde(default)]
pub cpu_usage_millicores: i64,
/// The node memory usage bytes
#[serde(default)]
pub memory_usage_bytes: i64,
// The node hostname
#[serde(default)]
@@ -733,6 +736,7 @@ impl Metasrv {
/// A datanode is considered alive when it's still within the lease period.
pub(crate) async fn lookup_datanode_peer(&self, peer_id: u64) -> Result<Option<Peer>> {
discovery::utils::alive_datanode(
&DefaultSystemTimer,
self.meta_peer_client.as_ref(),
peer_id,
Duration::from_secs(distributed_time_constants::DATANODE_LEASE_SECS),
@@ -858,3 +862,18 @@ impl Metasrv {
}
}
}
#[cfg(test)]
mod tests {
use crate::metasrv::MetasrvNodeInfo;
#[test]
fn test_deserialize_metasrv_node_info() {
let str = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#;
let node_info: MetasrvNodeInfo = serde_json::from_str(str).unwrap();
assert_eq!(node_info.addr, "127.0.0.1:4002");
assert_eq!(node_info.version, "0.1.0");
assert_eq!(node_info.git_commit, "1234567890");
assert_eq!(node_info.start_time_ms, 1715145600);
}
}
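
The new test exercises backward compatibility: payloads from older metasrv versions carry none of the resource fields, and `#[serde(default)]` makes them fall back to zero instead of failing deserialization. A trimmed-down illustration of that behavior (the struct below is a stand-in, not the real `MetasrvNodeInfo`):

```rust
use serde::Deserialize;

/// Trimmed-down stand-in for the node-info struct: only the fields needed to
/// show how `#[serde(default)]` keeps old payloads deserializable.
#[derive(Debug, Deserialize)]
struct NodeInfoCompat {
    addr: String,
    #[serde(default)]
    total_cpu_millicores: i64,
    #[serde(default)]
    memory_usage_bytes: i64,
}

fn main() {
    // Older metasrv versions never wrote the resource fields.
    let old_payload = r#"{"addr":"127.0.0.1:4002"}"#;
    let info: NodeInfoCompat = serde_json::from_str(old_payload).unwrap();
    assert_eq!(info.addr, "127.0.0.1:4002");
    // Missing numeric fields default to 0 instead of failing deserialization.
    assert_eq!(info.total_cpu_millicores, 0);
    assert_eq!(info.memory_usage_bytes, 0);
}
```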

View File

@@ -373,7 +373,8 @@ impl MetasrvBuilder {
runtime_switch_manager.clone(),
meta_peer_client.clone(),
leader_cached_kv_backend.clone(),
);
)
.with_state(state.clone());
Some(RegionFailureHandler::new(
region_supervisor,

View File

@@ -134,7 +134,7 @@ pub async fn mock(
.timeout(Duration::from_secs(10))
.connect_timeout(Duration::from_secs(10))
.tcp_nodelay(true);
let channel_manager = ChannelManager::with_config(config);
let channel_manager = ChannelManager::with_config(config, None);
// Move client to an option so we can _move_ the inner value
// on the first attempt to connect. All other attempts will fail.

View File

@@ -26,6 +26,7 @@ pub(crate) mod update_metadata;
pub(crate) mod upgrade_candidate_region;
use std::any::Any;
use std::collections::{HashMap, HashSet};
use std::fmt::{Debug, Display};
use std::sync::Arc;
use std::time::Duration;
@@ -36,12 +37,11 @@ use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::ddl::RegionFailureDetectorControllerRef;
use common_meta::instruction::CacheIdent;
use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue};
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_route::TableRouteValue;
use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey};
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef};
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock};
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock};
use common_meta::peer::Peer;
use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard};
use common_procedure::error::{
@@ -56,9 +56,9 @@ pub use manager::{
RegionMigrationManagerRef, RegionMigrationProcedureTask, RegionMigrationProcedureTracker,
RegionMigrationTriggerReason,
};
use serde::{Deserialize, Serialize};
use serde::{Deserialize, Deserializer, Serialize};
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
use store_api::storage::{RegionId, TableId};
use tokio::time::Instant;
use self::migration_start::RegionMigrationStart;
@@ -73,6 +73,25 @@ use crate::service::mailbox::MailboxRef;
/// The default timeout for region migration.
pub const DEFAULT_REGION_MIGRATION_TIMEOUT: Duration = Duration::from_secs(120);
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum SingleOrMultiple<T> {
Single(T),
Multiple(Vec<T>),
}
fn single_or_multiple_from<'de, D, T>(deserializer: D) -> std::result::Result<Vec<T>, D::Error>
where
D: Deserializer<'de>,
T: Deserialize<'de>,
{
let helper = SingleOrMultiple::<T>::deserialize(deserializer)?;
Ok(match helper {
SingleOrMultiple::Single(x) => vec![x],
SingleOrMultiple::Multiple(xs) => xs,
})
}
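
Combined with the `alias = "region_id"` attribute on the field just below, this untagged helper keeps previously persisted procedure state readable: an old single `region_id` value and the new `region_ids` list both deserialize into a `Vec`. A standalone sketch of that compatibility path, using a simplified context struct rather than the real `PersistentContext`:

```rust
use serde::{Deserialize, Deserializer};

#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum SingleOrMultiple<T> {
    Single(T),
    Multiple(Vec<T>),
}

fn single_or_multiple_from<'de, D, T>(deserializer: D) -> Result<Vec<T>, D::Error>
where
    D: Deserializer<'de>,
    T: Deserialize<'de>,
{
    Ok(match SingleOrMultiple::<T>::deserialize(deserializer)? {
        SingleOrMultiple::Single(x) => vec![x],
        SingleOrMultiple::Multiple(xs) => xs,
    })
}

/// Simplified stand-in for the persisted procedure context.
#[derive(Debug, Deserialize)]
struct Ctx {
    #[serde(deserialize_with = "single_or_multiple_from", alias = "region_id")]
    region_ids: Vec<u64>,
}

fn main() {
    // Old persisted state: a single region id under the old field name.
    let old: Ctx = serde_json::from_str(r#"{"region_id":4398046511105}"#).unwrap();
    assert_eq!(old.region_ids, vec![4398046511105]);

    // New persisted state: a list of region ids.
    let new: Ctx = serde_json::from_str(r#"{"region_ids":[1,2,3]}"#).unwrap();
    assert_eq!(new.region_ids, vec![1, 2, 3]);
}
```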
/// It's shared in each step and available even after recovering.
///
/// It will only be updated/stored after the Red node has succeeded.
@@ -89,7 +108,8 @@ pub struct PersistentContext {
/// The [Peer] of migration destination.
pub(crate) to_peer: Peer,
/// The [RegionId] of migration region.
pub(crate) region_id: RegionId,
#[serde(deserialize_with = "single_or_multiple_from", alias = "region_id")]
pub(crate) region_ids: Vec<RegionId>,
/// The timeout for downgrading leader region and upgrading candidate region operations.
#[serde(with = "humantime_serde", default = "default_timeout")]
pub(crate) timeout: Duration,
@@ -104,14 +124,42 @@ fn default_timeout() -> Duration {
impl PersistentContext {
pub fn lock_key(&self) -> Vec<StringKey> {
let region_id = self.region_id;
let lock_key = vec![
CatalogLock::Read(&self.catalog).into(),
SchemaLock::read(&self.catalog, &self.schema).into(),
RegionLock::Write(region_id).into(),
];
let mut lock_keys = Vec::with_capacity(self.region_ids.len() + 2);
lock_keys.push(CatalogLock::Read(&self.catalog).into());
lock_keys.push(SchemaLock::read(&self.catalog, &self.schema).into());
// Sort the region ids to ensure a consistent lock key order.
let mut region_ids = self.region_ids.clone();
region_ids.sort_unstable();
for region_id in region_ids {
lock_keys.push(RegionLock::Write(region_id).into());
}
lock_keys
}
lock_key
/// Returns the table ids of the regions.
///
/// The return value is a set of table ids.
pub fn region_table_ids(&self) -> Vec<TableId> {
self.region_ids
.iter()
.map(|region_id| region_id.table_id())
.collect::<HashSet<_>>()
.into_iter()
.collect()
}
/// Returns the table regions map.
///
/// The key is the table id, the value is the region ids of the table.
pub fn table_regions(&self) -> HashMap<TableId, Vec<RegionId>> {
let mut table_regions = HashMap::new();
for region_id in &self.region_ids {
table_regions
.entry(region_id.table_id())
.or_insert_with(Vec::new)
.push(*region_id);
}
table_regions
}
}
@@ -227,25 +275,18 @@ pub struct VolatileContext {
/// `opening_region_guard` will be set after the
/// [OpenCandidateRegion](crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion) step.
///
/// `opening_region_guard` should be consumed after
/// `opening_region_guards` should be consumed after
/// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region
/// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue).
opening_region_guard: Option<OperatingRegionGuard>,
/// `table_route` is stored via previous steps for future use.
table_route: Option<DeserializedValueWithBytes<TableRouteValue>>,
/// `datanode_table` is stored via previous steps for future use.
from_peer_datanode_table: Option<DatanodeTableValue>,
/// `table_info` is stored via previous steps for future use.
///
/// `table_info` should remain unchanged during the procedure;
/// no other DDL procedure executed concurrently for the current table.
table_info: Option<DeserializedValueWithBytes<TableInfoValue>>,
opening_region_guards: Vec<OperatingRegionGuard>,
/// The deadline of leader region lease.
leader_region_lease_deadline: Option<Instant>,
/// The last_entry_id of leader region.
leader_region_last_entry_id: Option<u64>,
/// The last_entry_id of leader metadata region (Only used for metric engine).
leader_region_metadata_last_entry_id: Option<u64>,
/// The datanode table values.
from_peer_datanode_table_values: Option<HashMap<TableId, DatanodeTableValue>>,
/// The last_entry_ids of leader regions.
leader_region_last_entry_ids: HashMap<RegionId, u64>,
/// The last_entry_ids of leader metadata regions (Only used for metric engine).
leader_region_metadata_last_entry_ids: HashMap<RegionId, u64>,
/// Metrics of region migration.
metrics: Metrics,
}
@@ -264,13 +305,15 @@ impl VolatileContext {
}
/// Sets the `leader_region_last_entry_id`.
pub fn set_last_entry_id(&mut self, last_entry_id: u64) {
self.leader_region_last_entry_id = Some(last_entry_id)
pub fn set_last_entry_id(&mut self, region_id: RegionId, last_entry_id: u64) {
self.leader_region_last_entry_ids
.insert(region_id, last_entry_id);
}
/// Sets the `leader_region_metadata_last_entry_id`.
pub fn set_metadata_last_entry_id(&mut self, last_entry_id: u64) {
self.leader_region_metadata_last_entry_id = Some(last_entry_id);
pub fn set_metadata_last_entry_id(&mut self, region_id: RegionId, last_entry_id: u64) {
self.leader_region_metadata_last_entry_ids
.insert(region_id, last_entry_id);
}
}
@@ -319,7 +362,7 @@ impl DefaultContextFactory {
impl ContextFactory for DefaultContextFactory {
fn new_context(self, persistent_ctx: PersistentContext) -> Context {
Context {
persistent_ctx: Arc::new(persistent_ctx),
persistent_ctx,
volatile_ctx: self.volatile_ctx,
in_memory: self.in_memory_key,
table_metadata_manager: self.table_metadata_manager,
@@ -334,7 +377,7 @@ impl ContextFactory for DefaultContextFactory {
/// The context of procedure execution.
pub struct Context {
persistent_ctx: Arc<PersistentContext>,
persistent_ctx: PersistentContext,
volatile_ctx: VolatileContext,
in_memory: KvBackendRef,
table_metadata_manager: TableMetadataManagerRef,
@@ -393,35 +436,135 @@ impl Context {
&self.server_addr
}
/// Returns the table ids of the regions.
pub fn region_table_ids(&self) -> Vec<TableId> {
self.persistent_ctx
.region_ids
.iter()
.map(|region_id| region_id.table_id())
.collect::<HashSet<_>>()
.into_iter()
.collect()
}
/// Returns the `table_routes` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote.
///
/// Retry:
/// - Failed to retrieve the metadata of table.
pub async fn get_table_route_values(
&self,
) -> Result<HashMap<TableId, DeserializedValueWithBytes<TableRouteValue>>> {
let table_ids = self.persistent_ctx.region_table_ids();
let table_routes = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.batch_get_with_raw_bytes(&table_ids)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get table routes: {table_ids:?}"),
})?;
let table_routes = table_ids
.into_iter()
.zip(table_routes)
.filter_map(|(table_id, table_route)| {
table_route.map(|table_route| (table_id, table_route))
})
.collect::<HashMap<_, _>>();
Ok(table_routes)
}
/// Returns the `table_route` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote.
///
/// Retry:
/// - Failed to retrieve the metadata of table.
pub async fn get_table_route_value(
&mut self,
) -> Result<&DeserializedValueWithBytes<TableRouteValue>> {
let table_route_value = &mut self.volatile_ctx.table_route;
&self,
table_id: TableId,
) -> Result<DeserializedValueWithBytes<TableRouteValue>> {
let table_route_value = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get table routes: {table_id:}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
Ok(table_route_value)
}
if table_route_value.is_none() {
let table_id = self.persistent_ctx.region_id.table_id();
let table_route = self
/// Returns the `from_peer_datanode_table_values` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote.
///
/// Retry:
/// - Failed to retrieve the metadata of datanode table.
pub async fn get_from_peer_datanode_table_values(
&mut self,
) -> Result<&HashMap<TableId, DatanodeTableValue>> {
let from_peer_datanode_table_values =
&mut self.volatile_ctx.from_peer_datanode_table_values;
if from_peer_datanode_table_values.is_none() {
let table_ids = self.persistent_ctx.region_table_ids();
let datanode_table_keys = table_ids
.iter()
.map(|table_id| DatanodeTableKey {
datanode_id: self.persistent_ctx.from_peer.id,
table_id: *table_id,
})
.collect::<Vec<_>>();
let datanode_table_values = self
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.datanode_table_manager()
.batch_get(&datanode_table_keys)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableRoute: {table_id}"),
reason: format!("Failed to get DatanodeTable: {table_ids:?}"),
})?
.context(error::TableRouteNotFoundSnafu { table_id })?;
*table_route_value = Some(table_route);
.into_iter()
.map(|(k, v)| (k.table_id, v))
.collect();
*from_peer_datanode_table_values = Some(datanode_table_values);
}
Ok(from_peer_datanode_table_values.as_ref().unwrap())
}
Ok(table_route_value.as_ref().unwrap())
/// Returns the `from_peer_datanode_table_value` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote.
///
/// Retry:
/// - Failed to retrieve the metadata of datanode table.
pub async fn get_from_peer_datanode_table_value(
&self,
table_id: TableId,
) -> Result<DatanodeTableValue> {
let datanode_table_value = self
.table_metadata_manager
.datanode_table_manager()
.get(&DatanodeTableKey {
datanode_id: self.persistent_ctx.from_peer.id,
table_id,
})
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get DatanodeTable: {table_id}"),
})?
.context(error::DatanodeTableNotFoundSnafu {
table_id,
datanode_id: self.persistent_ctx.from_peer.id,
})?;
Ok(datanode_table_value)
}
/// Notifies the RegionSupervisor to register failure detectors of failed region.
@@ -430,11 +573,18 @@ impl Context {
/// Now, we need to register the failure detector for the failed region again.
pub async fn register_failure_detectors(&self) {
let datanode_id = self.persistent_ctx.from_peer.id;
let region_id = self.persistent_ctx.region_id;
let region_ids = &self.persistent_ctx.region_ids;
let detecting_regions = region_ids
.iter()
.map(|region_id| (datanode_id, *region_id))
.collect::<Vec<_>>();
self.region_failure_detector_controller
.register_failure_detectors(vec![(datanode_id, region_id)])
.register_failure_detectors(detecting_regions)
.await;
info!(
"Registered failure detectors after migration failures for datanode {}, regions {:?}",
datanode_id, region_ids
);
}
/// Notifies the RegionSupervisor to deregister failure detectors.
@@ -443,10 +593,14 @@ impl Context {
/// We need to deregister the failure detectors for the original region if the procedure is finished.
pub async fn deregister_failure_detectors(&self) {
let datanode_id = self.persistent_ctx.from_peer.id;
let region_id = self.persistent_ctx.region_id;
let region_ids = &self.persistent_ctx.region_ids;
let detecting_regions = region_ids
.iter()
.map(|region_id| (datanode_id, *region_id))
.collect::<Vec<_>>();
self.region_failure_detector_controller
.deregister_failure_detectors(vec![(datanode_id, region_id)])
.deregister_failure_detectors(detecting_regions)
.await;
}
@@ -456,118 +610,52 @@ impl Context {
/// so we need to deregister the failure detectors for the candidate region if the procedure is aborted.
pub async fn deregister_failure_detectors_for_candidate_region(&self) {
let to_peer_id = self.persistent_ctx.to_peer.id;
let region_id = self.persistent_ctx.region_id;
let region_ids = &self.persistent_ctx.region_ids;
let detecting_regions = region_ids
.iter()
.map(|region_id| (to_peer_id, *region_id))
.collect::<Vec<_>>();
self.region_failure_detector_controller
.deregister_failure_detectors(vec![(to_peer_id, region_id)])
.deregister_failure_detectors(detecting_regions)
.await;
}
/// Removes the `table_route` of [VolatileContext], returns true if any.
pub fn remove_table_route_value(&mut self) -> bool {
let value = self.volatile_ctx.table_route.take();
value.is_some()
}
/// Returns the `table_info` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote.
///
/// Retry:
/// - Failed to retrieve the metadata of table.
pub async fn get_table_info_value(
&mut self,
) -> Result<&DeserializedValueWithBytes<TableInfoValue>> {
let table_info_value = &mut self.volatile_ctx.table_info;
if table_info_value.is_none() {
let table_id = self.persistent_ctx.region_id.table_id();
let table_info = self
.table_metadata_manager
.table_info_manager()
.get(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get TableInfo: {table_id}"),
})?
.context(error::TableInfoNotFoundSnafu { table_id })?;
*table_info_value = Some(table_info);
}
Ok(table_info_value.as_ref().unwrap())
}
/// Returns the `table_info` of [VolatileContext] if any.
/// Otherwise, returns the value retrieved from remote.
///
/// Retry:
/// - Failed to retrieve the metadata of datanode.
pub async fn get_from_peer_datanode_table_value(&mut self) -> Result<&DatanodeTableValue> {
let datanode_value = &mut self.volatile_ctx.from_peer_datanode_table;
if datanode_value.is_none() {
let table_id = self.persistent_ctx.region_id.table_id();
let datanode_id = self.persistent_ctx.from_peer.id;
let datanode_table = self
.table_metadata_manager
.datanode_table_manager()
.get(&DatanodeTableKey {
datanode_id,
table_id,
})
.await
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to get DatanodeTable: ({datanode_id},{table_id})"),
})?
.context(error::DatanodeTableNotFoundSnafu {
table_id,
datanode_id,
})?;
*datanode_value = Some(datanode_table);
}
Ok(datanode_value.as_ref().unwrap())
}
/// Fetches the replay checkpoint for the given topic.
pub async fn fetch_replay_checkpoint(&self, topic: &str) -> Result<Option<ReplayCheckpoint>> {
let region_id = self.region_id();
let topic_region_key = TopicRegionKey::new(region_id, topic);
let value = self
/// Fetches the replay checkpoints for the given topic region keys.
pub async fn get_replay_checkpoints(
&self,
topic_region_keys: Vec<TopicRegionKey<'_>>,
) -> Result<HashMap<RegionId, ReplayCheckpoint>> {
let topic_region_values = self
.table_metadata_manager
.topic_region_manager()
.get(topic_region_key)
.batch_get(topic_region_keys)
.await
.context(error::TableMetadataManagerSnafu)?;
Ok(value.and_then(|value| value.checkpoint))
}
let replay_checkpoints = topic_region_values
.into_iter()
.flat_map(|(key, value)| value.checkpoint.map(|value| (key, value)))
.collect::<HashMap<_, _>>();
/// Returns the [RegionId].
pub fn region_id(&self) -> RegionId {
self.persistent_ctx.region_id
Ok(replay_checkpoints)
}
/// Broadcasts the invalidate table cache message.
pub async fn invalidate_table_cache(&self) -> Result<()> {
let table_id = self.region_id().table_id();
let table_ids = self.region_table_ids();
let mut cache_idents = Vec::with_capacity(table_ids.len());
for table_id in &table_ids {
cache_idents.push(CacheIdent::TableId(*table_id));
}
// ignore the result
let ctx = common_meta::cache_invalidator::Context::default();
let _ = self
.cache_invalidator
.invalidate(&ctx, &[CacheIdent::TableId(table_id)])
.await;
let _ = self.cache_invalidator.invalidate(&ctx, &cache_idents).await;
Ok(())
}
/// Returns the [PersistentContext] of the procedure.
pub fn persistent_ctx(&self) -> Arc<PersistentContext> {
pub fn persistent_ctx(&self) -> PersistentContext {
self.persistent_ctx.clone()
}
}
@@ -609,7 +697,7 @@ pub struct RegionMigrationData<'a> {
pub(crate) struct RegionMigrationProcedure {
state: Box<dyn State>,
context: Context,
_guard: Option<RegionMigrationProcedureGuard>,
_guards: Vec<RegionMigrationProcedureGuard>,
}
impl RegionMigrationProcedure {
@@ -618,22 +706,22 @@ impl RegionMigrationProcedure {
pub fn new(
persistent_context: PersistentContext,
context_factory: impl ContextFactory,
guard: Option<RegionMigrationProcedureGuard>,
guards: Vec<RegionMigrationProcedureGuard>,
) -> Self {
let state = Box::new(RegionMigrationStart {});
Self::new_inner(state, persistent_context, context_factory, guard)
Self::new_inner(state, persistent_context, context_factory, guards)
}
fn new_inner(
state: Box<dyn State>,
persistent_context: PersistentContext,
context_factory: impl ContextFactory,
guard: Option<RegionMigrationProcedureGuard>,
guards: Vec<RegionMigrationProcedureGuard>,
) -> Self {
Self {
state,
context: context_factory.new_context(persistent_context),
_guard: guard,
_guards: guards,
}
}
@@ -646,47 +734,52 @@ impl RegionMigrationProcedure {
persistent_ctx,
state,
} = serde_json::from_str(json).context(FromJsonSnafu)?;
let guards = persistent_ctx
.region_ids
.iter()
.flat_map(|region_id| {
tracker.insert_running_procedure(&RegionMigrationProcedureTask {
region_id: *region_id,
from_peer: persistent_ctx.from_peer.clone(),
to_peer: persistent_ctx.to_peer.clone(),
timeout: persistent_ctx.timeout,
trigger_reason: persistent_ctx.trigger_reason,
})
})
.collect::<Vec<_>>();
let guard = tracker.insert_running_procedure(&RegionMigrationProcedureTask {
region_id: persistent_ctx.region_id,
from_peer: persistent_ctx.from_peer.clone(),
to_peer: persistent_ctx.to_peer.clone(),
timeout: persistent_ctx.timeout,
trigger_reason: persistent_ctx.trigger_reason,
});
let context = context_factory.new_context(persistent_ctx);
Ok(Self {
state,
context,
_guard: guard,
_guards: guards,
})
}
async fn rollback_inner(&mut self) -> Result<()> {
async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> {
let _timer = METRIC_META_REGION_MIGRATION_EXECUTE
.with_label_values(&["rollback"])
.start_timer();
let table_id = self.context.region_id().table_id();
let region_id = self.context.region_id();
self.context.remove_table_route_value();
let table_metadata_manager = self.context.table_metadata_manager.clone();
let table_route = self.context.get_table_route_value().await?;
// Safety: It must be a physical table route.
let downgraded = table_route
.region_routes()
.unwrap()
.iter()
.filter(|route| route.region.id == region_id)
.any(|route| route.is_leader_downgrading());
if downgraded {
info!("Rollbacking downgraded region leader table route, region: {region_id}");
table_metadata_manager
.update_leader_region_status(table_id, table_route, |route| {
if route.region.id == region_id {
let ctx = &self.context;
let table_regions = ctx.persistent_ctx.table_regions();
for (table_id, regions) in table_regions {
let table_lock = TableLock::Write(table_id).into();
let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
let table_route = ctx.get_table_route_value(table_id).await?;
let region_routes = table_route.region_routes().unwrap();
let downgraded = region_routes
.iter()
.filter(|route| regions.contains(&route.region.id))
.any(|route| route.is_leader_downgrading());
if downgraded {
info!(
"Rollbacking downgraded region leader table route, table: {table_id}, regions: {regions:?}"
);
let table_metadata_manager = &ctx.table_metadata_manager;
table_metadata_manager
.update_leader_region_status(table_id, &table_route, |route| {
if regions.contains(&route.region.id) {
Some(None)
} else {
None
@@ -696,10 +789,13 @@ impl RegionMigrationProcedure {
.context(error::TableMetadataManagerSnafu)
.map_err(BoxedError::new)
.with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
reason: format!("Failed to update the table route during the rollback downgraded leader region: {regions:?}"),
})?;
}
}
self.context
.deregister_failure_detectors_for_candidate_region()
.await;
self.context.register_failure_detectors().await;
Ok(())
@@ -712,8 +808,8 @@ impl Procedure for RegionMigrationProcedure {
Self::TYPE_NAME
}
async fn rollback(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<()> {
self.rollback_inner()
async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> {
self.rollback_inner(ctx)
.await
.map_err(ProcedureError::external)
}
@@ -742,14 +838,14 @@ impl Procedure for RegionMigrationProcedure {
Err(ProcedureError::retry_later(e))
} else {
// Consumes the opening region guard before deregistering the failure detectors.
self.context.volatile_ctx.opening_region_guard.take();
self.context.volatile_ctx.opening_region_guards.clear();
self.context
.deregister_failure_detectors_for_candidate_region()
.await;
error!(
e;
"Region migration procedure failed, region_id: {}, from_peer: {}, to_peer: {}, {}",
self.context.region_id(),
"Region migration procedure failed, regions: {:?}, from_peer: {}, to_peer: {}, {}",
self.context.persistent_ctx.region_ids,
self.context.persistent_ctx.from_peer,
self.context.persistent_ctx.to_peer,
self.context.volatile_ctx.metrics,
@@ -776,7 +872,7 @@ impl Procedure for RegionMigrationProcedure {
}
fn user_metadata(&self) -> Option<UserMetadata> {
Some(UserMetadata::new(self.context.persistent_ctx()))
Some(UserMetadata::new(Arc::new(self.context.persistent_ctx())))
}
}
@@ -790,7 +886,6 @@ mod tests {
use common_meta::key::test_utils::new_test_table_info;
use common_meta::rpc::router::{Region, RegionRoute};
use super::update_metadata::UpdateMetadata;
use super::*;
use crate::handler::HeartbeatMailbox;
use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion;
@@ -813,7 +908,7 @@ mod tests {
let env = TestingEnv::new();
let context = env.context_factory();
let procedure = RegionMigrationProcedure::new(persistent_context, context, None);
let procedure = RegionMigrationProcedure::new(persistent_context, context, vec![]);
let key = procedure.lock_key();
let keys = key.keys_to_lock().cloned().collect::<Vec<_>>();
@@ -830,10 +925,10 @@ mod tests {
let env = TestingEnv::new();
let context = env.context_factory();
let procedure = RegionMigrationProcedure::new(persistent_context, context, None);
let procedure = RegionMigrationProcedure::new(persistent_context, context, vec![]);
let serialized = procedure.dump().unwrap();
let expected = r#"{"persistent_ctx":{"catalog":"greptime","schema":"public","from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105,"timeout":"10s","trigger_reason":"Unknown"},"state":{"region_migration_state":"RegionMigrationStart"}}"#;
let expected = r#"{"persistent_ctx":{"catalog":"greptime","schema":"public","from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_ids":[4398046511105],"timeout":"10s","trigger_reason":"Unknown"},"state":{"region_migration_state":"RegionMigrationStart"}}"#;
assert_eq!(expected, serialized);
}
@@ -874,7 +969,7 @@ mod tests {
let persistent_context = new_persistent_context();
let context_factory = env.context_factory();
let state = Box::<MockState>::default();
RegionMigrationProcedure::new_inner(state, persistent_context, context_factory, None)
RegionMigrationProcedure::new_inner(state, persistent_context, context_factory, vec![])
}
let ctx = TestingEnv::procedure_context();
@@ -897,7 +992,9 @@ mod tests {
let mut procedure =
RegionMigrationProcedure::from_json(&serialized, context_factory, tracker.clone())
.unwrap();
assert!(tracker.contains(procedure.context.persistent_ctx.region_id));
for region_id in &procedure.context.persistent_ctx.region_ids {
assert!(tracker.contains(*region_id));
}
for _ in 1..3 {
status = Some(procedure.execute(&ctx).await.unwrap());
@@ -937,9 +1034,34 @@ mod tests {
vec![
// MigrationStart
Step::next(
"Should be the update metadata for downgrading",
"Should be the open candidate region",
None,
Assertion::simple(assert_update_metadata_downgrade, assert_need_persist),
Assertion::simple(assert_open_candidate_region, assert_need_persist),
),
// OpenCandidateRegion
Step::next(
"Should be the flush leader region",
Some(mock_datanode_reply(
to_peer_id,
Arc::new(|id| Ok(new_open_region_reply(id, true, None))),
)),
Assertion::simple(assert_flush_leader_region, assert_no_persist),
),
// Flush Leader Region
Step::next(
"Should be the flush leader region",
Some(mock_datanode_reply(
from_peer_id,
Arc::new(move |id| {
Ok(new_flush_region_reply_for_region(
id,
RegionId::new(1024, 1),
true,
None,
))
}),
)),
Assertion::simple(assert_update_metadata_downgrade, assert_no_persist),
),
// UpdateMetadata::Downgrade
Step::next(
@@ -998,7 +1120,7 @@ mod tests {
let to_peer_id = persistent_context.to_peer.id;
let from_peer = persistent_context.from_peer.clone();
let to_peer = persistent_context.to_peer.clone();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(region_id),
@@ -1025,61 +1147,6 @@ mod tests {
runner.suite.verify_table_metadata().await;
}
#[tokio::test]
async fn test_procedure_flow_idempotent() {
common_telemetry::init_default_ut_logging();
let persistent_context = test_util::new_persistent_context(1, 2, RegionId::new(1024, 1));
let state = Box::new(RegionMigrationStart);
// The table metadata.
let from_peer_id = persistent_context.from_peer.id;
let to_peer_id = persistent_context.to_peer.id;
let from_peer = persistent_context.from_peer.clone();
let to_peer = persistent_context.to_peer.clone();
let region_id = persistent_context.region_id;
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(from_peer),
follower_peers: vec![to_peer],
..Default::default()
}];
let suite = ProcedureMigrationTestSuite::new(persistent_context, state);
suite.init_table_metadata(table_info, region_routes).await;
let steps = procedure_flow_steps(from_peer_id, to_peer_id);
let setup_to_latest_persisted_state = Step::setup(
"Sets state to UpdateMetadata::Downgrade",
merge_before_test_fn(vec![
setup_state(Arc::new(|| Box::new(UpdateMetadata::Downgrade))),
Arc::new(reset_volatile_ctx),
]),
);
let steps = [
steps.clone(),
vec![setup_to_latest_persisted_state.clone()],
steps.clone()[1..].to_vec(),
vec![setup_to_latest_persisted_state],
steps.clone()[1..].to_vec(),
]
.concat();
let timer = Instant::now();
// Run the table tests.
let runner = ProcedureMigrationSuiteRunner::new(suite)
.steps(steps.clone())
.run_once()
.await;
// Ensure it didn't run into the slow path.
assert!(timer.elapsed().as_secs() < REGION_LEASE_SECS / 2);
runner.suite.verify_table_metadata().await;
}
#[tokio::test]
async fn test_procedure_flow_open_candidate_region_retryable_error() {
common_telemetry::init_default_ut_logging();
@@ -1090,7 +1157,7 @@ mod tests {
// The table metadata.
let to_peer_id = persistent_context.to_peer.id;
let from_peer = persistent_context.from_peer.clone();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(region_id),
@@ -1178,13 +1245,12 @@ mod tests {
let from_peer_id = persistent_context.from_peer.id;
let to_peer_id = persistent_context.to_peer.id;
let from_peer = persistent_context.from_peer.clone();
let to_peer = persistent_context.to_peer.clone();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(from_peer),
follower_peers: vec![to_peer],
follower_peers: vec![],
..Default::default()
}];
@@ -1194,9 +1260,34 @@ mod tests {
let steps = vec![
// MigrationStart
Step::next(
"Should be the update metadata for downgrading",
"Should be the open candidate region",
None,
Assertion::simple(assert_update_metadata_downgrade, assert_need_persist),
Assertion::simple(assert_open_candidate_region, assert_need_persist),
),
// OpenCandidateRegion
Step::next(
"Should be the flush leader region",
Some(mock_datanode_reply(
to_peer_id,
Arc::new(|id| Ok(new_open_region_reply(id, true, None))),
)),
Assertion::simple(assert_flush_leader_region, assert_no_persist),
),
// Flush Leader Region
Step::next(
"Should be the flush leader region",
Some(mock_datanode_reply(
from_peer_id,
Arc::new(move |id| {
Ok(new_flush_region_reply_for_region(
id,
RegionId::new(1024, 1),
true,
None,
))
}),
)),
Assertion::simple(assert_update_metadata_downgrade, assert_no_persist),
),
// UpdateMetadata::Downgrade
Step::next(
@@ -1240,9 +1331,9 @@ mod tests {
];
let setup_to_latest_persisted_state = Step::setup(
"Sets state to UpdateMetadata::Downgrade",
"Sets state to OpenCandidateRegion",
merge_before_test_fn(vec![
setup_state(Arc::new(|| Box::new(UpdateMetadata::Downgrade))),
setup_state(Arc::new(|| Box::new(OpenCandidateRegion))),
Arc::new(reset_volatile_ctx),
]),
);
@@ -1274,7 +1365,7 @@ mod tests {
let to_peer_id = persistent_context.to_peer.id;
let from_peer_id = persistent_context.from_peer.id;
let from_peer = persistent_context.from_peer.clone();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(region_id),

View File

@@ -19,7 +19,6 @@ use api::v1::meta::MailboxMessage;
use common_meta::RegionIdent;
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_meta::key::datanode_table::RegionInfo;
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::{info, warn};
use serde::{Deserialize, Serialize};
@@ -47,12 +46,12 @@ impl State for CloseDowngradedRegion {
) -> Result<(Box<dyn State>, Status)> {
if let Err(err) = self.close_downgraded_leader_region(ctx).await {
let downgrade_leader_datanode = &ctx.persistent_ctx.from_peer;
let region_id = ctx.region_id();
warn!(err; "Failed to close downgraded leader region: {region_id} on datanode {:?}", downgrade_leader_datanode);
let region_ids = &ctx.persistent_ctx.region_ids;
warn!(err; "Failed to close downgraded leader regions: {region_ids:?} on datanode {:?}", downgrade_leader_datanode);
}
info!(
"Region migration is finished: region_id: {}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}",
ctx.region_id(),
"Region migration is finished: regions: {:?}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}",
ctx.persistent_ctx.region_ids,
ctx.persistent_ctx.from_peer,
ctx.persistent_ctx.to_peer,
ctx.persistent_ctx.trigger_reason,
@@ -74,28 +73,30 @@ impl CloseDowngradedRegion {
async fn build_close_region_instruction(&self, ctx: &mut Context) -> Result<Instruction> {
let pc = &ctx.persistent_ctx;
let downgrade_leader_datanode_id = pc.from_peer.id;
let table_id = pc.region_id.table_id();
let region_number = pc.region_id.region_number();
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?;
let region_ids = &ctx.persistent_ctx.region_ids;
let mut idents = Vec::with_capacity(region_ids.len());
let RegionInfo { engine, .. } = datanode_table_value.region_info.clone();
for region_id in region_ids {
idents.push(RegionIdent {
datanode_id: downgrade_leader_datanode_id,
table_id: region_id.table_id(),
region_number: region_id.region_number(),
// The `engine` field is not used for closing region.
engine: String::new(),
});
}
Ok(Instruction::CloseRegions(vec![RegionIdent {
datanode_id: downgrade_leader_datanode_id,
table_id,
region_number,
engine,
}]))
Ok(Instruction::CloseRegions(idents))
}
/// Closes the downgraded leader region.
async fn close_downgraded_leader_region(&self, ctx: &mut Context) -> Result<()> {
let close_instruction = self.build_close_region_instruction(ctx).await?;
let region_id = ctx.region_id();
let region_ids = &ctx.persistent_ctx.region_ids;
let pc = &ctx.persistent_ctx;
let downgrade_leader_datanode = &pc.from_peer;
let msg = MailboxMessage::json_message(
&format!("Close downgraded region: {}", region_id),
&format!("Close downgraded regions: {:?}", region_ids),
&format!("Metasrv@{}", ctx.server_addr()),
&format!(
"Datanode-{}@{}",
@@ -118,8 +119,8 @@ impl CloseDowngradedRegion {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!(
"Received close downgraded leade region reply: {:?}, region: {}",
reply, region_id
"Received close downgraded leade region reply: {:?}, region: {:?}",
reply, region_ids
);
let InstructionReply::CloseRegions(SimpleReply { result, error }) = reply else {
return error::UnexpectedInstructionReplySnafu {
@@ -134,7 +135,7 @@ impl CloseDowngradedRegion {
} else {
error::UnexpectedSnafu {
violated: format!(
"Failed to close downgraded leader region: {region_id} on datanode {:?}, error: {error:?}",
"Failed to close downgraded leader region: {region_ids:?} on datanode {:?}, error: {error:?}",
downgrade_leader_datanode,
),
}
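
With the batch migration change, one `RegionIdent` is built per region and all of them are sent in a single `Instruction::CloseRegions`. Below is a minimal, self-contained sketch of that fan-out; the struct and enum here are simplified stand-ins inferred from the diff, not the real `common_meta` definitions.

```rust
// Simplified stand-in types; the real RegionIdent/Instruction live in common_meta.
#[derive(Debug)]
struct RegionIdent {
    datanode_id: u64,
    table_id: u32,
    region_number: u32,
    engine: String,
}

#[derive(Debug)]
enum Instruction {
    CloseRegions(Vec<RegionIdent>),
}

/// A region id modeled as (table_id, region_number), mirroring the split used above.
type RegionId = (u32, u32);

fn build_close_regions(datanode_id: u64, region_ids: &[RegionId]) -> Instruction {
    let idents = region_ids
        .iter()
        .map(|(table_id, region_number)| RegionIdent {
            datanode_id,
            table_id: *table_id,
            region_number: *region_number,
            // The engine field is not needed for closing a region.
            engine: String::new(),
        })
        .collect();
    Instruction::CloseRegions(idents)
}

fn main() {
    let instruction = build_close_regions(1, &[(1024, 1), (1024, 2)]);
    println!("{instruction:?}");
}
```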


@@ -22,7 +22,7 @@ use common_meta::instruction::{
DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply,
};
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::{error, info, warn};
use common_telemetry::{debug, error, info, warn};
use common_time::util::current_time_millis;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
@@ -70,30 +70,30 @@ impl State for DowngradeLeaderRegion {
Ok(_) => {
// Do nothing
info!(
"Downgraded region leader success, region: {}",
ctx.persistent_ctx.region_id
"Downgraded region leader success, region: {:?}",
ctx.persistent_ctx.region_ids
);
}
Err(error::Error::ExceededDeadline { .. }) => {
info!(
"Downgrade region leader exceeded deadline, region: {}",
ctx.persistent_ctx.region_id
"Downgrade region leader exceeded deadline, region: {:?}",
ctx.persistent_ctx.region_ids
);
// Rollbacks the metadata if procedure is timeout
return Ok((Box::new(UpdateMetadata::Rollback), Status::executing(false)));
}
Err(err) => {
error!(err; "Occurs non-retryable error, region: {}", ctx.persistent_ctx.region_id);
error!(err; "Occurs non-retryable error, region: {:?}", ctx.persistent_ctx.region_ids);
if let Some(deadline) = ctx.volatile_ctx.leader_region_lease_deadline.as_ref() {
info!(
"Running into the downgrade region leader slow path, region: {}, sleep until {:?}",
ctx.persistent_ctx.region_id, deadline
"Running into the downgrade region leader slow path, region: {:?}, sleep until {:?}",
ctx.persistent_ctx.region_ids, deadline
);
tokio::time::sleep_until(*deadline).await;
} else {
warn!(
"Leader region lease deadline is not set, region: {}",
ctx.persistent_ctx.region_id
"Leader region lease deadline is not set, region: {:?}",
ctx.persistent_ctx.region_ids
);
}
}
@@ -118,12 +118,76 @@ impl DowngradeLeaderRegion {
ctx: &Context,
flush_timeout: Duration,
) -> Instruction {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
Instruction::DowngradeRegions(vec![DowngradeRegion {
let region_ids = &ctx.persistent_ctx.region_ids;
let mut downgrade_regions = Vec::with_capacity(region_ids.len());
for region_id in region_ids {
downgrade_regions.push(DowngradeRegion {
region_id: *region_id,
flush_timeout: Some(flush_timeout),
});
}
Instruction::DowngradeRegions(downgrade_regions)
}
fn handle_downgrade_region_reply(
&self,
ctx: &mut Context,
reply: &DowngradeRegionReply,
now: &Instant,
) -> Result<()> {
let leader = &ctx.persistent_ctx.from_peer;
let DowngradeRegionReply {
region_id,
flush_timeout: Some(flush_timeout),
}])
last_entry_id,
metadata_last_entry_id,
exists,
error,
} = reply;
if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
"Failed to downgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}",
region_id, leader, error, now.elapsed()
),
}
.fail();
}
if !exists {
warn!(
"Trying to downgrade the region {} on datanode {:?}, but region doesn't exist!, elapsed: {:?}",
region_id,
leader,
now.elapsed()
);
} else {
info!(
"Region {} leader is downgraded on datanode {:?}, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}",
region_id,
leader,
last_entry_id,
metadata_last_entry_id,
now.elapsed()
);
}
if let Some(last_entry_id) = last_entry_id {
debug!(
"set last_entry_id: {:?}, region_id: {:?}",
last_entry_id, region_id
);
ctx.volatile_ctx
.set_last_entry_id(*region_id, *last_entry_id);
}
if let Some(metadata_last_entry_id) = metadata_last_entry_id {
ctx.volatile_ctx
.set_metadata_last_entry_id(*region_id, *metadata_last_entry_id);
}
Ok(())
}
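
The reply handling above records WAL positions per region instead of in a single field. A rough sketch of that bookkeeping, assuming each reply carries an optional `last_entry_id` for its region; the types are illustrative only, not the real reply structs.

```rust
// Per-region bookkeeping of downgrade replies; all types are stand-ins.
use std::collections::HashMap;

type RegionId = u64;

struct DowngradeRegionReply {
    region_id: RegionId,
    last_entry_id: Option<u64>,
    exists: bool,
    error: Option<String>,
}

#[derive(Default)]
struct VolatileContext {
    leader_region_last_entry_ids: HashMap<RegionId, u64>,
}

fn handle_replies(ctx: &mut VolatileContext, replies: &[DowngradeRegionReply]) -> Result<(), String> {
    for reply in replies {
        if let Some(error) = &reply.error {
            // A single failed region makes the whole batch retryable.
            return Err(format!("region {} failed: {error}", reply.region_id));
        }
        if !reply.exists {
            eprintln!("region {} does not exist on the leader datanode", reply.region_id);
        }
        if let Some(last_entry_id) = reply.last_entry_id {
            // Keyed by region id, so later states can replay the right WAL entries.
            ctx.leader_region_last_entry_ids.insert(reply.region_id, last_entry_id);
        }
    }
    Ok(())
}

fn main() {
    let mut ctx = VolatileContext::default();
    let replies = vec![DowngradeRegionReply { region_id: 1, last_entry_id: Some(42), exists: true, error: None }];
    handle_replies(&mut ctx, &replies).unwrap();
    assert_eq!(ctx.leader_region_last_entry_ids.get(&1), Some(&42));
}
```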
/// Tries to downgrade a leader region.
@@ -140,7 +204,7 @@ impl DowngradeLeaderRegion {
/// - [ExceededDeadline](error::Error::ExceededDeadline)
/// - Invalid JSON.
async fn downgrade_region(&self, ctx: &mut Context) -> Result<()> {
let region_id = ctx.persistent_ctx.region_id;
let region_ids = &ctx.persistent_ctx.region_ids;
let operation_timeout =
ctx.next_operation_timeout()
.context(error::ExceededDeadlineSnafu {
@@ -150,7 +214,7 @@ impl DowngradeLeaderRegion {
let leader = &ctx.persistent_ctx.from_peer;
let msg = MailboxMessage::json_message(
&format!("Downgrade leader region: {}", region_id),
&format!("Downgrade leader regions: {:?}", region_ids),
&format!("Metasrv@{}", ctx.server_addr()),
&format!("Datanode-{}@{}", leader.id, leader.addr),
common_time::util::current_time_millis(),
@@ -168,9 +232,9 @@ impl DowngradeLeaderRegion {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!(
"Received downgrade region reply: {:?}, region: {}, elapsed: {:?}",
"Received downgrade region reply: {:?}, region: {:?}, elapsed: {:?}",
reply,
region_id,
region_ids,
now.elapsed()
);
let InstructionReply::DowngradeRegions(DowngradeRegionsReply { replies }) = reply
@@ -182,57 +246,14 @@ impl DowngradeLeaderRegion {
.fail();
};
// TODO(weny): handle multiple replies.
let DowngradeRegionReply {
region_id,
last_entry_id,
metadata_last_entry_id,
exists,
error,
} = &replies[0];
if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
"Failed to downgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}",
region_id, leader, error, now.elapsed()
),
}
.fail();
for reply in replies {
self.handle_downgrade_region_reply(ctx, &reply, &now)?;
}
if !exists {
warn!(
"Trying to downgrade the region {} on datanode {:?}, but region doesn't exist!, elapsed: {:?}",
region_id,
leader,
now.elapsed()
);
} else {
info!(
"Region {} leader is downgraded on datanode {:?}, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}",
region_id,
leader,
last_entry_id,
metadata_last_entry_id,
now.elapsed()
);
}
if let Some(last_entry_id) = last_entry_id {
ctx.volatile_ctx.set_last_entry_id(*last_entry_id);
}
if let Some(metadata_last_entry_id) = metadata_last_entry_id {
ctx.volatile_ctx
.set_metadata_last_entry_id(*metadata_last_entry_id);
}
Ok(())
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
"Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}, elapsed: {:?}",
"Mailbox received timeout for downgrade leader region {region_ids:?} on datanode {:?}, elapsed: {:?}",
leader,
now.elapsed()
);
@@ -248,7 +269,7 @@ impl DowngradeLeaderRegion {
let last_connection_at = match find_datanode_lease_value(&ctx.in_memory, leader.id).await {
Ok(lease_value) => lease_value.map(|lease_value| lease_value.timestamp_millis),
Err(err) => {
error!(err; "Failed to find datanode lease value for datanode: {}, during region migration, region: {}", leader, ctx.persistent_ctx.region_id);
error!(err; "Failed to find datanode lease value for datanode: {}, during region migration, region: {:?}", leader, ctx.persistent_ctx.region_ids);
return;
}
};
@@ -266,8 +287,8 @@ impl DowngradeLeaderRegion {
if elapsed >= (REGION_LEASE_SECS * 1000) as i64 {
ctx.volatile_ctx.reset_leader_region_lease_deadline();
info!(
"Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {}",
leader, last_connection_at, region_lease, ctx.persistent_ctx.region_id
"Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {:?}",
leader, last_connection_at, region_lease, ctx.persistent_ctx.region_ids
);
} else if elapsed > 0 {
// `now - last_connection_at` < REGION_LEASE_SECS * 1000
@@ -277,23 +298,23 @@ impl DowngradeLeaderRegion {
ctx.volatile_ctx
.set_leader_region_lease_deadline(lease_timeout);
info!(
"Datanode {}({}) last connected {:?} ago, updated leader region lease deadline to {:?}, region: {}",
"Datanode {}({}) last connected {:?} ago, updated leader region lease deadline to {:?}, region: {:?}",
leader,
last_connection_at,
elapsed,
ctx.volatile_ctx.leader_region_lease_deadline,
ctx.persistent_ctx.region_id
ctx.persistent_ctx.region_ids
);
} else {
warn!(
"Datanode {} has invalid last connection timestamp: {} (which is after current time: {}), region: {}",
leader, last_connection_at, now, ctx.persistent_ctx.region_id
"Datanode {} has invalid last connection timestamp: {} (which is after current time: {}), region: {:?}",
leader, last_connection_at, now, ctx.persistent_ctx.region_ids
)
}
} else {
warn!(
"Failed to find last connection time for datanode {}, unable to update region lease deadline, region: {}",
leader, ctx.persistent_ctx.region_id
"Failed to find last connection time for datanode {}, unable to update region lease deadline, region: {:?}",
leader, ctx.persistent_ctx.region_ids
)
}
}
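
The lease handling above boils down to a three-way decision based on how long the datanode has been disconnected. A small sketch of that decision follows; the constant value and the exact deadline arithmetic are assumptions for illustration, not the real implementation.

```rust
// Three-way lease decision; constant and arithmetic are illustrative.
use std::time::{Duration, Instant};

const REGION_LEASE_SECS: u64 = 120;

#[derive(Debug)]
enum LeaseAction {
    /// Disconnected longer than a lease period: reset the deadline and move on.
    Reset,
    /// Still within the lease: wait out the remainder before downgrading.
    WaitUntil(Instant),
    /// Last connection timestamp is in the future (clock skew): keep the current deadline.
    Keep,
}

fn decide_lease_action(now_millis: i64, last_connection_at_millis: i64) -> LeaseAction {
    let lease_millis = (REGION_LEASE_SECS * 1000) as i64;
    let elapsed = now_millis - last_connection_at_millis;
    if elapsed >= lease_millis {
        LeaseAction::Reset
    } else if elapsed > 0 {
        let remaining = Duration::from_millis((lease_millis - elapsed) as u64);
        LeaseAction::WaitUntil(Instant::now() + remaining)
    } else {
        LeaseAction::Keep
    }
}

fn main() {
    println!("{:?}", decide_lease_action(1_000_000, 1_000_000 - 130_000));
    println!("{:?}", decide_lease_action(1_000_000, 1_000_000 - 10_000));
}
```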
@@ -318,19 +339,20 @@ impl DowngradeLeaderRegion {
retry += 1;
// Throws the error immediately if the procedure exceeded the deadline.
if matches!(err, error::Error::ExceededDeadline { .. }) {
error!(err; "Failed to downgrade region leader, region: {}, exceeded deadline", ctx.persistent_ctx.region_id);
error!(err; "Failed to downgrade region leader, regions: {:?}, exceeded deadline", ctx.persistent_ctx.region_ids);
return Err(err);
} else if matches!(err, error::Error::PusherNotFound { .. }) {
// Throws the error immediately if the datanode is unreachable.
error!(err; "Failed to downgrade region leader, region: {}, datanode({}) is unreachable(PusherNotFound)", ctx.persistent_ctx.region_id, ctx.persistent_ctx.from_peer.id);
error!(err; "Failed to downgrade region leader, regions: {:?}, datanode({}) is unreachable(PusherNotFound)", ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer.id);
self.update_leader_region_lease_deadline(ctx).await;
return Err(err);
} else if err.is_retryable() && retry < self.optimistic_retry {
error!(err; "Failed to downgrade region leader, region: {}, retry later", ctx.persistent_ctx.region_id);
error!(err; "Failed to downgrade region leader, regions: {:?}, retry later", ctx.persistent_ctx.region_ids);
sleep(self.retry_initial_interval).await;
} else {
return Err(BoxedError::new(err)).context(error::DowngradeLeaderSnafu {
region_id: ctx.persistent_ctx.region_id,
// TODO(weny): handle multiple regions.
region_id: ctx.persistent_ctx.region_ids[0],
})?;
}
} else {
@@ -372,17 +394,17 @@ mod tests {
schema: "public".into(),
from_peer: Peer::empty(1),
to_peer: Peer::empty(2),
region_id: RegionId::new(1024, 1),
region_ids: vec![RegionId::new(1024, 1)],
timeout: Duration::from_millis(1000),
trigger_reason: RegionMigrationTriggerReason::Manual,
}
}
async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap<u32, String>) {
let table_info =
new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into();
let region_id = ctx.persistent_ctx.region_ids[0];
let table_info = new_test_table_info(region_id.table_id(), vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(ctx.persistent_ctx.region_id),
region: Region::new_test(region_id),
leader_peer: Some(ctx.persistent_ctx.from_peer.clone()),
follower_peers: vec![ctx.persistent_ctx.to_peer.clone()],
..Default::default()
@@ -590,7 +612,13 @@ mod tests {
});
state.downgrade_region_with_retry(&mut ctx).await.unwrap();
assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, Some(1));
assert_eq!(
ctx.volatile_ctx
.leader_region_last_entry_ids
.get(&RegionId::new(0, 0))
.cloned(),
Some(1)
);
assert!(ctx.volatile_ctx.leader_region_lease_deadline.is_none());
}
@@ -636,7 +664,7 @@ mod tests {
.await
.unwrap_err();
assert_matches!(err, error::Error::DowngradeLeader { .. });
assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, None);
// assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, None);
// Should remain no change.
assert_eq!(
ctx.volatile_ctx.leader_region_lease_deadline.unwrap(),
@@ -671,7 +699,13 @@ mod tests {
let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap();
let elapsed = timer.elapsed().as_secs();
assert!(elapsed < REGION_LEASE_SECS / 2);
assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, Some(1));
assert_eq!(
ctx.volatile_ctx
.leader_region_last_entry_ids
.get(&RegionId::new(0, 0))
.cloned(),
Some(1)
);
assert!(ctx.volatile_ctx.leader_region_lease_deadline.is_none());
let _ = next


@@ -15,7 +15,7 @@
use std::any::Any;
use api::v1::meta::MailboxMessage;
use common_meta::instruction::{FlushRegions, Instruction, InstructionReply};
use common_meta::instruction::{FlushErrorStrategy, FlushRegions, Instruction, InstructionReply};
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::{info, warn};
use serde::{Deserialize, Serialize};
@@ -64,8 +64,10 @@ impl PreFlushRegion {
/// Builds flush leader region instruction.
fn build_flush_leader_region_instruction(&self, ctx: &Context) -> Instruction {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
Instruction::FlushRegions(FlushRegions::sync_single(region_id))
Instruction::FlushRegions(FlushRegions::sync_batch(
pc.region_ids.clone(),
FlushErrorStrategy::TryAll,
))
}
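
The flush instruction is now built for the whole batch with a `TryAll` error strategy. The sketch below shows what such a batched request might look like; these definitions are stand-ins inferred from the diff, not the actual `common_meta::instruction` types.

```rust
// Stand-in types for a batched flush request; not the real common_meta definitions.
type RegionId = u64;

#[derive(Debug, Clone, Copy)]
enum FlushErrorStrategy {
    /// Stop at the first region that fails to flush.
    FailFast,
    /// Try flushing every region and report all failures together.
    TryAll,
}

#[derive(Debug)]
struct FlushRegions {
    region_ids: Vec<RegionId>,
    strategy: FlushErrorStrategy,
    /// Whether the caller waits for the flush to complete.
    sync: bool,
}

impl FlushRegions {
    fn sync_batch(region_ids: Vec<RegionId>, strategy: FlushErrorStrategy) -> Self {
        Self { region_ids, strategy, sync: true }
    }
}

fn main() {
    let flush = FlushRegions::sync_batch(vec![1, 2, 3], FlushErrorStrategy::TryAll);
    println!("{flush:?}");
}
```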
/// Tries to flush a leader region.
@@ -88,11 +90,11 @@ impl PreFlushRegion {
operation: "Flush leader region",
})?;
let flush_instruction = self.build_flush_leader_region_instruction(ctx);
let region_id = ctx.persistent_ctx.region_id;
let region_ids = &ctx.persistent_ctx.region_ids;
let leader = &ctx.persistent_ctx.from_peer;
let msg = MailboxMessage::json_message(
&format!("Flush leader region: {}", region_id),
&format!("Flush leader region: {:?}", region_ids),
&format!("Metasrv@{}", ctx.server_addr()),
&format!("Datanode-{}@{}", leader.id, leader.addr),
common_time::util::current_time_millis(),
@@ -111,32 +113,42 @@ impl PreFlushRegion {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!(
"Received flush leader region reply: {:?}, region: {}, elapsed: {:?}",
"Received flush leader region reply: {:?}, region: {:?}, elapsed: {:?}",
reply,
region_id,
region_ids,
now.elapsed()
);
let reply_result = match reply {
InstructionReply::FlushRegions(flush_reply) => {
if flush_reply.results.len() != 1 {
if flush_reply.results.len() != region_ids.len() {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),
reason: "expect single region flush result",
reason: format!(
"expect {} region flush result, but got {}",
region_ids.len(),
flush_reply.results.len()
),
}
.fail();
}
let (reply_region_id, result) = &flush_reply.results[0];
if *reply_region_id != region_id {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),
reason: "flush reply region ID mismatch",
}
.fail();
}
match result {
Ok(()) => (true, None),
Err(err) => (false, Some(err.clone())),
match flush_reply.overall_success {
true => (true, None),
false => (
false,
Some(
flush_reply
.results
.iter()
.filter_map(|(region_id, result)| match result {
Ok(_) => None,
Err(e) => Some(format!("{}: {}", region_id, e)),
})
.collect::<Vec<String>>()
.join("; "),
),
),
}
}
_ => {
@@ -149,15 +161,15 @@ impl PreFlushRegion {
};
let (result, error) = reply_result;
if error.is_some() {
if let Some(error) = error {
warn!(
"Failed to flush leader region {} on datanode {:?}, error: {:?}. Skip flush operation.",
region_id, leader, error
"Failed to flush leader regions {:?} on datanode {:?}, error: {}. Skip flush operation.",
region_ids, leader, &error
);
} else if result {
info!(
"The flush leader region {} on datanode {:?} is successful, elapsed: {:?}",
region_id,
"The flush leader regions {:?} on datanode {:?} is successful, elapsed: {:?}",
region_ids,
leader,
now.elapsed()
);
@@ -166,15 +178,15 @@ impl PreFlushRegion {
Ok(())
}
Err(Error::MailboxTimeout { .. }) => error::ExceededDeadlineSnafu {
operation: "Flush leader region",
operation: "Flush leader regions",
}
.fail(),
Err(err) => Err(err),
},
Err(Error::PusherNotFound { .. }) => {
warn!(
"Failed to flush leader region({}), the datanode({}) is unreachable(PusherNotFound). Skip flush operation.",
region_id, leader
"Failed to flush leader regions({:?}), the datanode({}) is unreachable(PusherNotFound). Skip flush operation.",
region_ids, leader
);
Ok(())
}
@@ -268,7 +280,7 @@ mod tests {
// to_peer: 2
let persistent_context = new_persistent_context();
let from_peer_id = persistent_context.from_peer.id;
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let mut env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let mailbox_ctx = env.mailbox_context();
@@ -297,7 +309,7 @@ mod tests {
// to_peer: 2
let persistent_context = new_persistent_context();
let from_peer_id = persistent_context.from_peer.id;
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let mut env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let mailbox_ctx = env.mailbox_context();


@@ -387,14 +387,14 @@ impl RegionMigrationManager {
PersistentContext {
catalog: catalog_name,
schema: schema_name,
region_id,
region_ids: vec![region_id],
from_peer,
to_peer,
timeout,
trigger_reason,
},
self.context_factory.clone(),
Some(guard),
vec![guard],
);
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
let procedure_id = procedure_with_id.id;


@@ -44,9 +44,9 @@ impl State for RegionMigrationAbort {
_procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
warn!(
"Region migration is aborted: {}, region_id: {}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}",
"Region migration is aborted: {}, regions: {:?}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}",
self.reason,
ctx.region_id(),
ctx.persistent_ctx.region_ids,
ctx.persistent_ctx.from_peer,
ctx.persistent_ctx.to_peer,
ctx.persistent_ctx.trigger_reason,


@@ -20,22 +20,18 @@ use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::info;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
use crate::error::{self, Result};
use crate::procedure::region_migration::migration_abort::RegionMigrationAbort;
use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion;
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{Context, State};
/// The behaviors:
///
/// If the expected leader region has been opened on `to_peer`, go to the [RegionMigrationEnd] state.
///
/// If the candidate region has been opened on `to_peer`, go to the [UpdateMetadata::Downgrade] state.
///
/// Otherwise go to the [OpenCandidateRegion] state.
/// - If all regions have been migrated, transitions to [RegionMigrationEnd].
/// - If any of the region leaders is not the `from_peer`, transitions to [RegionMigrationAbort].
/// - Otherwise, continues with [OpenCandidateRegion] to initiate the candidate region.
#[derive(Debug, Serialize, Deserialize)]
pub struct RegionMigrationStart;
@@ -44,44 +40,62 @@ pub struct RegionMigrationStart;
impl State for RegionMigrationStart {
/// Yields next [State].
///
/// If the expected leader region has been opened on `to_peer`, go to the [RegionMigrationEnd] state.
/// Determines the next [State] for region migration:
///
/// If the candidate region has been opened on `to_peer`, go to the [UpdateMetadata::Downgrade] state.
///
/// Otherwise go to the [OpenCandidateRegion] state.
/// - If all regions have been migrated, transitions to [RegionMigrationEnd].
/// - If any of the region leaders is not the `from_peer`, transitions to [RegionMigrationAbort].
/// - Otherwise, continues with [OpenCandidateRegion] to initiate the candidate region.
async fn next(
&mut self,
ctx: &mut Context,
_procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
let region_id = ctx.persistent_ctx.region_id;
let region_route = self.retrieve_region_route(ctx, region_id).await?;
let mut region_routes = self.retrieve_region_routes(ctx).await?;
let to_peer = &ctx.persistent_ctx.to_peer;
let from_peer = &ctx.persistent_ctx.from_peer;
let region_ids = &ctx.persistent_ctx.region_ids;
if self.has_migrated(&region_route, to_peer)? {
self.filter_unmigrated_regions(&mut region_routes, to_peer);
// No region to migrate, skip the migration.
if region_routes.is_empty() {
info!(
"Region has been migrated, region: {:?}, to_peer: {:?}",
region_route.region.id, to_peer
"All regions have been migrated, regions: {:?}, to_peer: {:?}",
region_ids, to_peer
);
Ok((Box::new(RegionMigrationEnd), Status::done()))
} else if self.invalid_leader_peer(&region_route, from_peer)? {
info!(
"Abort region migration, region:{:?}, unexpected leader peer: {:?}, expected: {:?}",
region_route.region.id, region_route.leader_peer, from_peer,
);
Ok((
Box::new(RegionMigrationAbort::new(&format!(
"Invalid region leader peer: {from_peer:?}, expected: {:?}",
region_route.leader_peer.as_ref().unwrap(),
))),
Status::done(),
))
} else if self.check_candidate_region_on_peer(&region_route, to_peer) {
Ok((Box::new(UpdateMetadata::Downgrade), Status::executing(true)))
} else {
Ok((Box::new(OpenCandidateRegion), Status::executing(true)))
return Ok((Box::new(RegionMigrationEnd), Status::done()));
}
// Updates the region ids to the unmigrated regions.
if region_routes.len() != region_ids.len() {
let unmigrated_region_ids = region_routes.iter().map(|route| route.region.id).collect();
info!(
"Some of the regions have been migrated, only migrate the following regions: {:?}, to_peer: {:?}",
unmigrated_region_ids, to_peer
);
ctx.persistent_ctx.region_ids = unmigrated_region_ids;
}
// Checks if any of the region leaders is not the `from_peer`.
for region_route in &region_routes {
if self.invalid_leader_peer(region_route, from_peer)? {
info!(
"Abort region migration, region:{}, unexpected leader peer: {:?}, expected: {:?}",
region_route.region.id, region_route.leader_peer, from_peer,
);
return Ok((
Box::new(RegionMigrationAbort::new(&format!(
"Invalid region leader peer: {:?}, expected: {:?}",
region_route.leader_peer.as_ref().unwrap(),
from_peer,
))),
Status::done(),
));
}
}
// If all checks pass, open the candidate region.
Ok((Box::new(OpenCandidateRegion), Status::executing(true)))
}
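
The start state above reduces to a three-way decision: end the procedure if everything already lives on `to_peer`, abort if any remaining leader is not the expected `from_peer`, otherwise continue with only the unmigrated regions. A simplified, self-contained sketch of that decision; the route shape and peer ids are illustrative.

```rust
// Simplified view of a region route for the decision sketch.
type PeerId = u64;
type RegionId = u64;

#[derive(Debug, Clone)]
struct RouteView {
    region_id: RegionId,
    leader: Option<PeerId>,
}

#[derive(Debug)]
enum NextState {
    /// Every region already lives on `to_peer`: nothing to do.
    MigrationEnd,
    /// Some remaining region's leader is not `from_peer`: abort with a reason.
    Abort(String),
    /// Open the candidate regions for the remaining (unmigrated) region ids.
    OpenCandidateRegions(Vec<RegionId>),
}

fn decide(routes: &[RouteView], from_peer: PeerId, to_peer: PeerId) -> NextState {
    // Keep only the regions whose leader is not yet `to_peer`.
    let unmigrated: Vec<&RouteView> = routes
        .iter()
        .filter(|route| route.leader != Some(to_peer))
        .collect();
    if unmigrated.is_empty() {
        return NextState::MigrationEnd;
    }
    // Abort if any remaining region is led by an unexpected peer.
    for route in &unmigrated {
        if route.leader != Some(from_peer) {
            return NextState::Abort(format!(
                "region {} has unexpected leader {:?}, expected {}",
                route.region_id, route.leader, from_peer
            ));
        }
    }
    NextState::OpenCandidateRegions(unmigrated.iter().map(|route| route.region_id).collect())
}

fn main() {
    let routes = vec![
        RouteView { region_id: 1, leader: Some(1) },
        RouteView { region_id: 2, leader: Some(2) },
    ];
    println!("{:?}", decide(&routes, 1, 2));
}
```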
fn as_any(&self) -> &dyn Any {
@@ -90,7 +104,7 @@ impl State for RegionMigrationStart {
}
impl RegionMigrationStart {
/// Retrieves region route.
/// Retrieves region routes for multiple regions.
///
/// Abort(non-retry):
/// - TableRoute is not found.
@@ -98,39 +112,32 @@ impl RegionMigrationStart {
///
/// Retry:
/// - Failed to retrieve the metadata of table.
async fn retrieve_region_route(
&self,
ctx: &mut Context,
region_id: RegionId,
) -> Result<RegionRoute> {
let table_id = region_id.table_id();
let table_route = ctx.get_table_route_value().await?;
async fn retrieve_region_routes(&self, ctx: &mut Context) -> Result<Vec<RegionRoute>> {
let region_ids = &ctx.persistent_ctx.region_ids;
let table_route_values = ctx.get_table_route_values().await?;
let mut region_routes = Vec::with_capacity(region_ids.len());
for region_id in region_ids {
let table_id = region_id.table_id();
let region_route = table_route_values
.get(&table_id)
.context(error::TableRouteNotFoundSnafu { table_id })?
.region_routes()
.with_context(|_| error::UnexpectedLogicalRouteTableSnafu {
err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."),
})?
.iter()
.find(|route| route.region.id == *region_id)
.cloned()
.with_context(|| error::UnexpectedSnafu {
violated: format!(
"RegionRoute({}) is not found in TableRoute({})",
region_id, table_id
),
})?;
region_routes.push(region_route);
}
let region_route = table_route
.region_routes()
.context(error::UnexpectedLogicalRouteTableSnafu {
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
})?
.iter()
.find(|route| route.region.id == region_id)
.cloned()
.context(error::UnexpectedSnafu {
violated: format!(
"RegionRoute({}) is not found in TableRoute({})",
region_id, table_id
),
})?;
Ok(region_route)
}
/// Checks whether the candidate region on region has been opened.
/// Returns true if it's been opened.
fn check_candidate_region_on_peer(&self, region_route: &RegionRoute, to_peer: &Peer) -> bool {
region_route
.follower_peers
.iter()
.any(|peer| peer.id == to_peer.id)
Ok(region_routes)
}
/// Returns true if the region leader is not the `from_peer`.
@@ -143,7 +150,7 @@ impl RegionMigrationStart {
let is_invalid_leader_peer = region_route
.leader_peer
.as_ref()
.context(error::UnexpectedSnafu {
.with_context(|| error::UnexpectedSnafu {
violated: format!("Leader peer is not found in TableRoute({})", region_id),
})?
.id
@@ -151,6 +158,12 @@ impl RegionMigrationStart {
Ok(is_invalid_leader_peer)
}
/// Filters out the regions that have already been migrated, keeping only the unmigrated ones.
fn filter_unmigrated_regions(&self, region_routes: &mut Vec<RegionRoute>, to_peer: &Peer) {
region_routes
.retain(|region_route| !self.has_migrated(region_route, to_peer).unwrap_or(false));
}
/// Checks whether the region has been migrated.
/// Returns true if it has been.
///
@@ -162,7 +175,7 @@ impl RegionMigrationStart {
let region_migrated = region_route
.leader_peer
.as_ref()
.context(error::UnexpectedSnafu {
.with_context(|| error::UnexpectedSnafu {
violated: format!("Leader peer is not found in TableRoute({})", region_id),
})?
.id
@@ -173,6 +186,7 @@ impl RegionMigrationStart {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use common_meta::key::test_utils::new_test_table_info;
@@ -183,7 +197,6 @@ mod tests {
use super::*;
use crate::error::Error;
use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context};
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{ContextFactory, PersistentContext};
fn new_persistent_context() -> PersistentContext {
@@ -196,14 +209,8 @@ mod tests {
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let err = state
.retrieve_region_route(&mut ctx, RegionId::new(1024, 1))
.await
.unwrap_err();
let err = state.retrieve_region_routes(&mut ctx).await.unwrap_err();
assert_matches!(err, Error::TableRouteNotFound { .. });
assert!(!err.is_retryable());
}
@@ -216,56 +223,20 @@ mod tests {
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_info = new_test_table_info(1024, vec![1]).into();
let table_info = new_test_table_info(1024, vec![3]).into();
let region_route = RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
region: Region::new_test(RegionId::new(1024, 3)),
leader_peer: Some(from_peer.clone()),
..Default::default()
};
env.create_physical_table_metadata(table_info, vec![region_route])
.await;
let err = state
.retrieve_region_route(&mut ctx, RegionId::new(1024, 3))
.await
.unwrap_err();
let err = state.retrieve_region_routes(&mut ctx).await.unwrap_err();
assert_matches!(err, Error::Unexpected { .. });
assert!(!err.is_retryable());
}
#[tokio::test]
async fn test_next_update_metadata_downgrade_state() {
let mut state = Box::new(RegionMigrationStart);
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let from_peer_id = persistent_context.from_peer.id;
let to_peer = persistent_context.to_peer.clone();
let region_id = persistent_context.region_id;
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(Peer::empty(from_peer_id)),
follower_peers: vec![to_peer],
..Default::default()
}];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let procedure_ctx = new_procedure_context();
let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap();
let update_metadata = next.as_any().downcast_ref::<UpdateMetadata>().unwrap();
assert_matches!(update_metadata, UpdateMetadata::Downgrade);
}
#[tokio::test]
async fn test_next_migration_end_state() {
let mut state = Box::new(RegionMigrationStart);
@@ -274,7 +245,7 @@ mod tests {
let persistent_context = new_persistent_context();
let to_peer = persistent_context.to_peer.clone();
let from_peer = persistent_context.from_peer.clone();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
@@ -302,7 +273,7 @@ mod tests {
// to_peer: 2
let persistent_context = new_persistent_context();
let from_peer_id = persistent_context.from_peer.id;
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
@@ -327,12 +298,12 @@ mod tests {
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
let region_routes: Vec<RegionRoute> = vec![RegionRoute {
region: Region::new_test(region_id),
leader_peer: Some(Peer::empty(1024)),
..Default::default()


@@ -13,6 +13,7 @@
// limitations under the License.
use std::any::Any;
use std::ops::Div;
use std::time::Duration;
use api::v1::meta::MailboxMessage;
@@ -65,33 +66,43 @@ impl OpenCandidateRegion {
/// Abort(non-retry):
/// - Datanode Table is not found.
async fn build_open_region_instruction(&self, ctx: &mut Context) -> Result<Instruction> {
let pc = &ctx.persistent_ctx;
let table_id = pc.region_id.table_id();
let region_number = pc.region_id.region_number();
let candidate_id = pc.to_peer.id;
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?;
let region_ids = ctx.persistent_ctx.region_ids.clone();
let from_peer_id = ctx.persistent_ctx.from_peer.id;
let to_peer_id = ctx.persistent_ctx.to_peer.id;
let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?;
let mut open_regions = Vec::with_capacity(region_ids.len());
let RegionInfo {
region_storage_path,
region_options,
region_wal_options,
engine,
} = datanode_table_value.region_info.clone();
let open_instruction = Instruction::OpenRegions(vec![OpenRegion::new(
RegionIdent {
datanode_id: candidate_id,
table_id,
region_number,
for region_id in region_ids {
let table_id = region_id.table_id();
let region_number = region_id.region_number();
let datanode_table_value = datanode_table_values.get(&table_id).context(
error::DatanodeTableNotFoundSnafu {
table_id,
datanode_id: from_peer_id,
},
)?;
let RegionInfo {
region_storage_path,
region_options,
region_wal_options,
engine,
},
&region_storage_path,
region_options,
region_wal_options,
true,
)]);
} = datanode_table_value.region_info.clone();
Ok(open_instruction)
open_regions.push(OpenRegion::new(
RegionIdent {
datanode_id: to_peer_id,
table_id,
region_number,
engine,
},
&region_storage_path,
region_options,
region_wal_options,
true,
));
}
Ok(Instruction::OpenRegions(open_regions))
}
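
Each region id is expanded into one open-region entry, using the per-table region info fetched from the `from_peer`'s datanode table values. A hedged sketch of that fan-out with simplified stand-in types follows; the field names, including `skip_wal_replay`, are assumptions for illustration only.

```rust
// Fan-out of open-region entries grouped by table; types are stand-ins.
use std::collections::HashMap;

type TableId = u32;
type RegionNumber = u32;
type RegionId = (TableId, RegionNumber);

#[derive(Debug, Clone)]
struct RegionInfo {
    engine: String,
    region_storage_path: String,
}

#[derive(Debug)]
struct OpenRegion {
    datanode_id: u64,
    table_id: TableId,
    region_number: RegionNumber,
    engine: String,
    region_storage_path: String,
    skip_wal_replay: bool,
}

fn build_open_regions(
    to_peer_id: u64,
    region_ids: &[RegionId],
    table_infos: &HashMap<TableId, RegionInfo>,
) -> Result<Vec<OpenRegion>, String> {
    let mut open_regions = Vec::with_capacity(region_ids.len());
    for (table_id, region_number) in region_ids {
        // Abort (non-retryable) when the datanode table value is missing.
        let info = table_infos
            .get(table_id)
            .ok_or_else(|| format!("datanode table not found for table {table_id}"))?;
        open_regions.push(OpenRegion {
            datanode_id: to_peer_id,
            table_id: *table_id,
            region_number: *region_number,
            engine: info.engine.clone(),
            region_storage_path: info.region_storage_path.clone(),
            skip_wal_replay: true,
        });
    }
    Ok(open_regions)
}

fn main() {
    let mut infos = HashMap::new();
    infos.insert(1024, RegionInfo { engine: "mito".into(), region_storage_path: "data/".into() });
    println!("{:?}", build_open_regions(2, &[(1024, 1), (1024, 2)], &infos));
}
```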
/// Opens the candidate region.
@@ -111,25 +122,27 @@ impl OpenCandidateRegion {
) -> Result<()> {
let pc = &ctx.persistent_ctx;
let vc = &mut ctx.volatile_ctx;
let region_id = pc.region_id;
let region_ids = &pc.region_ids;
let candidate = &pc.to_peer;
// This method might be invoked multiple times.
// Only registers the guard if `opening_region_guard` is absent.
if vc.opening_region_guard.is_none() {
// Registers the opening region.
let guard = ctx
.opening_region_keeper
.register(candidate.id, region_id)
.context(error::RegionOpeningRaceSnafu {
peer_id: candidate.id,
region_id,
})?;
vc.opening_region_guard = Some(guard);
if vc.opening_region_guards.is_empty() {
for region_id in region_ids {
// Registers the opening region.
let guard = ctx
.opening_region_keeper
.register(candidate.id, *region_id)
.context(error::RegionOpeningRaceSnafu {
peer_id: candidate.id,
region_id: *region_id,
})?;
vc.opening_region_guards.push(guard);
}
}
let msg = MailboxMessage::json_message(
&format!("Open candidate region: {}", region_id),
&format!("Open candidate regions: {:?}", region_ids),
&format!("Metasrv@{}", ctx.server_addr()),
&format!("Datanode-{}@{}", candidate.id, candidate.addr),
common_time::util::current_time_millis(),
@@ -139,20 +152,23 @@ impl OpenCandidateRegion {
input: open_instruction.to_string(),
})?;
let operation_timeout =
ctx.next_operation_timeout()
.context(error::ExceededDeadlineSnafu {
operation: "Open candidate region",
})?;
let operation_timeout = operation_timeout.div(2).max(OPEN_CANDIDATE_REGION_TIMEOUT);
let ch = Channel::Datanode(candidate.id);
let now = Instant::now();
let receiver = ctx
.mailbox
.send(&ch, msg, OPEN_CANDIDATE_REGION_TIMEOUT)
.await?;
let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?;
match receiver.await {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!(
"Received open region reply: {:?}, region: {}, elapsed: {:?}",
"Received open region reply: {:?}, region: {:?}, elapsed: {:?}",
reply,
region_id,
region_ids,
now.elapsed()
);
let InstructionReply::OpenRegions(SimpleReply { result, error }) = reply else {
@@ -168,7 +184,7 @@ impl OpenCandidateRegion {
} else {
error::RetryLaterSnafu {
reason: format!(
"Region {region_id} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}",
"Region {region_ids:?} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}",
candidate,
now.elapsed()
),
@@ -178,7 +194,7 @@ impl OpenCandidateRegion {
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
"Mailbox received timeout for open candidate region {region_id} on datanode {:?}, elapsed: {:?}",
"Mailbox received timeout for open candidate region {region_ids:?} on datanode {:?}, elapsed: {:?}",
candidate,
now.elapsed()
);
@@ -251,7 +267,7 @@ mod tests {
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let to_peer_id = persistent_context.to_peer.id;
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
@@ -272,7 +288,7 @@ mod tests {
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let to_peer_id = persistent_context.to_peer.id;
let env = TestingEnv::new();
@@ -298,7 +314,7 @@ mod tests {
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let to_peer_id = persistent_context.to_peer.id;
let mut env = TestingEnv::new();
@@ -331,7 +347,7 @@ mod tests {
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let to_peer_id = persistent_context.to_peer.id;
let mut env = TestingEnv::new();
@@ -366,7 +382,7 @@ mod tests {
// from_peer: 1
// to_peer: 2
let persistent_context = new_persistent_context();
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let to_peer_id = persistent_context.to_peer.id;
let mut env = TestingEnv::new();
@@ -406,14 +422,14 @@ mod tests {
// to_peer: 2
let persistent_context = new_persistent_context();
let from_peer_id = persistent_context.from_peer.id;
let region_id = persistent_context.region_id;
let region_id = persistent_context.region_ids[0];
let to_peer_id = persistent_context.to_peer.id;
let mut env = TestingEnv::new();
// Prepares table
let table_info = new_test_table_info(1024, vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(persistent_context.region_id),
region: Region::new_test(region_id),
leader_peer: Some(Peer::empty(from_peer_id)),
..Default::default()
}];
@@ -441,10 +457,7 @@ mod tests {
let procedure_ctx = new_procedure_context();
let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap();
let vc = ctx.volatile_ctx;
assert_eq!(
vc.opening_region_guard.unwrap().info(),
(to_peer_id, region_id)
);
assert_eq!(vc.opening_region_guards[0].info(), (to_peer_id, region_id));
let flush_leader_region = next.as_any().downcast_ref::<PreFlushRegion>().unwrap();
assert_matches!(flush_leader_region, PreFlushRegion);


@@ -190,7 +190,7 @@ pub fn new_persistent_context(from: u64, to: u64, region_id: RegionId) -> Persis
schema: "public".into(),
from_peer: Peer::empty(from),
to_peer: Peer::empty(to),
region_id,
region_ids: vec![region_id],
timeout: Duration::from_secs(10),
trigger_reason: RegionMigrationTriggerReason::default(),
}
@@ -306,37 +306,38 @@ impl ProcedureMigrationTestSuite {
/// Verifies table metadata after region migration.
pub(crate) async fn verify_table_metadata(&self) {
let region_id = self.context.persistent_ctx.region_id;
let table_route = self
.env
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get(region_id.table_id())
.await
.unwrap()
.unwrap();
let region_routes = table_route.region_routes().unwrap();
for region_id in &self.context.persistent_ctx.region_ids {
let table_route = self
.env
.table_metadata_manager
.table_route_manager()
.table_route_storage()
.get(region_id.table_id())
.await
.unwrap()
.unwrap();
let region_routes = table_route.region_routes().unwrap();
let expected_leader_id = self.context.persistent_ctx.to_peer.id;
let removed_follower_id = self.context.persistent_ctx.from_peer.id;
let expected_leader_id = self.context.persistent_ctx.to_peer.id;
let removed_follower_id = self.context.persistent_ctx.from_peer.id;
let region_route = region_routes
.iter()
.find(|route| route.region.id == region_id)
.unwrap();
assert!(!region_route.is_leader_downgrading());
assert_eq!(
region_route.leader_peer.as_ref().unwrap().id,
expected_leader_id
);
assert!(
!region_route
.follower_peers
let region_route = region_routes
.iter()
.any(|route| route.id == removed_follower_id)
)
.find(|route| route.region.id == *region_id)
.unwrap();
assert!(!region_route.is_leader_downgrading());
assert_eq!(
region_route.leader_peer.as_ref().unwrap().id,
expected_leader_id
);
assert!(
!region_route
.follower_peers
.iter()
.any(|route| route.id == removed_follower_id)
)
}
}
}


@@ -18,7 +18,6 @@ pub(crate) mod upgrade_candidate_region;
use std::any::Any;
use common_meta::lock_key::TableLock;
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::warn;
use serde::{Deserialize, Serialize};
@@ -48,12 +47,10 @@ impl State for UpdateMetadata {
ctx: &mut Context,
procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
let table_id = TableLock::Write(ctx.region_id().table_id()).into();
let _guard = procedure_ctx.provider.acquire_lock(&table_id).await;
match self {
UpdateMetadata::Downgrade => {
self.downgrade_leader_region(ctx).await?;
self.downgrade_leader_region(ctx, &procedure_ctx.provider)
.await?;
Ok((
Box::<DowngradeLeaderRegion>::default(),
@@ -61,7 +58,8 @@ impl State for UpdateMetadata {
))
}
UpdateMetadata::Upgrade => {
self.upgrade_candidate_region(ctx).await?;
self.upgrade_candidate_region(ctx, &procedure_ctx.provider)
.await?;
if let Err(err) = ctx.invalidate_table_cache().await {
warn!(
@@ -71,7 +69,8 @@ impl State for UpdateMetadata {
Ok((Box::new(CloseDowngradedRegion), Status::executing(false)))
}
UpdateMetadata::Rollback => {
self.rollback_downgraded_region(ctx).await?;
self.rollback_downgraded_region(ctx, &procedure_ctx.provider)
.await?;
if let Err(err) = ctx.invalidate_table_cache().await {
warn!(


@@ -13,7 +13,10 @@
// limitations under the License.
use common_error::ext::BoxedError;
use common_meta::lock_key::TableLock;
use common_meta::rpc::router::LeaderState;
use common_procedure::ContextProviderRef;
use common_telemetry::{error, info};
use snafu::ResultExt;
use crate::error::{self, Result};
@@ -37,40 +40,48 @@ impl UpdateMetadata {
/// It will only update **other region** info. Therefore, it's safe to retry after failure.
///
/// - There is no other DDL procedure executed concurrently for the current table.
pub async fn downgrade_leader_region(&self, ctx: &mut Context) -> Result<()> {
pub async fn downgrade_leader_region(
&self,
ctx: &mut Context,
ctx_provider: &ContextProviderRef,
) -> Result<()> {
let table_metadata_manager = ctx.table_metadata_manager.clone();
let from_peer_id = ctx.persistent_ctx.from_peer.id;
let region_id = ctx.region_id();
let table_id = region_id.table_id();
let current_table_route_value = ctx.get_table_route_value().await?;
let table_regions = ctx.persistent_ctx.table_regions();
// TODO(weny): ensures the leader region peer is the `from_peer`.
if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, current_table_route_value, |route| {
if route.region.id == region_id
&& route
.leader_peer
.as_ref()
.is_some_and(|leader_peer| leader_peer.id == from_peer_id)
{
Some(Some(LeaderState::Downgrading))
} else {
None
}
})
.await
.context(error::TableMetadataManagerSnafu)
{
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!(
"Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}"
),
});
for (table_id, regions) in table_regions {
let table_lock = TableLock::Write(table_id).into();
let _guard = ctx_provider.acquire_lock(&table_lock).await;
let current_table_route_value = ctx.get_table_route_value(table_id).await?;
if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, &current_table_route_value, |route| {
if regions.contains(&route.region.id)
&& route
.leader_peer
.as_ref()
.is_some_and(|leader_peer| leader_peer.id == from_peer_id)
{
Some(Some(LeaderState::Downgrading))
} else {
None
}
})
.await
.context(error::TableMetadataManagerSnafu)
{
error!(err; "Failed to update the table route during the downgrading leader region, regions: {regions:?}, from_peer_id: {from_peer_id}");
return Err(BoxedError::new(err)).with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!(
"Failed to update the table route during the downgrading leader region, regions: {regions:?}, from_peer_id: {from_peer_id}"
),
});
}
info!(
"Downgrading leader region table route success, table_id: {table_id}, regions: {regions:?}, from_peer_id: {from_peer_id}"
);
}
ctx.remove_table_route_value();
Ok(())
}
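
The downgrade path now groups the migrating regions by table, so each table route can be updated under its own `TableLock::Write`. A minimal sketch of the grouping that a helper like `table_regions()` presumably performs; the helper name and types are taken loosely from the diff.

```rust
// Bucket region ids by table id so each table route is updated under its own lock.
use std::collections::BTreeMap;

type TableId = u32;
type RegionId = (TableId, u32); // (table_id, region_number)

fn table_regions(region_ids: &[RegionId]) -> BTreeMap<TableId, Vec<RegionId>> {
    let mut grouped: BTreeMap<TableId, Vec<RegionId>> = BTreeMap::new();
    for region_id in region_ids {
        grouped.entry(region_id.0).or_default().push(*region_id);
    }
    grouped
}

fn main() {
    let regions = [(1024, 1), (1024, 2), (2048, 1)];
    for (table_id, regions) in table_regions(&regions) {
        // In the procedure, a TableLock::Write(table_id) would be acquired here
        // before mutating that table's route value.
        println!("table {table_id}: {regions:?}");
    }
}
```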
}
@@ -78,10 +89,13 @@ impl UpdateMetadata {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::collections::HashMap;
use std::sync::Arc;
use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer;
use common_meta::rpc::router::{LeaderState, Region, RegionRoute};
use common_meta::rpc::router::{Region, RegionRoute};
use common_procedure_test::MockContextProvider;
use store_api::storage::RegionId;
use crate::error::Error;
@@ -107,71 +121,18 @@ mod tests {
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let provider = Arc::new(MockContextProvider::new(HashMap::new())) as _;
let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err();
let err = state
.downgrade_leader_region(&mut ctx, &provider)
.await
.unwrap_err();
assert_matches!(err, Error::TableRouteNotFound { .. });
assert!(!err.is_retryable());
}
#[tokio::test]
async fn test_failed_to_update_table_route_error() {
let state = UpdateMetadata::Downgrade;
let persistent_context = new_persistent_context();
let from_peer = persistent_context.from_peer.clone();
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_id = ctx.region_id().table_id();
let table_info = new_test_table_info(1024, vec![1, 2]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
leader_peer: Some(from_peer.clone()),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 2)),
leader_peer: Some(Peer::empty(4)),
..Default::default()
},
];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let original_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &original_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
Some(Some(LeaderState::Downgrading))
} else {
None
}
})
.await
.unwrap();
// sets the old table route.
ctx.volatile_ctx.table_route = Some(original_table_route);
let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
}
#[tokio::test]
async fn test_only_downgrade_from_peer() {
let mut state = Box::new(UpdateMetadata::Downgrade);
@@ -179,7 +140,7 @@ mod tests {
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_id = ctx.region_id().table_id();
let table_id = ctx.persistent_ctx.region_ids[0].table_id();
let table_info = new_test_table_info(1024, vec![1, 2]).into();
let region_routes = vec![RegionRoute {
@@ -212,7 +173,6 @@ mod tests {
// It should remain unchanged.
assert_eq!(latest_table_route.version().unwrap(), 0);
assert!(!latest_table_route.region_routes().unwrap()[0].is_leader_downgrading());
assert!(ctx.volatile_ctx.table_route.is_none());
}
#[tokio::test]
@@ -223,7 +183,7 @@ mod tests {
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_id = ctx.region_id().table_id();
let table_id = ctx.persistent_ctx.region_ids[0].table_id();
let table_info = new_test_table_info(1024, vec![1, 2]).into();
let region_routes = vec![RegionRoute {
@@ -254,6 +214,5 @@ mod tests {
.unwrap();
assert!(latest_table_route.region_routes().unwrap()[0].is_leader_downgrading());
assert!(ctx.volatile_ctx.table_route.is_none());
}
}


@@ -13,6 +13,9 @@
// limitations under the License.
use common_error::ext::BoxedError;
use common_meta::lock_key::TableLock;
use common_procedure::ContextProviderRef;
use common_telemetry::{error, info};
use snafu::ResultExt;
use crate::error::{self, Result};
@@ -28,31 +31,40 @@ impl UpdateMetadata {
/// Retry:
/// - Failed to update [TableRouteValue](common_meta::key::table_region::TableRegionValue).
/// - Failed to retrieve the metadata of table.
pub async fn rollback_downgraded_region(&self, ctx: &mut Context) -> Result<()> {
pub async fn rollback_downgraded_region(
&self,
ctx: &mut Context,
ctx_provider: &ContextProviderRef,
) -> Result<()> {
let table_metadata_manager = ctx.table_metadata_manager.clone();
let region_id = ctx.region_id();
let table_id = region_id.table_id();
let current_table_route_value = ctx.get_table_route_value().await?;
let table_regions = ctx.persistent_ctx.table_regions();
if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, current_table_route_value, |route| {
if route.region.id == region_id {
Some(None)
} else {
None
}
})
.await
.context(error::TableMetadataManagerSnafu)
{
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"),
});
for (table_id, regions) in table_regions {
let table_lock = TableLock::Write(table_id).into();
let _guard = ctx_provider.acquire_lock(&table_lock).await;
let current_table_route_value = ctx.get_table_route_value(table_id).await?;
if let Err(err) = table_metadata_manager
.update_leader_region_status(table_id, &current_table_route_value, |route| {
if regions.contains(&route.region.id) {
Some(None)
} else {
None
}
})
.await
.context(error::TableMetadataManagerSnafu)
{
error!(err; "Failed to update the table route during the rollback downgraded leader regions: {regions:?}");
return Err(BoxedError::new(err)).with_context(|_| error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the rollback downgraded leader regions: {regions:?}"),
});
}
info!(
"Rolling back downgraded leader region table route success, table_id: {table_id}, regions: {regions:?}"
);
}
ctx.register_failure_detectors().await;
ctx.remove_table_route_value();
Ok(())
}
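
Rollback clears the downgrading marker only for routes that belong to the migrating regions, leaving every other route untouched. A sketch of that callback's semantics, where `None` means "no change" and `Some(None)` means "clear the leader state"; the types are stand-ins.

```rust
// Rollback decision per route; the Option-of-Option mirrors the update callback above.
use std::collections::HashSet;

type RegionId = u64;

#[derive(Debug, Clone, PartialEq)]
enum LeaderState {
    Downgrading,
}

#[derive(Debug)]
struct RegionRoute {
    region_id: RegionId,
    leader_state: Option<LeaderState>,
}

/// `None` means "do not touch this route"; `Some(None)` means "clear the state".
fn rollback_decision(route: &RegionRoute, migrating: &HashSet<RegionId>) -> Option<Option<LeaderState>> {
    if migrating.contains(&route.region_id) {
        Some(None)
    } else {
        None
    }
}

fn main() {
    let migrating: HashSet<_> = [1u64].into_iter().collect();
    let mut routes = vec![
        RegionRoute { region_id: 1, leader_state: Some(LeaderState::Downgrading) },
        RegionRoute { region_id: 2, leader_state: Some(LeaderState::Downgrading) },
    ];
    for route in &mut routes {
        if let Some(new_state) = rollback_decision(route, &migrating) {
            route.leader_state = new_state;
        }
    }
    assert_eq!(routes[0].leader_state, None);
    assert_eq!(routes[1].leader_state, Some(LeaderState::Downgrading));
}
```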
@@ -61,11 +73,13 @@ impl UpdateMetadata {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::collections::HashMap;
use std::sync::Arc;
use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer;
use common_meta::rpc::router::{LeaderState, Region, RegionRoute};
use common_procedure_test::MockContextProvider;
use store_api::storage::RegionId;
use crate::error::Error;
@@ -73,7 +87,6 @@ mod tests {
use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context};
use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{ContextFactory, PersistentContext, State};
use crate::region::supervisor::RegionFailureDetectorControl;
fn new_persistent_context() -> PersistentContext {
test_util::new_persistent_context(1, 2, RegionId::new(1024, 1))
@@ -86,108 +99,17 @@ mod tests {
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err();
let provider = Arc::new(MockContextProvider::new(HashMap::new())) as _;
let err = state
.rollback_downgraded_region(&mut ctx, &provider)
.await
.unwrap_err();
assert_matches!(err, Error::TableRouteNotFound { .. });
assert!(!err.is_retryable());
}
#[tokio::test]
async fn test_update_table_route_with_retry() {
let state = UpdateMetadata::Rollback;
let persistent_context = new_persistent_context();
let from_peer = persistent_context.from_peer.clone();
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
ctx.region_failure_detector_controller = Arc::new(RegionFailureDetectorControl::new(tx));
let table_id = ctx.region_id().table_id();
let table_info = new_test_table_info(1024, vec![1, 2, 3]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(1024, 1)),
leader_peer: Some(from_peer.clone()),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 2)),
leader_peer: Some(Peer::empty(4)),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
RegionRoute {
region: Region::new_test(RegionId::new(1024, 3)),
leader_peer: Some(Peer::empty(5)),
..Default::default()
},
];
let expected_region_routes = {
let mut region_routes = region_routes.clone();
region_routes[0].leader_state = None;
region_routes[1].leader_state = None;
region_routes
};
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let old_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &old_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
Some(None)
} else {
None
}
})
.await
.unwrap();
ctx.volatile_ctx.table_route = Some(old_table_route);
let err = state
.rollback_downgraded_region(&mut ctx)
.await
.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
assert_eq!(rx.len(), 0);
state.rollback_downgraded_region(&mut ctx).await.unwrap();
let event = rx.try_recv().unwrap();
let detecting_regions = event.into_region_failure_detectors();
assert_eq!(
detecting_regions,
vec![(from_peer.id, ctx.persistent_ctx.region_id)]
);
let table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get(table_id)
.await
.unwrap()
.unwrap();
assert_eq!(
&expected_region_routes,
table_route.region_routes().unwrap()
);
}
#[tokio::test]
async fn test_next_migration_end_state() {
let mut state = Box::new(UpdateMetadata::Rollback);
@@ -196,7 +118,7 @@ mod tests {
let env = TestingEnv::new();
let mut ctx = env.context_factory().new_context(persistent_context);
let table_id = ctx.region_id().table_id();
let table_id = ctx.persistent_ctx.region_ids[0].table_id();
let table_info = new_test_table_info(1024, vec![1, 2, 3]).into();
let region_routes = vec![
@@ -238,8 +160,6 @@ mod tests {
.downcast_ref::<RegionMigrationAbort>()
.unwrap();
assert!(ctx.volatile_ctx.table_route.is_none());
let table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()


@@ -14,9 +14,12 @@
use common_error::ext::BoxedError;
use common_meta::key::datanode_table::RegionInfo;
use common_meta::lock_key::TableLock;
use common_meta::rpc::router::{RegionRoute, region_distribution};
use common_telemetry::{info, warn};
use common_procedure::ContextProviderRef;
use common_telemetry::{error, info, warn};
use snafu::{OptionExt, ResultExt, ensure};
use store_api::storage::RegionId;
use crate::error::{self, Result};
use crate::procedure::region_migration::Context;
@@ -24,104 +27,114 @@ use crate::procedure::region_migration::update_metadata::UpdateMetadata;
impl UpdateMetadata {
/// Returns new [Vec<RegionRoute>].
async fn build_upgrade_candidate_region_metadata(
fn build_upgrade_candidate_region_metadata(
&self,
ctx: &mut Context,
region_ids: &[RegionId],
mut region_routes: Vec<RegionRoute>,
) -> Result<Vec<RegionRoute>> {
let region_id = ctx.region_id();
let table_route_value = ctx.get_table_route_value().await?.clone();
let old_leader_peer = &ctx.persistent_ctx.from_peer;
let new_leader_peer = &ctx.persistent_ctx.to_peer;
for region_id in region_ids {
// Find the RegionRoute for this region_id.
let region_route = region_routes
.iter_mut()
.find(|route| route.region.id == *region_id)
.context(error::RegionRouteNotFoundSnafu {
region_id: *region_id,
})?;
let mut region_routes = table_route_value
.region_routes()
.context(error::UnexpectedLogicalRouteTableSnafu {
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
})?
.clone();
let region_route = region_routes
.iter_mut()
.find(|route| route.region.id == region_id)
.context(error::RegionRouteNotFoundSnafu { region_id })?;
// Remove any "downgraded leader" state.
region_route.set_leader_state(None);
// Removes downgraded status.
region_route.set_leader_state(None);
let candidate = &ctx.persistent_ctx.to_peer;
let expected_old_leader = &ctx.persistent_ctx.from_peer;
// Upgrades candidate to leader.
ensure!(
region_route
.leader_peer
.take_if(|old_leader| old_leader.id == expected_old_leader.id)
.is_some(),
error::UnexpectedSnafu {
violated: format!(
"Unexpected region leader: {:?} during the upgrading candidate metadata, expected: {:?}",
region_route.leader_peer, expected_old_leader
),
}
);
region_route.leader_peer = Some(candidate.clone());
info!(
"Upgrading candidate region to leader region: {:?} for region: {}",
candidate, region_id
);
// Removes the candidate region in followers.
let removed = region_route
.follower_peers
.extract_if(.., |peer| peer.id == candidate.id)
.collect::<Vec<_>>();
if removed.len() > 1 {
warn!(
"Removes duplicated regions: {removed:?} during the upgrading candidate metadata for region: {region_id}"
);
}
Ok(region_routes)
}
/// Returns true if region metadata has been updated.
async fn check_metadata_updated(&self, ctx: &mut Context) -> Result<bool> {
let region_id = ctx.region_id();
let table_route_value = ctx.get_table_route_value().await?.clone();
let region_routes = table_route_value
.region_routes()
.context(error::UnexpectedLogicalRouteTableSnafu {
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
})?
.clone();
let region_route = region_routes
.into_iter()
.find(|route| route.region.id == region_id)
.context(error::RegionRouteNotFoundSnafu { region_id })?;
let leader_peer = region_route
.leader_peer
.as_ref()
.context(error::UnexpectedSnafu {
violated: format!("The leader peer of region {region_id} is not found during the update metadata for upgrading"),
})?;
let candidate_peer_id = ctx.persistent_ctx.to_peer.id;
if leader_peer.id == candidate_peer_id {
// Check old leader matches expectation before upgrading to new leader.
ensure!(
!region_route.is_leader_downgrading(),
region_route
.leader_peer
.take_if(|old_leader| old_leader.id == old_leader_peer.id)
.is_some(),
error::UnexpectedSnafu {
violated: format!(
"Unexpected intermediate state is found during the update metadata for upgrading region {region_id}"
"Unexpected region leader: {:?} during the candidate-to-leader upgrade; expected: {:?}",
region_route.leader_peer, old_leader_peer
),
}
);
Ok(true)
} else {
Ok(false)
// Set new leader.
region_route.leader_peer = Some(new_leader_peer.clone());
// Remove new leader from followers (avoids duplicate leader/follower).
let removed = region_route
.follower_peers
.extract_if(.., |peer| peer.id == new_leader_peer.id)
.collect::<Vec<_>>();
// Warn if more than one follower with the new leader id was present.
if removed.len() > 1 {
warn!(
"Removed duplicate followers: {removed:?} during candidate-to-leader upgrade for region: {region_id}"
);
}
}
info!(
"Building metadata for upgrading candidate region to new leader: {:?} for regions: {:?}",
new_leader_peer, region_ids,
);
Ok(region_routes)
}
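
For reference, the candidate-to-leader swap in this hunk reduces to four steps: clear the downgrading marker, take the leader slot only if it still holds the expected old leader, install the candidate, and drop the candidate from the follower list. Below is a minimal, self-contained sketch of that step, using simplified stand-in types rather than the PR's RegionRoute and Peer (follower cleanup uses retain instead of Vec::extract_if because the removed entries are not needed here):

#[derive(Debug, Clone, PartialEq)]
struct Peer { id: u64 }

struct Route {
    leader: Option<Peer>,
    followers: Vec<Peer>,
    downgrading: bool,
}

/// Promotes `candidate` to leader, assuming the current leader is `expected_old_leader`.
fn promote(route: &mut Route, expected_old_leader: &Peer, candidate: &Peer) -> Result<(), String> {
    // Clear any "downgrading" marker left by the downgrade step.
    route.downgrading = false;
    // Only swap if the recorded leader matches the peer we migrated away from.
    if route
        .leader
        .take_if(|old| old.id == expected_old_leader.id)
        .is_none()
    {
        return Err(format!("unexpected leader: {:?}", route.leader));
    }
    route.leader = Some(candidate.clone());
    // Drop the candidate from the follower list so it is not leader and follower at once.
    route.followers.retain(|p| p.id != candidate.id);
    Ok(())
}

fn main() {
    let mut route = Route {
        leader: Some(Peer { id: 1 }),
        followers: vec![Peer { id: 2 }, Peer { id: 3 }],
        downgrading: true,
    };
    promote(&mut route, &Peer { id: 1 }, &Peer { id: 2 }).unwrap();
    assert!(!route.downgrading);
    assert_eq!(route.leader, Some(Peer { id: 2 }));
    assert_eq!(route.followers, vec![Peer { id: 3 }]);
}
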
/// Checks whether the metadata has been upgraded for a list of regions by verifying that their
/// leader peers have been switched to the specified peer ID (`to_peer_id`) and that
/// no region is in a leader-downgrading state.
///
/// Returns:
/// - `Ok(true)` if every region's leader is the target peer and none is downgrading.
/// - `Ok(false)` if any region's leader is not yet the target peer.
/// - An error if a region route or leader peer cannot be found, or an unexpected state is detected.
fn check_metadata_updated(
&self,
ctx: &mut Context,
region_ids: &[RegionId],
region_routes: &[RegionRoute],
) -> Result<bool> {
// Iterate through each provided region ID
for region_id in region_ids {
// Find the route info for this region
let region_route = region_routes
.iter()
.find(|route| route.region.id == *region_id)
.context(error::RegionRouteNotFoundSnafu {
region_id: *region_id,
})?;
// Get the leader peer for the region, error if not found
let leader_peer = region_route.leader_peer.as_ref().with_context(|| error::UnexpectedSnafu {
violated: format!(
"The leader peer of region {region_id} is not found during the metadata upgrade check"
),
})?;
// If the leader is not the expected peer, return false (i.e., not yet upgraded)
if leader_peer.id != ctx.persistent_ctx.to_peer.id {
return Ok(false);
} else {
// If leader matches but region is in leader downgrading state, error (unexpected state)
ensure!(
!region_route.is_leader_downgrading(),
error::UnexpectedSnafu {
violated: format!(
"Unexpected intermediate state is found during the metadata upgrade check for region {region_id}"
),
}
);
}
}
// All regions' leaders match the expected peer and none is downgrading; the metadata is considered upgraded.
Ok(true)
}
/// Upgrades the candidate region.
@@ -133,57 +146,77 @@ impl UpdateMetadata {
/// Retry:
/// - Failed to update [TableRouteValue](common_meta::key::table_region::TableRegionValue).
/// - Failed to retrieve the table metadata.
pub async fn upgrade_candidate_region(&self, ctx: &mut Context) -> Result<()> {
let region_id = ctx.region_id();
pub async fn upgrade_candidate_region(
&self,
ctx: &mut Context,
ctx_provider: &ContextProviderRef,
) -> Result<()> {
let table_metadata_manager = ctx.table_metadata_manager.clone();
let table_regions = ctx.persistent_ctx.table_regions();
let from_peer_id = ctx.persistent_ctx.from_peer.id;
let to_peer_id = ctx.persistent_ctx.to_peer.id;
if self.check_metadata_updated(ctx).await? {
return Ok(());
for (table_id, region_ids) in table_regions {
let table_lock = TableLock::Write(table_id).into();
let _guard = ctx_provider.acquire_lock(&table_lock).await;
let table_route_value = ctx.get_table_route_value(table_id).await?;
let region_routes = table_route_value.region_routes().with_context(|_| {
error::UnexpectedLogicalRouteTableSnafu {
err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."),
}
})?;
if self.check_metadata_updated(ctx, &region_ids, region_routes)? {
continue;
}
let datanode_table_value = ctx.get_from_peer_datanode_table_value(table_id).await?;
let RegionInfo {
region_storage_path,
region_options,
region_wal_options,
engine,
} = datanode_table_value.region_info.clone();
let new_region_routes = self.build_upgrade_candidate_region_metadata(
ctx,
&region_ids,
region_routes.clone(),
)?;
let region_distribution = region_distribution(region_routes);
info!(
"Trying to update region routes to {:?} for table: {}",
region_distribution, table_id,
);
if let Err(err) = table_metadata_manager
.update_table_route(
table_id,
RegionInfo {
engine: engine.clone(),
region_storage_path: region_storage_path.clone(),
region_options: region_options.clone(),
region_wal_options: region_wal_options.clone(),
},
&table_route_value,
new_region_routes,
&region_options,
&region_wal_options,
)
.await
.context(error::TableMetadataManagerSnafu)
{
error!(err; "Failed to update the table route during the upgrading candidate region: {region_ids:?}, from_peer_id: {from_peer_id}");
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the upgrading candidate region: {table_id}"),
});
};
info!(
"Upgrading candidate region table route success, table_id: {table_id}, regions: {region_ids:?}, to_peer_id: {to_peer_id}"
);
}
let region_routes = self.build_upgrade_candidate_region_metadata(ctx).await?;
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?;
let RegionInfo {
region_storage_path,
region_options,
region_wal_options,
engine,
} = datanode_table_value.region_info.clone();
let table_route_value = ctx.get_table_route_value().await?;
let region_distribution = region_distribution(&region_routes);
info!(
"Trying to update region routes to {:?} for table: {}",
region_distribution,
region_id.table_id()
);
if let Err(err) = table_metadata_manager
.update_table_route(
region_id.table_id(),
RegionInfo {
engine: engine.clone(),
region_storage_path: region_storage_path.clone(),
region_options: region_options.clone(),
region_wal_options: region_wal_options.clone(),
},
table_route_value,
region_routes,
&region_options,
&region_wal_options,
)
.await
.context(error::TableMetadataManagerSnafu)
{
ctx.remove_table_route_value();
return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu {
reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"),
});
};
ctx.remove_table_route_value();
ctx.deregister_failure_detectors().await;
// Consumes the guard.
ctx.volatile_ctx.opening_region_guard.take();
ctx.volatile_ctx.opening_region_guards.clear();
Ok(())
}
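
The batch flow above iterates ctx.persistent_ctx.table_regions(), which pairs each table id with the regions being migrated on that table. Assuming the helper simply groups the flat region_ids list by table (an assumption about its behavior, not its actual implementation), the grouping could look roughly like this:

use std::collections::BTreeMap;

// Simplified stand-in for store_api::storage::RegionId, which packs a table id and a
// region number; the real type exposes table_id() and region_number() accessors.
#[derive(Clone, Copy, Debug)]
struct RegionId {
    table_id: u32,
    region_number: u32,
}

/// Groups a flat list of region ids by their table id, preserving insertion order
/// within each table. Roughly what a table_regions() helper could return.
fn table_regions(region_ids: &[RegionId]) -> Vec<(u32, Vec<RegionId>)> {
    let mut grouped: BTreeMap<u32, Vec<RegionId>> = BTreeMap::new();
    for region_id in region_ids {
        grouped.entry(region_id.table_id).or_default().push(*region_id);
    }
    grouped.into_iter().collect()
}

fn main() {
    let regions = [
        RegionId { table_id: 1024, region_number: 1 },
        RegionId { table_id: 1024, region_number: 2 },
        RegionId { table_id: 2048, region_number: 1 },
    ];
    let grouped = table_regions(&regions);
    assert_eq!(grouped.len(), 2);
    assert_eq!(grouped[0].0, 1024);
    assert_eq!(grouped[0].1.len(), 2);
    assert_eq!(grouped[0].1[1].region_number, 2);
}
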
@@ -212,16 +245,11 @@ mod tests {
#[tokio::test]
async fn test_table_route_is_not_found_error() {
let state = UpdateMetadata::Upgrade;
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let ctx = env.context_factory().new_context(persistent_context);
let err = state
.build_upgrade_candidate_region_metadata(&mut ctx)
.await
.unwrap_err();
let err = ctx.get_table_route_value(1024).await.unwrap_err();
assert_matches!(err, Error::TableRouteNotFound { .. });
assert!(!err.is_retryable());
@@ -240,13 +268,20 @@ mod tests {
leader_peer: Some(Peer::empty(4)),
..Default::default()
}];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_route_value = ctx.get_table_route_value(1024).await.unwrap();
let region_routes = table_route_value
.into_inner()
.into_physical_table_route()
.region_routes;
let err = state
.build_upgrade_candidate_region_metadata(&mut ctx)
.await
.build_upgrade_candidate_region_metadata(
&mut ctx,
&[RegionId::new(1024, 1)],
region_routes,
)
.unwrap_err();
assert_matches!(err, Error::RegionRouteNotFound { .. });
@@ -270,9 +305,17 @@ mod tests {
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_route_value = ctx.get_table_route_value(1024).await.unwrap();
let region_routes = table_route_value
.into_inner()
.into_physical_table_route()
.region_routes;
let err = state
.build_upgrade_candidate_region_metadata(&mut ctx)
.await
.build_upgrade_candidate_region_metadata(
&mut ctx,
&[RegionId::new(1024, 1)],
region_routes,
)
.unwrap_err();
assert_matches!(err, Error::Unexpected { .. });
@@ -299,9 +342,17 @@ mod tests {
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_route_value = ctx.get_table_route_value(1024).await.unwrap();
let region_routes = table_route_value
.into_inner()
.into_physical_table_route()
.region_routes;
let new_region_routes = state
.build_upgrade_candidate_region_metadata(&mut ctx)
.await
.build_upgrade_candidate_region_metadata(
&mut ctx,
&[RegionId::new(1024, 1)],
region_routes,
)
.unwrap();
assert!(!new_region_routes[0].is_leader_downgrading());
@@ -310,71 +361,6 @@ mod tests {
assert_eq!(new_region_routes[0].leader_peer.as_ref().unwrap().id, 2);
}
#[tokio::test]
async fn test_failed_to_update_table_route_error() {
let state = UpdateMetadata::Upgrade;
let env = TestingEnv::new();
let persistent_context = new_persistent_context();
let mut ctx = env.context_factory().new_context(persistent_context);
let opening_keeper = MemoryRegionKeeper::default();
let table_id = 1024;
let table_info = new_test_table_info(table_id, vec![1]).into();
let region_routes = vec![
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 1)),
leader_peer: Some(Peer::empty(1)),
follower_peers: vec![Peer::empty(5), Peer::empty(3)],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: Some(current_time_millis()),
},
RegionRoute {
region: Region::new_test(RegionId::new(table_id, 2)),
leader_peer: Some(Peer::empty(4)),
leader_state: Some(LeaderState::Downgrading),
..Default::default()
},
];
env.create_physical_table_metadata(table_info, region_routes)
.await;
let table_metadata_manager = env.table_metadata_manager();
let original_table_route = table_metadata_manager
.table_route_manager()
.table_route_storage()
.get_with_raw_bytes(table_id)
.await
.unwrap()
.unwrap();
// modifies the table route.
table_metadata_manager
.update_leader_region_status(table_id, &original_table_route, |route| {
if route.region.id == RegionId::new(1024, 2) {
// Removes the status.
Some(None)
} else {
None
}
})
.await
.unwrap();
// sets the old table route.
ctx.volatile_ctx.table_route = Some(original_table_route);
let guard = opening_keeper
.register(2, RegionId::new(table_id, 1))
.unwrap();
ctx.volatile_ctx.opening_region_guard = Some(guard);
let err = state.upgrade_candidate_region(&mut ctx).await.unwrap_err();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(ctx.volatile_ctx.opening_region_guard.is_some());
assert!(err.is_retryable());
assert!(format!("{err:?}").contains("Failed to update the table route"));
}
#[tokio::test]
async fn test_check_metadata() {
let state = UpdateMetadata::Upgrade;
@@ -394,8 +380,11 @@ mod tests {
env.create_physical_table_metadata(table_info, region_routes)
.await;
let updated = state.check_metadata_updated(&mut ctx).await.unwrap();
let table_routes = ctx.get_table_route_value(1024).await.unwrap();
let region_routes = table_routes.region_routes().unwrap();
let updated = state
.check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes)
.unwrap();
assert!(!updated);
}
@@ -419,7 +408,11 @@ mod tests {
env.create_physical_table_metadata(table_info, region_routes)
.await;
let updated = state.check_metadata_updated(&mut ctx).await.unwrap();
let table_routes = ctx.get_table_route_value(1024).await.unwrap();
let region_routes = table_routes.region_routes().unwrap();
let updated = state
.check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes)
.unwrap();
assert!(updated);
}
@@ -443,7 +436,11 @@ mod tests {
env.create_physical_table_metadata(table_info, region_routes)
.await;
let err = state.check_metadata_updated(&mut ctx).await.unwrap_err();
let table_routes = ctx.get_table_route_value(1024).await.unwrap();
let region_routes = table_routes.region_routes().unwrap();
let err = state
.check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes)
.unwrap_err();
assert_matches!(err, Error::Unexpected { .. });
assert!(err.to_string().contains("intermediate state"));
}
@@ -468,7 +465,7 @@ mod tests {
let guard = opening_keeper
.register(2, RegionId::new(table_id, 1))
.unwrap();
ctx.volatile_ctx.opening_region_guard = Some(guard);
ctx.volatile_ctx.opening_region_guards.push(guard);
env.create_physical_table_metadata(table_info, region_routes)
.await;
@@ -492,8 +489,7 @@ mod tests {
.unwrap();
let region_routes = table_route.region_routes().unwrap();
assert!(ctx.volatile_ctx.table_route.is_none());
assert!(ctx.volatile_ctx.opening_region_guard.is_none());
assert!(ctx.volatile_ctx.opening_region_guards.is_empty());
assert_eq!(region_routes.len(), 1);
assert!(!region_routes[0].is_leader_downgrading());
assert!(region_routes[0].follower_peers.is_empty());

View File

@@ -13,15 +13,19 @@
// limitations under the License.
use std::any::Any;
use std::collections::HashSet;
use std::time::Duration;
use api::v1::meta::MailboxMessage;
use common_meta::ddl::utils::parse_region_wal_options;
use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_meta::instruction::{
Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply,
};
use common_meta::key::topic_region::TopicRegionKey;
use common_meta::lock_key::RemoteWalLock;
use common_meta::wal_options_allocator::extract_topic_from_wal_options;
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::{error, warn};
use common_telemetry::{error, info};
use common_wal::options::WalOptions;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt, ensure};
@@ -64,17 +68,9 @@ impl State for UpgradeCandidateRegion {
) -> Result<(Box<dyn State>, Status)> {
let now = Instant::now();
let region_wal_option = self.get_region_wal_option(ctx).await?;
let region_id = ctx.persistent_ctx.region_id;
if region_wal_option.is_none() {
warn!(
"Region {} wal options not found, during upgrade candidate region",
region_id
);
}
let topics = self.get_kafka_topics(ctx).await?;
if self
.upgrade_region_with_retry(ctx, procedure_ctx, region_wal_option.as_ref())
.upgrade_region_with_retry(ctx, procedure_ctx, topics)
.await
{
ctx.update_upgrade_candidate_region_elapsed(now);
@@ -91,24 +87,32 @@ impl State for UpgradeCandidateRegion {
}
impl UpgradeCandidateRegion {
async fn get_region_wal_option(&self, ctx: &mut Context) -> Result<Option<WalOptions>> {
let region_id = ctx.persistent_ctx.region_id;
match ctx.get_from_peer_datanode_table_value().await {
Ok(datanode_table_value) => {
let region_wal_options =
parse_region_wal_options(&datanode_table_value.region_info.region_wal_options)
.context(error::ParseWalOptionsSnafu)?;
Ok(region_wal_options.get(&region_id.region_number()).cloned())
async fn get_kafka_topics(&self, ctx: &mut Context) -> Result<HashSet<String>> {
let table_regions = ctx.persistent_ctx.table_regions();
let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?;
let mut topics = HashSet::new();
for (table_id, regions) in table_regions {
let Some(datanode_table_value) = datanode_table_values.get(&table_id) else {
continue;
};
let region_wal_options =
parse_region_wal_options(&datanode_table_value.region_info.region_wal_options)
.context(error::ParseWalOptionsSnafu)?;
for region_id in regions {
let Some(WalOptions::Kafka(kafka_wal_options)) =
region_wal_options.get(&region_id.region_number())
else {
continue;
};
topics.insert(kafka_wal_options.topic.clone());
}
Err(error::Error::DatanodeTableNotFound { datanode_id, .. }) => {
warn!(
"Datanode table not found, during upgrade candidate region, the target region might already been migrated, region_id: {}, datanode_id: {}",
region_id, datanode_id
);
Ok(None)
}
Err(e) => Err(e),
}
Ok(topics)
}
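
get_kafka_topics above amounts to collecting the distinct Kafka topics referenced by the migrating regions' WAL options. A minimal sketch with a simplified WalOptions stand-in; HashSet::insert already ignores duplicates, so no explicit contains check is needed:

use std::collections::{HashMap, HashSet};

// Simplified stand-in for common_wal::options::WalOptions: only the Kafka variant
// carries a topic; other variants have none.
enum WalOpts {
    RaftEngine,
    Kafka { topic: String },
}

/// Collects the distinct Kafka topics used by the given region numbers.
fn kafka_topics(wal_options: &HashMap<u32, WalOpts>, regions: &[u32]) -> HashSet<String> {
    let mut topics = HashSet::new();
    for region_number in regions {
        if let Some(WalOpts::Kafka { topic }) = wal_options.get(region_number) {
            // Duplicate topics are silently ignored by the set.
            topics.insert(topic.clone());
        }
    }
    topics
}

fn main() {
    let mut opts = HashMap::new();
    opts.insert(1, WalOpts::Kafka { topic: "greptime_wal_0".to_string() });
    opts.insert(2, WalOpts::Kafka { topic: "greptime_wal_0".to_string() });
    opts.insert(3, WalOpts::RaftEngine);
    let topics = kafka_topics(&opts, &[1, 2, 3]);
    assert_eq!(topics.len(), 1);
    assert!(topics.contains("greptime_wal_0"));
}
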
/// Builds upgrade region instruction.
@@ -117,35 +121,105 @@ impl UpgradeCandidateRegion {
ctx: &mut Context,
replay_timeout: Duration,
) -> Result<Instruction> {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
let last_entry_id = ctx.volatile_ctx.leader_region_last_entry_id;
let metadata_last_entry_id = ctx.volatile_ctx.leader_region_metadata_last_entry_id;
// Try our best to retrieve replay checkpoint.
let datanode_table_value = ctx.get_from_peer_datanode_table_value().await.ok();
let checkpoint = if let Some(topic) = datanode_table_value.as_ref().and_then(|v| {
extract_topic_from_wal_options(region_id, &v.region_info.region_wal_options)
}) {
ctx.fetch_replay_checkpoint(&topic).await.ok().flatten()
} else {
None
};
let region_ids = ctx.persistent_ctx.region_ids.clone();
let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?;
let mut region_topic = Vec::with_capacity(region_ids.len());
for region_id in region_ids.iter() {
let table_id = region_id.table_id();
if let Some(datanode_table_value) = datanode_table_values.get(&table_id)
&& let Some(topic) = extract_topic_from_wal_options(
*region_id,
&datanode_table_value.region_info.region_wal_options,
)
{
region_topic.push((*region_id, topic));
}
}
let upgrade_instruction = Instruction::UpgradeRegion(
UpgradeRegion {
let replay_checkpoints = ctx
.get_replay_checkpoints(
region_topic
.iter()
.map(|(region_id, topic)| TopicRegionKey::new(*region_id, topic))
.collect(),
)
.await?;
// Build upgrade regions instruction.
let mut upgrade_regions = Vec::with_capacity(region_ids.len());
for region_id in region_ids {
let last_entry_id = ctx
.volatile_ctx
.leader_region_last_entry_ids
.get(&region_id)
.copied();
let metadata_last_entry_id = ctx
.volatile_ctx
.leader_region_metadata_last_entry_ids
.get(&region_id)
.copied();
let checkpoint = replay_checkpoints.get(&region_id).copied();
upgrade_regions.push(UpgradeRegion {
region_id,
last_entry_id,
metadata_last_entry_id,
replay_timeout: Some(replay_timeout),
replay_timeout,
location_id: Some(ctx.persistent_ctx.from_peer.id),
replay_entry_id: None,
metadata_replay_entry_id: None,
replay_entry_id: checkpoint.map(|c| c.entry_id),
metadata_replay_entry_id: checkpoint.and_then(|c| c.metadata_entry_id),
});
}
Ok(Instruction::UpgradeRegions(upgrade_regions))
}
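
The builder above assembles one UpgradeRegion entry per region and fills the optional replay fields from lookup maps when an entry exists. A stripped-down sketch of that assembly pattern, using plain u64 ids and hypothetical UpgradeRequest and build_requests names rather than the real types:

use std::collections::HashMap;

#[derive(Debug)]
struct UpgradeRequest {
    region_id: u64,
    last_entry_id: Option<u64>,
    replay_entry_id: Option<u64>,
}

fn build_requests(
    region_ids: &[u64],
    last_entry_ids: &HashMap<u64, u64>,
    replay_checkpoints: &HashMap<u64, u64>,
) -> Vec<UpgradeRequest> {
    region_ids
        .iter()
        .map(|region_id| UpgradeRequest {
            region_id: *region_id,
            // Missing entries simply leave the field unset rather than failing the build.
            last_entry_id: last_entry_ids.get(region_id).copied(),
            replay_entry_id: replay_checkpoints.get(region_id).copied(),
        })
        .collect()
}

fn main() {
    let last = HashMap::from([(1, 42)]);
    let ckpt = HashMap::from([(2, 7)]);
    let reqs = build_requests(&[1, 2], &last, &ckpt);
    assert_eq!(reqs[0].region_id, 1);
    assert_eq!(reqs[0].last_entry_id, Some(42));
    assert_eq!(reqs[1].replay_entry_id, Some(7));
}
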
fn handle_upgrade_region_reply(
&self,
ctx: &mut Context,
UpgradeRegionReply {
region_id,
ready,
exists,
error,
}: &UpgradeRegionReply,
now: &Instant,
) -> Result<()> {
let candidate = &ctx.persistent_ctx.to_peer;
if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
"Failed to upgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}",
region_id,
candidate,
error,
now.elapsed()
),
}
.fail();
}
ensure!(
exists,
error::UnexpectedSnafu {
violated: format!(
"Candidate region {} doesn't exist on datanode {:?}",
region_id, candidate
)
}
.with_replay_entry_id(checkpoint.map(|c| c.entry_id))
.with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)),
);
Ok(upgrade_instruction)
if self.require_ready && !ready {
return error::RetryLaterSnafu {
reason: format!(
"Candidate region {} still replaying the wal on datanode {:?}, elapsed: {:?}",
region_id,
candidate,
now.elapsed()
),
}
.fail();
}
Ok(())
}
/// Tries to upgrade a candidate region.
@@ -173,11 +247,11 @@ impl UpgradeCandidateRegion {
.await?;
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
let region_ids = &pc.region_ids;
let candidate = &pc.to_peer;
let msg = MailboxMessage::json_message(
&format!("Upgrade candidate region: {}", region_id),
&format!("Upgrade candidate regions: {:?}", region_ids),
&format!("Metasrv@{}", ctx.server_addr()),
&format!("Datanode-{}@{}", candidate.id, candidate.addr),
common_time::util::current_time_millis(),
@@ -190,14 +264,17 @@ impl UpgradeCandidateRegion {
let ch = Channel::Datanode(candidate.id);
let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?;
let now = Instant::now();
match receiver.await {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
let InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready,
exists,
error,
}) = reply
info!(
"Received upgrade region reply: {:?}, regions: {:?}, elapsed: {:?}",
reply,
region_ids,
now.elapsed()
);
let InstructionReply::UpgradeRegions(UpgradeRegionsReply { replies }) = reply
else {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),
@@ -205,44 +282,16 @@ impl UpgradeCandidateRegion {
}
.fail();
};
// Notes: The order of handling is important.
if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
"Failed to upgrade the region {} on datanode {:?}, error: {:?}",
region_id, candidate, error
),
}
.fail();
for reply in replies {
self.handle_upgrade_region_reply(ctx, &reply, &now)?;
}
ensure!(
exists,
error::UnexpectedSnafu {
violated: format!(
"Candidate region {} doesn't exist on datanode {:?}",
region_id, candidate
)
}
);
if self.require_ready && !ready {
return error::RetryLaterSnafu {
reason: format!(
"Candidate region {} still replaying the wal on datanode {:?}",
region_id, candidate
),
}
.fail();
}
Ok(())
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
"Mailbox received timeout for upgrade candidate region {region_id} on datanode {:?}",
"Mailbox received timeout for upgrade candidate regions {region_ids:?} on datanode {:?}, elapsed: {:?}",
candidate,
now.elapsed()
);
error::RetryLaterSnafu { reason }.fail()
}
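
The reply handling above decodes the mailbox JSON, insists on the UpgradeRegions variant with let ... else, and then validates each per-region reply in order (error, existence, readiness). A self-contained sketch of that decode-and-validate shape using serde_json, with Reply and RegionReply as simplified stand-ins for the real InstructionReply types:

use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
enum Reply {
    UpgradeRegions { replies: Vec<RegionReply> },
    Other {},
}

#[derive(Debug, Deserialize)]
struct RegionReply {
    region_id: u64,
    ready: bool,
    exists: bool,
    error: Option<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 4398046511105 is RegionId(table 1024, region 1) packed into a u64.
    let payload = r#"{"type":"upgrade_regions","replies":[
        {"region_id":4398046511105,"ready":true,"exists":true,"error":null}]}"#;
    let reply: Reply = serde_json::from_str(payload)?;
    // Reject anything other than the reply kind this state expects.
    let Reply::UpgradeRegions { replies } = reply else {
        return Err("unexpected reply kind".into());
    };
    for r in &replies {
        assert!(r.error.is_none() && r.exists && r.ready, "region {} not ready", r.region_id);
    }
    Ok(())
}
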
@@ -257,26 +306,24 @@ impl UpgradeCandidateRegion {
&self,
ctx: &mut Context,
procedure_ctx: &ProcedureContext,
wal_options: Option<&WalOptions>,
topics: HashSet<String>,
) -> bool {
let mut retry = 0;
let mut upgraded = false;
let mut guards = Vec::with_capacity(topics.len());
loop {
let timer = Instant::now();
// If using Kafka WAL, acquire a read lock on the topic to prevent WAL pruning during the upgrade.
let _guard = if let Some(WalOptions::Kafka(kafka_wal_options)) = wal_options {
Some(
for topic in &topics {
guards.push(
procedure_ctx
.provider
.acquire_lock(
&(RemoteWalLock::Read(kafka_wal_options.topic.clone()).into()),
)
.acquire_lock(&(RemoteWalLock::Read(topic.clone()).into()))
.await,
)
} else {
None
};
);
}
if let Err(err) = self.upgrade_region(ctx).await {
retry += 1;
ctx.update_operations_elapsed(timer);
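
The retry loop above takes a read lock per Kafka topic and pushes every guard into a Vec so all of the locks stay held while the upgrade instruction is outstanding. A minimal sketch of that keep-guards-alive pattern, substituting tokio::sync::RwLock for Metasrv's lock provider (illustration only; requires the tokio crate):

use std::sync::Arc;
use tokio::sync::RwLock;

#[tokio::main]
async fn main() {
    // One lock per topic; in Metasrv this would be the procedure lock provider instead.
    let topic_locks = vec![Arc::new(RwLock::new(())), Arc::new(RwLock::new(()))];

    // Collect the guards so every read lock stays held for the whole critical section.
    let mut guards = Vec::with_capacity(topic_locks.len());
    for lock in &topic_locks {
        guards.push(lock.read().await);
    }

    // ... perform the guarded work here (e.g. send the upgrade instruction) ...
    println!("holding {} topic read locks", guards.len());

    // Dropping the Vec releases all read locks at once.
    drop(guards);
}
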
@@ -327,17 +374,17 @@ mod tests {
schema: "public".into(),
from_peer: Peer::empty(1),
to_peer: Peer::empty(2),
region_id: RegionId::new(1024, 1),
region_ids: vec![RegionId::new(1024, 1)],
timeout: Duration::from_millis(1000),
trigger_reason: RegionMigrationTriggerReason::Manual,
}
}
async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap<u32, String>) {
let table_info =
new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into();
let region_id = ctx.persistent_ctx.region_ids[0];
let table_info = new_test_table_info(region_id.table_id(), vec![1]).into();
let region_routes = vec![RegionRoute {
region: Region::new_test(ctx.persistent_ctx.region_id),
region: Region::new_test(region_id),
leader_peer: Some(ctx.persistent_ctx.from_peer.clone()),
follower_peers: vec![ctx.persistent_ctx.to_peer.clone()],
..Default::default()

View File

@@ -18,7 +18,7 @@ use api::v1::meta::mailbox_message::Payload;
use api::v1::meta::{HeartbeatResponse, MailboxMessage};
use common_meta::instruction::{
DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply,
UpgradeRegionReply,
UpgradeRegionReply, UpgradeRegionsReply,
};
use common_meta::key::TableMetadataManagerRef;
use common_meta::key::table_route::TableRouteValue;
@@ -212,11 +212,14 @@ pub fn new_upgrade_region_reply(
to: "meta".to_string(),
timestamp_millis: current_time_millis(),
payload: Some(Payload::Json(
serde_json::to_string(&InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready,
exists,
error,
}))
serde_json::to_string(&InstructionReply::UpgradeRegions(
UpgradeRegionsReply::single(UpgradeRegionReply {
region_id: RegionId::new(0, 0),
ready,
exists,
error,
}),
))
.unwrap(),
)),
}

View File

@@ -52,6 +52,7 @@ use crate::procedure::region_migration::{
};
use crate::region::failure_detector::RegionFailureDetector;
use crate::selector::SelectorOptions;
use crate::state::StateRef;
/// `DatanodeHeartbeat` represents the heartbeat signal sent from a datanode.
/// It includes identifiers for the cluster and datanode, a list of regions being monitored,
@@ -100,16 +101,6 @@ pub(crate) enum Event {
Dump(tokio::sync::oneshot::Sender<RegionFailureDetector>),
}
#[cfg(test)]
impl Event {
pub(crate) fn into_region_failure_detectors(self) -> Vec<DetectingRegion> {
match self {
Self::RegisterFailureDetectors(detecting_regions) => detecting_regions,
_ => unreachable!(),
}
}
}
impl Debug for Event {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
@@ -139,6 +130,9 @@ pub struct RegionSupervisorTicker {
/// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`].
tick_handle: Mutex<Option<JoinHandle<()>>>,
/// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`].
initialization_handler: Mutex<Option<JoinHandle<()>>>,
/// The interval of tick.
tick_interval: Duration,
@@ -182,6 +176,7 @@ impl RegionSupervisorTicker {
);
Self {
tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval,
initialization_delay,
initialization_retry_period,
@@ -202,7 +197,7 @@ impl RegionSupervisorTicker {
self.initialization_retry_period,
);
initialization_interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
common_runtime::spawn_global(async move {
let initialization_handler = common_runtime::spawn_global(async move {
loop {
initialization_interval.tick().await;
let (tx, rx) = oneshot::channel();
@@ -218,6 +213,7 @@ impl RegionSupervisorTicker {
}
}
});
*self.initialization_handler.lock().unwrap() = Some(initialization_handler);
let sender = self.sender.clone();
let ticker_loop = tokio::spawn(async move {
@@ -247,6 +243,11 @@ impl RegionSupervisorTicker {
handle.abort();
info!("The tick loop is stopped.");
}
let initialization_handler = self.initialization_handler.lock().unwrap().take();
if let Some(initialization_handler) = initialization_handler {
initialization_handler.abort();
info!("The initialization loop is stopped.");
}
}
}
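
The ticker changes above follow a common tokio pattern: stash the spawned interval task's JoinHandle in a Mutex<Option<..>> and take-and-abort it on stop so the background loop cannot outlive the supervisor. A minimal sketch of that pattern (illustration only; requires the tokio crate):

use std::sync::Mutex;
use std::time::Duration;
use tokio::task::JoinHandle;

struct Ticker {
    handle: Mutex<Option<JoinHandle<()>>>,
}

impl Ticker {
    fn start(&self) {
        let handle = tokio::spawn(async {
            let mut interval = tokio::time::interval(Duration::from_millis(10));
            loop {
                interval.tick().await;
                // ... send a tick event here ...
            }
        });
        // Keep the handle so the loop can be aborted later.
        *self.handle.lock().unwrap() = Some(handle);
    }

    fn stop(&self) {
        if let Some(handle) = self.handle.lock().unwrap().take() {
            handle.abort();
        }
    }
}

#[tokio::main]
async fn main() {
    let ticker = Ticker { handle: Mutex::new(None) };
    ticker.start();
    tokio::time::sleep(Duration::from_millis(30)).await;
    ticker.stop();
}
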
@@ -290,6 +291,8 @@ pub struct RegionSupervisor {
peer_resolver: PeerResolverRef,
/// The kv backend.
kv_backend: KvBackendRef,
/// The meta state, used to check if the current metasrv is the leader.
state: Option<StateRef>,
}
/// Controller for managing failure detectors for regions.
@@ -373,12 +376,29 @@ impl RegionSupervisor {
runtime_switch_manager,
peer_resolver,
kv_backend,
state: None,
}
}
/// Sets the meta state.
pub(crate) fn with_state(mut self, state: StateRef) -> Self {
self.state = Some(state);
self
}
/// Runs the main loop.
pub(crate) async fn run(&mut self) {
while let Some(event) = self.receiver.recv().await {
if let Some(state) = self.state.as_ref()
&& !state.read().unwrap().is_leader()
{
warn!(
"The current metasrv is not the leader, ignore {:?} event",
event
);
continue;
}
match event {
Event::InitializeAllRegions(sender) => {
match self.is_maintenance_mode_enabled().await {
@@ -413,7 +433,10 @@ impl RegionSupervisor {
self.deregister_failure_detectors(detecting_regions).await
}
Event::HeartbeatArrived(heartbeat) => self.on_heartbeat_arrived(heartbeat),
Event::Clear => self.clear(),
Event::Clear => {
self.clear();
info!("Region supervisor is initialized.");
}
#[cfg(test)]
Event::Dump(sender) => {
let _ = sender.send(self.failure_detector.dump());
@@ -906,6 +929,7 @@ pub(crate) mod tests {
let (tx, mut rx) = tokio::sync::mpsc::channel(128);
let ticker = RegionSupervisorTicker {
tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval: Duration::from_millis(10),
initialization_delay: Duration::from_millis(100),
initialization_retry_period: Duration::from_millis(100),
@@ -932,6 +956,7 @@ pub(crate) mod tests {
let (tx, mut rx) = tokio::sync::mpsc::channel(128);
let ticker = RegionSupervisorTicker {
tick_handle: Mutex::new(None),
initialization_handler: Mutex::new(None),
tick_interval: Duration::from_millis(1000),
initialization_delay: Duration::from_millis(50),
initialization_retry_period: Duration::from_millis(50),

Some files were not shown because too many files have changed in this diff.