Mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2026-01-06 13:22:57 +00:00)

Compare commits: 42 commits, v0.12.0-ni ... v0.12.0-ni

| SHA1 |
|---|
| 87c21e2baa |
| d072801ad6 |
| 0607b38a20 |
| e0384a7d46 |
| d73815ba84 |
| c78a492863 |
| 859717c309 |
| 52697a9e66 |
| f8d26b433e |
| 1acfb6ed1c |
| 7eaabb3ca2 |
| 3a55f5d17c |
| 8d5d4000e6 |
| a598008ec3 |
| 86bd54194a |
| ccd2b06b7a |
| 0db10a33d0 |
| 317fe9eaa5 |
| a4761d6245 |
| 758aef39d8 |
| 4e3dd04f42 |
| 18b77408ae |
| 725d5a9e68 |
| 4f29e50ef3 |
| 121ec7936f |
| 0185a65905 |
| f0d30a0f26 |
| 7e61d1ae27 |
| e56dd20426 |
| b64c075cdb |
| 57f8afcb70 |
| bd37e086c2 |
| 66f63ae981 |
| 95b20592ac |
| 1855dccdf1 |
| 5efcb41310 |
| f5829364a2 |
| 87bd12d6df |
| c370b4b40d |
| 3f01f67f94 |
| 6eb746d994 |
| 03a144fa56 |
@@ -48,8 +48,7 @@ runs:
|
||||
path: /tmp/greptime-*.log
|
||||
retention-days: 3
|
||||
|
||||
- name: Build greptime
|
||||
if: ${{ inputs.dev-mode == 'false' }}
|
||||
- name: Build greptime # Builds standard greptime binary
|
||||
uses: ./.github/actions/build-greptime-binary
|
||||
with:
|
||||
base-image: ubuntu
|
||||
|
||||
.github/workflows/dependency-check.yml (vendored, 3 changed lines)
@@ -1,9 +1,6 @@
|
||||
name: Check Dependencies
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
|
||||
.github/workflows/develop.yml (vendored, 110 changed lines)
@@ -1,4 +1,6 @@
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 15 * * 1-5"
|
||||
merge_group:
|
||||
pull_request:
|
||||
types: [ opened, synchronize, reopened, ready_for_review ]
|
||||
@@ -43,7 +45,7 @@ jobs:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ windows-2022, ubuntu-20.04 ]
|
||||
os: [ ubuntu-20.04 ]
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -51,12 +53,14 @@ jobs:
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# # Shares with `Clippy` job
|
||||
# shared-key: "check-lint"
|
||||
- name: Rust Cache
|
||||
uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
# Shares across multiple jobs
|
||||
# Shares with `Clippy` job
|
||||
shared-key: "check-lint"
|
||||
cache-all-crates: "true"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: Run cargo check
|
||||
run: cargo check --locked --workspace --all-targets
|
||||
|
||||
@@ -67,11 +71,6 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# shared-key: "check-toml"
|
||||
- name: Install taplo
|
||||
run: cargo +stable install taplo-cli --version ^0.9 --locked --force
|
||||
- name: Run taplo
|
||||
@@ -94,6 +93,8 @@ jobs:
|
||||
with:
|
||||
# Shares across multiple jobs
|
||||
shared-key: "build-binaries"
|
||||
cache-all-crates: "true"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: Install cargo-gc-bin
|
||||
shell: bash
|
||||
run: cargo install cargo-gc-bin --force
|
||||
@@ -142,11 +143,6 @@ jobs:
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# shared-key: "fuzz-test-targets"
|
||||
- name: Set Rust Fuzz
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -200,11 +196,6 @@ jobs:
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# shared-key: "fuzz-test-targets"
|
||||
- name: Set Rust Fuzz
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -255,6 +246,8 @@ jobs:
|
||||
with:
|
||||
# Shares across multiple jobs
|
||||
shared-key: "build-greptime-ci"
|
||||
cache-all-crates: "true"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: Install cargo-gc-bin
|
||||
shell: bash
|
||||
run: cargo install cargo-gc-bin --force
|
||||
@@ -317,11 +310,6 @@ jobs:
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# shared-key: "fuzz-test-targets"
|
||||
- name: Set Rust Fuzz
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -466,11 +454,6 @@ jobs:
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# shared-key: "fuzz-test-targets"
|
||||
- name: Set Rust Fuzz
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -612,11 +595,6 @@ jobs:
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
components: rustfmt
|
||||
# - name: Rust Cache
|
||||
# uses: Swatinem/rust-cache@v2
|
||||
# with:
|
||||
# # Shares across multiple jobs
|
||||
# shared-key: "check-rust-fmt"
|
||||
- name: Check format
|
||||
run: make fmt-check
|
||||
|
||||
@@ -638,6 +616,8 @@ jobs:
|
||||
# Shares across multiple jobs
|
||||
# Shares with `Check` job
|
||||
shared-key: "check-lint"
|
||||
cache-all-crates: "true"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: Run cargo clippy
|
||||
run: make clippy
|
||||
|
||||
@@ -649,11 +629,58 @@ jobs:
|
||||
- name: Merge Conflict Finder
|
||||
uses: olivernybroe/action-conflict-finder@v4.0
|
||||
|
||||
coverage:
|
||||
if: github.event.pull_request.draft == false
|
||||
test:
|
||||
if: github.event_name != 'merge_group'
|
||||
runs-on: ubuntu-20.04-8-cores
|
||||
timeout-minutes: 60
|
||||
needs: [conflict-check, clippy, fmt]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: arduino/setup-protoc@v3
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: rui314/setup-mold@v1
|
||||
- name: Install toolchain
|
||||
uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
with:
|
||||
cache: false
|
||||
- name: Rust Cache
|
||||
uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
# Shares cross multiple jobs
|
||||
shared-key: "coverage-test"
|
||||
cache-all-crates: "true"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: Install latest nextest release
|
||||
uses: taiki-e/install-action@nextest
|
||||
- name: Setup external services
|
||||
working-directory: tests-integration/fixtures
|
||||
run: docker compose up -d --wait
|
||||
- name: Run nextest cases
|
||||
run: cargo nextest run --workspace -F dashboard -F pg_kvbackend
|
||||
env:
|
||||
CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
|
||||
RUST_BACKTRACE: 1
|
||||
CARGO_INCREMENTAL: 0
|
||||
GT_S3_BUCKET: ${{ vars.AWS_CI_TEST_BUCKET }}
|
||||
GT_S3_ACCESS_KEY_ID: ${{ secrets.AWS_CI_TEST_ACCESS_KEY_ID }}
|
||||
GT_S3_ACCESS_KEY: ${{ secrets.AWS_CI_TEST_SECRET_ACCESS_KEY }}
|
||||
GT_S3_REGION: ${{ vars.AWS_CI_TEST_BUCKET_REGION }}
|
||||
GT_MINIO_BUCKET: greptime
|
||||
GT_MINIO_ACCESS_KEY_ID: superpower_ci_user
|
||||
GT_MINIO_ACCESS_KEY: superpower_password
|
||||
GT_MINIO_REGION: us-west-2
|
||||
GT_MINIO_ENDPOINT_URL: http://127.0.0.1:9000
|
||||
GT_ETCD_ENDPOINTS: http://127.0.0.1:2379
|
||||
GT_POSTGRES_ENDPOINTS: postgres://greptimedb:admin@127.0.0.1:5432/postgres
|
||||
GT_KAFKA_ENDPOINTS: 127.0.0.1:9092
|
||||
GT_KAFKA_SASL_ENDPOINTS: 127.0.0.1:9093
|
||||
UNITTEST_LOG_DIR: "__unittest_logs"
|
||||
|
||||
coverage:
|
||||
if: github.event_name == 'merge_group'
|
||||
runs-on: ubuntu-20.04-8-cores
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: arduino/setup-protoc@v3
|
||||
@@ -671,11 +698,6 @@ jobs:
|
||||
# Shares cross multiple jobs
|
||||
shared-key: "coverage-test"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
# Disabled temporarily to see performance
|
||||
# - name: Docker Cache
|
||||
# uses: ScribeMD/docker-cache@0.5.0
|
||||
# with:
|
||||
# key: docker-${{ runner.os }}-coverage
|
||||
- name: Install latest nextest release
|
||||
uses: taiki-e/install-action@nextest
|
||||
- name: Install cargo-llvm-cov
|
||||
|
||||
.github/workflows/nightly-ci.yml (vendored, 45 changed lines)
@@ -108,51 +108,6 @@ jobs:
|
||||
GT_S3_REGION: ${{ vars.AWS_CI_TEST_BUCKET_REGION }}
|
||||
UNITTEST_LOG_DIR: "__unittest_logs"
|
||||
|
||||
## this is designed for generating cache that usable for pull requests
|
||||
test-on-linux:
|
||||
name: Run tests on Linux
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
|
||||
runs-on: ubuntu-20.04-8-cores
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: arduino/setup-protoc@v3
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- uses: rui314/setup-mold@v1
|
||||
- name: Install Rust toolchain
|
||||
uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
- name: Rust Cache
|
||||
uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
# Shares cross multiple jobs
|
||||
shared-key: "coverage-test"
|
||||
- name: Install Cargo Nextest
|
||||
uses: taiki-e/install-action@nextest
|
||||
- name: Setup external services
|
||||
working-directory: tests-integration/fixtures
|
||||
run: docker compose up -d --wait
|
||||
- name: Running tests
|
||||
run: cargo nextest run -F dashboard -F pg_kvbackend
|
||||
env:
|
||||
CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
|
||||
RUST_BACKTRACE: 1
|
||||
CARGO_INCREMENTAL: 0
|
||||
GT_S3_BUCKET: ${{ vars.AWS_CI_TEST_BUCKET }}
|
||||
GT_S3_ACCESS_KEY_ID: ${{ secrets.AWS_CI_TEST_ACCESS_KEY_ID }}
|
||||
GT_S3_ACCESS_KEY: ${{ secrets.AWS_CI_TEST_SECRET_ACCESS_KEY }}
|
||||
GT_S3_REGION: ${{ vars.AWS_CI_TEST_BUCKET_REGION }}
|
||||
GT_MINIO_BUCKET: greptime
|
||||
GT_MINIO_ACCESS_KEY_ID: superpower_ci_user
|
||||
GT_MINIO_ACCESS_KEY: superpower_password
|
||||
GT_MINIO_REGION: us-west-2
|
||||
GT_MINIO_ENDPOINT_URL: http://127.0.0.1:9000
|
||||
GT_ETCD_ENDPOINTS: http://127.0.0.1:2379
|
||||
GT_POSTGRES_ENDPOINTS: postgres://greptimedb:admin@127.0.0.1:5432/postgres
|
||||
GT_KAFKA_ENDPOINTS: 127.0.0.1:9092
|
||||
GT_KAFKA_SASL_ENDPOINTS: 127.0.0.1:9093
|
||||
UNITTEST_LOG_DIR: "__unittest_logs"
|
||||
|
||||
cleanbuild-linux-nix:
|
||||
name: Run clean build on Linux
|
||||
runs-on: ubuntu-latest-8-cores
|
||||
|
||||
.github/workflows/release.yml (vendored, 16 changed lines)
@@ -436,6 +436,22 @@ jobs:
|
||||
aws-region: ${{ vars.EC2_RUNNER_REGION }}
|
||||
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
||||
|
||||
bump-doc-version:
|
||||
name: Bump doc version
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }}
|
||||
needs: [allocate-runners]
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/setup-cyborg
|
||||
- name: Bump doc version
|
||||
working-directory: cyborg
|
||||
run: pnpm tsx bin/bump-doc-version.ts
|
||||
env:
|
||||
VERSION: ${{ needs.allocate-runners.outputs.version }}
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
DOCS_REPO_TOKEN: ${{ secrets.DOCS_REPO_TOKEN }}
|
||||
|
||||
notification:
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'push' || github.event_name == 'schedule') && always() }}
|
||||
name: Send notification to Greptime team
|
||||
|
||||
Cargo.lock (generated, 852 changed lines): diff suppressed because it is too large.
@@ -124,7 +124,7 @@ etcd-client = "0.13"
|
||||
fst = "0.4.7"
|
||||
futures = "0.3"
|
||||
futures-util = "0.3"
|
||||
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "43ddd8dea69f4df0fe2e8b5cdc0044d2cfa35908" }
|
||||
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "ec801a91aa22f9666063d02805f1f60f7c93458a" }
|
||||
hex = "0.4"
|
||||
http = "0.2"
|
||||
humantime = "2.1"
|
||||
|
||||
Makefile (5 changed lines)
@@ -165,15 +165,14 @@ nextest: ## Install nextest tools.
|
||||
sqlness-test: ## Run sqlness test.
|
||||
cargo sqlness ${SQLNESS_OPTS}
|
||||
|
||||
# Run fuzz test ${FUZZ_TARGET}.
|
||||
RUNS ?= 1
|
||||
FUZZ_TARGET ?= fuzz_alter_table
|
||||
.PHONY: fuzz
|
||||
fuzz:
|
||||
fuzz: ## Run fuzz test ${FUZZ_TARGET}.
|
||||
cargo fuzz run ${FUZZ_TARGET} --fuzz-dir tests-fuzz -D -s none -- -runs=${RUNS}
|
||||
|
||||
.PHONY: fuzz-ls
|
||||
fuzz-ls:
|
||||
fuzz-ls: ## List all fuzz targets.
|
||||
cargo fuzz list --fuzz-dir tests-fuzz
|
||||
|
||||
.PHONY: check
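Given the variables defined above (`RUNS ?= 1`, `FUZZ_TARGET ?= fuzz_alter_table`), a typical invocation is presumably `make fuzz FUZZ_TARGET=fuzz_alter_table RUNS=10` to run a single target for ten iterations, while `make fuzz-ls` lists the available fuzz targets.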
|
||||
|
||||
@@ -91,6 +91,8 @@
|
||||
| `procedure` | -- | -- | Procedure storage options. |
|
||||
| `procedure.max_retry_times` | Integer | `3` | Procedure max retry time. |
|
||||
| `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially |
|
||||
| `flow` | -- | -- | flow engine options. |
|
||||
| `flow.num_workers` | Integer | `0` | The number of flow worker in flownode.<br/>Not setting(or set to 0) this value will use the number of CPU cores divided by 2. |
|
||||
| `storage` | -- | -- | The data storage options. |
|
||||
| `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. |
|
||||
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
|
||||
@@ -143,15 +145,15 @@
|
||||
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
|
||||
| `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
|
||||
| `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
|
||||
| `region_engine.mito.index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
|
||||
| `region_engine.mito.index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
|
||||
| `region_engine.mito.index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
|
||||
| `region_engine.mito.inverted_index` | -- | -- | The options for inverted index in Mito engine. |
|
||||
| `region_engine.mito.inverted_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.inverted_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.inverted_index.apply_on_query` | String | `auto` | Whether to apply the index on query<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.inverted_index.mem_threshold_on_create` | String | `auto` | Memory threshold for performing an external sort during index creation.<br/>- `auto`: automatically determine the threshold based on the system memory size (default)<br/>- `unlimited`: no memory limit<br/>- `[size]` e.g. `64MB`: fixed memory threshold |
|
||||
| `region_engine.mito.inverted_index.intermediate_path` | String | `""` | Deprecated, use `region_engine.mito.index.aux_path` instead. |
|
||||
| `region_engine.mito.inverted_index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
|
||||
| `region_engine.mito.inverted_index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
|
||||
| `region_engine.mito.inverted_index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
|
||||
| `region_engine.mito.fulltext_index` | -- | -- | The options for full-text index in Mito engine. |
|
||||
| `region_engine.mito.fulltext_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.fulltext_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
@@ -479,15 +481,15 @@
|
||||
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
|
||||
| `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
|
||||
| `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
|
||||
| `region_engine.mito.index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
|
||||
| `region_engine.mito.index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
|
||||
| `region_engine.mito.index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
|
||||
| `region_engine.mito.inverted_index` | -- | -- | The options for inverted index in Mito engine. |
|
||||
| `region_engine.mito.inverted_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.inverted_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.inverted_index.apply_on_query` | String | `auto` | Whether to apply the index on query<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.inverted_index.mem_threshold_on_create` | String | `auto` | Memory threshold for performing an external sort during index creation.<br/>- `auto`: automatically determine the threshold based on the system memory size (default)<br/>- `unlimited`: no memory limit<br/>- `[size]` e.g. `64MB`: fixed memory threshold |
|
||||
| `region_engine.mito.inverted_index.intermediate_path` | String | `""` | Deprecated, use `region_engine.mito.index.aux_path` instead. |
|
||||
| `region_engine.mito.inverted_index.metadata_cache_size` | String | `64MiB` | Cache size for inverted index metadata. |
|
||||
| `region_engine.mito.inverted_index.content_cache_size` | String | `128MiB` | Cache size for inverted index content. |
|
||||
| `region_engine.mito.inverted_index.content_cache_page_size` | String | `64KiB` | Page size for inverted index content cache. |
|
||||
| `region_engine.mito.fulltext_index` | -- | -- | The options for full-text index in Mito engine. |
|
||||
| `region_engine.mito.fulltext_index.create_on_flush` | String | `auto` | Whether to create the index on flush.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
| `region_engine.mito.fulltext_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.<br/>- `auto`: automatically (default)<br/>- `disable`: never |
|
||||
@@ -536,12 +538,18 @@
|
||||
| --- | -----| ------- | ----------- |
|
||||
| `mode` | String | `distributed` | The running mode of the flownode. It can be `standalone` or `distributed`. |
|
||||
| `node_id` | Integer | Unset | The flownode identifier and should be unique in the cluster. |
|
||||
| `flow` | -- | -- | flow engine options. |
|
||||
| `flow.num_workers` | Integer | `0` | The number of flow worker in flownode.<br/>Not setting(or set to 0) this value will use the number of CPU cores divided by 2. |
|
||||
| `grpc` | -- | -- | The gRPC server options. |
|
||||
| `grpc.addr` | String | `127.0.0.1:6800` | The address to bind the gRPC server. |
|
||||
| `grpc.hostname` | String | `127.0.0.1` | The hostname advertised to the metasrv,<br/>and used for connections from outside the host |
|
||||
| `grpc.runtime_size` | Integer | `2` | The number of server worker threads. |
|
||||
| `grpc.max_recv_message_size` | String | `512MB` | The maximum receive message size for gRPC server. |
|
||||
| `grpc.max_send_message_size` | String | `512MB` | The maximum send message size for gRPC server. |
|
||||
| `http` | -- | -- | The HTTP server options. |
|
||||
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
|
||||
| `http.timeout` | String | `30s` | HTTP request timeout. Set to 0 to disable timeout. |
|
||||
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
|
||||
| `meta_client` | -- | -- | The metasrv client options. |
|
||||
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
|
||||
| `meta_client.timeout` | String | `3s` | Operation timeout. |
|
||||
|
||||
@@ -516,6 +516,15 @@ aux_path = ""
|
||||
## The max capacity of the staging directory.
|
||||
staging_size = "2GB"
|
||||
|
||||
## Cache size for inverted index metadata.
|
||||
metadata_cache_size = "64MiB"
|
||||
|
||||
## Cache size for inverted index content.
|
||||
content_cache_size = "128MiB"
|
||||
|
||||
## Page size for inverted index content cache.
|
||||
content_cache_page_size = "64KiB"
|
||||
|
||||
## The options for inverted index in Mito engine.
|
||||
[region_engine.mito.inverted_index]
|
||||
|
||||
@@ -543,15 +552,6 @@ mem_threshold_on_create = "auto"
|
||||
## Deprecated, use `region_engine.mito.index.aux_path` instead.
|
||||
intermediate_path = ""
|
||||
|
||||
## Cache size for inverted index metadata.
|
||||
metadata_cache_size = "64MiB"
|
||||
|
||||
## Cache size for inverted index content.
|
||||
content_cache_size = "128MiB"
|
||||
|
||||
## Page size for inverted index content cache.
|
||||
content_cache_page_size = "64KiB"
|
||||
|
||||
## The options for full-text index in Mito engine.
|
||||
[region_engine.mito.fulltext_index]
|
||||
|
||||
|
||||
@@ -5,6 +5,12 @@ mode = "distributed"
|
||||
## @toml2docs:none-default
|
||||
node_id = 14
|
||||
|
||||
## flow engine options.
|
||||
[flow]
|
||||
## The number of flow worker in flownode.
|
||||
## Not setting(or set to 0) this value will use the number of CPU cores divided by 2.
|
||||
#+num_workers=0
|
||||
|
||||
## The gRPC server options.
|
||||
[grpc]
|
||||
## The address to bind the gRPC server.
|
||||
@@ -19,6 +25,16 @@ max_recv_message_size = "512MB"
|
||||
## The maximum send message size for gRPC server.
|
||||
max_send_message_size = "512MB"
|
||||
|
||||
## The HTTP server options.
|
||||
[http]
|
||||
## The address to bind the HTTP server.
|
||||
addr = "127.0.0.1:4000"
|
||||
## HTTP request timeout. Set to 0 to disable timeout.
|
||||
timeout = "30s"
|
||||
## HTTP request body limit.
|
||||
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
|
||||
## Set to 0 to disable limit.
|
||||
body_limit = "64MB"
|
||||
|
||||
## The metasrv client options.
|
||||
[meta_client]
|
||||
|
||||
@@ -284,6 +284,12 @@ max_retry_times = 3
|
||||
## Initial retry delay of procedures, increases exponentially
|
||||
retry_delay = "500ms"
|
||||
|
||||
## flow engine options.
|
||||
[flow]
|
||||
## The number of flow worker in flownode.
|
||||
## Not setting(or set to 0) this value will use the number of CPU cores divided by 2.
|
||||
#+num_workers=0
|
||||
|
||||
# Example of using S3 as the storage.
|
||||
# [storage]
|
||||
# type = "S3"
|
||||
@@ -559,6 +565,15 @@ aux_path = ""
|
||||
## The max capacity of the staging directory.
|
||||
staging_size = "2GB"
|
||||
|
||||
## Cache size for inverted index metadata.
|
||||
metadata_cache_size = "64MiB"
|
||||
|
||||
## Cache size for inverted index content.
|
||||
content_cache_size = "128MiB"
|
||||
|
||||
## Page size for inverted index content cache.
|
||||
content_cache_page_size = "64KiB"
|
||||
|
||||
## The options for inverted index in Mito engine.
|
||||
[region_engine.mito.inverted_index]
|
||||
|
||||
@@ -586,15 +601,6 @@ mem_threshold_on_create = "auto"
|
||||
## Deprecated, use `region_engine.mito.index.aux_path` instead.
|
||||
intermediate_path = ""
|
||||
|
||||
## Cache size for inverted index metadata.
|
||||
metadata_cache_size = "64MiB"
|
||||
|
||||
## Cache size for inverted index content.
|
||||
content_cache_size = "128MiB"
|
||||
|
||||
## Page size for inverted index content cache.
|
||||
content_cache_page_size = "64KiB"
|
||||
|
||||
## The options for full-text index in Mito engine.
|
||||
[region_engine.mito.fulltext_index]
|
||||
|
||||
|
||||
cyborg/bin/bump-doc-version.ts (new file, 75 lines)
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Copyright 2023 Greptime Team
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import * as core from "@actions/core";
|
||||
import {obtainClient} from "@/common";
|
||||
|
||||
async function triggerWorkflow(workflowId: string, version: string) {
|
||||
const docsClient = obtainClient("DOCS_REPO_TOKEN")
|
||||
try {
|
||||
await docsClient.rest.actions.createWorkflowDispatch({
|
||||
owner: "GreptimeTeam",
|
||||
repo: "docs",
|
||||
workflow_id: workflowId,
|
||||
ref: "main",
|
||||
inputs: {
|
||||
version,
|
||||
},
|
||||
});
|
||||
console.log(`Successfully triggered ${workflowId} workflow with version ${version}`);
|
||||
} catch (error) {
|
||||
core.setFailed(`Failed to trigger workflow: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
function determineWorkflow(version: string): [string, string] {
|
||||
// Check if it's a nightly version
|
||||
if (version.includes('nightly')) {
|
||||
return ['bump-nightly-version.yml', version];
|
||||
}
|
||||
|
||||
const parts = version.split('.');
|
||||
|
||||
if (parts.length !== 3) {
|
||||
throw new Error('Invalid version format');
|
||||
}
|
||||
|
||||
// If patch version (last number) is 0, it's a major version
|
||||
// Return only major.minor version
|
||||
if (parts[2] === '0') {
|
||||
return ['bump-version.yml', `${parts[0]}.${parts[1]}`];
|
||||
}
|
||||
|
||||
// Otherwise it's a patch version, use full version
|
||||
return ['bump-patch-version.yml', version];
|
||||
}
|
||||
|
||||
const version = process.env.VERSION;
|
||||
if (!version) {
|
||||
core.setFailed("VERSION environment variable is required");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Remove 'v' prefix if exists
|
||||
const cleanVersion = version.startsWith('v') ? version.slice(1) : version;
|
||||
|
||||
try {
|
||||
const [workflowId, apiVersion] = determineWorkflow(cleanVersion);
|
||||
triggerWorkflow(workflowId, apiVersion);
|
||||
} catch (error) {
|
||||
core.setFailed(`Error processing version: ${error.message}`);
|
||||
process.exit(1);
|
||||
}
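To make the mapping concrete: under this logic a release tag such as `v0.12.0` dispatches `bump-version.yml` with `0.12`, `v0.12.1` dispatches `bump-patch-version.yml` with the full `0.12.1`, and any version containing `nightly` dispatches `bump-nightly-version.yml` with the version unchanged.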
|
||||
@@ -1,3 +1,3 @@
|
||||
[toolchain]
|
||||
channel = "nightly-2024-10-19"
|
||||
components = ["rust-analyzer"]
|
||||
components = ["rust-analyzer", "llvm-tools"]
|
||||
|
||||
@@ -11,11 +11,13 @@ pkgs.mkShell rec {
|
||||
clang
|
||||
gcc
|
||||
protobuf
|
||||
gnumake
|
||||
mold
|
||||
(fenix.fromToolchainFile {
|
||||
dir = ./.;
|
||||
})
|
||||
cargo-nextest
|
||||
cargo-llvm-cov
|
||||
taplo
|
||||
curl
|
||||
];
|
||||
|
||||
@@ -57,13 +57,13 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
|
||||
}
|
||||
if let Some(options) = column_def.options.as_ref() {
|
||||
if let Some(fulltext) = options.options.get(FULLTEXT_GRPC_KEY) {
|
||||
metadata.insert(FULLTEXT_KEY.to_string(), fulltext.clone());
|
||||
metadata.insert(FULLTEXT_KEY.to_string(), fulltext.to_owned());
|
||||
}
|
||||
if let Some(inverted_index) = options.options.get(INVERTED_INDEX_GRPC_KEY) {
|
||||
metadata.insert(INVERTED_INDEX_KEY.to_string(), inverted_index.clone());
|
||||
metadata.insert(INVERTED_INDEX_KEY.to_string(), inverted_index.to_owned());
|
||||
}
|
||||
if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
|
||||
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.clone());
|
||||
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -82,7 +82,7 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
|
||||
if let Some(fulltext) = column_schema.metadata().get(FULLTEXT_KEY) {
|
||||
options
|
||||
.options
|
||||
.insert(FULLTEXT_GRPC_KEY.to_string(), fulltext.clone());
|
||||
.insert(FULLTEXT_GRPC_KEY.to_string(), fulltext.to_owned());
|
||||
}
|
||||
if let Some(inverted_index) = column_schema.metadata().get(INVERTED_INDEX_KEY) {
|
||||
options
|
||||
@@ -181,14 +181,14 @@ mod tests {
|
||||
let options = options_from_column_schema(&schema);
|
||||
assert!(options.is_none());
|
||||
|
||||
let schema = ColumnSchema::new("test", ConcreteDataType::string_datatype(), true)
|
||||
let mut schema = ColumnSchema::new("test", ConcreteDataType::string_datatype(), true)
|
||||
.with_fulltext_options(FulltextOptions {
|
||||
enable: true,
|
||||
analyzer: FulltextAnalyzer::English,
|
||||
case_sensitive: false,
|
||||
})
|
||||
.unwrap()
|
||||
.set_inverted_index(true);
|
||||
.unwrap();
|
||||
schema.with_inverted_index(true);
|
||||
let options = options_from_column_schema(&schema).unwrap();
|
||||
assert_eq!(
|
||||
options.options.get(FULLTEXT_GRPC_KEY).unwrap(),
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
mod pg_catalog_memory_table;
|
||||
mod pg_class;
|
||||
mod pg_database;
|
||||
mod pg_namespace;
|
||||
mod table_names;
|
||||
|
||||
@@ -26,6 +27,7 @@ use lazy_static::lazy_static;
|
||||
use paste::paste;
|
||||
use pg_catalog_memory_table::get_schema_columns;
|
||||
use pg_class::PGClass;
|
||||
use pg_database::PGDatabase;
|
||||
use pg_namespace::PGNamespace;
|
||||
use session::context::{Channel, QueryContext};
|
||||
use table::TableRef;
|
||||
@@ -113,6 +115,10 @@ impl PGCatalogProvider {
|
||||
PG_CLASS.to_string(),
|
||||
self.build_table(PG_CLASS).expect(PG_NAMESPACE),
|
||||
);
|
||||
tables.insert(
|
||||
PG_DATABASE.to_string(),
|
||||
self.build_table(PG_DATABASE).expect(PG_DATABASE),
|
||||
);
|
||||
self.tables = tables;
|
||||
}
|
||||
}
|
||||
@@ -135,6 +141,11 @@ impl SystemSchemaProviderInner for PGCatalogProvider {
|
||||
self.catalog_manager.clone(),
|
||||
self.namespace_oid_map.clone(),
|
||||
))),
|
||||
table_names::PG_DATABASE => Some(Arc::new(PGDatabase::new(
|
||||
self.catalog_name.clone(),
|
||||
self.catalog_manager.clone(),
|
||||
self.namespace_oid_map.clone(),
|
||||
))),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
src/catalog/src/system_schema/pg_catalog/pg_database.rs (new file, 214 lines)
@@ -0,0 +1,214 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::{Arc, Weak};
|
||||
|
||||
use arrow_schema::SchemaRef as ArrowSchemaRef;
|
||||
use common_catalog::consts::PG_CATALOG_PG_DATABASE_TABLE_ID;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_recordbatch::adapter::RecordBatchStreamAdapter;
|
||||
use common_recordbatch::{DfSendableRecordBatchStream, RecordBatch};
|
||||
use datafusion::execution::TaskContext;
|
||||
use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter;
|
||||
use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
|
||||
use datatypes::scalars::ScalarVectorBuilder;
|
||||
use datatypes::schema::{Schema, SchemaRef};
|
||||
use datatypes::value::Value;
|
||||
use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, VectorRef};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::storage::ScanRequest;
|
||||
|
||||
use super::pg_namespace::oid_map::PGNamespaceOidMapRef;
|
||||
use super::{query_ctx, OID_COLUMN_NAME, PG_DATABASE};
|
||||
use crate::error::{
|
||||
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
|
||||
};
|
||||
use crate::information_schema::Predicates;
|
||||
use crate::system_schema::utils::tables::{string_column, u32_column};
|
||||
use crate::system_schema::SystemTable;
|
||||
use crate::CatalogManager;
|
||||
|
||||
// === column name ===
|
||||
pub const DATNAME: &str = "datname";
|
||||
|
||||
/// The initial capacity of the vector builders.
|
||||
const INIT_CAPACITY: usize = 42;
|
||||
|
||||
/// The `pg_catalog.database` table implementation.
|
||||
pub(super) struct PGDatabase {
|
||||
schema: SchemaRef,
|
||||
catalog_name: String,
|
||||
catalog_manager: Weak<dyn CatalogManager>,
|
||||
|
||||
// Workaround to convert schema_name to a numeric id
|
||||
namespace_oid_map: PGNamespaceOidMapRef,
|
||||
}
|
||||
|
||||
impl PGDatabase {
|
||||
pub(super) fn new(
|
||||
catalog_name: String,
|
||||
catalog_manager: Weak<dyn CatalogManager>,
|
||||
namespace_oid_map: PGNamespaceOidMapRef,
|
||||
) -> Self {
|
||||
Self {
|
||||
schema: Self::schema(),
|
||||
catalog_name,
|
||||
catalog_manager,
|
||||
namespace_oid_map,
|
||||
}
|
||||
}
|
||||
|
||||
fn schema() -> SchemaRef {
|
||||
Arc::new(Schema::new(vec![
|
||||
u32_column(OID_COLUMN_NAME),
|
||||
string_column(DATNAME),
|
||||
]))
|
||||
}
|
||||
|
||||
fn builder(&self) -> PGCDatabaseBuilder {
|
||||
PGCDatabaseBuilder::new(
|
||||
self.schema.clone(),
|
||||
self.catalog_name.clone(),
|
||||
self.catalog_manager.clone(),
|
||||
self.namespace_oid_map.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl DfPartitionStream for PGDatabase {
|
||||
fn schema(&self) -> &ArrowSchemaRef {
|
||||
self.schema.arrow_schema()
|
||||
}
|
||||
|
||||
fn execute(&self, _: Arc<TaskContext>) -> DfSendableRecordBatchStream {
|
||||
let schema = self.schema.arrow_schema().clone();
|
||||
let mut builder = self.builder();
|
||||
Box::pin(DfRecordBatchStreamAdapter::new(
|
||||
schema,
|
||||
futures::stream::once(async move {
|
||||
builder
|
||||
.make_database(None)
|
||||
.await
|
||||
.map(|x| x.into_df_record_batch())
|
||||
.map_err(Into::into)
|
||||
}),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl SystemTable for PGDatabase {
|
||||
fn table_id(&self) -> table::metadata::TableId {
|
||||
PG_CATALOG_PG_DATABASE_TABLE_ID
|
||||
}
|
||||
|
||||
fn table_name(&self) -> &'static str {
|
||||
PG_DATABASE
|
||||
}
|
||||
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
fn to_stream(
|
||||
&self,
|
||||
request: ScanRequest,
|
||||
) -> Result<common_recordbatch::SendableRecordBatchStream> {
|
||||
let schema = self.schema.arrow_schema().clone();
|
||||
let mut builder = self.builder();
|
||||
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
|
||||
schema,
|
||||
futures::stream::once(async move {
|
||||
builder
|
||||
.make_database(Some(request))
|
||||
.await
|
||||
.map(|x| x.into_df_record_batch())
|
||||
.map_err(Into::into)
|
||||
}),
|
||||
));
|
||||
Ok(Box::pin(
|
||||
RecordBatchStreamAdapter::try_new(stream)
|
||||
.map_err(BoxedError::new)
|
||||
.context(InternalSnafu)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the `pg_catalog.pg_database` table row by row
|
||||
/// `oid` use schema name as a workaround since we don't have numeric schema id.
|
||||
/// `nspname` is the schema name.
|
||||
struct PGCDatabaseBuilder {
|
||||
schema: SchemaRef,
|
||||
catalog_name: String,
|
||||
catalog_manager: Weak<dyn CatalogManager>,
|
||||
namespace_oid_map: PGNamespaceOidMapRef,
|
||||
|
||||
oid: UInt32VectorBuilder,
|
||||
datname: StringVectorBuilder,
|
||||
}
|
||||
|
||||
impl PGCDatabaseBuilder {
|
||||
fn new(
|
||||
schema: SchemaRef,
|
||||
catalog_name: String,
|
||||
catalog_manager: Weak<dyn CatalogManager>,
|
||||
namespace_oid_map: PGNamespaceOidMapRef,
|
||||
) -> Self {
|
||||
Self {
|
||||
schema,
|
||||
catalog_name,
|
||||
catalog_manager,
|
||||
namespace_oid_map,
|
||||
|
||||
oid: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
datname: StringVectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
}
|
||||
}
|
||||
|
||||
async fn make_database(&mut self, request: Option<ScanRequest>) -> Result<RecordBatch> {
|
||||
let catalog_name = self.catalog_name.clone();
|
||||
let catalog_manager = self
|
||||
.catalog_manager
|
||||
.upgrade()
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
for schema_name in catalog_manager
|
||||
.schema_names(&catalog_name, query_ctx())
|
||||
.await?
|
||||
{
|
||||
self.add_database(&predicates, &schema_name);
|
||||
}
|
||||
self.finish()
|
||||
}
|
||||
|
||||
fn add_database(&mut self, predicates: &Predicates, schema_name: &str) {
|
||||
let oid = self.namespace_oid_map.get_oid(schema_name);
|
||||
let row: [(&str, &Value); 2] = [
|
||||
(OID_COLUMN_NAME, &Value::from(oid)),
|
||||
(DATNAME, &Value::from(schema_name)),
|
||||
];
|
||||
|
||||
if !predicates.eval(&row) {
|
||||
return;
|
||||
}
|
||||
|
||||
self.oid.push(Some(oid));
|
||||
self.datname.push(Some(schema_name));
|
||||
}
|
||||
|
||||
fn finish(&mut self) -> Result<RecordBatch> {
|
||||
let columns: Vec<VectorRef> =
|
||||
vec![Arc::new(self.oid.finish()), Arc::new(self.datname.finish())];
|
||||
RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu)
|
||||
}
|
||||
}
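With `PGDatabase` registered by `PGCatalogProvider` (see the provider changes above), a PostgreSQL client connected to GreptimeDB should be able to list schemas with something like `SELECT datname FROM pg_database`, where each row's `oid` is derived from the schema name via the namespace oid map workaround noted in the comments.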
|
||||
@@ -12,7 +12,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub const PG_DATABASE: &str = "pg_databases";
|
||||
// https://www.postgresql.org/docs/current/catalog-pg-database.html
|
||||
pub const PG_DATABASE: &str = "pg_database";
|
||||
// https://www.postgresql.org/docs/current/catalog-pg-namespace.html
|
||||
pub const PG_NAMESPACE: &str = "pg_namespace";
|
||||
// https://www.postgresql.org/docs/current/catalog-pg-class.html
|
||||
pub const PG_CLASS: &str = "pg_class";
|
||||
// https://www.postgresql.org/docs/current/catalog-pg-type.html
|
||||
pub const PG_TYPE: &str = "pg_type";
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry};
|
||||
use catalog::information_extension::DistributedInformationExtension;
|
||||
@@ -142,6 +143,11 @@ struct StartCommand {
|
||||
/// The prefix of environment variables, default is `GREPTIMEDB_FLOWNODE`;
|
||||
#[clap(long, default_value = "GREPTIMEDB_FLOWNODE")]
|
||||
env_prefix: String,
|
||||
#[clap(long)]
|
||||
http_addr: Option<String>,
|
||||
/// HTTP request timeout in seconds.
|
||||
#[clap(long)]
|
||||
http_timeout: Option<u64>,
|
||||
}
|
||||
|
||||
impl StartCommand {
|
||||
@@ -198,6 +204,14 @@ impl StartCommand {
|
||||
opts.mode = Mode::Distributed;
|
||||
}
|
||||
|
||||
if let Some(http_addr) = &self.http_addr {
|
||||
opts.http.addr.clone_from(http_addr);
|
||||
}
|
||||
|
||||
if let Some(http_timeout) = self.http_timeout {
|
||||
opts.http.timeout = Duration::from_secs(http_timeout);
|
||||
}
|
||||
|
||||
if let (Mode::Distributed, None) = (&opts.mode, &opts.node_id) {
|
||||
return MissingConfigSnafu {
|
||||
msg: "Missing node id option",
|
||||
|
||||
@@ -54,7 +54,7 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
|
||||
use datanode::datanode::{Datanode, DatanodeBuilder};
|
||||
use datanode::region_server::RegionServer;
|
||||
use file_engine::config::EngineConfig as FileEngineConfig;
|
||||
use flow::{FlowWorkerManager, FlownodeBuilder, FrontendInvoker};
|
||||
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
|
||||
use frontend::frontend::FrontendOptions;
|
||||
use frontend::instance::builder::FrontendBuilder;
|
||||
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
|
||||
@@ -145,6 +145,7 @@ pub struct StandaloneOptions {
|
||||
pub storage: StorageConfig,
|
||||
pub metadata_store: KvBackendConfig,
|
||||
pub procedure: ProcedureConfig,
|
||||
pub flow: FlowConfig,
|
||||
pub logging: LoggingOptions,
|
||||
pub user_provider: Option<String>,
|
||||
/// Options for different store engines.
|
||||
@@ -173,6 +174,7 @@ impl Default for StandaloneOptions {
|
||||
storage: StorageConfig::default(),
|
||||
metadata_store: KvBackendConfig::default(),
|
||||
procedure: ProcedureConfig::default(),
|
||||
flow: FlowConfig::default(),
|
||||
logging: LoggingOptions::default(),
|
||||
export_metrics: ExportMetricsOption::default(),
|
||||
user_provider: None,
|
||||
@@ -523,8 +525,12 @@ impl StartCommand {
|
||||
Self::create_table_metadata_manager(kv_backend.clone()).await?;
|
||||
|
||||
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(kv_backend.clone()));
|
||||
let flownode_options = FlownodeOptions {
|
||||
flow: opts.flow.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
let flow_builder = FlownodeBuilder::new(
|
||||
Default::default(),
|
||||
flownode_options,
|
||||
plugins.clone(),
|
||||
table_metadata_manager.clone(),
|
||||
catalog_manager.clone(),
|
||||
|
||||
@@ -109,6 +109,7 @@ pub const INFORMATION_SCHEMA_REGION_STATISTICS_TABLE_ID: u32 = 35;
|
||||
pub const PG_CATALOG_PG_CLASS_TABLE_ID: u32 = 256;
|
||||
pub const PG_CATALOG_PG_TYPE_TABLE_ID: u32 = 257;
|
||||
pub const PG_CATALOG_PG_NAMESPACE_TABLE_ID: u32 = 258;
|
||||
pub const PG_CATALOG_PG_DATABASE_TABLE_ID: u32 = 259;
|
||||
|
||||
// ----- End of pg_catalog tables -----
|
||||
|
||||
|
||||
@@ -73,14 +73,21 @@ pub trait Configurable: Serialize + DeserializeOwned + Default + Sized {
|
||||
layered_config = layered_config.add_source(File::new(config_file, FileFormat::Toml));
|
||||
}
|
||||
|
||||
let opts = layered_config
|
||||
let mut opts: Self = layered_config
|
||||
.build()
|
||||
.and_then(|x| x.try_deserialize())
|
||||
.context(LoadLayeredConfigSnafu)?;
|
||||
|
||||
opts.validate_sanitize()?;
|
||||
|
||||
Ok(opts)
|
||||
}
|
||||
|
||||
/// Validate(and possibly sanitize) the configuration.
|
||||
fn validate_sanitize(&mut self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// List of toml keys that should be parsed as a list.
|
||||
fn env_list_keys() -> Option<&'static [&'static str]> {
|
||||
None
|
||||
|
||||
@@ -180,7 +180,7 @@ pub enum Error {
|
||||
|
||||
#[snafu(display("Failed to parse format {} with value: {}", key, value))]
|
||||
ParseFormat {
|
||||
key: &'static str,
|
||||
key: String,
|
||||
value: String,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
|
||||
@@ -25,12 +25,15 @@ use datatypes::schema::{ColumnSchema, FulltextOptions, RawSchema};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::region_request::{SetRegionOption, UnsetRegionOption};
|
||||
use table::metadata::TableId;
|
||||
use table::requests::{AddColumnRequest, AlterKind, AlterTableRequest, ModifyColumnTypeRequest};
|
||||
use table::requests::{
|
||||
AddColumnRequest, AlterKind, AlterTableRequest, ModifyColumnTypeRequest, SetIndexOptions,
|
||||
UnsetIndexOptions,
|
||||
};
|
||||
|
||||
use crate::error::{
|
||||
InvalidColumnDefSnafu, InvalidSetFulltextOptionRequestSnafu, InvalidSetTableOptionRequestSnafu,
|
||||
InvalidUnsetTableOptionRequestSnafu, MissingFieldSnafu, MissingTimestampColumnSnafu, Result,
|
||||
UnknownLocationTypeSnafu,
|
||||
InvalidUnsetTableOptionRequestSnafu, MissingAlterIndexOptionSnafu, MissingFieldSnafu,
|
||||
MissingTimestampColumnSnafu, Result, UnknownLocationTypeSnafu,
|
||||
};
|
||||
|
||||
const LOCATION_TYPE_FIRST: i32 = LocationType::First as i32;
|
||||
@@ -114,18 +117,43 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result<
|
||||
.context(InvalidUnsetTableOptionRequestSnafu)?,
|
||||
}
|
||||
}
|
||||
Kind::SetColumnFulltext(c) => AlterKind::SetColumnFulltext {
|
||||
column_name: c.column_name,
|
||||
options: FulltextOptions {
|
||||
enable: c.enable,
|
||||
analyzer: as_fulltext_option(
|
||||
Analyzer::try_from(c.analyzer).context(InvalidSetFulltextOptionRequestSnafu)?,
|
||||
),
|
||||
case_sensitive: c.case_sensitive,
|
||||
Kind::SetIndex(o) => match o.options {
|
||||
Some(opt) => match opt {
|
||||
api::v1::set_index::Options::Fulltext(f) => AlterKind::SetIndex {
|
||||
options: SetIndexOptions::Fulltext {
|
||||
column_name: f.column_name.clone(),
|
||||
options: FulltextOptions {
|
||||
enable: f.enable,
|
||||
analyzer: as_fulltext_option(
|
||||
Analyzer::try_from(f.analyzer)
|
||||
.context(InvalidSetFulltextOptionRequestSnafu)?,
|
||||
),
|
||||
case_sensitive: f.case_sensitive,
|
||||
},
|
||||
},
|
||||
},
|
||||
api::v1::set_index::Options::Inverted(i) => AlterKind::SetIndex {
|
||||
options: SetIndexOptions::Inverted {
|
||||
column_name: i.column_name,
|
||||
},
|
||||
},
|
||||
},
|
||||
None => return MissingAlterIndexOptionSnafu.fail(),
|
||||
},
|
||||
Kind::UnsetColumnFulltext(c) => AlterKind::UnsetColumnFulltext {
|
||||
column_name: c.column_name,
|
||||
Kind::UnsetIndex(o) => match o.options {
|
||||
Some(opt) => match opt {
|
||||
api::v1::unset_index::Options::Fulltext(f) => AlterKind::UnsetIndex {
|
||||
options: UnsetIndexOptions::Fulltext {
|
||||
column_name: f.column_name,
|
||||
},
|
||||
},
|
||||
api::v1::unset_index::Options::Inverted(i) => AlterKind::UnsetIndex {
|
||||
options: UnsetIndexOptions::Inverted {
|
||||
column_name: i.column_name,
|
||||
},
|
||||
},
|
||||
},
|
||||
None => return MissingAlterIndexOptionSnafu.fail(),
|
||||
},
|
||||
};
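In effect, the column-level `SetColumnFulltext`/`UnsetColumnFulltext` kinds are generalized into `SetIndex`/`UnsetIndex`: the request's `options` oneof selects either a fulltext index (with analyzer and case sensitivity) or an inverted index on the named column, and a request with no `options` set now maps to the new `MissingAlterIndexOption` error.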
|
||||
|
||||
|
||||
@@ -139,6 +139,12 @@ pub enum Error {
|
||||
#[snafu(source)]
|
||||
error: prost::DecodeError,
|
||||
},
|
||||
|
||||
#[snafu(display("Missing alter index options"))]
|
||||
MissingAlterIndexOption {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -164,7 +170,8 @@ impl ErrorExt for Error {
|
||||
}
|
||||
Error::InvalidSetTableOptionRequest { .. }
|
||||
| Error::InvalidUnsetTableOptionRequest { .. }
|
||||
| Error::InvalidSetFulltextOptionRequest { .. } => StatusCode::InvalidArguments,
|
||||
| Error::InvalidSetFulltextOptionRequest { .. }
|
||||
| Error::MissingAlterIndexOption { .. } => StatusCode::InvalidArguments,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -119,29 +119,30 @@ pub fn build_create_table_expr(
|
||||
}
|
||||
|
||||
let mut column_defs = Vec::with_capacity(column_exprs.len());
|
||||
let mut primary_keys = Vec::default();
|
||||
let mut primary_keys = Vec::with_capacity(column_exprs.len());
|
||||
let mut time_index = None;
|
||||
|
||||
for ColumnExpr {
|
||||
column_name,
|
||||
datatype,
|
||||
semantic_type,
|
||||
datatype_extension,
|
||||
options,
|
||||
} in column_exprs
|
||||
{
|
||||
for expr in column_exprs {
|
||||
let ColumnExpr {
|
||||
column_name,
|
||||
datatype,
|
||||
semantic_type,
|
||||
datatype_extension,
|
||||
options,
|
||||
} = expr;
|
||||
|
||||
let mut is_nullable = true;
|
||||
match semantic_type {
|
||||
v if v == SemanticType::Tag as i32 => primary_keys.push(column_name.to_string()),
|
||||
v if v == SemanticType::Tag as i32 => primary_keys.push(column_name.to_owned()),
|
||||
v if v == SemanticType::Timestamp as i32 => {
|
||||
ensure!(
|
||||
time_index.is_none(),
|
||||
DuplicatedTimestampColumnSnafu {
|
||||
exists: time_index.unwrap(),
|
||||
exists: time_index.as_ref().unwrap(),
|
||||
duplicated: column_name,
|
||||
}
|
||||
);
|
||||
time_index = Some(column_name.to_string());
|
||||
time_index = Some(column_name.to_owned());
|
||||
// Timestamp column must not be null.
|
||||
is_nullable = false;
|
||||
}
|
||||
@@ -158,8 +159,8 @@ pub fn build_create_table_expr(
|
||||
}
|
||||
);
|
||||
|
||||
let column_def = ColumnDef {
|
||||
name: column_name.to_string(),
|
||||
column_defs.push(ColumnDef {
|
||||
name: column_name.to_owned(),
|
||||
data_type: datatype,
|
||||
is_nullable,
|
||||
default_constraint: vec![],
|
||||
@@ -167,15 +168,14 @@ pub fn build_create_table_expr(
|
||||
comment: String::new(),
|
||||
datatype_extension: datatype_extension.clone(),
|
||||
options: options.clone(),
|
||||
};
|
||||
column_defs.push(column_def);
|
||||
});
|
||||
}
|
||||
|
||||
let time_index = time_index.context(MissingTimestampColumnSnafu {
|
||||
msg: format!("table is {}", table_name.table),
|
||||
})?;
|
||||
|
||||
let expr = CreateTableExpr {
|
||||
Ok(CreateTableExpr {
|
||||
catalog_name: table_name.catalog.to_string(),
|
||||
schema_name: table_name.schema.to_string(),
|
||||
table_name: table_name.table.to_string(),
|
||||
@@ -187,9 +187,7 @@ pub fn build_create_table_expr(
|
||||
table_options: Default::default(),
|
||||
table_id: table_id.map(|id| api::v1::TableId { id }),
|
||||
engine: engine.to_string(),
|
||||
};
|
||||
|
||||
Ok(expr)
|
||||
})
|
||||
}
|
||||
|
||||
/// Find columns that are not present in the schema and return them as `AddColumns`
|
||||
|
||||
@@ -133,10 +133,8 @@ fn create_proto_alter_kind(
|
||||
Kind::RenameTable(_) => Ok(None),
|
||||
Kind::SetTableOptions(v) => Ok(Some(alter_request::Kind::SetTableOptions(v.clone()))),
|
||||
Kind::UnsetTableOptions(v) => Ok(Some(alter_request::Kind::UnsetTableOptions(v.clone()))),
|
||||
Kind::SetColumnFulltext(v) => Ok(Some(alter_request::Kind::SetColumnFulltext(v.clone()))),
|
||||
Kind::UnsetColumnFulltext(v) => {
|
||||
Ok(Some(alter_request::Kind::UnsetColumnFulltext(v.clone())))
|
||||
}
|
||||
Kind::SetIndex(v) => Ok(Some(alter_request::Kind::SetIndex(v.clone()))),
|
||||
Kind::UnsetIndex(v) => Ok(Some(alter_request::Kind::UnsetIndex(v.clone()))),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -60,8 +60,8 @@ impl AlterTableProcedure {
|
||||
| AlterKind::ModifyColumnTypes { .. }
|
||||
| AlterKind::SetTableOptions { .. }
|
||||
| AlterKind::UnsetTableOptions { .. }
|
||||
| AlterKind::SetColumnFulltext { .. }
|
||||
| AlterKind::UnsetColumnFulltext { .. } => {}
|
||||
| AlterKind::SetIndex { .. }
|
||||
| AlterKind::UnsetIndex { .. } => {}
|
||||
}
|
||||
|
||||
Ok(new_info)
|
||||
|
||||
@@ -21,7 +21,7 @@ use api::v1::CreateTableExpr;
|
||||
use async_trait::async_trait;
|
||||
use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu};
|
||||
use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status};
|
||||
use common_telemetry::warn;
|
||||
use common_telemetry::{debug, warn};
|
||||
use futures_util::future::join_all;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::{ensure, ResultExt};
|
||||
@@ -143,7 +143,12 @@ impl CreateLogicalTablesProcedure {
|
||||
|
||||
for peer in leaders {
|
||||
let requester = self.context.node_manager.datanode(&peer).await;
|
||||
let request = self.make_request(&peer, region_routes)?;
|
||||
let Some(request) = self.make_request(&peer, region_routes)? else {
|
||||
debug!("no region request to send to datanode {}", peer);
|
||||
// We can skip the rest of the datanodes,
|
||||
// the rest of the datanodes should have the same result.
|
||||
break;
|
||||
};
|
||||
|
||||
create_region_tasks.push(async move {
|
||||
requester
|
||||
|
||||
@@ -25,7 +25,7 @@ impl CreateLogicalTablesProcedure {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn check_tables_already_exist(&mut self) -> Result<()> {
|
||||
pub async fn check_tables_already_exist(&mut self) -> Result<()> {
|
||||
let table_name_keys = self
|
||||
.data
|
||||
.all_create_table_exprs()
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use api::v1::region::{region_request, CreateRequests, RegionRequest, RegionRequestHeader};
|
||||
use common_telemetry::debug;
|
||||
use common_telemetry::tracing_context::TracingContext;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
@@ -31,11 +32,15 @@ impl CreateLogicalTablesProcedure {
|
||||
&self,
|
||||
peer: &Peer,
|
||||
region_routes: &[RegionRoute],
|
||||
) -> Result<RegionRequest> {
|
||||
) -> Result<Option<RegionRequest>> {
|
||||
let tasks = &self.data.tasks;
|
||||
let table_ids_already_exists = &self.data.table_ids_already_exists;
|
||||
let regions_on_this_peer = find_leader_regions(region_routes, peer);
|
||||
let mut requests = Vec::with_capacity(tasks.len() * regions_on_this_peer.len());
|
||||
for task in tasks {
|
||||
for (task, table_id_already_exists) in tasks.iter().zip(table_ids_already_exists) {
|
||||
if table_id_already_exists.is_some() {
|
||||
continue;
|
||||
}
|
||||
let create_table_expr = &task.create_table;
|
||||
let catalog = &create_table_expr.catalog_name;
|
||||
let schema = &create_table_expr.schema_name;
|
||||
@@ -51,13 +56,18 @@ impl CreateLogicalTablesProcedure {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(RegionRequest {
|
||||
if requests.is_empty() {
|
||||
debug!("no region request to send to datanodes");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some(RegionRequest {
|
||||
header: Some(RegionRequestHeader {
|
||||
tracing_context: TracingContext::from_current_span().to_w3c(),
|
||||
..Default::default()
|
||||
}),
|
||||
body: Some(region_request::Body::Creates(CreateRequests { requests })),
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
fn create_region_request_builder(
|
||||
|
||||
@@ -12,7 +12,7 @@ snafu.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
pprof = { version = "0.13", features = [
|
||||
pprof = { version = "0.14", features = [
|
||||
"flamegraph",
|
||||
"prost-codec",
|
||||
"protobuf",
|
||||
|
||||
@@ -189,7 +189,7 @@ impl StateStore for ObjectStateStore {
|
||||
|
||||
async fn batch_delete(&self, keys: &[String]) -> Result<()> {
|
||||
self.store
|
||||
.remove(keys.to_vec())
|
||||
.delete_iter(keys.iter().map(String::as_str))
|
||||
.await
|
||||
.with_context(|_| DeleteStateSnafu {
|
||||
key: format!("{:?}", keys),
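The change above swaps `remove(keys.to_vec())` for `delete_iter(keys.iter().map(String::as_str))`, which streams borrowed keys to the object store instead of cloning the whole list into an owned `Vec<String>`; presumably this also tracks the underlying object-store crate's move toward the iterator-based delete API.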
|
||||
|
||||
@@ -28,14 +28,13 @@ pub fn build_same_type_ts_filter(
ts_schema: &ColumnSchema,
time_range: Option<TimestampRange>,
) -> Option<Expr> {
let ts_type = ts_schema.data_type.clone();
let time_range = time_range?;
let start = time_range
.start()
.and_then(|start| ts_type.try_cast(Value::Timestamp(start)));
.and_then(|start| ts_schema.data_type.try_cast(Value::Timestamp(start)));
let end = time_range
.end()
.and_then(|end| ts_type.try_cast(Value::Timestamp(end)));
.and_then(|end| ts_schema.data_type.try_cast(Value::Timestamp(end)));

let time_range = match (start, end) {
(Some(Value::Timestamp(start)), Some(Value::Timestamp(end))) => {
@@ -108,11 +108,6 @@ impl Time {
self.as_formatted_string("%H:%M:%S%.f%z", None)
}

/// Format Time for system timeszone.
pub fn to_system_tz_string(&self) -> String {
self.as_formatted_string("%H:%M:%S%.f", None)
}

/// Format Time for given timezone.
/// When timezone is None, using system timezone by default.
pub fn to_timezone_aware_string(&self, tz: Option<&Timezone>) -> String {
@@ -123,6 +123,14 @@ impl ColumnSchema {
|
||||
self.default_constraint.as_ref()
|
||||
}
|
||||
|
||||
/// Check if the default constraint is an impure function.
|
||||
pub fn is_default_impure(&self) -> bool {
|
||||
self.default_constraint
|
||||
.as_ref()
|
||||
.map(|c| c.is_function())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn metadata(&self) -> &Metadata {
|
||||
&self.metadata
|
||||
@@ -150,11 +158,22 @@ impl ColumnSchema {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn set_inverted_index(mut self, value: bool) -> Self {
|
||||
let _ = self
|
||||
.metadata
|
||||
.insert(INVERTED_INDEX_KEY.to_string(), value.to_string());
|
||||
self
|
||||
pub fn with_inverted_index(&mut self, value: bool) {
|
||||
match value {
|
||||
true => {
|
||||
self.metadata
|
||||
.insert(INVERTED_INDEX_KEY.to_string(), value.to_string());
|
||||
}
|
||||
false => {
|
||||
self.metadata.remove(INVERTED_INDEX_KEY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Put a placeholder to invalidate schemas.all(!has_inverted_index_key).
|
||||
pub fn insert_inverted_index_placeholder(&mut self) {
|
||||
self.metadata
|
||||
.insert(INVERTED_INDEX_KEY.to_string(), "".to_string());
|
||||
}
|
||||
|
||||
pub fn is_inverted_indexed(&self) -> bool {
|
||||
@@ -290,6 +309,15 @@ impl ColumnSchema {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates an impure default value for this column, but only if it has an impure default constraint.
|
||||
/// Otherwise, returns `Ok(None)`.
|
||||
pub fn create_impure_default(&self) -> Result<Option<Value>> {
|
||||
match &self.default_constraint {
|
||||
Some(c) => c.create_impure_default(&self.data_type),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieves the fulltext options for the column.
|
||||
pub fn fulltext_options(&self) -> Result<Option<FulltextOptions>> {
|
||||
match self.metadata.get(FULLTEXT_KEY) {
|
||||
|
||||
@@ -178,12 +178,63 @@ impl ColumnDefaultConstraint {
|
||||
}
|
||||
}
|
||||
|
||||
/// Only create a default vector if the constraint is impure, i.e., a function.
|
||||
///
|
||||
/// This helps delay creating constant default values until the mito engine, while keeping impure defaults consistent.
|
||||
pub fn create_impure_default_vector(
|
||||
&self,
|
||||
data_type: &ConcreteDataType,
|
||||
num_rows: usize,
|
||||
) -> Result<Option<VectorRef>> {
|
||||
assert!(num_rows > 0);
|
||||
|
||||
match self {
|
||||
ColumnDefaultConstraint::Function(expr) => {
|
||||
// Functions should also ensure its return value is not null when
|
||||
// is_nullable is true.
|
||||
match &expr[..] {
|
||||
// TODO(dennis): we only supports current_timestamp right now,
|
||||
// it's better to use a expression framework in future.
|
||||
CURRENT_TIMESTAMP | CURRENT_TIMESTAMP_FN | NOW_FN => {
|
||||
create_current_timestamp_vector(data_type, num_rows).map(Some)
|
||||
}
|
||||
_ => error::UnsupportedDefaultExprSnafu { expr }.fail(),
|
||||
}
|
||||
}
|
||||
ColumnDefaultConstraint::Value(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Only create a default value if the constraint is impure, i.e., a function.
|
||||
///
|
||||
/// This helps delay creating constant default values until the mito engine, while keeping impure defaults consistent.
|
||||
pub fn create_impure_default(&self, data_type: &ConcreteDataType) -> Result<Option<Value>> {
|
||||
match self {
|
||||
ColumnDefaultConstraint::Function(expr) => {
|
||||
// Functions should also ensure its return value is not null when
|
||||
// is_nullable is true.
|
||||
match &expr[..] {
|
||||
CURRENT_TIMESTAMP | CURRENT_TIMESTAMP_FN | NOW_FN => {
|
||||
create_current_timestamp(data_type).map(Some)
|
||||
}
|
||||
_ => error::UnsupportedDefaultExprSnafu { expr }.fail(),
|
||||
}
|
||||
}
|
||||
ColumnDefaultConstraint::Value(_) => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if this constraint might create NULL.
|
||||
fn maybe_null(&self) -> bool {
|
||||
// Once we support more functions, we may return true if given function
|
||||
// could return null.
|
||||
matches!(self, ColumnDefaultConstraint::Value(Value::Null))
|
||||
}
|
||||
|
||||
/// Returns true if this constraint is a function.
|
||||
pub fn is_function(&self) -> bool {
|
||||
matches!(self, ColumnDefaultConstraint::Function(_))
|
||||
}
|
||||
}
|
||||
|
||||
fn create_current_timestamp(data_type: &ConcreteDataType) -> Result<Value> {
|
||||
|
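The consistency concern in the doc comments above can be illustrated with a std-only sketch; this is not the engine's code path and the helper name is made up, it only mirrors the idea behind create_impure_default_vector (evaluate an impure default such as now() once, then reuse it for every row).

// Std-only sketch of why impure defaults are evaluated once per request:
// calling now() per row could yield different timestamps within one insert.
use std::time::{SystemTime, UNIX_EPOCH};

fn fill_missing_timestamps(num_rows: usize) -> Vec<u128> {
    // Evaluate the impure default a single time...
    let now = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // ...then reuse it for every row, so the whole column is consistent.
    vec![now; num_rows]
}

fn main() {
    let col = fill_missing_timestamps(3);
    assert!(col.iter().all(|v| *v == col[0]));
}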
||||
@@ -32,6 +32,7 @@ common-runtime.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
common-version.workspace = true
config.workspace = true
datafusion.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
@@ -36,6 +36,7 @@ use query::QueryEngine;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use servers::grpc::GrpcOptions;
|
||||
use servers::heartbeat_options::HeartbeatOptions;
|
||||
use servers::http::HttpOptions;
|
||||
use servers::Mode;
|
||||
use session::context::QueryContext;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
@@ -45,9 +46,10 @@ use tokio::sync::broadcast::error::TryRecvError;
|
||||
use tokio::sync::{broadcast, watch, Mutex, RwLock};
|
||||
|
||||
pub(crate) use crate::adapter::node_context::FlownodeContext;
|
||||
use crate::adapter::refill::RefillTask;
|
||||
use crate::adapter::table_source::ManagedTableSource;
|
||||
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
|
||||
use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
|
||||
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
|
||||
use crate::compute::ErrCollector;
|
||||
use crate::df_optimizer::sql_to_flow_plan;
|
||||
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
|
||||
@@ -57,6 +59,7 @@ use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
|
||||
|
||||
mod flownode_impl;
|
||||
mod parse_expr;
|
||||
pub(crate) mod refill;
|
||||
mod stat;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
@@ -80,6 +83,21 @@ pub const UPDATE_AT_TS_COL: &str = "update_at";
|
||||
pub type FlowId = u64;
|
||||
pub type TableName = [String; 3];
|
||||
|
||||
/// Flow config that exists in both standalone and distributed modes
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(default)]
|
||||
pub struct FlowConfig {
|
||||
pub num_workers: usize,
|
||||
}
|
||||
|
||||
impl Default for FlowConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
num_workers: (common_config::utils::get_cpus() / 2).max(1),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Options for flow node
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
@@ -87,7 +105,9 @@ pub struct FlownodeOptions {
|
||||
pub mode: Mode,
|
||||
pub cluster_id: Option<u64>,
|
||||
pub node_id: Option<u64>,
|
||||
pub flow: FlowConfig,
|
||||
pub grpc: GrpcOptions,
|
||||
pub http: HttpOptions,
|
||||
pub meta_client: Option<MetaClientOptions>,
|
||||
pub logging: LoggingOptions,
|
||||
pub tracing: TracingOptions,
|
||||
@@ -100,7 +120,9 @@ impl Default for FlownodeOptions {
|
||||
mode: servers::Mode::Standalone,
|
||||
cluster_id: None,
|
||||
node_id: None,
|
||||
flow: FlowConfig::default(),
|
||||
grpc: GrpcOptions::default().with_addr("127.0.0.1:3004"),
|
||||
http: HttpOptions::default(),
|
||||
meta_client: None,
|
||||
logging: LoggingOptions::default(),
|
||||
tracing: TracingOptions::default(),
|
||||
@@ -109,7 +131,14 @@ impl Default for FlownodeOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl Configurable for FlownodeOptions {}
|
||||
impl Configurable for FlownodeOptions {
|
||||
fn validate_sanitize(&mut self) -> common_config::error::Result<()> {
|
||||
if self.flow.num_workers == 0 {
|
||||
self.flow.num_workers = (common_config::utils::get_cpus() / 2).max(1);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Arc-ed FlowNodeManager, cheaper to clone
|
||||
pub type FlowWorkerManagerRef = Arc<FlowWorkerManager>;
|
||||
@@ -120,7 +149,9 @@ pub type FlowWorkerManagerRef = Arc<FlowWorkerManager>;
|
||||
pub struct FlowWorkerManager {
|
||||
/// The handler to the worker that will run the dataflow
|
||||
/// which is `!Send` so a handle is used
|
||||
pub worker_handles: Vec<Mutex<WorkerHandle>>,
|
||||
pub worker_handles: Vec<WorkerHandle>,
|
||||
/// The selector to select a worker to run the dataflow
|
||||
worker_selector: Mutex<usize>,
|
||||
/// The query engine that will be used to parse the query and convert it to a dataflow plan
|
||||
pub query_engine: Arc<dyn QueryEngine>,
|
||||
/// Getting table name and table schema from table info manager
|
||||
@@ -128,6 +159,8 @@ pub struct FlowWorkerManager {
|
||||
frontend_invoker: RwLock<Option<FrontendInvoker>>,
|
||||
/// contains mapping from table name to global id, and table schema
|
||||
node_context: RwLock<FlownodeContext>,
|
||||
/// Contains all refill tasks
|
||||
refill_tasks: RwLock<BTreeMap<FlowId, RefillTask>>,
|
||||
flow_err_collectors: RwLock<BTreeMap<FlowId, ErrCollector>>,
|
||||
src_send_buf_lens: RwLock<BTreeMap<TableId, watch::Receiver<usize>>>,
|
||||
tick_manager: FlowTickManager,
|
||||
@@ -162,10 +195,12 @@ impl FlowWorkerManager {
|
||||
let worker_handles = Vec::new();
|
||||
FlowWorkerManager {
|
||||
worker_handles,
|
||||
worker_selector: Mutex::new(0),
|
||||
query_engine,
|
||||
table_info_source: srv_map,
|
||||
frontend_invoker: RwLock::new(None),
|
||||
node_context: RwLock::new(node_context),
|
||||
refill_tasks: Default::default(),
|
||||
flow_err_collectors: Default::default(),
|
||||
src_send_buf_lens: Default::default(),
|
||||
tick_manager,
|
||||
@@ -181,20 +216,27 @@ impl FlowWorkerManager {
|
||||
}
|
||||
|
||||
/// Create a flownode manager with the given number of workers
|
||||
pub fn new_with_worker<'s>(
|
||||
pub fn new_with_workers<'s>(
|
||||
node_id: Option<u32>,
|
||||
query_engine: Arc<dyn QueryEngine>,
|
||||
table_meta: TableMetadataManagerRef,
|
||||
) -> (Self, Worker<'s>) {
|
||||
num_workers: usize,
|
||||
) -> (Self, Vec<Worker<'s>>) {
|
||||
let mut zelf = Self::new(node_id, query_engine, table_meta);
|
||||
let (handle, worker) = create_worker();
|
||||
zelf.add_worker_handle(handle);
|
||||
(zelf, worker)
|
||||
|
||||
let workers: Vec<_> = (0..num_workers)
|
||||
.map(|_| {
|
||||
let (handle, worker) = create_worker();
|
||||
zelf.add_worker_handle(handle);
|
||||
worker
|
||||
})
|
||||
.collect();
|
||||
(zelf, workers)
|
||||
}
|
||||
|
||||
/// Add a worker handle to the manager, meaning the corresponding worker is under its management
|
||||
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
|
||||
self.worker_handles.push(Mutex::new(handle));
|
||||
self.worker_handles.push(handle);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -242,12 +284,29 @@ impl FlowWorkerManager {
|
||||
let (catalog, schema) = (table_name[0].clone(), table_name[1].clone());
|
||||
let ctx = Arc::new(QueryContext::with(&catalog, &schema));
|
||||
|
||||
let (is_ts_placeholder, proto_schema) = self
|
||||
let (is_ts_placeholder, proto_schema) = match self
|
||||
.try_fetch_existing_table(&table_name)
|
||||
.await?
|
||||
.context(UnexpectedSnafu {
|
||||
reason: format!("Table not found: {}", table_name.join(".")),
|
||||
})?;
|
||||
}) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
if self
|
||||
.table_info_source
|
||||
.get_opt_table_id_from_name(&table_name)
|
||||
.await?
|
||||
.is_none()
|
||||
{
|
||||
// deal with the case where both the flow and its sink table no longer exist
// but some output is still in the output buffer
|
||||
common_telemetry::warn!(e; "Table `{}` no longer exists, skip writeback", table_name.join("."));
|
||||
continue;
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
let schema_len = proto_schema.len();
|
||||
|
||||
let total_rows = reqs.iter().map(|r| r.len()).sum::<usize>();
|
||||
@@ -535,13 +594,16 @@ impl FlowWorkerManager {
|
||||
pub async fn run(&self, mut shutdown: Option<broadcast::Receiver<()>>) {
|
||||
debug!("Starting to run");
|
||||
let default_interval = Duration::from_secs(1);
|
||||
let mut tick_interval = tokio::time::interval(default_interval);
|
||||
// burst mode, so that if we miss a tick, we will run immediately to fully utilize the cpu
|
||||
tick_interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Burst);
|
||||
let mut avg_spd = 0; // rows/sec
|
||||
let mut since_last_run = tokio::time::Instant::now();
|
||||
let run_per_trace = 10;
|
||||
let mut run_cnt = 0;
|
||||
loop {
|
||||
// TODO(discord9): only run when new inputs arrive or scheduled to
|
||||
let row_cnt = self.run_available(true).await.unwrap_or_else(|err| {
|
||||
let row_cnt = self.run_available(false).await.unwrap_or_else(|err| {
|
||||
common_telemetry::error!(err;"Run available errors");
|
||||
0
|
||||
});
|
||||
@@ -571,9 +633,9 @@ impl FlowWorkerManager {
|
||||
|
||||
// for now we want to batch rows until there are around `BATCH_SIZE` rows in the send buf
// before triggering a run of the flow's worker
// (plus one to prevent div by zero)
|
||||
let wait_for = since_last_run.elapsed();
|
||||
|
||||
// last runs insert speed
|
||||
let cur_spd = row_cnt * 1000 / wait_for.as_millis().max(1) as usize;
|
||||
// rapid increase, slow decay
|
||||
avg_spd = if cur_spd > avg_spd {
|
||||
@@ -596,7 +658,10 @@ impl FlowWorkerManager {
|
||||
|
||||
METRIC_FLOW_RUN_INTERVAL_MS.set(new_wait.as_millis() as i64);
|
||||
since_last_run = tokio::time::Instant::now();
|
||||
tokio::time::sleep(new_wait).await;
|
||||
tokio::select! {
|
||||
_ = tick_interval.tick() => (),
|
||||
_ = tokio::time::sleep(new_wait) => ()
|
||||
}
|
||||
}
|
||||
// flow is now shut down; drop frontend_invoker early so a ref cycle (in standalone mode) can be prevented:
|
||||
// FlowWorkerManager.frontend_invoker -> FrontendInvoker.inserter
|
||||
@@ -607,9 +672,9 @@ impl FlowWorkerManager {
|
||||
/// Run all available subgraph in the flow node
|
||||
/// This will try to run all dataflow in this node
|
||||
///
|
||||
/// set `blocking` to true to wait until lock is acquired
|
||||
/// and false to return immediately if lock is not acquired
|
||||
/// return numbers of rows send to worker
|
||||
/// set `blocking` to true to wait until the worker finishes running
/// and false to just trigger a run and return immediately
/// return the number of rows sent to the worker (inaccurate)
|
||||
/// TODO(discord9): add flag for subgraph that have input since last run
|
||||
pub async fn run_available(&self, blocking: bool) -> Result<usize, Error> {
|
||||
let mut row_cnt = 0;
|
||||
@@ -617,13 +682,7 @@ impl FlowWorkerManager {
|
||||
let now = self.tick_manager.tick();
|
||||
for worker in self.worker_handles.iter() {
|
||||
// TODO(discord9): consider how to handle error in individual worker
|
||||
if blocking {
|
||||
worker.lock().await.run_available(now, blocking).await?;
|
||||
} else if let Ok(worker) = worker.try_lock() {
|
||||
worker.run_available(now, blocking).await?;
|
||||
} else {
|
||||
return Ok(row_cnt);
|
||||
}
|
||||
worker.run_available(now, blocking).await?;
|
||||
}
|
||||
// check row send and rows remain in send buf
|
||||
let flush_res = if blocking {
|
||||
@@ -694,7 +753,6 @@ impl FlowWorkerManager {
|
||||
/// Remove a flow by its id
|
||||
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
|
||||
for handle in self.worker_handles.iter() {
|
||||
let handle = handle.lock().await;
|
||||
if handle.contains_flow(flow_id).await? {
|
||||
handle.remove_flow(flow_id).await?;
|
||||
break;
|
||||
@@ -830,7 +888,8 @@ impl FlowWorkerManager {
|
||||
.write()
|
||||
.await
|
||||
.insert(flow_id, err_collector.clone());
|
||||
let handle = &self.worker_handles[0].lock().await;
|
||||
// TODO(discord9): load balance?
|
||||
let handle = self.get_worker_handle_for_create_flow().await;
|
||||
let create_request = worker::Request::Create {
|
||||
flow_id,
|
||||
plan: flow_plan,
|
||||
|
||||
@@ -24,6 +24,7 @@ use common_error::ext::BoxedError;
|
||||
use common_meta::error::{ExternalSnafu, Result, UnexpectedSnafu};
|
||||
use common_meta::node_manager::Flownode;
|
||||
use common_telemetry::{debug, trace};
|
||||
use datatypes::value::Value;
|
||||
use itertools::Itertools;
|
||||
use snafu::{IntoError, OptionExt, ResultExt};
|
||||
use store_api::storage::RegionId;
|
||||
@@ -178,14 +179,32 @@ impl Flownode for FlowWorkerManager {
|
||||
.table_from_id(&table_id)
|
||||
.await
|
||||
.map_err(to_meta_err(snafu::location!()))?;
|
||||
let default_vals = table_schema
|
||||
.default_values
|
||||
.iter()
|
||||
.zip(table_schema.relation_desc.typ().column_types.iter())
|
||||
.map(|(v, ty)| {
|
||||
v.as_ref().and_then(|v| {
|
||||
match v.create_default(ty.scalar_type(), ty.nullable()) {
|
||||
Ok(v) => Some(v),
|
||||
Err(err) => {
|
||||
common_telemetry::error!(err; "Failed to create default value");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
let table_types = table_schema
|
||||
.relation_desc
|
||||
.typ()
|
||||
.column_types
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|t| t.scalar_type)
|
||||
.collect_vec();
|
||||
let table_col_names = table_schema.names;
|
||||
let table_col_names = table_schema.relation_desc.names;
|
||||
let table_col_names = table_col_names
|
||||
.iter().enumerate()
|
||||
.map(|(idx,name)| match name {
|
||||
@@ -202,31 +221,35 @@ impl Flownode for FlowWorkerManager {
|
||||
.enumerate()
|
||||
.map(|(i, name)| (&name.column_name, i)),
|
||||
);
|
||||
let fetch_order: Vec<usize> = table_col_names
|
||||
|
||||
let fetch_order: Vec<FetchFromRow> = table_col_names
|
||||
.iter()
|
||||
.map(|col_name| {
|
||||
.zip(default_vals.into_iter())
|
||||
.map(|(col_name, col_default_val)| {
|
||||
name_to_col
|
||||
.get(col_name)
|
||||
.copied()
|
||||
.map(FetchFromRow::Idx)
|
||||
.or_else(|| col_default_val.clone().map(FetchFromRow::Default))
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
err_msg: format!("Column not found: {}", col_name),
|
||||
err_msg: format!(
|
||||
"Column not found: {}, default_value: {:?}",
|
||||
col_name, col_default_val
|
||||
),
|
||||
})
|
||||
})
|
||||
.try_collect()?;
|
||||
if !fetch_order.iter().enumerate().all(|(i, &v)| i == v) {
|
||||
trace!("Reordering columns: {:?}", fetch_order)
|
||||
}
|
||||
|
||||
trace!("Reordering columns: {:?}", fetch_order);
|
||||
(table_types, fetch_order)
|
||||
};
|
||||
|
||||
// TODO(discord9): use column instead of row
|
||||
let rows: Vec<DiffRow> = rows_proto
|
||||
.into_iter()
|
||||
.map(|r| {
|
||||
let r = repr::Row::from(r);
|
||||
let reordered = fetch_order
|
||||
.iter()
|
||||
.map(|&i| r.inner[i].clone())
|
||||
.collect_vec();
|
||||
let reordered = fetch_order.iter().map(|i| i.fetch(&r)).collect_vec();
|
||||
repr::Row::new(reordered)
|
||||
})
|
||||
.map(|r| (r, now, 1))
|
||||
@@ -258,3 +281,20 @@ impl Flownode for FlowWorkerManager {
|
||||
Ok(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple helper enum for fetching a value from a row, with a default value as fallback
|
||||
#[derive(Debug, Clone)]
|
||||
enum FetchFromRow {
|
||||
Idx(usize),
|
||||
Default(Value),
|
||||
}
|
||||
|
||||
impl FetchFromRow {
|
||||
/// Panics if idx is out of bounds
|
||||
fn fetch(&self, row: &repr::Row) -> Value {
|
||||
match self {
|
||||
FetchFromRow::Idx(idx) => row.get(*idx).unwrap().clone(),
|
||||
FetchFromRow::Default(v) => v.clone(),
|
||||
}
|
||||
}
|
||||
}
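To make the reordering above concrete, here is a self-contained sketch of the same idea with plain integers standing in for repr::Row values; the enum and helper names here are illustrative only, not part of this change.

// Illustrative only: a simplified stand-in for the FetchFromRow reordering above.
#[derive(Debug, Clone)]
enum Fetch {
    Idx(usize),
    Default(i64),
}

fn reorder(row: &[i64], plan: &[Fetch]) -> Vec<i64> {
    plan.iter()
        .map(|f| match f {
            // Take the value from the incoming row position.
            Fetch::Idx(i) => row[*i],
            // Column missing from the input: fall back to the table's default value.
            Fetch::Default(v) => *v,
        })
        .collect()
}

fn main() {
    // Input row has columns (b, a); the table expects (a, b, c) where c defaults to 0.
    let plan = [Fetch::Idx(1), Fetch::Idx(0), Fetch::Default(0)];
    assert_eq!(reorder(&[20, 10], &plan), vec![10, 20, 0]);
}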
|
||||
|
||||
@@ -130,7 +130,16 @@ impl SourceSender {
|
||||
// TODO(discord9): send rows instead so it's just moving a point
|
||||
if let Some(batch) = send_buf.recv().await {
|
||||
let len = batch.row_count();
|
||||
self.send_buf_row_cnt.fetch_sub(len, Ordering::SeqCst);
|
||||
if let Err(prev_row_cnt) =
|
||||
self.send_buf_row_cnt
|
||||
.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| x.checked_sub(len))
|
||||
{
|
||||
common_telemetry::error!(
|
||||
"send buf row count underflow, prev = {}, len = {}",
|
||||
prev_row_cnt,
|
||||
len
|
||||
);
|
||||
}
|
||||
row_cnt += len;
|
||||
self.sender
|
||||
.send(batch)
|
||||
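The switch from fetch_sub to fetch_update above guards against the counter wrapping below zero; the following is a standalone sketch of the same pattern using only std atomics, with a made-up helper name.

// Hedged sketch of the underflow guard: checked_sub inside fetch_update refuses to
// decrement below zero, whereas fetch_sub would silently wrap around.
use std::sync::atomic::{AtomicUsize, Ordering};

fn sub_checked(counter: &AtomicUsize, len: usize) {
    if let Err(prev) =
        counter.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| x.checked_sub(len))
    {
        // The update closure returned None, so the counter was left untouched.
        eprintln!("row count underflow, prev = {prev}, len = {len}");
    }
}

fn main() {
    let cnt = AtomicUsize::new(3);
    sub_checked(&cnt, 5); // logs underflow, counter stays at 3
    sub_checked(&cnt, 3); // succeeds, counter becomes 0
    assert_eq!(cnt.load(Ordering::SeqCst), 0);
}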
@@ -162,18 +171,21 @@ impl SourceSender {
|
||||
batch_datatypes: &[ConcreteDataType],
|
||||
) -> Result<usize, Error> {
|
||||
METRIC_FLOW_INPUT_BUF_SIZE.add(rows.len() as _);
|
||||
// important for backpressure. if send buf is full, block until it's not
|
||||
while self.send_buf_row_cnt.load(Ordering::SeqCst) >= BATCH_SIZE * 4 {
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
|
||||
// row count metrics is approx so relaxed order is ok
|
||||
self.send_buf_row_cnt
|
||||
.fetch_add(rows.len(), Ordering::SeqCst);
|
||||
let batch = Batch::try_from_rows_with_types(
|
||||
rows.into_iter().map(|(row, _, _)| row).collect(),
|
||||
batch_datatypes,
|
||||
)
|
||||
.context(EvalSnafu)?;
|
||||
common_telemetry::trace!("Send one batch to worker with {} rows", batch.row_count());
|
||||
|
||||
self.send_buf_row_cnt
|
||||
.fetch_add(batch.row_count(), Ordering::SeqCst);
|
||||
self.send_buf_tx.send(batch).await.map_err(|e| {
|
||||
crate::error::InternalSnafu {
|
||||
reason: format!("Failed to send row, error = {:?}", e),
|
||||
@@ -353,7 +365,7 @@ impl FlownodeContext {
|
||||
name: name.join("."),
|
||||
})?;
|
||||
let schema = self.table_source.table(name).await?;
|
||||
Ok((id, schema))
|
||||
Ok((id, schema.relation_desc))
|
||||
}
|
||||
|
||||
/// Assign a global id to a table, if already assigned, return the existing global id
|
||||
|
||||
433
src/flow/src/adapter/refill.rs
Normal file
433
src/flow/src/adapter/refill.rs
Normal file
@@ -0,0 +1,433 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! This module contains the refill flow task, which is used to refill a flow with a given table id and a time range.
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use catalog::CatalogManagerRef;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::key::flow::FlowMetadataManagerRef;
|
||||
use common_recordbatch::{RecordBatch, RecordBatches, SendableRecordBatchStream};
|
||||
use common_runtime::JoinHandle;
|
||||
use common_telemetry::error;
|
||||
use datatypes::value::Value;
|
||||
use futures::StreamExt;
|
||||
use query::parser::QueryLanguageParser;
|
||||
use session::context::QueryContextBuilder;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use table::metadata::TableId;
|
||||
|
||||
use super::{FlowId, FlowWorkerManager};
|
||||
use crate::adapter::table_source::ManagedTableSource;
|
||||
use crate::adapter::FlowWorkerManagerRef;
|
||||
use crate::error::{FlowNotFoundSnafu, JoinTaskSnafu, UnexpectedSnafu};
|
||||
use crate::expr::error::ExternalSnafu;
|
||||
use crate::expr::utils::find_plan_time_window_expr_lower_bound;
|
||||
use crate::repr::RelationDesc;
|
||||
use crate::server::get_all_flow_ids;
|
||||
use crate::{Error, FrontendInvoker};
|
||||
|
||||
impl FlowWorkerManager {
|
||||
/// Create and start refill flow tasks in background
|
||||
pub async fn create_and_start_refill_flow_tasks(
|
||||
self: &FlowWorkerManagerRef,
|
||||
flow_metadata_manager: &FlowMetadataManagerRef,
|
||||
catalog_manager: &CatalogManagerRef,
|
||||
) -> Result<(), Error> {
|
||||
let tasks = self
|
||||
.create_refill_flow_tasks(flow_metadata_manager, catalog_manager)
|
||||
.await?;
|
||||
self.starting_refill_flows(tasks).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a series of tasks to refill flow
|
||||
pub async fn create_refill_flow_tasks(
|
||||
&self,
|
||||
flow_metadata_manager: &FlowMetadataManagerRef,
|
||||
catalog_manager: &CatalogManagerRef,
|
||||
) -> Result<Vec<RefillTask>, Error> {
|
||||
let nodeid = self.node_id.map(|c| c as u64);
|
||||
|
||||
let flow_ids = get_all_flow_ids(flow_metadata_manager, catalog_manager, nodeid).await?;
|
||||
let mut refill_tasks = Vec::new();
|
||||
'flow_id_loop: for flow_id in flow_ids {
|
||||
let info = flow_metadata_manager
|
||||
.flow_info_manager()
|
||||
.get(flow_id)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?
|
||||
.context(FlowNotFoundSnafu { id: flow_id })?;
|
||||
|
||||
// TODO(discord9): also check flow is already running
|
||||
for src_table in info.source_table_ids() {
|
||||
// check if source table still exists
|
||||
if !self.table_info_source.check_table_exist(src_table).await? {
|
||||
error!(
|
||||
"Source table id = {:?} not found while refill flow_id={}, consider re-create the flow if necessary",
|
||||
src_table, flow_id
|
||||
);
|
||||
continue 'flow_id_loop;
|
||||
}
|
||||
}
|
||||
|
||||
let expire_after = info.expire_after();
|
||||
// TODO(discord9): better way to get last point
|
||||
let now = self.tick_manager.tick();
|
||||
let plan = self
|
||||
.node_context
|
||||
.read()
|
||||
.await
|
||||
.get_flow_plan(&FlowId::from(flow_id))
|
||||
.context(FlowNotFoundSnafu { id: flow_id })?;
|
||||
let time_range = if let Some(expire_after) = expire_after {
|
||||
let low_bound = common_time::Timestamp::new_millisecond(now - expire_after);
|
||||
let real_low_bound = find_plan_time_window_expr_lower_bound(&plan, low_bound)?;
|
||||
real_low_bound.map(|l| (l, common_time::Timestamp::new_millisecond(now)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
common_telemetry::debug!(
|
||||
"Time range for refill flow_id={} is {:?}",
|
||||
flow_id,
|
||||
time_range
|
||||
);
|
||||
|
||||
for src_table in info.source_table_ids() {
|
||||
let time_index_col = self
|
||||
.table_info_source
|
||||
.get_time_index_column_from_table_id(*src_table)
|
||||
.await?
|
||||
.1;
|
||||
let time_index_name = time_index_col.name;
|
||||
let task = RefillTask::create(
|
||||
flow_id as u64,
|
||||
*src_table,
|
||||
time_range,
|
||||
&time_index_name,
|
||||
&self.table_info_source,
|
||||
)
|
||||
.await?;
|
||||
refill_tasks.push(task);
|
||||
}
|
||||
}
|
||||
Ok(refill_tasks)
|
||||
}
|
||||
|
||||
/// Start refilling flows; if any error occurs, rebuild the flow and retry
|
||||
pub(crate) async fn starting_refill_flows(
|
||||
self: &FlowWorkerManagerRef,
|
||||
tasks: Vec<RefillTask>,
|
||||
) -> Result<(), Error> {
|
||||
// TODO(discord9): add a back pressure mechanism
|
||||
let frontend_invoker =
|
||||
self.frontend_invoker
|
||||
.read()
|
||||
.await
|
||||
.clone()
|
||||
.context(UnexpectedSnafu {
|
||||
reason: "frontend invoker is not set",
|
||||
})?;
|
||||
|
||||
for mut task in tasks {
|
||||
task.start_running(self.clone(), &frontend_invoker).await?;
|
||||
// TODO(discord9): save refill tasks to a map and check if it's finished when necessary
|
||||
// i.e. when system table need query it's state
|
||||
self.refill_tasks
|
||||
.write()
|
||||
.await
|
||||
.insert(task.data.flow_id, task);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Task to refill a flow with a given table id and a time range
|
||||
pub struct RefillTask {
|
||||
data: TaskData,
|
||||
state: TaskState<()>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct TaskData {
|
||||
flow_id: FlowId,
|
||||
table_id: TableId,
|
||||
table_schema: RelationDesc,
|
||||
}
|
||||
|
||||
impl TaskData {
|
||||
/// Validate that the incoming batch's schema is the same as the table schema (by comparing types and names)
|
||||
fn validate_schema(table_schema: &RelationDesc, rb: &RecordBatch) -> Result<(), Error> {
|
||||
let rb_schema = &rb.schema;
|
||||
ensure!(
|
||||
rb_schema.column_schemas().len() == table_schema.len()?,
|
||||
UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"RecordBatch schema length does not match table schema length, {}!={}",
|
||||
rb_schema.column_schemas().len(),
|
||||
table_schema.len()?
|
||||
)
|
||||
}
|
||||
);
|
||||
for (i, rb_col) in rb_schema.column_schemas().iter().enumerate() {
|
||||
let (rb_name, rb_ty) = (rb_col.name.as_str(), &rb_col.data_type);
|
||||
let (table_name, table_ty) = (
|
||||
table_schema.names[i].as_ref(),
|
||||
&table_schema.typ().column_types[i].scalar_type,
|
||||
);
|
||||
ensure!(
|
||||
Some(rb_name) == table_name.map(|c| c.as_str()),
|
||||
UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Mismatch in column names: expected {:?}, found {}",
|
||||
table_name, rb_name
|
||||
)
|
||||
}
|
||||
);
|
||||
|
||||
ensure!(
|
||||
rb_ty == table_ty,
|
||||
UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Mismatch in column types for {}: expected {:?}, found {:?}",
|
||||
rb_name, table_ty, rb_ty
|
||||
)
|
||||
}
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Refill task state
|
||||
enum TaskState<T> {
|
||||
/// Task is not started
|
||||
Prepared { sql: String },
|
||||
/// Task is running
|
||||
Running {
|
||||
handle: JoinHandle<Result<T, Error>>,
|
||||
},
|
||||
/// Task is finished
|
||||
Finished { res: Result<T, Error> },
|
||||
}
|
||||
|
||||
impl<T> TaskState<T> {
|
||||
fn new(sql: String) -> Self {
|
||||
Self::Prepared { sql }
|
||||
}
|
||||
}
|
||||
|
||||
mod test_send {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::*;
|
||||
fn is_send<T: Send + Sync>() {}
|
||||
fn foo() {
|
||||
is_send::<TaskState<()>>();
|
||||
is_send::<RefillTask>();
|
||||
is_send::<BTreeMap<FlowId, RefillTask>>();
|
||||
is_send::<RwLock<BTreeMap<FlowId, RefillTask>>>();
|
||||
}
|
||||
}
|
||||
|
||||
impl TaskState<()> {
|
||||
/// check if task is finished
|
||||
async fn is_finished(&mut self) -> Result<bool, Error> {
|
||||
match self {
|
||||
Self::Finished { .. } => Ok(true),
|
||||
Self::Running { handle } => Ok(if handle.is_finished() {
|
||||
*self = Self::Finished {
|
||||
res: handle.await.context(JoinTaskSnafu)?,
|
||||
};
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}),
|
||||
_ => Ok(false),
|
||||
}
|
||||
}
|
||||
|
||||
fn start_running(
|
||||
&mut self,
|
||||
task_data: &TaskData,
|
||||
manager: FlowWorkerManagerRef,
|
||||
mut output_stream: SendableRecordBatchStream,
|
||||
) -> Result<(), Error> {
|
||||
let data = (*task_data).clone();
|
||||
let handle: JoinHandle<Result<(), Error>> = common_runtime::spawn_global(async move {
|
||||
while let Some(rb) = output_stream.next().await {
|
||||
let rb = match rb {
|
||||
Ok(rb) => rb,
|
||||
Err(err) => Err(BoxedError::new(err)).context(ExternalSnafu)?,
|
||||
};
|
||||
TaskData::validate_schema(&data.table_schema, &rb)?;
|
||||
|
||||
// send rb into flow node
|
||||
manager
|
||||
.node_context
|
||||
.read()
|
||||
.await
|
||||
.send_rb(data.table_id, rb)
|
||||
.await?;
|
||||
}
|
||||
common_telemetry::info!(
|
||||
"Refill successful for source table_id={}, flow_id={}",
|
||||
data.table_id,
|
||||
data.flow_id
|
||||
);
|
||||
Ok(())
|
||||
});
|
||||
*self = Self::Running { handle };
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Query stream of RefillTask; simply wraps RecordBatches and RecordBatchStream and checks that the output is not `AffectedRows`
|
||||
enum QueryStream {
|
||||
Batches { batches: RecordBatches },
|
||||
Stream { stream: SendableRecordBatchStream },
|
||||
}
|
||||
|
||||
impl TryFrom<common_query::Output> for QueryStream {
|
||||
type Error = Error;
|
||||
fn try_from(value: common_query::Output) -> Result<Self, Self::Error> {
|
||||
match value.data {
|
||||
common_query::OutputData::Stream(stream) => Ok(QueryStream::Stream { stream }),
|
||||
common_query::OutputData::RecordBatches(batches) => {
|
||||
Ok(QueryStream::Batches { batches })
|
||||
}
|
||||
_ => UnexpectedSnafu {
|
||||
reason: format!("Unexpected output data type: {:?}", value.data),
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryStream {
|
||||
fn try_into_stream(self) -> Result<SendableRecordBatchStream, Error> {
|
||||
match self {
|
||||
Self::Batches { batches } => Ok(batches.as_stream()),
|
||||
Self::Stream { stream } => Ok(stream),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RefillTask {
|
||||
/// Query with "select * from table WHERE time >= range_start and time < range_end"
|
||||
pub async fn create(
|
||||
flow_id: FlowId,
|
||||
table_id: TableId,
|
||||
time_range: Option<(common_time::Timestamp, common_time::Timestamp)>,
|
||||
time_col_name: &str,
|
||||
table_src: &ManagedTableSource,
|
||||
) -> Result<RefillTask, Error> {
|
||||
let (table_name, table_schema) = table_src.get_table_name_schema(&table_id).await?;
|
||||
let all_col_names: BTreeSet<_> = table_schema
|
||||
.relation_desc
|
||||
.iter_names()
|
||||
.flatten()
|
||||
.map(|s| s.as_str())
|
||||
.collect();
|
||||
|
||||
if !all_col_names.contains(time_col_name) {
|
||||
UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Can't find column {} in table {} while refill flow",
|
||||
time_col_name,
|
||||
table_name.join(".")
|
||||
),
|
||||
}
|
||||
.fail()?;
|
||||
}
|
||||
|
||||
let sql = if let Some(time_range) = time_range {
|
||||
format!(
|
||||
"select * from {0} where {1} >= {2} and {1} < {3}",
|
||||
table_name.join("."),
|
||||
time_col_name,
|
||||
Value::from(time_range.0),
|
||||
Value::from(time_range.1),
|
||||
)
|
||||
} else {
|
||||
format!("select * from {0}", table_name.join("."))
|
||||
};
|
||||
|
||||
Ok(RefillTask {
|
||||
data: TaskData {
|
||||
flow_id,
|
||||
table_id,
|
||||
table_schema: table_schema.relation_desc,
|
||||
},
|
||||
state: TaskState::new(sql),
|
||||
})
|
||||
}
|
||||
|
||||
/// Start running the task in background, non-blocking
|
||||
pub async fn start_running(
|
||||
&mut self,
|
||||
manager: FlowWorkerManagerRef,
|
||||
invoker: &FrontendInvoker,
|
||||
) -> Result<(), Error> {
|
||||
let TaskState::Prepared { sql } = &mut self.state else {
|
||||
UnexpectedSnafu {
|
||||
reason: "task is not prepared",
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
|
||||
// we don't need information from query context in this query so a default query context is enough
|
||||
let query_ctx = Arc::new(
|
||||
QueryContextBuilder::default()
|
||||
.current_catalog("greptime".to_string())
|
||||
.current_schema("public".to_string())
|
||||
.build(),
|
||||
);
|
||||
|
||||
let stmt_exec = invoker.statement_executor();
|
||||
|
||||
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let plan = stmt_exec
|
||||
.plan(&stmt, query_ctx.clone())
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
let output_data = stmt_exec
|
||||
.exec_plan(plan, query_ctx)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
let output_stream = QueryStream::try_from(output_data)?;
|
||||
let output_stream = output_stream.try_into_stream()?;
|
||||
|
||||
self.state
|
||||
.start_running(&self.data, manager, output_stream)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn is_finished(&mut self) -> Result<bool, Error> {
|
||||
self.state.is_finished().await
|
||||
}
|
||||
}
|
||||
@@ -22,7 +22,6 @@ impl FlowWorkerManager {
pub async fn gen_state_report(&self) -> FlowStat {
let mut full_report = BTreeMap::new();
for worker in self.worker_handles.iter() {
let worker = worker.lock().await;
match worker.get_state_size().await {
Ok(state_size) => {
full_report.extend(state_size.into_iter().map(|(k, v)| (k as u32, v)))
@@ -17,6 +17,8 @@
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::key::table_info::{TableInfoManager, TableInfoValue};
|
||||
use common_meta::key::table_name::{TableNameKey, TableNameManager};
|
||||
use datatypes::schema::ColumnDefaultConstraint;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use table::metadata::TableId;
|
||||
|
||||
@@ -27,6 +29,32 @@ use crate::error::{
|
||||
};
|
||||
use crate::repr::RelationDesc;
|
||||
|
||||
/// Table description, including the relation desc and default values; this is the minimal table information flow needs
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TableDesc {
|
||||
pub relation_desc: RelationDesc,
|
||||
pub default_values: Vec<Option<ColumnDefaultConstraint>>,
|
||||
}
|
||||
|
||||
impl TableDesc {
|
||||
pub fn new(
|
||||
relation_desc: RelationDesc,
|
||||
default_values: Vec<Option<ColumnDefaultConstraint>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
relation_desc,
|
||||
default_values,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_no_default(relation_desc: RelationDesc) -> Self {
|
||||
Self {
|
||||
relation_desc,
|
||||
default_values: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Table source but for flow, provide table schema by table name/id
|
||||
#[async_trait::async_trait]
|
||||
pub trait FlowTableSource: Send + Sync + std::fmt::Debug {
|
||||
@@ -34,11 +62,11 @@ pub trait FlowTableSource: Send + Sync + std::fmt::Debug {
|
||||
async fn table_id_from_name(&self, name: &TableName) -> Result<TableId, Error>;
|
||||
|
||||
/// Get the table schema by table name
|
||||
async fn table(&self, name: &TableName) -> Result<RelationDesc, Error> {
|
||||
async fn table(&self, name: &TableName) -> Result<TableDesc, Error> {
|
||||
let id = self.table_id_from_name(name).await?;
|
||||
self.table_from_id(&id).await
|
||||
}
|
||||
async fn table_from_id(&self, table_id: &TableId) -> Result<RelationDesc, Error>;
|
||||
async fn table_from_id(&self, table_id: &TableId) -> Result<TableDesc, Error>;
|
||||
}
|
||||
|
||||
/// Managed table source information, queried from the table info manager and the table name manager
|
||||
@@ -51,7 +79,7 @@ pub struct ManagedTableSource {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl FlowTableSource for ManagedTableSource {
|
||||
async fn table_from_id(&self, table_id: &TableId) -> Result<RelationDesc, Error> {
|
||||
async fn table_from_id(&self, table_id: &TableId) -> Result<TableDesc, Error> {
|
||||
let table_info_value = self
|
||||
.get_table_info_value(table_id)
|
||||
.await?
|
||||
@@ -175,7 +203,7 @@ impl ManagedTableSource {
|
||||
pub async fn get_table_name_schema(
|
||||
&self,
|
||||
table_id: &TableId,
|
||||
) -> Result<(TableName, RelationDesc), Error> {
|
||||
) -> Result<(TableName, TableDesc), Error> {
|
||||
let table_info_value = self
|
||||
.get_table_info_value(table_id)
|
||||
.await?
|
||||
@@ -219,7 +247,7 @@ pub(crate) mod test {
|
||||
use crate::repr::{ColumnType, RelationType};
|
||||
|
||||
pub struct FlowDummyTableSource {
|
||||
pub id_names_to_desc: Vec<(TableId, TableName, RelationDesc)>,
|
||||
pub id_names_to_desc: Vec<(TableId, TableName, TableDesc)>,
|
||||
id_to_idx: HashMap<TableId, usize>,
|
||||
name_to_idx: HashMap<TableName, usize>,
|
||||
}
|
||||
@@ -234,8 +262,10 @@ pub(crate) mod test {
|
||||
"public".to_string(),
|
||||
"numbers".to_string(),
|
||||
],
|
||||
RelationType::new(vec![ColumnType::new(CDT::uint32_datatype(), false)])
|
||||
.into_named(vec![Some("number".to_string())]),
|
||||
TableDesc::new_no_default(
|
||||
RelationType::new(vec![ColumnType::new(CDT::uint32_datatype(), false)])
|
||||
.into_named(vec![Some("number".to_string())]),
|
||||
),
|
||||
),
|
||||
(
|
||||
1025,
|
||||
@@ -244,11 +274,13 @@ pub(crate) mod test {
|
||||
"public".to_string(),
|
||||
"numbers_with_ts".to_string(),
|
||||
],
|
||||
RelationType::new(vec![
|
||||
ColumnType::new(CDT::uint32_datatype(), false),
|
||||
ColumnType::new(CDT::timestamp_millisecond_datatype(), false),
|
||||
])
|
||||
.into_named(vec![Some("number".to_string()), Some("ts".to_string())]),
|
||||
TableDesc::new_no_default(
|
||||
RelationType::new(vec![
|
||||
ColumnType::new(CDT::uint32_datatype(), false),
|
||||
ColumnType::new(CDT::timestamp_millisecond_datatype(), false),
|
||||
])
|
||||
.into_named(vec![Some("number".to_string()), Some("ts".to_string())]),
|
||||
),
|
||||
),
|
||||
];
|
||||
let id_to_idx = id_names_to_desc
|
||||
@@ -271,7 +303,7 @@ pub(crate) mod test {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl FlowTableSource for FlowDummyTableSource {
|
||||
async fn table_from_id(&self, table_id: &TableId) -> Result<RelationDesc, Error> {
|
||||
async fn table_from_id(&self, table_id: &TableId) -> Result<TableDesc, Error> {
|
||||
let idx = self.id_to_idx.get(table_id).context(TableNotFoundSnafu {
|
||||
name: format!("Table id = {:?}, couldn't found table desc", table_id),
|
||||
})?;
|
||||
|
||||
@@ -27,12 +27,28 @@ use session::context::QueryContextBuilder;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use table::table_reference::TableReference;
|
||||
|
||||
use crate::adapter::{TableName, AUTO_CREATED_PLACEHOLDER_TS_COL};
|
||||
use crate::adapter::table_source::TableDesc;
|
||||
use crate::adapter::{TableName, WorkerHandle, AUTO_CREATED_PLACEHOLDER_TS_COL};
|
||||
use crate::error::{Error, ExternalSnafu, UnexpectedSnafu};
|
||||
use crate::repr::{ColumnType, RelationDesc, RelationType};
|
||||
use crate::FlowWorkerManager;
|
||||
|
||||
impl FlowWorkerManager {
|
||||
/// Get a worker handle for creating a flow, using round-robin to select a worker
|
||||
pub(crate) async fn get_worker_handle_for_create_flow(&self) -> &WorkerHandle {
|
||||
let use_idx = {
|
||||
let mut selector = self.worker_selector.lock().await;
|
||||
if *selector >= self.worker_handles.len() {
|
||||
*selector = 0
|
||||
};
|
||||
let use_idx = *selector;
|
||||
*selector += 1;
|
||||
use_idx
|
||||
};
|
||||
// Safety: selector is always in bounds
|
||||
&self.worker_handles[use_idx]
|
||||
}
|
||||
|
||||
/// Create a table from the given schema (adjusting to add an auto column if needed); returns true if the table is created
|
||||
pub(crate) async fn create_table_from_relation(
|
||||
&self,
|
||||
@@ -126,7 +142,7 @@ impl FlowWorkerManager {
|
||||
|
||||
pub fn table_info_value_to_relation_desc(
|
||||
table_info_value: TableInfoValue,
|
||||
) -> Result<RelationDesc, Error> {
|
||||
) -> Result<TableDesc, Error> {
|
||||
let raw_schema = table_info_value.table_info.meta.schema;
|
||||
let (column_types, col_names): (Vec<_>, Vec<_>) = raw_schema
|
||||
.column_schemas
|
||||
@@ -147,8 +163,7 @@ pub fn table_info_value_to_relation_desc(
|
||||
let keys = vec![crate::repr::Key::from(key)];
|
||||
|
||||
let time_index = raw_schema.timestamp_index;
|
||||
|
||||
Ok(RelationDesc {
|
||||
let relation_desc = RelationDesc {
|
||||
typ: RelationType {
|
||||
column_types,
|
||||
keys,
|
||||
@@ -157,7 +172,14 @@ pub fn table_info_value_to_relation_desc(
|
||||
auto_columns: vec![],
|
||||
},
|
||||
names: col_names,
|
||||
})
|
||||
};
|
||||
let default_values = raw_schema
|
||||
.column_schemas
|
||||
.iter()
|
||||
.map(|c| c.default_constraint().cloned())
|
||||
.collect_vec();
|
||||
|
||||
Ok(TableDesc::new(relation_desc, default_values))
|
||||
}
|
||||
|
||||
pub fn from_proto_to_data_type(
|
||||
|
||||
@@ -103,11 +103,6 @@ impl AggregateFunc {
self.signature().generic_fn == GenericFn::Min
}

/// if this function is a `sum`
pub fn is_sum(&self) -> bool {
self.signature().generic_fn == GenericFn::Sum
}

/// Eval value, diff with accumulator
///
/// Expect self to be accumulable aggregate function, i.e. sum/count
@@ -41,6 +41,6 @@ mod utils;
#[cfg(test)]
mod test_utils;

pub use adapter::{FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use error::{Error, Result};
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};
@@ -489,12 +489,6 @@ impl RelationDesc {
|
||||
self
|
||||
}
|
||||
|
||||
/// Drops all existing keys.
|
||||
pub fn without_keys(mut self) -> Self {
|
||||
self.typ.keys.clear();
|
||||
self
|
||||
}
|
||||
|
||||
/// Builds a new relation description with the column names replaced with
|
||||
/// new names.
|
||||
///
|
||||
@@ -550,32 +544,6 @@ impl RelationDesc {
|
||||
pub fn get_name(&self, i: usize) -> &Option<ColumnName> {
|
||||
&self.names[i]
|
||||
}
|
||||
|
||||
/// Mutably gets the name of the `i`th column.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if `i` is not a valid column index.
|
||||
pub fn get_name_mut(&mut self, i: usize) -> &mut Option<ColumnName> {
|
||||
&mut self.names[i]
|
||||
}
|
||||
|
||||
/// Gets the name of the `i`th column if that column name is unambiguous.
|
||||
///
|
||||
/// If at least one other column has the same name as the `i`th column,
|
||||
/// returns `None`. If the `i`th column has no name, returns `None`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if `i` is not a valid column index.
|
||||
pub fn get_unambiguous_name(&self, i: usize) -> Option<&ColumnName> {
|
||||
let name = &self.names[i];
|
||||
if self.iter_names().filter(|n| *n == name).count() == 1 {
|
||||
name.as_ref()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The name of a column in a [`RelationDesc`].
|
||||
|
||||
@@ -39,6 +39,8 @@ use operator::statement::StatementExecutor;
|
||||
use partition::manager::PartitionRuleManager;
|
||||
use query::{QueryEngine, QueryEngineFactory};
|
||||
use servers::error::{AlreadyStartedSnafu, StartGrpcSnafu, TcpBindSnafu, TcpIncomingSnafu};
|
||||
use servers::http::{HttpServer, HttpServerBuilder};
|
||||
use servers::metrics_handler::MetricsHandler;
|
||||
use servers::server::Server;
|
||||
use session::context::{QueryContextBuilder, QueryContextRef};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
@@ -48,7 +50,7 @@ use tonic::codec::CompressionEncoding;
|
||||
use tonic::transport::server::TcpIncoming;
|
||||
use tonic::{Request, Response, Status};
|
||||
|
||||
use crate::adapter::{CreateFlowArgs, FlowWorkerManagerRef};
|
||||
use crate::adapter::{create_worker, CreateFlowArgs, FlowWorkerManagerRef};
|
||||
use crate::error::{
|
||||
to_status_with_last_err, CacheRequiredSnafu, CreateFlowSnafu, ExternalSnafu, FlowNotFoundSnafu,
|
||||
ListFlowsSnafu, ParseAddrSnafu, ShutdownServerSnafu, StartServerSnafu, UnexpectedSnafu,
|
||||
@@ -86,6 +88,10 @@ impl flow_server::Flow for FlowService {
|
||||
self.manager
|
||||
.handle(request)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
common_telemetry::error!(err; "Failed to handle flow request");
|
||||
err
|
||||
})
|
||||
.map(Response::new)
|
||||
.map_err(to_status_with_last_err)
|
||||
}
|
||||
@@ -210,6 +216,9 @@ impl servers::server::Server for FlownodeServer {
|
||||
pub struct FlownodeInstance {
|
||||
server: FlownodeServer,
|
||||
addr: SocketAddr,
|
||||
/// only used for health check
|
||||
http_server: HttpServer,
|
||||
http_addr: SocketAddr,
|
||||
heartbeat_task: Option<HeartbeatTask>,
|
||||
}
|
||||
|
||||
@@ -224,6 +233,12 @@ impl FlownodeInstance {
|
||||
.start(self.addr)
|
||||
.await
|
||||
.context(StartServerSnafu)?;
|
||||
|
||||
self.http_server
|
||||
.start(self.http_addr)
|
||||
.await
|
||||
.context(StartServerSnafu)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
pub async fn shutdown(&self) -> Result<(), crate::Error> {
|
||||
@@ -233,6 +248,11 @@ impl FlownodeInstance {
|
||||
task.shutdown();
|
||||
}
|
||||
|
||||
self.http_server
|
||||
.shutdown()
|
||||
.await
|
||||
.context(ShutdownServerSnafu)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -305,12 +325,21 @@ impl FlownodeBuilder {
|
||||
|
||||
let server = FlownodeServer::new(FlowService::new(manager.clone()));
|
||||
|
||||
let http_addr = self.opts.http.addr.parse().context(ParseAddrSnafu {
|
||||
addr: self.opts.http.addr.clone(),
|
||||
})?;
|
||||
let http_server = HttpServerBuilder::new(self.opts.http)
|
||||
.with_metrics_handler(MetricsHandler)
|
||||
.build();
|
||||
|
||||
let heartbeat_task = self.heartbeat_task;
|
||||
|
||||
let addr = self.opts.grpc.addr;
|
||||
let instance = FlownodeInstance {
|
||||
server,
|
||||
addr: addr.parse().context(ParseAddrSnafu { addr })?,
|
||||
http_server,
|
||||
http_addr,
|
||||
heartbeat_task,
|
||||
};
|
||||
Ok(instance)
|
||||
@@ -414,24 +443,30 @@ impl FlownodeBuilder {
|
||||
|
||||
register_function_to_query_engine(&query_engine);
|
||||
|
||||
let (tx, rx) = oneshot::channel();
|
||||
let num_workers = self.opts.flow.num_workers;
|
||||
|
||||
let node_id = self.opts.node_id.map(|id| id as u32);
|
||||
let _handle = std::thread::Builder::new()
|
||||
.name("flow-worker".to_string())
|
||||
.spawn(move || {
|
||||
let (flow_node_manager, mut worker) =
|
||||
FlowWorkerManager::new_with_worker(node_id, query_engine, table_meta);
|
||||
let _ = tx.send(flow_node_manager);
|
||||
info!("Flow Worker started in new thread");
|
||||
worker.run();
|
||||
});
|
||||
let mut man = rx.await.map_err(|_e| {
|
||||
UnexpectedSnafu {
|
||||
reason: "sender is dropped, failed to create flow node manager",
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
|
||||
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
|
||||
for worker_id in 0..num_workers {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
let _handle = std::thread::Builder::new()
|
||||
.name(format!("flow-worker-{}", worker_id))
|
||||
.spawn(move || {
|
||||
let (handle, mut worker) = create_worker();
|
||||
let _ = tx.send(handle);
|
||||
info!("Flow Worker started in new thread");
|
||||
worker.run();
|
||||
});
|
||||
let worker_handle = rx.await.map_err(|e| {
|
||||
UnexpectedSnafu {
|
||||
reason: format!("Failed to receive worker handle: {}", e),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
man.add_worker_handle(worker_handle);
|
||||
}
|
||||
if let Some(handler) = self.state_report_handler.take() {
|
||||
man = man.with_state_report_handler(handler).await;
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@ fastbloom = "0.8"
fst.workspace = true
futures.workspace = true
greptime-proto.workspace = true
itertools.workspace = true
mockall.workspace = true
pin-project.workspace = true
prost.workspace = true
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub mod applier;
|
||||
pub mod creator;
|
||||
pub mod error;
|
||||
@@ -24,35 +22,3 @@ pub type BytesRef<'a> = &'a [u8];
|
||||
|
||||
/// The seed used for the Bloom filter.
|
||||
pub const SEED: u128 = 42;
|
||||
|
||||
/// The Meta information of the bloom filter stored in the file.
|
||||
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
|
||||
pub struct BloomFilterMeta {
|
||||
/// The number of rows per segment.
|
||||
pub rows_per_segment: usize,
|
||||
|
||||
/// The number of segments.
|
||||
pub seg_count: usize,
|
||||
|
||||
/// The number of total rows.
|
||||
pub row_count: usize,
|
||||
|
||||
/// The size of the bloom filter excluding the meta information.
|
||||
pub bloom_filter_segments_size: usize,
|
||||
|
||||
/// Offset and size of bloom filters in the file.
|
||||
pub bloom_filter_segments: Vec<BloomFilterSegmentLocation>,
|
||||
}
|
||||
|
||||
/// The location of the bloom filter segment in the file.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
pub struct BloomFilterSegmentLocation {
|
||||
/// The offset of the bloom filter segment in the file.
|
||||
pub offset: u64,
|
||||
|
||||
/// The size of the bloom filter segment in the file.
|
||||
pub size: u64,
|
||||
|
||||
/// The number of elements in the bloom filter segment.
|
||||
pub elem_count: usize,
|
||||
}
|
||||
|
||||
@@ -15,9 +15,12 @@
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Range;
|
||||
|
||||
use greptime_proto::v1::index::BloomFilterMeta;
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::bloom_filter::error::Result;
|
||||
use crate::bloom_filter::reader::BloomFilterReader;
|
||||
use crate::bloom_filter::{BloomFilterMeta, Bytes};
|
||||
use crate::bloom_filter::Bytes;
|
||||
|
||||
pub struct BloomFilterApplier {
|
||||
reader: Box<dyn BloomFilterReader + Send>,
|
||||
@@ -37,27 +40,42 @@ impl BloomFilterApplier {
|
||||
probes: &HashSet<Bytes>,
|
||||
search_range: Range<usize>,
|
||||
) -> Result<Vec<Range<usize>>> {
|
||||
let rows_per_segment = self.meta.rows_per_segment;
|
||||
let rows_per_segment = self.meta.rows_per_segment as usize;
|
||||
let start_seg = search_range.start / rows_per_segment;
|
||||
let end_seg = search_range.end.div_ceil(rows_per_segment);
|
||||
|
||||
let locs = &self.meta.bloom_filter_segments[start_seg..end_seg];
|
||||
let bfs = self.reader.bloom_filter_vec(locs).await?;
|
||||
let locs = &self.meta.segment_loc_indices[start_seg..end_seg];
|
||||
|
||||
let mut ranges: Vec<Range<usize>> = Vec::with_capacity(end_seg - start_seg);
|
||||
for (seg_id, bloom) in (start_seg..end_seg).zip(bfs) {
|
||||
let start = seg_id * rows_per_segment;
|
||||
// dedup locs
|
||||
let deduped_locs = locs
|
||||
.iter()
|
||||
.dedup()
|
||||
.map(|i| self.meta.bloom_filter_locs[*i as usize].clone())
|
||||
.collect::<Vec<_>>();
|
||||
let bfs = self.reader.bloom_filter_vec(&deduped_locs).await?;
|
||||
|
||||
let mut ranges: Vec<Range<usize>> = Vec::with_capacity(bfs.len());
|
||||
for ((_, mut group), bloom) in locs
|
||||
.iter()
|
||||
.zip(start_seg..end_seg)
|
||||
.group_by(|(x, _)| **x)
|
||||
.into_iter()
|
||||
.zip(bfs.iter())
|
||||
{
|
||||
let start = group.next().unwrap().1 * rows_per_segment; // SAFETY: group is not empty
|
||||
let end = group.last().map_or(start + rows_per_segment, |(_, end)| {
|
||||
(end + 1) * rows_per_segment
|
||||
});
|
||||
let actual_start = start.max(search_range.start);
|
||||
let actual_end = end.min(search_range.end);
|
||||
for probe in probes {
|
||||
if bloom.contains(probe) {
|
||||
let end = (start + rows_per_segment).min(search_range.end);
|
||||
let start = start.max(search_range.start);
|
||||
|
||||
match ranges.last_mut() {
|
||||
Some(last) if last.end == start => {
|
||||
last.end = end;
|
||||
Some(last) if last.end == actual_start => {
|
||||
last.end = actual_end;
|
||||
}
|
||||
_ => {
|
||||
ranges.push(start..end);
|
||||
ranges.push(actual_start..actual_end);
|
||||
}
|
||||
}
|
||||
break;
|
||||
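As a rough worked example of the segment arithmetic used above, the helper below is made up for illustration; only rows_per_segment, div_ceil, and the clamping back to the search range mirror the applier code.

// Map a row search range to the bloom filter segments it touches, then clamp a
// segment hit back to the search range (as the applier does with actual_start/end).
fn segment_bounds(search: std::ops::Range<usize>, rows_per_segment: usize) -> (usize, usize) {
    let start_seg = search.start / rows_per_segment;
    let end_seg = search.end.div_ceil(rows_per_segment);
    (start_seg, end_seg)
}

fn main() {
    // With 4 rows per segment, rows 6..28 touch segments 1..7.
    assert_eq!(segment_bounds(6..28, 4), (1, 7));
    // A hit in segment 1 (rows 4..8) is then clamped to the search range: 6..8.
    let (seg_start, seg_end) = (4, 8);
    assert_eq!((seg_start.max(6), seg_end.min(28)), (6, 8));
}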
@@ -93,46 +111,73 @@ mod tests {
|
||||
);
|
||||
|
||||
let rows = vec![
|
||||
// seg 0
|
||||
vec![b"row00".to_vec(), b"seg00".to_vec(), b"overl".to_vec()],
|
||||
vec![b"row01".to_vec(), b"seg00".to_vec(), b"overl".to_vec()],
|
||||
vec![b"row02".to_vec(), b"seg00".to_vec(), b"overl".to_vec()],
|
||||
vec![b"row03".to_vec(), b"seg00".to_vec(), b"overl".to_vec()],
|
||||
// seg 1
|
||||
vec![b"row04".to_vec(), b"seg01".to_vec(), b"overl".to_vec()],
|
||||
vec![b"row05".to_vec(), b"seg01".to_vec(), b"overl".to_vec()],
|
||||
vec![b"row06".to_vec(), b"seg01".to_vec(), b"overp".to_vec()],
|
||||
vec![b"row07".to_vec(), b"seg01".to_vec(), b"overp".to_vec()],
|
||||
// seg 2
|
||||
vec![b"row08".to_vec(), b"seg02".to_vec(), b"overp".to_vec()],
|
||||
vec![b"row09".to_vec(), b"seg02".to_vec(), b"overp".to_vec()],
|
||||
vec![b"row10".to_vec(), b"seg02".to_vec(), b"overp".to_vec()],
|
||||
vec![b"row11".to_vec(), b"seg02".to_vec(), b"overp".to_vec()],
|
||||
// duplicate rows
|
||||
// seg 3
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
// seg 4
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
// seg 5
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
// seg 6
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
vec![b"dup".to_vec()],
|
||||
];
|
||||
|
||||
let cases = vec![
|
||||
(vec![b"row00".to_vec()], 0..12, vec![0..4]), // search one row in full range
|
||||
(vec![b"row00".to_vec()], 0..28, vec![0..4]), // search one row in full range
|
||||
(vec![b"row05".to_vec()], 4..8, vec![4..8]), // search one row in partial range
|
||||
(vec![b"row03".to_vec()], 4..8, vec![]), // search for a row that doesn't exist in the partial range
|
||||
(
|
||||
vec![b"row01".to_vec(), b"row06".to_vec()],
|
||||
0..12,
|
||||
0..28,
|
||||
vec![0..8],
|
||||
), // search multiple rows in multiple ranges
|
||||
(
|
||||
vec![b"row01".to_vec(), b"row11".to_vec()],
|
||||
0..12,
|
||||
0..28,
|
||||
vec![0..4, 8..12],
|
||||
), // search multiple rows in multiple ranges
|
||||
(vec![b"row99".to_vec()], 0..12, vec![]), // search for a row that doesn't exist in the full range
|
||||
(vec![b"row99".to_vec()], 0..28, vec![]), // search for a row that doesn't exist in the full range
|
||||
(vec![b"row00".to_vec()], 12..12, vec![]), // search in an empty range
|
||||
(
|
||||
vec![b"row04".to_vec(), b"row05".to_vec()],
|
||||
0..12,
|
||||
vec![4..8],
|
||||
), // search multiple rows in same segment
|
||||
(vec![b"seg01".to_vec()], 0..12, vec![4..8]), // search rows in a segment
|
||||
(vec![b"seg01".to_vec()], 6..12, vec![6..8]), // search rows in a segment in partial range
|
||||
(vec![b"overl".to_vec()], 0..12, vec![0..8]), // search rows in multiple segments
|
||||
(vec![b"overl".to_vec()], 2..12, vec![2..8]), // search range starts from the middle of a segment
|
||||
(vec![b"seg01".to_vec()], 0..28, vec![4..8]), // search rows in a segment
|
||||
(vec![b"seg01".to_vec()], 6..28, vec![6..8]), // search rows in a segment in partial range
|
||||
(vec![b"overl".to_vec()], 0..28, vec![0..8]), // search rows in multiple segments
|
||||
(vec![b"overl".to_vec()], 2..28, vec![2..8]), // search range starts from the middle of a segment
|
||||
(vec![b"overp".to_vec()], 0..10, vec![4..10]), // search range ends at the middle of a segment
|
||||
(vec![b"dup".to_vec()], 0..12, vec![]), // search for a duplicate row not in the range
|
||||
(vec![b"dup".to_vec()], 0..16, vec![12..16]), // search for a duplicate row in the range
|
||||
(vec![b"dup".to_vec()], 0..28, vec![12..28]), // search for a duplicate row in the full range
|
||||
];
|
||||
|
||||
for row in rows {
|
||||
|
||||
@@ -21,10 +21,12 @@ use std::sync::Arc;
|
||||
|
||||
use finalize_segment::FinalizedBloomFilterStorage;
|
||||
use futures::{AsyncWrite, AsyncWriteExt, StreamExt};
|
||||
use greptime_proto::v1::index::{BloomFilterLoc, BloomFilterMeta};
|
||||
use prost::Message;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::bloom_filter::error::{IoSnafu, Result, SerdeJsonSnafu};
|
||||
use crate::bloom_filter::{BloomFilterMeta, BloomFilterSegmentLocation, Bytes, SEED};
|
||||
use crate::bloom_filter::error::{IoSnafu, Result};
|
||||
use crate::bloom_filter::{Bytes, SEED};
|
||||
use crate::external_provider::ExternalTempFileProvider;
|
||||
|
||||
/// The false positive rate of the Bloom filter.
|
||||
@@ -170,12 +172,15 @@ impl BloomFilterCreator {
|
||||
}
|
||||
|
||||
let mut meta = BloomFilterMeta {
|
||||
rows_per_segment: self.rows_per_segment,
|
||||
row_count: self.accumulated_row_count,
|
||||
rows_per_segment: self.rows_per_segment as _,
|
||||
row_count: self.accumulated_row_count as _,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut segs = self.finalized_bloom_filters.drain().await?;
|
||||
let (indices, mut segs) = self.finalized_bloom_filters.drain().await?;
|
||||
meta.segment_loc_indices = indices.into_iter().map(|i| i as u64).collect();
|
||||
meta.segment_count = meta.segment_loc_indices.len() as _;
|
||||
|
||||
while let Some(segment) = segs.next().await {
|
||||
let segment = segment?;
|
||||
writer
|
||||
@@ -183,17 +188,16 @@ impl BloomFilterCreator {
|
||||
.await
|
||||
.context(IoSnafu)?;
|
||||
|
||||
let size = segment.bloom_filter_bytes.len();
|
||||
meta.bloom_filter_segments.push(BloomFilterSegmentLocation {
|
||||
offset: meta.bloom_filter_segments_size as _,
|
||||
size: size as _,
|
||||
elem_count: segment.element_count,
|
||||
let size = segment.bloom_filter_bytes.len() as u64;
|
||||
meta.bloom_filter_locs.push(BloomFilterLoc {
|
||||
offset: meta.bloom_filter_size as _,
|
||||
size,
|
||||
element_count: segment.element_count as _,
|
||||
});
|
||||
meta.bloom_filter_segments_size += size;
|
||||
meta.seg_count += 1;
|
||||
meta.bloom_filter_size += size;
|
||||
}
|
||||
|
||||
let meta_bytes = serde_json::to_vec(&meta).context(SerdeJsonSnafu)?;
|
||||
let meta_bytes = meta.encode_to_vec();
|
||||
writer.write_all(&meta_bytes).await.context(IoSnafu)?;
|
||||
|
||||
let meta_size = meta_bytes.len() as u32;
|
||||
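The tests in the next hunk read this data back, which implies the file layout: the bloom filter segments, then the protobuf-encoded BloomFilterMeta, then its length as a little-endian u32. A minimal sketch of that read path, mirroring the test code rather than the production reader, could look like this:

use greptime_proto::v1::index::BloomFilterMeta;
use prost::Message;

fn read_meta_footer(bytes: &[u8]) -> BloomFilterMeta {
    let total = bytes.len();
    // Last 4 bytes: little-endian length of the encoded meta.
    let meta_size = u32::from_le_bytes(bytes[total - 4..].try_into().unwrap()) as usize;
    // The encoded meta sits right before the length field.
    let meta_bytes = &bytes[total - 4 - meta_size..total - 4];
    BloomFilterMeta::decode(meta_bytes).unwrap()
}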
@@ -287,34 +291,38 @@ mod tests {
|
||||
let meta_size = u32::from_le_bytes((&bytes[meta_size_offset..]).try_into().unwrap());
|
||||
|
||||
let meta_bytes = &bytes[total_size - meta_size as usize - 4..total_size - 4];
|
||||
let meta: BloomFilterMeta = serde_json::from_slice(meta_bytes).unwrap();
|
||||
let meta = BloomFilterMeta::decode(meta_bytes).unwrap();
|
||||
|
||||
assert_eq!(meta.rows_per_segment, 2);
|
||||
assert_eq!(meta.seg_count, 2);
|
||||
assert_eq!(meta.segment_count, 2);
|
||||
assert_eq!(meta.row_count, 3);
|
||||
assert_eq!(
|
||||
meta.bloom_filter_segments_size + meta_bytes.len() + 4,
|
||||
meta.bloom_filter_size as usize + meta_bytes.len() + 4,
|
||||
total_size
|
||||
);
|
||||
|
||||
let mut bfs = Vec::new();
|
||||
for segment in meta.bloom_filter_segments {
|
||||
for segment in meta.bloom_filter_locs {
|
||||
let bloom_filter_bytes =
|
||||
&bytes[segment.offset as usize..(segment.offset + segment.size) as usize];
|
||||
let v = u64_vec_from_bytes(bloom_filter_bytes);
|
||||
let bloom_filter = BloomFilter::from_vec(v)
|
||||
.seed(&SEED)
|
||||
.expected_items(segment.elem_count);
|
||||
.expected_items(segment.element_count as usize);
|
||||
bfs.push(bloom_filter);
|
||||
}
|
||||
|
||||
assert_eq!(bfs.len(), 2);
|
||||
assert!(bfs[0].contains(&b"a"));
|
||||
assert!(bfs[0].contains(&b"b"));
|
||||
assert!(bfs[0].contains(&b"c"));
|
||||
assert!(bfs[0].contains(&b"d"));
|
||||
assert!(bfs[1].contains(&b"e"));
|
||||
assert!(bfs[1].contains(&b"f"));
|
||||
assert_eq!(meta.segment_loc_indices.len(), 2);
|
||||
|
||||
let bf0 = &bfs[meta.segment_loc_indices[0] as usize];
|
||||
assert!(bf0.contains(&b"a"));
|
||||
assert!(bf0.contains(&b"b"));
|
||||
assert!(bf0.contains(&b"c"));
|
||||
assert!(bf0.contains(&b"d"));
|
||||
|
||||
let bf1 = &bfs[meta.segment_loc_indices[1] as usize];
|
||||
assert!(bf1.contains(&b"e"));
|
||||
assert!(bf1.contains(&b"f"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -356,37 +364,43 @@ mod tests {
|
||||
let meta_size = u32::from_le_bytes((&bytes[meta_size_offset..]).try_into().unwrap());
|
||||
|
||||
let meta_bytes = &bytes[total_size - meta_size as usize - 4..total_size - 4];
|
||||
let meta: BloomFilterMeta = serde_json::from_slice(meta_bytes).unwrap();
|
||||
let meta = BloomFilterMeta::decode(meta_bytes).unwrap();
|
||||
|
||||
assert_eq!(meta.rows_per_segment, 2);
|
||||
assert_eq!(meta.seg_count, 10);
|
||||
assert_eq!(meta.segment_count, 10);
|
||||
assert_eq!(meta.row_count, 20);
|
||||
assert_eq!(
|
||||
meta.bloom_filter_segments_size + meta_bytes.len() + 4,
|
||||
meta.bloom_filter_size as usize + meta_bytes.len() + 4,
|
||||
total_size
|
||||
);
|
||||
|
||||
let mut bfs = Vec::new();
|
||||
for segment in meta.bloom_filter_segments {
|
||||
for segment in meta.bloom_filter_locs {
|
||||
let bloom_filter_bytes =
|
||||
&bytes[segment.offset as usize..(segment.offset + segment.size) as usize];
|
||||
let v = u64_vec_from_bytes(bloom_filter_bytes);
|
||||
let bloom_filter = BloomFilter::from_vec(v)
|
||||
.seed(&SEED)
|
||||
.expected_items(segment.elem_count);
|
||||
.expected_items(segment.element_count as _);
|
||||
bfs.push(bloom_filter);
|
||||
}
|
||||
|
||||
assert_eq!(bfs.len(), 10);
|
||||
for bf in bfs.iter().take(3) {
|
||||
// 4 bloom filters to serve 10 segments
|
||||
assert_eq!(bfs.len(), 4);
|
||||
assert_eq!(meta.segment_loc_indices.len(), 10);
|
||||
|
||||
for idx in meta.segment_loc_indices.iter().take(3) {
|
||||
let bf = &bfs[*idx as usize];
|
||||
assert!(bf.contains(&b"a"));
|
||||
assert!(bf.contains(&b"b"));
|
||||
}
|
||||
for bf in bfs.iter().take(5).skip(2) {
|
||||
for idx in meta.segment_loc_indices.iter().take(5).skip(2) {
|
||||
let bf = &bfs[*idx as usize];
|
||||
assert!(bf.contains(&b"c"));
|
||||
assert!(bf.contains(&b"d"));
|
||||
}
|
||||
for bf in bfs.iter().take(10).skip(5) {
|
||||
for idx in meta.segment_loc_indices.iter().take(10).skip(5) {
|
||||
let bf = &bfs[*idx as usize];
|
||||
assert!(bf.contains(&b"e"));
|
||||
assert!(bf.contains(&b"f"));
|
||||
}
|
||||
|
||||
@@ -33,6 +33,9 @@ const MIN_MEMORY_USAGE_THRESHOLD: usize = 1024 * 1024; // 1MB
|
||||
|
||||
/// Storage for finalized Bloom filters.
|
||||
pub struct FinalizedBloomFilterStorage {
|
||||
/// Indices of the segments in the sequence of finalized Bloom filters.
|
||||
segment_indices: Vec<usize>,
|
||||
|
||||
/// Bloom filters that are stored in memory.
|
||||
in_memory: Vec<FinalizedBloomFilterSegment>,
|
||||
|
||||
@@ -54,6 +57,9 @@ pub struct FinalizedBloomFilterStorage {
|
||||
|
||||
/// The threshold of the global memory usage of the creating Bloom filters.
|
||||
global_memory_usage_threshold: Option<usize>,
|
||||
|
||||
/// Records the number of flushed segments.
|
||||
flushed_seg_count: usize,
|
||||
}
|
||||
|
||||
impl FinalizedBloomFilterStorage {
|
||||
@@ -65,6 +71,7 @@ impl FinalizedBloomFilterStorage {
|
||||
) -> Self {
|
||||
let external_prefix = format!("intm-bloom-filters-{}", uuid::Uuid::new_v4());
|
||||
Self {
|
||||
segment_indices: Vec::new(),
|
||||
in_memory: Vec::new(),
|
||||
intermediate_file_id_counter: 0,
|
||||
intermediate_prefix: external_prefix,
|
||||
@@ -72,6 +79,7 @@ impl FinalizedBloomFilterStorage {
|
||||
memory_usage: 0,
|
||||
global_memory_usage,
|
||||
global_memory_usage_threshold,
|
||||
flushed_seg_count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,6 +105,13 @@ impl FinalizedBloomFilterStorage {
|
||||
|
||||
let fbf = FinalizedBloomFilterSegment::from(bf, element_count);
|
||||
|
||||
// Reuse the last segment if it is the same as the current one.
|
||||
if self.in_memory.last() == Some(&fbf) {
|
||||
self.segment_indices
|
||||
.push(self.flushed_seg_count + self.in_memory.len() - 1);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Update memory usage.
|
||||
let memory_diff = fbf.bloom_filter_bytes.len();
|
||||
self.memory_usage += memory_diff;
|
||||
@@ -105,6 +120,8 @@ impl FinalizedBloomFilterStorage {
|
||||
|
||||
// Add the finalized Bloom filter to the in-memory storage.
|
||||
self.in_memory.push(fbf);
|
||||
self.segment_indices
|
||||
.push(self.flushed_seg_count + self.in_memory.len() - 1);
|
||||
|
||||
// Flush to disk if necessary.
|
||||
|
||||
@@ -129,13 +146,19 @@ impl FinalizedBloomFilterStorage {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Drains the storage and returns a stream of finalized Bloom filter segments.
|
||||
/// Drains the storage and returns indices of the segments and a stream of finalized Bloom filters.
|
||||
pub async fn drain(
|
||||
&mut self,
|
||||
) -> Result<Pin<Box<dyn Stream<Item = Result<FinalizedBloomFilterSegment>> + Send + '_>>> {
|
||||
) -> Result<(
|
||||
Vec<usize>,
|
||||
Pin<Box<dyn Stream<Item = Result<FinalizedBloomFilterSegment>> + Send + '_>>,
|
||||
)> {
|
||||
// FAST PATH: memory only
|
||||
if self.intermediate_file_id_counter == 0 {
|
||||
return Ok(Box::pin(stream::iter(self.in_memory.drain(..).map(Ok))));
|
||||
return Ok((
|
||||
std::mem::take(&mut self.segment_indices),
|
||||
Box::pin(stream::iter(self.in_memory.drain(..).map(Ok))),
|
||||
));
|
||||
}
|
||||
|
||||
// SLOW PATH: memory + disk
|
||||
@@ -151,8 +174,9 @@ impl FinalizedBloomFilterStorage {
|
||||
.map(|(_, reader)| FramedRead::new(reader, IntermediateBloomFilterCodecV1::default()));
|
||||
|
||||
let in_memory_stream = stream::iter(self.in_memory.drain(..)).map(Ok);
|
||||
Ok(Box::pin(
|
||||
stream::iter(streams).flatten().chain(in_memory_stream),
|
||||
Ok((
|
||||
std::mem::take(&mut self.segment_indices),
|
||||
Box::pin(stream::iter(streams).flatten().chain(in_memory_stream)),
|
||||
))
|
||||
}
|
||||
|
||||
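To make the index bookkeeping above concrete, here is a small self-contained sketch of the "reuse the last segment" logic, with simplified types (Vec<u8> stands in for FinalizedBloomFilterSegment):

struct Storage {
    flushed_seg_count: usize,
    in_memory: Vec<Vec<u8>>, // stand-in for FinalizedBloomFilterSegment
    segment_indices: Vec<usize>,
}

impl Storage {
    fn add_segment(&mut self, seg: Vec<u8>) {
        // Reuse the last segment if it is identical to the current one.
        if self.in_memory.last() == Some(&seg) {
            self.segment_indices
                .push(self.flushed_seg_count + self.in_memory.len() - 1);
            return;
        }
        self.in_memory.push(seg);
        self.segment_indices
            .push(self.flushed_seg_count + self.in_memory.len() - 1);
    }
}

fn main() {
    let mut s = Storage { flushed_seg_count: 0, in_memory: vec![], segment_indices: vec![] };
    s.add_segment(vec![1]);
    s.add_segment(vec![1]); // duplicate: only the index is recorded
    s.add_segment(vec![2]);
    assert_eq!(s.segment_indices, vec![0, 0, 1]);
    assert_eq!(s.in_memory.len(), 2);
}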
@@ -160,6 +184,7 @@ impl FinalizedBloomFilterStorage {
|
||||
async fn flush_in_memory_to_disk(&mut self) -> Result<()> {
|
||||
let file_id = self.intermediate_file_id_counter;
|
||||
self.intermediate_file_id_counter += 1;
|
||||
self.flushed_seg_count += self.in_memory.len();
|
||||
|
||||
let file_id = format!("{:08}", file_id);
|
||||
let mut writer = self
|
||||
@@ -266,21 +291,25 @@ mod tests {
|
||||
|
||||
let elem_count = 2000;
|
||||
let batch = 1000;
|
||||
let dup_batch = 200;
|
||||
|
||||
for i in 0..batch {
|
||||
for i in 0..(batch - dup_batch) {
|
||||
let elems = (elem_count * i..elem_count * (i + 1)).map(|x| x.to_string().into_bytes());
|
||||
storage.add(elems, elem_count).await.unwrap();
|
||||
}
|
||||
for _ in 0..dup_batch {
|
||||
storage.add(Some(vec![]), 1).await.unwrap();
|
||||
}
|
||||
|
||||
// Flush happens.
|
||||
assert!(storage.intermediate_file_id_counter > 0);
|
||||
|
||||
// Drain the storage.
|
||||
let mut stream = storage.drain().await.unwrap();
|
||||
let (indices, mut stream) = storage.drain().await.unwrap();
|
||||
assert_eq!(indices.len(), batch);
|
||||
|
||||
let mut i = 0;
|
||||
while let Some(segment) = stream.next().await {
|
||||
let segment = segment.unwrap();
|
||||
for (i, idx) in indices.iter().enumerate().take(batch - dup_batch) {
|
||||
let segment = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(segment.element_count, elem_count);
|
||||
|
||||
let v = u64_vec_from_bytes(&segment.bloom_filter_bytes);
|
||||
@@ -292,9 +321,44 @@ mod tests {
|
||||
for elem in (elem_count * i..elem_count * (i + 1)).map(|x| x.to_string().into_bytes()) {
|
||||
assert!(bf.contains(&elem));
|
||||
}
|
||||
i += 1;
|
||||
assert_eq!(indices[i], *idx);
|
||||
}
|
||||
|
||||
assert_eq!(i, batch);
|
||||
// Check the correctness of the duplicated segments.
|
||||
let dup_seg = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(dup_seg.element_count, 1);
|
||||
assert!(stream.next().await.is_none());
|
||||
assert!(indices[(batch - dup_batch)..batch]
|
||||
.iter()
|
||||
.all(|&x| x == batch - dup_batch));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_finalized_bloom_filter_storage_all_dup() {
|
||||
let mock_provider = MockExternalTempFileProvider::new();
|
||||
let global_memory_usage = Arc::new(AtomicUsize::new(0));
|
||||
let global_memory_usage_threshold = Some(1024 * 1024); // 1MB
|
||||
let provider = Arc::new(mock_provider);
|
||||
let mut storage = FinalizedBloomFilterStorage::new(
|
||||
provider,
|
||||
global_memory_usage.clone(),
|
||||
global_memory_usage_threshold,
|
||||
);
|
||||
|
||||
let batch = 1000;
|
||||
for _ in 0..batch {
|
||||
storage.add(Some(vec![]), 1).await.unwrap();
|
||||
}
|
||||
|
||||
// Drain the storage.
|
||||
let (indices, mut stream) = storage.drain().await.unwrap();
|
||||
|
||||
let bf = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(bf.element_count, 1);
|
||||
|
||||
assert!(stream.next().await.is_none());
|
||||
|
||||
assert_eq!(indices.len(), batch);
|
||||
assert!(indices.iter().all(|&x| x == 0));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,18 +31,10 @@ pub enum Error {
location: Location,
},

#[snafu(display("Failed to serde json"))]
SerdeJson {
#[snafu(display("Failed to decode protobuf"))]
DecodeProto {
#[snafu(source)]
error: serde_json::error::Error,
#[snafu(implicit)]
location: Location,
},

#[snafu(display("Failed to deserialize json"))]
DeserializeJson {
#[snafu(source)]
error: serde_json::Error,
error: prost::DecodeError,
#[snafu(implicit)]
location: Location,
},
@@ -90,10 +82,9 @@ impl ErrorExt for Error {

match self {
Io { .. }
| SerdeJson { .. }
| FileSizeTooSmall { .. }
| UnexpectedMetaSize { .. }
| DeserializeJson { .. }
| DecodeProto { .. }
| InvalidIntermediateMagic { .. } => StatusCode::Unexpected,

Intermediate { source, .. } => source.status_code(),

@@ -18,12 +18,14 @@ use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use common_base::range_read::RangeReader;
|
||||
use fastbloom::BloomFilter;
|
||||
use greptime_proto::v1::index::{BloomFilterLoc, BloomFilterMeta};
|
||||
use prost::Message;
|
||||
use snafu::{ensure, ResultExt};
|
||||
|
||||
use crate::bloom_filter::error::{
|
||||
DeserializeJsonSnafu, FileSizeTooSmallSnafu, IoSnafu, Result, UnexpectedMetaSizeSnafu,
|
||||
DecodeProtoSnafu, FileSizeTooSmallSnafu, IoSnafu, Result, UnexpectedMetaSizeSnafu,
|
||||
};
|
||||
use crate::bloom_filter::{BloomFilterMeta, BloomFilterSegmentLocation, SEED};
|
||||
use crate::bloom_filter::SEED;
|
||||
|
||||
/// Minimum size of the bloom filter, which is the size of the length of the bloom filter.
|
||||
const BLOOM_META_LEN_SIZE: u64 = 4;
|
||||
@@ -52,7 +54,7 @@ pub trait BloomFilterReader: Sync {
|
||||
async fn metadata(&self) -> Result<BloomFilterMeta>;
|
||||
|
||||
/// Reads a bloom filter with the given location.
|
||||
async fn bloom_filter(&self, loc: &BloomFilterSegmentLocation) -> Result<BloomFilter> {
|
||||
async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result<BloomFilter> {
|
||||
let bytes = self.range_read(loc.offset, loc.size as _).await?;
|
||||
let vec = bytes
|
||||
.chunks_exact(std::mem::size_of::<u64>())
|
||||
@@ -60,14 +62,11 @@ pub trait BloomFilterReader: Sync {
|
||||
.collect();
|
||||
let bm = BloomFilter::from_vec(vec)
|
||||
.seed(&SEED)
|
||||
.expected_items(loc.elem_count);
|
||||
.expected_items(loc.element_count as _);
|
||||
Ok(bm)
|
||||
}
|
||||
|
||||
async fn bloom_filter_vec(
|
||||
&self,
|
||||
locs: &[BloomFilterSegmentLocation],
|
||||
) -> Result<Vec<BloomFilter>> {
|
||||
async fn bloom_filter_vec(&self, locs: &[BloomFilterLoc]) -> Result<Vec<BloomFilter>> {
|
||||
let ranges = locs
|
||||
.iter()
|
||||
.map(|l| l.offset..l.offset + l.size)
|
||||
@@ -82,7 +81,7 @@ pub trait BloomFilterReader: Sync {
|
||||
.collect();
|
||||
let bm = BloomFilter::from_vec(vec)
|
||||
.seed(&SEED)
|
||||
.expected_items(loc.elem_count);
|
||||
.expected_items(loc.element_count as _);
|
||||
result.push(bm);
|
||||
}
|
||||
|
||||
@@ -173,11 +172,11 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
|
||||
.read(metadata_start..self.file_size - BLOOM_META_LEN_SIZE)
|
||||
.await
|
||||
.context(IoSnafu)?;
|
||||
serde_json::from_slice(&meta).context(DeserializeJsonSnafu)
|
||||
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
|
||||
} else {
|
||||
let metadata_start = self.file_size - length - BLOOM_META_LEN_SIZE - meta_start;
|
||||
let meta = &suffix[metadata_start as usize..suffix_len - BLOOM_META_LEN_SIZE as usize];
|
||||
serde_json::from_slice(meta).context(DeserializeJsonSnafu)
|
||||
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
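The bloom_filter methods above turn the raw bytes into u64 words before building a fastbloom BloomFilter, and the tests below rely on a u64_vec_from_bytes helper that plays the same role. A minimal version consistent with the chunks_exact conversion, assuming little-endian words as used for the meta length elsewhere in this file, would be:

// Assumes little-endian encoding of the filter words.
fn u64_vec_from_bytes(bytes: &[u8]) -> Vec<u64> {
    bytes
        .chunks_exact(std::mem::size_of::<u64>())
        .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
        .collect()
}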
@@ -257,17 +256,17 @@ mod tests {
|
||||
let meta = reader.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(meta.rows_per_segment, 2);
|
||||
assert_eq!(meta.seg_count, 2);
|
||||
assert_eq!(meta.segment_count, 2);
|
||||
assert_eq!(meta.row_count, 3);
|
||||
assert_eq!(meta.bloom_filter_segments.len(), 2);
|
||||
assert_eq!(meta.bloom_filter_locs.len(), 2);
|
||||
|
||||
assert_eq!(meta.bloom_filter_segments[0].offset, 0);
|
||||
assert_eq!(meta.bloom_filter_segments[0].elem_count, 4);
|
||||
assert_eq!(meta.bloom_filter_locs[0].offset, 0);
|
||||
assert_eq!(meta.bloom_filter_locs[0].element_count, 4);
|
||||
assert_eq!(
|
||||
meta.bloom_filter_segments[1].offset,
|
||||
meta.bloom_filter_segments[0].size
|
||||
meta.bloom_filter_locs[1].offset,
|
||||
meta.bloom_filter_locs[0].size
|
||||
);
|
||||
assert_eq!(meta.bloom_filter_segments[1].elem_count, 2);
|
||||
assert_eq!(meta.bloom_filter_locs[1].element_count, 2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -278,9 +277,9 @@ mod tests {
|
||||
let reader = BloomFilterReaderImpl::new(bytes);
|
||||
let meta = reader.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(meta.bloom_filter_segments.len(), 2);
|
||||
assert_eq!(meta.bloom_filter_locs.len(), 2);
|
||||
let bf = reader
|
||||
.bloom_filter(&meta.bloom_filter_segments[0])
|
||||
.bloom_filter(&meta.bloom_filter_locs[0])
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(bf.contains(&b"a"));
|
||||
@@ -289,7 +288,7 @@ mod tests {
|
||||
assert!(bf.contains(&b"d"));
|
||||
|
||||
let bf = reader
|
||||
.bloom_filter(&meta.bloom_filter_segments[1])
|
||||
.bloom_filter(&meta.bloom_filter_locs[1])
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(bf.contains(&b"e"));
|
||||
|
||||
@@ -24,16 +24,73 @@ use crate::error::{
|
||||
/// GreptimeDB's log query request.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct LogQuery {
|
||||
// Global query parameters
|
||||
/// A fully qualified table name to query logs from.
|
||||
pub table: TableName,
|
||||
/// Specifies the time range for the log query. See [`TimeFilter`] for more details.
|
||||
pub time_filter: TimeFilter,
|
||||
/// Columns with filters to query.
|
||||
pub columns: Vec<ColumnFilters>,
|
||||
/// Controls row skipping and fetch count for logs.
|
||||
/// Controls row skipping and fetch on the result set.
|
||||
pub limit: Limit,
|
||||
/// Adjacent lines to return.
|
||||
/// Columns to return in the result set.
|
||||
///
|
||||
/// The columns can be either from the original log or derived from processing exprs.
|
||||
/// Default (empty) means all columns.
|
||||
///
|
||||
/// TODO(ruihang): Do we need negative select?
|
||||
pub columns: Vec<String>,
|
||||
|
||||
// Filters
|
||||
/// Conjunction of filters to apply for the raw logs.
|
||||
///
|
||||
/// Filters here can only refer to the columns from the original log.
|
||||
pub filters: Vec<ColumnFilters>,
|
||||
/// Adjacent lines to return. Applies to all filters above.
|
||||
///
|
||||
/// TODO(ruihang): Do we need per-filter context?
|
||||
pub context: Context,
|
||||
|
||||
// Processors
|
||||
/// Expressions to calculate after filter.
|
||||
pub exprs: Vec<LogExpr>,
|
||||
}
|
||||
|
||||
/// Expression to calculate on log after filtering.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum LogExpr {
|
||||
NamedIdent(String),
|
||||
PositionalIdent(usize),
|
||||
Literal(String),
|
||||
ScalarFunc {
|
||||
name: String,
|
||||
args: Vec<LogExpr>,
|
||||
},
|
||||
AggrFunc {
|
||||
name: String,
|
||||
args: Vec<LogExpr>,
|
||||
/// Optional range function parameter. Stands for the time range for both step and align.
|
||||
range: Option<String>,
|
||||
by: Vec<LogExpr>,
|
||||
},
|
||||
Decompose {
|
||||
expr: Box<LogExpr>,
|
||||
/// JSON, CSV, etc.
|
||||
schema: String,
|
||||
/// Fields with type name to extract from the decomposed value.
|
||||
fields: Vec<(String, String)>,
|
||||
},
|
||||
BinaryOp {
|
||||
left: Box<LogExpr>,
|
||||
op: String,
|
||||
right: Box<LogExpr>,
|
||||
},
|
||||
Alias {
|
||||
expr: Box<LogExpr>,
|
||||
alias: String,
|
||||
},
|
||||
Filter {
|
||||
expr: Box<LogExpr>,
|
||||
filter: ContentFilter,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for LogQuery {
|
||||
@@ -41,9 +98,11 @@ impl Default for LogQuery {
|
||||
Self {
|
||||
table: TableName::new("", "", ""),
|
||||
time_filter: Default::default(),
|
||||
columns: vec![],
|
||||
filters: vec![],
|
||||
limit: Limit::default(),
|
||||
context: Default::default(),
|
||||
columns: vec![],
|
||||
exprs: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
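As a usage illustration only (the field values are made up, and the types are assumed to be in scope), a query over this structure might be assembled like so:

fn example_log_query() -> LogQuery {
    LogQuery {
        // Hypothetical table and columns, for illustration only.
        table: TableName::new("greptime", "public", "app_logs"),
        time_filter: TimeFilter::default(),
        columns: vec!["level".to_string(), "message".to_string()],
        filters: vec![],
        limit: Limit::default(),
        context: Context::default(),
        exprs: vec![LogExpr::Alias {
            expr: Box::new(LogExpr::NamedIdent("message".to_string())),
            alias: "msg".to_string(),
        }],
    }
}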
@@ -232,6 +291,7 @@ pub struct ColumnFilters {
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum ContentFilter {
|
||||
// Search-based filters
|
||||
/// Only match the exact content.
|
||||
///
|
||||
/// For example, if the content is "pale blue dot", the filter "pale" or "pale blue" will match.
|
||||
@@ -246,6 +306,14 @@ pub enum ContentFilter {
|
||||
Contains(String),
|
||||
/// Match the content with a regex pattern. The pattern should be a valid Rust regex.
|
||||
Regex(String),
|
||||
|
||||
// Value-based filters
|
||||
/// Content exists, a.k.a. not null.
|
||||
Exist,
|
||||
Between(String, String),
|
||||
// TODO(ruihang): arithmetic operations
|
||||
|
||||
// Compound filters
|
||||
Compound(Vec<ContentFilter>, BinaryOperator),
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ use std::time::Duration;
|
||||
|
||||
use async_stream::stream;
|
||||
use common_runtime::{RepeatedTask, TaskFunction};
|
||||
use common_telemetry::{error, info};
|
||||
use common_telemetry::{debug, error, info};
|
||||
use common_wal::config::raft_engine::RaftEngineConfig;
|
||||
use raft_engine::{Config, Engine, LogBatch, MessageExt, ReadableSize, RecoveryMode};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
@@ -64,10 +64,15 @@ impl TaskFunction<Error> for PurgeExpiredFilesFunction {
|
||||
Ok(res) => {
|
||||
// TODO(hl): the retval of purge_expired_files indicates the namespaces that need to be compacted,
// which is useful when monitoring regions that failed to flush their memtables to SSTs.
|
||||
info!(
|
||||
let log_string = format!(
|
||||
"Successfully purged logstore files, namespaces need compaction: {:?}",
|
||||
res
|
||||
);
|
||||
if res.is_empty() {
|
||||
debug!(log_string);
|
||||
} else {
|
||||
info!(log_string);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(e; "Failed to purge files in logstore");
|
||||
|
||||
@@ -291,6 +291,7 @@ async fn test_on_datanode_create_logical_regions() {
|
||||
}
|
||||
});
|
||||
|
||||
procedure.check_tables_already_exist().await.unwrap();
|
||||
let status = procedure.on_datanode_create_regions().await.unwrap();
|
||||
assert!(matches!(status, Status::Executing { persist: true }));
|
||||
assert!(matches!(
|
||||
|
||||
@@ -411,6 +411,7 @@ impl MetadataRegion {
|
||||
output_ordering: None,
|
||||
limit: None,
|
||||
series_row_selector: None,
|
||||
sequence: None,
|
||||
};
|
||||
let record_batch_stream = self
|
||||
.mito
|
||||
@@ -469,6 +470,7 @@ impl MetadataRegion {
|
||||
output_ordering: None,
|
||||
limit: None,
|
||||
series_row_selector: None,
|
||||
sequence: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -630,6 +632,7 @@ mod test {
|
||||
output_ordering: None,
|
||||
limit: None,
|
||||
series_row_selector: None,
|
||||
sequence: None,
|
||||
};
|
||||
let actual_scan_request = MetadataRegion::build_read_request(key);
|
||||
assert_eq!(actual_scan_request, expected_scan_request);
|
||||
|
||||
@@ -46,7 +46,8 @@ lazy_static! {
"greptime_metric_engine_mito_op_elapsed",
"metric engine's mito operation elapsed",
&[OPERATION_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
// 0.01 ~ 10000
exponential_buckets(0.01, 10.0, 7).unwrap(),
)
.unwrap();
}

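For reference, and assuming the prometheus crate's exponential_buckets(start, factor, count) is what is called here, the new bucket layout expands to roughly 0.01, 0.1, 1, 10, 100, 1000, 10000 seconds, matching the "0.01 ~ 10000" comment:

use prometheus::exponential_buckets;

fn main() {
    let buckets = exponential_buckets(0.01, 10.0, 7).unwrap();
    assert_eq!(buckets.len(), 7);
    // Allow for floating point error; the boundaries are powers of ten from 0.01 to 10000.
    assert!((buckets[0] - 0.01).abs() < 1e-9);
    assert!((buckets[6] - 10000.0).abs() < 1e-6);
}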
@@ -25,6 +25,7 @@ use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable
|
||||
use mito2::memtable::time_series::TimeSeriesMemtable;
|
||||
use mito2::memtable::{KeyValues, Memtable};
|
||||
use mito2::region::options::MergeMode;
|
||||
use mito2::row_converter::DensePrimaryKeyCodec;
|
||||
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
|
||||
use rand::rngs::ThreadRng;
|
||||
use rand::seq::SliceRandom;
|
||||
@@ -43,8 +44,14 @@ fn write_rows(c: &mut Criterion) {
|
||||
// Note that this test only generate one time series.
|
||||
let mut group = c.benchmark_group("write");
|
||||
group.bench_function("partition_tree", |b| {
|
||||
let memtable =
|
||||
PartitionTreeMemtable::new(1, metadata.clone(), None, &PartitionTreeConfig::default());
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
|
||||
let memtable = PartitionTreeMemtable::new(
|
||||
1,
|
||||
codec,
|
||||
metadata.clone(),
|
||||
None,
|
||||
&PartitionTreeConfig::default(),
|
||||
);
|
||||
let kvs =
|
||||
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, ×tamps, 1);
|
||||
b.iter(|| {
|
||||
@@ -71,13 +78,14 @@ fn full_scan(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("full_scan");
|
||||
group.sample_size(10);
|
||||
group.bench_function("partition_tree", |b| {
|
||||
let memtable = PartitionTreeMemtable::new(1, metadata.clone(), None, &config);
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
|
||||
let memtable = PartitionTreeMemtable::new(1, codec, metadata.clone(), None, &config);
|
||||
for kvs in generator.iter() {
|
||||
memtable.write(&kvs).unwrap();
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
let iter = memtable.iter(None, None).unwrap();
|
||||
let iter = memtable.iter(None, None, None).unwrap();
|
||||
for batch in iter {
|
||||
let _batch = batch.unwrap();
|
||||
}
|
||||
@@ -90,7 +98,7 @@ fn full_scan(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
let iter = memtable.iter(None, None).unwrap();
|
||||
let iter = memtable.iter(None, None, None).unwrap();
|
||||
for batch in iter {
|
||||
let _batch = batch.unwrap();
|
||||
}
|
||||
@@ -108,14 +116,15 @@ fn filter_1_host(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("filter_1_host");
|
||||
group.sample_size(10);
|
||||
group.bench_function("partition_tree", |b| {
|
||||
let memtable = PartitionTreeMemtable::new(1, metadata.clone(), None, &config);
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
|
||||
let memtable = PartitionTreeMemtable::new(1, codec, metadata.clone(), None, &config);
|
||||
for kvs in generator.iter() {
|
||||
memtable.write(&kvs).unwrap();
|
||||
}
|
||||
let predicate = generator.random_host_filter();
|
||||
|
||||
b.iter(|| {
|
||||
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
|
||||
let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap();
|
||||
for batch in iter {
|
||||
let _batch = batch.unwrap();
|
||||
}
|
||||
@@ -129,7 +138,7 @@ fn filter_1_host(c: &mut Criterion) {
|
||||
let predicate = generator.random_host_filter();
|
||||
|
||||
b.iter(|| {
|
||||
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
|
||||
let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap();
|
||||
for batch in iter {
|
||||
let _batch = batch.unwrap();
|
||||
}
|
||||
|
||||
@@ -218,9 +218,9 @@ impl CacheStrategy {

/// Calls [CacheManager::index_cache()].
/// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
pub fn index_cache(&self) -> Option<&InvertedIndexCacheRef> {
pub fn inverted_index_cache(&self) -> Option<&InvertedIndexCacheRef> {
match self {
CacheStrategy::EnableAll(cache_manager) => cache_manager.index_cache(),
CacheStrategy::EnableAll(cache_manager) => cache_manager.inverted_index_cache(),
CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
}
}
@@ -409,7 +409,7 @@ impl CacheManager {
self.write_cache.as_ref()
}

pub(crate) fn index_cache(&self) -> Option<&InvertedIndexCacheRef> {
pub(crate) fn inverted_index_cache(&self) -> Option<&InvertedIndexCacheRef> {
self.index_cache.as_ref()
}

src/mito2/src/cache/file_cache.rs
@@ -25,7 +25,7 @@ use futures::{FutureExt, TryStreamExt};
|
||||
use moka::future::Cache;
|
||||
use moka::notification::RemovalCause;
|
||||
use object_store::util::join_path;
|
||||
use object_store::{ErrorKind, Metakey, ObjectStore, Reader};
|
||||
use object_store::{ErrorKind, ObjectStore, Reader};
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::RegionId;
|
||||
@@ -195,7 +195,6 @@ impl FileCache {
|
||||
let mut lister = self
|
||||
.local_store
|
||||
.lister_with(FILE_DIR)
|
||||
.metakey(Metakey::ContentLength)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
// Use i64 for total_size to reduce the risk of overflow.
|
||||
@@ -209,6 +208,12 @@ impl FileCache {
|
||||
let Some(key) = parse_index_key(entry.name()) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let meta = self
|
||||
.local_store
|
||||
.stat(entry.path())
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
let file_size = meta.content_length() as u32;
|
||||
self.memory_index
|
||||
.insert(key, IndexValue { file_size })
|
||||
|
||||
@@ -15,12 +15,12 @@
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::index::BloomFilterMeta;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use futures::future::try_join_all;
|
||||
use index::bloom_filter::error::Result;
|
||||
use index::bloom_filter::reader::BloomFilterReader;
|
||||
use index::bloom_filter::BloomFilterMeta;
|
||||
use store_api::storage::ColumnId;
|
||||
|
||||
use crate::cache::index::{IndexCache, PageKey, INDEX_METADATA_TYPE};
|
||||
|
||||
@@ -40,6 +40,7 @@ use snafu::{OptionExt, ResultExt};
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::{RegionId, TableId};
|
||||
use table::predicate::Predicate;
|
||||
use task::MAX_PARALLEL_COMPACTION;
|
||||
use tokio::sync::mpsc::{self, Sender};
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
@@ -85,6 +86,7 @@ pub struct CompactionRequest {
|
||||
pub(crate) manifest_ctx: ManifestContextRef,
|
||||
pub(crate) listener: WorkerListener,
|
||||
pub(crate) schema_metadata_manager: SchemaMetadataManagerRef,
|
||||
pub(crate) max_parallelism: usize,
|
||||
}
|
||||
|
||||
impl CompactionRequest {
|
||||
@@ -145,6 +147,7 @@ impl CompactionScheduler {
|
||||
waiter: OptionOutputTx,
|
||||
manifest_ctx: &ManifestContextRef,
|
||||
schema_metadata_manager: SchemaMetadataManagerRef,
|
||||
max_parallelism: usize,
|
||||
) -> Result<()> {
|
||||
if let Some(status) = self.region_status.get_mut(®ion_id) {
|
||||
// Region is compacting. Add the waiter to pending list.
|
||||
@@ -163,6 +166,7 @@ impl CompactionScheduler {
|
||||
manifest_ctx,
|
||||
self.listener.clone(),
|
||||
schema_metadata_manager,
|
||||
max_parallelism,
|
||||
);
|
||||
self.region_status.insert(region_id, status);
|
||||
let result = self
|
||||
@@ -193,6 +197,7 @@ impl CompactionScheduler {
|
||||
manifest_ctx,
|
||||
self.listener.clone(),
|
||||
schema_metadata_manager,
|
||||
MAX_PARALLEL_COMPACTION,
|
||||
);
|
||||
// Try to schedule next compaction task for this region.
|
||||
if let Err(e) = self
|
||||
@@ -264,6 +269,7 @@ impl CompactionScheduler {
|
||||
manifest_ctx,
|
||||
listener,
|
||||
schema_metadata_manager,
|
||||
max_parallelism,
|
||||
} = request;
|
||||
|
||||
let ttl = find_ttl(
|
||||
@@ -294,6 +300,7 @@ impl CompactionScheduler {
|
||||
manifest_ctx: manifest_ctx.clone(),
|
||||
file_purger: None,
|
||||
ttl: Some(ttl),
|
||||
max_parallelism,
|
||||
};
|
||||
|
||||
let picker_output = {
|
||||
@@ -521,6 +528,7 @@ impl CompactionStatus {
|
||||
manifest_ctx: &ManifestContextRef,
|
||||
listener: WorkerListener,
|
||||
schema_metadata_manager: SchemaMetadataManagerRef,
|
||||
max_parallelism: usize,
|
||||
) -> CompactionRequest {
|
||||
let current_version = CompactionVersion::from(self.version_control.current().version);
|
||||
let start_time = Instant::now();
|
||||
@@ -535,6 +543,7 @@ impl CompactionStatus {
|
||||
manifest_ctx: manifest_ctx.clone(),
|
||||
listener,
|
||||
schema_metadata_manager,
|
||||
max_parallelism,
|
||||
};
|
||||
|
||||
if let Some(pending) = self.pending_compaction.take() {
|
||||
@@ -722,6 +731,7 @@ mod tests {
|
||||
waiter,
|
||||
&manifest_ctx,
|
||||
schema_metadata_manager.clone(),
|
||||
1,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -742,6 +752,7 @@ mod tests {
|
||||
waiter,
|
||||
&manifest_ctx,
|
||||
schema_metadata_manager,
|
||||
1,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -795,6 +806,7 @@ mod tests {
|
||||
OptionOutputTx::none(),
|
||||
&manifest_ctx,
|
||||
schema_metadata_manager.clone(),
|
||||
1,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -825,6 +837,7 @@ mod tests {
|
||||
OptionOutputTx::none(),
|
||||
&manifest_ctx,
|
||||
schema_metadata_manager.clone(),
|
||||
1,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -860,6 +873,7 @@ mod tests {
|
||||
OptionOutputTx::none(),
|
||||
&manifest_ctx,
|
||||
schema_metadata_manager,
|
||||
1,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -91,6 +91,12 @@ pub struct CompactionRegion {
|
||||
pub(crate) current_version: CompactionVersion,
|
||||
pub(crate) file_purger: Option<Arc<LocalFilePurger>>,
|
||||
pub(crate) ttl: Option<TimeToLive>,
|
||||
|
||||
/// Controls the parallelism of this compaction task. Default is 1.
|
||||
///
|
||||
/// The parallelism is within this compaction task, not across different compaction tasks.
/// For example, different windows of the same compaction task may be processed in parallel.
|
||||
pub max_parallelism: usize,
|
||||
}
|
||||
|
||||
/// OpenCompactionRegionRequest represents the request to open a compaction region.
|
||||
@@ -99,6 +105,7 @@ pub struct OpenCompactionRegionRequest {
|
||||
pub region_id: RegionId,
|
||||
pub region_dir: String,
|
||||
pub region_options: RegionOptions,
|
||||
pub max_parallelism: usize,
|
||||
}
|
||||
|
||||
/// Open a compaction region from a compaction request.
|
||||
@@ -205,6 +212,7 @@ pub async fn open_compaction_region(
|
||||
current_version,
|
||||
file_purger: Some(file_purger),
|
||||
ttl: Some(ttl),
|
||||
max_parallelism: req.max_parallelism,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -266,6 +274,7 @@ impl Compactor for DefaultCompactor {
|
||||
let mut futs = Vec::with_capacity(picker_output.outputs.len());
|
||||
let mut compacted_inputs =
|
||||
Vec::with_capacity(picker_output.outputs.iter().map(|o| o.inputs.len()).sum());
|
||||
let internal_parallelism = compaction_region.max_parallelism.max(1);
|
||||
|
||||
for output in picker_output.outputs.drain(..) {
|
||||
compacted_inputs.extend(output.inputs.iter().map(|f| f.meta_ref().clone()));
|
||||
@@ -358,9 +367,8 @@ impl Compactor for DefaultCompactor {
|
||||
}
|
||||
let mut output_files = Vec::with_capacity(futs.len());
|
||||
while !futs.is_empty() {
|
||||
let mut task_chunk =
|
||||
Vec::with_capacity(crate::compaction::task::MAX_PARALLEL_COMPACTION);
|
||||
for _ in 0..crate::compaction::task::MAX_PARALLEL_COMPACTION {
|
||||
let mut task_chunk = Vec::with_capacity(internal_parallelism);
|
||||
for _ in 0..internal_parallelism {
|
||||
if let Some(task) = futs.pop() {
|
||||
task_chunk.push(common_runtime::spawn_compact(task));
|
||||
}
|
||||
|
||||
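A compact, self-contained sketch of the chunked execution pattern above, using plain tokio::spawn instead of common_runtime::spawn_compact (an assumption made for the example):

async fn run_in_chunks<F, T>(mut futs: Vec<F>, parallelism: usize) -> Vec<T>
where
    F: std::future::Future<Output = T> + Send + 'static,
    T: Send + 'static,
{
    let parallelism = parallelism.max(1);
    let mut outputs = Vec::with_capacity(futs.len());
    while !futs.is_empty() {
        // Spawn at most `parallelism` tasks, then wait for the whole chunk.
        let mut chunk = Vec::with_capacity(parallelism);
        for _ in 0..parallelism {
            if let Some(fut) = futs.pop() {
                chunk.push(tokio::spawn(fut));
            }
        }
        for handle in chunk {
            outputs.push(handle.await.unwrap());
        }
    }
    outputs
}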
@@ -32,7 +32,7 @@ use crate::request::{
|
||||
use crate::worker::WorkerListener;
|
||||
|
||||
/// Maximum number of compaction tasks in parallel.
|
||||
pub const MAX_PARALLEL_COMPACTION: usize = 8;
|
||||
pub const MAX_PARALLEL_COMPACTION: usize = 1;
|
||||
|
||||
pub(crate) struct CompactionTaskImpl {
|
||||
pub compaction_region: CompactionRegion,
|
||||
|
||||
@@ -35,8 +35,6 @@ pub(crate) const DEFAULT_SCAN_CHANNEL_SIZE: usize = 32;
|
||||
const GLOBAL_WRITE_BUFFER_SIZE_FACTOR: u64 = 8;
|
||||
/// Use `1/SST_META_CACHE_SIZE_FACTOR` of OS memory size as SST meta cache size in default mode
|
||||
const SST_META_CACHE_SIZE_FACTOR: u64 = 32;
|
||||
/// Use `1/INDEX_CONTENT_CACHE_SIZE_FACTOR` of OS memory size for inverted index file content cache by default.
|
||||
const INDEX_CONTENT_CACHE_SIZE_FACTOR: u64 = 32;
|
||||
/// Use `1/MEM_CACHE_SIZE_FACTOR` of OS memory size as mem cache size in default mode
|
||||
const MEM_CACHE_SIZE_FACTOR: u64 = 16;
|
||||
/// Use `1/PAGE_CACHE_SIZE_FACTOR` of OS memory size as page cache size in default mode
|
||||
@@ -307,6 +305,10 @@ pub struct IndexConfig {
|
||||
|
||||
/// Cache size for metadata of puffin files. Setting it to 0 to disable the cache.
|
||||
pub metadata_cache_size: ReadableSize,
|
||||
/// Cache size for inverted index content. Setting it to 0 to disable the cache.
|
||||
pub content_cache_size: ReadableSize,
|
||||
/// Page size for inverted index content.
|
||||
pub content_cache_page_size: ReadableSize,
|
||||
}
|
||||
|
||||
impl Default for IndexConfig {
|
||||
@@ -316,6 +318,8 @@ impl Default for IndexConfig {
|
||||
staging_size: ReadableSize::gb(2),
|
||||
write_buffer_size: ReadableSize::mb(8),
|
||||
metadata_cache_size: ReadableSize::mb(64),
|
||||
content_cache_size: ReadableSize::mb(128),
|
||||
content_cache_page_size: ReadableSize::kb(64),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -411,45 +415,19 @@ pub struct InvertedIndexConfig {
|
||||
#[deprecated = "use [IndexConfig::write_buffer_size] instead"]
|
||||
#[serde(skip_serializing)]
|
||||
pub write_buffer_size: ReadableSize,
|
||||
|
||||
/// Cache size for metadata of inverted index. Setting it to 0 to disable the cache.
|
||||
pub metadata_cache_size: ReadableSize,
|
||||
/// Cache size for inverted index content. Setting it to 0 to disable the cache.
|
||||
pub content_cache_size: ReadableSize,
|
||||
/// Page size for inverted index content.
|
||||
pub content_cache_page_size: ReadableSize,
|
||||
}
|
||||
|
||||
impl InvertedIndexConfig {
|
||||
/// Adjusts the cache size of [InvertedIndexConfig] according to system memory size.
|
||||
fn adjust_cache_size(&mut self, sys_memory: ReadableSize) {
|
||||
let content_cache_size = cmp::min(
|
||||
sys_memory / INDEX_CONTENT_CACHE_SIZE_FACTOR,
|
||||
ReadableSize::mb(128),
|
||||
);
|
||||
self.content_cache_size = content_cache_size;
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for InvertedIndexConfig {
|
||||
#[allow(deprecated)]
|
||||
fn default() -> Self {
|
||||
let mut index_config = Self {
|
||||
Self {
|
||||
create_on_flush: Mode::Auto,
|
||||
create_on_compaction: Mode::Auto,
|
||||
apply_on_query: Mode::Auto,
|
||||
mem_threshold_on_create: MemoryThreshold::Auto,
|
||||
write_buffer_size: ReadableSize::mb(8),
|
||||
intermediate_path: String::new(),
|
||||
metadata_cache_size: ReadableSize::mb(64),
|
||||
content_cache_size: ReadableSize::mb(128),
|
||||
content_cache_page_size: ReadableSize::kb(64),
|
||||
};
|
||||
|
||||
if let Some(sys_memory) = common_config::utils::get_sys_total_memory() {
|
||||
index_config.adjust_cache_size(sys_memory);
|
||||
}
|
||||
index_config
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,8 +26,8 @@ use datatypes::schema::{ColumnSchema, FulltextAnalyzer, FulltextOptions};
|
||||
use store_api::metadata::ColumnMetadata;
|
||||
use store_api::region_engine::{RegionEngine, RegionRole};
|
||||
use store_api::region_request::{
|
||||
AddColumn, AddColumnLocation, AlterKind, RegionAlterRequest, RegionOpenRequest, RegionRequest,
|
||||
SetRegionOption,
|
||||
AddColumn, AddColumnLocation, AlterKind, ApiSetIndexOptions, RegionAlterRequest,
|
||||
RegionOpenRequest, RegionRequest, SetRegionOption,
|
||||
};
|
||||
use store_api::storage::{RegionId, ScanRequest};
|
||||
|
||||
@@ -69,15 +69,28 @@ fn add_tag1() -> RegionAlterRequest {
|
||||
}
|
||||
}
|
||||
|
||||
fn alter_column_inverted_index() -> RegionAlterRequest {
|
||||
RegionAlterRequest {
|
||||
schema_version: 0,
|
||||
kind: AlterKind::SetIndex {
|
||||
options: ApiSetIndexOptions::Inverted {
|
||||
column_name: "tag_0".to_string(),
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn alter_column_fulltext_options() -> RegionAlterRequest {
|
||||
RegionAlterRequest {
|
||||
schema_version: 0,
|
||||
kind: AlterKind::SetColumnFulltext {
|
||||
column_name: "tag_0".to_string(),
|
||||
options: FulltextOptions {
|
||||
enable: true,
|
||||
analyzer: FulltextAnalyzer::English,
|
||||
case_sensitive: false,
|
||||
kind: AlterKind::SetIndex {
|
||||
options: ApiSetIndexOptions::Fulltext {
|
||||
column_name: "tag_0".to_string(),
|
||||
options: FulltextOptions {
|
||||
enable: true,
|
||||
analyzer: FulltextAnalyzer::English,
|
||||
case_sensitive: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -579,6 +592,116 @@ async fn test_alter_column_fulltext_options() {
|
||||
check_region_version(&engine, region_id, 1, 3, 1, 3);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_alter_column_set_inverted_index() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let mut env = TestEnv::new();
|
||||
let listener = Arc::new(AlterFlushListener::default());
|
||||
let engine = env
|
||||
.create_engine_with(MitoConfig::default(), None, Some(listener.clone()))
|
||||
.await;
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let request = CreateRequestBuilder::new().build();
|
||||
|
||||
env.get_schema_metadata_manager()
|
||||
.register_region_table_info(
|
||||
region_id.table_id(),
|
||||
"test_table",
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
let column_schemas = rows_schema(&request);
|
||||
let region_dir = request.region_dir.clone();
|
||||
engine
|
||||
.handle_request(region_id, RegionRequest::Create(request))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let rows = Rows {
|
||||
schema: column_schemas,
|
||||
rows: build_rows(0, 3),
|
||||
};
|
||||
put_rows(&engine, region_id, rows).await;
|
||||
|
||||
// Spawns a task to flush the engine.
|
||||
let engine_cloned = engine.clone();
|
||||
let flush_job = tokio::spawn(async move {
|
||||
flush_region(&engine_cloned, region_id, None).await;
|
||||
});
|
||||
// Waits for flush begin.
|
||||
listener.wait_flush_begin().await;
|
||||
|
||||
// Consumes the notify permit in the listener.
|
||||
listener.wait_request_begin().await;
|
||||
|
||||
// Submits an alter request to the region. The region should add the request
|
||||
// to the pending ddl request list.
|
||||
let request = alter_column_inverted_index();
|
||||
let engine_cloned = engine.clone();
|
||||
let alter_job = tokio::spawn(async move {
|
||||
engine_cloned
|
||||
.handle_request(region_id, RegionRequest::Alter(request))
|
||||
.await
|
||||
.unwrap();
|
||||
});
|
||||
// Waits until the worker handles the alter request.
|
||||
listener.wait_request_begin().await;
|
||||
|
||||
// Spawns two task to flush the engine. The flush scheduler should put them to the
|
||||
// pending task list.
|
||||
let engine_cloned = engine.clone();
|
||||
let pending_flush_job = tokio::spawn(async move {
|
||||
flush_region(&engine_cloned, region_id, None).await;
|
||||
});
|
||||
// Waits until the worker handles the flush request.
|
||||
listener.wait_request_begin().await;
|
||||
|
||||
// Wake up flush.
|
||||
listener.wake_flush();
|
||||
// Wait for the flush job.
|
||||
flush_job.await.unwrap();
|
||||
// Wait for pending flush job.
|
||||
pending_flush_job.await.unwrap();
|
||||
// Wait for the write job.
|
||||
alter_job.await.unwrap();
|
||||
|
||||
let check_inverted_index_set = |engine: &MitoEngine| {
|
||||
assert!(engine
|
||||
.get_region(region_id)
|
||||
.unwrap()
|
||||
.metadata()
|
||||
.column_by_name("tag_0")
|
||||
.unwrap()
|
||||
.column_schema
|
||||
.is_inverted_indexed())
|
||||
};
|
||||
check_inverted_index_set(&engine);
|
||||
check_region_version(&engine, region_id, 1, 3, 1, 3);
|
||||
|
||||
// Reopen region.
|
||||
let engine = env.reopen_engine(engine, MitoConfig::default()).await;
|
||||
engine
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Open(RegionOpenRequest {
|
||||
engine: String::new(),
|
||||
region_dir,
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
check_inverted_index_set(&engine);
|
||||
check_region_version(&engine, region_id, 1, 3, 1, 3);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_alter_region_ttl_options() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
@@ -553,7 +553,7 @@ async fn test_region_usage() {
|
||||
// region is empty now, check manifest size
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let region_stat = region.region_statistic();
|
||||
assert_eq!(region_stat.manifest_size, 686);
|
||||
assert_eq!(region_stat.manifest_size, 717);
|
||||
|
||||
// put some rows
|
||||
let rows = Rows {
|
||||
|
||||
@@ -12,16 +12,20 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use api::v1::{ColumnSchema, Rows};
|
||||
use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
|
||||
use datatypes::prelude::ScalarVector;
|
||||
use datatypes::vectors::TimestampMillisecondVector;
|
||||
use store_api::region_engine::{RegionEngine, RegionRole};
|
||||
use store_api::region_request::AlterKind::SetRegionOptions;
|
||||
use store_api::region_request::{
|
||||
RegionCompactRequest, RegionDeleteRequest, RegionFlushRequest, RegionRequest,
|
||||
RegionAlterRequest, RegionCompactRequest, RegionDeleteRequest, RegionFlushRequest,
|
||||
RegionOpenRequest, RegionRequest, SetRegionOption,
|
||||
};
|
||||
use store_api::storage::{RegionId, ScanRequest};
|
||||
use tokio::sync::Notify;
|
||||
@@ -466,3 +470,219 @@ async fn test_compaction_update_time_window() {
|
||||
let vec = collect_stream_ts(stream).await;
|
||||
assert_eq!((0..4000).map(|v| v * 1000).collect::<Vec<_>>(), vec);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_change_region_compaction_window() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let mut env = TestEnv::new();
|
||||
let engine = env.create_engine(MitoConfig::default()).await;
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
|
||||
env.get_schema_metadata_manager()
|
||||
.register_region_table_info(
|
||||
region_id.table_id(),
|
||||
"test_table",
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
let request = CreateRequestBuilder::new()
|
||||
.insert_option("compaction.type", "twcs")
|
||||
.insert_option("compaction.twcs.max_active_window_runs", "1")
|
||||
.insert_option("compaction.twcs.max_active_window_files", "1")
|
||||
.insert_option("compaction.twcs.max_inactive_window_runs", "1")
|
||||
.insert_option("compaction.twcs.max_inactive_window_files", "1")
|
||||
.build();
|
||||
let region_dir = request.region_dir.clone();
|
||||
let column_schemas = request
|
||||
.column_metadatas
|
||||
.iter()
|
||||
.map(column_metadata_to_column_schema)
|
||||
.collect::<Vec<_>>();
|
||||
engine
|
||||
.handle_request(region_id, RegionRequest::Create(request))
|
||||
.await
|
||||
.unwrap();
|
||||
// Flush 2 SSTs for compaction.
|
||||
put_and_flush(&engine, region_id, &column_schemas, 0..1200).await; // window 3600
|
||||
put_and_flush(&engine, region_id, &column_schemas, 1200..2400).await; // window 3600
|
||||
|
||||
engine
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Compact(RegionCompactRequest::default()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Put window 7200
|
||||
put_and_flush(&engine, region_id, &column_schemas, 4000..5000).await; // window 3600
|
||||
|
||||
// Check compaction window.
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
{
|
||||
let version = region.version();
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(3600)),
|
||||
version.compaction_time_window,
|
||||
);
|
||||
assert!(version.options.compaction.time_window().is_none());
|
||||
}
|
||||
|
||||
// Change compaction window.
|
||||
let request = RegionRequest::Alter(RegionAlterRequest {
|
||||
schema_version: region.metadata().schema_version,
|
||||
kind: SetRegionOptions {
|
||||
options: vec![SetRegionOption::Twsc(
|
||||
"compaction.twcs.time_window".to_string(),
|
||||
"2h".to_string(),
|
||||
)],
|
||||
},
|
||||
});
|
||||
engine.handle_request(region_id, request).await.unwrap();
|
||||
|
||||
// Compaction again. It should compact windows 3600 and 7200
|
||||
// into 7200.
|
||||
engine
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Compact(RegionCompactRequest::default()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
// Check compaction window.
|
||||
{
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let version = region.version();
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(7200)),
|
||||
version.compaction_time_window,
|
||||
);
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(7200)),
|
||||
version.options.compaction.time_window()
|
||||
);
|
||||
}
|
||||
|
||||
// Reopen region.
|
||||
let engine = env.reopen_engine(engine, MitoConfig::default()).await;
|
||||
engine
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Open(RegionOpenRequest {
|
||||
engine: String::new(),
|
||||
region_dir,
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
// Check compaction window.
|
||||
{
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let version = region.version();
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(7200)),
|
||||
version.compaction_time_window,
|
||||
);
|
||||
// We open the region without options, so the time window should be None.
|
||||
assert!(version.options.compaction.time_window().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_open_overwrite_compaction_window() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let mut env = TestEnv::new();
|
||||
let engine = env.create_engine(MitoConfig::default()).await;
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
|
||||
env.get_schema_metadata_manager()
|
||||
.register_region_table_info(
|
||||
region_id.table_id(),
|
||||
"test_table",
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
env.get_kv_backend(),
|
||||
)
|
||||
.await;
|
||||
|
||||
let request = CreateRequestBuilder::new()
|
||||
.insert_option("compaction.type", "twcs")
|
||||
.insert_option("compaction.twcs.max_active_window_runs", "1")
|
||||
.insert_option("compaction.twcs.max_active_window_files", "1")
|
||||
.insert_option("compaction.twcs.max_inactive_window_runs", "1")
|
||||
.insert_option("compaction.twcs.max_inactive_window_files", "1")
|
||||
.build();
|
||||
let region_dir = request.region_dir.clone();
|
||||
let column_schemas = request
|
||||
.column_metadatas
|
||||
.iter()
|
||||
.map(column_metadata_to_column_schema)
|
||||
.collect::<Vec<_>>();
|
||||
engine
|
||||
.handle_request(region_id, RegionRequest::Create(request))
|
||||
.await
|
||||
.unwrap();
|
||||
// Flush 2 SSTs for compaction.
|
||||
put_and_flush(&engine, region_id, &column_schemas, 0..1200).await; // window 3600
|
||||
put_and_flush(&engine, region_id, &column_schemas, 1200..2400).await; // window 3600
|
||||
|
||||
engine
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Compact(RegionCompactRequest::default()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Check compaction window.
|
||||
{
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let version = region.version();
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(3600)),
|
||||
version.compaction_time_window,
|
||||
);
|
||||
assert!(version.options.compaction.time_window().is_none());
|
||||
}
|
||||
|
||||
// Reopen region.
|
||||
let options = HashMap::from([
|
||||
("compaction.type".to_string(), "twcs".to_string()),
|
||||
("compaction.twcs.time_window".to_string(), "2h".to_string()),
|
||||
]);
|
||||
let engine = env.reopen_engine(engine, MitoConfig::default()).await;
|
||||
engine
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Open(RegionOpenRequest {
|
||||
engine: String::new(),
|
||||
region_dir,
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
// Check compaction window.
|
||||
{
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let version = region.version();
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(7200)),
|
||||
version.compaction_time_window,
|
||||
);
|
||||
assert_eq!(
|
||||
Some(Duration::from_secs(7200)),
|
||||
version.options.compaction.time_window()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -464,6 +464,7 @@ async fn test_open_compaction_region() {
        region_id,
        region_dir: region_dir.clone(),
        region_options: RegionOptions::default(),
        max_parallelism: 1,
    };

    let compaction_region = open_compaction_region(

@@ -79,6 +79,7 @@ async fn test_scan_projection() {
        output_ordering: None,
        limit: None,
        series_row_selector: None,
        sequence: None,
    };
    let stream = engine.scan_to_stream(region_id, request).await.unwrap();
    let batches = RecordBatches::try_collect(stream).await.unwrap();
@@ -925,6 +925,20 @@ pub enum Error {
        #[snafu(implicit)]
        location: Location,
    },

    #[snafu(display(
        "Unexpected impure default value with region_id: {}, column: {}, default_value: {}",
        region_id,
        column,
        default_value
    ))]
    UnexpectedImpureDefault {
        #[snafu(implicit)]
        location: Location,
        region_id: RegionId,
        column: String,
        default_value: String,
    },
}

pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -964,7 +978,8 @@ impl ErrorExt for Error {
            | InvalidParquet { .. }
            | OperateAbortedIndex { .. }
            | UnexpectedReplay { .. }
            | IndexEncodeNull { .. } => StatusCode::Unexpected,
            | IndexEncodeNull { .. }
            | UnexpectedImpureDefault { .. } => StatusCode::Unexpected,
            RegionNotFound { .. } => StatusCode::RegionNotFound,
            ObjectStoreNotFound { .. }
            | InvalidScanIndex { .. }
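The hunk above adds the `UnexpectedImpureDefault` variant and maps it to `StatusCode::Unexpected`. For readers unfamiliar with snafu, each variant gets a generated context selector (here `UnexpectedImpureDefaultSnafu`) that call sites use with `ensure!` or `.context(...)`. Below is a minimal, self-contained sketch of that pattern; the simplified `u64` region id, the `check_default` function, and the omission of the implicit `location` field are illustrative assumptions, not code from this diff.

```rust
use snafu::{ensure, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display(
        "Unexpected impure default value with region_id: {}, column: {}, default_value: {}",
        region_id,
        column,
        default_value
    ))]
    UnexpectedImpureDefault {
        // The real variant also carries an implicit `location: Location`; omitted here.
        region_id: u64, // stand-in for the crate's `RegionId`
        column: String,
        default_value: String,
    },
}

// Hypothetical call site: reject an impure (non-constant) default value.
fn check_default(region_id: u64, column: &str, default_value: &str, impure: bool) -> Result<(), Error> {
    ensure!(
        !impure,
        UnexpectedImpureDefaultSnafu {
            region_id,
            column,
            default_value,
        }
    );
    Ok(())
}

fn main() {
    let err = check_default(42, "host", "now()", true).unwrap_err();
    println!("{err}");
}
```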
@@ -348,7 +348,7 @@ impl RegionFlushTask {

        let max_sequence = mem.stats().max_sequence();
        let file_id = FileId::random();
        let iter = mem.iter(None, None)?;
        let iter = mem.iter(None, None, None)?;
        let source = Source::Iter(iter);

        // Flush to level 0.

@@ -592,6 +592,6 @@ mod test {

        // get manifest size again
        let manifest_size = manager.manifest_usage();
        assert_eq!(manifest_size, 1173);
        assert_eq!(manifest_size, 1204);
    }
}

@@ -343,7 +343,7 @@ impl ManifestObjectStore {
        );

        self.object_store
            .remove(paths)
            .delete_iter(paths)
            .await
            .context(OpenDalSnafu)?;

@@ -154,7 +154,7 @@ async fn manager_with_checkpoint_distance_1() {
        .unwrap();
    let raw_json = std::str::from_utf8(&raw_bytes).unwrap();
    let expected_json =
        "{\"size\":848,\"version\":10,\"checksum\":4186457347,\"extend_metadata\":{}}";
        "{\"size\":879,\"version\":10,\"checksum\":2245967096,\"extend_metadata\":{}}";
    assert_eq!(expected_json, raw_json);

    // reopen the manager

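The last hunk above only refreshes the expected checkpoint metadata literal (the size and checksum change together with the manifest contents). That literal is plain JSON, so its shape is easy to inspect; the sketch below deserializes it with serde_json into an illustrative stand-in struct (the struct and its field types are assumptions for demonstration, not the crate's actual checkpoint type).

```rust
use std::collections::HashMap;

use serde::Deserialize;

// Illustrative mirror of the fields visible in the expected JSON above.
#[derive(Debug, Deserialize)]
struct CheckpointMetadata {
    size: u64,
    version: u64,
    checksum: u32,
    extend_metadata: HashMap<String, String>,
}

fn main() {
    let raw = r#"{"size":879,"version":10,"checksum":2245967096,"extend_metadata":{}}"#;
    let meta: CheckpointMetadata = serde_json::from_str(raw).unwrap();
    assert_eq!(meta.size, 879);
    assert_eq!(meta.version, 10);
    assert_eq!(meta.checksum, 2_245_967_096);
    assert!(meta.extend_metadata.is_empty());
}
```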
@@ -147,6 +147,7 @@ pub trait Memtable: Send + Sync + fmt::Debug {
        &self,
        projection: Option<&[ColumnId]>,
        predicate: Option<Predicate>,
        sequence: Option<SequenceNumber>,
    ) -> Result<BoxedBatchIterator>;

    /// Returns the ranges in the memtable.
@@ -155,6 +156,7 @@ pub trait Memtable: Send + Sync + fmt::Debug {
        &self,
        projection: Option<&[ColumnId]>,
        predicate: Option<Predicate>,
        sequence: Option<SequenceNumber>,
    ) -> MemtableRanges;

    /// Returns true if the memtable is empty.
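Both `iter` and `ranges` on the `Memtable` trait now take an optional `SequenceNumber`, which lets a scan request only the rows visible at a given sequence. The sketch below is a self-contained illustration of that idea with simplified stand-in types (a tiny row struct instead of the crate's `Batch` and `BoxedBatchIterator`); it is not the engine's implementation.

```rust
type SequenceNumber = u64;

/// Simplified row: the sequence it was written at plus a payload value.
#[derive(Debug, Clone, PartialEq)]
struct Row {
    sequence: SequenceNumber,
    value: i64,
}

/// Stand-in for a memtable that supports snapshot reads by sequence.
struct TinyMemtable {
    rows: Vec<Row>,
}

impl TinyMemtable {
    /// Mirrors the shape of the new `iter(projection, predicate, sequence)`:
    /// when `sequence` is `Some(s)`, rows written after `s` are filtered out.
    fn iter(&self, sequence: Option<SequenceNumber>) -> impl Iterator<Item = Row> + '_ {
        self.rows
            .iter()
            .filter(move |row| sequence.map_or(true, |s| row.sequence <= s))
            .cloned()
    }
}

fn main() {
    let memtable = TinyMemtable {
        rows: vec![
            Row { sequence: 1, value: 10 },
            Row { sequence: 5, value: 20 },
            Row { sequence: 9, value: 30 },
        ],
    };
    // A snapshot at sequence 5 excludes the row written at sequence 9.
    assert_eq!(memtable.iter(Some(5)).count(), 2);
    // `None` keeps the previous behavior: read everything.
    assert_eq!(memtable.iter(None).count(), 3);
}
```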
@@ -17,7 +17,7 @@
use std::sync::{Arc, RwLock};

use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use store_api::storage::{ColumnId, SequenceNumber};
use table::predicate::Predicate;

use crate::error::Result;
@@ -63,6 +63,7 @@ impl Memtable for BulkMemtable {
        &self,
        _projection: Option<&[ColumnId]>,
        _predicate: Option<Predicate>,
        _sequence: Option<SequenceNumber>,
    ) -> Result<BoxedBatchIterator> {
        todo!()
    }
@@ -71,6 +72,7 @@ impl Memtable for BulkMemtable {
        &self,
        _projection: Option<&[ColumnId]>,
        _predicate: Option<Predicate>,
        _sequence: Option<SequenceNumber>,
    ) -> MemtableRanges {
        todo!()
    }

@@ -22,7 +22,7 @@ use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use table::predicate::Predicate;

use crate::row_converter::McmpRowCodec;
use crate::row_converter::DensePrimaryKeyCodec;
use crate::sst::parquet::file_range::RangeBase;
use crate::sst::parquet::format::ReadFormat;
use crate::sst::parquet::reader::SimpleFilterContext;
@@ -41,7 +41,7 @@ impl BulkIterContext {
        projection: &Option<&[ColumnId]>,
        predicate: Option<Predicate>,
    ) -> Self {
        let codec = McmpRowCodec::new_with_primary_keys(&region_metadata);
        let codec = DensePrimaryKeyCodec::new(&region_metadata);

        let simple_filters = predicate
            .as_ref()
@@ -39,7 +39,7 @@ use parquet::file::metadata::ParquetMetaData;
use parquet::file::properties::WriterProperties;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use store_api::storage::{ColumnId, SequenceNumber};
use table::predicate::Predicate;

use crate::error;
@@ -48,7 +48,7 @@ use crate::memtable::bulk::context::BulkIterContextRef;
use crate::memtable::bulk::part_reader::BulkPartIter;
use crate::memtable::key_values::KeyValuesRef;
use crate::memtable::BoxedBatchIterator;
use crate::row_converter::{McmpRowCodec, RowCodec};
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodec, PrimaryKeyCodecExt};
use crate::sst::parquet::format::{PrimaryKeyArray, ReadFormat};
use crate::sst::parquet::helper::parse_parquet_metadata;
use crate::sst::to_sst_arrow_schema;
@@ -68,7 +68,11 @@ impl BulkPart {
        &self.metadata
    }

    pub(crate) fn read(&self, context: BulkIterContextRef) -> Result<Option<BoxedBatchIterator>> {
    pub(crate) fn read(
        &self,
        context: BulkIterContextRef,
        sequence: Option<SequenceNumber>,
    ) -> Result<Option<BoxedBatchIterator>> {
        // use predicate to find row groups to read.
        let row_groups_to_read = context.row_groups_to_read(&self.metadata.parquet_metadata);

@@ -82,6 +86,7 @@ impl BulkPart {
            row_groups_to_read,
            self.metadata.parquet_metadata.clone(),
            self.data.clone(),
            sequence,
        )?;
        Ok(Some(Box::new(iter) as BoxedBatchIterator))
    }
@@ -103,7 +108,7 @@ pub struct BulkPartMeta {

pub struct BulkPartEncoder {
    metadata: RegionMetadataRef,
    pk_encoder: McmpRowCodec,
    pk_encoder: DensePrimaryKeyCodec,
    row_group_size: usize,
    dedup: bool,
    writer_props: Option<WriterProperties>,
@@ -115,7 +120,7 @@ impl BulkPartEncoder {
        dedup: bool,
        row_group_size: usize,
    ) -> BulkPartEncoder {
        let codec = McmpRowCodec::new_with_primary_keys(&metadata);
        let codec = DensePrimaryKeyCodec::new(&metadata);
        let writer_props = Some(
            WriterProperties::builder()
                .set_write_batch_size(row_group_size)
@@ -174,7 +179,7 @@ impl BulkPartEncoder {
fn mutations_to_record_batch(
    mutations: &[Mutation],
    metadata: &RegionMetadataRef,
    pk_encoder: &McmpRowCodec,
    pk_encoder: &DensePrimaryKeyCodec,
    dedup: bool,
) -> Result<Option<(RecordBatch, i64, i64)>> {
    let total_rows: usize = mutations
@@ -538,7 +543,7 @@ mod tests {
            .map(|r| r.rows.len())
            .sum();

        let pk_encoder = McmpRowCodec::new_with_primary_keys(&metadata);
        let pk_encoder = DensePrimaryKeyCodec::new(&metadata);

        let (batch, _, _) = mutations_to_record_batch(&mutations, &metadata, &pk_encoder, dedup)
            .unwrap()
@@ -557,7 +562,7 @@ mod tests {
        let batch_values = batches
            .into_iter()
            .map(|b| {
                let pk_values = pk_encoder.decode(b.primary_key()).unwrap();
                let pk_values = pk_encoder.decode_dense(b.primary_key()).unwrap();
                let timestamps = b
                    .timestamps()
                    .as_any()
@@ -786,11 +791,14 @@ mod tests {
        let projection = &[4u32];

        let mut reader = part
            .read(Arc::new(BulkIterContext::new(
                part.metadata.region_metadata.clone(),
                &Some(projection.as_slice()),
            .read(
                Arc::new(BulkIterContext::new(
                    part.metadata.region_metadata.clone(),
                    &Some(projection.as_slice()),
                    None,
                )),
                None,
            )))
            )
            .unwrap()
            .expect("expect at least one row group");

@@ -837,7 +845,7 @@ mod tests {
            predicate,
        ));
        let mut reader = part
            .read(context)
            .read(context, None)
            .unwrap()
            .expect("expect at least one row group");
        let mut total_rows_read = 0;
@@ -866,7 +874,7 @@ mod tests {
            datafusion_expr::lit(ScalarValue::TimestampMillisecond(Some(300), None)),
        )])),
        ));
        assert!(part.read(context).unwrap().is_none());
        assert!(part.read(context, None).unwrap().is_none());

        check_prune_row_group(&part, None, 310);

@@ -18,6 +18,7 @@ use std::sync::Arc;
use bytes::Bytes;
use parquet::arrow::ProjectionMask;
use parquet::file::metadata::ParquetMetaData;
use store_api::storage::SequenceNumber;

use crate::error;
use crate::memtable::bulk::context::BulkIterContextRef;
@@ -31,6 +32,7 @@ pub struct BulkPartIter {
    row_groups_to_read: VecDeque<usize>,
    current_reader: Option<PruneReader>,
    builder: MemtableRowGroupReaderBuilder,
    sequence: Option<SequenceNumber>,
}

impl BulkPartIter {
@@ -40,6 +42,7 @@ impl BulkPartIter {
        mut row_groups_to_read: VecDeque<usize>,
        parquet_meta: Arc<ParquetMetaData>,
        data: Bytes,
        sequence: Option<SequenceNumber>,
    ) -> error::Result<Self> {
        let projection_mask = ProjectionMask::roots(
            parquet_meta.file_metadata().schema_descr(),
@@ -62,6 +65,7 @@ impl BulkPartIter {
            row_groups_to_read,
            current_reader: init_reader,
            builder,
            sequence,
        })
    }

@@ -71,14 +75,16 @@ impl BulkPartIter {
            return Ok(None);
        };

        if let Some(batch) = current.next_batch()? {
        if let Some(mut batch) = current.next_batch()? {
            batch.filter_by_sequence(self.sequence)?;
            return Ok(Some(batch));
        }

        // Previous row group exhausted, read next row group
        while let Some(next_row_group) = self.row_groups_to_read.pop_front() {
            current.reset(self.builder.build_row_group_reader(next_row_group, None)?);
            if let Some(next_batch) = current.next_batch()? {
            if let Some(mut next_batch) = current.next_batch()? {
                next_batch.filter_by_sequence(self.sequence)?;
                return Ok(Some(next_batch));
            }
        }

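In the hunk above, `BulkPartIter` stores the snapshot sequence and runs `filter_by_sequence` on every decoded batch. Conceptually that filter keeps only rows whose sequence is at or below the snapshot; the sketch below shows the same idea on a simplified batch of parallel vectors (the `TinyBatch` type and its method are illustrative assumptions, not the crate's `Batch`).

```rust
type SequenceNumber = u64;

/// Simplified column batch: parallel vectors of sequences and values.
struct TinyBatch {
    sequences: Vec<SequenceNumber>,
    values: Vec<i64>,
}

impl TinyBatch {
    /// Conceptual analogue of `Batch::filter_by_sequence`: keep only rows whose
    /// sequence is at or below the snapshot sequence. `None` keeps every row.
    fn filter_by_sequence(&mut self, sequence: Option<SequenceNumber>) {
        let Some(snapshot) = sequence else { return };
        let keep: Vec<bool> = self.sequences.iter().map(|s| *s <= snapshot).collect();
        let mut it = keep.iter();
        self.sequences.retain(|_| *it.next().unwrap());
        let mut it = keep.iter();
        self.values.retain(|_| *it.next().unwrap());
    }
}

fn main() {
    let mut batch = TinyBatch {
        sequences: vec![3, 7, 11],
        values: vec![30, 70, 110],
    };
    // Rows written after sequence 7 are dropped from the snapshot view.
    batch.filter_by_sequence(Some(7));
    assert_eq!(batch.sequences, vec![3, 7]);
    assert_eq!(batch.values, vec![30, 70]);
}
```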
@@ -19,6 +19,9 @@ mod dedup;
|
||||
mod dict;
|
||||
mod merger;
|
||||
mod partition;
|
||||
// TODO(weny): remove this
|
||||
#[allow(unused)]
|
||||
mod primary_key_filter;
|
||||
mod shard;
|
||||
mod shard_builder;
|
||||
mod tree;
|
||||
@@ -28,9 +31,11 @@ use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
pub(crate) use primary_key_filter::DensePrimaryKeyFilter;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use store_api::codec::PrimaryKeyEncoding;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
use store_api::storage::{ColumnId, SequenceNumber};
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::error::{Result, UnsupportedOperationSnafu};
|
||||
@@ -43,6 +48,7 @@ use crate::memtable::{
|
||||
MemtableId, MemtableRange, MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats,
|
||||
};
|
||||
use crate::region::options::MergeMode;
|
||||
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodec};
|
||||
|
||||
/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
|
||||
pub(crate) const DICTIONARY_SIZE_FACTOR: u64 = 8;
|
||||
@@ -184,20 +190,23 @@ impl Memtable for PartitionTreeMemtable {
|
||||
&self,
|
||||
projection: Option<&[ColumnId]>,
|
||||
predicate: Option<Predicate>,
|
||||
sequence: Option<SequenceNumber>,
|
||||
) -> Result<BoxedBatchIterator> {
|
||||
self.tree.read(projection, predicate)
|
||||
self.tree.read(projection, predicate, sequence)
|
||||
}
|
||||
|
||||
fn ranges(
|
||||
&self,
|
||||
projection: Option<&[ColumnId]>,
|
||||
predicate: Option<Predicate>,
|
||||
sequence: Option<SequenceNumber>,
|
||||
) -> MemtableRanges {
|
||||
let projection = projection.map(|ids| ids.to_vec());
|
||||
let builder = Box::new(PartitionTreeIterBuilder {
|
||||
tree: self.tree.clone(),
|
||||
projection,
|
||||
predicate,
|
||||
sequence,
|
||||
});
|
||||
let context = Arc::new(MemtableRangeContext::new(self.id, builder));
|
||||
|
||||
@@ -263,13 +272,14 @@ impl PartitionTreeMemtable {
|
||||
/// Returns a new memtable.
|
||||
pub fn new(
|
||||
id: MemtableId,
|
||||
row_codec: Arc<dyn PrimaryKeyCodec>,
|
||||
metadata: RegionMetadataRef,
|
||||
write_buffer_manager: Option<WriteBufferManagerRef>,
|
||||
config: &PartitionTreeConfig,
|
||||
) -> Self {
|
||||
Self::with_tree(
|
||||
id,
|
||||
PartitionTree::new(metadata, config, write_buffer_manager.clone()),
|
||||
PartitionTree::new(row_codec, metadata, config, write_buffer_manager.clone()),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -320,12 +330,22 @@ impl PartitionTreeMemtableBuilder {
|
||||
|
||||
impl MemtableBuilder for PartitionTreeMemtableBuilder {
|
||||
fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
|
||||
Arc::new(PartitionTreeMemtable::new(
|
||||
id,
|
||||
metadata.clone(),
|
||||
self.write_buffer_manager.clone(),
|
||||
&self.config,
|
||||
))
|
||||
match metadata.primary_key_encoding {
|
||||
PrimaryKeyEncoding::Dense => {
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(metadata));
|
||||
Arc::new(PartitionTreeMemtable::new(
|
||||
id,
|
||||
codec,
|
||||
metadata.clone(),
|
||||
self.write_buffer_manager.clone(),
|
||||
&self.config,
|
||||
))
|
||||
}
|
||||
PrimaryKeyEncoding::Sparse => {
|
||||
//TODO(weny): Implement sparse primary key encoding.
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -333,12 +353,16 @@ struct PartitionTreeIterBuilder {
|
||||
tree: Arc<PartitionTree>,
|
||||
projection: Option<Vec<ColumnId>>,
|
||||
predicate: Option<Predicate>,
|
||||
sequence: Option<SequenceNumber>,
|
||||
}
|
||||
|
||||
impl IterBuilder for PartitionTreeIterBuilder {
|
||||
fn build(&self) -> Result<BoxedBatchIterator> {
|
||||
self.tree
|
||||
.read(self.projection.as_deref(), self.predicate.clone())
|
||||
self.tree.read(
|
||||
self.projection.as_deref(),
|
||||
self.predicate.clone(),
|
||||
self.sequence,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -358,7 +382,7 @@ mod tests {
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::*;
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt};
|
||||
use crate::test_util::memtable_util::{
|
||||
self, collect_iter_timestamps, region_metadata_to_row_schema,
|
||||
};
|
||||
@@ -378,8 +402,14 @@ mod tests {
|
||||
let timestamps = (0..100).collect::<Vec<_>>();
|
||||
let kvs =
|
||||
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, ×tamps, 1);
|
||||
let memtable =
|
||||
PartitionTreeMemtable::new(1, metadata, None, &PartitionTreeConfig::default());
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
|
||||
let memtable = PartitionTreeMemtable::new(
|
||||
1,
|
||||
codec,
|
||||
metadata.clone(),
|
||||
None,
|
||||
&PartitionTreeConfig::default(),
|
||||
);
|
||||
memtable.write(&kvs).unwrap();
|
||||
|
||||
let expected_ts = kvs
|
||||
@@ -387,7 +417,7 @@ mod tests {
|
||||
.map(|kv| kv.timestamp().as_timestamp().unwrap().unwrap().value())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let iter = memtable.iter(None, None).unwrap();
|
||||
let iter = memtable.iter(None, None, None).unwrap();
|
||||
let read = collect_iter_timestamps(iter);
|
||||
assert_eq!(expected_ts, read);
|
||||
|
||||
@@ -414,8 +444,14 @@ mod tests {
|
||||
} else {
|
||||
memtable_util::metadata_with_primary_key(vec![], false)
|
||||
};
|
||||
let memtable =
|
||||
PartitionTreeMemtable::new(1, metadata.clone(), None, &PartitionTreeConfig::default());
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
|
||||
let memtable = PartitionTreeMemtable::new(
|
||||
1,
|
||||
codec,
|
||||
metadata.clone(),
|
||||
None,
|
||||
&PartitionTreeConfig::default(),
|
||||
);
|
||||
|
||||
let kvs = memtable_util::build_key_values(
|
||||
&metadata,
|
||||
@@ -435,11 +471,11 @@ mod tests {
|
||||
);
|
||||
memtable.write(&kvs).unwrap();
|
||||
|
||||
let iter = memtable.iter(None, None).unwrap();
|
||||
let iter = memtable.iter(None, None, None).unwrap();
|
||||
let read = collect_iter_timestamps(iter);
|
||||
assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], read);
|
||||
|
||||
let iter = memtable.iter(None, None).unwrap();
|
||||
let iter = memtable.iter(None, None, None).unwrap();
|
||||
let read = iter
|
||||
.flat_map(|batch| {
|
||||
batch
|
||||
@@ -480,7 +516,7 @@ mod tests {
|
||||
let expect = (0..100).collect::<Vec<_>>();
|
||||
let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
|
||||
memtable.write(&kvs).unwrap();
|
||||
let iter = memtable.iter(Some(&[3]), None).unwrap();
|
||||
let iter = memtable.iter(Some(&[3]), None, None).unwrap();
|
||||
|
||||
let mut v0_all = vec![];
|
||||
for res in iter {
|
||||
@@ -510,8 +546,10 @@ mod tests {
|
||||
|
||||
fn write_iter_multi_keys(max_keys: usize, freeze_threshold: usize) {
|
||||
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
|
||||
let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
|
||||
let memtable = PartitionTreeMemtable::new(
|
||||
1,
|
||||
codec,
|
||||
metadata.clone(),
|
||||
None,
|
||||
&PartitionTreeConfig {
|
||||
@@ -550,7 +588,7 @@ mod tests {
|
||||
data.sort_unstable();
|
||||
|
||||
let expect = data.into_iter().map(|x| x.2).collect::<Vec<_>>();
|
||||
let iter = memtable.iter(None, None).unwrap();
|
||||
let iter = memtable.iter(None, None, None).unwrap();
|
||||
let read = collect_iter_timestamps(iter);
|
||||
assert_eq!(expect, read);
|
||||
}
|
||||
@@ -586,7 +624,7 @@ mod tests {
|
||||
right: Box::new(Expr::Literal(ScalarValue::UInt32(Some(i)))),
|
||||
});
|
||||
let iter = memtable
|
||||
.iter(None, Some(Predicate::new(vec![expr])))
|
||||
.iter(None, Some(Predicate::new(vec![expr])), None)
|
||||
.unwrap();
|
||||
let read = collect_iter_timestamps(iter);
|
||||
assert_eq!(timestamps, read);
|
||||
@@ -719,12 +757,7 @@ mod tests {
|
||||
)
|
||||
.build(1, &metadata);
|
||||
|
||||
let codec = McmpRowCodec::new(
|
||||
metadata
|
||||
.primary_key_columns()
|
||||
.map(|c| SortField::new(c.column_schema.data_type.clone()))
|
||||
.collect(),
|
||||
);
|
||||
let codec = DensePrimaryKeyCodec::new(&metadata);
|
||||
|
||||
memtable
|
||||
.write(&build_key_values(
|
||||
@@ -758,7 +791,7 @@ mod tests {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut reader = new_memtable.iter(None, None).unwrap();
|
||||
let mut reader = new_memtable.iter(None, None, None).unwrap();
|
||||
let batch = reader.next().unwrap().unwrap();
|
||||
let pk = codec.decode(batch.primary_key()).unwrap();
|
||||
if let Value::String(s) = &pk[2] {
|
||||
|
||||
@@ -22,6 +22,7 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use api::v1::SemanticType;
|
||||
use common_recordbatch::filter::SimpleFilterEvaluator;
|
||||
use store_api::codec::PrimaryKeyEncoding;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::metric_engine_consts::DATA_SCHEMA_TABLE_ID_COLUMN_NAME;
|
||||
use store_api::storage::ColumnId;
|
||||
@@ -38,7 +39,7 @@ use crate::memtable::partition_tree::{PartitionTreeConfig, PkId};
|
||||
use crate::memtable::stats::WriteMetrics;
|
||||
use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
|
||||
use crate::read::{Batch, BatchBuilder};
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec};
|
||||
use crate::row_converter::{PrimaryKeyCodec, PrimaryKeyFilter};
|
||||
|
||||
/// Key of a partition.
|
||||
pub type PartitionKey = u32;
|
||||
@@ -65,7 +66,7 @@ impl Partition {
|
||||
pub fn write_with_key(
|
||||
&self,
|
||||
primary_key: &mut Vec<u8>,
|
||||
row_codec: &McmpRowCodec,
|
||||
row_codec: &dyn PrimaryKeyCodec,
|
||||
key_value: KeyValue,
|
||||
re_encode: bool,
|
||||
metrics: &mut WriteMetrics,
|
||||
@@ -85,17 +86,25 @@ impl Partition {
|
||||
|
||||
// Key does not yet exist in shard or builder, encode and insert the full primary key.
|
||||
if re_encode {
|
||||
// `primary_key` is sparse, re-encode the full primary key.
|
||||
let sparse_key = primary_key.clone();
|
||||
primary_key.clear();
|
||||
row_codec.encode_to_vec(key_value.primary_keys(), primary_key)?;
|
||||
let pk_id = inner.shard_builder.write_with_key(
|
||||
primary_key,
|
||||
Some(&sparse_key),
|
||||
&key_value,
|
||||
metrics,
|
||||
);
|
||||
inner.pk_to_pk_id.insert(sparse_key, pk_id);
|
||||
match row_codec.encoding() {
|
||||
PrimaryKeyEncoding::Dense => {
|
||||
// `primary_key` is sparse, re-encode the full primary key.
|
||||
let sparse_key = primary_key.clone();
|
||||
primary_key.clear();
|
||||
row_codec.encode_key_value(&key_value, primary_key)?;
|
||||
let pk_id = inner.shard_builder.write_with_key(
|
||||
primary_key,
|
||||
Some(&sparse_key),
|
||||
&key_value,
|
||||
metrics,
|
||||
);
|
||||
inner.pk_to_pk_id.insert(sparse_key, pk_id);
|
||||
}
|
||||
PrimaryKeyEncoding::Sparse => {
|
||||
// TODO(weny): support sparse primary key.
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// `primary_key` is already the full primary key.
|
||||
let pk_id = inner
|
||||
@@ -126,18 +135,23 @@ impl Partition {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build_primary_key_filter(
|
||||
need_prune_key: bool,
|
||||
metadata: &RegionMetadataRef,
|
||||
row_codec: &dyn PrimaryKeyCodec,
|
||||
filters: &Arc<Vec<SimpleFilterEvaluator>>,
|
||||
) -> Option<Box<dyn PrimaryKeyFilter>> {
|
||||
if need_prune_key {
|
||||
let filter = row_codec.primary_key_filter(metadata, filters.clone());
|
||||
Some(filter)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans data in the partition.
|
||||
pub fn read(&self, mut context: ReadPartitionContext) -> Result<PartitionReader> {
|
||||
let start = Instant::now();
|
||||
let key_filter = if context.need_prune_key {
|
||||
Some(PrimaryKeyFilter::new(
|
||||
context.metadata.clone(),
|
||||
context.filters.clone(),
|
||||
context.row_codec.clone(),
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let (builder_source, shard_reader_builders) = {
|
||||
let inner = self.inner.read().unwrap();
|
||||
let mut shard_source = Vec::with_capacity(inner.shards.len() + 1);
|
||||
@@ -157,20 +171,33 @@ impl Partition {
|
||||
};
|
||||
|
||||
context.metrics.num_shards += shard_reader_builders.len();
|
||||
|
||||
let mut nodes = shard_reader_builders
|
||||
.into_iter()
|
||||
.map(|builder| {
|
||||
let primary_key_filter = Self::build_primary_key_filter(
|
||||
context.need_prune_key,
|
||||
&context.metadata,
|
||||
context.row_codec.as_ref(),
|
||||
&context.filters,
|
||||
);
|
||||
Ok(ShardNode::new(ShardSource::Shard(
|
||||
builder.build(key_filter.clone())?,
|
||||
builder.build(primary_key_filter)?,
|
||||
)))
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
if let Some(builder) = builder_source {
|
||||
context.metrics.num_builder += 1;
|
||||
let primary_key_filter = Self::build_primary_key_filter(
|
||||
context.need_prune_key,
|
||||
&context.metadata,
|
||||
context.row_codec.as_ref(),
|
||||
&context.filters,
|
||||
);
|
||||
// Move the initialization of ShardBuilderReader out of read lock.
|
||||
let shard_builder_reader =
|
||||
builder.build(Some(&context.pk_weights), key_filter.clone())?;
|
||||
builder.build(Some(&context.pk_weights), primary_key_filter)?;
|
||||
nodes.push(ShardNode::new(ShardSource::Builder(shard_builder_reader)));
|
||||
}
|
||||
|
||||
@@ -354,81 +381,10 @@ impl PartitionReader {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct PrimaryKeyFilter {
|
||||
metadata: RegionMetadataRef,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
codec: Arc<McmpRowCodec>,
|
||||
offsets_buf: Vec<usize>,
|
||||
}
|
||||
|
||||
impl PrimaryKeyFilter {
|
||||
pub(crate) fn new(
|
||||
metadata: RegionMetadataRef,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
codec: Arc<McmpRowCodec>,
|
||||
) -> Self {
|
||||
Self {
|
||||
metadata,
|
||||
filters,
|
||||
codec,
|
||||
offsets_buf: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn prune_primary_key(&mut self, pk: &[u8]) -> bool {
|
||||
if self.filters.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// no primary key, we simply return true.
|
||||
if self.metadata.primary_key.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// evaluate filters against primary key values
|
||||
let mut result = true;
|
||||
self.offsets_buf.clear();
|
||||
for filter in &*self.filters {
|
||||
if Partition::is_partition_column(filter.column_name()) {
|
||||
continue;
|
||||
}
|
||||
let Some(column) = self.metadata.column_by_name(filter.column_name()) else {
|
||||
continue;
|
||||
};
|
||||
// ignore filters that are not referencing primary key columns
|
||||
if column.semantic_type != SemanticType::Tag {
|
||||
continue;
|
||||
}
|
||||
// index of the column in primary keys.
|
||||
// Safety: A tag column is always in primary key.
|
||||
let index = self.metadata.primary_key_index(column.column_id).unwrap();
|
||||
let value = match self.codec.decode_value_at(pk, index, &mut self.offsets_buf) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
common_telemetry::error!(e; "Failed to decode primary key");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// TODO(yingwen): `evaluate_scalar()` creates temporary arrays to compare scalars. We
|
||||
// can compare the bytes directly without allocation and matching types as we use
|
||||
// comparable encoding.
|
||||
// Safety: arrow schema and datatypes are constructed from the same source.
|
||||
let scalar_value = value
|
||||
.try_to_scalar_value(&column.column_schema.data_type)
|
||||
.unwrap();
|
||||
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
/// Structs to reuse across readers to avoid allocating for each reader.
|
||||
pub(crate) struct ReadPartitionContext {
|
||||
metadata: RegionMetadataRef,
|
||||
row_codec: Arc<McmpRowCodec>,
|
||||
row_codec: Arc<dyn PrimaryKeyCodec>,
|
||||
projection: HashSet<ColumnId>,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
/// Buffer to store pk weights.
|
||||
@@ -467,16 +423,16 @@ impl Drop for ReadPartitionContext {
|
||||
impl ReadPartitionContext {
|
||||
pub(crate) fn new(
|
||||
metadata: RegionMetadataRef,
|
||||
row_codec: Arc<McmpRowCodec>,
|
||||
row_codec: Arc<dyn PrimaryKeyCodec>,
|
||||
projection: HashSet<ColumnId>,
|
||||
filters: Vec<SimpleFilterEvaluator>,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
) -> ReadPartitionContext {
|
||||
let need_prune_key = Self::need_prune_key(&metadata, &filters);
|
||||
ReadPartitionContext {
|
||||
metadata,
|
||||
row_codec,
|
||||
projection,
|
||||
filters: Arc::new(filters),
|
||||
filters,
|
||||
pk_weights: Vec::new(),
|
||||
need_prune_key,
|
||||
metrics: Default::default(),
|
||||
|
||||
342
src/mito2/src/memtable/partition_tree/primary_key_filter.rs
Normal file
@@ -0,0 +1,342 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::SemanticType;
|
||||
use common_recordbatch::filter::SimpleFilterEvaluator;
|
||||
use datatypes::value::Value;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::memtable::partition_tree::partition::Partition;
|
||||
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyFilter, SparsePrimaryKeyCodec};
|
||||
|
||||
#[derive(Clone)]
|
||||
struct PrimaryKeyFilterInner {
|
||||
metadata: RegionMetadataRef,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
}
|
||||
|
||||
impl PrimaryKeyFilterInner {
|
||||
fn evaluate_filters(
|
||||
&self,
|
||||
pk: &[u8],
|
||||
mut decode_fn: impl FnMut(ColumnId, &RegionMetadataRef) -> Result<Value>,
|
||||
) -> bool {
|
||||
if self.filters.is_empty() || self.metadata.primary_key.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
let mut result = true;
|
||||
for filter in self.filters.iter() {
|
||||
if Partition::is_partition_column(filter.column_name()) {
|
||||
continue;
|
||||
}
|
||||
let Some(column) = self.metadata.column_by_name(filter.column_name()) else {
|
||||
continue;
|
||||
};
|
||||
// ignore filters that are not referencing primary key columns
|
||||
if column.semantic_type != SemanticType::Tag {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = match decode_fn(column.column_id, &self.metadata) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
common_telemetry::error!(e; "Failed to decode primary key");
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// TODO(yingwen): `evaluate_scalar()` creates temporary arrays to compare scalars. We
|
||||
// can compare the bytes directly without allocation and matching types as we use
|
||||
// comparable encoding.
|
||||
// Safety: arrow schema and datatypes are constructed from the same source.
|
||||
let scalar_value = value
|
||||
.try_to_scalar_value(&column.column_schema.data_type)
|
||||
.unwrap();
|
||||
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
/// Dense primary key filter.
|
||||
#[derive(Clone)]
|
||||
pub struct DensePrimaryKeyFilter {
|
||||
inner: PrimaryKeyFilterInner,
|
||||
codec: DensePrimaryKeyCodec,
|
||||
offsets_buf: Vec<usize>,
|
||||
}
|
||||
|
||||
impl DensePrimaryKeyFilter {
|
||||
pub(crate) fn new(
|
||||
metadata: RegionMetadataRef,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
codec: DensePrimaryKeyCodec,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: PrimaryKeyFilterInner { metadata, filters },
|
||||
codec,
|
||||
offsets_buf: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrimaryKeyFilter for DensePrimaryKeyFilter {
|
||||
fn matches(&mut self, pk: &[u8]) -> bool {
|
||||
self.offsets_buf.clear();
|
||||
self.inner.evaluate_filters(pk, |column_id, metadata| {
|
||||
// index of tag column in primary key
|
||||
// Safety: A tag column is always in primary key.
|
||||
let index = metadata.primary_key_index(column_id).unwrap();
|
||||
self.codec.decode_value_at(pk, index, &mut self.offsets_buf)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Sparse primary key filter.
|
||||
#[derive(Clone)]
|
||||
pub struct SparsePrimaryKeyFilter {
|
||||
inner: PrimaryKeyFilterInner,
|
||||
codec: SparsePrimaryKeyCodec,
|
||||
offsets_map: HashMap<ColumnId, usize>,
|
||||
}
|
||||
|
||||
impl SparsePrimaryKeyFilter {
|
||||
pub(crate) fn new(
|
||||
metadata: RegionMetadataRef,
|
||||
filters: Arc<Vec<SimpleFilterEvaluator>>,
|
||||
codec: SparsePrimaryKeyCodec,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: PrimaryKeyFilterInner { metadata, filters },
|
||||
codec,
|
||||
offsets_map: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrimaryKeyFilter for SparsePrimaryKeyFilter {
|
||||
fn matches(&mut self, pk: &[u8]) -> bool {
|
||||
self.offsets_map.clear();
|
||||
self.inner.evaluate_filters(pk, |column_id, _| {
|
||||
if let Some(offset) = self.codec.has_column(pk, &mut self.offsets_map, column_id) {
|
||||
self.codec.decode_value_at(pk, offset, column_id)
|
||||
} else {
|
||||
Ok(Value::Null)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::SemanticType;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::Timestamp;
|
||||
use datafusion::execution::context::ExecutionProps;
|
||||
use datafusion::logical_expr::{col, lit, BinaryExpr};
|
||||
use datafusion::physical_expr::create_physical_expr;
|
||||
use datafusion_common::{Column, DFSchema, ScalarValue};
|
||||
use datafusion_expr::{Expr, Operator};
|
||||
use datatypes::arrow::datatypes::{DataType, Field, Schema};
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::ColumnSchema;
|
||||
use datatypes::value::{OrderedFloat, Value, ValueRef};
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
|
||||
use store_api::metric_engine_consts::{
|
||||
DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME,
|
||||
};
|
||||
use store_api::storage::consts::ReservedColumnId;
|
||||
use store_api::storage::{ColumnId, RegionId};
|
||||
|
||||
use super::*;
|
||||
use crate::row_converter::PrimaryKeyCodecExt;
|
||||
|
||||
fn setup_metadata() -> RegionMetadataRef {
|
||||
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
|
||||
builder
|
||||
.push_column_metadata(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new("pod", ConcreteDataType::string_datatype(), true),
|
||||
semantic_type: SemanticType::Tag,
|
||||
column_id: 1,
|
||||
})
|
||||
.push_column_metadata(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new(
|
||||
"namespace",
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
),
|
||||
semantic_type: SemanticType::Tag,
|
||||
column_id: 2,
|
||||
})
|
||||
.push_column_metadata(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new(
|
||||
"container",
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
),
|
||||
semantic_type: SemanticType::Tag,
|
||||
column_id: 3,
|
||||
})
|
||||
.push_column_metadata(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new(
|
||||
"greptime_value",
|
||||
ConcreteDataType::float64_datatype(),
|
||||
false,
|
||||
),
|
||||
semantic_type: SemanticType::Field,
|
||||
column_id: 4,
|
||||
})
|
||||
.push_column_metadata(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new(
|
||||
"greptime_timestamp",
|
||||
ConcreteDataType::timestamp_nanosecond_datatype(),
|
||||
false,
|
||||
),
|
||||
semantic_type: SemanticType::Timestamp,
|
||||
column_id: 5,
|
||||
})
|
||||
.primary_key(vec![1, 2, 3]);
|
||||
let metadata = builder.build().unwrap();
|
||||
Arc::new(metadata)
|
||||
}
|
||||
|
||||
fn create_test_row() -> Vec<(ColumnId, ValueRef<'static>)> {
|
||||
vec![
|
||||
(1, ValueRef::String("greptime-frontend-6989d9899-22222")),
|
||||
(2, ValueRef::String("greptime-cluster")),
|
||||
(3, ValueRef::String("greptime-frontend-6989d9899-22222")),
|
||||
]
|
||||
}
|
||||
|
||||
fn create_filter(column_name: &str, value: &str) -> SimpleFilterEvaluator {
|
||||
let expr = Expr::BinaryExpr(BinaryExpr {
|
||||
left: Box::new(Expr::Column(Column {
|
||||
relation: None,
|
||||
name: column_name.to_string(),
|
||||
})),
|
||||
op: Operator::Eq,
|
||||
right: Box::new(Expr::Literal(ScalarValue::Utf8(Some(value.to_string())))),
|
||||
});
|
||||
SimpleFilterEvaluator::try_new(&expr).unwrap()
|
||||
}
|
||||
|
||||
fn encode_sparse_pk(
|
||||
metadata: &RegionMetadataRef,
|
||||
row: Vec<(ColumnId, ValueRef<'static>)>,
|
||||
) -> Vec<u8> {
|
||||
let codec = SparsePrimaryKeyCodec::new(metadata);
|
||||
let mut pk = Vec::new();
|
||||
codec.encode_to_vec(row.into_iter(), &mut pk).unwrap();
|
||||
pk
|
||||
}
|
||||
|
||||
fn encode_dense_pk(
|
||||
metadata: &RegionMetadataRef,
|
||||
row: Vec<(ColumnId, ValueRef<'static>)>,
|
||||
) -> Vec<u8> {
|
||||
let codec = DensePrimaryKeyCodec::new(metadata);
|
||||
let mut pk = Vec::new();
|
||||
codec
|
||||
.encode_to_vec(row.into_iter().map(|(_, v)| v), &mut pk)
|
||||
.unwrap();
|
||||
pk
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sparse_primary_key_filter_matches() {
|
||||
let metadata = setup_metadata();
|
||||
let filters = Arc::new(vec![create_filter(
|
||||
"pod",
|
||||
"greptime-frontend-6989d9899-22222",
|
||||
)]);
|
||||
let pk = encode_sparse_pk(&metadata, create_test_row());
|
||||
let codec = SparsePrimaryKeyCodec::new(&metadata);
|
||||
let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec);
|
||||
assert!(filter.matches(&pk));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sparse_primary_key_filter_not_matches() {
|
||||
let metadata = setup_metadata();
|
||||
let filters = Arc::new(vec![create_filter(
|
||||
"pod",
|
||||
"greptime-frontend-6989d9899-22223",
|
||||
)]);
|
||||
let pk = encode_sparse_pk(&metadata, create_test_row());
|
||||
let codec = SparsePrimaryKeyCodec::new(&metadata);
|
||||
let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec);
|
||||
assert!(!filter.matches(&pk));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sparse_primary_key_filter_matches_with_null() {
|
||||
let metadata = setup_metadata();
|
||||
let filters = Arc::new(vec![create_filter(
|
||||
"non-exist-label",
|
||||
"greptime-frontend-6989d9899-22222",
|
||||
)]);
|
||||
let pk = encode_sparse_pk(&metadata, create_test_row());
|
||||
let codec = SparsePrimaryKeyCodec::new(&metadata);
|
||||
let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec);
|
||||
assert!(filter.matches(&pk));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dense_primary_key_filter_matches() {
|
||||
let metadata = setup_metadata();
|
||||
let filters = Arc::new(vec![create_filter(
|
||||
"pod",
|
||||
"greptime-frontend-6989d9899-22222",
|
||||
)]);
|
||||
let pk = encode_dense_pk(&metadata, create_test_row());
|
||||
let codec = DensePrimaryKeyCodec::new(&metadata);
|
||||
let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
|
||||
assert!(filter.matches(&pk));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dense_primary_key_filter_not_matches() {
|
||||
let metadata = setup_metadata();
|
||||
let filters = Arc::new(vec![create_filter(
|
||||
"pod",
|
||||
"greptime-frontend-6989d9899-22223",
|
||||
)]);
|
||||
let pk = encode_dense_pk(&metadata, create_test_row());
|
||||
let codec = DensePrimaryKeyCodec::new(&metadata);
|
||||
let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
|
||||
assert!(!filter.matches(&pk));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dense_primary_key_filter_matches_with_null() {
|
||||
let metadata = setup_metadata();
|
||||
let filters = Arc::new(vec![create_filter(
|
||||
"non-exist-label",
|
||||
"greptime-frontend-6989d9899-22222",
|
||||
)]);
|
||||
let pk = encode_dense_pk(&metadata, create_test_row());
|
||||
let codec = DensePrimaryKeyCodec::new(&metadata);
|
||||
let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
|
||||
assert!(filter.matches(&pk));
|
||||
}
|
||||
}
|
||||
@@ -26,10 +26,10 @@ use crate::memtable::partition_tree::data::{
|
||||
};
|
||||
use crate::memtable::partition_tree::dict::KeyDictRef;
|
||||
use crate::memtable::partition_tree::merger::{Merger, Node};
|
||||
use crate::memtable::partition_tree::partition::PrimaryKeyFilter;
|
||||
use crate::memtable::partition_tree::shard_builder::ShardBuilderReader;
|
||||
use crate::memtable::partition_tree::{PkId, PkIndex, ShardId};
|
||||
use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
|
||||
use crate::row_converter::PrimaryKeyFilter;
|
||||
|
||||
/// Shard stores data related to the same key dictionary.
|
||||
pub struct Shard {
|
||||
@@ -146,7 +146,10 @@ pub struct ShardReaderBuilder {
|
||||
}
|
||||
|
||||
impl ShardReaderBuilder {
|
||||
pub(crate) fn build(self, key_filter: Option<PrimaryKeyFilter>) -> Result<ShardReader> {
|
||||
pub(crate) fn build(
|
||||
self,
|
||||
key_filter: Option<Box<dyn PrimaryKeyFilter>>,
|
||||
) -> Result<ShardReader> {
|
||||
let ShardReaderBuilder {
|
||||
shard_id,
|
||||
key_dict,
|
||||
@@ -163,7 +166,7 @@ pub struct ShardReader {
|
||||
shard_id: ShardId,
|
||||
key_dict: Option<KeyDictRef>,
|
||||
parts_reader: DataPartsReader,
|
||||
key_filter: Option<PrimaryKeyFilter>,
|
||||
key_filter: Option<Box<dyn PrimaryKeyFilter>>,
|
||||
last_yield_pk_index: Option<PkIndex>,
|
||||
keys_before_pruning: usize,
|
||||
keys_after_pruning: usize,
|
||||
@@ -176,7 +179,7 @@ impl ShardReader {
|
||||
shard_id: ShardId,
|
||||
key_dict: Option<KeyDictRef>,
|
||||
parts_reader: DataPartsReader,
|
||||
key_filter: Option<PrimaryKeyFilter>,
|
||||
key_filter: Option<Box<dyn PrimaryKeyFilter>>,
|
||||
data_build_cost: Duration,
|
||||
) -> Result<Self> {
|
||||
let has_pk = key_dict.is_some();
|
||||
@@ -240,7 +243,7 @@ impl ShardReader {
|
||||
// Safety: `key_filter` is some so the shard has primary keys.
|
||||
let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index);
|
||||
let now = Instant::now();
|
||||
if key_filter.prune_primary_key(key) {
|
||||
if key_filter.matches(key) {
|
||||
self.prune_pk_cost += now.elapsed();
|
||||
self.last_yield_pk_index = Some(pk_index);
|
||||
self.keys_after_pruning += 1;
|
||||
|
||||
@@ -26,11 +26,11 @@ use crate::memtable::partition_tree::data::{
|
||||
DataBatch, DataBuffer, DataBufferReader, DataBufferReaderBuilder, DataParts, DATA_INIT_CAP,
|
||||
};
|
||||
use crate::memtable::partition_tree::dict::{DictBuilderReader, KeyDictBuilder};
|
||||
use crate::memtable::partition_tree::partition::PrimaryKeyFilter;
|
||||
use crate::memtable::partition_tree::shard::Shard;
|
||||
use crate::memtable::partition_tree::{PartitionTreeConfig, PkId, PkIndex, ShardId};
|
||||
use crate::memtable::stats::WriteMetrics;
|
||||
use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
|
||||
use crate::row_converter::PrimaryKeyFilter;
|
||||
|
||||
/// Builder to write keys and data to a shard that the key dictionary
|
||||
/// is still active.
|
||||
@@ -189,7 +189,7 @@ impl ShardBuilderReaderBuilder {
|
||||
pub(crate) fn build(
|
||||
self,
|
||||
pk_weights: Option<&[u16]>,
|
||||
key_filter: Option<PrimaryKeyFilter>,
|
||||
key_filter: Option<Box<dyn PrimaryKeyFilter>>,
|
||||
) -> Result<ShardBuilderReader> {
|
||||
let now = Instant::now();
|
||||
let data_reader = self.data_reader.build(pk_weights)?;
|
||||
@@ -208,7 +208,7 @@ pub struct ShardBuilderReader {
|
||||
shard_id: ShardId,
|
||||
dict_reader: DictBuilderReader,
|
||||
data_reader: DataBufferReader,
|
||||
key_filter: Option<PrimaryKeyFilter>,
|
||||
key_filter: Option<Box<dyn PrimaryKeyFilter>>,
|
||||
last_yield_pk_index: Option<PkIndex>,
|
||||
keys_before_pruning: usize,
|
||||
keys_after_pruning: usize,
|
||||
@@ -221,7 +221,7 @@ impl ShardBuilderReader {
|
||||
shard_id: ShardId,
|
||||
dict_reader: DictBuilderReader,
|
||||
data_reader: DataBufferReader,
|
||||
key_filter: Option<PrimaryKeyFilter>,
|
||||
key_filter: Option<Box<dyn PrimaryKeyFilter>>,
|
||||
data_build_cost: Duration,
|
||||
) -> Result<Self> {
|
||||
let mut reader = ShardBuilderReader {
|
||||
@@ -281,7 +281,7 @@ impl ShardBuilderReader {
|
||||
self.keys_before_pruning += 1;
|
||||
let key = self.dict_reader.key_by_pk_index(pk_index);
|
||||
let now = Instant::now();
|
||||
if key_filter.prune_primary_key(key) {
|
||||
if key_filter.matches(key) {
|
||||
self.prune_pk_cost += now.elapsed();
|
||||
self.last_yield_pk_index = Some(pk_index);
|
||||
self.keys_after_pruning += 1;
|
||||
|
||||
@@ -27,7 +27,7 @@ use memcomparable::Serializer;
|
||||
use serde::Serialize;
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
use store_api::storage::{ColumnId, SequenceNumber};
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::error::{PrimaryKeyLengthMismatchSnafu, Result, SerializeFieldSnafu};
|
||||
@@ -43,7 +43,7 @@ use crate::metrics::{PARTITION_TREE_READ_STAGE_ELAPSED, READ_ROWS_TOTAL, READ_ST
|
||||
use crate::read::dedup::LastNonNullIter;
|
||||
use crate::read::Batch;
|
||||
use crate::region::options::MergeMode;
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
use crate::row_converter::{PrimaryKeyCodec, SortField};
|
||||
|
||||
/// The partition tree.
|
||||
pub struct PartitionTree {
|
||||
@@ -52,7 +52,7 @@ pub struct PartitionTree {
|
||||
/// Metadata of the region.
|
||||
pub(crate) metadata: RegionMetadataRef,
|
||||
/// Primary key codec.
|
||||
row_codec: Arc<McmpRowCodec>,
|
||||
row_codec: Arc<dyn PrimaryKeyCodec>,
|
||||
/// Partitions in the tree.
|
||||
partitions: RwLock<BTreeMap<PartitionKey, PartitionRef>>,
|
||||
/// Whether the tree has multiple partitions.
|
||||
@@ -65,16 +65,11 @@ pub struct PartitionTree {
|
||||
impl PartitionTree {
|
||||
/// Creates a new partition tree.
|
||||
pub fn new(
|
||||
row_codec: Arc<dyn PrimaryKeyCodec>,
|
||||
metadata: RegionMetadataRef,
|
||||
config: &PartitionTreeConfig,
|
||||
write_buffer_manager: Option<WriteBufferManagerRef>,
|
||||
) -> PartitionTree {
|
||||
let row_codec = McmpRowCodec::new(
|
||||
metadata
|
||||
.primary_key_columns()
|
||||
.map(|c| SortField::new(c.column_schema.data_type.clone()))
|
||||
.collect(),
|
||||
);
|
||||
) -> Self {
|
||||
let sparse_encoder = SparseEncoder {
|
||||
fields: metadata
|
||||
.primary_key_columns()
|
||||
@@ -93,7 +88,7 @@ impl PartitionTree {
|
||||
PartitionTree {
|
||||
config,
|
||||
metadata,
|
||||
row_codec: Arc::new(row_codec),
|
||||
row_codec,
|
||||
partitions: Default::default(),
|
||||
is_partitioned,
|
||||
write_buffer_manager,
|
||||
@@ -141,7 +136,7 @@ impl PartitionTree {
|
||||
self.sparse_encoder
|
||||
.encode_to_vec(kv.primary_keys(), pk_buffer)?;
|
||||
} else {
|
||||
self.row_codec.encode_to_vec(kv.primary_keys(), pk_buffer)?;
|
||||
self.row_codec.encode_key_value(&kv, pk_buffer)?;
|
||||
}
|
||||
|
||||
// Write rows with
|
||||
@@ -191,7 +186,7 @@ impl PartitionTree {
|
||||
self.sparse_encoder
|
||||
.encode_to_vec(kv.primary_keys(), pk_buffer)?;
|
||||
} else {
|
||||
self.row_codec.encode_to_vec(kv.primary_keys(), pk_buffer)?;
|
||||
self.row_codec.encode_key_value(&kv, pk_buffer)?;
|
||||
}
|
||||
|
||||
// Write rows with
|
||||
@@ -207,6 +202,7 @@ impl PartitionTree {
|
||||
&self,
|
||||
projection: Option<&[ColumnId]>,
|
||||
predicate: Option<Predicate>,
|
||||
sequence: Option<SequenceNumber>,
|
||||
) -> Result<BoxedBatchIterator> {
|
||||
let start = Instant::now();
|
||||
// Creates the projection set.
|
||||
@@ -230,6 +226,7 @@ impl PartitionTree {
|
||||
let partitions = self.prune_partitions(&filters, &mut tree_iter_metric);
|
||||
|
||||
let mut iter = TreeIter {
|
||||
sequence,
|
||||
partitions,
|
||||
current_reader: None,
|
||||
metrics: tree_iter_metric,
|
||||
@@ -238,7 +235,7 @@ impl PartitionTree {
|
||||
self.metadata.clone(),
|
||||
self.row_codec.clone(),
|
||||
projection,
|
||||
filters,
|
||||
Arc::new(filters),
|
||||
);
|
||||
iter.fetch_next_partition(context)?;
|
||||
|
||||
@@ -278,7 +275,12 @@ impl PartitionTree {
|
||||
|| self.metadata.column_metadatas != metadata.column_metadatas
|
||||
{
|
||||
// The schema has changed, we can't reuse the tree.
|
||||
return PartitionTree::new(metadata, &self.config, self.write_buffer_manager.clone());
|
||||
return PartitionTree::new(
|
||||
self.row_codec.clone(),
|
||||
metadata,
|
||||
&self.config,
|
||||
self.write_buffer_manager.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let mut total_shared_size = 0;
|
||||
@@ -353,7 +355,7 @@ impl PartitionTree {
|
||||
|
||||
partition.write_with_key(
|
||||
primary_key,
|
||||
&self.row_codec,
|
||||
self.row_codec.as_ref(),
|
||||
key_value,
|
||||
self.is_partitioned, // If tree is partitioned, re-encode is required to get the full primary key.
|
||||
metrics,
|
||||
@@ -451,6 +453,8 @@ struct TreeIterMetrics {
|
||||
}
|
||||
|
||||
struct TreeIter {
|
||||
/// Optional Sequence number of the current reader which limit results batch to lower than this sequence number.
|
||||
sequence: Option<SequenceNumber>,
|
||||
partitions: VecDeque<PartitionRef>,
|
||||
current_reader: Option<PartitionReader>,
|
||||
metrics: TreeIterMetrics,
|
||||
@@ -519,6 +523,8 @@ impl TreeIter {
|
||||
if part_reader.is_valid() {
|
||||
self.metrics.rows_fetched += batch.num_rows();
|
||||
self.metrics.batches_fetched += 1;
|
||||
let mut batch = batch;
|
||||
batch.filter_by_sequence(self.sequence)?;
|
||||
return Ok(Some(batch));
|
||||
}
|
||||
|
||||
@@ -529,6 +535,8 @@ impl TreeIter {
|
||||
|
||||
self.metrics.rows_fetched += batch.num_rows();
|
||||
self.metrics.batches_fetched += 1;
|
||||
let mut batch = batch;
|
||||
batch.filter_by_sequence(self.sequence)?;
|
||||
Ok(Some(batch))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -482,7 +482,7 @@ mod tests {
|
||||
partitions.list_memtables(&mut memtables);
|
||||
assert_eq!(0, memtables[0].id());
|
||||
|
||||
let iter = memtables[0].iter(None, None).unwrap();
|
||||
let iter = memtables[0].iter(None, None, None).unwrap();
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
assert_eq!(&[1000, 3000, 5000, 6000, 7000], ×tamps[..]);
|
||||
}
|
||||
@@ -520,7 +520,7 @@ mod tests {
|
||||
|
||||
let mut memtables = Vec::new();
|
||||
partitions.list_memtables(&mut memtables);
|
||||
let iter = memtables[0].iter(None, None).unwrap();
|
||||
let iter = memtables[0].iter(None, None, None).unwrap();
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
assert_eq!(&[0, 2000, 3000, 4000, 5000, 7000], ×tamps[..]);
|
||||
let parts = partitions.list_partitions();
|
||||
@@ -572,7 +572,7 @@ mod tests {
|
||||
let partitions = new_multi_partitions(&metadata);
|
||||
|
||||
let parts = partitions.list_partitions();
|
||||
let iter = parts[0].memtable.iter(None, None).unwrap();
|
||||
let iter = parts[0].memtable.iter(None, None, None).unwrap();
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
assert_eq!(0, parts[0].memtable.id());
|
||||
assert_eq!(
|
||||
@@ -584,7 +584,7 @@ mod tests {
|
||||
parts[0].time_range.unwrap().max_timestamp
|
||||
);
|
||||
assert_eq!(&[0, 2000, 3000, 4000], ×tamps[..]);
|
||||
let iter = parts[1].memtable.iter(None, None).unwrap();
|
||||
let iter = parts[1].memtable.iter(None, None, None).unwrap();
|
||||
assert_eq!(1, parts[1].memtable.id());
|
||||
let timestamps = collect_iter_timestamps(iter);
|
||||
assert_eq!(&[5000, 7000], ×tamps[..]);
|
||||
|
||||
@@ -33,7 +33,7 @@ use datatypes::vectors::{
|
||||
};
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::ColumnId;
|
||||
use store_api::storage::{ColumnId, SequenceNumber};
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::error::{
|
||||
@@ -51,7 +51,7 @@ use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
|
||||
use crate::read::dedup::LastNonNullIter;
|
||||
use crate::read::{Batch, BatchBuilder, BatchColumn};
|
||||
use crate::region::options::MergeMode;
|
||||
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
|
||||
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodec, PrimaryKeyCodecExt};
|
||||
|
||||
/// Initial vector builder capacity.
|
||||
const INITIAL_BUILDER_CAPACITY: usize = 0;
|
||||
@@ -95,7 +95,7 @@ impl MemtableBuilder for TimeSeriesMemtableBuilder {
|
||||
pub struct TimeSeriesMemtable {
|
||||
id: MemtableId,
|
||||
region_metadata: RegionMetadataRef,
|
||||
row_codec: Arc<McmpRowCodec>,
|
||||
row_codec: Arc<DensePrimaryKeyCodec>,
|
||||
series_set: SeriesSet,
|
||||
alloc_tracker: AllocTracker,
|
||||
max_timestamp: AtomicI64,
|
||||
@@ -115,12 +115,7 @@ impl TimeSeriesMemtable {
|
||||
dedup: bool,
|
||||
merge_mode: MergeMode,
|
||||
) -> Self {
|
||||
let row_codec = Arc::new(McmpRowCodec::new(
|
||||
region_metadata
|
||||
.primary_key_columns()
|
||||
.map(|c| SortField::new(c.column_schema.data_type.clone()))
|
||||
.collect(),
|
||||
));
|
||||
let row_codec = Arc::new(DensePrimaryKeyCodec::new(®ion_metadata));
|
||||
let series_set = SeriesSet::new(region_metadata.clone(), row_codec.clone());
|
||||
let dedup = if merge_mode == MergeMode::LastNonNull {
|
||||
false
|
||||
@@ -241,6 +236,7 @@ impl Memtable for TimeSeriesMemtable {
|
||||
&self,
|
||||
projection: Option<&[ColumnId]>,
|
||||
filters: Option<Predicate>,
|
||||
sequence: Option<SequenceNumber>,
|
||||
) -> Result<BoxedBatchIterator> {
|
||||
let projection = if let Some(projection) = projection {
|
||||
projection.iter().copied().collect()

@@ -253,7 +249,7 @@ impl Memtable for TimeSeriesMemtable {

let iter = self
.series_set
.iter_series(projection, filters, self.dedup)?;
.iter_series(projection, filters, self.dedup, sequence)?;

if self.merge_mode == MergeMode::LastNonNull {
let iter = LastNonNullIter::new(iter);
@@ -267,6 +263,7 @@ impl Memtable for TimeSeriesMemtable {
&self,
projection: Option<&[ColumnId]>,
predicate: Option<Predicate>,
sequence: Option<SequenceNumber>,
) -> MemtableRanges {
let projection = if let Some(projection) = projection {
projection.iter().copied().collect()
@@ -282,6 +279,7 @@ impl Memtable for TimeSeriesMemtable {
predicate,
dedup: self.dedup,
merge_mode: self.merge_mode,
sequence,
});
let context = Arc::new(MemtableRangeContext::new(self.id, builder));

@@ -350,11 +348,11 @@ type SeriesRwLockMap = RwLock<BTreeMap<Vec<u8>, Arc<RwLock<Series>>>>;
struct SeriesSet {
region_metadata: RegionMetadataRef,
series: Arc<SeriesRwLockMap>,
codec: Arc<McmpRowCodec>,
codec: Arc<DensePrimaryKeyCodec>,
}

impl SeriesSet {
fn new(region_metadata: RegionMetadataRef, codec: Arc<McmpRowCodec>) -> Self {
fn new(region_metadata: RegionMetadataRef, codec: Arc<DensePrimaryKeyCodec>) -> Self {
Self {
region_metadata,
series: Default::default(),
@@ -389,6 +387,7 @@ impl SeriesSet {
projection: HashSet<ColumnId>,
predicate: Option<Predicate>,
dedup: bool,
sequence: Option<SequenceNumber>,
) -> Result<Iter> {
let primary_key_schema = primary_key_schema(&self.region_metadata);
let primary_key_datatypes = self
@@ -406,6 +405,7 @@ impl SeriesSet {
primary_key_datatypes,
self.codec.clone(),
dedup,
sequence,
)
}
}
@@ -451,8 +451,9 @@ struct Iter {
predicate: Vec<SimpleFilterEvaluator>,
pk_schema: arrow::datatypes::SchemaRef,
pk_datatypes: Vec<ConcreteDataType>,
codec: Arc<McmpRowCodec>,
codec: Arc<DensePrimaryKeyCodec>,
dedup: bool,
sequence: Option<SequenceNumber>,
metrics: Metrics,
}

@@ -465,8 +466,9 @@ impl Iter {
predicate: Option<Predicate>,
pk_schema: arrow::datatypes::SchemaRef,
pk_datatypes: Vec<ConcreteDataType>,
codec: Arc<McmpRowCodec>,
codec: Arc<DensePrimaryKeyCodec>,
dedup: bool,
sequence: Option<SequenceNumber>,
) -> Result<Self> {
let predicate = predicate
.map(|predicate| {
@@ -487,6 +489,7 @@ impl Iter {
pk_datatypes,
codec,
dedup,
sequence,
metrics: Metrics::default(),
})
}
@@ -551,6 +554,12 @@ impl Iterator for Iter {
self.metrics.num_batches += 1;
self.metrics.num_rows += batch.as_ref().map(|b| b.num_rows()).unwrap_or(0);
self.metrics.scan_cost += start.elapsed();

let mut batch = batch;
batch = batch.and_then(|mut batch| {
batch.filter_by_sequence(self.sequence)?;
Ok(batch)
});
return Some(batch);
}
self.metrics.scan_cost += start.elapsed();
@@ -560,7 +569,7 @@ impl Iterator for Iter {
}

fn prune_primary_key(
codec: &Arc<McmpRowCodec>,
codec: &Arc<DensePrimaryKeyCodec>,
pk: &[u8],
series: &mut Series,
datatypes: &[ConcreteDataType],
@@ -860,6 +869,7 @@ struct TimeSeriesIterBuilder {
projection: HashSet<ColumnId>,
predicate: Option<Predicate>,
dedup: bool,
sequence: Option<SequenceNumber>,
merge_mode: MergeMode,
}

@@ -869,6 +879,7 @@ impl IterBuilder for TimeSeriesIterBuilder {
self.projection.clone(),
self.predicate.clone(),
self.dedup,
self.sequence,
)?;

if self.merge_mode == MergeMode::LastNonNull {
@@ -896,6 +907,7 @@ mod tests {
use store_api::storage::RegionId;

use super::*;
use crate::row_converter::SortField;

fn schema_for_test() -> RegionMetadataRef {
let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
@@ -1160,7 +1172,7 @@ mod tests {
#[test]
fn test_series_set_concurrency() {
let schema = schema_for_test();
let row_codec = Arc::new(McmpRowCodec::new(
let row_codec = Arc::new(DensePrimaryKeyCodec::with_fields(
schema
.primary_key_columns()
.map(|c| SortField::new(c.column_schema.data_type.clone()))
@@ -1257,7 +1269,7 @@ mod tests {
*expected_ts.entry(ts).or_default() += if dedup { 1 } else { 2 };
}

let iter = memtable.iter(None, None).unwrap();
let iter = memtable.iter(None, None, None).unwrap();
let mut read = HashMap::new();

for ts in iter
@@ -1297,7 +1309,7 @@ mod tests {
let memtable = TimeSeriesMemtable::new(schema, 42, None, true, MergeMode::LastRow);
memtable.write(&kvs).unwrap();

let iter = memtable.iter(Some(&[3]), None).unwrap();
let iter = memtable.iter(Some(&[3]), None, None).unwrap();

let mut v0_all = vec![];
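Editor's note: the memtable hunks above thread an optional sequence cap from the scan request down to the per-series iterator, so a scan only sees rows written at or below that sequence. A minimal sketch of the idea in plain Rust, using invented Row/scan/filter_by_sequence names rather than GreptimeDB's real Batch and Iter types:

type SequenceNumber = u64;

#[derive(Debug, Clone)]
struct Row {
    ts: i64,
    seq: SequenceNumber,
    value: i64,
}

/// Keeps only rows whose sequence is <= the optional cap.
/// `None` means "no snapshot cap": every row stays visible.
fn filter_by_sequence(rows: &mut Vec<Row>, sequence: Option<SequenceNumber>) {
    if let Some(cap) = sequence {
        rows.retain(|row| row.seq <= cap);
    }
}

/// Iterator adapter that applies the cap to every batch it yields,
/// mirroring how the memtable iterator filters each batch it returns.
fn scan(
    batches: Vec<Vec<Row>>,
    sequence: Option<SequenceNumber>,
) -> impl Iterator<Item = Vec<Row>> {
    batches.into_iter().map(move |mut batch| {
        filter_by_sequence(&mut batch, sequence);
        batch
    })
}

fn main() {
    let batches = vec![vec![
        Row { ts: 1, seq: 10, value: 1 },
        Row { ts: 2, seq: 11, value: 2 },
        Row { ts: 3, seq: 12, value: 3 },
    ]];
    // Only rows written at or before sequence 11 are visible to this scan.
    let visible: Vec<Row> = scan(batches, Some(11)).flatten().collect();
    assert_eq!(visible.len(), 2);
    println!("{visible:?}");
}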
@@ -48,7 +48,8 @@ lazy_static! {
"greptime_mito_handle_request_elapsed",
"mito handle request elapsed",
&[TYPE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 60.0, 300.0]
// 0.01 ~ 10000
exponential_buckets(0.01, 10.0, 7).unwrap(),
)
.unwrap();

@@ -69,7 +70,8 @@ lazy_static! {
"greptime_mito_flush_elapsed",
"mito flush elapsed",
&[TYPE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
// 1 ~ 625
exponential_buckets(1.0, 5.0, 6).unwrap(),
)
.unwrap();
/// Histogram of flushed bytes.
@@ -99,7 +101,8 @@ lazy_static! {
"greptime_mito_write_stage_elapsed",
"mito write stage elapsed",
&[STAGE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
// 0.01 ~ 1000
exponential_buckets(0.01, 10.0, 6).unwrap(),
)
.unwrap();
/// Counter of rows to write.
@@ -118,12 +121,18 @@ lazy_static! {
"greptime_mito_compaction_stage_elapsed",
"mito compaction stage elapsed",
&[STAGE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
// 1 ~ 100000
exponential_buckets(1.0, 10.0, 6).unwrap(),
)
.unwrap();
/// Timer of whole compaction task.
pub static ref COMPACTION_ELAPSED_TOTAL: Histogram =
register_histogram!("greptime_mito_compaction_total_elapsed", "mito compaction total elapsed").unwrap();
register_histogram!(
"greptime_mito_compaction_total_elapsed",
"mito compaction total elapsed",
// 1 ~ 100000
exponential_buckets(1.0, 10.0, 6).unwrap(),
).unwrap();
/// Counter of all requested compaction task.
pub static ref COMPACTION_REQUEST_COUNT: IntCounter =
register_int_counter!("greptime_mito_compaction_requests_total", "mito compaction requests total").unwrap();
@@ -145,7 +154,8 @@ lazy_static! {
"greptime_mito_read_stage_elapsed",
"mito read stage elapsed",
&[STAGE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
// 0.01 ~ 10000
exponential_buckets(0.01, 10.0, 7).unwrap(),
)
.unwrap();
pub static ref READ_STAGE_FETCH_PAGES: Histogram = READ_STAGE_ELAPSED.with_label_values(&["fetch_pages"]);
@@ -222,6 +232,8 @@ lazy_static! {
"mito_write_cache_download_elapsed",
"mito write cache download elapsed",
&[TYPE_LABEL],
// 0.1 ~ 10000
exponential_buckets(0.1, 10.0, 6).unwrap(),
).unwrap();
/// Upload bytes counter.
pub static ref UPLOAD_BYTES_TOTAL: IntCounter = register_int_counter!(
@@ -243,6 +255,8 @@ lazy_static! {
"greptime_index_apply_elapsed",
"index apply elapsed",
&[TYPE_LABEL],
// 0.01 ~ 1000
exponential_buckets(0.01, 10.0, 6).unwrap(),
)
.unwrap();
/// Gauge of index apply memory usage.
@@ -256,7 +270,8 @@ lazy_static! {
"greptime_index_create_elapsed",
"index create elapsed",
&[STAGE_LABEL, TYPE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
// 0.1 ~ 10000
exponential_buckets(0.1, 10.0, 6).unwrap(),
)
.unwrap();
/// Counter of rows indexed.
@@ -337,7 +352,8 @@ lazy_static! {
"greptime_partition_tree_buffer_freeze_stage_elapsed",
"mito partition tree data buffer freeze stage elapsed",
&[STAGE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0]
// 0.01 ~ 1000
exponential_buckets(0.01, 10.0, 6).unwrap(),
)
.unwrap();

@@ -346,7 +362,8 @@ lazy_static! {
"greptime_partition_tree_read_stage_elapsed",
"mito partition tree read stage elapsed",
&[STAGE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0]
// 0.01 ~ 1000
exponential_buckets(0.01, 10.0, 6).unwrap(),
)
.unwrap();

@@ -359,6 +376,8 @@ lazy_static! {
pub static ref MANIFEST_OP_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_manifest_op_elapsed",
"mito manifest operation elapsed",
&["op"]
&["op"],
// 0.01 ~ 1000
exponential_buckets(0.01, 10.0, 6).unwrap(),
).unwrap();
}
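Editor's note: these metric hunks replace hand-written bucket vectors with calls like exponential_buckets(start, factor, count), which produce count upper bounds starting at start and multiplying by factor each step; the "// 0.01 ~ 10000" style comments record the resulting range. A quick way to check such a range without pulling in the prometheus crate is to recompute it yourself; the helper below is a stand-in for illustration, not the crate's implementation:

/// Recomputes exponential bucket upper bounds: `count` values starting at
/// `start`, each `factor` times the previous one.
fn exponential_buckets(start: f64, factor: f64, count: usize) -> Vec<f64> {
    (0..count).map(|i| start * factor.powi(i as i32)).collect()
}

fn main() {
    // Matches the "// 0.01 ~ 10000" comment above (up to float rounding):
    // 0.01, 0.1, 1, 10, 100, 1000, 10000.
    println!("{:?}", exponential_buckets(0.01, 10.0, 7));
}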
@@ -35,7 +35,7 @@ use async_trait::async_trait;
use common_time::Timestamp;
use datafusion_common::arrow::array::UInt8Array;
use datatypes::arrow;
use datatypes::arrow::array::{Array, ArrayRef};
use datatypes::arrow::array::{Array, ArrayRef, UInt64Array};
use datatypes::arrow::compute::SortOptions;
use datatypes::arrow::row::{RowConverter, SortField};
use datatypes::prelude::{ConcreteDataType, DataType, ScalarVector};
@@ -334,6 +334,24 @@ impl Batch {
Ok(())
}

/// Filters rows by the given `sequence`. Only preserves rows with sequence less than or equal to `sequence`.
pub fn filter_by_sequence(&mut self, sequence: Option<SequenceNumber>) -> Result<()> {
let seq = match (sequence, self.last_sequence()) {
(None, _) | (_, None) => return Ok(()),
(Some(sequence), Some(last_sequence)) if sequence >= last_sequence => return Ok(()),
(Some(sequence), Some(_)) => sequence,
};

let seqs = self.sequences.as_arrow();
let sequence = UInt64Array::new_scalar(seq);
let predicate = datafusion_common::arrow::compute::kernels::cmp::lt_eq(seqs, &sequence)
.context(ComputeArrowSnafu)?;
let predicate = BooleanVector::from(predicate);
self.filter(&predicate)?;

Ok(())
}

/// Sorts rows in the batch. If `dedup` is true, it also removes
/// duplicated rows according to primary keys.
///
@@ -1212,6 +1230,57 @@ mod tests {
assert_eq!(expect, batch);
}

#[test]
fn test_filter_by_sequence() {
// Filters put only.
let mut batch = new_batch(
&[1, 2, 3, 4],
&[11, 12, 13, 14],
&[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
&[21, 22, 23, 24],
);
batch.filter_by_sequence(Some(13)).unwrap();
let expect = new_batch(
&[1, 2, 3],
&[11, 12, 13],
&[OpType::Put, OpType::Put, OpType::Put],
&[21, 22, 23],
);
assert_eq!(expect, batch);

// Filters to empty.
let mut batch = new_batch(
&[1, 2, 3, 4],
&[11, 12, 13, 14],
&[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
&[21, 22, 23, 24],
);

batch.filter_by_sequence(Some(10)).unwrap();
assert!(batch.is_empty());

// None filter.
let mut batch = new_batch(
&[1, 2, 3, 4],
&[11, 12, 13, 14],
&[OpType::Put, OpType::Delete, OpType::Put, OpType::Put],
&[21, 22, 23, 24],
);
let expect = batch.clone();
batch.filter_by_sequence(None).unwrap();
assert_eq!(expect, batch);

// Filter a empty batch
let mut batch = new_batch(&[], &[], &[], &[]);
batch.filter_by_sequence(Some(10)).unwrap();
assert!(batch.is_empty());

// Filter a empty batch with None
let mut batch = new_batch(&[], &[], &[], &[]);
batch.filter_by_sequence(None).unwrap();
assert!(batch.is_empty());
}

#[test]
fn test_filter() {
// Filters put only.
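Editor's note: filter_by_sequence has two early-return fast paths: no cap was requested, or the batch's last (largest) sequence is already within the cap, in which case the comparison kernel never runs. Here is a minimal sketch of the same control flow over a plain vector; the SequenceColumn type is illustrative only and stands in for the batch's arrow sequence column:

type SequenceNumber = u64;

/// Stand-in for the sequence column of a batch; sequences are ascending.
struct SequenceColumn(Vec<SequenceNumber>);

impl SequenceColumn {
    fn last_sequence(&self) -> Option<SequenceNumber> {
        self.0.last().copied()
    }

    /// Keeps only rows with sequence <= `sequence`, mirroring the fast paths above:
    /// a `None` cap, an empty column, or a cap at/after the last sequence leaves
    /// the column untouched.
    fn filter_by_sequence(&mut self, sequence: Option<SequenceNumber>) {
        let cap = match (sequence, self.last_sequence()) {
            (None, _) | (_, None) => return,
            (Some(cap), Some(last)) if cap >= last => return,
            (Some(cap), Some(_)) => cap,
        };
        self.0.retain(|seq| *seq <= cap);
    }
}

fn main() {
    let mut col = SequenceColumn(vec![11, 12, 13, 14]);
    col.filter_by_sequence(Some(13));
    assert_eq!(col.0, vec![11, 12, 13]);

    let mut col = SequenceColumn(vec![11, 12, 13, 14]);
    col.filter_by_sequence(None); // no cap: unchanged
    assert_eq!(col.0.len(), 4);
}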
@@ -26,7 +26,7 @@ use store_api::storage::ColumnId;
use crate::error::{CompatReaderSnafu, CreateDefaultSnafu, Result};
use crate::read::projection::ProjectionMapper;
use crate::read::{Batch, BatchColumn, BatchReader};
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodec, SortField};

/// Reader to adapt schema of underlying reader to expected schema.
pub struct CompatReader<R> {
@@ -127,7 +127,7 @@ pub(crate) fn has_same_columns(left: &RegionMetadata, right: &RegionMetadata) ->
#[derive(Debug)]
struct CompatPrimaryKey {
/// Row converter to append values to primary keys.
converter: McmpRowCodec,
converter: DensePrimaryKeyCodec,
/// Default values to append.
values: Vec<Value>,
}
@@ -138,10 +138,7 @@ impl CompatPrimaryKey {
let mut buffer =
Vec::with_capacity(batch.primary_key().len() + self.converter.estimated_size());
buffer.extend_from_slice(batch.primary_key());
self.converter.encode_to_vec(
self.values.iter().map(|value| value.as_value_ref()),
&mut buffer,
)?;
self.converter.encode_values(&self.values, &mut buffer)?;

batch.set_primary_key(buffer);

@@ -268,7 +265,7 @@ fn may_compat_primary_key(
})?;
values.push(default_value);
}
let converter = McmpRowCodec::new(fields);
let converter = DensePrimaryKeyCodec::with_fields(fields);

Ok(Some(CompatPrimaryKey { converter, values }))
}
@@ -366,6 +363,7 @@ mod tests {
use store_api::storage::RegionId;

use super::*;
use crate::row_converter::PrimaryKeyCodecExt;
use crate::test_util::{check_reader_result, VecBatchReader};

/// Creates a new [RegionMetadata].
@@ -400,7 +398,7 @@ mod tests {
let fields = (0..keys.len())
.map(|_| SortField::new(ConcreteDataType::string_datatype()))
.collect();
let converter = McmpRowCodec::new(fields);
let converter = DensePrimaryKeyCodec::with_fields(fields);
let row = keys.iter().map(|str_opt| match str_opt {
Some(v) => ValueRef::String(v),
None => ValueRef::Null,
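Editor's note: the CompatPrimaryKey change swaps a generic encode_to_vec over value refs for an encode_values call that appends encoded defaults directly onto the existing key bytes. The underlying pattern (reserve once, copy the old key, append the newly encoded suffix) is simple; below is a toy version with a hypothetical fixed-width codec, unrelated to GreptimeDB's real, order-preserving encoding:

/// Toy codec that encodes each value as 8 big-endian bytes, only to show the
/// "append defaults to an existing key buffer" pattern.
struct ToyCodec;

impl ToyCodec {
    fn estimated_size(&self) -> usize {
        8
    }

    /// Appends the encoded values to `buffer` instead of allocating a new one.
    fn encode_values(&self, values: &[u64], buffer: &mut Vec<u8>) {
        for value in values {
            buffer.extend_from_slice(&value.to_be_bytes());
        }
    }
}

fn main() {
    let old_key = vec![0xAA, 0xBB]; // key written under the old schema
    let defaults = vec![42u64]; // default for a tag column added later

    let codec = ToyCodec;
    let mut buffer =
        Vec::with_capacity(old_key.len() + codec.estimated_size() * defaults.len());
    buffer.extend_from_slice(&old_key);
    codec.encode_values(&defaults, &mut buffer);

    assert_eq!(buffer.len(), old_key.len() + 8);
    println!("extended key: {buffer:?}");
}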
@@ -33,7 +33,7 @@ use store_api::storage::ColumnId;
use crate::cache::CacheStrategy;
use crate::error::{InvalidRequestSnafu, Result};
use crate::read::Batch;
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodec};

/// Only cache vector when its length `<=` this value.
const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
@@ -47,7 +47,7 @@ pub struct ProjectionMapper {
/// Output record batch contains tags.
has_tags: bool,
/// Decoder for primary key.
codec: McmpRowCodec,
codec: DensePrimaryKeyCodec,
/// Schema for converted [RecordBatch].
output_schema: SchemaRef,
/// Ids of columns to project. It keeps ids in the same order as the `projection`
@@ -80,12 +80,7 @@ impl ProjectionMapper {
// Safety: idx is valid.
column_schemas.push(metadata.schema.column_schemas()[*idx].clone());
}
let codec = McmpRowCodec::new(
metadata
.primary_key_columns()
.map(|c| SortField::new(c.column_schema.data_type.clone()))
.collect(),
);
let codec = DensePrimaryKeyCodec::new(metadata);
// Safety: Columns come from existing schema.
let output_schema = Arc::new(Schema::new(column_schemas));
// Get fields in each batch.
@@ -186,7 +181,7 @@ impl ProjectionMapper {
Some(v) => v.to_vec(),
None => self
.codec
.decode(batch.primary_key())
.decode_dense(batch.primary_key())
.map_err(BoxedError::new)
.context(ExternalSnafu)?,
}
@@ -291,6 +286,7 @@ mod tests {
use super::*;
use crate::cache::CacheManager;
use crate::read::BatchBuilder;
use crate::row_converter::{PrimaryKeyCodecExt, SortField};
use crate::test_util::meta_util::TestRegionMetadataBuilder;

fn new_batch(
@@ -299,7 +295,7 @@ mod tests {
fields: &[(ColumnId, i64)],
num_rows: usize,
) -> Batch {
let converter = McmpRowCodec::new(
let converter = DensePrimaryKeyCodec::with_fields(
(0..tags.len())
.map(|_| SortField::new(ConcreteDataType::int64_datatype()))
.collect(),
@@ -300,6 +300,9 @@ impl ScanRegion {
if file_in_range(file, &time_range) {
files.push(file.clone());
}
// There is no need to check and prune for file's sequence here as the sequence number is usually very new,
// unless the timing is too good, or the sequence number wouldn't be in file.
// and the batch will be filtered out by tree reader anyway.
}
}

@@ -347,7 +350,11 @@ impl ScanRegion {
let memtables = memtables
.into_iter()
.map(|mem| {
let ranges = mem.ranges(Some(mapper.column_ids()), Some(predicate.clone()));
let ranges = mem.ranges(
Some(mapper.column_ids()),
Some(predicate.clone()),
self.request.sequence,
);
MemRangeBuilder::new(ranges)
})
.collect();
@@ -426,7 +433,7 @@ impl ScanRegion {
Some(file_cache)
}();

let index_cache = self.cache_strategy.index_cache().cloned();
let inverted_index_cache = self.cache_strategy.inverted_index_cache().cloned();

let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();

@@ -445,7 +452,7 @@ impl ScanRegion {
self.access_layer.puffin_manager_factory().clone(),
)
.with_file_cache(file_cache)
.with_index_cache(index_cache)
.with_inverted_index_cache(inverted_index_cache)
.with_puffin_metadata_cache(puffin_metadata_cache)
.build(&self.request.filters)
.inspect_err(|err| warn!(err; "Failed to build invereted index applier"))
@@ -466,7 +473,7 @@ impl ScanRegion {
Some(file_cache)
}();

let index_cache = self.cache_strategy.bloom_filter_index_cache().cloned();
let bloom_filter_index_cache = self.cache_strategy.bloom_filter_index_cache().cloned();

let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();

@@ -477,7 +484,7 @@ impl ScanRegion {
self.access_layer.puffin_manager_factory().clone(),
)
.with_file_cache(file_cache)
.with_bloom_filter_index_cache(index_cache)
.with_bloom_filter_index_cache(bloom_filter_index_cache)
.with_puffin_metadata_cache(puffin_metadata_cache)
.build(&self.request.filters)
.inspect_err(|err| warn!(err; "Failed to build bloom filter index applier"))
@@ -82,6 +82,7 @@ impl PartitionMetrics {
) -> Self {
let partition_str = partition.to_string();
let in_progress_scan = IN_PROGRESS_SCAN.with_label_values(&[scanner_type, &partition_str]);
in_progress_scan.inc();
let inner = PartitionMetricsInner {
region_id,
partition,
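Editor's note: the hunk above increments an in-progress-scan gauge when the partition metrics are created; the matching decrement presumably happens when the metrics are finished or dropped. A common way to make that pairing hard to get wrong is an RAII guard. A small sketch with a plain atomic counter standing in for the Prometheus gauge; the InProgressGuard name is invented for illustration:

use std::sync::atomic::{AtomicI64, Ordering};

/// Stand-in for an in-progress gauge: incremented on scan start,
/// decremented automatically when the guard is dropped.
static IN_PROGRESS_SCAN: AtomicI64 = AtomicI64::new(0);

struct InProgressGuard;

impl InProgressGuard {
    fn new() -> Self {
        IN_PROGRESS_SCAN.fetch_add(1, Ordering::Relaxed);
        InProgressGuard
    }
}

impl Drop for InProgressGuard {
    fn drop(&mut self) {
        IN_PROGRESS_SCAN.fetch_sub(1, Ordering::Relaxed);
    }
}

fn main() {
    {
        let _scan = InProgressGuard::new();
        assert_eq!(IN_PROGRESS_SCAN.load(Ordering::Relaxed), 1);
        // ... scan work happens while the guard is alive ...
    }
    // Guard dropped: gauge is back to zero even if the scan returned early.
    assert_eq!(IN_PROGRESS_SCAN.load(Ordering::Relaxed), 0);
}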
@@ -24,10 +24,10 @@ use futures::future::BoxFuture;
use futures::StreamExt;
use object_store::manager::ObjectStoreManagerRef;
use object_store::util::{join_dir, normalize_dir};
use snafu::{ensure, OptionExt};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::logstore::provider::Provider;
use store_api::logstore::LogStore;
use store_api::metadata::{ColumnMetadata, RegionMetadata};
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
use store_api::region_engine::RegionRole;
use store_api::storage::{ColumnId, RegionId};

@@ -35,7 +35,8 @@ use crate::access_layer::AccessLayer;
use crate::cache::CacheManagerRef;
use crate::config::MitoConfig;
use crate::error::{
EmptyRegionDirSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu, Result, StaleLogEntrySnafu,
EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
Result, StaleLogEntrySnafu,
};
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
use crate::manifest::storage::manifest_compress_type;
@@ -59,7 +60,7 @@ use crate::wal::{EntryId, Wal};
/// Builder to create a new [MitoRegion] or open an existing one.
pub(crate) struct RegionOpener {
region_id: RegionId,
metadata: Option<RegionMetadata>,
metadata_builder: Option<RegionMetadataBuilder>,
memtable_builder_provider: MemtableBuilderProvider,
object_store_manager: ObjectStoreManagerRef,
region_dir: String,
@@ -90,7 +91,7 @@ impl RegionOpener {
) -> RegionOpener {
RegionOpener {
region_id,
metadata: None,
metadata_builder: None,
memtable_builder_provider,
object_store_manager,
region_dir: normalize_dir(region_dir),
@@ -106,16 +107,27 @@ impl RegionOpener {
}
}

/// Sets metadata of the region to create.
pub(crate) fn metadata(mut self, metadata: RegionMetadata) -> Self {
self.metadata = Some(metadata);
/// Sets metadata builder of the region to create.
pub(crate) fn metadata_builder(mut self, builder: RegionMetadataBuilder) -> Self {
self.metadata_builder = Some(builder);
self
}

/// Builds the region metadata.
///
/// # Panics
/// - Panics if `options` is not set.
/// - Panics if `metadata_builder` is not set.
fn build_metadata(&mut self) -> Result<RegionMetadata> {
let options = self.options.as_ref().unwrap();
let mut metadata_builder = self.metadata_builder.take().unwrap();
metadata_builder.primary_key_encoding(options.primary_key_encoding());
metadata_builder.build().context(InvalidMetadataSnafu)
}

/// Parses and sets options for the region.
pub(crate) fn parse_options(mut self, options: HashMap<String, String>) -> Result<Self> {
self.options = Some(RegionOptions::try_from(&options)?);
Ok(self)
pub(crate) fn parse_options(self, options: HashMap<String, String>) -> Result<Self> {
self.options(RegionOptions::try_from(&options)?)
}

/// If a [WalEntryReader] is set, the [RegionOpener] will use [WalEntryReader] instead of
@@ -151,21 +163,21 @@ impl RegionOpener {
/// Opens the region if it already exists.
///
/// # Panics
/// - Panics if metadata is not set.
/// - Panics if options is not set.
/// - Panics if `metadata_builder` is not set.
/// - Panics if `options` is not set.
pub(crate) async fn create_or_open<S: LogStore>(
mut self,
config: &MitoConfig,
wal: &Wal<S>,
) -> Result<MitoRegion> {
let region_id = self.region_id;

let metadata = self.build_metadata()?;
// Tries to open the region.
match self.maybe_open(config, wal).await {
Ok(Some(region)) => {
let recovered = region.metadata();
// Checks the schema of the region.
let expect = self.metadata.as_ref().unwrap();
let expect = &metadata;
check_recovered_region(
&recovered,
expect.region_id,
@@ -189,13 +201,13 @@ impl RegionOpener {
);
}
}
// Safety: must be set before calling this method.
let options = self.options.take().unwrap();
let object_store = self.object_store(&options.storage)?.clone();
let provider = self.provider(&options.wal_options);

let metadata = Arc::new(metadata);
// Create a manifest manager for this region and writes regions to the manifest file.
let region_manifest_options = self.manifest_options(config, &options)?;
let metadata = Arc::new(self.metadata.unwrap());
let manifest_manager = RegionManifestManager::new(
metadata.clone(),
region_manifest_options,
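Editor's note: the RegionOpener change stores a RegionMetadataBuilder instead of a finished RegionMetadata, so region options parsed later (here the primary-key encoding) can still shape the metadata before it is built. The general "keep the builder, materialize at the last moment" pattern is easy to isolate; below is a stripped-down sketch with invented Options/Metadata types, not the real mito2 structs:

/// Invented stand-ins for the real RegionMetadata/RegionOptions types.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Encoding {
    Dense,
    Sparse,
}

#[derive(Debug)]
struct Metadata {
    encoding: Encoding,
}

struct MetadataBuilder {
    encoding: Encoding,
}

impl MetadataBuilder {
    fn new() -> Self {
        Self { encoding: Encoding::Dense }
    }
    fn primary_key_encoding(&mut self, encoding: Encoding) -> &mut Self {
        self.encoding = encoding;
        self
    }
    fn build(&self) -> Metadata {
        Metadata { encoding: self.encoding }
    }
}

struct Options {
    primary_key_encoding: Encoding,
}

/// Keeps the builder around and only materializes metadata once the options
/// are known, mirroring the deferred build in the diff above.
struct Opener {
    metadata_builder: Option<MetadataBuilder>,
    options: Option<Options>,
}

impl Opener {
    fn build_metadata(&mut self) -> Metadata {
        let options = self.options.as_ref().expect("options must be set");
        let mut builder = self.metadata_builder.take().expect("builder must be set");
        builder.primary_key_encoding(options.primary_key_encoding);
        builder.build()
    }
}

fn main() {
    let mut opener = Opener {
        metadata_builder: Some(MetadataBuilder::new()),
        options: Some(Options { primary_key_encoding: Encoding::Sparse }),
    };
    let metadata = opener.build_metadata();
    assert_eq!(metadata.encoding, Encoding::Sparse);
    println!("{metadata:?}");
}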
Some files were not shown because too many files have changed in this diff.