Compare commits

3 Commits

Author SHA1 Message Date
LFC
87d32fc255 fix: show tables too slow cherry pick (#7244)
v0.12.1

fix: `show tables` is too slow under large tables

Signed-off-by: luofucong <luofc@foxmail.com>
2025-11-18 12:29:40 +00:00
LFC
503585ba42 ci: update github runner on old branch (#7249)
* ci: update github runner (ubuntu-20.04 -> ubuntu-22.04)

Signed-off-by: luofucong <luofc@foxmail.com>
2025-11-18 19:52:03 +08:00
Lei, HUANG
ee2a7a5f10 fix: check if memtable is empty by stats (#5989)
fix/checking-memtable-empty-and-stats:
 - **Refactor timestamp updates**: Simplified timestamp range updates in `PartitionTreeMemtable` and `TimeSeriesMemtable` by replacing `update_timestamp_range` with `fetch_max` and `fetch_min` methods for `max_timestamp` and `min_timestamp` (a sketch of this pattern follows the commit list).
   - Affected files: `partition_tree.rs`, `time_series.rs`

 - **Remove unused code**: Deleted the `update_timestamp_range` method from `WriteMetrics` and removed unnecessary imports.
   - Affected file: `stats.rs`

 - **Optimize memtable filtering**: Streamlined the check for empty memtables in `ScanRegion` by directly using `time_range`.
   - Affected file: `scan_region.rs`

(cherry picked from commit 1a517ec8ac)
2025-06-04 11:36:42 +08:00
173 changed files with 1238 additions and 6628 deletions
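
For context on commit ee2a7a5f10: below is a minimal sketch of the `fetch_max`/`fetch_min` pattern the commit message describes, assuming illustrative names (`TimestampStats`, `observe`) rather than the actual `WriteMetrics` definitions. Emptiness falls out of `time_range()` returning `None`, which is how a scan can skip an empty memtable from stats alone.

use std::sync::atomic::{AtomicI64, Ordering};

/// Illustrative stand-in for the memtable write metrics.
struct TimestampStats {
    min_timestamp: AtomicI64,
    max_timestamp: AtomicI64,
}

impl TimestampStats {
    fn new() -> Self {
        Self {
            min_timestamp: AtomicI64::new(i64::MAX),
            max_timestamp: AtomicI64::new(i64::MIN),
        }
    }

    /// Record one timestamp. fetch_min/fetch_max are single atomic
    /// operations, so no dedicated range-update helper is needed.
    fn observe(&self, ts: i64) {
        self.min_timestamp.fetch_min(ts, Ordering::Relaxed);
        self.max_timestamp.fetch_max(ts, Ordering::Relaxed);
    }

    /// None means nothing was ever written: min/max still hold their
    /// initial sentinel values, so min > max.
    fn time_range(&self) -> Option<(i64, i64)> {
        let min = self.min_timestamp.load(Ordering::Relaxed);
        let max = self.max_timestamp.load(Ordering::Relaxed);
        (min <= max).then_some((min, max))
    }
}

fn main() {
    let stats = TimestampStats::new();
    assert!(stats.time_range().is_none()); // empty memtable
    stats.observe(10);
    stats.observe(3);
    assert_eq!(stats.time_range(), Some((3, 10)));
}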

View File

@@ -24,4 +24,9 @@ runs:
--set auth.rbac.token.enabled=false \
--set persistence.size=2Gi \
--create-namespace \
--set global.security.allowInsecureImages=true \
--set image.registry=docker.io \
--set image.repository=greptime/etcd \
--set image.tag=3.6.1-debian-12-r3 \
--version 12.0.8 \
-n ${{ inputs.namespace }}

View File

@@ -51,7 +51,7 @@ runs:
run: |
helm upgrade \
--install my-greptimedb \
--set meta.etcdEndpoints=${{ inputs.etcd-endpoints }} \
--set meta.backendStorage.etcd.endpoints=${{ inputs.etcd-endpoints }} \
--set meta.enableRegionFailover=${{ inputs.enable-region-failover }} \
--set image.registry=${{ inputs.image-registry }} \
--set image.repository=${{ inputs.image-repository }} \

View File

@@ -23,4 +23,8 @@ runs:
--set listeners.controller.protocol=PLAINTEXT \
--set listeners.client.protocol=PLAINTEXT \
--create-namespace \
--set image.registry=docker.io \
--set image.repository=greptime/kafka \
--set image.tag=3.9.0-debian-12-r1 \
--version 31.0.0 \
-n ${{ inputs.namespace }}
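
Both hunks above pin the etcd and kafka test charts to explicit chart versions and to `greptime/*` image mirrors on docker.io (tags taken verbatim from the diff), which pairs with the retry-pull script introduced in the next file.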

.github/scripts/pull-test-deps-images.sh (vendored executable file, 34 lines)
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# This script is used to pull the test dependency images that are stored in public ECR one by one to avoid rate limiting.
set -e
MAX_RETRIES=3
IMAGES=(
"greptime/zookeeper:3.7"
"greptime/kafka:3.9.0-debian-12-r1"
"greptime/etcd:3.6.1-debian-12-r3"
"greptime/minio:2024"
"greptime/mysql:5.7"
)
for image in "${IMAGES[@]}"; do
for ((attempt=1; attempt<=MAX_RETRIES; attempt++)); do
if docker pull "$image"; then
# Successfully pulled the image.
break
else
# Use some simple exponential backoff to avoid rate limiting.
if [ $attempt -lt $MAX_RETRIES ]; then
sleep_seconds=$((attempt * 5))
echo "Attempt $attempt failed for $image, waiting $sleep_seconds seconds"
sleep $sleep_seconds # 5s, 10s delays
else
echo "Failed to pull $image after $MAX_RETRIES attempts"
exit 1
fi
fi
done
done
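
The workflow hunks below wire this in by chaining the script ahead of compose startup, e.g. `../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait`, so every dependency image is already in the local Docker cache (with retries applied) before compose resolves it.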

View File

@@ -14,7 +14,7 @@ name: Build API docs
jobs:
apidoc:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:

View File

@@ -16,11 +16,11 @@ on:
description: The runner uses to build linux-amd64 artifacts
default: ec2-c6i.4xlarge-amd64
options:
- ubuntu-20.04
- ubuntu-20.04-8-cores
- ubuntu-20.04-16-cores
- ubuntu-20.04-32-cores
- ubuntu-20.04-64-cores
- ubuntu-22.04
- ubuntu-22.04-8-cores
- ubuntu-22.04-16-cores
- ubuntu-22.04-32-cores
- ubuntu-22.04-64-cores
- ec2-c6i.xlarge-amd64 # 4C8G
- ec2-c6i.2xlarge-amd64 # 8C16G
- ec2-c6i.4xlarge-amd64 # 16C32G
@@ -83,7 +83,7 @@ jobs:
allocate-runners:
name: Allocate runners
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
outputs:
linux-amd64-runner: ${{ steps.start-linux-amd64-runner.outputs.label }}
linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
@@ -218,7 +218,7 @@ jobs:
build-linux-amd64-artifacts,
build-linux-arm64-artifacts,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
outputs:
build-result: ${{ steps.set-build-result.outputs.build-result }}
steps:
@@ -251,7 +251,7 @@ jobs:
allocate-runners,
release-images-to-dockerhub,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v4
@@ -283,7 +283,7 @@ jobs:
name: Stop linux-amd64 runner
# Only run this job when the runner is allocated.
if: ${{ always() }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
allocate-runners,
build-linux-amd64-artifacts,
@@ -309,7 +309,7 @@ jobs:
name: Stop linux-arm64 runner
# Only run this job when the runner is allocated.
if: ${{ always() }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
allocate-runners,
build-linux-arm64-artifacts,
@@ -337,7 +337,7 @@ jobs:
needs: [
release-images-to-dockerhub
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
permissions:
issues: write

View File

@@ -21,14 +21,13 @@ concurrency:
cancel-in-progress: true
jobs:
check-typos-and-docs:
name: Check typos and docs
runs-on: ubuntu-20.04
check-docs:
name: Check docs
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- uses: crate-ci/typos@master
- name: Check the config docs
run: |
make config-docs && \
@@ -36,7 +35,7 @@ jobs:
|| (echo "'config/config.md' is not up-to-date, please run 'make config-docs'." && exit 1)
license-header-check:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
name: Check License Header
steps:
- uses: actions/checkout@v4
@@ -49,7 +48,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-20.04 ]
os: [ ubuntu-22.04 ]
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -72,7 +71,7 @@ jobs:
toml:
name: Toml Check
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -89,7 +88,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-20.04 ]
os: [ ubuntu-22.04 ]
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -248,7 +247,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-20.04 ]
os: [ ubuntu-22.04 ]
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -568,7 +567,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-20.04 ]
os: [ ubuntu-22.04 ]
mode:
- name: "Basic"
opts: ""
@@ -587,7 +586,8 @@ jobs:
- if: matrix.mode.kafka
name: Setup kafka server
working-directory: tests-integration/fixtures
run: docker compose up -d --wait kafka
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
- name: Download pre-built binaries
uses: actions/download-artifact@v4
with:
@@ -607,7 +607,7 @@ jobs:
fmt:
name: Rustfmt
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -624,7 +624,7 @@ jobs:
clippy:
name: Clippy
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -685,7 +685,8 @@ jobs:
uses: taiki-e/install-action@nextest
- name: Setup external services
working-directory: tests-integration/fixtures
run: docker compose up -d --wait
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo nextest run --workspace -F dashboard -F pg_kvbackend
env:
@@ -710,7 +711,7 @@ jobs:
coverage:
if: github.event_name == 'merge_group'
runs-on: ubuntu-20.04-8-cores
runs-on: ubuntu-22.04-8-cores
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -737,7 +738,8 @@ jobs:
uses: taiki-e/install-action@cargo-llvm-cov
- name: Setup external services
working-directory: tests-integration/fixtures
run: docker compose up -d --wait
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend
env:
@@ -770,7 +772,7 @@ jobs:
# compat:
# name: Compatibility Test
# needs: build
# runs-on: ubuntu-20.04
# runs-on: ubuntu-22.04
# timeout-minutes: 60
# steps:
# - uses: actions/checkout@v4

View File

@@ -3,13 +3,9 @@ on:
pull_request_target:
types: [opened, edited]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
docbot:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
permissions:
pull-requests: write
contents: read
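
A note on the `concurrency` block in this hunk: `group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}` with `cancel-in-progress: true` makes a newer run for the same pull-request branch cancel the older one, while `run_id` keeps non-PR events (where `head_ref` is empty) in unique groups that are never cancelled.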

View File

@@ -31,7 +31,7 @@ name: CI
jobs:
typos:
name: Spell Check with Typos
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
@@ -39,7 +39,7 @@ jobs:
- uses: crate-ci/typos@master
license-header-check:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
name: Check License Header
steps:
- uses: actions/checkout@v4
@@ -49,29 +49,29 @@ jobs:
check:
name: Check
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- run: 'echo "No action required"'
fmt:
name: Rustfmt
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- run: 'echo "No action required"'
clippy:
name: Clippy
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- run: 'echo "No action required"'
coverage:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- run: 'echo "No action required"'
test:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- run: 'echo "No action required"'
@@ -80,7 +80,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-20.04 ]
os: [ ubuntu-22.04 ]
mode:
- name: "Basic"
- name: "Remote WAL"

View File

@@ -14,11 +14,11 @@ on:
description: The runner uses to build linux-amd64 artifacts
default: ec2-c6i.4xlarge-amd64
options:
- ubuntu-20.04
- ubuntu-20.04-8-cores
- ubuntu-20.04-16-cores
- ubuntu-20.04-32-cores
- ubuntu-20.04-64-cores
- ubuntu-22.04
- ubuntu-22.04-8-cores
- ubuntu-22.04-16-cores
- ubuntu-22.04-32-cores
- ubuntu-22.04-64-cores
- ec2-c6i.xlarge-amd64 # 4C8G
- ec2-c6i.2xlarge-amd64 # 8C16G
- ec2-c6i.4xlarge-amd64 # 16C32G
@@ -70,7 +70,7 @@ jobs:
allocate-runners:
name: Allocate runners
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
outputs:
linux-amd64-runner: ${{ steps.start-linux-amd64-runner.outputs.label }}
linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
@@ -182,7 +182,7 @@ jobs:
build-linux-amd64-artifacts,
build-linux-arm64-artifacts,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
outputs:
nightly-build-result: ${{ steps.set-nightly-build-result.outputs.nightly-build-result }}
steps:
@@ -214,7 +214,7 @@ jobs:
allocate-runners,
release-images-to-dockerhub,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
# When we push to ACR, it's easy to fail due to some unknown network issues.
# However, we don't want to fail the whole workflow because of this.
# The ACR have daily sync with DockerHub, so don't worry about the image not being updated.
@@ -249,7 +249,7 @@ jobs:
name: Stop linux-amd64 runner
# Only run this job when the runner is allocated.
if: ${{ always() }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
allocate-runners,
build-linux-amd64-artifacts,
@@ -275,7 +275,7 @@ jobs:
name: Stop linux-arm64 runner
# Only run this job when the runner is allocated.
if: ${{ always() }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
allocate-runners,
build-linux-arm64-artifacts,
@@ -303,7 +303,7 @@ jobs:
needs: [
release-images-to-dockerhub
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
permissions:
issues: write
env:

View File

@@ -133,7 +133,7 @@ jobs:
name: Check status
needs: [sqlness-test, sqlness-windows, test-on-windows]
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
outputs:
check-result: ${{ steps.set-check-result.outputs.check-result }}
steps:
@@ -146,7 +146,7 @@ jobs:
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && always() }} # Not requiring successful dependent jobs, always run.
name: Send notification to Greptime team
needs: [check-status]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_DEVELOP_CHANNEL }}
steps:

View File

@@ -29,7 +29,7 @@ jobs:
release-dev-builder-images:
name: Release dev builder images
if: ${{ inputs.release_dev_builder_ubuntu_image || inputs.release_dev_builder_centos_image || inputs.release_dev_builder_android_image }} # Only manually trigger this job.
runs-on: ubuntu-20.04-16-cores
runs-on: ubuntu-22.04-16-cores
outputs:
version: ${{ steps.set-version.outputs.version }}
steps:
@@ -63,7 +63,7 @@ jobs:
release-dev-builder-images-ecr:
name: Release dev builder images to AWS ECR
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
release-dev-builder-images
]
@@ -148,7 +148,7 @@ jobs:
release-dev-builder-images-cn: # Note: Be careful issue: https://github.com/containers/skopeo/issues/1874 and we decide to use the latest stable skopeo container.
name: Release dev builder images to CN region
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
release-dev-builder-images
]

View File

@@ -18,11 +18,11 @@ on:
description: The runner uses to build linux-amd64 artifacts
default: ec2-c6i.4xlarge-amd64
options:
- ubuntu-20.04
- ubuntu-20.04-8-cores
- ubuntu-20.04-16-cores
- ubuntu-20.04-32-cores
- ubuntu-20.04-64-cores
- ubuntu-22.04
- ubuntu-22.04-8-cores
- ubuntu-22.04-16-cores
- ubuntu-22.04-32-cores
- ubuntu-22.04-64-cores
- ec2-c6i.xlarge-amd64 # 4C8G
- ec2-c6i.2xlarge-amd64 # 8C16G
- ec2-c6i.4xlarge-amd64 # 16C32G
@@ -97,7 +97,7 @@ jobs:
allocate-runners:
name: Allocate runners
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
outputs:
linux-amd64-runner: ${{ steps.start-linux-amd64-runner.outputs.label }}
linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
@@ -335,7 +335,7 @@ jobs:
build-windows-artifacts,
release-images-to-dockerhub,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
# When we push to ACR, it's easy to fail due to some unknown network issues.
# However, we don't want to fail the whole workflow because of this.
# The ACR have daily sync with DockerHub, so don't worry about the image not being updated.
@@ -377,7 +377,7 @@ jobs:
build-windows-artifacts,
release-images-to-dockerhub,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
@@ -396,7 +396,7 @@ jobs:
name: Stop linux-amd64 runner
# Only run this job when the runner is allocated.
if: ${{ always() }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
allocate-runners,
build-linux-amd64-artifacts,
@@ -422,7 +422,7 @@ jobs:
name: Stop linux-arm64 runner
# Only run this job when the runner is allocated.
if: ${{ always() }}
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: [
allocate-runners,
build-linux-arm64-artifacts,
@@ -448,7 +448,7 @@ jobs:
name: Bump doc version
if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [allocate-runners]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
# Permission reference: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
permissions:
issues: write # Allows the action to create issues for cyborg.
@@ -475,7 +475,7 @@ jobs:
build-macos-artifacts,
build-windows-artifacts,
]
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
# Permission reference: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs
permissions:
issues: write # Allows the action to create issues for cyborg.

View File

@@ -7,13 +7,9 @@ on:
- reopened
- edited
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
check:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
timeout-minutes: 10
steps:
- uses: actions/checkout@v4

View File

@@ -3,28 +3,30 @@
## Individual Committers (in alphabetical order)
* [CookiePieWw](https://github.com/CookiePieWw)
* [KKould](https://github.com/KKould)
* [NiwakaDev](https://github.com/NiwakaDev)
* [etolbakov](https://github.com/etolbakov)
* [irenjj](https://github.com/irenjj)
* [KKould](https://github.com/KKould)
* [Lanqing Yang](https://github.com/lyang24)
* [NiwakaDev](https://github.com/NiwakaDev)
* [tisonkun](https://github.com/tisonkun)
* [Lanqing Yang](https://github.com/lyang24)
## Team Members (in alphabetical order)
* [Breeze-P](https://github.com/Breeze-P)
* [GrepTime](https://github.com/GrepTime)
* [MichaelScofield](https://github.com/MichaelScofield)
* [Wenjie0329](https://github.com/Wenjie0329)
* [WenyXu](https://github.com/WenyXu)
* [ZonaHex](https://github.com/ZonaHex)
* [apdong2022](https://github.com/apdong2022)
* [beryl678](https://github.com/beryl678)
* [Breeze-P](https://github.com/Breeze-P)
* [daviderli614](https://github.com/daviderli614)
* [discord9](https://github.com/discord9)
* [evenyag](https://github.com/evenyag)
* [fengjiachun](https://github.com/fengjiachun)
* [fengys1996](https://github.com/fengys1996)
* [GrepTime](https://github.com/GrepTime)
* [holalengyu](https://github.com/holalengyu)
* [killme2008](https://github.com/killme2008)
* [MichaelScofield](https://github.com/MichaelScofield)
* [nicecui](https://github.com/nicecui)
* [paomian](https://github.com/paomian)
* [shuiyisong](https://github.com/shuiyisong)
@@ -32,14 +34,11 @@
* [sunng87](https://github.com/sunng87)
* [v0y4g3r](https://github.com/v0y4g3r)
* [waynexia](https://github.com/waynexia)
* [Wenjie0329](https://github.com/Wenjie0329)
* [WenyXu](https://github.com/WenyXu)
* [xtang](https://github.com/xtang)
* [zhaoyingnan01](https://github.com/zhaoyingnan01)
* [zhongzc](https://github.com/zhongzc)
* [ZonaHex](https://github.com/ZonaHex)
* [zyy17](https://github.com/zyy17)
## All Contributors
To see the full list of contributors, please visit our [Contributors page](https://github.com/GreptimeTeam/greptimedb/graphs/contributors)
[![All Contributors](https://contrib.rocks/image?repo=GreptimeTeam/greptimedb)](https://github.com/GreptimeTeam/greptimedb/graphs/contributors)

Cargo.lock (generated, 222 lines changed)
View File

@@ -185,7 +185,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"common-base",
"common-decimal",
@@ -432,7 +432,7 @@ dependencies = [
"arrow-schema",
"chrono",
"half",
"indexmap 2.7.1",
"indexmap 2.6.0",
"lexical-core",
"num",
"serde",
@@ -710,7 +710,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -1324,7 +1324,7 @@ dependencies = [
[[package]]
name = "cache"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"catalog",
"common-error",
@@ -1348,7 +1348,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arrow",
@@ -1475,7 +1475,7 @@ version = "0.13.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6026d8cd82ada8bbcfe337805dd1eb6afdc9e80fa4d57e977b3a36315e0c5525"
dependencies = [
"indexmap 2.7.1",
"indexmap 2.6.0",
"lazy_static",
"num-traits",
"regex",
@@ -1661,7 +1661,7 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "cli"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"auth",
@@ -1703,7 +1703,7 @@ dependencies = [
"session",
"snafu 0.8.5",
"store-api",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"tempfile",
"tokio",
@@ -1712,7 +1712,7 @@ dependencies = [
[[package]]
name = "client"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arc-swap",
@@ -1739,7 +1739,7 @@ dependencies = [
"rand",
"serde_json",
"snafu 0.8.5",
"substrait 0.12.0",
"substrait 0.12.1",
"substrait 0.37.3",
"tokio",
"tokio-stream",
@@ -1780,7 +1780,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"auth",
@@ -1841,7 +1841,7 @@ dependencies = [
"similar-asserts",
"snafu 0.8.5",
"store-api",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"temp-env",
"tempfile",
@@ -1887,7 +1887,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"anymap2",
"async-trait",
@@ -1909,11 +1909,11 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.12.0"
version = "0.12.1"
[[package]]
name = "common-config"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"common-base",
"common-error",
@@ -1938,7 +1938,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"arrow",
"arrow-schema",
@@ -1974,7 +1974,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"bigdecimal 0.4.5",
"common-error",
@@ -1987,7 +1987,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"http 1.1.0",
"snafu 0.8.5",
@@ -1997,7 +1997,7 @@ dependencies = [
[[package]]
name = "common-frontend"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"common-error",
@@ -2007,14 +2007,12 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"ahash 0.8.11",
"api",
"approx 0.5.1",
"arc-swap",
"async-trait",
"bincode",
"common-base",
"common-catalog",
"common-error",
@@ -2032,7 +2030,6 @@ dependencies = [
"geo-types",
"geohash",
"h3o",
"hyperloglogplus",
"jsonb",
"nalgebra 0.33.2",
"num",
@@ -2049,13 +2046,12 @@ dependencies = [
"store-api",
"table",
"tokio",
"uddsketch",
"wkt",
]
[[package]]
name = "common-greptimedb-telemetry"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"common-runtime",
@@ -2072,7 +2068,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arrow-flight",
@@ -2100,7 +2096,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"common-base",
@@ -2119,7 +2115,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"arc-swap",
"common-query",
@@ -2133,7 +2129,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"common-error",
"common-macro",
@@ -2146,7 +2142,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"anymap2",
"api",
@@ -2206,7 +2202,7 @@ dependencies = [
[[package]]
name = "common-options"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"common-grpc",
"humantime-serde",
@@ -2215,11 +2211,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.12.0"
version = "0.12.1"
[[package]]
name = "common-pprof"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"common-error",
"common-macro",
@@ -2231,7 +2227,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-stream",
"async-trait",
@@ -2258,7 +2254,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"common-procedure",
@@ -2266,7 +2262,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -2292,7 +2288,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"arc-swap",
"common-error",
@@ -2311,7 +2307,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"clap 4.5.19",
@@ -2341,7 +2337,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"atty",
"backtrace",
@@ -2369,7 +2365,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"client",
"common-query",
@@ -2381,7 +2377,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"arrow",
"chrono",
@@ -2399,7 +2395,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"build-data",
"const_format",
@@ -2409,7 +2405,7 @@ dependencies = [
[[package]]
name = "common-wal"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"common-base",
"common-error",
@@ -2976,7 +2972,7 @@ dependencies = [
"chrono",
"half",
"hashbrown 0.14.5",
"indexmap 2.7.1",
"indexmap 2.6.0",
"libc",
"object_store",
"parquet",
@@ -3036,7 +3032,7 @@ dependencies = [
"datafusion-functions-aggregate-common",
"datafusion-functions-window-common",
"datafusion-physical-expr-common",
"indexmap 2.7.1",
"indexmap 2.6.0",
"paste",
"recursive",
"serde_json",
@@ -3158,7 +3154,7 @@ dependencies = [
"datafusion-physical-expr-common",
"datafusion-physical-plan",
"half",
"indexmap 2.7.1",
"indexmap 2.6.0",
"log",
"parking_lot 0.12.3",
"paste",
@@ -3209,7 +3205,7 @@ dependencies = [
"datafusion-common",
"datafusion-expr",
"datafusion-physical-expr",
"indexmap 2.7.1",
"indexmap 2.6.0",
"itertools 0.13.0",
"log",
"recursive",
@@ -3234,7 +3230,7 @@ dependencies = [
"datafusion-physical-expr-common",
"half",
"hashbrown 0.14.5",
"indexmap 2.7.1",
"indexmap 2.6.0",
"itertools 0.13.0",
"log",
"paste",
@@ -3293,7 +3289,7 @@ dependencies = [
"futures",
"half",
"hashbrown 0.14.5",
"indexmap 2.7.1",
"indexmap 2.6.0",
"itertools 0.13.0",
"log",
"once_cell",
@@ -3313,7 +3309,7 @@ dependencies = [
"arrow-schema",
"datafusion-common",
"datafusion-expr",
"indexmap 2.7.1",
"indexmap 2.6.0",
"log",
"recursive",
"regex",
@@ -3340,7 +3336,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arrow-flight",
@@ -3392,7 +3388,7 @@ dependencies = [
"session",
"snafu 0.8.5",
"store-api",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"tokio",
"toml 0.8.19",
@@ -3401,7 +3397,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"arrow",
"arrow-array",
@@ -4045,7 +4041,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -4155,7 +4151,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
[[package]]
name = "flow"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arrow",
@@ -4165,7 +4161,6 @@ dependencies = [
"bytes",
"cache",
"catalog",
"chrono",
"client",
"common-base",
"common-catalog",
@@ -4217,7 +4212,7 @@ dependencies = [
"snafu 0.8.5",
"store-api",
"strum 0.25.0",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"tokio",
"tonic 0.12.3",
@@ -4272,7 +4267,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arc-swap",
@@ -4700,7 +4695,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=072ce580502e015df1a6b03a185b60309a7c2a7a#072ce580502e015df1a6b03a185b60309a7c2a7a"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=a25adc8a01340231121646d8f0a29d0e92f45461#a25adc8a01340231121646d8f0a29d0e92f45461"
dependencies = [
"prost 0.13.3",
"serde",
@@ -4723,7 +4718,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 0.2.12",
"indexmap 2.7.1",
"indexmap 2.6.0",
"slab",
"tokio",
"tokio-util",
@@ -4742,7 +4737,7 @@ dependencies = [
"futures-core",
"futures-sink",
"http 1.1.0",
"indexmap 2.7.1",
"indexmap 2.6.0",
"slab",
"tokio",
"tokio-util",
@@ -5292,15 +5287,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "hyperloglogplus"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3"
dependencies = [
"serde",
]
[[package]]
name = "i_float"
version = "1.3.1"
@@ -5540,7 +5526,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -5589,9 +5575,9 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.7.1"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
dependencies = [
"equivalent",
"hashbrown 0.15.2",
@@ -5605,7 +5591,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
dependencies = [
"ahash 0.8.11",
"indexmap 2.7.1",
"indexmap 2.6.0",
"is-terminal",
"itoa",
"log",
@@ -5952,7 +5938,7 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee7893dab2e44ae5f9d0173f26ff4aa327c10b01b06a72b52dd9405b628640d"
dependencies = [
"indexmap 2.7.1",
"indexmap 2.6.0",
]
[[package]]
@@ -6332,7 +6318,7 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "log-query"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"chrono",
"common-error",
@@ -6344,7 +6330,7 @@ dependencies = [
[[package]]
name = "log-store"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-stream",
"async-trait",
@@ -6435,7 +6421,7 @@ dependencies = [
"cactus",
"cfgrammar",
"filetime",
"indexmap 2.7.1",
"indexmap 2.6.0",
"lazy_static",
"lrtable",
"num-traits",
@@ -6637,7 +6623,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -6664,7 +6650,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -6750,7 +6736,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"aquamarine",
@@ -6848,7 +6834,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"aquamarine",
@@ -7545,7 +7531,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"anyhow",
"bytes",
@@ -7676,7 +7662,7 @@ checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a"
dependencies = [
"futures-core",
"futures-sink",
"indexmap 2.7.1",
"indexmap 2.6.0",
"js-sys",
"once_cell",
"pin-project-lite",
@@ -7794,7 +7780,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"ahash 0.8.11",
"api",
@@ -7842,7 +7828,7 @@ dependencies = [
"sql",
"sqlparser 0.52.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=71dd86058d2af97b9925093d40c4e03360403170)",
"store-api",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"tokio",
"tokio-util",
@@ -8079,7 +8065,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -8248,7 +8234,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
dependencies = [
"fixedbitset",
"indexmap 2.7.1",
"indexmap 2.6.0",
]
[[package]]
@@ -8347,7 +8333,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"ahash 0.8.11",
"api",
@@ -8487,7 +8473,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"auth",
"clap 4.5.19",
@@ -8749,7 +8735,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"ahash 0.8.11",
"async-trait",
@@ -8994,7 +8980,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-compression 0.4.13",
"async-trait",
@@ -9035,7 +9021,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"ahash 0.8.11",
"api",
@@ -9100,7 +9086,7 @@ dependencies = [
"sqlparser 0.52.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=71dd86058d2af97b9925093d40c4e03360403170)",
"statrs",
"store-api",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"tokio",
"tokio-stream",
@@ -10339,7 +10325,7 @@ version = "1.0.137"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
dependencies = [
"indexmap 2.7.1",
"indexmap 2.6.0",
"itoa",
"memchr",
"ryu",
@@ -10410,7 +10396,7 @@ dependencies = [
"chrono",
"hex",
"indexmap 1.9.3",
"indexmap 2.7.1",
"indexmap 2.6.0",
"serde",
"serde_derive",
"serde_json",
@@ -10436,7 +10422,7 @@ version = "0.9.34+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
dependencies = [
"indexmap 2.7.1",
"indexmap 2.6.0",
"itoa",
"ryu",
"serde",
@@ -10445,7 +10431,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"ahash 0.8.11",
"api",
@@ -10497,7 +10483,6 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 1.4.1",
"indexmap 2.7.1",
"influxdb_line_protocol",
"itertools 0.10.5",
"json5",
@@ -10562,7 +10547,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arc-swap",
@@ -10871,7 +10856,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"chrono",
@@ -10908,12 +10893,12 @@ dependencies = [
[[package]]
name = "sqlness"
version = "0.6.1"
source = "git+https://github.com/CeresDB/sqlness.git?rev=bb91f31ff58993e07ea89845791235138283a24c#bb91f31ff58993e07ea89845791235138283a24c"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308a7338f2211813d6e9da117e9b9b7aee5d072872d11a934002fd2bd4ab5276"
dependencies = [
"async-trait",
"derive_builder 0.11.2",
"duration-str",
"futures",
"minijinja",
"prettydiff",
"regex",
@@ -10925,7 +10910,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"clap 4.5.19",
@@ -10939,7 +10924,6 @@ dependencies = [
"hex",
"local-ip-address",
"mysql",
"num_cpus",
"reqwest",
"serde",
"serde_json",
@@ -11039,7 +11023,7 @@ dependencies = [
"futures-util",
"hashbrown 0.15.2",
"hashlink",
"indexmap 2.7.1",
"indexmap 2.6.0",
"log",
"memchr",
"once_cell",
@@ -11242,7 +11226,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"aquamarine",
@@ -11372,7 +11356,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"async-trait",
"bytes",
@@ -11553,7 +11537,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"async-trait",
@@ -11804,7 +11788,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"arbitrary",
"async-trait",
@@ -11848,7 +11832,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.12.0"
version = "0.12.1"
dependencies = [
"api",
"arrow-flight",
@@ -11914,7 +11898,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.12.0",
"substrait 0.12.1",
"table",
"tempfile",
"time",
@@ -12335,7 +12319,7 @@ version = "0.19.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
dependencies = [
"indexmap 2.7.1",
"indexmap 2.6.0",
"toml_datetime",
"winnow 0.5.40",
]
@@ -12346,7 +12330,7 @@ version = "0.22.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
dependencies = [
"indexmap 2.7.1",
"indexmap 2.6.0",
"serde",
"serde_spanned",
"toml_datetime",
@@ -12484,7 +12468,7 @@ dependencies = [
"futures-core",
"futures-util",
"hdrhistogram",
"indexmap 2.7.1",
"indexmap 2.6.0",
"pin-project-lite",
"slab",
"sync_wrapper 1.0.1",
@@ -12972,14 +12956,6 @@ version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
[[package]]
name = "uddsketch"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/timescaledb-toolkit.git?rev=84828fe8fb494a6a61412a3da96517fc80f7bb20#84828fe8fb494a6a61412a3da96517fc80f7bb20"
dependencies = [
"serde",
]
[[package]]
name = "unescaper"
version = "0.1.5"

View File

@@ -67,7 +67,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.12.0"
version = "0.12.1"
edition = "2021"
license = "Apache-2.0"
@@ -129,7 +129,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "072ce580502e015df1a6b03a185b60309a7c2a7a" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "a25adc8a01340231121646d8f0a29d0e92f45461" }
hex = "0.4"
http = "1"
humantime = "2.1"

View File

@@ -319,7 +319,6 @@
| `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running on cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |

View File

@@ -50,9 +50,6 @@ use_memory_store = false
## - Using shared storage (e.g., s3).
enable_region_failover = false
## Max allowed idle time before removing node info from metasrv memory.
node_max_idle_time = "24hours"
## Whether to enable greptimedb telemetry. Enabled by default.
#+ enable_telemetry = true

View File

@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::sync::{Arc, Weak};
use arrow_schema::SchemaRef as ArrowSchemaRef;
@@ -244,14 +243,17 @@ impl InformationSchemaTablesBuilder {
// TODO(dennis): `region_stats` API is not stable in distributed cluster because of network issue etc.
// But we don't want the statements such as `show tables` fail,
// so using `unwrap_or_else` here instead of `?` operator.
let region_stats = information_extension
.region_stats()
.await
.map_err(|e| {
error!(e; "Failed to call region_stats");
e
})
.unwrap_or_else(|_| vec![]);
let region_stats = {
let mut x = information_extension
.region_stats()
.await
.unwrap_or_else(|e| {
error!(e; "Failed to find region stats in information_schema, fallback to all empty");
vec![]
});
x.sort_unstable_by_key(|x| x.id);
x
};
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
@@ -262,16 +264,16 @@ impl InformationSchemaTablesBuilder {
// TODO(dennis): make it working for metric engine
let table_region_stats =
if table_info.meta.engine == MITO_ENGINE || table_info.is_physical_table() {
let region_ids = table_info
table_info
.meta
.region_numbers
.iter()
.map(|n| RegionId::new(table_info.ident.table_id, *n))
.collect::<HashSet<_>>();
region_stats
.iter()
.filter(|stat| region_ids.contains(&stat.id))
.flat_map(|region_id| {
region_stats
.binary_search_by_key(&region_id, |x| x.id)
.map(|i| &region_stats[i])
})
.collect::<Vec<_>>()
} else {
vec![]
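
The hunk above is the heart of the `show tables` fix: rather than building a `HashSet` of a table's `RegionId`s and filtering the whole stats vector for every table (O(tables × regions)), the stats are sorted once by id and each region is looked up with a binary search. A self-contained sketch of the pattern, assuming a simplified stand-in for `RegionStat`:

// Simplified stand-in for the real RegionStat / RegionId types.
#[derive(Debug)]
struct RegionStat {
    id: u64,
    rows: u64,
}

fn main() {
    let mut region_stats = vec![
        RegionStat { id: 42, rows: 10 },
        RegionStat { id: 7, rows: 3 },
        RegionStat { id: 99, rows: 8 },
    ];
    // Sort once up front: O(n log n).
    region_stats.sort_unstable_by_key(|s| s.id);

    // Per table, each region becomes an O(log n) lookup; misses
    // (Err from binary_search) are silently skipped by flat_map,
    // matching the old filter semantics.
    let table_region_ids = [7u64, 99, 1000]; // 1000: no such region
    let table_region_stats: Vec<&RegionStat> = table_region_ids
        .iter()
        .flat_map(|id| {
            region_stats
                .binary_search_by_key(id, |s| s.id)
                .map(|i| &region_stats[i])
        })
        .collect();
    assert_eq!(table_region_stats.len(), 2);
    println!("{table_region_stats:?}");
}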

View File

@@ -16,6 +16,7 @@
mod client;
pub mod client_manager;
#[cfg(feature = "testing")]
mod database;
pub mod error;
pub mod flow;
@@ -33,6 +34,7 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;
pub use self::client::Client;
#[cfg(feature = "testing")]
pub use self::database::Database;
pub use self::error::{Error, Result};
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};
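
The `#[cfg(feature = "testing")]` attributes here compile the `database` module and its `Database` re-export only when the `testing` feature is enabled, keeping the test-only client surface out of default builds of the `client` crate.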

View File

@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
use meta_client::{MetaClientOptions, MetaClientType};
use servers::Mode;
use snafu::{OptionExt, ResultExt};
@@ -317,8 +317,6 @@ impl StartCommand {
Arc::new(executor),
);
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
let flownode_builder = FlownodeBuilder::new(
opts,
@@ -326,7 +324,6 @@ impl StartCommand {
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);

View File

@@ -54,10 +54,7 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
FrontendInvoker,
};
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
use frontend::frontend::FrontendOptions;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
@@ -536,16 +533,12 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
Arc::new(frontend_client),
);
let flownode = Arc::new(
flow_builder

View File

@@ -12,11 +12,9 @@ default = ["geo"]
geo = ["geohash", "h3o", "s2", "wkt", "geo-types", "dep:geo"]
[dependencies]
ahash = "0.8"
api.workspace = true
arc-swap = "1.0"
async-trait.workspace = true
bincode = "1.3"
common-base.workspace = true
common-catalog.workspace = true
common-error.workspace = true
@@ -34,7 +32,6 @@ geo = { version = "0.29", optional = true }
geo-types = { version = "0.7", optional = true }
geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
hyperloglogplus = "0.4"
jsonb.workspace = true
nalgebra.workspace = true
num = "0.4"
@@ -50,7 +47,6 @@ sql.workspace = true
statrs = "0.16"
store-api.workspace = true
table.workspace = true
uddsketch = { git = "https://github.com/GreptimeTeam/timescaledb-toolkit.git", rev = "84828fe8fb494a6a61412a3da96517fc80f7bb20" }
wkt = { version = "0.11", optional = true }
[dev-dependencies]

View File

@@ -1,20 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
mod hll;
mod uddsketch_state;
pub(crate) use hll::HllStateType;
pub use hll::{HllState, HLL_MERGE_NAME, HLL_NAME};
pub use uddsketch_state::{UddSketchState, UDDSKETCH_STATE_NAME};

View File

@@ -1,319 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_query::prelude::*;
use common_telemetry::trace;
use datafusion::arrow::array::ArrayRef;
use datafusion::common::cast::{as_binary_array, as_string_array};
use datafusion::common::not_impl_err;
use datafusion::error::{DataFusionError, Result as DfResult};
use datafusion::logical_expr::function::AccumulatorArgs;
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF};
use datafusion::prelude::create_udaf;
use datatypes::arrow::datatypes::DataType;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use crate::utils::FixedRandomState;
pub const HLL_NAME: &str = "hll";
pub const HLL_MERGE_NAME: &str = "hll_merge";
const DEFAULT_PRECISION: u8 = 14;
pub(crate) type HllStateType = HyperLogLogPlus<String, FixedRandomState>;
pub struct HllState {
hll: HllStateType,
}
impl std::fmt::Debug for HllState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "HllState<Opaque>")
}
}
impl Default for HllState {
fn default() -> Self {
Self::new()
}
}
impl HllState {
pub fn new() -> Self {
Self {
// Safety: the DEFAULT_PRECISION is fixed and valid
hll: HllStateType::new(DEFAULT_PRECISION, FixedRandomState::new()).unwrap(),
}
}
/// Create a UDF for the `hll` function.
///
/// `hll` accepts a string column and aggregates the
/// values into a HyperLogLog state.
pub fn state_udf_impl() -> AggregateUDF {
create_udaf(
HLL_NAME,
vec![DataType::Utf8],
Arc::new(DataType::Binary),
Volatility::Immutable,
Arc::new(Self::create_accumulator),
Arc::new(vec![DataType::Binary]),
)
}
/// Create a UDF for the `hll_merge` function.
///
/// `hll_merge` accepts a binary column of states generated by `hll`
/// and merges them into a single state.
pub fn merge_udf_impl() -> AggregateUDF {
create_udaf(
HLL_MERGE_NAME,
vec![DataType::Binary],
Arc::new(DataType::Binary),
Volatility::Immutable,
Arc::new(Self::create_merge_accumulator),
Arc::new(vec![DataType::Binary]),
)
}
fn update(&mut self, value: &str) {
self.hll.insert(value);
}
fn merge(&mut self, raw: &[u8]) {
if let Ok(serialized) = bincode::deserialize::<HllStateType>(raw) {
if let Ok(()) = self.hll.merge(&serialized) {
return;
}
}
trace!("Warning: Failed to merge HyperLogLog from {:?}", raw);
}
fn create_accumulator(acc_args: AccumulatorArgs) -> DfResult<Box<dyn DfAccumulator>> {
let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
match data_type {
DataType::Utf8 => Ok(Box::new(HllState::new())),
other => not_impl_err!("{HLL_NAME} does not support data type: {other}"),
}
}
fn create_merge_accumulator(acc_args: AccumulatorArgs) -> DfResult<Box<dyn DfAccumulator>> {
let data_type = acc_args.exprs[0].data_type(acc_args.schema)?;
match data_type {
DataType::Binary => Ok(Box::new(HllState::new())),
other => not_impl_err!("{HLL_MERGE_NAME} does not support data type: {other}"),
}
}
}
impl DfAccumulator for HllState {
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
let array = &values[0];
match array.data_type() {
DataType::Utf8 => {
let string_array = as_string_array(array)?;
for value in string_array.iter().flatten() {
self.update(value);
}
}
DataType::Binary => {
let binary_array = as_binary_array(array)?;
for v in binary_array.iter().flatten() {
self.merge(v);
}
}
_ => {
return not_impl_err!(
"HLL functions do not support data type: {}",
array.data_type()
)
}
}
Ok(())
}
fn evaluate(&mut self) -> DfResult<ScalarValue> {
Ok(ScalarValue::Binary(Some(
bincode::serialize(&self.hll).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize HyperLogLog: {}", e))
})?,
)))
}
fn size(&self) -> usize {
std::mem::size_of_val(&self.hll)
}
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
Ok(vec![ScalarValue::Binary(Some(
bincode::serialize(&self.hll).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize HyperLogLog: {}", e))
})?,
))])
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
let array = &states[0];
let binary_array = as_binary_array(array)?;
for v in binary_array.iter().flatten() {
self.merge(v);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use datafusion::arrow::array::{BinaryArray, StringArray};
use super::*;
#[test]
fn test_hll_basic() {
let mut state = HllState::new();
state.update("1");
state.update("2");
state.update("3");
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
assert_eq!(hll.count().trunc() as u32, 3);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_hll_roundtrip() {
let mut state = HllState::new();
state.update("1");
state.update("2");
// Serialize
let serialized = state.evaluate().unwrap();
// Create new state and merge the serialized data
let mut new_state = HllState::new();
if let ScalarValue::Binary(Some(bytes)) = &serialized {
new_state.merge(bytes);
// Verify the merged state matches original
let result = new_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(new_bytes)) = result {
let mut original: HllStateType = bincode::deserialize(bytes).unwrap();
let mut merged: HllStateType = bincode::deserialize(&new_bytes).unwrap();
assert_eq!(original.count(), merged.count());
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_hll_batch_update() {
let mut state = HllState::new();
// Test string values
let str_values = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i"];
let str_array = Arc::new(StringArray::from(str_values)) as ArrayRef;
state.update_batch(&[str_array]).unwrap();
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
assert_eq!(hll.count().trunc() as u32, 9);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_hll_merge_batch() {
let mut state1 = HllState::new();
state1.update("1");
let state1_binary = state1.evaluate().unwrap();
let mut state2 = HllState::new();
state2.update("2");
let state2_binary = state2.evaluate().unwrap();
let mut merged_state = HllState::new();
if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
(&state1_binary, &state2_binary)
{
let binary_array = Arc::new(BinaryArray::from(vec![
bytes1.as_slice(),
bytes2.as_slice(),
])) as ArrayRef;
merged_state.merge_batch(&[binary_array]).unwrap();
let result = merged_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
assert_eq!(hll.count().trunc() as u32, 2);
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar values");
}
}
#[test]
fn test_hll_merge_function() {
// Create two HLL states with different values
let mut state1 = HllState::new();
state1.update("1");
state1.update("2");
let state1_binary = state1.evaluate().unwrap();
let mut state2 = HllState::new();
state2.update("2");
state2.update("3");
let state2_binary = state2.evaluate().unwrap();
// Create a merge state and merge both states
let mut merge_state = HllState::new();
if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
(&state1_binary, &state2_binary)
{
let binary_array = Arc::new(BinaryArray::from(vec![
bytes1.as_slice(),
bytes2.as_slice(),
])) as ArrayRef;
merge_state.update_batch(&[binary_array]).unwrap();
let result = merge_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let mut hll: HllStateType = bincode::deserialize(&bytes).unwrap();
// Should have 3 unique values: "1", "2", "3"
assert_eq!(hll.count().trunc() as u32, 3);
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar values");
}
}
}

View File

@@ -1,307 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_query::prelude::*;
use common_telemetry::trace;
use datafusion::common::cast::{as_binary_array, as_primitive_array};
use datafusion::common::not_impl_err;
use datafusion::error::{DataFusionError, Result as DfResult};
use datafusion::logical_expr::function::AccumulatorArgs;
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF};
use datafusion::physical_plan::expressions::Literal;
use datafusion::prelude::create_udaf;
use datatypes::arrow::array::ArrayRef;
use datatypes::arrow::datatypes::{DataType, Float64Type};
use uddsketch::{SketchHashKey, UDDSketch};
pub const UDDSKETCH_STATE_NAME: &str = "uddsketch_state";
#[derive(Debug)]
pub struct UddSketchState {
uddsketch: UDDSketch,
}
impl UddSketchState {
pub fn new(bucket_size: u64, error_rate: f64) -> Self {
Self {
uddsketch: UDDSketch::new(bucket_size, error_rate),
}
}
pub fn udf_impl() -> AggregateUDF {
create_udaf(
UDDSKETCH_STATE_NAME,
vec![DataType::Int64, DataType::Float64, DataType::Float64],
Arc::new(DataType::Binary),
Volatility::Immutable,
Arc::new(|args| {
let (bucket_size, error_rate) = downcast_accumulator_args(args)?;
Ok(Box::new(UddSketchState::new(bucket_size, error_rate)))
}),
Arc::new(vec![DataType::Binary]),
)
}
fn update(&mut self, value: f64) {
self.uddsketch.add_value(value);
}
fn merge(&mut self, raw: &[u8]) {
if let Ok(uddsketch) = bincode::deserialize::<UDDSketch>(raw) {
if uddsketch.count() != 0 {
self.uddsketch.merge_sketch(&uddsketch);
}
} else {
trace!("Warning: Failed to deserialize UDDSketch from {:?}", raw);
}
}
}
fn downcast_accumulator_args(args: AccumulatorArgs) -> DfResult<(u64, f64)> {
let bucket_size = match args.exprs[0]
.as_any()
.downcast_ref::<Literal>()
.map(|lit| lit.value())
{
Some(ScalarValue::Int64(Some(value))) => *value as u64,
_ => {
return not_impl_err!(
"{} not supported for bucket size: {}",
UDDSKETCH_STATE_NAME,
&args.exprs[0]
)
}
};
let error_rate = match args.exprs[1]
.as_any()
.downcast_ref::<Literal>()
.map(|lit| lit.value())
{
Some(ScalarValue::Float64(Some(value))) => *value,
_ => {
return not_impl_err!(
"{} not supported for error rate: {}",
UDDSKETCH_STATE_NAME,
&args.exprs[1]
)
}
};
Ok((bucket_size, error_rate))
}
impl DfAccumulator for UddSketchState {
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
let array = &values[2]; // the third column is data value
let f64_array = as_primitive_array::<Float64Type>(array)?;
for v in f64_array.iter().flatten() {
self.update(v);
}
Ok(())
}
fn evaluate(&mut self) -> DfResult<ScalarValue> {
Ok(ScalarValue::Binary(Some(
bincode::serialize(&self.uddsketch).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize UDDSketch: {}", e))
})?,
)))
}
fn size(&self) -> usize {
// Base size of UDDSketch struct fields
let mut total_size = std::mem::size_of::<f64>() * 3 + // alpha, gamma, values_sum
std::mem::size_of::<u32>() + // compactions
std::mem::size_of::<u64>() * 2; // max_buckets, num_values
// Size of buckets (SketchHashMap)
// Each bucket entry contains:
// - SketchHashKey (enum with i64/Zero/Invalid variants)
// - SketchHashEntry (count: u64, next: SketchHashKey)
let bucket_entry_size = std::mem::size_of::<SketchHashKey>() + // key
std::mem::size_of::<u64>() + // count
std::mem::size_of::<SketchHashKey>(); // next
total_size += self.uddsketch.current_buckets_count() * bucket_entry_size;
total_size
}
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
Ok(vec![ScalarValue::Binary(Some(
bincode::serialize(&self.uddsketch).map_err(|e| {
DataFusionError::Internal(format!("Failed to serialize UDDSketch: {}", e))
})?,
))])
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
let array = &states[0];
let binary_array = as_binary_array(array)?;
for v in binary_array.iter().flatten() {
self.merge(v);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use datafusion::arrow::array::{BinaryArray, Float64Array};
use super::*;
#[test]
fn test_uddsketch_state_basic() {
let mut state = UddSketchState::new(10, 0.01);
state.update(1.0);
state.update(2.0);
state.update(3.0);
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
assert_eq!(deserialized.count(), 3);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_uddsketch_state_roundtrip() {
let mut state = UddSketchState::new(10, 0.01);
state.update(1.0);
state.update(2.0);
// Serialize
let serialized = state.evaluate().unwrap();
// Create new state and merge the serialized data
let mut new_state = UddSketchState::new(10, 0.01);
if let ScalarValue::Binary(Some(bytes)) = &serialized {
new_state.merge(bytes);
// Verify the merged state matches original by comparing deserialized values
let original_sketch: UDDSketch = bincode::deserialize(bytes).unwrap();
let new_result = new_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(new_bytes)) = new_result {
let new_sketch: UDDSketch = bincode::deserialize(&new_bytes).unwrap();
assert_eq!(original_sketch.count(), new_sketch.count());
assert_eq!(original_sketch.sum(), new_sketch.sum());
assert_eq!(original_sketch.mean(), new_sketch.mean());
assert_eq!(original_sketch.max_error(), new_sketch.max_error());
// Compare a few quantiles to ensure statistical equivalence
for q in [0.1, 0.5, 0.9].iter() {
assert!(
(original_sketch.estimate_quantile(*q) - new_sketch.estimate_quantile(*q))
.abs()
< 1e-10,
"Quantile {} mismatch: original={}, new={}",
q,
original_sketch.estimate_quantile(*q),
new_sketch.estimate_quantile(*q)
);
}
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_uddsketch_state_batch_update() {
let mut state = UddSketchState::new(10, 0.01);
let values = vec![1.0f64, 2.0, 3.0];
let array = Arc::new(Float64Array::from(values)) as ArrayRef;
state
.update_batch(&[array.clone(), array.clone(), array])
.unwrap();
let result = state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
assert_eq!(deserialized.count(), 3);
} else {
panic!("Expected binary scalar value");
}
}
#[test]
fn test_uddsketch_state_merge_batch() {
let mut state1 = UddSketchState::new(10, 0.01);
state1.update(1.0);
let state1_binary = state1.evaluate().unwrap();
let mut state2 = UddSketchState::new(10, 0.01);
state2.update(2.0);
let state2_binary = state2.evaluate().unwrap();
let mut merged_state = UddSketchState::new(10, 0.01);
if let (ScalarValue::Binary(Some(bytes1)), ScalarValue::Binary(Some(bytes2))) =
(&state1_binary, &state2_binary)
{
let binary_array = Arc::new(BinaryArray::from(vec![
bytes1.as_slice(),
bytes2.as_slice(),
])) as ArrayRef;
merged_state.merge_batch(&[binary_array]).unwrap();
let result = merged_state.evaluate().unwrap();
if let ScalarValue::Binary(Some(bytes)) = result {
let deserialized: UDDSketch = bincode::deserialize(&bytes).unwrap();
assert_eq!(deserialized.count(), 2);
} else {
panic!("Expected binary scalar value");
}
} else {
panic!("Expected binary scalar values");
}
}
#[test]
fn test_uddsketch_state_size() {
let mut state = UddSketchState::new(10, 0.01);
let initial_size = state.size();
// Add some values to create buckets
state.update(1.0);
state.update(2.0);
state.update(3.0);
let size_with_values = state.size();
assert!(
size_with_values > initial_size,
"Size should increase after adding values: initial={}, with_values={}",
initial_size,
size_with_values
);
// Verify size increases with more buckets
state.update(10.0); // This should create a new bucket
assert!(
state.size() > size_with_values,
"Size should increase after adding new bucket: prev={}, new={}",
size_with_values,
state.size()
);
}
}

View File

@@ -22,12 +22,10 @@ use crate::function::{AsyncFunctionRef, FunctionRef};
use crate::scalars::aggregate::{AggregateFunctionMetaRef, AggregateFunctions};
use crate::scalars::date::DateFunction;
use crate::scalars::expression::ExpressionFunction;
use crate::scalars::hll_count::HllCalcFunction;
use crate::scalars::json::JsonFunction;
use crate::scalars::matches::MatchesFunction;
use crate::scalars::math::MathFunction;
use crate::scalars::timestamp::TimestampFunction;
use crate::scalars::uddsketch_calc::UddSketchCalcFunction;
use crate::scalars::vector::VectorFunction;
use crate::system::SystemFunction;
use crate::table::TableFunction;
@@ -107,8 +105,6 @@ pub static FUNCTION_REGISTRY: Lazy<Arc<FunctionRegistry>> = Lazy::new(|| {
TimestampFunction::register(&function_registry);
DateFunction::register(&function_registry);
ExpressionFunction::register(&function_registry);
UddSketchCalcFunction::register(&function_registry);
HllCalcFunction::register(&function_registry);
// Aggregate functions
AggregateFunctions::register(&function_registry);

View File

@@ -21,7 +21,6 @@ pub mod scalars;
mod system;
mod table;
pub mod aggr;
pub mod function;
pub mod function_registry;
pub mod handlers;

View File

@@ -22,9 +22,7 @@ pub mod matches;
pub mod math;
pub mod vector;
pub(crate) mod hll_count;
#[cfg(test)]
pub(crate) mod test;
pub(crate) mod timestamp;
pub(crate) mod uddsketch_calc;
pub mod udf;

View File

@@ -1,175 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Implementation of the scalar function `hll_count`.
use std::fmt;
use std::fmt::Display;
use std::sync::Arc;
use common_query::error::{DowncastVectorSnafu, InvalidFuncArgsSnafu, Result};
use common_query::prelude::{Signature, Volatility};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::Vector;
use datatypes::scalars::{ScalarVector, ScalarVectorBuilder};
use datatypes::vectors::{BinaryVector, MutableVector, UInt64VectorBuilder, VectorRef};
use hyperloglogplus::HyperLogLog;
use snafu::OptionExt;
use crate::aggr::HllStateType;
use crate::function::{Function, FunctionContext};
use crate::function_registry::FunctionRegistry;
const NAME: &str = "hll_count";
/// HllCalcFunction implements the scalar function `hll_count`.
///
/// It accepts one argument:
/// 1. The serialized HyperLogLogPlus state, as produced by the aggregator (binary).
///
/// For each row, it deserializes the sketch and returns the estimated cardinality.
#[derive(Debug, Default)]
pub struct HllCalcFunction;
impl HllCalcFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(HllCalcFunction));
}
}
impl Display for HllCalcFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for HllCalcFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::uint64_datatype())
}
fn signature(&self) -> Signature {
// Only argument: HyperLogLogPlus state (binary)
Signature::exact(
vec![ConcreteDataType::binary_datatype()],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
if columns.len() != 1 {
return InvalidFuncArgsSnafu {
err_msg: format!("hll_count expects 1 argument, got {}", columns.len()),
}
.fail();
}
let hll_vec = columns[0]
.as_any()
.downcast_ref::<BinaryVector>()
.with_context(|| DowncastVectorSnafu {
err_msg: format!("expect BinaryVector, got {}", columns[0].vector_type_name()),
})?;
let len = hll_vec.len();
let mut builder = UInt64VectorBuilder::with_capacity(len);
for i in 0..len {
let hll_opt = hll_vec.get_data(i);
if hll_opt.is_none() {
builder.push_null();
continue;
}
let hll_bytes = hll_opt.unwrap();
// Deserialize the HyperLogLogPlus from its bincode representation
let mut hll: HllStateType = match bincode::deserialize(hll_bytes) {
Ok(h) => h,
Err(e) => {
common_telemetry::trace!("Failed to deserialize HyperLogLogPlus: {}", e);
builder.push_null();
continue;
}
};
builder.push(Some(hll.count().round() as u64));
}
Ok(builder.to_vector())
}
}
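// Illustrative usage (hypothetical table/column names, inferred from the
// signature above rather than taken from this repository's docs): given a
// BINARY column `state` holding serialized HyperLogLogPlus sketches,
// `SELECT hll_count(state) FROM sketches` yields the estimated distinct
// count per row as a UINT64.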
#[cfg(test)]
mod tests {
use datatypes::vectors::BinaryVector;
use super::*;
use crate::utils::FixedRandomState;
#[test]
fn test_hll_count_function() {
let function = HllCalcFunction;
assert_eq!("hll_count", function.name());
assert_eq!(
ConcreteDataType::uint64_datatype(),
function
.return_type(&[ConcreteDataType::uint64_datatype()])
.unwrap()
);
// Create a test HLL
let mut hll = HllStateType::new(14, FixedRandomState::new()).unwrap();
for i in 1..=10 {
hll.insert(&i.to_string());
}
let serialized_bytes = bincode::serialize(&hll).unwrap();
let args: Vec<VectorRef> = vec![Arc::new(BinaryVector::from(vec![Some(serialized_bytes)]))];
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 1);
// Test cardinality estimate
if let datatypes::value::Value::UInt64(v) = result.get(0) {
assert_eq!(v, 10);
} else {
panic!("Expected uint64 value");
}
}
#[test]
fn test_hll_count_function_errors() {
let function = HllCalcFunction;
// Test with invalid number of arguments
let args: Vec<VectorRef> = vec![];
let result = function.eval(FunctionContext::default(), &args);
assert!(result.is_err());
assert!(result
.unwrap_err()
.to_string()
.contains("hll_count expects 1 argument"));
// Test with invalid binary data
let args: Vec<VectorRef> = vec![Arc::new(BinaryVector::from(vec![Some(vec![1, 2, 3])]))]; // Invalid binary data
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 1);
assert!(matches!(result.get(0), datatypes::value::Value::Null));
}
}

View File

@@ -1,211 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Implementation of the scalar function `uddsketch_calc`.
use std::fmt;
use std::fmt::Display;
use std::sync::Arc;
use common_query::error::{DowncastVectorSnafu, InvalidFuncArgsSnafu, Result};
use common_query::prelude::{Signature, Volatility};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::Vector;
use datatypes::scalars::{ScalarVector, ScalarVectorBuilder};
use datatypes::vectors::{BinaryVector, Float64VectorBuilder, MutableVector, VectorRef};
use snafu::OptionExt;
use uddsketch::UDDSketch;
use crate::function::{Function, FunctionContext};
use crate::function_registry::FunctionRegistry;
const NAME: &str = "uddsketch_calc";
/// UddSketchCalcFunction implements the scalar function `uddsketch_calc`.
///
/// It accepts two arguments:
/// 1. A percentile (as f64) for which to compute the estimated quantile (e.g. 0.95 for p95).
/// 2. The serialized UDDSketch state, as produced by the aggregator (binary).
///
/// For each row, it deserializes the sketch and returns the computed quantile value.
#[derive(Debug, Default)]
pub struct UddSketchCalcFunction;
impl UddSketchCalcFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(UddSketchCalcFunction));
}
}
impl Display for UddSketchCalcFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for UddSketchCalcFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
Ok(ConcreteDataType::float64_datatype())
}
fn signature(&self) -> Signature {
// First argument: percentile (float64)
// Second argument: UDDSketch state (binary)
Signature::exact(
vec![
ConcreteDataType::float64_datatype(),
ConcreteDataType::binary_datatype(),
],
Volatility::Immutable,
)
}
fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
if columns.len() != 2 {
return InvalidFuncArgsSnafu {
err_msg: format!("uddsketch_calc expects 2 arguments, got {}", columns.len()),
}
.fail();
}
let perc_vec = &columns[0];
let sketch_vec = columns[1]
.as_any()
.downcast_ref::<BinaryVector>()
.with_context(|| DowncastVectorSnafu {
err_msg: format!("expect BinaryVector, got {}", columns[1].vector_type_name()),
})?;
let len = sketch_vec.len();
let mut builder = Float64VectorBuilder::with_capacity(len);
for i in 0..len {
let perc_opt = perc_vec.get(i).as_f64_lossy();
let sketch_opt = sketch_vec.get_data(i);
if sketch_opt.is_none() || perc_opt.is_none() {
builder.push_null();
continue;
}
let sketch_bytes = sketch_opt.unwrap();
let perc = perc_opt.unwrap();
// Deserialize the UDDSketch from its bincode representation
let sketch: UDDSketch = match bincode::deserialize(sketch_bytes) {
Ok(s) => s,
Err(e) => {
common_telemetry::trace!("Failed to deserialize UDDSketch: {}", e);
builder.push_null();
continue;
}
};
// Compute the estimated quantile from the sketch
let result = sketch.estimate_quantile(perc);
builder.push(Some(result));
}
Ok(builder.to_vector())
}
}
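// Illustrative usage (hypothetical table/column names, inferred from the
// signature above): `SELECT uddsketch_calc(0.95, state) FROM sketches`
// estimates the p95 from each serialized UDDSketch in the BINARY column
// `state`, returning a FLOAT64.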
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datatypes::vectors::{BinaryVector, Float64Vector};
use super::*;
#[test]
fn test_uddsketch_calc_function() {
let function = UddSketchCalcFunction;
assert_eq!("uddsketch_calc", function.name());
assert_eq!(
ConcreteDataType::float64_datatype(),
function
.return_type(&[ConcreteDataType::float64_datatype()])
.unwrap()
);
// Create a test sketch
let mut sketch = UDDSketch::new(128, 0.01);
sketch.add_value(10.0);
sketch.add_value(20.0);
sketch.add_value(30.0);
sketch.add_value(40.0);
sketch.add_value(50.0);
sketch.add_value(60.0);
sketch.add_value(70.0);
sketch.add_value(80.0);
sketch.add_value(90.0);
sketch.add_value(100.0);
// Get expected values directly from the sketch
let expected_p50 = sketch.estimate_quantile(0.5);
let expected_p90 = sketch.estimate_quantile(0.9);
let expected_p95 = sketch.estimate_quantile(0.95);
let serialized = bincode::serialize(&sketch).unwrap();
let percentiles = vec![0.5, 0.9, 0.95];
let args: Vec<VectorRef> = vec![
Arc::new(Float64Vector::from_vec(percentiles.clone())),
Arc::new(BinaryVector::from(vec![Some(serialized.clone()); 3])),
];
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 3);
// Test median (p50)
assert!(
matches!(result.get(0), datatypes::value::Value::Float64(v) if (v - expected_p50).abs() < 1e-10)
);
// Test p90
assert!(
matches!(result.get(1), datatypes::value::Value::Float64(v) if (v - expected_p90).abs() < 1e-10)
);
// Test p95
assert!(
matches!(result.get(2), datatypes::value::Value::Float64(v) if (v - expected_p95).abs() < 1e-10)
);
}
#[test]
fn test_uddsketch_calc_function_errors() {
let function = UddSketchCalcFunction;
// Test with invalid number of arguments
let args: Vec<VectorRef> = vec![Arc::new(Float64Vector::from_vec(vec![0.95]))];
let result = function.eval(FunctionContext::default(), &args);
assert!(result.is_err());
assert!(result
.unwrap_err()
.to_string()
.contains("uddsketch_calc expects 2 arguments"));
// Test with invalid binary data
let args: Vec<VectorRef> = vec![
Arc::new(Float64Vector::from_vec(vec![0.95])),
Arc::new(BinaryVector::from(vec![Some(vec![1, 2, 3])])), // Invalid binary data
];
let result = function.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(result.len(), 1);
assert!(matches!(result.get(0), datatypes::value::Value::Null));
}
}

View File

@@ -12,11 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::hash::BuildHasher;
use ahash::RandomState;
use serde::{Deserialize, Serialize};
/// Escapes special characters in the provided pattern string for `LIKE`.
///
/// Specifically, it prefixes the backslash (`\`), percent (`%`), and underscore (`_`)
@@ -37,71 +32,6 @@ pub fn escape_like_pattern(pattern: &str) -> String {
})
.collect::<String>()
}
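// e.g. escape_like_pattern("100%_done") returns the Rust string
// "100\\%\\_done", i.e. `100\%\_done` (an illustrative note assuming
// backslash as the escape character, not part of the original docs).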
/// A random state with fixed seeds.
///
/// This is used to ensure that the hash values are consistent across
/// different processes and that the state is easy to serialize and deserialize.
#[derive(Debug)]
pub struct FixedRandomState {
state: RandomState,
}
impl FixedRandomState {
// some random seeds
const RANDOM_SEED_0: u64 = 0x517cc1b727220a95;
const RANDOM_SEED_1: u64 = 0x428a2f98d728ae22;
const RANDOM_SEED_2: u64 = 0x7137449123ef65cd;
const RANDOM_SEED_3: u64 = 0xb5c0fbcfec4d3b2f;
pub fn new() -> Self {
Self {
state: ahash::RandomState::with_seeds(
Self::RANDOM_SEED_0,
Self::RANDOM_SEED_1,
Self::RANDOM_SEED_2,
Self::RANDOM_SEED_3,
),
}
}
}
impl Default for FixedRandomState {
fn default() -> Self {
Self::new()
}
}
impl BuildHasher for FixedRandomState {
type Hasher = ahash::AHasher;
fn build_hasher(&self) -> Self::Hasher {
self.state.build_hasher()
}
fn hash_one<T: std::hash::Hash>(&self, x: T) -> u64 {
self.state.hash_one(x)
}
}
impl Serialize for FixedRandomState {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.serialize_unit()
}
}
impl<'de> Deserialize<'de> for FixedRandomState {
fn deserialize<D>(_deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
Ok(Self::new())
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use futures::future::BoxFuture;
use futures::TryStreamExt;
use moka::future::Cache;
use moka::ops::compute::Op;
use table::metadata::TableId;
@@ -53,13 +54,9 @@ fn init_factory(table_flow_manager: TableFlowManagerRef) -> Initializer<TableId,
Box::pin(async move {
table_flow_manager
.flows(table_id)
.map_ok(|(key, value)| (key.flownode_id(), value.peer))
.try_collect::<HashMap<_, _>>()
.await
.map(|flows| {
flows
.into_iter()
.map(|(key, value)| (key.flownode_id(), value.peer))
.collect::<HashMap<_, _>>()
})
// We must cache the `HashMap` even if it's empty,
// to avoid hitting the remote storage again next time;
// If the value is added to the remote storage,

View File

@@ -57,10 +57,12 @@ pub trait ClusterInfo {
}
/// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-{cluster_id}-{role}-{node_id}`.
///
/// This key cannot be used to describe the `Metasrv` because the `Metasrv` does not have
/// a `cluster_id`; it serves multiple clusters.
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct NodeInfoKey {
/// The cluster id.
// todo(hl): remove cluster_id as it is not assigned anywhere.
pub cluster_id: ClusterId,
/// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`.
pub role: Role,
@@ -230,8 +232,8 @@ impl TryFrom<Vec<u8>> for NodeInfoKey {
}
}
impl From<&NodeInfoKey> for Vec<u8> {
fn from(key: &NodeInfoKey) -> Self {
impl From<NodeInfoKey> for Vec<u8> {
fn from(key: NodeInfoKey) -> Self {
format!(
"{}-{}-{}-{}",
CLUSTER_NODE_INFO_PREFIX,
@@ -313,7 +315,7 @@ mod tests {
node_id: 2,
};
let key_bytes: Vec<u8> = (&key).into();
let key_bytes: Vec<u8> = key.into();
let new_key: NodeInfoKey = key_bytes.try_into().unwrap();
assert_eq!(1, new_key.cluster_id);

View File

@@ -15,7 +15,6 @@
mod metadata;
use std::collections::BTreeMap;
use std::fmt;
use api::v1::flow::flow_request::Body as PbFlowRequest;
use api::v1::flow::{CreateRequest, FlowRequest, FlowRequestHeader};
@@ -29,6 +28,7 @@ use common_procedure::{
use common_telemetry::info;
use common_telemetry::tracing_context::TracingContext;
use futures::future::join_all;
use futures::TryStreamExt;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
@@ -77,7 +77,6 @@ impl CreateFlowProcedure {
query_context,
state: CreateFlowState::Prepare,
prev_flow_info_value: None,
flow_type: None,
},
}
}
@@ -105,7 +104,7 @@ impl CreateFlowProcedure {
if create_if_not_exists && or_replace {
// this is forbidden because it's unclear what that combination would mean exactly
return error::UnsupportedSnafu {
operation: "Create flow with both `IF NOT EXISTS` and `OR REPLACE`",
operation: "Create flow with both `IF NOT EXISTS` and `OR REPLACE`".to_string(),
}
.fail();
}
@@ -130,10 +129,9 @@ impl CreateFlowProcedure {
.flow_metadata_manager
.flow_route_manager()
.routes(flow_id)
.await?
.into_iter()
.map(|(_, value)| value.peer)
.collect::<Vec<_>>();
.map_ok(|(_, value)| value.peer)
.try_collect::<Vec<_>>()
.await?;
self.data.flow_id = Some(flow_id);
self.data.peers = peers;
info!("Replacing flow, flow_id: {}", flow_id);
@@ -177,8 +175,6 @@ impl CreateFlowProcedure {
self.allocate_flow_id().await?;
}
self.data.state = CreateFlowState::CreateFlows;
// determine flow type
self.data.flow_type = Some(determine_flow_type(&self.data.task));
Ok(Status::executing(true))
}
@@ -313,11 +309,6 @@ impl Procedure for CreateFlowProcedure {
}
}
pub fn determine_flow_type(_flow_task: &CreateFlowTask) -> FlowType {
// TODO(discord9): determine flow type
FlowType::RecordingRule
}
/// The state of [CreateFlowProcedure].
#[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
pub enum CreateFlowState {
@@ -331,36 +322,6 @@ pub enum CreateFlowState {
CreateMetadata,
}
/// The type of flow.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum FlowType {
/// The flow is a recording rule task.
RecordingRule,
/// The flow is a streaming task.
Streaming,
}
impl FlowType {
pub const RECORDING_RULE: &str = "recording_rule";
pub const STREAMING: &str = "streaming";
pub const FLOW_TYPE_KEY: &str = "flow_type";
}
impl Default for FlowType {
fn default() -> Self {
Self::RecordingRule
}
}
impl fmt::Display for FlowType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FlowType::RecordingRule => write!(f, "{}", FlowType::RECORDING_RULE),
FlowType::Streaming => write!(f, "{}", FlowType::STREAMING),
}
}
}
/// The serializable data.
#[derive(Debug, Serialize, Deserialize)]
pub struct CreateFlowData {
@@ -374,7 +335,6 @@ pub struct CreateFlowData {
/// For verify if prev value is consistent when need to update flow metadata.
/// only set when `or_replace` is true.
pub(crate) prev_flow_info_value: Option<DeserializedValueWithBytes<FlowInfoValue>>,
pub(crate) flow_type: Option<FlowType>,
}
impl From<&CreateFlowData> for CreateRequest {
@@ -382,7 +342,7 @@ impl From<&CreateFlowData> for CreateRequest {
let flow_id = value.flow_id.unwrap();
let source_table_ids = &value.source_table_ids;
let mut req = CreateRequest {
CreateRequest {
flow_id: Some(api::v1::FlowId { id: flow_id }),
source_table_ids: source_table_ids
.iter()
@@ -396,12 +356,7 @@ impl From<&CreateFlowData> for CreateRequest {
comment: value.task.comment.clone(),
sql: value.task.sql.clone(),
flow_options: value.task.flow_options.clone(),
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
req.flow_options
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
req
}
}
}
@@ -414,7 +369,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
expire_after,
comment,
sql,
flow_options: mut options,
flow_options: options,
..
} = value.task.clone();
@@ -431,21 +386,19 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
.map(|(idx, peer)| (idx as u32, FlowRouteValue { peer: peer.clone() }))
.collect::<Vec<_>>();
let flow_type = value.flow_type.unwrap_or_default().to_string();
options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
let flow_info = FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),
sink_table_name,
flownode_ids,
catalog_name,
flow_name,
raw_sql: sql,
expire_after,
comment,
options,
};
(flow_info, flow_routes)
(
FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),
sink_table_name,
flownode_ids,
catalog_name,
flow_name,
raw_sql: sql,
expire_after,
comment,
options,
},
flow_routes,
)
}
}

View File

@@ -128,7 +128,7 @@ impl State for DropDatabaseExecutor {
.await?;
executor.invalidate_table_cache(ddl_ctx).await?;
executor
.on_drop_regions(ddl_ctx, &self.physical_region_routes, true)
.on_drop_regions(ddl_ctx, &self.physical_region_routes)
.await?;
info!("Table: {}({}) is dropped", self.table_name, self.table_id);

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use common_catalog::format_full_flow_name;
use futures::TryStreamExt;
use snafu::{ensure, OptionExt};
use crate::ddl::drop_flow::DropFlowProcedure;
@@ -38,10 +39,9 @@ impl DropFlowProcedure {
.flow_metadata_manager
.flow_route_manager()
.routes(self.data.task.flow_id)
.await?
.into_iter()
.map(|(_, value)| value)
.collect::<Vec<_>>();
.map_ok(|(_, value)| value)
.try_collect::<Vec<_>>()
.await?;
ensure!(
!flow_route_values.is_empty(),
error::FlowRouteNotFoundSnafu {

View File

@@ -156,7 +156,7 @@ impl DropTableProcedure {
pub async fn on_datanode_drop_regions(&mut self) -> Result<Status> {
self.executor
.on_drop_regions(&self.context, &self.data.physical_region_routes, false)
.on_drop_regions(&self.context, &self.data.physical_region_routes)
.await?;
self.data.state = DropTableState::DeleteTombstone;
Ok(Status::executing(true))

View File

@@ -214,7 +214,6 @@ impl DropTableExecutor {
&self,
ctx: &DdlContext,
region_routes: &[RegionRoute],
fast_path: bool,
) -> Result<()> {
let leaders = find_leaders(region_routes);
let mut drop_region_tasks = Vec::with_capacity(leaders.len());
@@ -237,7 +236,6 @@ impl DropTableExecutor {
}),
body: Some(region_request::Body::Drop(PbDropRegionRequest {
region_id: region_id.as_u64(),
fast_path,
})),
};
let datanode = datanode.clone();

View File

@@ -16,9 +16,9 @@ pub mod flow_info;
pub(crate) mod flow_name;
pub(crate) mod flow_route;
pub mod flow_state;
mod flownode_addr_helper;
pub(crate) mod flownode_flow;
pub(crate) mod table_flow;
use std::ops::Deref;
use std::sync::Arc;
@@ -506,6 +506,7 @@ mod tests {
let routes = flow_metadata_manager
.flow_route_manager()
.routes(flow_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
@@ -537,6 +538,7 @@ mod tests {
let nodes = flow_metadata_manager
.table_flow_manager()
.flows(table_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
@@ -725,6 +727,7 @@ mod tests {
let routes = flow_metadata_manager
.flow_route_manager()
.routes(flow_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(
@@ -756,6 +759,7 @@ mod tests {
let nodes = flow_metadata_manager
.table_flow_manager()
.flows(table_id)
.try_collect::<Vec<_>>()
.await
.unwrap();
assert_eq!(

View File

@@ -12,15 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use futures::TryStreamExt;
use futures::stream::BoxStream;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::OptionExt;
use crate::error::{self, Result};
use crate::key::flow::{flownode_addr_helper, FlowScoped};
use crate::key::node_address::NodeAddressKey;
use crate::key::flow::FlowScoped;
use crate::key::{BytesAdapter, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
@@ -168,7 +167,10 @@ impl FlowRouteManager {
}
/// Retrieves all [FlowRouteValue]s of the specified `flow_id`.
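///
/// The routes are returned as a stream; callers collect it themselves, e.g.
/// `manager.routes(flow_id).try_collect::<Vec<_>>().await`, as the updated
/// tests in this change do.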
pub async fn routes(&self, flow_id: FlowId) -> Result<Vec<(FlowRouteKey, FlowRouteValue)>> {
pub fn routes(
&self,
flow_id: FlowId,
) -> BoxStream<'static, Result<(FlowRouteKey, FlowRouteValue)>> {
let start_key = FlowRouteKey::range_start_key(flow_id);
let req = RangeRequest::new().with_prefix(start_key);
let stream = PaginationStream::new(
@@ -179,9 +181,7 @@ impl FlowRouteManager {
)
.into_stream();
let mut res = stream.try_collect::<Vec<_>>().await?;
self.remap_flow_route_addresses(&mut res).await?;
Ok(res)
Box::pin(stream)
}
/// Builds a create flow routes transaction.
@@ -203,28 +203,6 @@ impl FlowRouteManager {
Ok(Txn::new().and_then(txns))
}
async fn remap_flow_route_addresses(
&self,
flow_routes: &mut [(FlowRouteKey, FlowRouteValue)],
) -> Result<()> {
let keys = flow_routes
.iter()
.map(|(_, value)| NodeAddressKey::with_flownode(value.peer.id))
.collect();
let flow_node_addrs =
flownode_addr_helper::get_flownode_addresses(&self.kv_backend, keys).await?;
for (_, flow_route_value) in flow_routes.iter_mut() {
let flownode_id = flow_route_value.peer.id;
// If an id lacks a corresponding address in the `flow_node_addrs`,
// it means the old address in `table_flow_value` is still valid,
// which is expected.
if let Some(node_addr) = flow_node_addrs.get(&flownode_id) {
flow_route_value.peer.addr = node_addr.peer.addr.clone();
}
}
Ok(())
}
}
#[cfg(test)]

View File

@@ -1,47 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use crate::error::Result;
use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
use crate::key::{MetadataKey, MetadataValue};
use crate::kv_backend::KvBackendRef;
use crate::rpc::store::BatchGetRequest;
/// Get the addresses of the flownodes.
/// The result is a map: node_id -> NodeAddressValue
pub(crate) async fn get_flownode_addresses(
kv_backend: &KvBackendRef,
keys: Vec<NodeAddressKey>,
) -> Result<HashMap<u64, NodeAddressValue>> {
if keys.is_empty() {
return Ok(HashMap::default());
}
let req = BatchGetRequest {
keys: keys.into_iter().map(|k| k.to_bytes()).collect(),
};
kv_backend
.batch_get(req)
.await?
.kvs
.into_iter()
.map(|kv| {
let key = NodeAddressKey::from_bytes(&kv.key)?;
let value = NodeAddressValue::try_from_raw_value(&kv.value)?;
Ok((key.node_id, value))
})
.collect()
}

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use futures::TryStreamExt;
use futures::stream::BoxStream;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
@@ -22,8 +22,7 @@ use snafu::OptionExt;
use table::metadata::TableId;
use crate::error::{self, Result};
use crate::key::flow::{flownode_addr_helper, FlowScoped};
use crate::key::node_address::NodeAddressKey;
use crate::key::flow::FlowScoped;
use crate::key::{BytesAdapter, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
use crate::kv_backend::txn::{Txn, TxnOp};
use crate::kv_backend::KvBackendRef;
@@ -197,7 +196,10 @@ impl TableFlowManager {
/// Retrieves all [TableFlowKey]s of the specified `table_id`.
///
/// TODO(discord9): add a cache for this, since range requests do not support caching.
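///
/// The flows are returned as a stream; callers collect it themselves, e.g.
/// `manager.flows(table_id).try_collect::<Vec<_>>().await`.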
pub async fn flows(&self, table_id: TableId) -> Result<Vec<(TableFlowKey, TableFlowValue)>> {
pub fn flows(
&self,
table_id: TableId,
) -> BoxStream<'static, Result<(TableFlowKey, TableFlowValue)>> {
let start_key = TableFlowKey::range_start_key(table_id);
let req = RangeRequest::new().with_prefix(start_key);
let stream = PaginationStream::new(
@@ -208,9 +210,7 @@ impl TableFlowManager {
)
.into_stream();
let mut res = stream.try_collect::<Vec<_>>().await?;
self.remap_table_flow_addresses(&mut res).await?;
Ok(res)
Box::pin(stream)
}
/// Builds a create table flow transaction.
@@ -238,28 +238,6 @@ impl TableFlowManager {
Ok(Txn::new().and_then(txns))
}
async fn remap_table_flow_addresses(
&self,
table_flows: &mut [(TableFlowKey, TableFlowValue)],
) -> Result<()> {
let keys = table_flows
.iter()
.map(|(_, value)| NodeAddressKey::with_flownode(value.peer.id))
.collect::<Vec<_>>();
let flownode_addrs =
flownode_addr_helper::get_flownode_addresses(&self.kv_backend, keys).await?;
for (_, table_flow_value) in table_flows.iter_mut() {
let flownode_id = table_flow_value.peer.id;
// If an id lacks a corresponding address in the `flow_node_addrs`,
// it means the old address in `table_flow_value` is still valid,
// which is expected.
if let Some(flownode_addr) = flownode_addrs.get(&flownode_id) {
table_flow_value.peer.addr = flownode_addr.peer.addr.clone();
}
}
Ok(())
}
}
#[cfg(test)]

View File

@@ -39,10 +39,6 @@ impl NodeAddressKey {
pub fn with_datanode(node_id: u64) -> Self {
Self::new(Role::Datanode, node_id)
}
pub fn with_flownode(node_id: u64) -> Self {
Self::new(Role::Flownode, node_id)
}
}
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]

View File

@@ -34,7 +34,6 @@ pub mod kv_backend;
pub mod leadership_notifier;
pub mod lock_key;
pub mod metrics;
pub mod node_expiry_listener;
pub mod node_manager;
pub mod peer;
pub mod range_stream;

View File

@@ -1,152 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Mutex;
use std::time::Duration;
use common_telemetry::{debug, error, info, warn};
use tokio::task::JoinHandle;
use tokio::time::{interval, MissedTickBehavior};
use crate::cluster::{NodeInfo, NodeInfoKey};
use crate::error;
use crate::kv_backend::ResettableKvBackendRef;
use crate::leadership_notifier::LeadershipChangeListener;
use crate::rpc::store::RangeRequest;
use crate::rpc::KeyValue;
/// [NodeExpiryListener] periodically checks all node info in memory and removes
/// expired node info to prevent memory leaks.
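///
/// The cleanup runs every 60 seconds (see `clean_expired_nodes`) and only
/// while this node holds leadership.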
pub struct NodeExpiryListener {
handle: Mutex<Option<JoinHandle<()>>>,
max_idle_time: Duration,
in_memory: ResettableKvBackendRef,
}
impl Drop for NodeExpiryListener {
fn drop(&mut self) {
self.stop();
}
}
impl NodeExpiryListener {
pub fn new(max_idle_time: Duration, in_memory: ResettableKvBackendRef) -> Self {
Self {
handle: Mutex::new(None),
max_idle_time,
in_memory,
}
}
async fn start(&self) {
let mut handle = self.handle.lock().unwrap();
if handle.is_none() {
let in_memory = self.in_memory.clone();
let max_idle_time = self.max_idle_time;
let ticker_loop = tokio::spawn(async move {
// Run clean task every minute.
let mut interval = interval(Duration::from_secs(60));
interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
loop {
interval.tick().await;
if let Err(e) = Self::clean_expired_nodes(&in_memory, max_idle_time).await {
error!(e; "Failed to clean expired node");
}
}
});
*handle = Some(ticker_loop);
}
}
fn stop(&self) {
if let Some(handle) = self.handle.lock().unwrap().take() {
handle.abort();
info!("Node expiry listener stopped")
}
}
/// Cleans expired nodes from memory.
async fn clean_expired_nodes(
in_memory: &ResettableKvBackendRef,
max_idle_time: Duration,
) -> error::Result<()> {
let node_keys = Self::list_expired_nodes(in_memory, max_idle_time).await?;
for key in node_keys {
let key_bytes: Vec<u8> = (&key).into();
if let Err(e) = in_memory.delete(&key_bytes, false).await {
warn!(e; "Failed to delete expired node: {:?}", key_bytes);
} else {
debug!("Deleted expired node key: {:?}", key);
}
}
Ok(())
}
/// Lists expired nodes that have been inactive more than `max_idle_time`.
async fn list_expired_nodes(
in_memory: &ResettableKvBackendRef,
max_idle_time: Duration,
) -> error::Result<impl Iterator<Item = NodeInfoKey>> {
let prefix = NodeInfoKey::key_prefix_with_cluster_id(0);
let req = RangeRequest::new().with_prefix(prefix);
let current_time_millis = common_time::util::current_time_millis();
let resp = in_memory.range(req).await?;
Ok(resp
.kvs
.into_iter()
.filter_map(move |KeyValue { key, value }| {
let Ok(info) = NodeInfo::try_from(value).inspect_err(|e| {
warn!(e; "Unrecognized node info value");
}) else {
return None;
};
if (current_time_millis - info.last_activity_ts) > max_idle_time.as_millis() as i64
{
NodeInfoKey::try_from(key)
.inspect_err(|e| {
warn!(e; "Unrecognized node info key: {:?}", info.peer);
})
.ok()
.inspect(|node_key| {
debug!("Found expired node: {:?}", node_key);
})
} else {
None
}
}))
}
}
#[async_trait::async_trait]
impl LeadershipChangeListener for NodeExpiryListener {
fn name(&self) -> &str {
"NodeExpiryListener"
}
async fn on_leader_start(&self) -> error::Result<()> {
self.start().await;
info!(
"On leader start, node expiry listener started with max idle time: {:?}",
self.max_idle_time
);
Ok(())
}
async fn on_leader_stop(&self) -> error::Result<()> {
self.stop();
info!("On leader stop, node expiry listener stopped");
Ok(())
}
}

View File

@@ -1218,10 +1218,7 @@ mod tests {
);
let response = mock_region_server
.handle_request(
region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.unwrap();
assert_eq!(response.affected_rows, 0);
@@ -1313,10 +1310,7 @@ mod tests {
.insert(region_id, RegionEngineWithStatus::Ready(engine.clone()));
mock_region_server
.handle_request(
region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.unwrap_err();

View File

@@ -32,5 +32,5 @@ pub mod types;
pub mod value;
pub mod vectors;
pub use arrow::{self, compute};
pub use arrow;
pub use error::{Error, Result};

View File

@@ -16,7 +16,6 @@ async-trait.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
chrono.workspace = true
client.workspace = true
common-base.workspace = true
common-config.workspace = true

View File

@@ -49,13 +49,12 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
use crate::adapter::refill::RefillTask;
use crate::adapter::table_source::ManagedTableSource;
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
use crate::compute::ErrCollector;
use crate::df_optimizer::sql_to_flow_plan;
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
use crate::expr::Batch;
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::recording_rules::RecordingRuleEngine;
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
mod flownode_impl;
@@ -64,7 +63,7 @@ pub(crate) mod refill;
mod stat;
#[cfg(test)]
mod tests;
pub(crate) mod util;
mod util;
mod worker;
pub(crate) mod node_context;
@@ -172,8 +171,6 @@ pub struct FlowWorkerManager {
flush_lock: RwLock<()>,
/// receive a oneshot sender to send state size report
state_report_handler: RwLock<Option<StateReportHandler>>,
/// engine for recording rule
rule_engine: RecordingRuleEngine,
}
/// Building FlownodeManager
@@ -188,7 +185,6 @@ impl FlowWorkerManager {
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
rule_engine: RecordingRuleEngine,
) -> Self {
let srv_map = ManagedTableSource::new(
table_meta.table_info_manager().clone(),
@@ -211,7 +207,6 @@ impl FlowWorkerManager {
node_id,
flush_lock: RwLock::new(()),
state_report_handler: RwLock::new(None),
rule_engine,
}
}
@@ -220,6 +215,25 @@ impl FlowWorkerManager {
self
}
/// Create a flownode manager together with `num_workers` workers
pub fn new_with_workers<'s>(
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
num_workers: usize,
) -> (Self, Vec<Worker<'s>>) {
let mut zelf = Self::new(node_id, query_engine, table_meta);
let workers: Vec<_> = (0..num_workers)
.map(|_| {
let (handle, worker) = create_worker();
zelf.add_worker_handle(handle);
worker
})
.collect();
(zelf, workers)
}
/// Add a worker handle to the manager, meaning the corresponding worker is under its management
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
self.worker_handles.push(handle);
@@ -737,11 +751,7 @@ pub struct CreateFlowArgs {
/// Create&Remove flow
impl FlowWorkerManager {
/// remove a flow by it's id
#[allow(unreachable_code)]
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.remove_flow(flow_id).await;
for handle in self.worker_handles.iter() {
if handle.contains_flow(flow_id).await? {
handle.remove_flow(flow_id).await?;
@@ -757,10 +767,8 @@ impl FlowWorkerManager {
/// steps to create task:
/// 1. parse query into typed plan(and optional parse expire_after expr)
/// 2. render source/sink with the output table id and the input table ids used
#[allow(clippy::too_many_arguments, unreachable_code)]
#[allow(clippy::too_many_arguments)]
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.create_flow(args).await;
let CreateFlowArgs {
flow_id,
sink_table_name,

View File

@@ -153,13 +153,7 @@ impl Flownode for FlowWorkerManager {
}
}
#[allow(unreachable_code, unused)]
async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
return self
.rule_engine
.handle_inserts(request)
.await
.map_err(to_meta_err(snafu::location!()));
// using try_read to ensure two things:
// 1. flush wouldn't happen until inserts before it is inserted
// 2. inserts happening concurrently with flush wouldn't be block by flush
@@ -212,15 +206,15 @@ impl Flownode for FlowWorkerManager {
.collect_vec();
let table_col_names = table_schema.relation_desc.names;
let table_col_names = table_col_names
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
let name_to_col = HashMap::<_, _>::from_iter(
insert_schema
.iter()

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Some utility functions
use std::sync::Arc;
use api::helper::ColumnDataTypeWrapper;

View File

@@ -16,7 +16,6 @@
use std::any::Any;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
use common_macro::stack_trace_debug;
@@ -54,13 +53,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Time error"))]
Time {
source: common_time::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("External error"))]
External {
source: BoxedError,
@@ -164,15 +156,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Arrow error: {raw:?} in context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
Datafusion {
#[snafu(source)]
@@ -247,7 +230,6 @@ impl ErrorExt for Error {
match self {
Self::Eval { .. }
| Self::JoinTask { .. }
| Self::Arrow { .. }
| Self::Datafusion { .. }
| Self::InsertIntoFlow { .. } => StatusCode::Internal,
Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
@@ -256,9 +238,7 @@ impl ErrorExt for Error {
| Self::FlowNotFound { .. }
| Self::ListFlows { .. } => StatusCode::TableNotFound,
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
StatusCode::EngineExecuteQuery
}
Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
Self::Unexpected { .. } => StatusCode::Unexpected,
Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
StatusCode::Unsupported

View File

@@ -238,7 +238,6 @@ mod test {
for (sql, current, expected) in &testcases {
let plan = sql_to_substrait(engine.clone(), sql).await;
let mut ctx = create_test_ctx();
let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
.await

View File

@@ -60,12 +60,12 @@ async fn query_flow_state(
#[derive(Clone)]
pub struct HeartbeatTask {
node_id: u64,
node_epoch: u64,
peer_addr: String,
meta_client: Arc<MetaClient>,
report_interval: Duration,
retry_interval: Duration,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
start_time_ms: u64,
running: Arc<AtomicBool>,
query_stat_size: Option<SizeReportSender>,
}
@@ -83,12 +83,12 @@ impl HeartbeatTask {
) -> Self {
Self {
node_id: opts.node_id.unwrap_or(0),
node_epoch: common_time::util::current_time_millis() as u64,
peer_addr: addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr)),
meta_client,
report_interval: heartbeat_opts.interval,
retry_interval: heartbeat_opts.retry_interval,
resp_handler_executor,
start_time_ms: common_time::util::current_time_millis() as u64,
running: Arc::new(AtomicBool::new(false)),
query_stat_size: None,
}
@@ -103,11 +103,6 @@ impl HeartbeatTask {
warn!("Heartbeat task started multiple times");
return Ok(());
}
self.create_streams().await
}
async fn create_streams(&self) -> Result<(), Error> {
info!("Start to establish the heartbeat connection to metasrv.");
let (req_sender, resp_stream) = self
.meta_client
@@ -130,6 +125,13 @@ impl HeartbeatTask {
pub fn shutdown(&self) {
info!("Close heartbeat task for flownode");
if self
.running
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
.is_err()
{
warn!("Call close heartbeat task multiple times");
}
}
fn new_heartbeat_request(
@@ -179,7 +181,7 @@ impl HeartbeatTask {
mut outgoing_rx: mpsc::Receiver<OutgoingMessage>,
) {
let report_interval = self.report_interval;
let node_epoch = self.node_epoch;
let start_time_ms = self.start_time_ms;
let self_peer = Some(Peer {
id: self.node_id,
addr: self.peer_addr.clone(),
@@ -196,8 +198,7 @@ impl HeartbeatTask {
let heartbeat_request = HeartbeatRequest {
peer: self_peer,
node_epoch,
info: Self::build_node_info(node_epoch),
info: Self::build_node_info(start_time_ms),
..Default::default()
};
@@ -229,8 +230,6 @@ impl HeartbeatTask {
// set the timeout to half of the report interval so that it wouldn't delay heartbeat if something went horribly wrong
latest_report = query_flow_state(&query_stat_size, report_interval / 2).await;
}
info!("flownode heartbeat task stopped.");
});
}
@@ -274,7 +273,7 @@ impl HeartbeatTask {
info!("Try to re-establish the heartbeat connection to metasrv.");
if self.create_streams().await.is_ok() {
if self.start().await.is_ok() {
break;
}
}

View File

@@ -33,7 +33,6 @@ mod expr;
pub mod heartbeat;
mod metrics;
mod plan;
mod recording_rules;
mod repr;
mod server;
mod transform;
@@ -44,5 +43,4 @@ mod test_utils;
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use error::{Error, Result};
pub use recording_rules::FrontendClient;
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};

View File

@@ -28,32 +28,6 @@ lazy_static! {
&["table_id"]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_query_time",
"flow rule engine query time",
&["flow_id"],
vec![
0.0,
1.,
3.,
5.,
10.,
20.,
30.,
60.,
2. * 60.,
5. * 60.,
10. * 60.
]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_slow_query",
"flow rule engine slow query",
&["flow_id", "sql", "peer"],
vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
)
.unwrap();
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(

View File

@@ -1,882 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Run flow as a recording rule, i.e. a time-window-aware normal query triggered on every tick interval set by the user
mod engine;
mod frontend_client;
use std::collections::BTreeSet;
use std::sync::Arc;
use api::helper::pb_value_to_value_ref;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::Expr;
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
use datafusion::prelude::SessionContext;
use datafusion::sql::unparser::Unparser;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
use datafusion_common::{DFSchema, TableReference};
use datafusion_expr::{ColumnarValue, LogicalPlan};
use datafusion_physical_expr::PhysicalExprRef;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::scalars::ScalarVector;
use datatypes::schema::TIME_INDEX_KEY;
use datatypes::value::Value;
use datatypes::vectors::{
TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
TimestampSecondVector, Vector,
};
pub use engine::RecordingRuleEngine;
pub use frontend_client::FrontendClient;
use itertools::Itertools;
use query::parser::QueryLanguageParser;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use crate::adapter::util::from_proto_to_data_type;
use crate::df_optimizer::apply_df_optimizer;
use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
use crate::expr::error::DataTypeSnafu;
use crate::Error;
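/// A compiled time-window expression: a physical expression that maps values
/// of the time index column (named `column_name`) to their window timestamps.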
#[derive(Debug, Clone)]
pub struct TimeWindowExpr {
expr: PhysicalExprRef,
column_name: String,
}
impl TimeWindowExpr {
pub fn new(expr: PhysicalExprRef, column_name: String) -> Self {
Self { expr, column_name }
}
pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
Ok(Self {
expr: phy_expr,
column_name: column_name.to_string(),
})
}
/// Find timestamps from rows using time window expr
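/// (evaluates `self.expr` over the time index column of each batch and
/// collects the distinct window timestamps it yields into a sorted set).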
pub async fn handle_rows(
&self,
rows_list: Vec<api::v1::Rows>,
) -> Result<BTreeSet<Timestamp>, Error> {
let mut time_windows = BTreeSet::new();
for rows in rows_list {
// pick the time index column and use it to eval on `self.expr`
let ts_col_index = rows
.schema
.iter()
.map(|col| col.column_name.clone())
.position(|name| name == self.column_name);
let Some(ts_col_index) = ts_col_index else {
warn!("can't found time index column in schema: {:?}", rows.schema);
continue;
};
let col_schema = &rows.schema[ts_col_index];
let cdt = from_proto_to_data_type(col_schema)?;
let column_values = rows
.rows
.iter()
.map(|row| &row.values[ts_col_index])
.collect_vec();
let mut vector = cdt.create_mutable_vector(column_values.len());
for value in column_values {
let value = pb_value_to_value_ref(value, &None);
vector.try_push_value_ref(value).context(DataTypeSnafu {
msg: "Failed to convert rows to columns",
})?;
}
let vector = vector.to_vector();
let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
let rb =
DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
.with_context(|_e| ArrowSnafu {
context: format!(
"Failed to create record batch from {df_schema:?} and {vector:?}"
),
})?;
let eval_res = self.expr.evaluate(&rb).with_context(|_| DatafusionSnafu {
context: format!(
"Failed to evaluate physical expression {:?} on {rb:?}",
self.expr
),
})?;
let res = columnar_to_ts_vector(&eval_res)?;
for ts in res.into_iter().flatten() {
time_windows.insert(ts);
}
}
Ok(time_windows)
}
}
fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
name,
cdt.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
Ok(df_schema)
}
/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
let val = match columnar {
datafusion_expr::ColumnarValue::Array(array) => {
let ty = array.data_type();
let ty = ConcreteDataType::from_arrow_type(ty);
let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
ty.unit()
} else {
return UnexpectedSnafu {
reason: format!("Non-timestamp type: {ty:?}"),
}
.fail();
};
match time_unit {
TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec(),
TimeUnit::Millisecond => {
TimestampMillisecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
}
}
datafusion_expr::ColumnarValue::Scalar(scalar) => {
let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert scalar {scalar:?} to value"),
})?;
let ts = value.as_timestamp().context(UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", value),
})?;
vec![Some(ts)]
}
};
Ok(val)
}
/// Convert sql to datafusion logical plan
pub async fn sql_to_df_plan(
query_ctx: QueryContextRef,
engine: QueryEngineRef,
sql: &str,
optimize: bool,
) -> Result<LogicalPlan, Error> {
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = engine
.planner()
.plan(&stmt, query_ctx)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = if optimize {
apply_df_optimizer(plan).await?
} else {
plan
};
Ok(plan)
}
/// Returns (the name of the time index column, the optional time window expr, the expected time unit of the time index column, and the schema for evaluating the time window expr)
async fn find_time_window_expr(
plan: &LogicalPlan,
catalog_man: CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
// TODO(discord9): find the expr that does the time window
let mut table_name = None;
// first find the table source in the logical plan
plan.apply(|plan| {
let LogicalPlan::TableScan(table_scan) = plan else {
return Ok(TreeNodeRecursion::Continue);
};
table_name = Some(table_scan.table_name.clone());
Ok(TreeNodeRecursion::Stop)
})
.with_context(|_| DatafusionSnafu {
context: format!("Can't find table source in plan {plan:?}"),
})?;
let Some(table_name) = table_name else {
UnexpectedSnafu {
reason: format!("Can't find table source in plan {plan:?}"),
}
.fail()?
};
let current_schema = query_ctx.current_schema();
let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
let schema_name = table_name.schema().unwrap_or(&current_schema);
let table_name = table_name.table();
let Some(table_ref) = catalog_man
.table(catalog_name, schema_name, table_name, Some(&query_ctx))
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
else {
UnexpectedSnafu {
reason: format!(
"Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
),
}
.fail()?
};
let schema = &table_ref.table_info().meta.schema;
let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
reason: format!("Can't find timestamp column in table {table_name:?}"),
})?;
let ts_col_name = ts_index.name.clone();
let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
reason: format!(
"Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
),
})?.unit();
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
ts_col_name.clone(),
ts_index.data_type.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare(table_name))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
// find the time window expr which refers to the time index column
let mut aggr_expr = None;
let mut time_window_expr: Option<Expr> = None;
let find_inner_aggr_expr = |plan: &LogicalPlan| {
if let LogicalPlan::Aggregate(aggregate) = plan {
aggr_expr = Some(aggregate.clone());
};
Ok(TreeNodeRecursion::Continue)
};
plan.apply(find_inner_aggr_expr)
.with_context(|_| DatafusionSnafu {
context: format!("Can't find aggr expr in plan {plan:?}"),
})?;
if let Some(aggregate) = aggr_expr {
for group_expr in &aggregate.group_expr {
let refs = group_expr.column_refs();
if refs.len() != 1 {
continue;
}
let ref_col = refs.iter().next().unwrap();
let index = aggregate.input.schema().maybe_index_of_column(ref_col);
let Some(index) = index else {
continue;
};
let field = aggregate.input.schema().field(index);
let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
if is_time_index {
let rewrite_column = group_expr.clone();
let rewritten = rewrite_column
.rewrite(&mut RewriteColumn {
table_name: table_name.to_string(),
})
.with_context(|_| DatafusionSnafu {
context: format!("Rewrite expr failed, expr={:?}", group_expr),
})?
.data;
struct RewriteColumn {
table_name: String,
}
impl TreeNodeRewriter for RewriteColumn {
type Node = Expr;
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
let Expr::Column(mut column) = node else {
return Ok(Transformed::no(node));
};
column.relation = Some(TableReference::bare(self.table_name.clone()));
Ok(Transformed::yes(Expr::Column(column)))
}
}
time_window_expr = Some(rewritten);
break;
}
}
Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
} else {
// can't find a time window expr, return None
Ok((ts_col_name, None, expected_time_unit, df_schema))
}
}
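// For example, for `SELECT date_bin('5 minutes', ts) AS tw FROM t GROUP BY tw`
// on a table whose time index `ts` has millisecond precision, this returns
// roughly `("ts", Some(date_bin('5 minutes', t.ts)), TimeUnit::Millisecond, <schema of ts>)`.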
/// Find the nearest lower bound for time `current` in the given `plan` for the time window expr.
/// e.g. for a time window expr of `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
/// return `Some("2021-07-01 00:00:00.000")`.
/// If `plan` doesn't contain a `TIME INDEX` column, return `None`.
///
/// A time window expr is an expr that:
/// 1. refers only to a time index column
/// 2. is monotonically increasing
/// 3. shows up in the GROUP BY clause
///
/// Note this plan should only contain one TableScan.
pub async fn find_plan_time_window_bound(
plan: &LogicalPlan,
current: Timestamp,
query_ctx: QueryContextRef,
engine: QueryEngineRef,
) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
// TODO(discord9): find the expr that does the time window
let catalog_man = engine.engine_state().catalog_manager();
let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
// cast current to ts_index's type
let new_current = current
.convert_to(expected_time_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
})?;
// if no time_window_expr is found, return None
if let Some(time_window_expr) = time_window_expr {
let lower_bound =
find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
let upper_bound =
find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
Ok((ts_col_name, lower_bound, upper_bound))
} else {
Ok((ts_col_name, None, None))
}
}
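// Worked example (see `test_plan_time_window_lower_bound` below): with the
// time window expr `date_bin('5 minutes', ts)` and
// `current = "1970-01-01 00:00:00.023"`, the returned bounds are
// `lower = "1970-01-01 00:00:00.000"` and `upper = "1970-01-01 00:05:00.000"`.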
/// Find the lower bound of the time window for the given `expr` and `current` timestamp.
///
/// e.g. for `current="2021-07-01 00:01:01.000"`, `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
/// of the current time window given the current timestamp.
///
/// If `None` is returned, this time window has no lower bound.
fn find_expr_time_window_lower_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
let input_time_unit = cur_time_window.unit();
Ok(cur_time_window.convert_to(input_time_unit))
}
/// Find the upper bound of the time window for the given `expr` and `current` timestamp
fn find_expr_time_window_upper_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
use std::cmp::Ordering;
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
// search forward to find the upper bound
let mut offset: i64 = 1;
let mut lower_bound = Some(current);
let upper_bound;
// first do an exponential probe to find a range for binary search
loop {
let Some(next_val) = current.value().checked_add(offset) else {
// no upper bound if overflow
return Ok(None);
};
let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
match next_time_window.cmp(&cur_time_window) {
Ordering::Less => {UnexpectedSnafu {
reason: format!(
"Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
),
}
.fail()?
}
Ordering::Equal => {
lower_bound = Some(next_time_probe);
}
Ordering::Greater => {
upper_bound = Some(next_time_probe);
break
}
}
let Some(new_offset) = offset.checked_mul(2) else {
// no upper bound if overflow
return Ok(None);
};
offset = new_offset;
}
// binary search for the exact upper bound
ensure!(lower_bound.map(|v|v.unit())==upper_bound.map(|v|v.unit()), UnexpectedSnafu{
reason: format!(" unit mismatch for time window expression {expr:?}, found {lower_bound:?} and {upper_bound:?}"),
});
let output_unit = upper_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.unit();
let mut low = lower_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.value();
let mut high = upper_bound
.context(UnexpectedSnafu {
reason: "should have upper bound",
})?
.value();
while low < high {
let mid = (low + high) / 2;
let mid_probe = common_time::Timestamp::new(mid, output_unit);
let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
match mid_time_window.cmp(&cur_time_window) {
Ordering::Less => UnexpectedSnafu {
reason: format!("Binary search failed for time window expression {expr:?}"),
}
.fail()?,
Ordering::Equal => low = mid + 1,
Ordering::Greater => high = mid,
}
}
let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
Ok(Some(final_upper_bound_for_time_window))
}
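// A minimal, self-contained sketch of the probe-and-bisect idea above, using a
// plain `i64` bucketing function in place of the physical expression; the name
// and signature here are illustrative, not part of this crate's API. It finds
// the smallest value whose bucket differs from `current`'s, i.e. the exclusive
// upper bound of the current time window.
#[allow(dead_code)]
fn upper_bound_by_probe(bucket: impl Fn(i64) -> i64, current: i64) -> Option<i64> {
    let cur_bucket = bucket(current);
    // Exponential probe: double the offset until the probe leaves the bucket.
    let mut offset: i64 = 1;
    let mut high = loop {
        let probe = current.checked_add(offset)?;
        if bucket(probe) > cur_bucket {
            break probe;
        }
        offset = offset.checked_mul(2)?;
    };
    let mut low = current;
    // Binary search for the first value outside the current bucket.
    while low < high {
        let mid = low + (high - low) / 2;
        if bucket(mid) > cur_bucket {
            high = mid;
        } else {
            low = mid + 1;
        }
    }
    Some(high)
}
// e.g. `upper_bound_by_probe(|t| t / 300, 61)` returns `Some(300)` for
// 5-minute windows over seconds.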
fn eval_ts_to_ts(
phy: &PhysicalExprRef,
df_schema: &DFSchema,
input_value: Timestamp,
) -> Result<Timestamp, Error> {
let ts_vector = match input_value.unit() {
TimeUnit::Second => {
TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Millisecond => {
TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
};
let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
.with_context(|_| ArrowSnafu {
context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
})?;
let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
})?;
if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
Ok(*ts)
} else {
UnexpectedSnafu {
reason: format!(
"Expected timestamp in expression {phy:?} but got {:?}",
eval_res
),
}
.fail()?
}
}
// TODO(discord9): a method to find out the precise time window
/// Find out the `Filter` node corresponding to the outermost `WHERE` and add a new filter expr to it
#[derive(Debug)]
pub struct AddFilterRewriter {
extra_filter: Expr,
is_rewritten: bool,
}
impl AddFilterRewriter {
fn new(filter: Expr) -> Self {
Self {
extra_filter: filter,
is_rewritten: false,
}
}
}
impl TreeNodeRewriter for AddFilterRewriter {
type Node = LogicalPlan;
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
match node {
LogicalPlan::Filter(mut filter) if !filter.having => {
filter.predicate = filter.predicate.and(self.extra_filter.clone());
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
LogicalPlan::TableScan(_) => {
// add a new filter
let filter =
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
_ => Ok(Transformed::no(node)),
}
}
}
fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
let unparser = Unparser::default();
let sql = unparser
.plan_to_sql(plan)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to unparse logical plan {plan:?}"),
})?;
Ok(sql.to_string())
}
#[cfg(test)]
mod test {
use datafusion_common::tree_node::TreeNode;
use pretty_assertions::assert_eq;
use session::context::QueryContext;
use super::{sql_to_df_plan, *};
use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
use crate::test_utils::create_test_query_engine;
#[tokio::test]
async fn test_add_filter() {
let testcases = vec![
(
"SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
),
(
"SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
"SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
),
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
)
];
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
for (before, after) in testcases {
let sql = before;
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
let plan = plan.rewrite(&mut add_filter).unwrap().data;
let new_sql = df_plan_to_sql(&plan).unwrap();
assert_eq!(after, new_sql);
}
}
#[tokio::test]
async fn test_plan_time_window_lower_bound() {
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let testcases = [
// the same alias is not the same column
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
),
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"
),
// complex time window index
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394080, TimeUnit::Second)),
Some(Timestamp::new(1740394140, TimeUnit::Second)),
),
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
),
// no time index
(
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
Timestamp::new(23, TimeUnit::Millisecond),
("ts".to_string(), None, None),
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
),
// time index
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// on spot
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(0, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// different time unit
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23_000_000, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other fields
(
"SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other pks
(
"SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
),
// subquery
(
"SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
),
// cte
(
"with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
),
// complex subquery without alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
),
// complex subquery alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
),
];
for (sql, current, expected, expected_unparsed) in testcases {
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
.await
.unwrap();
let real =
find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
.await
.unwrap();
assert_eq!(expected, real);
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let (col_name, lower, upper) = real;
let new_sql = if lower.is_some() {
let to_df_literal = |value| {
let value = Value::from(value);
value.try_to_scalar_value(&value.data_type()).unwrap()
};
let lower = to_df_literal(lower.unwrap());
let upper = to_df_literal(upper.unwrap());
let expr = col(&col_name)
.gt_eq(lit(lower))
.and(col(&col_name).lt_eq(lit(upper)));
let mut add_filter = AddFilterRewriter::new(expr);
let plan = plan.rewrite(&mut add_filter).unwrap().data;
df_plan_to_sql(&plan).unwrap()
} else {
sql.to_string()
};
assert_eq!(expected_unparsed, new_sql);
}
}
}

View File

@@ -1,773 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use api::v1::flow::FlowResponse;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::key::flow::FlowMetadataManagerRef;
use common_meta::key::table_info::TableInfoManager;
use common_meta::key::TableMetadataManagerRef;
use common_telemetry::tracing::warn;
use common_telemetry::{debug, info};
use common_time::Timestamp;
use datafusion::sql::unparser::expr_to_sql;
use datafusion_common::tree_node::TreeNode;
use datatypes::value::Value;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::metadata::TableId;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::sync::{oneshot, RwLock};
use tokio::time::Instant;
use super::frontend_client::FrontendClient;
use super::{df_plan_to_sql, AddFilterRewriter, TimeWindowExpr};
use crate::adapter::{CreateFlowArgs, FlowId, TableName};
use crate::error::{
DatafusionSnafu, DatatypesSnafu, ExternalSnafu, FlowAlreadyExistSnafu, InternalSnafu,
TimeSnafu, UnexpectedSnafu,
};
use crate::metrics::{METRIC_FLOW_RULE_ENGINE_QUERY_TIME, METRIC_FLOW_RULE_ENGINE_SLOW_QUERY};
use crate::recording_rules::{find_plan_time_window_bound, find_time_window_expr, sql_to_df_plan};
use crate::Error;
/// TODO(discord9): make those constants configurable
/// The default rule engine query timeout is 10 minutes
pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
/// Will output a warn log for any query that runs for more than 1 minute, and also every 1 minute while that query is still running
pub const SLOW_QUERY_THRESHOLD: Duration = Duration::from_secs(60);
/// TODO(discord9): determine how to configure refresh rate
pub struct RecordingRuleEngine {
tasks: RwLock<BTreeMap<FlowId, RecordingRuleTask>>,
shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
frontend_client: Arc<FrontendClient>,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
engine: QueryEngineRef,
}
impl RecordingRuleEngine {
pub fn new(
frontend_client: Arc<FrontendClient>,
engine: QueryEngineRef,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
) -> Self {
Self {
tasks: Default::default(),
shutdown_txs: Default::default(),
frontend_client,
flow_metadata_manager,
table_meta,
engine,
}
}
pub async fn handle_inserts(
&self,
request: api::v1::region::InsertRequests,
) -> Result<FlowResponse, Error> {
let table_info_mgr = self.table_meta.table_info_manager();
let mut group_by_table_name: HashMap<TableName, Vec<api::v1::Rows>> = HashMap::new();
for r in request.requests {
let tid = RegionId::from(r.region_id).table_id();
let name = get_table_name(table_info_mgr, &tid).await?;
let entry = group_by_table_name.entry(name).or_default();
if let Some(rows) = r.rows {
entry.push(rows);
}
}
for (_flow_id, task) in self.tasks.read().await.iter() {
let src_table_names = &task.source_table_names;
for src_table_name in src_table_names {
if let Some(entry) = group_by_table_name.get(src_table_name) {
let Some(expr) = &task.time_window_expr else {
continue;
};
let involved_time_windows = expr.handle_rows(entry.clone()).await?;
let mut state = task.state.write().await;
state
.dirty_time_windows
.add_lower_bounds(involved_time_windows.into_iter());
}
}
}
Ok(Default::default())
}
}
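// In short: incoming rows are grouped by source table name, and every task
// whose source tables intersect the batch runs its `TimeWindowExpr` over the
// rows and records the touched time windows as dirty, to be re-queried later.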
async fn get_table_name(zelf: &TableInfoManager, table_id: &TableId) -> Result<TableName, Error> {
zelf.get(*table_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", table_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])
}
const MIN_REFRESH_DURATION: Duration = Duration::new(5, 0);
impl RecordingRuleEngine {
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
sink_table_name,
source_table_ids,
create_if_not_exists,
or_replace,
expire_after,
comment: _,
sql,
flow_options,
query_ctx,
} = args;
// or replace logic
{
let is_exist = self.tasks.read().await.contains_key(&flow_id);
match (create_if_not_exists, or_replace, is_exist) {
// if replace, ignore that old flow exists
(_, true, true) => {
info!("Replacing flow with id={}", flow_id);
}
(false, false, true) => FlowAlreadyExistSnafu { id: flow_id }.fail()?,
// already exists, and not replace, return None
(true, false, true) => {
info!("Flow with id={} already exists, do nothing", flow_id);
return Ok(None);
}
// continue as normal
(_, _, false) => (),
}
}
let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
ensure!(
flow_type == Some(&FlowType::RecordingRule.to_string()) || flow_type.is_none(),
UnexpectedSnafu {
reason: format!("Flow type is not RecordingRule nor None, got {flow_type:?}")
}
);
let Some(query_ctx) = query_ctx else {
UnexpectedSnafu {
reason: "Query context is None".to_string(),
}
.fail()?
};
let query_ctx = Arc::new(query_ctx);
let mut source_table_names = Vec::new();
for src_id in source_table_ids {
let table_name = self
.table_meta
.table_info_manager()
.get(src_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", src_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])?;
source_table_names.push(table_name);
}
let (tx, rx) = oneshot::channel();
let plan = sql_to_df_plan(query_ctx.clone(), self.engine.clone(), &sql, true).await?;
let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
&plan,
self.engine.engine_state().catalog_manager().clone(),
query_ctx.clone(),
)
.await?;
let phy_expr = time_window_expr
.map(|expr| TimeWindowExpr::from_expr(&expr, &column_name, &df_schema))
.transpose()?;
let task = RecordingRuleTask::new(
flow_id,
&sql,
phy_expr,
expire_after,
sink_table_name,
source_table_names,
query_ctx,
rx,
);
let task_inner = task.clone();
let engine = self.engine.clone();
let frontend = self.frontend_client.clone();
// TODO(discord9): also save the handle & use a time wheel or similar for better scheduling
let _handle = common_runtime::spawn_global(async move {
match task_inner.start_executing(engine, frontend).await {
Ok(()) => info!("Flow {} shutdown", task_inner.flow_id),
Err(err) => common_telemetry::error!(
"Flow {} encounter unrecoverable error: {err:?}",
task_inner.flow_id
),
}
});
// TODO(discord9): deal with replace logic
let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
drop(replaced_old_task_opt);
self.shutdown_txs.write().await.insert(flow_id, tx);
Ok(Some(flow_id))
}
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
if self.tasks.write().await.remove(&flow_id).is_none() {
warn!("Flow {flow_id} not found in tasks")
}
let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
UnexpectedSnafu {
reason: format!("Can't found shutdown tx for flow {flow_id}"),
}
.fail()?
};
if tx.send(()).is_err() {
warn!("Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?")
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct RecordingRuleTask {
flow_id: FlowId,
query: String,
time_window_expr: Option<TimeWindowExpr>,
/// in seconds
expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: HashSet<[String; 3]>,
state: Arc<RwLock<RecordingRuleState>>,
}
impl RecordingRuleTask {
#[allow(clippy::too_many_arguments)]
pub fn new(
flow_id: FlowId,
query: &str,
time_window_expr: Option<TimeWindowExpr>,
expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: Vec<[String; 3]>,
query_ctx: QueryContextRef,
shutdown_rx: oneshot::Receiver<()>,
) -> Self {
Self {
flow_id,
query: query.to_string(),
time_window_expr,
expire_after,
sink_table_name,
source_table_names: source_table_names.into_iter().collect(),
state: Arc::new(RwLock::new(RecordingRuleState::new(query_ctx, shutdown_rx))),
}
}
}
impl RecordingRuleTask {
/// This should be called in a new tokio task
pub async fn start_executing(
&self,
engine: QueryEngineRef,
frontend_client: Arc<FrontendClient>,
) -> Result<(), Error> {
// only the first query doesn't need an upper bound
let mut is_first = true;
loop {
// FIXME(discord9): test whether also needing an upper bound works
let new_query = self.gen_query_with_time_window(engine.clone()).await?;
let insert_into = if let Some(new_query) = new_query {
format!(
"INSERT INTO {}.{}.{} {}",
self.sink_table_name[0],
self.sink_table_name[1],
self.sink_table_name[2],
new_query
)
} else {
tokio::time::sleep(MIN_REFRESH_DURATION).await;
continue;
};
if is_first {
is_first = false;
}
let instant = Instant::now();
let flow_id = self.flow_id;
let db_client = frontend_client.get_database_client().await?;
let peer_addr = db_client.peer.addr;
debug!(
"Executing flow {flow_id}(expire_after={:?} secs) on {:?} with query {}",
self.expire_after, peer_addr, &insert_into
);
let timer = METRIC_FLOW_RULE_ENGINE_QUERY_TIME
.with_label_values(&[flow_id.to_string().as_str()])
.start_timer();
let res = db_client.database.sql(&insert_into).await;
drop(timer);
let elapsed = instant.elapsed();
if let Ok(res1) = &res {
debug!(
"Flow {flow_id} executed, result: {res1:?}, elapsed: {:?}",
elapsed
);
} else if let Err(res) = &res {
warn!(
"Failed to execute Flow {flow_id} on frontend {}, result: {res:?}, elapsed: {:?} with query: {}",
peer_addr, elapsed, &insert_into
);
}
// record slow query
if elapsed >= SLOW_QUERY_THRESHOLD {
warn!(
"Flow {flow_id} on frontend {} executed for {:?} before complete, query: {}",
peer_addr, elapsed, &insert_into
);
METRIC_FLOW_RULE_ENGINE_SLOW_QUERY
.with_label_values(&[flow_id.to_string().as_str(), &insert_into, &peer_addr])
.observe(elapsed.as_secs_f64());
}
self.state
.write()
.await
.after_query_exec(elapsed, res.is_ok());
let sleep_until = {
let mut state = self.state.write().await;
match state.shutdown_rx.try_recv() {
Ok(()) => break Ok(()),
Err(TryRecvError::Closed) => {
warn!("Unexpected shutdown flow {flow_id}, shutdown anyway");
break Ok(());
}
Err(TryRecvError::Empty) => (),
}
state.get_next_start_query_time(None)
};
tokio::time::sleep_until(sleep_until).await;
}
}
/// Will merge and use at most the first `MAX_FILTER_NUM` time windows in the query
async fn gen_query_with_time_window(
&self,
engine: QueryEngineRef,
) -> Result<Option<String>, Error> {
let query_ctx = self.state.read().await.query_ctx.clone();
let start = SystemTime::now();
let since_the_epoch = start
.duration_since(UNIX_EPOCH)
.expect("Time went backwards");
let low_bound = self
.expire_after
.map(|e| since_the_epoch.as_secs() - e as u64)
.unwrap_or(u64::MIN);
let low_bound = Timestamp::new_second(low_bound as i64);
let plan = sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, true).await?;
// TODO(discord9): find more time windows!
let (col_name, lower, upper) =
find_plan_time_window_bound(&plan, low_bound, query_ctx.clone(), engine.clone())
.await?;
let new_sql = {
let expr = {
match (lower, upper) {
(Some(l), Some(u)) => {
let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
reason: format!("Can't get window size from {u:?} - {l:?}"),
})?;
self.state
.write()
.await
.dirty_time_windows
.gen_filter_exprs(&col_name, lower, window_size)?
}
_ => {
warn!(
"Flow id = {:?}, can't get window size: lower={lower:?}, upper={upper:?}, using the same query", self.flow_id
);
// since no time window lower/upper bound is found, just return the original query
return Ok(Some(self.query.clone()));
}
}
};
debug!(
"Flow id={:?}, Generated filter expr: {:?}",
self.flow_id,
expr.as_ref()
.map(|expr| expr_to_sql(expr).with_context(|_| DatafusionSnafu {
context: format!("Failed to generate filter expr from {expr:?}"),
}))
.transpose()?
.map(|s| s.to_string())
);
let Some(expr) = expr else {
// no new data, hence no need to update
debug!("Flow id={:?}, no new data, not update", self.flow_id);
return Ok(None);
};
let mut add_filter = AddFilterRewriter::new(expr);
// make an unoptimized plan for clearer unparsing
let plan =
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, false).await?;
let plan = plan
.clone()
.rewrite(&mut add_filter)
.with_context(|_| DatafusionSnafu {
context: format!("Failed to rewrite plan {plan:?}"),
})?
.data;
df_plan_to_sql(&plan)?
};
Ok(Some(new_sql))
}
}
#[derive(Debug)]
pub struct RecordingRuleState {
query_ctx: QueryContextRef,
/// last query complete time
last_update_time: Instant,
/// duration of the last query
last_query_duration: Duration,
/// Dirty time windows that need to be updated;
/// a non-overlapping mapping of `start -> end`
dirty_time_windows: DirtyTimeWindows,
exec_state: ExecState,
shutdown_rx: oneshot::Receiver<()>,
}
#[derive(Debug, Clone, Default)]
pub struct DirtyTimeWindows {
windows: BTreeMap<Timestamp, Option<Timestamp>>,
}
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
let value = Value::from(value);
let value = value
.try_to_scalar_value(&value.data_type())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert to scalar value: {}", value),
})?;
Ok(value)
}
impl DirtyTimeWindows {
/// Time window merge distance
const MERGE_DIST: i32 = 3;
/// Maximum number of filters allowed in a single query
const MAX_FILTER_NUM: usize = 20;
/// Add lower bounds to the dirty time windows. Upper bounds are ignored.
///
/// # Arguments
///
/// * `lower_bounds` - An iterator of lower bounds to be added.
pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
for lower_bound in lower_bounds {
let entry = self.windows.entry(lower_bound);
entry.or_insert(None);
}
}
/// Generate all filter expressions consuming all time windows
pub fn gen_filter_exprs(
&mut self,
col_name: &str,
expire_lower_bound: Option<Timestamp>,
window_size: chrono::Duration,
) -> Result<Option<datafusion_expr::Expr>, Error> {
debug!(
"expire_lower_bound: {:?}, window_size: {:?}",
expire_lower_bound.map(|t| t.to_iso8601_string()),
window_size
);
self.merge_dirty_time_windows(window_size)?;
if self.windows.len() > Self::MAX_FILTER_NUM {
warn!(
"Too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong",
self.windows.len(),
Self::MAX_FILTER_NUM
);
}
// get the first `MAX_FILTER_NUM` time windows
let nth = self
.windows
.iter()
.nth(Self::MAX_FILTER_NUM)
.map(|(key, _)| *key);
let first_nth = {
if let Some(nth) = nth {
let mut after = self.windows.split_off(&nth);
std::mem::swap(&mut self.windows, &mut after);
after
} else {
std::mem::take(&mut self.windows)
}
};
let mut expr_lst = vec![];
for (start, end) in first_nth.into_iter() {
debug!(
"Time window start: {:?}, end: {:?}",
start.to_iso8601_string(),
end.map(|t| t.to_iso8601_string())
);
let start = if let Some(expire) = expire_lower_bound {
start.max(expire)
} else {
start
};
// filter out expired time window
if let Some(end) = end {
if end <= start {
continue;
}
}
use datafusion_expr::{col, lit};
let lower = to_df_literal(start)?;
let upper = end.map(to_df_literal).transpose()?;
let expr = if let Some(upper) = upper {
col(col_name)
.gt_eq(lit(lower))
.and(col(col_name).lt(lit(upper)))
} else {
col(col_name).gt_eq(lit(lower))
};
expr_lst.push(expr);
}
let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
Ok(expr)
}
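// For example, for dirty windows `{0s -> 300s, 900s -> 1200s}` on column `ts`,
// the generated filter is roughly:
//   (ts >= '1970-01-01 00:00:00' AND ts < '1970-01-01 00:05:00')
//     OR (ts >= '1970-01-01 00:15:00' AND ts < '1970-01-01 00:20:00')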
/// Merge time windows that overlaps or get too close
pub fn merge_dirty_time_windows(&mut self, window_size: chrono::Duration) -> Result<(), Error> {
let mut new_windows = BTreeMap::new();
let mut prev_tw = None;
for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
let Some(prev_tw) = &mut prev_tw else {
prev_tw = Some((lower_bound, upper_bound));
continue;
};
let std_window_size = window_size.to_std().map_err(|e| {
InternalSnafu {
reason: e.to_string(),
}
.build()
})?;
// if cur.lower - prev.upper <= window_size * 2, merge
let prev_upper = prev_tw
.1
.unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
prev_tw.1 = Some(prev_upper);
let cur_upper = upper_bound.unwrap_or(
lower_bound
.add_duration(std_window_size)
.context(TimeSnafu)?,
);
if lower_bound
.sub(&prev_upper)
.map(|dist| dist <= window_size * Self::MERGE_DIST)
.unwrap_or(false)
{
prev_tw.1 = Some(cur_upper);
} else {
new_windows.insert(prev_tw.0, prev_tw.1);
*prev_tw = (lower_bound, Some(cur_upper));
}
}
if let Some(prev_tw) = prev_tw {
new_windows.insert(prev_tw.0, prev_tw.1);
}
self.windows = new_windows;
Ok(())
}
}
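// A minimal sketch of the merge rule above on plain integers: each window is
// `[start, start + window)`, and a window merges into its predecessor when the
// gap from the previous end is at most `window * merge_dist`. Purely
// illustrative; the real implementation operates on `Timestamp`s.
#[allow(dead_code)]
fn merge_windows(sorted_starts: &[i64], window: i64, merge_dist: i64) -> Vec<(i64, i64)> {
    let mut merged: Vec<(i64, i64)> = Vec::new();
    for &start in sorted_starts {
        let end = start + window;
        match merged.last_mut() {
            // close enough to the previous window: extend it
            Some((_, prev_end)) if start - *prev_end <= window * merge_dist => *prev_end = end,
            _ => merged.push((start, end)),
        }
    }
    merged
}
// e.g. with 5-minute (300 s) windows and `merge_dist = 3`,
// `merge_windows(&[0, 1200], 300, 3)` yields `[(0, 1500)]`, while
// `merge_windows(&[0, 1500], 300, 3)` keeps the windows separate, matching the
// unit test below.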
impl RecordingRuleState {
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
Self {
query_ctx,
last_update_time: Instant::now(),
last_query_duration: Duration::from_secs(0),
dirty_time_windows: Default::default(),
exec_state: ExecState::Idle,
shutdown_rx,
}
}
/// Called after the last query is done.
/// `is_succ` indicates whether the last query was successful.
pub fn after_query_exec(&mut self, elapsed: Duration, _is_succ: bool) {
self.exec_state = ExecState::Idle;
self.last_query_duration = elapsed;
self.last_update_time = Instant::now();
}
/// Wait for at most `last_query_duration` (further capped by `max_timeout` if given), but at least `MIN_REFRESH_DURATION`, before starting the next query
pub fn get_next_start_query_time(&self, max_timeout: Option<Duration>) -> Instant {
let next_duration = max_timeout
.unwrap_or(self.last_query_duration)
.min(self.last_query_duration);
let next_duration = next_duration.max(MIN_REFRESH_DURATION);
self.last_update_time + next_duration
}
}
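// Worked example: with `last_query_duration = 2s` and no `max_timeout`, the
// next query starts `max(2s, MIN_REFRESH_DURATION) = 5s` after the last
// completion; with `last_query_duration = 90s` and `max_timeout = 30s`, it
// starts after `min(30s, 90s) = 30s`.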
#[derive(Debug, Clone)]
enum ExecState {
Idle,
Executing,
}
#[cfg(test)]
mod test {
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_merge_dirty_time_windows() {
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60))
.unwrap();
// just enough to merge
assert_eq!(
dirty.windows,
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)])
);
// separate time window
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60))
.unwrap();
// far enough apart to stay separate
assert_eq!(
BTreeMap::from([
(
Timestamp::new_second(0),
Some(Timestamp::new_second(5 * 60))
),
(
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
Some(Timestamp::new_second(
(3 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)
]),
dirty.windows
);
// overlapping
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60))
.unwrap();
// overlapping windows are merged into one
assert_eq!(
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
),]),
dirty.windows
);
}
}

View File

@@ -1,150 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Frontend client to run a flow as a recording rule, i.e. a time-window-aware normal query triggered on every tick set by the user
use std::sync::Arc;
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest;
use meta_client::client::MetaClient;
use snafu::ResultExt;
use crate::error::{ExternalSnafu, UnexpectedSnafu};
use crate::recording_rules::engine::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
use crate::Error;
fn default_channel_mgr() -> ChannelManager {
let cfg = ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT);
ChannelManager::with_config(cfg)
}
fn client_from_urls(addrs: Vec<String>) -> Client {
Client::with_manager_and_urls(default_channel_mgr(), addrs)
}
/// A simple frontend client able to execute sql using grpc protocol
#[derive(Debug)]
pub enum FrontendClient {
Distributed {
meta_client: Arc<MetaClient>,
},
Standalone {
/// For the sake of simplicity, still use gRPC even in standalone mode.
/// Note the client here should be lazy, so it can wait until the frontend is booted before making a connection.
/// TODO(discord9): don't use gRPC under standalone mode
database_client: DatabaseWithPeer,
},
}
#[derive(Debug, Clone)]
pub struct DatabaseWithPeer {
pub database: Database,
pub peer: Peer,
}
impl DatabaseWithPeer {
fn new(database: Database, peer: Peer) -> Self {
Self { database, peer }
}
}
impl FrontendClient {
pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
Self::Distributed { meta_client }
}
pub fn from_static_grpc_addr(addr: String) -> Self {
let peer = Peer {
id: 0,
addr: addr.clone(),
};
let client = client_from_urls(vec![addr]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Self::Standalone {
database_client: DatabaseWithPeer::new(database, peer),
}
}
}
impl FrontendClient {
async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
let Self::Distributed { meta_client, .. } = self else {
return Ok(vec![]);
};
let cluster_client = meta_client
.cluster_client()
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let cluster_id = meta_client.id().0;
let prefix = NodeInfoKey::key_prefix_with_role(cluster_id, Role::Frontend);
let req = RangeRequest::new().with_prefix(prefix);
let resp = cluster_client
.range(req)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let mut res = Vec::with_capacity(resp.kvs.len());
for kv in resp.kvs {
let key = NodeInfoKey::try_from(kv.key)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let val = NodeInfo::try_from(kv.value)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
res.push((key, val));
}
Ok(res)
}
/// Get the database with max `last_activity_ts`
async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
if let Self::Standalone { database_client } = self {
return Ok(database_client.clone());
}
let frontends = self.scan_for_frontend().await?;
let mut last_activity_ts = i64::MIN;
let mut peer = None;
for (_key, val) in frontends.iter() {
if val.last_activity_ts > last_activity_ts {
last_activity_ts = val.last_activity_ts;
peer = Some(val.peer.clone());
}
}
let Some(peer) = peer else {
UnexpectedSnafu {
reason: format!("No frontend available: {:?}", frontends),
}
.fail()?
};
let client = client_from_urls(vec![peer.addr.clone()]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Ok(DatabaseWithPeer::new(database, peer))
}
/// Get a database client, and possibly update it before returning.
pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
match self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed { meta_client: _ } => self.get_last_active_frontend().await,
}
}
}
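// Illustrative usage (as in `RecordingRuleTask::start_executing`): obtain a
// client and run SQL against the selected frontend.
//
// let db = frontend_client.get_database_client().await?;
// let _resp = db.database.sql("INSERT INTO sink_table SELECT ...").await;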

View File

@@ -57,7 +57,6 @@ use crate::error::{
};
use crate::heartbeat::HeartbeatTask;
use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
use crate::recording_rules::{FrontendClient, RecordingRuleEngine};
use crate::transform::register_function_to_query_engine;
use crate::utils::{SizeReportSender, StateReportHandler};
use crate::{Error, FlowWorkerManager, FlownodeOptions};
@@ -246,7 +245,6 @@ impl FlownodeInstance {
self.server.shutdown().await.context(ShutdownServerSnafu)?;
if let Some(task) = &self.heartbeat_task {
info!("Close heartbeat task for flownode");
task.shutdown();
}
@@ -273,8 +271,6 @@ pub struct FlownodeBuilder {
heartbeat_task: Option<HeartbeatTask>,
/// receive a oneshot sender to send state size report
state_report_handler: Option<StateReportHandler>,
/// Client to send sql to frontend
frontend_client: Arc<FrontendClient>,
}
impl FlownodeBuilder {
@@ -285,7 +281,6 @@ impl FlownodeBuilder {
table_meta: TableMetadataManagerRef,
catalog_manager: CatalogManagerRef,
flow_metadata_manager: FlowMetadataManagerRef,
frontend_client: Arc<FrontendClient>,
) -> Self {
Self {
opts,
@@ -295,7 +290,6 @@ impl FlownodeBuilder {
flow_metadata_manager,
heartbeat_task: None,
state_report_handler: None,
frontend_client,
}
}
@@ -453,14 +447,7 @@ impl FlownodeBuilder {
let node_id = self.opts.node_id.map(|id| id as u32);
let rule_engine = RecordingRuleEngine::new(
self.frontend_client.clone(),
query_engine.clone(),
self.flow_metadata_manager.clone(),
table_meta.clone(),
);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta, rule_engine);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
for worker_id in 0..num_workers {
let (tx, rx) = oneshot::channel();

View File

@@ -86,8 +86,7 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
let schema = vec![
datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();

View File

@@ -237,13 +237,6 @@ impl Instance {
let output = match stmt {
Statement::Query(_) | Statement::Explain(_) | Statement::Delete(_) => {
// TODO: remove this when format is supported in datafusion
if let Statement::Explain(explain) = &stmt {
if let Some(format) = explain.format() {
query_ctx.set_explain_format(format.to_string());
}
}
let stmt = QueryStatement::Sql(stmt);
let plan = self
.statement_executor

View File

@@ -25,7 +25,7 @@ use crate::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCr
use crate::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
use crate::fulltext_index::{Analyzer, Config};
async fn new_bounded_stager(prefix: &str) -> (TempDir, Arc<BoundedStager<String>>) {
async fn new_bounded_stager(prefix: &str) -> (TempDir, Arc<BoundedStager>) {
let staging_dir = create_temp_dir(prefix);
let path = staging_dir.path().to_path_buf();
(
@@ -68,13 +68,13 @@ async fn test_search(
let file_accessor = Arc::new(MockFileAccessor::new(prefix));
let puffin_manager = FsPuffinManager::new(stager, file_accessor);
let file_name = "fulltext_index".to_string();
let blob_key = "fulltext_index".to_string();
let mut writer = puffin_manager.writer(&file_name).await.unwrap();
create_index(prefix, &mut writer, &blob_key, texts, config).await;
let file_name = "fulltext_index";
let blob_key = "fulltext_index";
let mut writer = puffin_manager.writer(file_name).await.unwrap();
create_index(prefix, &mut writer, blob_key, texts, config).await;
let reader = puffin_manager.reader(&file_name).await.unwrap();
let index_dir = reader.dir(&blob_key).await.unwrap();
let reader = puffin_manager.reader(file_name).await.unwrap();
let index_dir = reader.dir(blob_key).await.unwrap();
let searcher = TantivyFulltextIndexSearcher::new(index_dir.path()).unwrap();
let results = searcher.search(query).await.unwrap();

View File

@@ -112,7 +112,6 @@ impl MetaClientBuilder {
.enable_store()
.enable_heartbeat()
.enable_procedure()
.enable_access_cluster_info()
}
pub fn enable_heartbeat(self) -> Self {

View File

@@ -198,13 +198,13 @@ impl Inner {
}
);
let leader_addr = self
let leader = self
.ask_leader
.as_ref()
.unwrap()
.get_leader()
.context(error::NoLeaderSnafu)?;
let mut leader = self.make_client(&leader_addr)?;
let mut leader = self.make_client(leader)?;
let (sender, receiver) = mpsc::channel::<HeartbeatRequest>(128);
@@ -236,11 +236,7 @@ impl Inner {
.await
.map_err(error::Error::from)?
.context(error::CreateHeartbeatStreamSnafu)?;
info!(
"Success to create heartbeat stream to server: {}, response: {:#?}",
leader_addr, res
);
info!("Success to create heartbeat stream to server: {:#?}", res);
Ok((
HeartbeatSender::new(self.id, self.role, sender),

View File

@@ -44,7 +44,6 @@ use mailbox_handler::MailboxHandler;
use on_leader_start_handler::OnLeaderStartHandler;
use publish_heartbeat_handler::PublishHeartbeatHandler;
use region_lease_handler::RegionLeaseHandler;
use remap_flow_peer_handler::RemapFlowPeerHandler;
use response_header_handler::ResponseHeaderHandler;
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
@@ -72,7 +71,6 @@ pub mod mailbox_handler;
pub mod on_leader_start_handler;
pub mod publish_heartbeat_handler;
pub mod region_lease_handler;
pub mod remap_flow_peer_handler;
pub mod response_header_handler;
#[async_trait::async_trait]
@@ -575,7 +573,6 @@ impl HeartbeatHandlerGroupBuilder {
self.add_handler_last(publish_heartbeat_handler);
}
self.add_handler_last(CollectStatsHandler::new(self.flush_stats_factor));
self.add_handler_last(RemapFlowPeerHandler::default());
if let Some(flow_state_handler) = self.flow_state_handler.take() {
self.add_handler_last(flow_state_handler);
@@ -856,7 +853,7 @@ mod tests {
.unwrap();
let handlers = group.handlers;
assert_eq!(13, handlers.len());
assert_eq!(12, handlers.len());
let names = [
"ResponseHeaderHandler",
@@ -871,7 +868,6 @@ mod tests {
"MailboxHandler",
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -892,7 +888,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(14, handlers.len());
assert_eq!(13, handlers.len());
let names = [
"ResponseHeaderHandler",
@@ -908,7 +904,6 @@ mod tests {
"CollectStatsHandler",
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -926,7 +921,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(14, handlers.len());
assert_eq!(13, handlers.len());
let names = [
"CollectStatsHandler",
@@ -942,7 +937,6 @@ mod tests {
"MailboxHandler",
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -960,7 +954,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(14, handlers.len());
assert_eq!(13, handlers.len());
let names = [
"ResponseHeaderHandler",
@@ -976,7 +970,6 @@ mod tests {
"CollectStatsHandler",
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -994,7 +987,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(14, handlers.len());
assert_eq!(13, handlers.len());
let names = [
"ResponseHeaderHandler",
@@ -1010,7 +1003,6 @@ mod tests {
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"ResponseHeaderHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -1028,7 +1020,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(13, handlers.len());
assert_eq!(12, handlers.len());
let names = [
"ResponseHeaderHandler",
@@ -1043,7 +1035,6 @@ mod tests {
"CollectStatsHandler",
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -1061,7 +1052,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(13, handlers.len());
assert_eq!(12, handlers.len());
let names = [
"ResponseHeaderHandler",
@@ -1076,7 +1067,6 @@ mod tests {
"MailboxHandler",
"FilterInactiveRegionStatsHandler",
"ResponseHeaderHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {
@@ -1094,7 +1084,7 @@ mod tests {
let group = builder.build().unwrap();
let handlers = group.handlers;
assert_eq!(13, handlers.len());
assert_eq!(12, handlers.len());
let names = [
"CollectStatsHandler",
@@ -1109,7 +1099,6 @@ mod tests {
"MailboxHandler",
"FilterInactiveRegionStatsHandler",
"CollectStatsHandler",
"RemapFlowPeerHandler",
];
for (handler, name) in handlers.iter().zip(names.into_iter()) {

View File

@@ -23,8 +23,8 @@ pub struct CheckLeaderHandler;
#[async_trait::async_trait]
impl HeartbeatHandler for CheckLeaderHandler {
fn is_acceptable(&self, _role: Role) -> bool {
true
fn is_acceptable(&self, role: Role) -> bool {
role == Role::Datanode
}
async fn handle(

View File

@@ -157,7 +157,7 @@ fn extract_base_info(request: &HeartbeatRequest) -> Option<(NodeInfoKey, Peer, P
}
async fn put_into_memory_store(ctx: &mut Context, key: NodeInfoKey, value: NodeInfo) -> Result<()> {
let key = (&key).into();
let key = key.into();
let value = value.try_into().context(InvalidClusterInfoFormatSnafu)?;
let put_req = PutRequest {
key,

View File

@@ -21,7 +21,7 @@ use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
use common_meta::key::{MetadataKey, MetadataValue};
use common_meta::peer::Peer;
use common_meta::rpc::store::PutRequest;
use common_telemetry::{error, info, warn};
use common_telemetry::{error, warn};
use dashmap::DashMap;
use snafu::ResultExt;
@@ -185,10 +185,6 @@ async fn rewrite_node_address(ctx: &mut Context, stat: &Stat) {
match ctx.leader_cached_kv_backend.put(put).await {
Ok(_) => {
info!(
"Successfully updated datanode `NodeAddressValue`: {:?}",
peer
);
// broadcast invalidating cache
let cache_idents = stat
.table_ids()
@@ -204,14 +200,11 @@ async fn rewrite_node_address(ctx: &mut Context, stat: &Stat) {
}
}
Err(e) => {
error!(e; "Failed to update datanode `NodeAddressValue`: {:?}", peer);
error!(e; "Failed to update NodeAddressValue: {:?}", peer);
}
}
} else {
warn!(
"Failed to serialize datanode `NodeAddressValue`: {:?}",
peer
);
warn!("Failed to serialize NodeAddressValue: {:?}", peer);
}
}

View File

@@ -1,92 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::meta::{HeartbeatRequest, Peer, Role};
use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
use common_meta::key::{MetadataKey, MetadataValue};
use common_meta::rpc::store::PutRequest;
use common_telemetry::{error, info, warn};
use dashmap::DashMap;
use crate::handler::{HandleControl, HeartbeatAccumulator, HeartbeatHandler};
use crate::metasrv::Context;
use crate::Result;
#[derive(Debug, Default)]
pub struct RemapFlowPeerHandler {
/// flow_node_id -> epoch
epoch_cache: DashMap<u64, u64>,
}
#[async_trait::async_trait]
impl HeartbeatHandler for RemapFlowPeerHandler {
fn is_acceptable(&self, role: Role) -> bool {
role == Role::Flownode
}
async fn handle(
&self,
req: &HeartbeatRequest,
ctx: &mut Context,
_acc: &mut HeartbeatAccumulator,
) -> Result<HandleControl> {
let Some(peer) = req.peer.as_ref() else {
return Ok(HandleControl::Continue);
};
let current_epoch = req.node_epoch;
let flow_node_id = peer.id;
let refresh = if let Some(mut epoch) = self.epoch_cache.get_mut(&flow_node_id) {
if current_epoch > *epoch.value() {
*epoch.value_mut() = current_epoch;
true
} else {
false
}
} else {
self.epoch_cache.insert(flow_node_id, current_epoch);
true
};
if refresh {
rewrite_node_address(ctx, peer).await;
}
Ok(HandleControl::Continue)
}
}
async fn rewrite_node_address(ctx: &mut Context, peer: &Peer) {
let key = NodeAddressKey::with_flownode(peer.id).to_bytes();
if let Ok(value) = NodeAddressValue::new(peer.clone().into()).try_as_raw_value() {
let put = PutRequest {
key,
value,
prev_kv: false,
};
match ctx.leader_cached_kv_backend.put(put).await {
Ok(_) => {
info!("Successfully updated flow `NodeAddressValue`: {:?}", peer);
// TODO(discord): broadcast cache invalidation to all frontends
}
Err(e) => {
error!(e; "Failed to update flow `NodeAddressValue`: {:?}", peer);
}
}
} else {
warn!("Failed to serialize flow `NodeAddressValue`: {:?}", peer);
}
}
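
Side note on the epoch cache above: dashmap's `entry` API can express the same compare-and-bump in one locked call, avoiding the separate `get_mut`/`insert` branches. A minimal self-contained sketch (not GreptimeDB code), assuming the `dashmap` crate:

```rust
use dashmap::mapref::entry::Entry;
use dashmap::DashMap;

/// Returns true when `epoch` is newer than the cached one (or the node is
/// new), mirroring the `refresh` flag computed by RemapFlowPeerHandler.
fn should_refresh(cache: &DashMap<u64, u64>, node_id: u64, epoch: u64) -> bool {
    match cache.entry(node_id) {
        Entry::Occupied(mut e) => {
            if epoch > *e.get() {
                e.insert(epoch);
                true
            } else {
                false
            }
        }
        Entry::Vacant(v) => {
            v.insert(epoch);
            true
        }
    }
}

fn main() {
    let cache = DashMap::new();
    assert!(should_refresh(&cache, 1, 10)); // first heartbeat from node 1
    assert!(!should_refresh(&cache, 1, 10)); // stale epoch: no refresh
    assert!(should_refresh(&cache, 1, 11)); // newer epoch: refresh
}
```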

View File

@@ -32,7 +32,6 @@ use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBac
use common_meta::leadership_notifier::{
LeadershipChangeNotifier, LeadershipChangeNotifierCustomizerRef,
};
use common_meta::node_expiry_listener::NodeExpiryListener;
use common_meta::peer::Peer;
use common_meta::region_keeper::MemoryRegionKeeperRef;
use common_meta::wal_options_allocator::WalOptionsAllocatorRef;
@@ -152,8 +151,6 @@ pub struct MetasrvOptions {
#[cfg(feature = "pg_kvbackend")]
/// Lock id for meta kv election. Only effect when using pg_kvbackend.
pub meta_election_lock_id: u64,
#[serde(with = "humantime_serde")]
pub node_max_idle_time: Duration,
}
const DEFAULT_METASRV_ADDR_PORT: &str = "3002";
@@ -195,7 +192,6 @@ impl Default for MetasrvOptions {
meta_table_name: DEFAULT_META_TABLE_NAME.to_string(),
#[cfg(feature = "pg_kvbackend")]
meta_election_lock_id: DEFAULT_META_ELECTION_LOCK_ID,
node_max_idle_time: Duration::from_secs(24 * 60 * 60),
}
}
}
@@ -446,10 +442,6 @@ impl Metasrv {
leadership_change_notifier.add_listener(self.wal_options_allocator.clone());
leadership_change_notifier
.add_listener(Arc::new(ProcedureManagerListenerAdapter(procedure_manager)));
leadership_change_notifier.add_listener(Arc::new(NodeExpiryListener::new(
self.options.node_max_idle_time,
self.in_memory.clone(),
)));
if let Some(region_supervisor_ticker) = &self.region_supervisor_ticker {
leadership_change_notifier.add_listener(region_supervisor_ticker.clone() as _);
}
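
For reference, the removed `node_max_idle_time` was a humantime-encoded duration. A minimal sketch of how such a field deserializes, assuming the `serde`, `humantime_serde`, and `toml` crates (the struct is a stand-in, not MetasrvOptions):

```rust
use std::time::Duration;

use serde::Deserialize;

#[derive(Deserialize)]
struct Opts {
    // `humantime_serde` parses strings like "24h" or "30m" into a Duration.
    #[serde(with = "humantime_serde")]
    node_max_idle_time: Duration,
}

fn main() {
    let opts: Opts = toml::from_str(r#"node_max_idle_time = "24h""#).unwrap();
    assert_eq!(opts.node_max_idle_time, Duration::from_secs(24 * 60 * 60));
}
```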

View File

@@ -68,15 +68,13 @@ impl heartbeat_server::Heartbeat for Metasrv {
};
if pusher_id.is_none() {
pusher_id =
Some(register_pusher(&handler_group, header, tx.clone()).await);
pusher_id = register_pusher(&handler_group, header, tx.clone()).await;
}
if let Some(k) = &pusher_id {
METRIC_META_HEARTBEAT_RECV.with_label_values(&[&k.to_string()]);
} else {
METRIC_META_HEARTBEAT_RECV.with_label_values(&["none"]);
}
let res = handler_group
.handle(req, ctx.clone())
.await
@@ -175,13 +173,13 @@ async fn register_pusher(
handler_group: &HeartbeatHandlerGroup,
header: &RequestHeader,
sender: Sender<std::result::Result<HeartbeatResponse, tonic::Status>>,
) -> PusherId {
) -> Option<PusherId> {
let role = header.role();
let id = get_node_id(header);
let pusher_id = PusherId::new(role, id);
let pusher = Pusher::new(sender, header);
handler_group.register_pusher(pusher_id, pusher).await;
pusher_id
Some(pusher_id)
}
#[cfg(test)]

View File

@@ -17,15 +17,13 @@ use std::time::Duration;
use api::v1::meta::{
procedure_service_server, DdlTaskRequest as PbDdlTaskRequest,
DdlTaskResponse as PbDdlTaskResponse, Error, MigrateRegionRequest, MigrateRegionResponse,
DdlTaskResponse as PbDdlTaskResponse, MigrateRegionRequest, MigrateRegionResponse,
ProcedureDetailRequest, ProcedureDetailResponse, ProcedureStateResponse, QueryProcedureRequest,
ResponseHeader,
};
use common_meta::ddl::ExecutorContext;
use common_meta::rpc::ddl::{DdlTask, SubmitDdlTaskRequest};
use common_meta::rpc::procedure;
use common_telemetry::warn;
use snafu::{OptionExt, ResultExt};
use snafu::{ensure, OptionExt, ResultExt};
use tonic::{Request, Response};
use super::GrpcResult;
@@ -39,16 +37,6 @@ impl procedure_service_server::ProcedureService for Metasrv {
&self,
request: Request<QueryProcedureRequest>,
) -> GrpcResult<ProcedureStateResponse> {
if !self.is_leader() {
let resp = ProcedureStateResponse {
header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
..Default::default()
};
warn!("The current meta is not leader, but a `query procedure state` request have reached the meta. Detail: {:?}.", request);
return Ok(Response::new(resp));
}
let QueryProcedureRequest { header, pid, .. } = request.into_inner();
let _header = header.context(error::MissingRequestHeaderSnafu)?;
let pid = pid.context(error::MissingRequiredParameterSnafu { param: "pid" })?;
@@ -69,16 +57,6 @@ impl procedure_service_server::ProcedureService for Metasrv {
}
async fn ddl(&self, request: Request<PbDdlTaskRequest>) -> GrpcResult<PbDdlTaskResponse> {
if !self.is_leader() {
let resp = PbDdlTaskResponse {
header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
..Default::default()
};
warn!("The current meta is not leader, but a `ddl` request have reached the meta. Detail: {:?}.", request);
return Ok(Response::new(resp));
}
let PbDdlTaskRequest {
header,
query_context,
@@ -121,15 +99,12 @@ impl procedure_service_server::ProcedureService for Metasrv {
&self,
request: Request<MigrateRegionRequest>,
) -> GrpcResult<MigrateRegionResponse> {
if !self.is_leader() {
let resp = MigrateRegionResponse {
header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
..Default::default()
};
warn!("The current meta is not leader, but a `migrate` request have reached the meta. Detail: {:?}.", request);
return Ok(Response::new(resp));
}
ensure!(
self.meta_peer_client().is_leader(),
error::UnexpectedSnafu {
violated: "Trying to submit a region migration procedure to non-leader meta server"
}
);
let MigrateRegionRequest {
header,
@@ -175,16 +150,6 @@ impl procedure_service_server::ProcedureService for Metasrv {
&self,
request: Request<ProcedureDetailRequest>,
) -> GrpcResult<ProcedureDetailResponse> {
if !self.is_leader() {
let resp = ProcedureDetailResponse {
header: Some(ResponseHeader::failed(0, Error::is_not_leader())),
..Default::default()
};
warn!("The current meta is not leader, but a `procedure details` request have reached the meta. Detail: {:?}.", request);
return Ok(Response::new(resp));
}
let ProcedureDetailRequest { header } = request.into_inner();
let _header = header.context(error::MissingRequestHeaderSnafu)?;
let metas = self
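
The replacement above swaps hand-rolled not-leader responses for snafu's `ensure!`. A self-contained sketch of that pattern with a stand-in error type (GreptimeDB's `error::UnexpectedSnafu` is analogous), assuming the `snafu` crate:

```rust
use snafu::{ensure, Snafu};

#[derive(Debug, Snafu)]
#[snafu(display("unexpected: {violated}"))]
struct Unexpected {
    violated: String,
}

fn submit_migration(is_leader: bool) -> Result<(), Unexpected> {
    // `ensure!` returns early with the built error when the condition fails.
    ensure!(
        is_leader,
        UnexpectedSnafu {
            violated: "Trying to submit a region migration procedure to non-leader meta server",
        }
    );
    Ok(())
}

fn main() {
    assert!(submit_migration(true).is_ok());
    assert!(submit_migration(false).is_err());
}
```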

View File

@@ -142,7 +142,6 @@ impl DataRegion {
c.column_id = new_column_id_start + delta as u32;
c.column_schema.set_nullable();
match index_options {
IndexOptions::None => {}
IndexOptions::Inverted => {
c.column_schema.set_inverted_index(true);
}

View File

@@ -21,7 +21,7 @@ use api::v1::SemanticType;
use common_telemetry::info;
use common_time::{Timestamp, FOREVER};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SkippingIndexOptions};
use datatypes::schema::ColumnSchema;
use datatypes::value::Value;
use mito2::engine::MITO_ENGINE_NAME;
use object_store::util::join_dir;
@@ -55,8 +55,6 @@ use crate::error::{
use crate::metrics::PHYSICAL_REGION_COUNT;
use crate::utils::{self, to_data_region_id, to_metadata_region_id};
const DEFAULT_TABLE_ID_SKIPPING_INDEX_GRANULARITY: u32 = 1024;
impl MetricEngineInner {
pub async fn create_regions(
&self,
@@ -442,7 +440,6 @@ impl MetricEngineInner {
///
/// Return `[table_id_col, tsid_col]`
fn internal_column_metadata() -> [ColumnMetadata; 2] {
// Safety: BloomFilter is a valid skipping index type
let metric_name_col = ColumnMetadata {
column_id: ReservedColumnId::table_id(),
semantic_type: SemanticType::Tag,
@@ -451,11 +448,7 @@ impl MetricEngineInner {
ConcreteDataType::uint32_datatype(),
false,
)
.with_skipping_options(SkippingIndexOptions {
granularity: DEFAULT_TABLE_ID_SKIPPING_INDEX_GRANULARITY,
index_type: datatypes::schema::SkippingIndexType::BloomFilter,
})
.unwrap(),
.with_inverted_index(true),
};
let tsid_col = ColumnMetadata {
column_id: ReservedColumnId::tsid(),

View File

@@ -30,10 +30,9 @@ impl MetricEngineInner {
pub async fn drop_region(
&self,
region_id: RegionId,
req: RegionDropRequest,
_req: RegionDropRequest,
) -> Result<AffectedRows> {
let data_region_id = utils::to_data_region_id(region_id);
let fast_path = req.fast_path;
// enclose the guard in a block to prevent the guard from polluting the async context
let (is_physical_region, is_physical_region_busy) = {
@@ -53,7 +52,7 @@ impl MetricEngineInner {
if is_physical_region {
// check that no logical region relates to this physical region
if is_physical_region_busy && !fast_path {
if is_physical_region_busy {
// reject if any logical region is still present
return Err(PhysicalRegionBusySnafu {
region_id: data_region_id,
@@ -61,21 +60,9 @@ impl MetricEngineInner {
.build());
}
return self.drop_physical_region(data_region_id).await;
}
if fast_path {
// For the fast path, we don't delete the metadata in the metadata region.
// It only removes the logical region from the engine state.
//
// The drop database procedure will ensure the metadata region and data region are dropped eventually.
self.state
.write()
.unwrap()
.remove_logical_region(region_id)?;
Ok(0)
self.drop_physical_region(data_region_id).await
} else {
// cannot merge these two `if`s, otherwise the type checker will complain
let metadata_region_id = self
.state
.read()
@@ -100,16 +87,13 @@ impl MetricEngineInner {
// Since the physical regions are going to be dropped, we don't need to
// update the contents in metadata region.
self.mito
.handle_request(
data_region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.handle_request(data_region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.with_context(|_| CloseMitoRegionSnafu { region_id })?;
self.mito
.handle_request(
metadata_region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
RegionRequest::Drop(RegionDropRequest {}),
)
.await
.with_context(|_| CloseMitoRegionSnafu { region_id })?;

View File

@@ -40,7 +40,6 @@ pub struct PhysicalRegionOptions {
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum IndexOptions {
#[default]
None,
Inverted,
Skipping {
granularity: u32,

View File

@@ -146,14 +146,11 @@ impl AccessLayer {
} else {
// Write cache is disabled.
let store = self.object_store.clone();
let path_provider = RegionFilePathFactory::new(self.region_dir.clone());
let indexer_builder = IndexerBuilderImpl {
op_type: request.op_type,
metadata: request.metadata.clone(),
row_group_size: write_opts.row_group_size,
puffin_manager: self
.puffin_manager_factory
.build(store, path_provider.clone()),
puffin_manager: self.puffin_manager_factory.build(store),
intermediate_manager: self.intermediate_manager.clone(),
index_options: request.index_options,
inverted_index_config: request.inverted_index_config,
@@ -164,7 +161,9 @@ impl AccessLayer {
self.object_store.clone(),
request.metadata,
indexer_builder,
path_provider,
RegionFilePathFactory {
region_dir: self.region_dir.clone(),
},
)
.await;
writer
@@ -249,18 +248,8 @@ pub trait FilePathProvider: Send + Sync {
/// Path provider that builds paths in local write cache.
#[derive(Clone)]
pub(crate) struct WriteCachePathProvider {
region_id: RegionId,
file_cache: FileCacheRef,
}
impl WriteCachePathProvider {
/// Creates a new `WriteCachePathProvider` instance.
pub fn new(region_id: RegionId, file_cache: FileCacheRef) -> Self {
Self {
region_id,
file_cache,
}
}
pub(crate) region_id: RegionId,
pub(crate) file_cache: FileCacheRef,
}
impl FilePathProvider for WriteCachePathProvider {
@@ -278,14 +267,7 @@ impl FilePathProvider for WriteCachePathProvider {
/// Path provider that builds paths in region storage path.
#[derive(Clone, Debug)]
pub(crate) struct RegionFilePathFactory {
region_dir: String,
}
impl RegionFilePathFactory {
/// Creates a new `RegionFilePathFactory` instance.
pub fn new(region_dir: String) -> Self {
Self { region_dir }
}
pub(crate) region_dir: String,
}
impl FilePathProvider for RegionFilePathFactory {
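
The `FilePathProvider` trait abstracts where SST and index files live, so the same writer code works against the remote store and the local write cache. A self-contained sketch with stand-in types and an assumed file layout (not the crate's real `FileId` or paths):

```rust
type FileId = u64;

trait FilePathProvider: Send + Sync {
    fn build_sst_file_path(&self, file_id: FileId) -> String;
    fn build_index_file_path(&self, file_id: FileId) -> String;
}

/// Stand-in for RegionFilePathFactory: paths rooted at the region directory.
struct RegionDirProvider {
    region_dir: String,
}

impl FilePathProvider for RegionDirProvider {
    fn build_sst_file_path(&self, file_id: FileId) -> String {
        format!("{}/{}.parquet", self.region_dir, file_id)
    }

    fn build_index_file_path(&self, file_id: FileId) -> String {
        format!("{}/index/{}.puffin", self.region_dir, file_id)
    }
}

fn main() {
    let p = RegionDirProvider { region_dir: "region0".to_string() };
    assert_eq!(p.build_sst_file_path(7), "region0/7.parquet");
    assert_eq!(p.build_index_file_path(7), "region0/index/7.puffin");
}
```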

View File

@@ -187,12 +187,9 @@ impl FileCache {
}
/// Removes a file from the cache explicitly.
/// It always tries to remove the file from the local store because we may not have the file
/// in the memory index if the upload failed.
pub(crate) async fn remove(&self, key: IndexKey) {
let file_path = self.cache_file_path(key);
self.memory_index.remove(&key).await;
// Always delete the file from the local store.
if let Err(e) = self.local_store.delete(&file_path).await {
warn!(e; "Failed to delete a cached file {}", file_path);
}

View File

@@ -22,7 +22,6 @@ use common_telemetry::{debug, info};
use futures::AsyncWriteExt;
use object_store::ObjectStore;
use snafu::ResultExt;
use store_api::storage::RegionId;
use crate::access_layer::{
new_fs_cache_store, FilePathProvider, RegionFilePathFactory, SstInfoArray, SstWriteRequest,
@@ -115,14 +114,15 @@ impl WriteCache {
let region_id = write_request.metadata.region_id;
let store = self.file_cache.local_store();
let path_provider = WriteCachePathProvider::new(region_id, self.file_cache.clone());
let path_provider = WriteCachePathProvider {
file_cache: self.file_cache.clone(),
region_id,
};
let indexer = IndexerBuilderImpl {
op_type: write_request.op_type,
metadata: write_request.metadata.clone(),
row_group_size: write_opts.row_group_size,
puffin_manager: self
.puffin_manager_factory
.build(store, path_provider.clone()),
puffin_manager: self.puffin_manager_factory.build(store),
intermediate_manager: self.intermediate_manager.clone(),
index_options: write_request.index_options,
inverted_index_config: write_request.inverted_index_config,
@@ -150,41 +150,24 @@ impl WriteCache {
return Ok(sst_info);
}
let mut upload_tracker = UploadTracker::new(region_id);
let mut err = None;
let remote_store = &upload_request.remote_store;
for sst in &sst_info {
let parquet_key = IndexKey::new(region_id, sst.file_id, FileType::Parquet);
let parquet_path = upload_request
.dest_path_provider
.build_sst_file_path(sst.file_id);
if let Err(e) = self.upload(parquet_key, &parquet_path, remote_store).await {
err = Some(e);
break;
}
upload_tracker.push_uploaded_file(parquet_path);
self.upload(parquet_key, &parquet_path, remote_store)
.await?;
if sst.index_metadata.file_size > 0 {
let puffin_key = IndexKey::new(region_id, sst.file_id, FileType::Puffin);
let puffin_path = upload_request
let puffin_path = &upload_request
.dest_path_provider
.build_index_file_path(sst.file_id);
if let Err(e) = self.upload(puffin_key, &puffin_path, remote_store).await {
err = Some(e);
break;
}
upload_tracker.push_uploaded_file(puffin_path);
self.upload(puffin_key, puffin_path, remote_store).await?;
}
}
if let Some(err) = err {
// Cleans files on failure.
upload_tracker
.clean(&sst_info, &self.file_cache, remote_store)
.await;
return Err(err);
}
Ok(sst_info)
}
@@ -350,61 +333,6 @@ pub struct SstUploadRequest {
pub remote_store: ObjectStore,
}
/// A struct to track uploaded files and clean them up if the upload fails.
struct UploadTracker {
/// Id of the region to track.
region_id: RegionId,
/// Paths of files uploaded successfully.
files_uploaded: Vec<String>,
}
impl UploadTracker {
/// Creates a new instance of `UploadTracker` for a given region.
fn new(region_id: RegionId) -> Self {
Self {
region_id,
files_uploaded: Vec::new(),
}
}
/// Add a file path to the list of uploaded files.
fn push_uploaded_file(&mut self, path: String) {
self.files_uploaded.push(path);
}
/// Cleans up uploaded files and files in the file cache on a best-effort basis.
async fn clean(
&self,
sst_info: &SstInfoArray,
file_cache: &FileCacheRef,
remote_store: &ObjectStore,
) {
common_telemetry::info!(
"Start cleaning files on upload failure, region: {}, num_ssts: {}",
self.region_id,
sst_info.len()
);
// Cleans files in the file cache first.
for sst in sst_info {
let parquet_key = IndexKey::new(self.region_id, sst.file_id, FileType::Parquet);
file_cache.remove(parquet_key).await;
if sst.index_metadata.file_size > 0 {
let puffin_key = IndexKey::new(self.region_id, sst.file_id, FileType::Puffin);
file_cache.remove(puffin_key).await;
}
}
// Cleans uploaded files.
for file_path in &self.files_uploaded {
if let Err(e) = remote_store.delete(file_path).await {
common_telemetry::error!(e; "Failed to delete file {}", file_path);
}
}
}
}
#[cfg(test)]
mod tests {
use common_test_util::temp_dir::create_temp_dir;
@@ -427,7 +355,9 @@ mod tests {
// and now just use local file system to mock.
let mut env = TestEnv::new();
let mock_store = env.init_object_store_manager();
let path_provider = RegionFilePathFactory::new("test".to_string());
let path_provider = RegionFilePathFactory {
region_dir: "test".to_string(),
};
let local_dir = create_temp_dir("");
let local_store = new_fs_store(local_dir.path().to_str().unwrap());
@@ -558,7 +488,9 @@ mod tests {
..Default::default()
};
let upload_request = SstUploadRequest {
dest_path_provider: RegionFilePathFactory::new(data_home.clone()),
dest_path_provider: RegionFilePathFactory {
region_dir: data_home.clone(),
},
remote_store: mock_store.clone(),
};

View File

@@ -56,10 +56,7 @@ async fn test_engine_drop_region() {
// It's okay to drop a region doesn't exist.
engine
.handle_request(
region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.unwrap_err();
@@ -89,10 +86,7 @@ async fn test_engine_drop_region() {
// drop the created region.
engine
.handle_request(
region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.handle_request(region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.unwrap();
assert!(!engine.is_region_exists(region_id));
@@ -198,10 +192,7 @@ async fn test_engine_drop_region_for_custom_store() {
// Drop the custom region.
engine
.handle_request(
custom_region_id,
RegionRequest::Drop(RegionDropRequest { fast_path: false }),
)
.handle_request(custom_region_id, RegionRequest::Drop(RegionDropRequest {}))
.await
.unwrap();
assert!(!engine.is_region_exists(custom_region_id));

View File

@@ -301,7 +301,10 @@ impl PartitionTreeMemtable {
fn update_stats(&self, metrics: &WriteMetrics) {
// Only let the tracker track value bytes.
self.alloc_tracker.on_allocation(metrics.value_bytes);
metrics.update_timestamp_range(&self.max_timestamp, &self.min_timestamp);
self.max_timestamp
.fetch_max(metrics.max_ts, Ordering::SeqCst);
self.min_timestamp
.fetch_min(metrics.min_ts, Ordering::SeqCst);
}
}
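
`AtomicI64::fetch_max`/`fetch_min` replace the hand-written compare-exchange loops deleted below. A minimal sketch showing the equivalent behavior:

```rust
use std::sync::atomic::{AtomicI64, Ordering};

// fetch_max/fetch_min perform the read-compare-swap retry loop internally,
// which is exactly what the removed update_timestamp_range did by hand.
fn update_range(max_ts: &AtomicI64, min_ts: &AtomicI64, ts: i64) {
    max_ts.fetch_max(ts, Ordering::SeqCst);
    min_ts.fetch_min(ts, Ordering::SeqCst);
}

fn main() {
    let max_ts = AtomicI64::new(i64::MIN);
    let min_ts = AtomicI64::new(i64::MAX);
    for ts in [42, -7, 100] {
        update_range(&max_ts, &min_ts, ts);
    }
    assert_eq!(max_ts.load(Ordering::SeqCst), 100);
    assert_eq!(min_ts.load(Ordering::SeqCst), -7);
}
```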

View File

@@ -14,8 +14,6 @@
//! Internal metrics of the memtable.
use std::sync::atomic::{AtomicI64, Ordering};
/// Metrics of writing memtables.
pub(crate) struct WriteMetrics {
/// Size allocated by keys.
@@ -28,51 +26,6 @@ pub(crate) struct WriteMetrics {
pub(crate) max_ts: i64,
}
impl WriteMetrics {
/// Update the min/max timestamp range according to current write metric.
pub(crate) fn update_timestamp_range(&self, prev_max_ts: &AtomicI64, prev_min_ts: &AtomicI64) {
loop {
let current_min = prev_min_ts.load(Ordering::Relaxed);
if self.min_ts >= current_min {
break;
}
let Err(updated) = prev_min_ts.compare_exchange(
current_min,
self.min_ts,
Ordering::Relaxed,
Ordering::Relaxed,
) else {
break;
};
if updated == self.min_ts {
break;
}
}
loop {
let current_max = prev_max_ts.load(Ordering::Relaxed);
if self.max_ts <= current_max {
break;
}
let Err(updated) = prev_max_ts.compare_exchange(
current_max,
self.max_ts,
Ordering::Relaxed,
Ordering::Relaxed,
) else {
break;
};
if updated == self.max_ts {
break;
}
}
}
}
impl Default for WriteMetrics {
fn default() -> Self {
Self {

View File

@@ -146,7 +146,8 @@ impl TimeSeriesMemtable {
fn update_stats(&self, stats: WriteMetrics) {
self.alloc_tracker
.on_allocation(stats.key_bytes + stats.value_bytes);
stats.update_timestamp_range(&self.max_timestamp, &self.min_timestamp);
self.max_timestamp.fetch_max(stats.max_ts, Ordering::SeqCst);
self.min_timestamp.fetch_min(stats.min_ts, Ordering::SeqCst);
}
fn write_key_value(&self, kv: KeyValue, stats: &mut WriteMetrics) -> Result<()> {

View File

@@ -32,6 +32,7 @@ use tokio::sync::{mpsc, Semaphore};
use tokio_stream::wrappers::ReceiverStream;
use crate::access_layer::AccessLayerRef;
use crate::cache::file_cache::FileCacheRef;
use crate::cache::CacheStrategy;
use crate::config::DEFAULT_SCAN_CHANNEL_SIZE;
use crate::error::Result;
@@ -310,13 +311,10 @@ impl ScanRegion {
let memtables: Vec<_> = memtables
.into_iter()
.filter(|mem| {
if mem.is_empty() {
// Check if the memtable is empty by reading its stats.
let Some((start, end)) = mem.stats().time_range() else {
return false;
}
let stats = mem.stats();
// Safety: the memtable is not empty.
let (start, end) = stats.time_range().unwrap();
};
// The time range of the memtable is inclusive.
let memtable_range = TimestampRange::new_inclusive(Some(start), Some(end));
memtable_range.intersects(&time_range)
@@ -426,7 +424,12 @@ impl ScanRegion {
return None;
}
let file_cache = self.cache_strategy.write_cache().map(|w| w.file_cache());
let file_cache = || -> Option<FileCacheRef> {
let write_cache = self.cache_strategy.write_cache()?;
let file_cache = write_cache.file_cache();
Some(file_cache)
}();
let inverted_index_cache = self.cache_strategy.inverted_index_cache().cloned();
let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();
@@ -461,8 +464,14 @@ impl ScanRegion {
return None;
}
let file_cache = self.cache_strategy.write_cache().map(|w| w.file_cache());
let file_cache = || -> Option<FileCacheRef> {
let write_cache = self.cache_strategy.write_cache()?;
let file_cache = write_cache.file_cache();
Some(file_cache)
}();
let bloom_filter_index_cache = self.cache_strategy.bloom_filter_index_cache().cloned();
let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();
BloomFilterIndexApplierBuilder::new(
@@ -487,18 +496,12 @@ impl ScanRegion {
return None;
}
let file_cache = self.cache_strategy.write_cache().map(|w| w.file_cache());
let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();
FulltextIndexApplierBuilder::new(
self.access_layer.region_dir().to_string(),
self.version.metadata.region_id,
self.access_layer.object_store().clone(),
self.access_layer.puffin_manager_factory().clone(),
self.version.metadata.as_ref(),
)
.with_file_cache(file_cache)
.with_puffin_metadata_cache(puffin_metadata_cache)
.build(&self.request.filters)
.inspect_err(|err| warn!(err; "Failed to build fulltext index applier"))
.ok()
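
Both `file_cache` forms above are equivalent: `Option::map` projects in one step, while the immediately-invoked closure lets `?` short-circuit across several steps. A tiny generic sketch:

```rust
fn via_map(write_cache: Option<&str>) -> Option<String> {
    write_cache.map(|w| format!("{w}/files"))
}

fn via_closure(write_cache: Option<&str>) -> Option<String> {
    // The closure gives `?` an Option-returning scope, independent of the
    // outer function's return type.
    (|| {
        let w = write_cache?;
        Some(format!("{w}/files"))
    })()
}

fn main() {
    assert_eq!(via_map(Some("cache")), via_closure(Some("cache")));
    assert_eq!(via_map(None), via_closure(None));
}
```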

View File

@@ -35,8 +35,8 @@ use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef};
use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState};
use store_api::region_request::{
AffectedRows, RegionAlterRequest, RegionCatchupRequest, RegionCloseRequest,
RegionCompactRequest, RegionCreateRequest, RegionFlushRequest, RegionOpenRequest,
RegionRequest, RegionTruncateRequest,
RegionCompactRequest, RegionCreateRequest, RegionDropRequest, RegionFlushRequest,
RegionOpenRequest, RegionRequest, RegionTruncateRequest,
};
use store_api::storage::{RegionId, SequenceNumber};
use tokio::sync::oneshot::{self, Receiver, Sender};
@@ -624,10 +624,10 @@ impl WorkerRequest {
sender: sender.into(),
request: DdlRequest::Create(v),
}),
RegionRequest::Drop(_) => WorkerRequest::Ddl(SenderDdlRequest {
RegionRequest::Drop(v) => WorkerRequest::Ddl(SenderDdlRequest {
region_id,
sender: sender.into(),
request: DdlRequest::Drop,
request: DdlRequest::Drop(v),
}),
RegionRequest::Open(v) => WorkerRequest::Ddl(SenderDdlRequest {
region_id,
@@ -690,7 +690,7 @@ impl WorkerRequest {
#[derive(Debug)]
pub(crate) enum DdlRequest {
Create(RegionCreateRequest),
Drop,
Drop(RegionDropRequest),
Open((RegionOpenRequest, Option<WalEntryReceiver>)),
Close(RegionCloseRequest),
Alter(RegionAlterRequest),

View File

@@ -174,8 +174,31 @@ impl FileMeta {
.contains(&IndexType::BloomFilterIndex)
}
pub fn index_file_size(&self) -> u64 {
self.index_file_size
/// Returns the size of the inverted index file
pub fn inverted_index_size(&self) -> Option<u64> {
if self.available_indexes.len() == 1 && self.inverted_index_available() {
Some(self.index_file_size)
} else {
None
}
}
/// Returns the size of the fulltext index file
pub fn fulltext_index_size(&self) -> Option<u64> {
if self.available_indexes.len() == 1 && self.fulltext_index_available() {
Some(self.index_file_size)
} else {
None
}
}
/// Returns the size of the bloom filter index file
pub fn bloom_filter_index_size(&self) -> Option<u64> {
if self.available_indexes.len() == 1 && self.bloom_filter_index_available() {
Some(self.index_file_size)
} else {
None
}
}
}
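
The three accessors above share one rule: the single puffin file's size is only attributable to an index type when that is the only index in the file. A hypothetical condensed form (stand-in enum, not the crate's `IndexType`):

```rust
#[derive(PartialEq)]
enum IndexType {
    Inverted,
    Fulltext,
    BloomFilter,
}

fn single_index_size(available: &[IndexType], file_size: u64, want: IndexType) -> Option<u64> {
    // With more than one index in the file, the size cannot be split.
    (available.len() == 1 && available.contains(&want)).then_some(file_size)
}

fn main() {
    assert_eq!(
        single_index_size(&[IndexType::Inverted], 128, IndexType::Inverted),
        Some(128)
    );
    assert_eq!(
        single_index_size(&[IndexType::Inverted, IndexType::Fulltext], 128, IndexType::Inverted),
        None
    );
}
```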

View File

@@ -113,9 +113,11 @@ impl FilePurger for LocalFilePurger {
}
// Purges index content in the stager.
let puffin_file_name =
crate::sst::location::index_file_path(sst_layer.region_dir(), file_meta.file_id);
if let Err(e) = sst_layer
.puffin_manager_factory()
.purge_stager(file_meta.file_id)
.purge_stager(&puffin_file_name)
.await
{
error!(e; "Failed to purge stager with index file, file_id: {}, region: {}",

View File

@@ -103,6 +103,7 @@ pub type BloomFilterOutput = IndexBaseOutput;
#[derive(Default)]
pub struct Indexer {
file_id: FileId,
file_path: String,
region_id: RegionId,
puffin_manager: Option<SstPuffinManager>,
inverted_indexer: Option<InvertedIndexer>,
@@ -169,7 +170,7 @@ impl Indexer {
#[async_trait::async_trait]
pub trait IndexerBuilder {
/// Builds an indexer for the given file id, writing to [index_file_path].
async fn build(&self, file_id: FileId) -> Indexer;
async fn build(&self, file_id: FileId, index_file_path: String) -> Indexer;
}
pub(crate) struct IndexerBuilderImpl {
@@ -187,9 +188,10 @@ pub(crate) struct IndexerBuilderImpl {
#[async_trait::async_trait]
impl IndexerBuilder for IndexerBuilderImpl {
/// Sanity-checks the arguments and creates a new [Indexer] if they are valid.
async fn build(&self, file_id: FileId) -> Indexer {
async fn build(&self, file_id: FileId, index_file_path: String) -> Indexer {
let mut indexer = Indexer {
file_id,
file_path: index_file_path,
region_id: self.metadata.region_id,
..Default::default()
};
@@ -390,31 +392,30 @@ mod tests {
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
use super::*;
use crate::access_layer::FilePathProvider;
use crate::config::{FulltextIndexConfig, Mode};
struct MetaConfig {
with_inverted: bool,
with_tag: bool,
with_fulltext: bool,
with_skipping_bloom: bool,
}
fn mock_region_metadata(
MetaConfig {
with_inverted,
with_tag,
with_fulltext,
with_skipping_bloom,
}: MetaConfig,
) -> RegionMetadataRef {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 2));
let mut column_schema = ColumnSchema::new("a", ConcreteDataType::int64_datatype(), false);
if with_inverted {
column_schema = column_schema.with_inverted_index(true);
}
builder
.push_column_metadata(ColumnMetadata {
column_schema,
semantic_type: SemanticType::Field,
column_schema: ColumnSchema::new("a", ConcreteDataType::int64_datatype(), false),
semantic_type: if with_tag {
SemanticType::Tag
} else {
SemanticType::Field
},
column_id: 1,
})
.push_column_metadata(ColumnMetadata {
@@ -432,6 +433,10 @@ mod tests {
column_id: 3,
});
if with_tag {
builder.primary_key(vec![1]);
}
if with_fulltext {
let column_schema =
ColumnSchema::new("text", ConcreteDataType::string_datatype(), true)
@@ -479,18 +484,6 @@ mod tests {
IntermediateManager::init_fs(path).await.unwrap()
}
struct NoopPathProvider;
impl FilePathProvider for NoopPathProvider {
fn build_index_file_path(&self, _file_id: FileId) -> String {
unreachable!()
}
fn build_sst_file_path(&self, _file_id: FileId) -> String {
unreachable!()
}
}
#[tokio::test]
async fn test_build_indexer_basic() {
let (dir, factory) =
@@ -498,7 +491,7 @@ mod tests {
let intm_manager = mock_intm_mgr(dir.path().to_string_lossy()).await;
let metadata = mock_region_metadata(MetaConfig {
with_inverted: true,
with_tag: true,
with_fulltext: true,
with_skipping_bloom: true,
});
@@ -506,14 +499,14 @@ mod tests {
op_type: OperationType::Flush,
metadata,
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager,
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_some());
@@ -528,7 +521,7 @@ mod tests {
let intm_manager = mock_intm_mgr(dir.path().to_string_lossy()).await;
let metadata = mock_region_metadata(MetaConfig {
with_inverted: true,
with_tag: true,
with_fulltext: true,
with_skipping_bloom: true,
});
@@ -536,7 +529,7 @@ mod tests {
op_type: OperationType::Flush,
metadata: metadata.clone(),
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager.clone(),
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig {
@@ -546,7 +539,7 @@ mod tests {
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_none());
@@ -557,7 +550,7 @@ mod tests {
op_type: OperationType::Compact,
metadata: metadata.clone(),
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager.clone(),
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
@@ -567,7 +560,7 @@ mod tests {
},
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_some());
@@ -578,7 +571,7 @@ mod tests {
op_type: OperationType::Compact,
metadata,
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager,
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
@@ -588,7 +581,7 @@ mod tests {
..Default::default()
},
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_some());
@@ -603,7 +596,7 @@ mod tests {
let intm_manager = mock_intm_mgr(dir.path().to_string_lossy()).await;
let metadata = mock_region_metadata(MetaConfig {
with_inverted: false,
with_tag: false,
with_fulltext: true,
with_skipping_bloom: true,
});
@@ -611,14 +604,14 @@ mod tests {
op_type: OperationType::Flush,
metadata: metadata.clone(),
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager.clone(),
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_none());
@@ -626,7 +619,7 @@ mod tests {
assert!(indexer.bloom_filter_indexer.is_some());
let metadata = mock_region_metadata(MetaConfig {
with_inverted: true,
with_tag: true,
with_fulltext: false,
with_skipping_bloom: true,
});
@@ -634,14 +627,14 @@ mod tests {
op_type: OperationType::Flush,
metadata: metadata.clone(),
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager.clone(),
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_some());
@@ -649,7 +642,7 @@ mod tests {
assert!(indexer.bloom_filter_indexer.is_some());
let metadata = mock_region_metadata(MetaConfig {
with_inverted: true,
with_tag: true,
with_fulltext: true,
with_skipping_bloom: false,
});
@@ -657,14 +650,14 @@ mod tests {
op_type: OperationType::Flush,
metadata: metadata.clone(),
row_group_size: 1024,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager,
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_some());
@@ -679,7 +672,7 @@ mod tests {
let intm_manager = mock_intm_mgr(dir.path().to_string_lossy()).await;
let metadata = mock_region_metadata(MetaConfig {
with_inverted: true,
with_tag: true,
with_fulltext: true,
with_skipping_bloom: true,
});
@@ -687,14 +680,14 @@ mod tests {
op_type: OperationType::Flush,
metadata,
row_group_size: 0,
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
puffin_manager: factory.build(mock_object_store()),
intermediate_manager: intm_manager,
index_options: IndexOptions::default(),
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
}
.build(FileId::random())
.build(FileId::random(), "test".to_string())
.await;
assert!(indexer.inverted_indexer.is_none());
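
To summarize the trait change exercised by these tests: `IndexerBuilder::build` now receives the index file path alongside the file id. A self-contained sketch with stand-in types, assuming the `async-trait` and `tokio` crates:

```rust
use async_trait::async_trait;

type FileId = u64;

#[derive(Debug, Default)]
struct Indexer {
    file_id: FileId,
    file_path: String,
}

#[async_trait]
trait IndexerBuilder {
    async fn build(&self, file_id: FileId, index_file_path: String) -> Indexer;
}

struct NoopIndexerBuilder;

#[async_trait]
impl IndexerBuilder for NoopIndexerBuilder {
    async fn build(&self, file_id: FileId, index_file_path: String) -> Indexer {
        // A real builder would also attach inverted/fulltext/bloom indexers.
        Indexer { file_id, file_path: index_file_path }
    }
}

#[tokio::main]
async fn main() {
    let indexer = NoopIndexerBuilder
        .build(42, "region0/index/42.puffin".to_string())
        .await;
    println!("{indexer:?}");
}
```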

View File

@@ -28,7 +28,6 @@ use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader};
use snafu::ResultExt;
use store_api::storage::{ColumnId, RegionId};
use crate::access_layer::{RegionFilePathFactory, WriteCachePathProvider};
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
use crate::cache::index::bloom_filter_index::{
BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader,
@@ -44,6 +43,7 @@ use crate::sst::index::bloom_filter::applier::builder::Predicate;
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
use crate::sst::index::TYPE_BLOOM_FILTER_INDEX;
use crate::sst::location;
pub(crate) type BloomFilterIndexApplierRef = Arc<BloomFilterIndexApplier>;
@@ -247,12 +247,11 @@ impl BloomFilterIndexApplier {
return Ok(None);
};
let puffin_manager = self.puffin_manager_factory.build(
file_cache.local_store(),
WriteCachePathProvider::new(self.region_id, file_cache.clone()),
);
let puffin_manager = self.puffin_manager_factory.build(file_cache.local_store());
let puffin_file_name = file_cache.cache_file_path(index_key);
let reader = puffin_manager
.reader(&file_id)
.reader(&puffin_file_name)
.await
.context(PuffinBuildReaderSnafu)?
.with_file_size_hint(file_size_hint)
@@ -279,14 +278,12 @@ impl BloomFilterIndexApplier {
) -> Result<BlobReader> {
let puffin_manager = self
.puffin_manager_factory
.build(
self.object_store.clone(),
RegionFilePathFactory::new(self.region_dir.clone()),
)
.build(self.object_store.clone())
.with_puffin_metadata_cache(self.puffin_metadata_cache.clone());
let file_path = location::index_file_path(&self.region_dir, file_id);
puffin_manager
.reader(&file_id)
.reader(&file_path)
.await
.context(PuffinBuildReaderSnafu)?
.with_file_size_hint(file_size_hint)
@@ -450,6 +447,7 @@ mod tests {
let memory_usage_threshold = Some(1024);
let file_id = FileId::random();
let region_dir = "region_dir".to_string();
let path = location::index_file_path(&region_dir, file_id);
let mut indexer =
BloomFilterIndexer::new(file_id, &region_metadata, intm_mgr, memory_usage_threshold)
@@ -462,12 +460,9 @@ mod tests {
let mut batch = new_batch("tag2", 10..20);
indexer.update(&mut batch).await.unwrap();
let puffin_manager = factory.build(
object_store.clone(),
RegionFilePathFactory::new(region_dir.clone()),
);
let puffin_manager = factory.build(object_store.clone());
let mut puffin_writer = puffin_manager.writer(&file_id).await.unwrap();
let mut puffin_writer = puffin_manager.writer(&path).await.unwrap();
indexer.finish(&mut puffin_writer).await.unwrap();
puffin_writer.finish().await.unwrap();

View File

@@ -356,7 +356,6 @@ pub(crate) mod tests {
use store_api::storage::RegionId;
use super::*;
use crate::access_layer::FilePathProvider;
use crate::read::BatchColumn;
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt};
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -369,18 +368,6 @@ pub(crate) mod tests {
IntermediateManager::init_fs(path).await.unwrap()
}
pub struct TestPathProvider;
impl FilePathProvider for TestPathProvider {
fn build_index_file_path(&self, file_id: FileId) -> String {
file_id.to_string()
}
fn build_sst_file_path(&self, file_id: FileId) -> String {
file_id.to_string()
}
}
/// tag_str:
/// - type: string
/// - index: bloom filter
@@ -496,16 +483,16 @@ pub(crate) mod tests {
indexer.update(&mut batch).await.unwrap();
let (_d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
let puffin_manager = factory.build(object_store, TestPathProvider);
let puffin_manager = factory.build(object_store);
let file_id = FileId::random();
let mut puffin_writer = puffin_manager.writer(&file_id).await.unwrap();
let index_file_name = "index_file";
let mut puffin_writer = puffin_manager.writer(index_file_name).await.unwrap();
let (row_count, byte_count) = indexer.finish(&mut puffin_writer).await.unwrap();
assert_eq!(row_count, 20);
assert!(byte_count > 0);
puffin_writer.finish().await.unwrap();
let puffin_reader = puffin_manager.reader(&file_id).await.unwrap();
let puffin_reader = puffin_manager.reader(index_file_name).await.unwrap();
// tag_str
{

View File

@@ -15,22 +15,19 @@
use std::collections::BTreeSet;
use std::sync::Arc;
use common_telemetry::warn;
use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
use object_store::ObjectStore;
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
use puffin::puffin_manager::{DirGuard, PuffinManager, PuffinReader};
use snafu::ResultExt;
use store_api::storage::{ColumnId, RegionId};
use store_api::storage::ColumnId;
use crate::access_layer::{RegionFilePathFactory, WriteCachePathProvider};
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
use crate::error::{ApplyFulltextIndexSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu, Result};
use crate::metrics::INDEX_APPLY_ELAPSED;
use crate::sst::file::FileId;
use crate::sst::index::fulltext_index::INDEX_BLOB_TYPE_TANTIVY;
use crate::sst::index::puffin_manager::{PuffinManagerFactory, SstPuffinDir};
use crate::sst::index::TYPE_FULLTEXT_INDEX;
use crate::sst::location;
pub mod builder;
@@ -39,9 +36,6 @@ pub struct FulltextIndexApplier {
/// The root directory of the region.
region_dir: String,
/// The region ID.
region_id: RegionId,
/// Queries to apply to the index.
queries: Vec<(ColumnId, String)>,
@@ -50,12 +44,6 @@ pub struct FulltextIndexApplier {
/// Store responsible for accessing index files.
store: ObjectStore,
/// File cache to be used by the `FulltextIndexApplier`.
file_cache: Option<FileCacheRef>,
/// The puffin metadata cache.
puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
}
pub type FulltextIndexApplierRef = Arc<FulltextIndexApplier>;
@@ -64,43 +52,20 @@ impl FulltextIndexApplier {
/// Creates a new `FulltextIndexApplier`.
pub fn new(
region_dir: String,
region_id: RegionId,
store: ObjectStore,
queries: Vec<(ColumnId, String)>,
puffin_manager_factory: PuffinManagerFactory,
) -> Self {
Self {
region_dir,
region_id,
store,
queries,
puffin_manager_factory,
file_cache: None,
puffin_metadata_cache: None,
}
}
/// Sets the file cache.
pub fn with_file_cache(mut self, file_cache: Option<FileCacheRef>) -> Self {
self.file_cache = file_cache;
self
}
/// Sets the puffin metadata cache.
pub fn with_puffin_metadata_cache(
mut self,
puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
) -> Self {
self.puffin_metadata_cache = puffin_metadata_cache;
self
}
/// Applies the queries to the fulltext index of the specified SST file.
pub async fn apply(
&self,
file_id: FileId,
file_size_hint: Option<u64>,
) -> Result<BTreeSet<RowId>> {
pub async fn apply(&self, file_id: FileId) -> Result<BTreeSet<RowId>> {
let _timer = INDEX_APPLY_ELAPSED
.with_label_values(&[TYPE_FULLTEXT_INDEX])
.start_timer();
@@ -109,9 +74,7 @@ impl FulltextIndexApplier {
let mut row_ids = BTreeSet::new();
for (column_id, query) in &self.queries {
let dir = self
.index_dir_path(file_id, *column_id, file_size_hint)
.await?;
let dir = self.index_dir_path(file_id, *column_id).await?;
let path = match &dir {
Some(dir) => dir.path(),
None => {
@@ -147,74 +110,15 @@ impl FulltextIndexApplier {
&self,
file_id: FileId,
column_id: ColumnId,
file_size_hint: Option<u64>,
) -> Result<Option<SstPuffinDir>> {
let blob_key = format!("{INDEX_BLOB_TYPE_TANTIVY}-{column_id}");
let puffin_manager = self.puffin_manager_factory.build(self.store.clone());
let file_path = location::index_file_path(&self.region_dir, file_id);
// FAST PATH: Try to read the index from the file cache.
if let Some(file_cache) = &self.file_cache {
let index_key = IndexKey::new(self.region_id, file_id, FileType::Puffin);
if file_cache.get(index_key).await.is_some() {
match self
.get_index_from_file_cache(file_cache, file_id, file_size_hint, &blob_key)
.await
{
Ok(dir) => return Ok(dir),
Err(err) => {
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.")
}
}
}
}
// SLOW PATH: Try to read the index from the remote file.
self.get_index_from_remote_file(file_id, file_size_hint, &blob_key)
.await
}
async fn get_index_from_file_cache(
&self,
file_cache: &FileCacheRef,
file_id: FileId,
file_size_hint: Option<u64>,
blob_key: &str,
) -> Result<Option<SstPuffinDir>> {
match self
.puffin_manager_factory
.build(
file_cache.local_store(),
WriteCachePathProvider::new(self.region_id, file_cache.clone()),
)
.reader(&file_id)
match puffin_manager
.reader(&file_path)
.await
.context(PuffinBuildReaderSnafu)?
.with_file_size_hint(file_size_hint)
.dir(blob_key)
.await
{
Ok(dir) => Ok(Some(dir)),
Err(puffin::error::Error::BlobNotFound { .. }) => Ok(None),
Err(err) => Err(err).context(PuffinReadBlobSnafu),
}
}
async fn get_index_from_remote_file(
&self,
file_id: FileId,
file_size_hint: Option<u64>,
blob_key: &str,
) -> Result<Option<SstPuffinDir>> {
match self
.puffin_manager_factory
.build(
self.store.clone(),
RegionFilePathFactory::new(self.region_dir.clone()),
)
.reader(&file_id)
.await
.context(PuffinBuildReaderSnafu)?
.with_file_size_hint(file_size_hint)
.dir(blob_key)
.dir(&format!("{INDEX_BLOB_TYPE_TANTIVY}-{column_id}"))
.await
{
Ok(dir) => Ok(Some(dir)),

View File

@@ -15,11 +15,9 @@
use datafusion_common::ScalarValue;
use datafusion_expr::Expr;
use object_store::ObjectStore;
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
use store_api::metadata::RegionMetadata;
use store_api::storage::{ColumnId, ConcreteDataType, RegionId};
use store_api::storage::{ColumnId, ConcreteDataType};
use crate::cache::file_cache::FileCacheRef;
use crate::error::Result;
use crate::sst::index::fulltext_index::applier::FulltextIndexApplier;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -27,49 +25,27 @@ use crate::sst::index::puffin_manager::PuffinManagerFactory;
/// `FulltextIndexApplierBuilder` is a builder for `FulltextIndexApplier`.
pub struct FulltextIndexApplierBuilder<'a> {
region_dir: String,
region_id: RegionId,
store: ObjectStore,
puffin_manager_factory: PuffinManagerFactory,
metadata: &'a RegionMetadata,
file_cache: Option<FileCacheRef>,
puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
}
impl<'a> FulltextIndexApplierBuilder<'a> {
/// Creates a new `FulltextIndexApplierBuilder`.
pub fn new(
region_dir: String,
region_id: RegionId,
store: ObjectStore,
puffin_manager_factory: PuffinManagerFactory,
metadata: &'a RegionMetadata,
) -> Self {
Self {
region_dir,
region_id,
store,
puffin_manager_factory,
metadata,
file_cache: None,
puffin_metadata_cache: None,
}
}
/// Sets the file cache to be used by the `FulltextIndexApplier`.
pub fn with_file_cache(mut self, file_cache: Option<FileCacheRef>) -> Self {
self.file_cache = file_cache;
self
}
/// Sets the puffin metadata cache to be used by the `FulltextIndexApplier`.
pub fn with_puffin_metadata_cache(
mut self,
puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
) -> Self {
self.puffin_metadata_cache = puffin_metadata_cache;
self
}
/// Builds `SstIndexApplier` from the given expressions.
pub fn build(self, exprs: &[Expr]) -> Result<Option<FulltextIndexApplier>> {
let mut queries = Vec::with_capacity(exprs.len());
@@ -82,13 +58,10 @@ impl<'a> FulltextIndexApplierBuilder<'a> {
Ok((!queries.is_empty()).then(|| {
FulltextIndexApplier::new(
self.region_dir,
self.region_id,
self.store,
queries,
self.puffin_manager_factory,
)
.with_file_cache(self.file_cache)
.with_puffin_metadata_cache(self.puffin_metadata_cache)
}))
}

View File

@@ -350,11 +350,11 @@ mod tests {
use store_api::storage::{ConcreteDataType, RegionId};
use super::*;
use crate::access_layer::RegionFilePathFactory;
use crate::read::{Batch, BatchColumn};
use crate::sst::file::FileId;
use crate::sst::index::fulltext_index::applier::FulltextIndexApplier;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
use crate::sst::location;
fn mock_object_store() -> ObjectStore {
ObjectStore::new(Memory::default()).unwrap().finish()
@@ -494,6 +494,7 @@ mod tests {
let (d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
let region_dir = "region0".to_string();
let sst_file_id = FileId::random();
let file_path = location::index_file_path(&region_dir, sst_file_id);
let object_store = mock_object_store();
let region_metadata = mock_region_metadata();
let intm_mgr = new_intm_mgr(d.path().to_string_lossy()).await;
@@ -513,11 +514,8 @@ mod tests {
let mut batch = new_batch(rows);
indexer.update(&mut batch).await.unwrap();
let puffin_manager = factory.build(
object_store.clone(),
RegionFilePathFactory::new(region_dir.clone()),
);
let mut writer = puffin_manager.writer(&sst_file_id).await.unwrap();
let puffin_manager = factory.build(object_store.clone());
let mut writer = puffin_manager.writer(&file_path).await.unwrap();
let _ = indexer.finish(&mut writer).await.unwrap();
writer.finish().await.unwrap();
@@ -525,7 +523,6 @@ mod tests {
let _d = &d;
let applier = FulltextIndexApplier::new(
region_dir.clone(),
region_metadata.region_id,
object_store.clone(),
queries
.into_iter()
@@ -534,7 +531,7 @@ mod tests {
factory.clone(),
);
async move { applier.apply(sst_file_id, None).await.unwrap() }.boxed()
async move { applier.apply(sst_file_id).await.unwrap() }.boxed()
}
}

View File

@@ -62,7 +62,7 @@ impl Indexer {
async fn build_puffin_writer(&mut self) -> Option<SstPuffinWriter> {
let puffin_manager = self.puffin_manager.take()?;
let err = match puffin_manager.writer(&self.file_id).await {
let err = match puffin_manager.writer(&self.file_path).await {
Ok(writer) => return Some(writer),
Err(err) => err,
};

View File

@@ -28,7 +28,6 @@ use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader};
use snafu::ResultExt;
use store_api::storage::RegionId;
use crate::access_layer::{RegionFilePathFactory, WriteCachePathProvider};
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
use crate::cache::index::inverted_index::{CachedInvertedIndexBlobReader, InvertedIndexCacheRef};
use crate::error::{
@@ -39,6 +38,7 @@ use crate::sst::file::FileId;
use crate::sst::index::inverted_index::INDEX_BLOB_TYPE;
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
use crate::sst::index::TYPE_INVERTED_INDEX;
use crate::sst::location;
/// `InvertedIndexApplier` is responsible for applying predicates to the provided SST files
/// and returning the relevant row group ids for further scan.
@@ -172,14 +172,12 @@ impl InvertedIndexApplier {
return Ok(None);
};
let puffin_manager = self.puffin_manager_factory.build(
file_cache.local_store(),
WriteCachePathProvider::new(self.region_id, file_cache.clone()),
);
let puffin_manager = self.puffin_manager_factory.build(file_cache.local_store());
let puffin_file_name = file_cache.cache_file_path(index_key);
// Adds file size hint to the puffin reader to avoid extra metadata read.
let reader = puffin_manager
.reader(&file_id)
.reader(&puffin_file_name)
.await
.context(PuffinBuildReaderSnafu)?
.with_file_size_hint(file_size_hint)
@@ -200,14 +198,12 @@ impl InvertedIndexApplier {
) -> Result<BlobReader> {
let puffin_manager = self
.puffin_manager_factory
.build(
self.store.clone(),
RegionFilePathFactory::new(self.region_dir.clone()),
)
.build(self.store.clone())
.with_puffin_metadata_cache(self.puffin_metadata_cache.clone());
let file_path = location::index_file_path(&self.region_dir, file_id);
puffin_manager
.reader(&file_id)
.reader(&file_path)
.await
.context(PuffinBuildReaderSnafu)?
.with_file_size_hint(file_size_hint)
@@ -243,12 +239,10 @@ mod tests {
let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
let file_id = FileId::random();
let region_dir = "region_dir".to_string();
let path = location::index_file_path(&region_dir, file_id);
let puffin_manager = puffin_manager_factory.build(
object_store.clone(),
RegionFilePathFactory::new(region_dir.clone()),
);
let mut writer = puffin_manager.writer(&file_id).await.unwrap();
let puffin_manager = puffin_manager_factory.build(object_store.clone());
let mut writer = puffin_manager.writer(&path).await.unwrap();
writer
.put_blob(INDEX_BLOB_TYPE, Cursor::new(vec![]), Default::default())
.await
@@ -291,12 +285,10 @@ mod tests {
let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
let file_id = FileId::random();
let region_dir = "region_dir".to_string();
let path = location::index_file_path(&region_dir, file_id);
let puffin_manager = puffin_manager_factory.build(
object_store.clone(),
RegionFilePathFactory::new(region_dir.clone()),
);
let mut writer = puffin_manager.writer(&file_id).await.unwrap();
let puffin_manager = puffin_manager_factory.build(object_store.clone());
let mut writer = puffin_manager.writer(&path).await.unwrap();
writer
.put_blob("invalid_blob_type", Cursor::new(vec![]), Default::default())
.await

View File

@@ -336,13 +336,13 @@ mod tests {
use store_api::storage::RegionId;
use super::*;
use crate::access_layer::RegionFilePathFactory;
use crate::cache::index::inverted_index::InvertedIndexCache;
use crate::metrics::CACHE_BYTES;
use crate::read::BatchColumn;
use crate::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt};
use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
use crate::sst::location;
fn mock_object_store() -> ObjectStore {
ObjectStore::new(Memory::default()).unwrap().finish()
@@ -438,6 +438,7 @@ mod tests {
let (d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
let region_dir = "region0".to_string();
let sst_file_id = FileId::random();
let file_path = location::index_file_path(&region_dir, sst_file_id);
let object_store = mock_object_store();
let region_metadata = mock_region_metadata();
let intm_mgr = new_intm_mgr(d.path().to_string_lossy()).await;
@@ -459,11 +460,8 @@ mod tests {
creator.update(&mut batch).await.unwrap();
}
let puffin_manager = factory.build(
object_store.clone(),
RegionFilePathFactory::new(region_dir.clone()),
);
let mut writer = puffin_manager.writer(&sst_file_id).await.unwrap();
let puffin_manager = factory.build(object_store.clone());
let mut writer = puffin_manager.writer(&file_path).await.unwrap();
let (row_count, _) = creator.finish(&mut writer).await.unwrap();
assert_eq!(row_count, rows.len() * segment_row_count);
writer.finish().await.unwrap();

View File

@@ -26,20 +26,18 @@ use puffin::puffin_manager::stager::{BoundedStager, Stager};
use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader};
use snafu::ResultExt;
use crate::access_layer::FilePathProvider;
use crate::error::{PuffinInitStagerSnafu, PuffinPurgeStagerSnafu, Result};
use crate::metrics::{
StagerMetrics, INDEX_PUFFIN_FLUSH_OP_TOTAL, INDEX_PUFFIN_READ_BYTES_TOTAL,
INDEX_PUFFIN_READ_OP_TOTAL, INDEX_PUFFIN_WRITE_BYTES_TOTAL, INDEX_PUFFIN_WRITE_OP_TOTAL,
};
use crate::sst::file::FileId;
use crate::sst::index::store::{self, InstrumentedStore};
type InstrumentedRangeReader = store::InstrumentedRangeReader<'static>;
type InstrumentedAsyncWrite = store::InstrumentedAsyncWrite<'static, FuturesAsyncWriter>;
pub(crate) type SstPuffinManager =
FsPuffinManager<Arc<BoundedStager<FileId>>, ObjectStorePuffinFileAccessor>;
FsPuffinManager<Arc<BoundedStager>, ObjectStorePuffinFileAccessor>;
pub(crate) type SstPuffinReader = <SstPuffinManager as PuffinManager>::Reader;
pub(crate) type SstPuffinWriter = <SstPuffinManager as PuffinManager>::Writer;
pub(crate) type SstPuffinBlob = <SstPuffinReader as PuffinReader>::Blob;
@@ -52,7 +50,7 @@ const STAGING_DIR: &str = "staging";
#[derive(Clone)]
pub struct PuffinManagerFactory {
/// The stager used by the puffin manager.
stager: Arc<BoundedStager<FileId>>,
stager: Arc<BoundedStager>,
/// The size of the write buffer used to create object store.
write_buffer_size: Option<usize>,
@@ -81,20 +79,15 @@ impl PuffinManagerFactory {
})
}
pub(crate) fn build(
&self,
store: ObjectStore,
path_provider: impl FilePathProvider + 'static,
) -> SstPuffinManager {
pub(crate) fn build(&self, store: ObjectStore) -> SstPuffinManager {
let store = InstrumentedStore::new(store).with_write_buffer_size(self.write_buffer_size);
let puffin_file_accessor =
ObjectStorePuffinFileAccessor::new(store, Arc::new(path_provider));
let puffin_file_accessor = ObjectStorePuffinFileAccessor::new(store);
SstPuffinManager::new(self.stager.clone(), puffin_file_accessor)
}
pub(crate) async fn purge_stager(&self, file_id: FileId) -> Result<()> {
pub(crate) async fn purge_stager(&self, puffin_file_name: &str) -> Result<()> {
self.stager
.purge(&file_id)
.purge(puffin_file_name)
.await
.context(PuffinPurgeStagerSnafu)
}
@@ -126,15 +119,11 @@ impl PuffinManagerFactory {
#[derive(Clone)]
pub(crate) struct ObjectStorePuffinFileAccessor {
object_store: InstrumentedStore,
path_provider: Arc<dyn FilePathProvider>,
}
impl ObjectStorePuffinFileAccessor {
pub fn new(object_store: InstrumentedStore, path_provider: Arc<dyn FilePathProvider>) -> Self {
Self {
object_store,
path_provider,
}
pub fn new(object_store: InstrumentedStore) -> Self {
Self { object_store }
}
}
@@ -142,13 +131,11 @@ impl ObjectStorePuffinFileAccessor {
impl PuffinFileAccessor for ObjectStorePuffinFileAccessor {
type Reader = InstrumentedRangeReader;
type Writer = InstrumentedAsyncWrite;
type FileHandle = FileId;
async fn reader(&self, handle: &FileId) -> PuffinResult<Self::Reader> {
let file_path = self.path_provider.build_index_file_path(*handle);
async fn reader(&self, puffin_file_name: &str) -> PuffinResult<Self::Reader> {
self.object_store
.range_reader(
&file_path,
puffin_file_name,
&INDEX_PUFFIN_READ_BYTES_TOTAL,
&INDEX_PUFFIN_READ_OP_TOTAL,
)
@@ -157,11 +144,10 @@ impl PuffinFileAccessor for ObjectStorePuffinFileAccessor {
.context(puffin_error::ExternalSnafu)
}
async fn writer(&self, handle: &FileId) -> PuffinResult<Self::Writer> {
let file_path = self.path_provider.build_index_file_path(*handle);
async fn writer(&self, puffin_file_name: &str) -> PuffinResult<Self::Writer> {
self.object_store
.writer(
&file_path,
puffin_file_name,
&INDEX_PUFFIN_WRITE_BYTES_TOTAL,
&INDEX_PUFFIN_WRITE_OP_TOTAL,
&INDEX_PUFFIN_FLUSH_OP_TOTAL,
@@ -183,32 +169,20 @@ mod tests {
use super::*;
struct TestFilePathProvider;
impl FilePathProvider for TestFilePathProvider {
fn build_index_file_path(&self, file_id: FileId) -> String {
file_id.to_string()
}
fn build_sst_file_path(&self, file_id: FileId) -> String {
file_id.to_string()
}
}
#[tokio::test]
async fn test_puffin_manager_factory() {
let (_dir, factory) =
PuffinManagerFactory::new_for_test_async("test_puffin_manager_factory_").await;
let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
let manager = factory.build(object_store, TestFilePathProvider);
let manager = factory.build(object_store);
let file_id = FileId::random();
let file_name = "my-puffin-file";
let blob_key = "blob-key";
let dir_key = "dir-key";
let raw_data = b"hello world!";
let mut writer = manager.writer(&file_id).await.unwrap();
let mut writer = manager.writer(file_name).await.unwrap();
writer
.put_blob(blob_key, Cursor::new(raw_data), PutOptions::default())
.await
@@ -229,7 +203,7 @@ mod tests {
.unwrap();
writer.finish().await.unwrap();
let reader = manager.reader(&file_id).await.unwrap();
let reader = manager.reader(file_name).await.unwrap();
let blob_guard = reader.blob(blob_key).await.unwrap();
let blob_reader = blob_guard.reader().await.unwrap();
let meta = blob_reader.metadata().await.unwrap();

Some files were not shown because too many files have changed in this diff.