Mirror of https://github.com/GreptimeTeam/greptimedb.git, synced 2025-12-24 07:00:00 +00:00

Compare commits: feat/index ... feature/df (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | ef80503454 | |
| | 30ca2d7652 | |
.github/workflows/release.yml (vendored, 32 changes)
@@ -49,9 +49,14 @@ on:
description: Do not run integration tests during the build
type: boolean
default: true
build_linux_artifacts:
build_linux_amd64_artifacts:
type: boolean
description: Build linux artifacts (both amd64 and arm64)
description: Build linux-amd64 artifacts
required: false
default: false
build_linux_arm64_artifacts:
type: boolean
description: Build linux-arm64 artifacts
required: false
default: false
build_macos_artifacts:
@@ -139,7 +144,7 @@ jobs:
./.github/scripts/check-version.sh "${{ steps.create-version.outputs.version }}"

- name: Allocate linux-amd64 runner
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
uses: ./.github/actions/start-runner
id: start-linux-amd64-runner
with:
@@ -153,7 +158,7 @@ jobs:
subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }}

- name: Allocate linux-arm64 runner
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
uses: ./.github/actions/start-runner
id: start-linux-arm64-runner
with:
@@ -168,7 +173,7 @@ jobs:

build-linux-amd64-artifacts:
name: Build linux-amd64 artifacts
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
allocate-runners,
]
@@ -190,7 +195,7 @@ jobs:

build-linux-arm64-artifacts:
name: Build linux-arm64 artifacts
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
allocate-runners,
]
@@ -212,7 +217,7 @@ jobs:

run-multi-lang-tests:
name: Run Multi-language SDK Tests
if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [
allocate-runners,
build-linux-amd64-artifacts,
@@ -381,18 +386,7 @@ jobs:

publish-github-release:
name: Create GitHub release and upload artifacts
# Use always() to run even when optional jobs (macos, windows) are skipped.
# Then check that required jobs succeeded and optional jobs didn't fail.
if: |
always() &&
(inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule') &&
needs.allocate-runners.result == 'success' &&
(needs.build-linux-amd64-artifacts.result == 'success' || needs.build-linux-amd64-artifacts.result == 'skipped') &&
(needs.build-linux-arm64-artifacts.result == 'success' || needs.build-linux-arm64-artifacts.result == 'skipped') &&
(needs.build-macos-artifacts.result == 'success' || needs.build-macos-artifacts.result == 'skipped') &&
(needs.build-windows-artifacts.result == 'success' || needs.build-windows-artifacts.result == 'skipped') &&
(needs.release-images-to-dockerhub.result == 'success' || needs.release-images-to-dockerhub.result == 'skipped') &&
(needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped')
if: ${{ inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule' }}
needs: [ # The job have to wait for all the artifacts are built.
allocate-runners,
build-linux-amd64-artifacts,
AUTHOR.md (64 changes)
@@ -2,41 +2,41 @@

## Individual Committers (in alphabetical order)

- [apdong2022](https://github.com/apdong2022)
- [beryl678](https://github.com/beryl678)
- [CookiePieWw](https://github.com/CookiePieWw)
- [etolbakov](https://github.com/etolbakov)
- [irenjj](https://github.com/irenjj)
- [KKould](https://github.com/KKould)
- [Lanqing Yang](https://github.com/lyang24)
- [nicecui](https://github.com/nicecui)
- [NiwakaDev](https://github.com/NiwakaDev)
- [paomian](https://github.com/paomian)
- [tisonkun](https://github.com/tisonkun)
- [Wenjie0329](https://github.com/Wenjie0329)
- [zhaoyingnan01](https://github.com/zhaoyingnan01)
- [zhongzc](https://github.com/zhongzc)
- [ZonaHex](https://github.com/ZonaHex)
- [zyy17](https://github.com/zyy17)
* [CookiePieWw](https://github.com/CookiePieWw)
* [etolbakov](https://github.com/etolbakov)
* [irenjj](https://github.com/irenjj)
* [KKould](https://github.com/KKould)
* [Lanqing Yang](https://github.com/lyang24)
* [NiwakaDev](https://github.com/NiwakaDev)
* [tisonkun](https://github.com/tisonkun)

## Team Members (in alphabetical order)

- [daviderli614](https://github.com/daviderli614)
- [discord9](https://github.com/discord9)
- [evenyag](https://github.com/evenyag)
- [fengjiachun](https://github.com/fengjiachun)
- [fengys1996](https://github.com/fengys1996)
- [GrepTime](https://github.com/GrepTime)
- [holalengyu](https://github.com/holalengyu)
- [killme2008](https://github.com/killme2008)
- [MichaelScofield](https://github.com/MichaelScofield)
- [shuiyisong](https://github.com/shuiyisong)
- [sunchanglong](https://github.com/sunchanglong)
- [sunng87](https://github.com/sunng87)
- [v0y4g3r](https://github.com/v0y4g3r)
- [waynexia](https://github.com/waynexia)
- [WenyXu](https://github.com/WenyXu)
- [xtang](https://github.com/xtang)
* [apdong2022](https://github.com/apdong2022)
* [beryl678](https://github.com/beryl678)
* [daviderli614](https://github.com/daviderli614)
* [discord9](https://github.com/discord9)
* [evenyag](https://github.com/evenyag)
* [fengjiachun](https://github.com/fengjiachun)
* [fengys1996](https://github.com/fengys1996)
* [GrepTime](https://github.com/GrepTime)
* [holalengyu](https://github.com/holalengyu)
* [killme2008](https://github.com/killme2008)
* [MichaelScofield](https://github.com/MichaelScofield)
* [nicecui](https://github.com/nicecui)
* [paomian](https://github.com/paomian)
* [shuiyisong](https://github.com/shuiyisong)
* [sunchanglong](https://github.com/sunchanglong)
* [sunng87](https://github.com/sunng87)
* [v0y4g3r](https://github.com/v0y4g3r)
* [waynexia](https://github.com/waynexia)
* [Wenjie0329](https://github.com/Wenjie0329)
* [WenyXu](https://github.com/WenyXu)
* [xtang](https://github.com/xtang)
* [zhaoyingnan01](https://github.com/zhaoyingnan01)
* [zhongzc](https://github.com/zhongzc)
* [ZonaHex](https://github.com/ZonaHex)
* [zyy17](https://github.com/zyy17)

## All Contributors
Cargo.lock (generated, 68 changes)
@@ -3274,7 +3274,7 @@ dependencies = [
[[package]]
name = "datafusion"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"arrow-ipc",
@@ -3329,7 +3329,7 @@ dependencies = [
[[package]]
name = "datafusion-catalog"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3353,7 +3353,7 @@ dependencies = [
[[package]]
name = "datafusion-catalog-listing"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3375,7 +3375,7 @@ dependencies = [
[[package]]
name = "datafusion-common"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"ahash 0.8.12",
"arrow",
@@ -3398,7 +3398,7 @@ dependencies = [
[[package]]
name = "datafusion-common-runtime"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"futures",
"log",
@@ -3408,7 +3408,7 @@ dependencies = [
[[package]]
name = "datafusion-datasource"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-compression 0.4.19",
@@ -3442,7 +3442,7 @@ dependencies = [
[[package]]
name = "datafusion-datasource-csv"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3464,7 +3464,7 @@ dependencies = [
[[package]]
name = "datafusion-datasource-json"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3485,7 +3485,7 @@ dependencies = [
[[package]]
name = "datafusion-datasource-parquet"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3514,12 +3514,12 @@ dependencies = [
[[package]]
name = "datafusion-doc"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"

[[package]]
name = "datafusion-execution"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3538,7 +3538,7 @@ dependencies = [
[[package]]
name = "datafusion-expr"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3560,7 +3560,7 @@ dependencies = [
[[package]]
name = "datafusion-expr-common"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"datafusion-common",
@@ -3572,7 +3572,7 @@ dependencies = [
[[package]]
name = "datafusion-functions"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"arrow-buffer",
@@ -3600,7 +3600,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-aggregate"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"ahash 0.8.12",
"arrow",
@@ -3620,7 +3620,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-aggregate-common"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"ahash 0.8.12",
"arrow",
@@ -3632,7 +3632,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-nested"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"arrow-ord",
@@ -3654,7 +3654,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-table"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"async-trait",
@@ -3669,7 +3669,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-window"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"datafusion-common",
@@ -3686,7 +3686,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-window-common"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"datafusion-common",
"datafusion-physical-expr-common",
@@ -3695,7 +3695,7 @@ dependencies = [
[[package]]
name = "datafusion-macros"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"datafusion-doc",
"quote",
@@ -3705,7 +3705,7 @@ dependencies = [
[[package]]
name = "datafusion-optimizer"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"chrono",
@@ -3756,7 +3756,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"ahash 0.8.12",
"arrow",
@@ -3777,7 +3777,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr-adapter"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"datafusion-common",
@@ -3791,7 +3791,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr-common"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"ahash 0.8.12",
"arrow",
@@ -3804,7 +3804,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-optimizer"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"datafusion-common",
@@ -3822,7 +3822,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-plan"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"ahash 0.8.12",
"arrow",
@@ -3852,7 +3852,7 @@ dependencies = [
[[package]]
name = "datafusion-pruning"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"datafusion-common",
@@ -3868,7 +3868,7 @@ dependencies = [
[[package]]
name = "datafusion-session"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"async-trait",
"datafusion-common",
@@ -3881,7 +3881,7 @@ dependencies = [
[[package]]
name = "datafusion-sql"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"arrow",
"bigdecimal 0.4.8",
@@ -3898,7 +3898,7 @@ dependencies = [
[[package]]
name = "datafusion-substrait"
version = "50.1.0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
dependencies = [
"async-recursion",
"async-trait",
@@ -7514,11 +7514,9 @@ dependencies = [
"common-test-util",
"common-time",
"common-wal",
"criterion 0.4.0",
"datafusion",
"datatypes",
"futures-util",
"fxhash",
"humantime-serde",
"itertools 0.14.0",
"lazy_static",
@@ -9203,9 +9201,9 @@ dependencies = [

[[package]]
name = "pgwire"
version = "0.36.3"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99"
checksum = "d331bb0eef5bc83a221c0a85b1f205bccf094d4f72a26ae1d68a1b1c535123b7"
dependencies = [
"async-trait",
"base64 0.22.1",
Cargo.toml (24 changes)
@@ -316,18 +316,18 @@ git = "https://github.com/GreptimeTeam/greptime-meter.git"
rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"

[patch.crates-io]
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"

[profile.release]
@@ -294,6 +294,7 @@
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -456,6 +457,7 @@
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -627,6 +629,7 @@
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -99,6 +99,9 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"

## Heartbeat timeout.
heartbeat_timeout = "500ms"

## DDL timeout.
ddl_timeout = "10s"

@@ -78,6 +78,9 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"

## Heartbeat timeout.
heartbeat_timeout = "500ms"

## DDL timeout.
ddl_timeout = "10s"

@@ -226,6 +226,9 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout.
timeout = "3s"

## Heartbeat timeout.
heartbeat_timeout = "500ms"

## DDL timeout.
ddl_timeout = "10s"
@@ -163,7 +163,7 @@ impl ObjbenchCommand {
available_indexes: Default::default(),
indexes: Default::default(),
index_file_size: 0,
index_version: 0,
index_file_id: None,
num_rows,
num_row_groups,
sequence: None,
@@ -565,7 +565,6 @@ fn new_noop_file_purger() -> FilePurgerRef {
struct Noop;
impl FilePurger for Noop {
fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {}
fn update_index(&self, _file_meta: FileMeta, _version: store_api::storage::IndexVersion) {}
}
Arc::new(Noop)
}
@@ -52,6 +52,7 @@ fn test_load_datanode_example_config() {
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1),
tcp_nodelay: true,
@@ -117,6 +118,7 @@ fn test_load_frontend_example_config() {
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1),
tcp_nodelay: true,
@@ -239,6 +241,7 @@ fn test_load_flownode_example_config() {
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1),
tcp_nodelay: true,
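The Duration values asserted in these config tests line up with the humantime-style strings in the example TOML above ("3s", "500ms", "10s", "1s"). A minimal sketch of that mapping, calling the humantime crate directly; this is an illustration only, the tests themselves go through the config loader rather than humantime:

```rust
use std::time::Duration;

// Each config string parses to the Duration asserted in the tests above.
fn main() {
    assert_eq!(humantime::parse_duration("3s").unwrap(), Duration::from_secs(3));
    assert_eq!(humantime::parse_duration("500ms").unwrap(), Duration::from_millis(500));
    assert_eq!(humantime::parse_duration("10s").unwrap(), Duration::from_secs(10));
    assert_eq!(humantime::parse_duration("1s").unwrap(), Duration::from_secs(1));
}
```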
@@ -46,16 +46,13 @@ pub struct DoPutResponse {
request_id: i64,
/// The successfully ingested rows number.
affected_rows: AffectedRows,
/// The elapsed time in seconds for handling the bulk insert.
elapsed_secs: f64,
}

impl DoPutResponse {
pub fn new(request_id: i64, affected_rows: AffectedRows, elapsed_secs: f64) -> Self {
pub fn new(request_id: i64, affected_rows: AffectedRows) -> Self {
Self {
request_id,
affected_rows,
elapsed_secs,
}
}

@@ -66,10 +63,6 @@ impl DoPutResponse {
pub fn affected_rows(&self) -> AffectedRows {
self.affected_rows
}

pub fn elapsed_secs(&self) -> f64 {
self.elapsed_secs
}
}

impl TryFrom<PutResult> for DoPutResponse {
@@ -93,11 +86,8 @@ mod tests {

#[test]
fn test_serde_do_put_response() {
let x = DoPutResponse::new(42, 88, 0.123);
let x = DoPutResponse::new(42, 88);
let serialized = serde_json::to_string(&x).unwrap();
assert_eq!(
serialized,
r#"{"request_id":42,"affected_rows":88,"elapsed_secs":0.123}"#
);
assert_eq!(serialized, r#"{"request_id":42,"affected_rows":88}"#);
}
}
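After this change the serialized acknowledgement carries only request_id and affected_rows. A minimal round-trip sketch with a stand-in struct; the real DoPutResponse lives in common-grpc, and the type below is hypothetical, mirroring only the field names shown in the hunk above:

```rust
use serde::{Deserialize, Serialize};

// Hypothetical stand-in mirroring the trimmed-down response shape above.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct PutAck {
    request_id: i64,
    affected_rows: usize,
}

fn main() {
    let ack = PutAck { request_id: 42, affected_rows: 88 };
    let json = serde_json::to_string(&ack).unwrap();
    // Matches the updated test expectation in the hunk above.
    assert_eq!(json, r#"{"request_id":42,"affected_rows":88}"#);
    let back: PutAck = serde_json::from_str(&json).unwrap();
    assert_eq!(back, ack);
}
```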
@@ -41,17 +41,6 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30;
/// In a lease, there are two opportunities for renewal.
pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2;

/// The timeout of the heartbeat request.
pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);

/// The keep-alive interval of the heartbeat channel.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration =
Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);

/// The keep-alive timeout of the heartbeat channel.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration =
Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);

/// The default mailbox round-trip timeout.
pub const MAILBOX_RTT_SECS: u64 = 1;
@@ -1261,6 +1261,7 @@ impl RegionServerInner {
.with_context(|_| HandleRegionRequestSnafu { region_id })?
.new_opened_logical_region_ids()
else {
warn!("No new opened logical regions");
return Ok(());
};
@@ -24,8 +24,8 @@ use common_query::Output;
use common_runtime::Runtime;
use common_runtime::runtime::{BuilderBuild, RuntimeTrait};
use datafusion::catalog::TableFunction;
use datafusion::dataframe::DataFrame;
use datafusion_expr::{AggregateUDF, LogicalPlan};
use query::dataframe::DataFrame;
use query::planner::LogicalPlanner;
use query::query_engine::{DescribeResult, QueryEngineState};
use query::{QueryEngine, QueryEngineContext};
@@ -12,9 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::pin::Pin;
use std::sync::Arc;
use std::time::Instant;

use api::helper::from_pb_time_ranges;
use api::v1::ddl_request::{Expr as DdlExpr, Expr};
@@ -24,18 +22,16 @@ use api::v1::{
DeleteRequests, DropFlowExpr, InsertIntoPlan, InsertRequests, RowDeleteRequests,
RowInsertRequests,
};
use async_stream::try_stream;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_base::AffectedRows;
use common_error::ext::BoxedError;
use common_grpc::flight::do_put::DoPutResponse;
use common_grpc::FlightData;
use common_grpc::flight::FlightDecoder;
use common_query::Output;
use common_query::logical_plan::add_insert_to_logical_plan;
use common_telemetry::tracing::{self};
use datafusion::datasource::DefaultTableSource;
use futures::Stream;
use futures::stream::StreamExt;
use query::parser::PromQuery;
use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
use servers::query_handler::grpc::GrpcQueryHandler;
@@ -244,8 +240,10 @@ impl GrpcQueryHandler for Instance {

async fn put_record_batch(
&self,
request: servers::grpc::flight::PutRecordBatchRequest,
table_name: &TableName,
table_ref: &mut Option<TableRef>,
decoder: &mut FlightDecoder,
data: FlightData,
ctx: QueryContextRef,
) -> Result<AffectedRows> {
let table = if let Some(table) = table_ref {
@@ -254,15 +252,15 @@ impl GrpcQueryHandler for Instance {
let table = self
.catalog_manager()
.table(
&request.table_name.catalog_name,
&request.table_name.schema_name,
&request.table_name.table_name,
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: request.table_name.to_string(),
table_name: table_name.to_string(),
})?;
*table_ref = Some(table.clone());
table
@@ -281,77 +279,10 @@ impl GrpcQueryHandler for Instance {
// do we check limit for bulk insert?

self.inserter
.handle_bulk_insert(
table,
request.flight_data,
request.record_batch,
request.schema_bytes,
)
.handle_bulk_insert(table, decoder, data)
.await
.context(TableOperationSnafu)
}

fn handle_put_record_batch_stream(
&self,
mut stream: servers::grpc::flight::PutRecordBatchRequestStream,
ctx: QueryContextRef,
) -> Pin<Box<dyn Stream<Item = Result<DoPutResponse>> + Send>> {
// Resolve table once for the stream
// Clone all necessary data to make it 'static
let catalog_manager = self.catalog_manager().clone();
let plugins = self.plugins.clone();
let inserter = self.inserter.clone();
let table_name = stream.table_name().clone();
let ctx = ctx.clone();

Box::pin(try_stream! {
plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::BulkInsert)
.context(PermissionSnafu)?;
// Cache for resolved table reference - resolve once and reuse
let table_ref = catalog_manager
.table(
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: table_name.to_string(),
})?;

// Check permissions once for the stream
let interceptor_ref = plugins.get::<GrpcQueryInterceptorRef<Error>>();
let interceptor = interceptor_ref.as_ref();
interceptor.pre_bulk_insert(table_ref.clone(), ctx.clone())?;

// Process each request in the stream
while let Some(request_result) = stream.next().await {
let request = request_result.map_err(|e| {
let error_msg = format!("Stream error: {:?}", e);
IncompleteGrpcRequestSnafu { err_msg: error_msg }.build()
})?;

let request_id = request.request_id;
let start = Instant::now();
let rows = inserter
.handle_bulk_insert(
table_ref.clone(),
request.flight_data,
request.record_batch,
request.schema_bytes,
)
.await
.context(TableOperationSnafu)?;
let elapsed_secs = start.elapsed().as_secs_f64();
yield DoPutResponse::new(request_id, rows, elapsed_secs);
}
})
}
}

fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryContextRef) {
@@ -136,7 +136,7 @@ impl Instance {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;

let scan_plan = dataframe.into_unoptimized_plan();
let scan_plan = dataframe.into_logical_plan();
let filter_conditions =
PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
.context(PrometheusLabelValuesQueryPlanSnafu)?;
@@ -22,7 +22,6 @@ use common_telemetry::info;
use meta_client::MetaClientOptions;
use servers::error::Error as ServerError;
use servers::grpc::builder::GrpcServerBuilder;
use servers::grpc::flight::FlightCraftRef;
use servers::grpc::frontend_grpc_handler::FrontendGrpcHandler;
use servers::grpc::greptime_handler::GreptimeRequestHandler;
use servers::grpc::{GrpcOptions, GrpcServer};
@@ -53,7 +52,6 @@ where
grpc_server_builder: Option<GrpcServerBuilder>,
http_server_builder: Option<HttpServerBuilder>,
plugins: Plugins,
flight_handler: Option<FlightCraftRef>,
}

impl<T> Services<T>
@@ -67,7 +65,6 @@ where
grpc_server_builder: None,
http_server_builder: None,
plugins,
flight_handler: None,
}
}

@@ -142,13 +139,6 @@ where
}
}

pub fn with_flight_handler(self, flight_handler: FlightCraftRef) -> Self {
Self {
flight_handler: Some(flight_handler),
..self
}
}

fn build_grpc_server(
&mut self,
grpc: &GrpcOptions,
@@ -183,12 +173,6 @@ where
grpc.flight_compression,
);

// Use custom flight handler if provided, otherwise use the default GreptimeRequestHandler
let flight_handler = self
.flight_handler
.clone()
.unwrap_or_else(|| Arc::new(greptime_request_handler.clone()) as FlightCraftRef);

let grpc_server = builder
.name(name)
.database_handler(greptime_request_handler.clone())
@@ -197,7 +181,7 @@
self.instance.clone(),
user_provider.clone(),
))
.flight_handler(flight_handler);
.flight_handler(Arc::new(greptime_request_handler));

let grpc_server = if !external {
let frontend_grpc_handler =
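The lines above use a common fallback pattern: keep an Option of a shared handler and fall back to a default when none was injected. A self-contained sketch of just that pattern; the trait and structs here are stand-ins, not the servers crate's real FlightCraft types:

```rust
use std::sync::Arc;

// Stand-in for a handler trait such as the flight handler above.
trait Handler: Send + Sync {
    fn name(&self) -> &'static str;
}

struct DefaultHandler;
impl Handler for DefaultHandler {
    fn name(&self) -> &'static str { "default" }
}

struct CustomHandler;
impl Handler for CustomHandler {
    fn name(&self) -> &'static str { "custom" }
}

// Use the injected handler if present, otherwise build the default lazily.
fn pick_handler(custom: Option<Arc<dyn Handler>>) -> Arc<dyn Handler> {
    custom.unwrap_or_else(|| Arc::new(DefaultHandler) as Arc<dyn Handler>)
}

fn main() {
    assert_eq!(pick_handler(None).name(), "default");
    let custom: Arc<dyn Handler> = Arc::new(CustomHandler);
    assert_eq!(pick_handler(Some(custom)).name(), "custom");
}
```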
@@ -21,7 +21,7 @@ use itertools::Itertools;

use crate::Bytes;
use crate::bloom_filter::error::Result;
use crate::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};
use crate::bloom_filter::reader::BloomFilterReader;

/// `InListPredicate` contains a list of acceptable values. A value needs to match at least
/// one of the elements (logical OR semantic) for the predicate to be satisfied.
@@ -38,7 +38,7 @@ pub struct BloomFilterApplier {

impl BloomFilterApplier {
pub async fn new(reader: Box<dyn BloomFilterReader + Send>) -> Result<Self> {
let meta = reader.metadata(None).await?;
let meta = reader.metadata().await?;

Ok(Self { reader, meta })
}
@@ -50,7 +50,6 @@ impl BloomFilterApplier {
&mut self,
predicates: &[InListPredicate],
search_ranges: &[Range<usize>],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<Range<usize>>> {
if predicates.is_empty() {
// If no predicates, return empty result
@@ -58,7 +57,7 @@
}

let segments = self.row_ranges_to_segments(search_ranges);
let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments, metrics).await?;
let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments).await?;
let matching_row_ranges = self.find_matching_rows(seg_locations, bloom_filters, predicates);
Ok(intersect_ranges(search_ranges, &matching_row_ranges))
}
@@ -96,7 +95,6 @@ impl BloomFilterApplier {
async fn load_bloom_filters(
&mut self,
segments: &[usize],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<(Vec<(u64, usize)>, Vec<BloomFilter>)> {
let segment_locations = segments
.iter()
@@ -110,10 +108,7 @@
.map(|i| self.meta.bloom_filter_locs[i as usize])
.collect::<Vec<_>>();

let bloom_filters = self
.reader
.bloom_filter_vec(&bloom_filter_locs, metrics)
.await?;
let bloom_filters = self.reader.bloom_filter_vec(&bloom_filter_locs).await?;

Ok((segment_locations, bloom_filters))
}
@@ -427,10 +422,7 @@ mod tests {
];

for (predicates, search_range, expected) in cases {
let result = applier
.search(&predicates, &[search_range], None)
.await
.unwrap();
let result = applier.search(&predicates, &[search_range]).await.unwrap();
assert_eq!(
result, expected,
"Expected {:?}, got {:?}",
@@ -13,7 +13,6 @@
// limitations under the License.

use std::ops::{Range, Rem};
use std::time::{Duration, Instant};

use async_trait::async_trait;
use bytemuck::try_cast_slice;
@@ -35,72 +34,6 @@ const BLOOM_META_LEN_SIZE: u64 = 4;
/// Default prefetch size of bloom filter meta.
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB

/// Metrics for bloom filter read operations.
#[derive(Default, Clone)]
pub struct BloomFilterReadMetrics {
/// Total byte size to read.
pub total_bytes: u64,
/// Total number of ranges to read.
pub total_ranges: usize,
/// Elapsed time to fetch data.
pub fetch_elapsed: Duration,
/// Number of cache hits.
pub cache_hit: usize,
/// Number of cache misses.
pub cache_miss: usize,
}

impl std::fmt::Debug for BloomFilterReadMetrics {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let Self {
total_bytes,
total_ranges,
fetch_elapsed,
cache_hit,
cache_miss,
} = self;

// If both total_bytes and cache_hit are 0, we didn't read anything.
if *total_bytes == 0 && *cache_hit == 0 {
return write!(f, "{{}}");
}
write!(f, "{{")?;

if *total_bytes > 0 {
write!(f, "\"total_bytes\":{}", total_bytes)?;
}
if *cache_hit > 0 {
if *total_bytes > 0 {
write!(f, ", ")?;
}
write!(f, "\"cache_hit\":{}", cache_hit)?;
}

if *total_ranges > 0 {
write!(f, ", \"total_ranges\":{}", total_ranges)?;
}
if !fetch_elapsed.is_zero() {
write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
}
if *cache_miss > 0 {
write!(f, ", \"cache_miss\":{}", cache_miss)?;
}

write!(f, "}}")
}
}

impl BloomFilterReadMetrics {
/// Merges another metrics into this one.
pub fn merge_from(&mut self, other: &Self) {
self.total_bytes += other.total_bytes;
self.total_ranges += other.total_ranges;
self.fetch_elapsed += other.fetch_elapsed;
self.cache_hit += other.cache_hit;
self.cache_miss += other.cache_miss;
}
}

/// Safely converts bytes to Vec<u64> using bytemuck for optimal performance.
/// Faster than chunking and converting each piece individually.
///
@@ -146,33 +79,25 @@ pub fn bytes_to_u64_vec(bytes: &Bytes) -> Vec<u64> {
#[async_trait]
pub trait BloomFilterReader: Sync {
/// Reads range of bytes from the file.
async fn range_read(
&self,
offset: u64,
size: u32,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Bytes>;
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes>;

/// Reads bunch of ranges from the file.
async fn read_vec(
&self,
ranges: &[Range<u64>],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<Bytes>>;
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
let mut results = Vec::with_capacity(ranges.len());
for range in ranges {
let size = (range.end - range.start) as u32;
let data = self.range_read(range.start, size).await?;
results.push(data);
}
Ok(results)
}

/// Reads the meta information of the bloom filter.
async fn metadata(
&self,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilterMeta>;
async fn metadata(&self) -> Result<BloomFilterMeta>;

/// Reads a bloom filter with the given location.
async fn bloom_filter(
&self,
loc: &BloomFilterLoc,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilter> {
let bytes = self.range_read(loc.offset, loc.size as _, metrics).await?;
async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result<BloomFilter> {
let bytes = self.range_read(loc.offset, loc.size as _).await?;
let vec = bytes_to_u64_vec(&bytes);
let bm = BloomFilter::from_vec(vec)
.seed(&SEED)
@@ -180,16 +105,12 @@ pub trait BloomFilterReader: Sync {
Ok(bm)
}

async fn bloom_filter_vec(
&self,
locs: &[BloomFilterLoc],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<BloomFilter>> {
async fn bloom_filter_vec(&self, locs: &[BloomFilterLoc]) -> Result<Vec<BloomFilter>> {
let ranges = locs
.iter()
.map(|l| l.offset..l.offset + l.size)
.collect::<Vec<_>>();
let bss = self.read_vec(&ranges, metrics).await?;
let bss = self.read_vec(&ranges).await?;

let mut result = Vec::with_capacity(bss.len());
for (bs, loc) in bss.into_iter().zip(locs.iter()) {
@@ -219,59 +140,24 @@ impl<R: RangeReader> BloomFilterReaderImpl<R> {

#[async_trait]
impl<R: RangeReader> BloomFilterReader for BloomFilterReaderImpl<R> {
async fn range_read(
&self,
offset: u64,
size: u32,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Bytes> {
let start = metrics.as_ref().map(|_| Instant::now());
let result = self
.reader
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
self.reader
.read(offset..offset + size as u64)
.await
.context(IoSnafu)?;

if let Some(m) = metrics {
m.total_ranges += 1;
m.total_bytes += size as u64;
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}

Ok(result)
.context(IoSnafu)
}

async fn read_vec(
&self,
ranges: &[Range<u64>],
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<Vec<Bytes>> {
let start = metrics.as_ref().map(|_| Instant::now());
let result = self.reader.read_vec(ranges).await.context(IoSnafu)?;

if let Some(m) = metrics {
m.total_ranges += ranges.len();
m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}

Ok(result)
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
self.reader.read_vec(ranges).await.context(IoSnafu)
}

async fn metadata(
&self,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilterMeta> {
async fn metadata(&self) -> Result<BloomFilterMeta> {
let metadata = self.reader.metadata().await.context(IoSnafu)?;
let file_size = metadata.content_length;

let mut meta_reader =
BloomFilterMetaReader::new(&self.reader, file_size, Some(DEFAULT_PREFETCH_SIZE));
meta_reader.metadata(metrics).await
meta_reader.metadata().await
}
}

@@ -297,10 +183,7 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
///
/// It will first prefetch some bytes from the end of the file,
/// then parse the metadata from the prefetch bytes.
pub async fn metadata(
&mut self,
metrics: Option<&mut BloomFilterReadMetrics>,
) -> Result<BloomFilterMeta> {
pub async fn metadata(&mut self) -> Result<BloomFilterMeta> {
ensure!(
self.file_size >= BLOOM_META_LEN_SIZE,
FileSizeTooSmallSnafu {
@@ -308,7 +191,6 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
}
);

let start = metrics.as_ref().map(|_| Instant::now());
let meta_start = self.file_size.saturating_sub(self.prefetch_size);
let suffix = self
.reader
@@ -326,28 +208,8 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
.read(metadata_start..self.file_size - BLOOM_META_LEN_SIZE)
.await
.context(IoSnafu)?;

if let Some(m) = metrics {
// suffix read + meta read
m.total_ranges += 2;
// Ignores the meta length size to simplify the calculation.
m.total_bytes += self.file_size.min(self.prefetch_size) + length;
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}

BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
} else {
if let Some(m) = metrics {
// suffix read only
m.total_ranges += 1;
m.total_bytes += self.file_size.min(self.prefetch_size);
if let Some(start) = start {
m.fetch_elapsed += start.elapsed();
}
}

let metadata_start = self.file_size - length - BLOOM_META_LEN_SIZE - meta_start;
let meta = &suffix[metadata_start as usize..suffix_len - BLOOM_META_LEN_SIZE as usize];
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
@@ -428,7 +290,7 @@ mod tests {
for prefetch in [0u64, file_size / 2, file_size, file_size + 10] {
let mut reader =
BloomFilterMetaReader::new(bytes.clone(), file_size as _, Some(prefetch));
let meta = reader.metadata(None).await.unwrap();
let meta = reader.metadata().await.unwrap();

assert_eq!(meta.rows_per_segment, 2);
assert_eq!(meta.segment_count, 2);
@@ -450,11 +312,11 @@ mod tests {
let bytes = mock_bloom_filter_bytes().await;

let reader = BloomFilterReaderImpl::new(bytes);
let meta = reader.metadata(None).await.unwrap();
let meta = reader.metadata().await.unwrap();

assert_eq!(meta.bloom_filter_locs.len(), 2);
let bf = reader
.bloom_filter(&meta.bloom_filter_locs[0], None)
.bloom_filter(&meta.bloom_filter_locs[0])
.await
.unwrap();
assert!(bf.contains(&b"a"));
@@ -463,7 +325,7 @@
assert!(bf.contains(&b"d"));

let bf = reader
.bloom_filter(&meta.bloom_filter_locs[1], None)
.bloom_filter(&meta.bloom_filter_locs[1])
.await
.unwrap();
assert!(bf.contains(&b"e"));
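On the new side of these hunks, read_vec becomes a provided trait method that simply loops over range_read. That shape can be sketched in isolation with a synchronous stand-in trait; the names and types below are illustrative only, not the crate's real BloomFilterReader API:

```rust
use std::ops::Range;

// Simplified stand-ins; the real code uses async methods and its own error type.
type Bytes = Vec<u8>;
type Result<T> = std::result::Result<T, std::io::Error>;

trait RangeBytesReader {
    // The one method implementors must provide.
    fn range_read(&self, offset: u64, size: u32) -> Result<Bytes>;

    // Provided method: a multi-range read built on single-range reads,
    // mirroring the default read_vec shown in the hunk above.
    fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
        let mut results = Vec::with_capacity(ranges.len());
        for range in ranges {
            let size = (range.end - range.start) as u32;
            results.push(self.range_read(range.start, size)?);
        }
        Ok(results)
    }
}

// An in-memory reader is enough to exercise the provided method.
struct MemReader(Vec<u8>);

impl RangeBytesReader for MemReader {
    fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
        let start = offset as usize;
        Ok(self.0[start..start + size as usize].to_vec())
    }
}

fn main() -> Result<()> {
    let reader = MemReader((0u8..16).collect());
    let parts = reader.read_vec(&[0..4, 8..12])?;
    assert_eq!(parts, vec![vec![0, 1, 2, 3], vec![8, 9, 10, 11]]);
    Ok(())
}
```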
@@ -74,7 +74,7 @@ async fn test_search(
writer.finish().await.unwrap();

let reader = puffin_manager.reader(&file_name).await.unwrap();
let (index_dir, _metrics) = reader.dir(&blob_key).await.unwrap();
let index_dir = reader.dir(&blob_key).await.unwrap();
let searcher = TantivyFulltextIndexSearcher::new(index_dir.path(), config).unwrap();
for (query, expected) in query_expected {
let results = searcher.search(query).await.unwrap();
@@ -15,7 +15,6 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
@@ -30,115 +29,37 @@ pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
mod blob;
mod footer;

/// Metrics for inverted index read operations.
#[derive(Default, Clone)]
pub struct InvertedIndexReadMetrics {
    /// Total byte size to read.
    pub total_bytes: u64,
    /// Total number of ranges to read.
    pub total_ranges: usize,
    /// Elapsed time to fetch data.
    pub fetch_elapsed: Duration,
    /// Number of cache hits.
    pub cache_hit: usize,
    /// Number of cache misses.
    pub cache_miss: usize,
}

impl std::fmt::Debug for InvertedIndexReadMetrics {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            total_bytes,
            total_ranges,
            fetch_elapsed,
            cache_hit,
            cache_miss,
        } = self;

        // If both total_bytes and cache_hit are 0, we didn't read anything.
        if *total_bytes == 0 && *cache_hit == 0 {
            return write!(f, "{{}}");
        }
        write!(f, "{{")?;

        if *total_bytes > 0 {
            write!(f, "\"total_bytes\":{}", total_bytes)?;
        }
        if *cache_hit > 0 {
            if *total_bytes > 0 {
                write!(f, ", ")?;
            }
            write!(f, "\"cache_hit\":{}", cache_hit)?;
        }

        if *total_ranges > 0 {
            write!(f, ", \"total_ranges\":{}", total_ranges)?;
        }
        if !fetch_elapsed.is_zero() {
            write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
        }
        if *cache_miss > 0 {
            write!(f, ", \"cache_miss\":{}", cache_miss)?;
        }

        write!(f, "}}")
    }
}

impl InvertedIndexReadMetrics {
    /// Merges another metrics into this one.
    pub fn merge_from(&mut self, other: &Self) {
        self.total_bytes += other.total_bytes;
        self.total_ranges += other.total_ranges;
        self.fetch_elapsed += other.fetch_elapsed;
        self.cache_hit += other.cache_hit;
        self.cache_miss += other.cache_miss;
    }
}
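As context for the type removed above, here is a minimal sketch (not part of the diff; it assumes the `InvertedIndexReadMetrics` struct shown here is in scope and uses illustrative field values) of how per-read metrics could be aggregated with `merge_from`:

```rust
use std::time::Duration;

// Hypothetical aggregation across two reads.
fn example(mut total: InvertedIndexReadMetrics) -> InvertedIndexReadMetrics {
    let first_read = InvertedIndexReadMetrics {
        total_bytes: 4096,
        total_ranges: 2,
        fetch_elapsed: Duration::from_millis(3),
        ..Default::default()
    };
    let second_read = InvertedIndexReadMetrics {
        cache_hit: 1,
        ..Default::default()
    };

    total.merge_from(&first_read);
    total.merge_from(&second_read);
    // The Debug impl above prints only non-zero fields, roughly:
    // {"total_bytes":4096, "cache_hit":1, "total_ranges":2, "fetch_elapsed":"3ms"}
    total
}
```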

/// InvertedIndexReader defines an asynchronous reader of inverted index data
#[mockall::automock]
#[async_trait]
pub trait InvertedIndexReader: Send + Sync {
    /// Seeks to given offset and reads data with exact size as provided.
    async fn range_read<'a>(
        &self,
        offset: u64,
        size: u32,
        metrics: Option<&'a mut InvertedIndexReadMetrics>,
    ) -> Result<Vec<u8>>;
    async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>>;

    /// Reads the bytes in the given ranges.
    async fn read_vec<'a>(
        &self,
        ranges: &[Range<u64>],
        metrics: Option<&'a mut InvertedIndexReadMetrics>,
    ) -> Result<Vec<Bytes>>;
    async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
        let mut result = Vec::with_capacity(ranges.len());
        for range in ranges {
            let data = self
                .range_read(range.start, (range.end - range.start) as u32)
                .await?;
            result.push(Bytes::from(data));
        }
        Ok(result)
    }

    /// Retrieves metadata of all inverted indices stored within the blob.
    async fn metadata<'a>(
        &self,
        metrics: Option<&'a mut InvertedIndexReadMetrics>,
    ) -> Result<Arc<InvertedIndexMetas>>;
    async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>>;

    /// Retrieves the finite state transducer (FST) map from the given offset and size.
    async fn fst<'a>(
        &self,
        offset: u64,
        size: u32,
        metrics: Option<&'a mut InvertedIndexReadMetrics>,
    ) -> Result<FstMap> {
        let fst_data = self.range_read(offset, size, metrics).await?;
    async fn fst(&self, offset: u64, size: u32) -> Result<FstMap> {
        let fst_data = self.range_read(offset, size).await?;
        FstMap::new(fst_data).context(DecodeFstSnafu)
    }

    /// Retrieves the multiple finite state transducer (FST) maps from the given ranges.
    async fn fst_vec<'a>(
        &mut self,
        ranges: &[Range<u64>],
        metrics: Option<&'a mut InvertedIndexReadMetrics>,
    ) -> Result<Vec<FstMap>> {
        self.read_vec(ranges, metrics)
    async fn fst_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<FstMap>> {
        self.read_vec(ranges)
            .await?
            .into_iter()
            .map(|bytes| FstMap::new(bytes.to_vec()).context(DecodeFstSnafu))
@@ -146,28 +67,19 @@ pub trait InvertedIndexReader: Send + Sync {
    }
|
||||
|
||||
/// Retrieves the bitmap from the given offset and size.
|
||||
async fn bitmap<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
bitmap_type: BitmapType,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Bitmap> {
|
||||
self.range_read(offset, size, metrics)
|
||||
.await
|
||||
.and_then(|bytes| {
|
||||
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
|
||||
})
|
||||
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
|
||||
self.range_read(offset, size).await.and_then(|bytes| {
|
||||
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
|
||||
})
|
||||
}
|
||||
|
||||
/// Retrieves the multiple bitmaps from the given ranges.
|
||||
async fn bitmap_deque<'a>(
|
||||
async fn bitmap_deque(
|
||||
&mut self,
|
||||
ranges: &[(Range<u64>, BitmapType)],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<VecDeque<Bitmap>> {
|
||||
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
|
||||
let bytes = self.read_vec(&ranges, metrics).await?;
|
||||
let bytes = self.read_vec(&ranges).await?;
|
||||
bytes
|
||||
.into_iter()
|
||||
.zip(types)
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
@@ -24,10 +23,10 @@ use snafu::{ResultExt, ensure};
|
||||
|
||||
use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu};
|
||||
use crate::inverted_index::format::MIN_BLOB_SIZE;
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
use crate::inverted_index::format::reader::footer::{
|
||||
DEFAULT_PREFETCH_SIZE, InvertedIndexFooterReader,
|
||||
};
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
|
||||
/// Inverted index blob reader, implements [`InvertedIndexReader`]
|
||||
pub struct InvertedIndexBlobReader<R> {
|
||||
@@ -54,58 +53,27 @@ impl<R> InvertedIndexBlobReader<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
|
||||
async fn range_read<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<u8>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>> {
|
||||
let buf = self
|
||||
.source
|
||||
.read(offset..offset + size as u64)
|
||||
.await
|
||||
.context(CommonIoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += size as u64;
|
||||
m.total_ranges += 1;
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(buf.into())
|
||||
}
|
||||
|
||||
async fn read_vec<'a>(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
let result = self.source.read_vec(ranges).await.context(CommonIoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
|
||||
m.total_ranges += ranges.len();
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
self.source.read_vec(ranges).await.context(CommonIoSnafu)
|
||||
}
|
||||
|
||||
async fn metadata<'a>(
|
||||
&self,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Arc<InvertedIndexMetas>> {
|
||||
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>> {
|
||||
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
|
||||
let blob_size = metadata.content_length;
|
||||
Self::validate_blob_size(blob_size)?;
|
||||
|
||||
let mut footer_reader = InvertedIndexFooterReader::new(&self.source, blob_size)
|
||||
.with_prefetch_size(DEFAULT_PREFETCH_SIZE);
|
||||
footer_reader.metadata(metrics).await.map(Arc::new)
|
||||
footer_reader.metadata().await.map(Arc::new)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -205,7 +173,7 @@ mod tests {
|
||||
let blob = create_inverted_index_blob();
|
||||
let blob_reader = InvertedIndexBlobReader::new(blob);
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
assert_eq!(metas.metas.len(), 2);
|
||||
|
||||
let meta0 = metas.metas.get("tag0").unwrap();
|
||||
@@ -232,14 +200,13 @@ mod tests {
|
||||
let blob = create_inverted_index_blob();
|
||||
let blob_reader = InvertedIndexBlobReader::new(blob);
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag0").unwrap();
|
||||
|
||||
let fst_map = blob_reader
|
||||
.fst(
|
||||
meta.base_offset + meta.relative_fst_offset as u64,
|
||||
meta.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -252,7 +219,6 @@ mod tests {
|
||||
.fst(
|
||||
meta.base_offset + meta.relative_fst_offset as u64,
|
||||
meta.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -266,30 +232,30 @@ mod tests {
|
||||
let blob = create_inverted_index_blob();
|
||||
let blob_reader = InvertedIndexBlobReader::new(blob);
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag0").unwrap();
|
||||
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag1").unwrap();
|
||||
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
|
||||
use prost::Message;
|
||||
@@ -25,7 +23,6 @@ use crate::inverted_index::error::{
|
||||
UnexpectedZeroSegmentRowCountSnafu,
|
||||
};
|
||||
use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE;
|
||||
use crate::inverted_index::format::reader::InvertedIndexReadMetrics;
|
||||
|
||||
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB
|
||||
|
||||
@@ -57,17 +54,12 @@ impl<R> InvertedIndexFooterReader<R> {
|
||||
}
|
||||
|
||||
impl<R: RangeReader> InvertedIndexFooterReader<R> {
|
||||
pub async fn metadata(
|
||||
&mut self,
|
||||
mut metrics: Option<&mut InvertedIndexReadMetrics>,
|
||||
) -> Result<InvertedIndexMetas> {
|
||||
pub async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
|
||||
ensure!(
|
||||
self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE,
|
||||
BlobSizeTooSmallSnafu
|
||||
);
|
||||
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
let footer_start = self.blob_size.saturating_sub(self.prefetch_size());
|
||||
let suffix = self
|
||||
.source
|
||||
@@ -81,36 +73,19 @@ impl<R: RangeReader> InvertedIndexFooterReader<R> {
|
||||
let footer_size = FOOTER_PAYLOAD_SIZE_SIZE;
|
||||
|
||||
// Did not fetch the entire file metadata in the initial read, need to make a second request.
|
||||
let result = if length > suffix_len as u64 - footer_size {
|
||||
if length > suffix_len as u64 - footer_size {
|
||||
let metadata_start = self.blob_size - length - footer_size;
|
||||
let meta = self
|
||||
.source
|
||||
.read(metadata_start..self.blob_size - footer_size)
|
||||
.await
|
||||
.context(CommonIoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics.as_deref_mut() {
|
||||
m.total_bytes += self.blob_size.min(self.prefetch_size()) + length;
|
||||
m.total_ranges += 2;
|
||||
}
|
||||
|
||||
self.parse_payload(&meta, length)
|
||||
} else {
|
||||
if let Some(m) = metrics.as_deref_mut() {
|
||||
m.total_bytes += self.blob_size.min(self.prefetch_size());
|
||||
m.total_ranges += 1;
|
||||
}
|
||||
|
||||
let metadata_start = self.blob_size - length - footer_size - footer_start;
|
||||
let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize];
|
||||
self.parse_payload(meta, length)
|
||||
};
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> {
|
||||
@@ -211,7 +186,7 @@ mod tests {
|
||||
reader = reader.with_prefetch_size(prefetch);
|
||||
}
|
||||
|
||||
let metas = reader.metadata(None).await.unwrap();
|
||||
let metas = reader.metadata().await.unwrap();
|
||||
assert_eq!(metas.metas.len(), 1);
|
||||
let index_meta = &metas.metas.get("test").unwrap();
|
||||
assert_eq!(index_meta.name, "test");
|
||||
@@ -235,7 +210,7 @@ mod tests {
|
||||
reader = reader.with_prefetch_size(prefetch);
|
||||
}
|
||||
|
||||
let result = reader.metadata(None).await;
|
||||
let result = reader.metadata().await;
|
||||
assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. }));
|
||||
}
|
||||
}
|
||||
@@ -258,7 +233,7 @@ mod tests {
|
||||
reader = reader.with_prefetch_size(prefetch);
|
||||
}
|
||||
|
||||
let result = reader.metadata(None).await;
|
||||
let result = reader.metadata().await;
|
||||
assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. }));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let reader = InvertedIndexBlobReader::new(blob);
|
||||
let metadata = reader.metadata(None).await.unwrap();
|
||||
let metadata = reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 0);
|
||||
@@ -182,7 +182,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let reader = InvertedIndexBlobReader::new(blob);
|
||||
let metadata = reader.metadata(None).await.unwrap();
|
||||
let metadata = reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 2);
|
||||
@@ -198,19 +198,13 @@ mod tests {
|
||||
.fst(
|
||||
tag0.base_offset + tag0.relative_fst_offset as u64,
|
||||
tag0.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -219,12 +213,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -233,12 +222,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -257,19 +241,13 @@ mod tests {
|
||||
.fst(
|
||||
tag1.base_offset + tag1.relative_fst_offset as u64,
|
||||
tag1.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -278,12 +256,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -292,12 +265,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -16,7 +16,7 @@ use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
|
||||
/// `ParallelFstValuesMapper` enables parallel mapping of multiple FST value groups to their
|
||||
/// corresponding bitmaps within an inverted index.
|
||||
@@ -35,8 +35,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
|
||||
pub async fn map_values_vec(
|
||||
&mut self,
|
||||
value_and_meta_vec: &[(Vec<u64>, &InvertedIndexMeta)],
|
||||
metrics: Option<&mut InvertedIndexReadMetrics>,
|
||||
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
|
||||
) -> Result<Vec<Bitmap>> {
|
||||
let groups = value_and_meta_vec
|
||||
.iter()
|
||||
@@ -65,7 +64,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
}
|
||||
|
||||
common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
|
||||
let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges, metrics).await?;
|
||||
let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges).await?;
|
||||
let mut output = Vec::with_capacity(groups.len());
|
||||
|
||||
for counter in groups {
|
||||
@@ -96,25 +95,23 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_map_values_vec() {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_bitmap_deque()
|
||||
.returning(|ranges, _metrics| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
mock_reader.expect_bitmap_deque().returning(|ranges| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
|
||||
let meta = InvertedIndexMeta {
|
||||
bitmap_type: BitmapType::Roaring.into(),
|
||||
@@ -123,13 +120,13 @@ mod tests {
|
||||
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![], &meta)], None)
|
||||
.map_values_vec(&[(vec![], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(result[0].count_ones(), 0);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(1, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -138,7 +135,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -147,7 +144,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -156,7 +153,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -165,10 +162,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(
|
||||
&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)],
|
||||
None,
|
||||
)
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -180,13 +174,10 @@ mod tests {
|
||||
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
|
||||
);
|
||||
let result = values_mapper
|
||||
.map_values_vec(
|
||||
&[
|
||||
(vec![value(2, 1), value(1, 1)], &meta),
|
||||
(vec![value(1, 1)], &meta),
|
||||
],
|
||||
None,
|
||||
)
|
||||
.map_values_vec(&[
|
||||
(vec![value(2, 1), value(1, 1)], &meta),
|
||||
(vec![value(1, 1)], &meta),
|
||||
])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -19,7 +19,7 @@ pub use predicates_apply::PredicatesIndexApplier;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
|
||||
/// The output of an apply operation.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
@@ -44,11 +44,10 @@ pub trait IndexApplier: Send + Sync {
|
||||
/// Applies the predefined predicates to the data read by the given index reader, returning
|
||||
/// a list of relevant indices (e.g., post IDs, group IDs, row IDs).
|
||||
#[allow(unused_parens)]
|
||||
async fn apply<'a, 'b>(
|
||||
async fn apply<'a>(
|
||||
&self,
|
||||
context: SearchContext,
|
||||
reader: &mut (dyn InvertedIndexReader + 'a),
|
||||
metrics: Option<&'b mut InvertedIndexReadMetrics>,
|
||||
) -> Result<ApplyOutput>;
|
||||
|
||||
/// Returns the memory usage of the applier.
|
||||
|
||||
@@ -19,7 +19,7 @@ use greptime_proto::v1::index::InvertedIndexMetas;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
use crate::inverted_index::search::fst_apply::{
|
||||
FstApplier, IntersectionFstApplier, KeysFstApplier,
|
||||
};
|
||||
@@ -43,14 +43,12 @@ pub struct PredicatesIndexApplier {
|
||||
impl IndexApplier for PredicatesIndexApplier {
|
||||
/// Applies all `FstApplier`s to the data in the inverted index reader, intersecting the individual
|
||||
/// bitmaps obtained for each index to result in a final set of indices.
|
||||
async fn apply<'a, 'b>(
|
||||
async fn apply<'a>(
|
||||
&self,
|
||||
context: SearchContext,
|
||||
reader: &mut (dyn InvertedIndexReader + 'a),
|
||||
metrics: Option<&'b mut InvertedIndexReadMetrics>,
|
||||
) -> Result<ApplyOutput> {
|
||||
let mut metrics = metrics;
|
||||
let metadata = reader.metadata(metrics.as_deref_mut()).await?;
|
||||
let metadata = reader.metadata().await?;
|
||||
let mut output = ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
total_row_count: metadata.total_row_count as _,
|
||||
@@ -86,7 +84,7 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
return Ok(output);
|
||||
}
|
||||
|
||||
let fsts = reader.fst_vec(&fst_ranges, metrics.as_deref_mut()).await?;
|
||||
let fsts = reader.fst_vec(&fst_ranges).await?;
|
||||
let value_and_meta_vec = fsts
|
||||
.into_iter()
|
||||
.zip(appliers)
|
||||
@@ -94,7 +92,7 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut mapper = ParallelFstValuesMapper::new(reader);
|
||||
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?;
|
||||
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
|
||||
|
||||
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
|
||||
for bm in bm_vec {
|
||||
@@ -223,28 +221,26 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_ranges, _metrics| {
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_ranges| {
|
||||
Ok(vec![
|
||||
FstMap::from_iter([(b"tag-0_value-0", fst_value(2, 1))]).unwrap(),
|
||||
])
|
||||
});
|
||||
|
||||
mock_reader
|
||||
.expect_bitmap_deque()
|
||||
.returning(|arg, _metrics| {
|
||||
assert_eq!(arg.len(), 1);
|
||||
let range = &arg[0].0;
|
||||
let bitmap_type = arg[0].1;
|
||||
assert_eq!(*range, 2..3);
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
|
||||
&[0b10101010],
|
||||
bitmap_type,
|
||||
)]))
|
||||
});
|
||||
mock_reader.expect_bitmap_deque().returning(|arg| {
|
||||
assert_eq!(arg.len(), 1);
|
||||
let range = &arg[0].0;
|
||||
let bitmap_type = arg[0].1;
|
||||
assert_eq!(*range, 2..3);
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
|
||||
&[0b10101010],
|
||||
bitmap_type,
|
||||
)]))
|
||||
});
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -256,14 +252,14 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_range, _metrics| {
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_range| {
|
||||
Ok(vec![
|
||||
FstMap::from_iter([(b"tag-0_value-1", fst_value(2, 1))]).unwrap(),
|
||||
])
|
||||
});
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(output.matched_segment_ids.count_ones(), 0);
|
||||
@@ -283,8 +279,8 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
|
||||
mock_reader.expect_fst_vec().returning(|ranges, _metrics| {
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
|
||||
mock_reader.expect_fst_vec().returning(|ranges| {
|
||||
let mut output = vec![];
|
||||
for range in ranges {
|
||||
match range.start {
|
||||
@@ -297,29 +293,27 @@ mod tests {
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
mock_reader
|
||||
.expect_bitmap_deque()
|
||||
.returning(|ranges, _metrics| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
mock_reader.expect_bitmap_deque().returning(|ranges| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
});
|
||||
Ok(output)
|
||||
});
|
||||
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -337,10 +331,10 @@ mod tests {
|
||||
let mut mock_reader: MockInvertedIndexReader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0)])));
|
||||
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
|
||||
@@ -349,7 +343,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_index_applier_with_empty_index() {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader.expect_metadata().returning(move |_| {
|
||||
mock_reader.expect_metadata().returning(move || {
|
||||
Ok(Arc::new(InvertedIndexMetas {
|
||||
total_row_count: 0, // No rows
|
||||
segment_row_count: 1,
|
||||
@@ -365,7 +359,7 @@ mod tests {
|
||||
};
|
||||
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(output.matched_segment_ids.is_empty());
|
||||
@@ -376,7 +370,7 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas(vec![])));
|
||||
.returning(|| Ok(mock_metas(vec![])));
|
||||
|
||||
let mut mock_fst_applier = MockFstApplier::new();
|
||||
mock_fst_applier.expect_apply().never();
|
||||
@@ -391,7 +385,6 @@ mod tests {
|
||||
index_not_found_strategy: IndexNotFoundStrategy::ThrowError,
|
||||
},
|
||||
&mut mock_reader,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
assert!(matches!(result, Err(Error::IndexNotFound { .. })));
|
||||
@@ -402,7 +395,6 @@ mod tests {
|
||||
index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
|
||||
},
|
||||
&mut mock_reader,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -414,7 +406,6 @@ mod tests {
|
||||
index_not_found_strategy: IndexNotFoundStrategy::Ignore,
|
||||
},
|
||||
&mut mock_reader,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -189,9 +189,6 @@ impl MetaClientBuilder {
|
||||
let mgr = client.channel_manager.clone();
|
||||
|
||||
if self.enable_heartbeat {
|
||||
if self.heartbeat_channel_manager.is_some() {
|
||||
info!("Enable heartbeat channel using the heartbeat channel manager.");
|
||||
}
|
||||
let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone());
|
||||
client.heartbeat = Some(HeartbeatClient::new(
|
||||
self.id,
|
||||
|
||||
@@ -24,7 +24,7 @@ use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
|
||||
use common_telemetry::tracing_context::TracingContext;
|
||||
use common_telemetry::warn;
|
||||
use rand::seq::SliceRandom;
|
||||
use snafu::ResultExt;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use tokio::time::timeout;
|
||||
use tonic::transport::Channel;
|
||||
|
||||
@@ -101,14 +101,12 @@ impl AskLeader {
|
||||
};
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(peers.len());
|
||||
let channel_manager = self.channel_manager.clone();
|
||||
|
||||
for addr in &peers {
|
||||
let mut client = self.create_asker(addr)?;
|
||||
let tx_clone = tx.clone();
|
||||
let req = req.clone();
|
||||
let addr = addr.clone();
|
||||
let channel_manager = channel_manager.clone();
|
||||
tokio::spawn(async move {
|
||||
match client.ask_leader(req).await {
|
||||
Ok(res) => {
|
||||
@@ -119,19 +117,13 @@ impl AskLeader {
                        };
                    }
                    Err(status) => {
                        // Reset cached channel even on generic errors: the VIP may keep us on a dead
                        // backend, so forcing a reconnect gives us a chance to hit a healthy peer.
                        Self::reset_channels_with_manager(
                            &channel_manager,
                            std::slice::from_ref(&addr),
                        );
                        warn!("Failed to ask leader from: {addr}, {status}");
                    }
                }
            });
        }

        let leader = match timeout(
        let leader = timeout(
            self.channel_manager
                .config()
                .timeout
@@ -139,16 +131,8 @@ impl AskLeader {
            rx.recv(),
        )
        .await
        {
            Ok(Some(leader)) => leader,
            Ok(None) => return error::NoLeaderSnafu.fail(),
            Err(e) => {
                // All peers timed out. Reset channels to force reconnection,
                // which may help escape dead backends in VIP/LB scenarios.
                Self::reset_channels_with_manager(&self.channel_manager, &peers);
                return Err(e).context(error::AskLeaderTimeoutSnafu);
            }
        };
        .context(error::AskLeaderTimeoutSnafu)?
        .context(error::NoLeaderSnafu)?;

        let mut leadership_group = self.leadership_group.write().unwrap();
        leadership_group.leader = Some(leader.clone());
@@ -185,15 +169,6 @@ impl AskLeader {
                .context(error::CreateChannelSnafu)?,
        ))
    }

    /// Drop cached channels for the given peers so a fresh connection is used next time.
    fn reset_channels_with_manager(channel_manager: &ChannelManager, peers: &[String]) {
        if peers.is_empty() {
            return;
        }

        channel_manager.retain_channel(|addr, _| !peers.iter().any(|peer| peer == addr));
    }
}
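For illustration only (plain data structures, not the actual `ChannelManager` API, and with hypothetical addresses), the retain predicate removed above keeps every cached channel whose address is not in the reset list:

```rust
fn main() {
    // Hypothetical peers asked to reset; the second cached entry survives.
    let peers = vec!["10.0.0.1:3002".to_string()];
    let mut cached: Vec<(String, &str)> = vec![
        ("10.0.0.1:3002".to_string(), "possibly dead channel"),
        ("10.0.0.2:3002".to_string(), "healthy channel"),
    ];
    // Same predicate shape as retain_channel: drop addresses in the reset list.
    cached.retain(|(addr, _)| !peers.iter().any(|peer| peer == addr));
    assert_eq!(cached, vec![("10.0.0.2:3002".to_string(), "healthy channel")]);
}
```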
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -18,10 +18,6 @@ use std::time::Duration;
|
||||
use client::RegionFollowerClientRef;
|
||||
use common_base::Plugins;
|
||||
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
|
||||
use common_meta::distributed_time_constants::{
|
||||
HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS,
|
||||
HEARTBEAT_TIMEOUT,
|
||||
};
|
||||
use common_telemetry::{debug, info};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -38,6 +34,8 @@ pub struct MetaClientOptions {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heartbeat_timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub ddl_timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub connect_timeout: Duration,
|
||||
@@ -54,6 +52,7 @@ impl Default for MetaClientOptions {
|
||||
Self {
|
||||
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
|
||||
timeout: Duration::from_millis(3_000u64),
|
||||
heartbeat_timeout: Duration::from_millis(500u64),
|
||||
ddl_timeout: Duration::from_millis(10_000u64),
|
||||
connect_timeout: Duration::from_millis(1_000u64),
|
||||
tcp_nodelay: true,
|
||||
@@ -98,11 +97,7 @@ pub async fn create_meta_client(
|
||||
.timeout(meta_client_options.timeout)
|
||||
.connect_timeout(meta_client_options.connect_timeout)
|
||||
.tcp_nodelay(meta_client_options.tcp_nodelay);
|
||||
let heartbeat_config = base_config
|
||||
.clone()
|
||||
.timeout(HEARTBEAT_TIMEOUT)
|
||||
.http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS)
|
||||
.http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS);
|
||||
let heartbeat_config = base_config.clone();
|
||||
|
||||
if let MetaClientType::Frontend = client_type {
|
||||
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
|
||||
|
||||
@@ -23,8 +23,6 @@ use store_api::storage::RegionId;
|
||||
mod candidate;
|
||||
mod ctx;
|
||||
mod handler;
|
||||
#[cfg(test)]
|
||||
mod mock;
|
||||
mod options;
|
||||
mod procedure;
|
||||
mod scheduler;
|
||||
|
||||
@@ -1,458 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
mod basic;
|
||||
mod candidate_select;
|
||||
mod concurrent;
|
||||
mod config;
|
||||
mod err_handle;
|
||||
mod full_list;
|
||||
mod integration;
|
||||
mod misc;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_meta::datanode::{RegionManifestInfo, RegionStat};
|
||||
use common_meta::key::table_route::PhysicalTableRouteValue;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::router::{Region, RegionRoute};
|
||||
use common_telemetry::debug;
|
||||
use ordered_float::OrderedFloat;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileRefsManifest, GcReport, RegionId};
|
||||
use table::metadata::TableId;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::error::{Result, UnexpectedSnafu};
|
||||
use crate::gc::candidate::GcCandidate;
|
||||
use crate::gc::ctx::SchedulerCtx;
|
||||
use crate::gc::handler::Region2Peers;
|
||||
use crate::gc::options::GcSchedulerOptions;
|
||||
use crate::gc::scheduler::{Event, GcScheduler};
|
||||
|
||||
pub const TEST_REGION_SIZE_200MB: u64 = 200_000_000;
|
||||
|
||||
/// Helper function to create an empty GcReport for the given region IDs
pub fn new_empty_report_with(region_ids: impl IntoIterator<Item = RegionId>) -> GcReport {
    let mut deleted_files = HashMap::new();
    for region_id in region_ids {
        deleted_files.insert(region_id, vec![]);
    }
    GcReport {
        deleted_files,
        need_retry_regions: HashSet::new(),
    }
}
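A brief usage sketch of the helper above (hypothetical region IDs, assuming the helper is in scope of a test module):

```rust
#[test]
fn empty_report_has_entries_but_no_files() {
    // Hypothetical: two regions of table 1 get empty deletion entries.
    let report = new_empty_report_with([RegionId::new(1, 1), RegionId::new(1, 2)]);
    assert_eq!(report.deleted_files.len(), 2);
    assert!(report.deleted_files[&RegionId::new(1, 1)].is_empty());
    assert!(report.need_retry_regions.is_empty());
}
```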
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[derive(Debug, Default)]
|
||||
pub struct MockSchedulerCtx {
|
||||
pub table_to_region_stats: Arc<Mutex<Option<HashMap<TableId, Vec<RegionStat>>>>>,
|
||||
pub table_routes: Arc<Mutex<HashMap<TableId, (TableId, PhysicalTableRouteValue)>>>,
|
||||
pub file_refs: Arc<Mutex<Option<FileRefsManifest>>>,
|
||||
pub gc_reports: Arc<Mutex<HashMap<RegionId, GcReport>>>,
|
||||
pub candidates: Arc<Mutex<Option<HashMap<TableId, Vec<GcCandidate>>>>>,
|
||||
pub get_table_to_region_stats_calls: Arc<Mutex<usize>>,
|
||||
pub get_file_references_calls: Arc<Mutex<usize>>,
|
||||
pub gc_regions_calls: Arc<Mutex<usize>>,
|
||||
// Error injection fields for testing
|
||||
pub get_table_to_region_stats_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
pub get_table_route_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
pub get_file_references_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
pub gc_regions_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
// Retry testing fields
|
||||
pub gc_regions_retry_count: Arc<Mutex<HashMap<RegionId, usize>>>,
|
||||
pub gc_regions_error_sequence: Arc<Mutex<Vec<crate::error::Error>>>,
|
||||
pub gc_regions_success_after_retries: Arc<Mutex<HashMap<RegionId, usize>>>,
|
||||
// Per-region error injection
|
||||
pub gc_regions_per_region_errors: Arc<Mutex<HashMap<RegionId, crate::error::Error>>>,
|
||||
}
|
||||
|
||||
impl MockSchedulerCtx {
|
||||
pub fn with_table_routes(
|
||||
self,
|
||||
table_routes: HashMap<TableId, (TableId, Vec<(RegionId, Peer)>)>,
|
||||
) -> Self {
|
||||
*self.table_routes.lock().unwrap() = table_routes
|
||||
.into_iter()
|
||||
.map(|(k, (phy_id, region2peer))| {
|
||||
let phy = PhysicalTableRouteValue::new(
|
||||
region2peer
|
||||
.into_iter()
|
||||
.map(|(region_id, peer)| RegionRoute {
|
||||
region: Region::new_test(region_id),
|
||||
leader_peer: Some(peer),
|
||||
..Default::default()
|
||||
})
|
||||
.collect(),
|
||||
);
|
||||
|
||||
(k, (phy_id, phy))
|
||||
})
|
||||
.collect();
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `get_table_to_region_stats`
|
||||
#[allow(dead_code)]
|
||||
pub fn with_get_table_to_region_stats_error(self, error: crate::error::Error) -> Self {
|
||||
*self.get_table_to_region_stats_error.lock().unwrap() = Some(error);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `get_table_route`
|
||||
pub fn set_table_route_error(&self, error: crate::error::Error) {
|
||||
*self.get_table_route_error.lock().unwrap() = Some(error);
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `get_file_references`
|
||||
#[allow(dead_code)]
|
||||
pub fn with_get_file_references_error(self, error: crate::error::Error) -> Self {
|
||||
*self.get_file_references_error.lock().unwrap() = Some(error);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `gc_regions`
|
||||
pub fn with_gc_regions_error(self, error: crate::error::Error) -> Self {
|
||||
*self.gc_regions_error.lock().unwrap() = Some(error);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set a sequence of errors to be returned by `gc_regions` for retry testing
|
||||
pub fn set_gc_regions_error_sequence(&self, errors: Vec<crate::error::Error>) {
|
||||
*self.gc_regions_error_sequence.lock().unwrap() = errors;
|
||||
}
|
||||
|
||||
/// Set success after a specific number of retries for a region
|
||||
pub fn set_gc_regions_success_after_retries(&self, region_id: RegionId, retries: usize) {
|
||||
self.gc_regions_success_after_retries
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(region_id, retries);
|
||||
}
|
||||
|
||||
/// Get the retry count for a specific region
|
||||
pub fn get_retry_count(&self, region_id: RegionId) -> usize {
|
||||
self.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&region_id)
|
||||
.copied()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Reset all retry tracking
|
||||
pub fn reset_retry_tracking(&self) {
|
||||
*self.gc_regions_retry_count.lock().unwrap() = HashMap::new();
|
||||
*self.gc_regions_error_sequence.lock().unwrap() = Vec::new();
|
||||
*self.gc_regions_success_after_retries.lock().unwrap() = HashMap::new();
|
||||
}
|
||||
|
||||
    /// Set an error to be returned for a specific region
    pub fn set_gc_regions_error_for_region(&self, region_id: RegionId, error: crate::error::Error) {
        self.gc_regions_per_region_errors
            .lock()
            .unwrap()
            .insert(region_id, error);
    }

    /// Clear per-region errors
    #[allow(unused)]
    pub fn clear_gc_regions_per_region_errors(&self) {
        self.gc_regions_per_region_errors.lock().unwrap().clear();
    }
}
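A short sketch (a hypothetical test, assuming the mock above and the accessors it defines) of how the retry-injection helpers combine with the per-region attempt counters:

```rust
#[test]
fn retry_injection_helpers_sketch() {
    let ctx = MockSchedulerCtx::default();
    let region_id = RegionId::new(1, 1);

    // Ask the mock to report this region as "needs retry" once before succeeding.
    ctx.set_gc_regions_success_after_retries(region_id, 1);

    // ... drive gc_regions through the SchedulerCtx trait here ...

    // The mock counts every attempt per region and can be reset between cases.
    assert_eq!(ctx.get_retry_count(region_id), 0); // nothing has been driven yet in this sketch
    ctx.reset_retry_tracking();
}
```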
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SchedulerCtx for MockSchedulerCtx {
|
||||
async fn get_table_to_region_stats(&self) -> Result<HashMap<TableId, Vec<RegionStat>>> {
|
||||
*self.get_table_to_region_stats_calls.lock().unwrap() += 1;
|
||||
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.get_table_to_region_stats_error.lock().unwrap().take() {
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
Ok(self
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
async fn get_table_route(
|
||||
&self,
|
||||
table_id: TableId,
|
||||
) -> Result<(TableId, PhysicalTableRouteValue)> {
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.get_table_route_error.lock().unwrap().take() {
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
Ok(self
|
||||
.table_routes
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&table_id)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| (table_id, PhysicalTableRouteValue::default())))
|
||||
}
|
||||
|
||||
async fn get_file_references(
|
||||
&self,
|
||||
query_regions: &[RegionId],
|
||||
_related_regions: HashMap<RegionId, Vec<RegionId>>,
|
||||
region_to_peer: &Region2Peers,
|
||||
_timeout: Duration,
|
||||
) -> Result<FileRefsManifest> {
|
||||
*self.get_file_references_calls.lock().unwrap() += 1;
|
||||
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.get_file_references_error.lock().unwrap().take() {
|
||||
return Err(error);
|
||||
}
|
||||
if query_regions
|
||||
.iter()
|
||||
.any(|region_id| !region_to_peer.contains_key(region_id))
|
||||
{
|
||||
UnexpectedSnafu {
|
||||
violated: format!(
|
||||
"region_to_peer{region_to_peer:?} does not contain all region_ids requested: {:?}",
|
||||
query_regions
|
||||
),
|
||||
}.fail()?;
|
||||
}
|
||||
|
||||
Ok(self.file_refs.lock().unwrap().clone().unwrap_or_default())
|
||||
}
|
||||
|
||||
async fn gc_regions(
|
||||
&self,
|
||||
_peer: Peer,
|
||||
region_ids: &[RegionId],
|
||||
_file_refs_manifest: &FileRefsManifest,
|
||||
_full_file_listing: bool,
|
||||
_timeout: Duration,
|
||||
) -> Result<GcReport> {
|
||||
*self.gc_regions_calls.lock().unwrap() += 1;
|
||||
|
||||
// Check per-region error injection first (for any region)
|
||||
for &region_id in region_ids {
|
||||
if let Some(error) = self
|
||||
.gc_regions_per_region_errors
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&region_id)
|
||||
{
|
||||
*self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.entry(region_id)
|
||||
.or_insert(0) += 1;
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.gc_regions_error.lock().unwrap().take() {
|
||||
for region_id in region_ids {
|
||||
*self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.entry(*region_id)
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
// Handle error sequence for retry testing
|
||||
{
|
||||
let mut error_sequence = self.gc_regions_error_sequence.lock().unwrap();
|
||||
if !error_sequence.is_empty() {
|
||||
let error = error_sequence.remove(0);
|
||||
for region_id in region_ids {
|
||||
*self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.entry(*region_id)
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
|
||||
// Build the final report by processing each region individually
|
||||
let mut final_report = GcReport::default();
|
||||
let gc_reports = self.gc_reports.lock().unwrap();
|
||||
let success_after_retries = self.gc_regions_success_after_retries.lock().unwrap();
|
||||
|
||||
for &region_id in region_ids {
|
||||
// Get current retry count for this region
|
||||
let retry_count = self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&region_id)
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
|
||||
// Check if this region should succeed or need retry
|
||||
if let Some(&required_retries) = success_after_retries.get(&region_id) {
|
||||
if retry_count < required_retries {
|
||||
debug!(
|
||||
"Region {} needs retry (attempt {}/{})",
|
||||
region_id,
|
||||
retry_count + 1,
|
||||
required_retries
|
||||
);
|
||||
// This region needs more retries - add to need_retry_regions
|
||||
final_report.need_retry_regions.insert(region_id);
|
||||
// Track the retry attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
} else {
|
||||
debug!(
|
||||
"Region {} has completed retries - succeeding now",
|
||||
region_id
|
||||
);
|
||||
// This region has completed all required retries - succeed
|
||||
if let Some(report) = gc_reports.get(&region_id) {
|
||||
final_report.merge(report.clone());
|
||||
}
|
||||
// Track the success attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
}
|
||||
} else {
|
||||
// No retry requirement - check if we have a GC report for this region
|
||||
if let Some(report) = gc_reports.get(&region_id) {
|
||||
// We have a GC report - succeed immediately
|
||||
final_report.merge(report.clone());
|
||||
// Track the success attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
} else {
|
||||
// No GC report available - this region should be marked for retry
|
||||
final_report.need_retry_regions.insert(region_id);
|
||||
// Track the attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the report with need_retry_regions populated - let the caller handle retry logic
|
||||
Ok(final_report)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TestEnv {
|
||||
pub scheduler: GcScheduler,
|
||||
pub ctx: Arc<MockSchedulerCtx>,
|
||||
#[allow(dead_code)]
|
||||
tx: Sender<Event>,
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
impl TestEnv {
|
||||
pub fn new() -> Self {
|
||||
let ctx = Arc::new(MockSchedulerCtx::default());
|
||||
let (tx, rx) = GcScheduler::channel();
|
||||
let config = GcSchedulerOptions::default();
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: rx,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
Self { scheduler, ctx, tx }
|
||||
}
|
||||
|
||||
pub fn with_candidates(self, candidates: HashMap<TableId, Vec<GcCandidate>>) -> Self {
|
||||
*self.ctx.candidates.lock().unwrap() = Some(candidates);
|
||||
self
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn run_scheduler(mut self) {
|
||||
self.scheduler.run().await;
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn tick(&self) {
|
||||
self.tx.send(Event::Tick).await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a mock GC candidate that will pass the GC threshold
|
||||
fn new_candidate(region_id: RegionId, score: f64) -> GcCandidate {
|
||||
// will pass threshold for gc
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, 10_000, 10);
|
||||
|
||||
GcCandidate {
|
||||
region_id,
|
||||
score: OrderedFloat(score),
|
||||
region_stat,
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a mock GC candidate
|
||||
fn mock_candidate(region_id: RegionId) -> GcCandidate {
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10);
|
||||
GcCandidate {
|
||||
region_id,
|
||||
score: ordered_float::OrderedFloat(1.0),
|
||||
region_stat,
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a mock RegionStat
|
||||
fn mock_region_stat(
|
||||
id: RegionId,
|
||||
role: RegionRole,
|
||||
approximate_bytes: u64,
|
||||
sst_num: u64,
|
||||
) -> RegionStat {
|
||||
RegionStat {
|
||||
id,
|
||||
role,
|
||||
approximate_bytes,
|
||||
sst_num,
|
||||
region_manifest: RegionManifestInfo::Mito {
|
||||
manifest_version: 0,
|
||||
flushed_entry_id: 0,
|
||||
file_removed_cnt: 0,
|
||||
},
|
||||
rcus: 0,
|
||||
wcus: 0,
|
||||
engine: "mito".to_string(),
|
||||
num_rows: 0,
|
||||
memtable_size: 0,
|
||||
manifest_size: 0,
|
||||
sst_size: 0,
|
||||
index_size: 0,
|
||||
data_topic_latest_entry_id: 0,
|
||||
metadata_topic_latest_entry_id: 0,
|
||||
written_bytes: 0,
|
||||
}
|
||||
}
|
||||
@@ -1,164 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};

use crate::gc::mock::{
    MockSchedulerCtx, TEST_REGION_SIZE_200MB, TestEnv, mock_region_stat, new_candidate,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};

#[tokio::test]
async fn test_parallel_process_datanodes_empty() {
    let env = TestEnv::new();
    let report = env
        .scheduler
        .parallel_process_datanodes(HashMap::new())
        .await;

    assert_eq!(report.per_datanode_reports.len(), 0);
    assert_eq!(report.failed_datanodes.len(), 0);
}

#[tokio::test]
async fn test_parallel_process_datanodes_with_candidates() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");
    let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);

    let mut gc_reports = HashMap::new();
    let deleted_files = vec![FileId::random()];
    gc_reports.insert(
        region_id,
        GcReport {
            deleted_files: HashMap::from([(region_id, deleted_files.clone())]),
            ..Default::default()
        },
    );
    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };
    let ctx = MockSchedulerCtx {
        gc_reports: Arc::new(Mutex::new(gc_reports)),
        file_refs: Arc::new(Mutex::new(Some(file_refs))),
        ..Default::default()
    }
    .with_table_routes(HashMap::from([(
        table_id,
        (table_id, vec![(region_id, peer.clone())]),
    )]));

    let env = TestEnv::new();
    // We need to replace the ctx with the one with gc_reports
    let mut scheduler = env.scheduler;
    scheduler.ctx = Arc::new(ctx);

    // Convert table-based candidates to datanode-based candidates
    let datanode_to_candidates = HashMap::from([(
        peer,
        candidates
            .into_iter()
            .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
            .collect(),
    )]);

    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    assert_eq!(report.per_datanode_reports.len(), 1);
    assert_eq!(report.failed_datanodes.len(), 0);
}

#[tokio::test]
async fn test_handle_tick() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");
    let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);

    let mut gc_reports = HashMap::new();
    gc_reports.insert(region_id, GcReport::default());
    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };
    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(HashMap::from([(
                table_id,
                vec![mock_region_stat(
                    region_id,
                    RegionRole::Leader,
                    TEST_REGION_SIZE_200MB,
                    10,
                )],
            )])))),
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            candidates: Arc::new(Mutex::new(Some(candidates))),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer)]),
        )])),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let report = scheduler.handle_tick().await.unwrap();

    // Validate the returned GcJobReport
    assert_eq!(
        report.per_datanode_reports.len(),
        1,
        "Should process 1 datanode"
    );
    assert_eq!(
        report.failed_datanodes.len(),
        0,
        "Should have 0 failed datanodes"
    );

    assert_eq!(*ctx.get_table_to_region_stats_calls.lock().unwrap(), 1);
    assert_eq!(*ctx.get_file_references_calls.lock().unwrap(), 1);
    assert_eq!(*ctx.gc_regions_calls.lock().unwrap(), 1);

    let tracker = scheduler.region_gc_tracker.lock().await;
    assert!(
        tracker.contains_key(&region_id),
        "Tracker should have one region: {:?}",
        tracker
    );
}
@@ -1,390 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use common_meta::datanode::RegionManifestInfo;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;

use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat};
use crate::gc::{GcScheduler, GcSchedulerOptions};

/// Candidate Selection Tests
#[tokio::test]
async fn test_gc_candidate_filtering_by_role() {
    init_default_ut_logging();

    let table_id = 1;
    let leader_region = RegionId::new(table_id, 1);
    let follower_region = RegionId::new(table_id, 2);

    let mut leader_stat = mock_region_stat(
        leader_region,
        RegionRole::Leader,
        TEST_REGION_SIZE_200MB,
        10,
    ); // 200MB

    let mut follower_stat = mock_region_stat(
        follower_region,
        RegionRole::Follower,
        TEST_REGION_SIZE_200MB,
        10,
    ); // 200MB

    // Set up manifest info for scoring
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut leader_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut follower_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![leader_stat.clone(), follower_stat.clone()])]);

    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();

    let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();

    // Should only select leader regions
    assert_eq!(
        candidates.len(),
        1,
        "Expected 1 table with candidates, got {}",
        candidates.len()
    );
    if let Some(table_candidates) = candidates.get(&table_id) {
        assert_eq!(
            table_candidates.len(),
            1,
            "Expected 1 candidate for table {}, got {}",
            table_id,
            table_candidates.len()
        );
        assert_eq!(
            table_candidates[0].region_id, leader_region,
            "Expected leader region {}, got {}",
            leader_region, table_candidates[0].region_id
        );
    } else {
        panic!("Expected table {} to have candidates", table_id);
    }
}

#[tokio::test]
async fn test_gc_candidate_size_threshold() {
    init_default_ut_logging();

    let table_id = 1;
    let small_region = RegionId::new(table_id, 1);
    let large_region = RegionId::new(table_id, 2);

    let mut small_stat = mock_region_stat(small_region, RegionRole::Leader, 50_000_000, 5); // 50MB
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut small_stat.region_manifest
    {
        *file_removed_cnt = 3;
    }

    let mut large_stat =
        mock_region_stat(large_region, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut large_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![small_stat, large_stat])]);

    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });

    let config = GcSchedulerOptions {
        min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();

    let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();

    // Should only select large region
    assert_eq!(
        candidates.len(),
        1,
        "Expected 1 table with candidates, got {}",
        candidates.len()
    );
    if let Some(table_candidates) = candidates.get(&table_id) {
        assert_eq!(
            table_candidates.len(),
            1,
            "Expected 1 candidate for table {}, got {}",
            table_id,
            table_candidates.len()
        );
        assert_eq!(
            table_candidates[0].region_id, large_region,
            "Expected large region {}, got {}",
            large_region, table_candidates[0].region_id
        );
    } else {
        panic!("Expected table {} to have candidates", table_id);
    }
}

#[tokio::test]
async fn test_gc_candidate_scoring() {
    init_default_ut_logging();

    let table_id = 1;
    let low_score_region = RegionId::new(table_id, 1);
    let high_score_region = RegionId::new(table_id, 2);

    let mut low_stat = mock_region_stat(
        low_score_region,
        RegionRole::Leader,
        TEST_REGION_SIZE_200MB,
        5,
    ); // 200MB
    // Set low file removal rate for low_score_region
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut low_stat.region_manifest
    {
        *file_removed_cnt = 2;
    }

    let mut high_stat = mock_region_stat(
        high_score_region,
        RegionRole::Leader,
        TEST_REGION_SIZE_200MB,
        50,
    ); // 200MB
    // Set high file removal rate for high_score_region
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut high_stat.region_manifest
    {
        *file_removed_cnt = 20;
    }

    let table_stats = HashMap::from([(table_id, vec![low_stat, high_stat])]);

    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });

    let config = GcSchedulerOptions {
        sst_count_weight: 1.0,
        file_removed_count_weight: 0.5,
        min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();

    let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();

    // Should select both regions but high score region should be first
    assert_eq!(
        candidates.len(),
        1,
        "Expected 1 table with candidates, got {}",
        candidates.len()
    );
    if let Some(table_candidates) = candidates.get(&table_id) {
        assert_eq!(
            table_candidates.len(),
            2,
            "Expected 2 candidates for table {}, got {}",
            table_id,
            table_candidates.len()
        );
        // Higher score region should come first (sorted by score descending)
        assert_eq!(
            table_candidates[0].region_id, high_score_region,
            "High score region should be first"
        );
        assert!(
            table_candidates[0].score > table_candidates[1].score,
            "High score region should have higher score: {} > {}",
            table_candidates[0].score,
            table_candidates[1].score
        );
    } else {
        panic!("Expected table {} to have candidates", table_id);
    }
}

#[tokio::test]
async fn test_gc_candidate_regions_per_table_threshold() {
    init_default_ut_logging();

    let table_id = 1;
    // Create 10 regions for the same table
    let mut region_stats = Vec::new();

    for i in 0..10 {
        let region_id = RegionId::new(table_id, i + 1);
        let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB

        // Set different file removal rates to create different scores
        // Higher region IDs get higher scores (better GC candidates)
        if let RegionManifestInfo::Mito {
            file_removed_cnt, ..
        } = &mut stat.region_manifest
        {
            *file_removed_cnt = (i as u64 + 1) * 2; // Region 1: 2, Region 2: 4, ..., Region 10: 20
        }

        region_stats.push(stat);
    }

    let table_stats = HashMap::from([(table_id, region_stats)]);

    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });

    // Set regions_per_table_threshold to 3
    let config = GcSchedulerOptions {
        regions_per_table_threshold: 3,
        min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();

    let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();

    // Should have 1 table with candidates
    assert_eq!(
        candidates.len(),
        1,
        "Expected 1 table with candidates, got {}",
        candidates.len()
    );

    if let Some(table_candidates) = candidates.get(&table_id) {
        // Should only have 3 candidates due to regions_per_table_threshold
        assert_eq!(
            table_candidates.len(),
            3,
            "Expected 3 candidates for table {} due to regions_per_table_threshold, got {}",
            table_id,
            table_candidates.len()
        );

        // Verify that the top 3 scoring regions are selected
        // Regions 8, 9, 10 should have the highest scores (file_removed_cnt: 16, 18, 20)
        // They should be returned in descending order by score
        let expected_regions = vec![10, 9, 8];
        let actual_regions: Vec<u32> = table_candidates
            .iter()
            .map(|c| c.region_id.region_number())
            .collect();

        assert_eq!(
            actual_regions, expected_regions,
            "Expected regions {:?} to be selected, got {:?}",
            expected_regions, actual_regions
        );

        // Verify they are sorted by score in descending order
        for i in 0..table_candidates.len() - 1 {
            assert!(
                table_candidates[i].score >= table_candidates[i + 1].score,
                "Candidates should be sorted by score descending: {} >= {}",
                table_candidates[i].score,
                table_candidates[i + 1].score
            );
        }
    } else {
        panic!("Expected table {} to have candidates", table_id);
    }
}
@@ -1,516 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use common_meta::key::table_route::PhysicalTableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute};
use common_telemetry::{info, init_default_ut_logging};
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};

use crate::gc::mock::{
    MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat, new_candidate,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};

/// Concurrent Processing Tests
#[tokio::test]
async fn test_concurrent_table_processing_limits() {
    init_default_ut_logging();

    let mut candidates = HashMap::new();
    let mut gc_reports = HashMap::new();

    // Create many tables with candidates
    for table_id in 1..=10 {
        let region_id = RegionId::new(table_id, 1);
        candidates.insert(table_id, vec![new_candidate(region_id, 1.0)]);
        gc_reports.insert(
            region_id,
            GcReport {
                deleted_files: HashMap::from([(region_id, vec![FileId::random()])]),
                ..Default::default()
            },
        );
    }

    let ctx = MockSchedulerCtx {
        candidates: Arc::new(Mutex::new(Some(candidates))),
        file_refs: Arc::new(Mutex::new(Some(FileRefsManifest {
            manifest_version: (1..=10).map(|i| (RegionId::new(i, 1), 1)).collect(),
            ..Default::default()
        }))),
        gc_reports: Arc::new(Mutex::new(gc_reports)),
        ..Default::default()
    }
    .with_table_routes(
        (1..=10)
            .map(|table_id| {
                let region_id = RegionId::new(table_id, 1);
                (table_id, (table_id, vec![(region_id, Peer::new(1, ""))]))
            })
            .collect(),
    );

    let ctx = Arc::new(ctx);

    let config = GcSchedulerOptions {
        max_concurrent_tables: 3, // Set a low limit
        retry_backoff_duration: Duration::from_millis(50), // for faster test
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();

    // Convert table-based candidates to datanode-based candidates
    let peer = Peer::new(1, "");
    let datanode_to_candidates = HashMap::from([(
        peer,
        candidates
            .into_iter()
            .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
            .collect(),
    )]);

    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    // Should process all datanodes
    assert_eq!(report.per_datanode_reports.len(), 1);
    assert_eq!(report.failed_datanodes.len(), 0);
}

#[tokio::test]
async fn test_datanode_processes_tables_with_partial_gc_failures() {
    init_default_ut_logging();

    let table1 = 1;
    let region1 = RegionId::new(table1, 1);
    let table2 = 2;
    let region2 = RegionId::new(table2, 1);
    let peer = Peer::new(1, "");

    let mut candidates = HashMap::new();
    candidates.insert(table1, vec![new_candidate(region1, 1.0)]);
    candidates.insert(table2, vec![new_candidate(region2, 1.0)]);

    // Set up GC reports for success and failure
    let mut gc_reports = HashMap::new();
    gc_reports.insert(
        region1,
        GcReport {
            deleted_files: HashMap::from([(region1, vec![])]),
            ..Default::default()
        },
    );
    // region2 will have no GC report, simulating failure

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region1, 1), (region2, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            candidates: Arc::new(Mutex::new(Some(candidates))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([
            (table1, (table1, vec![(region1, peer.clone())])),
            (table2, (table2, vec![(region2, peer.clone())])),
        ])),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();

    // Convert table-based candidates to datanode-based candidates
    let datanode_to_candidates = HashMap::from([(
        peer,
        candidates
            .into_iter()
            .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
            .collect(),
    )]);

    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    // Should have one datanode with mixed results
    assert_eq!(report.per_datanode_reports.len(), 1);
    // also check one failed region (region2 has no GC report, so it should be in need_retry_regions)
    let datanode_report = report.per_datanode_reports.values().next().unwrap();
    assert_eq!(datanode_report.need_retry_regions.len(), 1);
    assert_eq!(report.failed_datanodes.len(), 0);
}

// Region Concurrency Tests

#[tokio::test]
async fn test_region_gc_concurrency_limit() {
    init_default_ut_logging();

    let table_id = 1;
    let peer = Peer::new(1, "");

    // Create multiple regions for the same table
    let mut region_stats = Vec::new();
    let mut candidates = Vec::new();
    let mut gc_reports = HashMap::new();

    for i in 1..=10 {
        let region_id = RegionId::new(table_id, i as u32);
        let region_stat =
            mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
        region_stats.push(region_stat);

        candidates.push(mock_candidate(region_id));

        gc_reports.insert(
            region_id,
            GcReport {
                deleted_files: HashMap::from([(
                    region_id,
                    vec![FileId::random(), FileId::random()],
                )]),
                ..Default::default()
            },
        );
    }

    let table_stats = HashMap::from([(table_id, region_stats)]);

    let file_refs = FileRefsManifest {
        manifest_version: (1..=10)
            .map(|i| (RegionId::new(table_id, i as u32), 1))
            .collect(),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (
                table_id,
                (1..=10)
                    .map(|i| (RegionId::new(table_id, i as u32), peer.clone()))
                    .collect(),
            ),
        )])),
    );

    // Configure low concurrency limit
    let config = GcSchedulerOptions {
        region_gc_concurrency: 3, // Only 3 regions can be processed concurrently
        retry_backoff_duration: Duration::from_millis(50), // for faster test
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let start_time = Instant::now();
    let report = scheduler
        .process_datanode_gc(
            peer,
            candidates.into_iter().map(|c| (table_id, c)).collect(),
        )
        .await
        .unwrap();
    let duration = start_time.elapsed();

    // All regions should be processed successfully
    // Check that all 10 regions have deleted files
    assert_eq!(report.deleted_files.len(), 10);
    for i in 1..=10 {
        let region_id = RegionId::new(table_id, i as u32);
        assert!(report.deleted_files.contains_key(&region_id));
        assert_eq!(report.deleted_files[&region_id].len(), 2); // Each region has 2 deleted files
    }
    assert!(report.need_retry_regions.is_empty());

    // Verify that concurrency limit was respected (this is hard to test directly,
    // but we can verify that the processing completed successfully)
    info!(
        "Processed 10 regions with concurrency limit 3 in {:?}",
        duration
    );
}

#[tokio::test]
async fn test_region_gc_concurrency_with_partial_failures() {
    init_default_ut_logging();

    let table_id = 1;
    let peer = Peer::new(1, "");

    // Create multiple regions with mixed success/failure
    let mut region_stats = Vec::new();
    let mut candidates = Vec::new();
    let mut gc_reports = HashMap::new();

    // Create the context first so we can set errors on it
    let ctx = Arc::new(MockSchedulerCtx::default());

    for i in 1..=6 {
        let region_id = RegionId::new(table_id, i as u32);
        let region_stat =
            mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
        region_stats.push(region_stat);

        candidates.push(mock_candidate(region_id));

        if i % 2 == 0 {
            // Even regions will succeed
            gc_reports.insert(
                region_id,
                GcReport {
                    deleted_files: HashMap::from([(
                        region_id,
                        vec![FileId::random(), FileId::random()],
                    )]),
                    ..Default::default()
                },
            );
        } else {
            // Odd regions will fail - don't add them to gc_reports
            // This will cause them to be marked as needing retry
        }
    }

    let table_stats = HashMap::from([(table_id, region_stats)]);

    let file_refs = FileRefsManifest {
        manifest_version: (1..=6)
            .map(|i| (RegionId::new(table_id, i as u32), 1))
            .collect(),
        ..Default::default()
    };

    // Update the context with the data
    *ctx.table_to_region_stats.lock().unwrap() = Some(table_stats);
    *ctx.gc_reports.lock().unwrap() = gc_reports;
    *ctx.file_refs.lock().unwrap() = Some(file_refs);
    let region_routes = (1..=6)
        .map(|i| RegionRoute {
            region: Region::new_test(RegionId::new(table_id, i as u32)),
            leader_peer: Some(peer.clone()),
            ..Default::default()
        })
        .collect();

    *ctx.table_routes.lock().unwrap() = HashMap::from([(
        table_id,
        (table_id, PhysicalTableRouteValue::new(region_routes)),
    )]);

    // Configure concurrency limit
    let config = GcSchedulerOptions {
        region_gc_concurrency: 2, // Process 2 regions concurrently
        retry_backoff_duration: Duration::from_millis(50), // for faster test
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let datanode_to_candidates = HashMap::from([(
        peer.clone(),
        candidates.into_iter().map(|c| (table_id, c)).collect(),
    )]);

    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    let report = report.per_datanode_reports.get(&peer.id).unwrap();

    // Should have 3 successful and 3 failed regions
    // Even regions (2, 4, 6) should succeed, odd regions (1, 3, 5) should fail
    let mut successful_regions = 0;
    let mut failed_regions = 0;

    for i in 1..=6 {
        let region_id = RegionId::new(table_id, i as u32);
        if i % 2 == 0 {
            // Even regions should succeed
            if report.deleted_files.contains_key(&region_id) {
                successful_regions += 1;
            }
        } else {
            // Odd regions should fail - they should be in need_retry_regions
            if report.need_retry_regions.contains(&region_id) {
                failed_regions += 1;
            }
        }
    }

    // In the new implementation, regions that cause gc_regions to return an error
    // are added to need_retry_regions. Let's check if we have the expected mix.
    info!(
        "Successful regions: {}, Failed regions: {}",
        successful_regions, failed_regions
    );
    info!(
        "Deleted files: {:?}",
        report.deleted_files.keys().collect::<Vec<_>>()
    );
    info!("Need retry regions: {:?}", report.need_retry_regions);

    // The exact count might vary depending on how the mock handles errors,
    // but we should have some successful and some failed regions
    assert!(
        successful_regions > 0,
        "Should have at least some successful regions"
    );
    assert!(
        failed_regions > 0,
        "Should have at least some failed regions"
    );
}

#[tokio::test]
async fn test_region_gc_concurrency_with_retryable_errors() {
    init_default_ut_logging();

    let table_id = 1;
    let peer = Peer::new(1, "");

    // Create multiple regions
    let mut region_stats = Vec::new();
    let mut candidates = Vec::new();

    for i in 1..=5 {
        let region_id = RegionId::new(table_id, i as u32);
        let region_stat =
            mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
        region_stats.push(region_stat);
        candidates.push(mock_candidate(region_id));
    }

    let table_stats = HashMap::from([(table_id, region_stats)]);

    let file_refs = FileRefsManifest {
        manifest_version: (1..=5)
            .map(|i| (RegionId::new(table_id, i as u32), 1))
            .collect(),
        ..Default::default()
    };

    let gc_report = (1..=5)
        .map(|i| {
            let region_id = RegionId::new(table_id, i as u32);
            (
                region_id,
                // mock the actual gc report with deleted files when succeeded(even no files to delete)
                GcReport::new(HashMap::from([(region_id, vec![])]), HashSet::new()),
            )
        })
        .collect();

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            gc_reports: Arc::new(Mutex::new(gc_report)),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (
                table_id,
                (1..=5)
                    .map(|i| (RegionId::new(table_id, i as u32), peer.clone()))
                    .collect(),
            ),
        )])),
    );

    // Configure concurrency limit
    let config = GcSchedulerOptions {
        region_gc_concurrency: 2, // Process 2 regions concurrently
        retry_backoff_duration: Duration::from_millis(50),
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let datanode_to_candidates = HashMap::from([(
        peer.clone(),
        candidates.into_iter().map(|c| (table_id, c)).collect(),
    )]);
    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    let report = report.per_datanode_reports.get(&peer.id).unwrap();

    // In the new implementation without retry logic, all regions should be processed
    // The exact behavior depends on how the mock handles the regions
    info!(
        "Deleted files: {:?}",
        report.deleted_files.keys().collect::<Vec<_>>()
    );
    info!("Need retry regions: {:?}", report.need_retry_regions);

    // We should have processed all 5 regions in some way
    let total_processed = report.deleted_files.len() + report.need_retry_regions.len();
    assert_eq!(total_processed, 5, "Should have processed all 5 regions");
}
@@ -1,197 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use common_meta::datanode::RegionManifestInfo;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;

use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat};
use crate::gc::{GcScheduler, GcSchedulerOptions};

/// Configuration Tests
#[tokio::test]
async fn test_different_gc_weights() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);

    let mut region_stat =
        mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB to pass size threshold

    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut region_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });

    // Test with different weights
    let config1 = GcSchedulerOptions {
        sst_count_weight: 2.0,
        file_removed_count_weight: 0.5,
        min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
        ..Default::default()
    };

    let scheduler1 = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: config1,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();

    let candidates1 = scheduler1.select_gc_candidates(&stats).await.unwrap();

    let config2 = GcSchedulerOptions {
        sst_count_weight: 0.5,
        file_removed_count_weight: 2.0,
        min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
        ..Default::default()
    };

    let scheduler2 = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: config2,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = &ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();
    let candidates2 = scheduler2.select_gc_candidates(stats).await.unwrap();

    // Both should select the region but with different scores
    assert_eq!(
        candidates1.len(),
        1,
        "Expected 1 table with candidates for config1, got {}",
        candidates1.len()
    );
    assert_eq!(
        candidates2.len(),
        1,
        "Expected 1 table with candidates for config2, got {}",
        candidates2.len()
    );

    // Verify the region is actually selected
    assert!(
        candidates1.contains_key(&table_id),
        "Config1 should contain table_id {}",
        table_id
    );
    assert!(
        candidates2.contains_key(&table_id),
        "Config2 should contain table_id {}",
        table_id
    );
}

#[tokio::test]
async fn test_regions_per_table_threshold() {
    init_default_ut_logging();

    let table_id = 1;
    let mut region_stats = Vec::new();

    // Create many regions
    for i in 1..=10 {
        let region_id = RegionId::new(table_id, i as u32);
        let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB

        if let RegionManifestInfo::Mito {
            file_removed_cnt, ..
        } = &mut stat.region_manifest
        {
            *file_removed_cnt = 5;
        }

        region_stats.push(stat);
    }

    let table_stats = HashMap::from([(table_id, region_stats)]);

    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });

    let config = GcSchedulerOptions {
        regions_per_table_threshold: 3, // Limit to 3 regions per table
        min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let stats = ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();

    let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();

    assert_eq!(
        candidates.len(),
        1,
        "Expected 1 table with candidates, got {}",
        candidates.len()
    );
    if let Some(table_candidates) = candidates.get(&table_id) {
        // Should be limited to 3 regions
        assert_eq!(
            table_candidates.len(),
            3,
            "Expected 3 candidates for table {}, got {}",
            table_id,
            table_candidates.len()
        );
    } else {
        panic!("Expected table {} to have candidates", table_id);
    }
}
@@ -1,293 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use common_meta::datanode::RegionManifestInfo;
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};

use crate::gc::mock::{
    MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};

/// Error Handling Tests
#[tokio::test]
async fn test_gc_regions_failure_handling() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    // Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate
    let mut region_stat =
        mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut region_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    // Create a context that will return an error for gc_regions
    let mut gc_reports = HashMap::new();
    gc_reports.insert(region_id, GcReport::default());

    // Inject an error for gc_regions method
    let gc_error = crate::error::UnexpectedSnafu {
        violated: "Simulated GC failure for testing".to_string(),
    }
    .build();

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        file_refs: HashMap::from([(region_id, HashSet::from([FileId::random()]))]),
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer)]),
        )]))
        .with_gc_regions_error(gc_error),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    // This should handle the failure gracefully
    let report = scheduler.handle_tick().await.unwrap();

    // Validate the report shows the failure handling
    assert_eq!(
        report.per_datanode_reports.len(),
        1,
        "Should process 1 datanode despite failure"
    );
    assert_eq!(
        report.failed_datanodes.len(),
        0,
        "Should have 0 failed datanodes (failure handled via need_retry_regions)"
    );

    // Check that the region is in need_retry_regions due to the failure
    let datanode_report = report.per_datanode_reports.values().next().unwrap();
    assert_eq!(
        datanode_report.need_retry_regions.len(),
        1,
        "Should have 1 region in need_retry_regions due to failure"
    );
    assert!(
        datanode_report.need_retry_regions.contains(&region_id),
        "Region should be in need_retry_regions"
    );

    // Verify that calls were made despite potential failures
    assert_eq!(
        *ctx.get_table_to_region_stats_calls.lock().unwrap(),
        1,
        "Expected 1 call to get_table_to_region_stats"
    );
    assert!(
        *ctx.get_file_references_calls.lock().unwrap() >= 1,
        "Expected at least 1 call to get_file_references"
    );
    assert!(
        *ctx.gc_regions_calls.lock().unwrap() >= 1,
        "Expected at least 1 call to gc_regions"
    );
}

#[tokio::test]
async fn test_get_file_references_failure() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    // Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate
    let mut region_stat =
        mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut region_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    // Create context with empty file refs (simulating failure)
    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            file_refs: Arc::new(Mutex::new(Some(FileRefsManifest::default()))),
            gc_reports: Arc::new(Mutex::new(HashMap::from([(
                region_id,
                new_empty_report_with([region_id]),
            )]))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer)]),
        )])),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions {
            retry_backoff_duration: Duration::from_millis(10), // shorten for test
            ..Default::default()
        },
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let report = scheduler.handle_tick().await.unwrap();

    // Validate the report shows the expected results
    // In the new implementation, even if get_file_references fails, we still create a datanode report
    assert_eq!(
        report.per_datanode_reports.len(),
        1,
        "Should process 1 datanode"
    );
    assert_eq!(
        report.failed_datanodes.len(),
        0,
        "Should have 0 failed datanodes (failure handled gracefully)"
    );

    // The region should be processed but may have empty results due to file refs failure
    let datanode_report = report.per_datanode_reports.values().next().unwrap();
    // The current implementation still processes the region even with file refs failure
    // and creates an empty entry in deleted_files
    assert!(
        datanode_report.deleted_files.contains_key(&region_id),
        "Should have region in deleted_files (even if empty)"
    );
    assert!(
        datanode_report.deleted_files[&region_id].is_empty(),
        "Should have empty deleted files due to file refs failure"
    );

    // Should still attempt to get file references (may be called multiple times due to retry logic)
    assert!(
        *ctx.get_file_references_calls.lock().unwrap() >= 1,
        "Expected at least 1 call to get_file_references, got {}",
        *ctx.get_file_references_calls.lock().unwrap()
    );
}

#[tokio::test]
async fn test_get_table_route_failure() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);

    // Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate
    let mut region_stat =
        mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut region_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    // Inject an error for get_table_route method to simulate failure
    let route_error = crate::error::UnexpectedSnafu {
        violated: "Simulated table route failure for testing".to_string(),
    }
    .build();

    // Create context with table route error injection
    let ctx = Arc::new(MockSchedulerCtx {
        table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
        ..Default::default()
    });
    ctx.set_table_route_error(route_error);

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    // Get candidates first
    let stats = &ctx
        .table_to_region_stats
        .lock()
        .unwrap()
        .clone()
        .unwrap_or_default();
    let candidates = scheduler.select_gc_candidates(stats).await.unwrap();

    // Convert table-based candidates to datanode-based candidates
    let datanode_to_candidates = HashMap::from([(
        Peer::new(1, ""),
        candidates
            .into_iter()
            .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
            .collect(),
    )]);

    // This should handle table route failure gracefully
    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    // Should process the datanode but handle route error gracefully
    assert_eq!(
        report.per_datanode_reports.len(),
        0,
        "Expected 0 datanode report"
    );
    assert_eq!(
        report.failed_datanodes.len(),
        1,
        "Expected 1 failed datanodes (route error handled gracefully)"
    );
    assert!(
        report.failed_datanodes.contains_key(&1),
        "Failed datanodes should contain the datanode with route error"
    );
}
@@ -1,272 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};

use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat};
use crate::gc::{GcScheduler, GcSchedulerOptions};

// Full File Listing Tests

#[tokio::test]
async fn test_full_file_listing_first_time_gc() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    let gc_report = GcReport {
        deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
        ..Default::default()
    };

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer.clone())]),
        )])),
    );

    // Configure short full file listing interval for testing
    let config = GcSchedulerOptions {
        full_file_listing_interval: Duration::from_secs(3600), // 1 hour
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    // First GC - should use full listing since region has never been GC'd
    let reports = scheduler
        .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
        .await
        .unwrap();

    assert_eq!(reports.deleted_files.len(), 1);

    // Verify that full listing was used by checking the tracker
    let tracker = scheduler.region_gc_tracker.lock().await;
    let gc_info = tracker
        .get(&region_id)
        .expect("Region should be in tracker");
    assert!(
        gc_info.last_full_listing_time.is_some(),
        "First GC should use full listing"
    );
}

#[tokio::test]
async fn test_full_file_listing_interval_enforcement() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    let gc_report = GcReport {
        deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
        ..Default::default()
    };

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer.clone())]),
        )])),
    );

    // Configure very short full file listing interval for testing
    let config = GcSchedulerOptions {
        full_file_listing_interval: Duration::from_millis(100), // 100ms
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    // First GC - should use full listing
    let reports1 = scheduler
        .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
        .await
        .unwrap();
    assert_eq!(reports1.deleted_files.len(), 1);

    // Get the first full listing time
    let first_full_listing_time = {
        let tracker = scheduler.region_gc_tracker.lock().await;
        let gc_info = tracker
            .get(&region_id)
            .expect("Region should be in tracker");
        gc_info
            .last_full_listing_time
            .expect("Should have full listing time")
    };

    // Wait for interval to pass
    tokio::time::sleep(Duration::from_millis(150)).await;

    // Second GC - should use full listing again since interval has passed
    let _reports2 = scheduler
        .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
        .await
        .unwrap();

    // Verify that full listing was used again
    let tracker = scheduler.region_gc_tracker.lock().await;
    let gc_info = tracker
        .get(&region_id)
        .expect("Region should be in tracker");
    let second_full_listing_time = gc_info
        .last_full_listing_time
        .expect("Should have full listing time");

    assert!(
        second_full_listing_time > first_full_listing_time,
        "Second GC should update full listing time"
    );
}

#[tokio::test]
async fn test_full_file_listing_no_interval_passed() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    let gc_report = GcReport {
        deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
        ..Default::default()
    };

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer.clone())]),
        )])),
    );

    // Configure long full file listing interval
    let config = GcSchedulerOptions {
        full_file_listing_interval: Duration::from_secs(3600), // 1 hour
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    // First GC - should use full listing
    let reports1 = scheduler
        .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
        .await
        .unwrap();
    assert_eq!(reports1.deleted_files.len(), 1);

    // Get the first full listing time
    let first_full_listing_time = {
        let tracker = scheduler.region_gc_tracker.lock().await;
        let gc_info = tracker
            .get(&region_id)
            .expect("Region should be in tracker");
        gc_info
            .last_full_listing_time
            .expect("Should have full listing time")
    };

    // Second GC immediately - should NOT use full listing since interval hasn't passed
    let reports2 = scheduler
        .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
        .await
        .unwrap();
    assert_eq!(reports2.deleted_files.len(), 1);

    // Verify that full listing time was NOT updated
    let tracker = scheduler.region_gc_tracker.lock().await;
    let gc_info = tracker
        .get(&region_id)
        .expect("Region should be in tracker");
    let second_full_listing_time = gc_info
        .last_full_listing_time
        .expect("Should have full listing time");

    assert_eq!(
        second_full_listing_time, first_full_listing_time,
        "Second GC should not update full listing time when interval hasn't passed"
    );
}
@@ -1,252 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use common_meta::datanode::RegionManifestInfo;
use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::region_engine::RegionRole;
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};

use crate::gc::mock::{
    MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
};
use crate::gc::{GcScheduler, GcSchedulerOptions};

// Integration Flow Tests

#[tokio::test]
async fn test_full_gc_workflow() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    let mut region_stat =
        mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB

    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut region_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    let mut gc_reports = HashMap::new();
    gc_reports.insert(
        region_id,
        GcReport {
            deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
            ..Default::default()
        },
    );

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer)]),
        )])),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    // Run the full workflow
    let report = scheduler.handle_tick().await.unwrap();

    // Validate the returned GcJobReport - should have 1 datanode report
    assert_eq!(
        report.per_datanode_reports.len(),
        1,
        "Should process 1 datanode"
    );
    assert_eq!(
        report.failed_datanodes.len(),
        0,
        "Should have no failed datanodes"
    );

    // Get the datanode report
    let datanode_report = report.per_datanode_reports.values().next().unwrap();

    // Check that the region was processed successfully
    assert!(
        datanode_report.deleted_files.contains_key(&region_id),
        "Should have deleted files for region"
    );
    assert_eq!(
        datanode_report.deleted_files[&region_id].len(),
        2,
        "Should have 2 deleted files"
    );
    assert!(
        datanode_report.need_retry_regions.is_empty(),
        "Should have no retry regions"
    );

    // Verify all steps were executed
    assert_eq!(
        *ctx.get_table_to_region_stats_calls.lock().unwrap(),
        1,
        "Expected 1 call to get_table_to_region_stats"
    );
    assert_eq!(
        *ctx.get_file_references_calls.lock().unwrap(),
        1,
        "Expected 1 call to get_file_references"
    );
    assert_eq!(
        *ctx.gc_regions_calls.lock().unwrap(),
        1,
        "Expected 1 call to gc_regions"
    );
}

#[tokio::test]
async fn test_tracker_cleanup() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");

    // Create region stat with proper file_removed_cnt to ensure it gets selected as candidate
    let mut region_stat =
        mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
    if let RegionManifestInfo::Mito {
        file_removed_cnt, ..
    } = &mut region_stat.region_manifest
    {
        *file_removed_cnt = 5;
    }

    let table_stats = HashMap::from([(table_id, vec![region_stat])]);

    let mut gc_reports = HashMap::new();
    gc_reports.insert(region_id, new_empty_report_with([region_id]));

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region_id, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer)]),
        )])),
    );

    let old_region_gc_tracker = {
        let mut tracker = HashMap::new();
        tracker.insert(
            region_id,
            crate::gc::tracker::RegionGcInfo {
                last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago
                last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago
            },
        );
        // also insert a different table that should also be cleaned up
        tracker.insert(
            RegionId::new(2, 1),
            crate::gc::tracker::RegionGcInfo {
                last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago
                last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago
            },
        );
        tracker
    };

    // Use a custom config with shorter cleanup interval to trigger cleanup
    let config = GcSchedulerOptions {
        // 30 minutes
        tracker_cleanup_interval: Duration::from_secs(1800),
        ..Default::default()
    };

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config,
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(old_region_gc_tracker)),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(
            Instant::now() - Duration::from_secs(3600), // Old cleanup time (1 hour ago)
        )),
    };

    let report = scheduler.handle_tick().await.unwrap();

    // Validate the returned GcJobReport - should have 1 datanode report
    assert_eq!(
        report.per_datanode_reports.len(),
        1,
        "Should process 1 datanode"
    );
    assert_eq!(
        report.failed_datanodes.len(),
        0,
        "Should have no failed datanodes"
    );

    // Get the datanode report
    let datanode_report = report.per_datanode_reports.values().next().unwrap();

    // Check that the region was processed successfully
    assert!(
        datanode_report.deleted_files.contains_key(&region_id),
        "Should have deleted files for region"
    );
    assert!(
        datanode_report.need_retry_regions.is_empty(),
        "Should have no retry regions"
    );

    // Verify tracker was updated
    let tracker = scheduler.region_gc_tracker.lock().await;
    assert!(
        tracker.contains_key(&region_id),
        "Tracker should contain region {}",
        region_id
    );
    // only one entry
    assert_eq!(tracker.len(), 1, "Tracker should only have 1 entry");
}

@@ -1,155 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use common_meta::peer::Peer;
use common_telemetry::init_default_ut_logging;
use store_api::storage::{FileRefsManifest, GcReport, RegionId};

use crate::gc::mock::{MockSchedulerCtx, new_candidate};
use crate::gc::{GcScheduler, GcSchedulerOptions};

/// Edge Case Tests

#[tokio::test]
async fn test_empty_file_refs_manifest() {
    init_default_ut_logging();

    let table_id = 1;
    let region_id = RegionId::new(table_id, 1);
    let peer = Peer::new(1, "");
    let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);

    // Empty file refs manifest
    let file_refs = FileRefsManifest::default();

    let ctx = Arc::new(
        MockSchedulerCtx {
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            candidates: Arc::new(Mutex::new(Some(candidates))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (table_id, vec![(region_id, peer)]),
        )])),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();

    // Convert table-based candidates to datanode-based candidates
    let peer = Peer::new(1, "");
    let datanode_to_candidates = HashMap::from([(
        peer,
        candidates
            .into_iter()
            .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
            .collect(),
    )]);

    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    assert_eq!(report.per_datanode_reports.len(), 1);
    assert_eq!(report.failed_datanodes.len(), 0);
    // Should handle empty file refs gracefully
}

#[tokio::test]
async fn test_multiple_regions_per_table() {
    init_default_ut_logging();

    let table_id = 1;
    let region1 = RegionId::new(table_id, 1);
    let region2 = RegionId::new(table_id, 2);
    let region3 = RegionId::new(table_id, 3);
    let peer = Peer::new(1, "");

    let candidates = HashMap::from([(
        table_id,
        vec![
            new_candidate(region1, 1.0),
            new_candidate(region2, 2.0),
            new_candidate(region3, 3.0),
        ],
    )]);

    let mut gc_reports = HashMap::new();
    gc_reports.insert(region1, GcReport::default());
    gc_reports.insert(region2, GcReport::default());
    gc_reports.insert(region3, GcReport::default());

    let file_refs = FileRefsManifest {
        manifest_version: HashMap::from([(region1, 1), (region2, 1), (region3, 1)]),
        ..Default::default()
    };

    let ctx = Arc::new(
        MockSchedulerCtx {
            gc_reports: Arc::new(Mutex::new(gc_reports)),
            file_refs: Arc::new(Mutex::new(Some(file_refs))),
            candidates: Arc::new(Mutex::new(Some(candidates))),
            ..Default::default()
        }
        .with_table_routes(HashMap::from([(
            table_id,
            (
                table_id,
                vec![
                    (region1, peer.clone()),
                    (region2, peer.clone()),
                    (region3, peer.clone()),
                ],
            ),
        )])),
    );

    let scheduler = GcScheduler {
        ctx: ctx.clone(),
        receiver: GcScheduler::channel().1,
        config: GcSchedulerOptions::default(),
        region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
        last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
    };

    let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();

    // Convert table-based candidates to datanode-based candidates
    let datanode_to_candidates = HashMap::from([(
        peer.clone(),
        candidates
            .into_iter()
            .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
            .collect(),
    )]);

    let report = scheduler
        .parallel_process_datanodes(datanode_to_candidates)
        .await;

    assert_eq!(report.per_datanode_reports.len(), 1);
    assert_eq!(report.failed_datanodes.len(), 0);
}

@@ -50,7 +50,7 @@ impl GcScheduler {
        let now = Instant::now();

        // Check if enough time has passed since last cleanup
        if now.saturating_duration_since(last_cleanup) < self.config.tracker_cleanup_interval {
        if now.duration_since(last_cleanup) < self.config.tracker_cleanup_interval {
            return Ok(());
        }

@@ -14,7 +14,6 @@ async-stream.workspace = true
async-trait.workspace = true
base64.workspace = true
bytes.workspace = true
fxhash = "0.2"
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
@@ -32,6 +31,7 @@ lazy_static = "1.4"
mito-codec.workspace = true
mito2.workspace = true
moka.workspace = true
mur3 = "0.1"
object-store.workspace = true
prometheus.workspace = true
serde.workspace = true
@@ -47,12 +47,6 @@ common-meta = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
mito2 = { workspace = true, features = ["test"] }
common-wal = { workspace = true }
criterion = { version = "0.4", features = ["async", "async_tokio"] }
mur3 = "0.1"

[[bench]]
name = "bench_tsid_generator"
harness = false

[package.metadata.cargo-udeps.ignore]
normal = ["aquamarine"]

@@ -1,273 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::hash::Hasher;

use criterion::{Criterion, black_box, criterion_group, criterion_main};
use fxhash::FxHasher;
use mur3::Hasher128;

// A random number (from original implementation)
const TSID_HASH_SEED: u32 = 846793005;

/// Original TSID generator using mur3::Hasher128
/// Hashes both label name and value for each label pair
struct OriginalTsidGenerator {
    hasher: Hasher128,
}

impl OriginalTsidGenerator {
    fn new() -> Self {
        Self {
            hasher: Hasher128::with_seed(TSID_HASH_SEED),
        }
    }

    /// Writes a label pair (name and value) to the generator.
    fn write_label(&mut self, name: &str, value: &str) {
        use std::hash::Hash;
        name.hash(&mut self.hasher);
        value.hash(&mut self.hasher);
    }

    /// Generates a new TSID.
    fn finish(&mut self) -> u64 {
        // TSID is 64 bits, simply truncate the 128 bits hash
        let (hash, _) = self.hasher.finish128();
        hash
    }
}

/// Current TSID generator using fxhash::FxHasher
/// Fast path: pre-computes label name hash, only hashes values
struct CurrentTsidGenerator {
    hasher: FxHasher,
}

impl CurrentTsidGenerator {
    fn new() -> Self {
        Self {
            hasher: FxHasher::default(),
        }
    }

    fn new_with_label_name_hash(label_name_hash: u64) -> Self {
        let mut hasher = FxHasher::default();
        hasher.write_u64(label_name_hash);
        Self { hasher }
    }

    /// Writes a label value to the generator.
    fn write_str(&mut self, value: &str) {
        self.hasher.write(value.as_bytes());
        self.hasher.write_u8(0xff);
    }

    /// Generates a new TSID.
    fn finish(&mut self) -> u64 {
        self.hasher.finish()
    }
}

/// Pre-computes label name hash (used in fast path)
fn compute_label_name_hash(labels: &[(&str, &str)]) -> u64 {
    let mut hasher = FxHasher::default();
    for (name, _) in labels {
        hasher.write(name.as_bytes());
        hasher.write_u8(0xff);
    }
    hasher.finish()
}

fn bench_tsid_generator_small(c: &mut Criterion) {
    let labels = vec![("namespace", "greptimedb"), ("host", "127.0.0.1")];

    let mut group = c.benchmark_group("tsid_generator_small_2_labels");
    group.bench_function("original_mur3", |b| {
        b.iter(|| {
            let mut tsid_gen = OriginalTsidGenerator::new();
            for (name, value) in &labels {
                tsid_gen.write_label(black_box(name), black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    let label_name_hash = compute_label_name_hash(&labels);
    group.bench_function("current_fxhash_fast_path", |b| {
        b.iter(|| {
            let mut tsid_gen =
                CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
            for (_, value) in &labels {
                tsid_gen.write_str(black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    group.finish();
}

fn bench_tsid_generator_medium(c: &mut Criterion) {
    let labels = vec![
        ("namespace", "greptimedb"),
        ("host", "127.0.0.1"),
        ("region", "us-west-2"),
        ("env", "production"),
        ("service", "api"),
    ];

    let mut group = c.benchmark_group("tsid_generator_medium_5_labels");
    group.bench_function("original_mur3", |b| {
        b.iter(|| {
            let mut tsid_gen = OriginalTsidGenerator::new();
            for (name, value) in &labels {
                tsid_gen.write_label(black_box(name), black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    let label_name_hash = compute_label_name_hash(&labels);
    group.bench_function("current_fxhash_fast_path", |b| {
        b.iter(|| {
            let mut tsid_gen =
                CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
            for (_, value) in &labels {
                tsid_gen.write_str(black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    group.finish();
}

fn bench_tsid_generator_large(c: &mut Criterion) {
    let labels = vec![
        ("namespace", "greptimedb"),
        ("host", "127.0.0.1"),
        ("region", "us-west-2"),
        ("env", "production"),
        ("service", "api"),
        ("version", "v1.0.0"),
        ("cluster", "cluster-1"),
        ("dc", "dc1"),
        ("rack", "rack-1"),
        ("pod", "pod-123"),
    ];

    let mut group = c.benchmark_group("tsid_generator_large_10_labels");
    group.bench_function("original_mur3", |b| {
        b.iter(|| {
            let mut tsid_gen = OriginalTsidGenerator::new();
            for (name, value) in &labels {
                tsid_gen.write_label(black_box(name), black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    let label_name_hash = compute_label_name_hash(&labels);
    group.bench_function("current_fxhash_fast_path", |b| {
        b.iter(|| {
            let mut tsid_gen =
                CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
            for (_, value) in &labels {
                tsid_gen.write_str(black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    group.finish();
}

fn bench_tsid_generator_slow_path(c: &mut Criterion) {
    // Simulate slow path: some labels have null values (empty strings)
    let labels_with_nulls = vec![
        ("namespace", "greptimedb"),
        ("host", "127.0.0.1"),
        ("region", ""), // null
        ("env", "production"),
    ];

    let labels_all_non_null = vec![
        ("namespace", "greptimedb"),
        ("host", "127.0.0.1"),
        ("env", "production"),
    ];

    let mut group = c.benchmark_group("tsid_generator_slow_path_with_nulls");

    // Original: always hashes name and value
    group.bench_function("original_mur3_with_nulls", |b| {
        b.iter(|| {
            let mut tsid_gen = OriginalTsidGenerator::new();
            for (name, value) in &labels_with_nulls {
                if !value.is_empty() {
                    tsid_gen.write_label(black_box(name), black_box(value));
                }
            }
            black_box(tsid_gen.finish())
        })
    });

    // Current slow path: recomputes label name hash
    group.bench_function("current_fxhash_slow_path", |b| {
        b.iter(|| {
            // Step 1: Compute label name hash for non-null labels
            let mut name_hasher = CurrentTsidGenerator::new();
            for (name, value) in &labels_with_nulls {
                if !value.is_empty() {
                    name_hasher.write_str(black_box(name));
                }
            }
            let label_name_hash = name_hasher.finish();

            // Step 2: Use label name hash and hash values
            let mut tsid_gen = CurrentTsidGenerator::new_with_label_name_hash(label_name_hash);
            for (_, value) in &labels_with_nulls {
                if !value.is_empty() {
                    tsid_gen.write_str(black_box(value));
                }
            }
            black_box(tsid_gen.finish())
        })
    });

    // Current fast path: pre-computed (for comparison)
    let label_name_hash = compute_label_name_hash(&labels_all_non_null);
    group.bench_function("current_fxhash_fast_path_no_nulls", |b| {
        b.iter(|| {
            let mut tsid_gen =
                CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
            for (_, value) in &labels_all_non_null {
                tsid_gen.write_str(black_box(value));
            }
            black_box(tsid_gen.finish())
        })
    });

    group.finish();
}

criterion_group!(
    benches,
    bench_tsid_generator_small,
    bench_tsid_generator_medium,
    bench_tsid_generator_large,
    bench_tsid_generator_slow_path
);
criterion_main!(benches);

@@ -119,7 +119,7 @@ mod tests {
|
||||
.index_file_path
|
||||
.map(|path| path.replace(&e.file_id, "<file_id>"));
|
||||
e.file_id = "<file_id>".to_string();
|
||||
e.index_version = 0;
|
||||
e.index_file_id = e.index_file_id.map(|_| "<index_file_id>".to_string());
|
||||
format!("\n{:?}", e)
|
||||
})
|
||||
.sorted()
|
||||
@@ -128,12 +128,12 @@ mod tests {
|
||||
assert_eq!(
|
||||
debug_format,
|
||||
r#"
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#,
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", index_file_id: None, level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
|
||||
);
|
||||
// list from storage
|
||||
let storage_entries = mito
|
||||
|
||||
@@ -272,15 +272,15 @@ mod tests {
|
||||
.unwrap();
|
||||
let batches = RecordBatches::try_collect(stream).await.unwrap();
|
||||
let expected = "\
|
||||
+-------------------------+----------------+------------+---------------------+-------+
|
||||
| greptime_timestamp | greptime_value | __table_id | __tsid | job |
|
||||
+-------------------------+----------------+------------+---------------------+-------+
|
||||
| 1970-01-01T00:00:00 | 0.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.001 | 1.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.002 | 2.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.003 | 3.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.004 | 4.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
+-------------------------+----------------+------------+---------------------+-------+";
|
||||
+-------------------------+----------------+------------+----------------------+-------+
|
||||
| greptime_timestamp | greptime_value | __table_id | __tsid | job |
|
||||
+-------------------------+----------------+------------+----------------------+-------+
|
||||
| 1970-01-01T00:00:00 | 0.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.001 | 1.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.002 | 2.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.003 | 3.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.004 | 4.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
+-------------------------+----------------+------------+----------------------+-------+";
|
||||
assert_eq!(expected, batches.pretty_print().unwrap(), "physical region");
|
||||
|
||||
// read data from logical region
|
||||
|
||||
@@ -13,12 +13,11 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::hash::Hasher;
|
||||
use std::hash::Hash;
|
||||
|
||||
use api::v1::value::ValueData;
|
||||
use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
|
||||
use datatypes::value::ValueRef;
|
||||
use fxhash::FxHasher;
|
||||
use mito_codec::row_converter::SparsePrimaryKeyCodec;
|
||||
use smallvec::SmallVec;
|
||||
use snafu::ResultExt;
|
||||
@@ -31,6 +30,9 @@ use store_api::storage::{ColumnId, TableId};
|
||||
|
||||
use crate::error::{EncodePrimaryKeySnafu, Result};
|
||||
|
||||
// A random number
|
||||
const TSID_HASH_SEED: u32 = 846793005;
|
||||
|
||||
/// A row modifier modifies [`Rows`].
|
||||
///
|
||||
/// - For [`PrimaryKeyEncoding::Sparse`] encoding,
|
||||
@@ -73,7 +75,6 @@ impl RowModifier {
|
||||
let num_output_column = num_column - num_primary_key_column + 1;
|
||||
|
||||
let mut buffer = vec![];
|
||||
|
||||
for mut iter in iter.iter_mut() {
|
||||
let (table_id, tsid) = Self::fill_internal_columns(table_id, &iter);
|
||||
let mut values = Vec::with_capacity(num_output_column);
|
||||
@@ -146,72 +147,47 @@ impl RowModifier {
|
||||
|
||||
/// Fills internal columns of a row with table name and a hash of tag values.
|
||||
pub fn fill_internal_columns(table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
|
||||
let ts_id = if !iter.has_null_labels() {
|
||||
// No null labels in row, we can safely reuse the precomputed label name hash.
|
||||
let mut ts_id_gen = TsidGenerator::new(iter.index.label_name_hash);
|
||||
for (_, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(string)) = &value.value_data {
|
||||
ts_id_gen.write_str(string);
|
||||
} else {
|
||||
unreachable!(
|
||||
"Should not contain null or non-string value: {:?}, table id: {}",
|
||||
value, table_id
|
||||
);
|
||||
}
|
||||
let mut hasher = TsidGenerator::default();
|
||||
for (name, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(string)) = &value.value_data {
|
||||
hasher.write_label(name, string);
|
||||
}
|
||||
ts_id_gen.finish()
|
||||
} else {
|
||||
// Slow path: row contains null, recompute label hash
|
||||
let mut hasher = TsidGenerator::default();
|
||||
// 1. Find out label names with non-null values and get the hash.
|
||||
for (name, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(_)) = &value.value_data {
|
||||
hasher.write_str(name);
|
||||
}
|
||||
}
|
||||
let label_name_hash = hasher.finish();
|
||||
|
||||
// 2. Use label name hash as seed and continue with label values.
|
||||
let mut final_hasher = TsidGenerator::new(label_name_hash);
|
||||
for (_, value) in iter.primary_keys_with_name() {
|
||||
if let Some(ValueData::StringValue(value)) = &value.value_data {
|
||||
final_hasher.write_str(value);
|
||||
}
|
||||
}
|
||||
final_hasher.finish()
|
||||
};
|
||||
}
|
||||
let hash = hasher.finish();
|
||||
|
||||
(
|
||||
ValueData::U32Value(table_id).into(),
|
||||
ValueData::U64Value(ts_id).into(),
|
||||
ValueData::U64Value(hash).into(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Tsid generator.
|
||||
#[derive(Default)]
|
||||
pub struct TsidGenerator {
|
||||
hasher: FxHasher,
|
||||
hasher: mur3::Hasher128,
|
||||
}
|
||||
|
||||
impl Default for TsidGenerator {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TsidGenerator {
|
||||
pub fn new(label_name_hash: u64) -> Self {
|
||||
let mut hasher = FxHasher::default();
|
||||
hasher.write_u64(label_name_hash);
|
||||
Self { hasher }
|
||||
}
|
||||
|
||||
/// Writes a label pair to the generator.
|
||||
pub fn write_str(&mut self, value: &str) {
|
||||
self.hasher.write(value.as_bytes());
|
||||
self.hasher.write_u8(0xff);
|
||||
pub fn write_label(&mut self, name: &str, value: &str) {
|
||||
name.hash(&mut self.hasher);
|
||||
value.hash(&mut self.hasher);
|
||||
}
|
||||
|
||||
/// Generates a new TSID.
|
||||
pub fn finish(&mut self) -> u64 {
|
||||
self.hasher.finish()
|
||||
// TSID is 64 bits, simply truncate the 128 bits hash
|
||||
let (hash, _) = self.hasher.finish128();
|
||||
hash
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,8 +202,6 @@ struct ValueIndex {
|
||||
struct IterIndex {
|
||||
indices: Vec<ValueIndex>,
|
||||
num_primary_key_column: usize,
|
||||
/// Precomputed hash for label names.
|
||||
label_name_hash: u64,
|
||||
}
|
||||
|
||||
impl IterIndex {
|
||||
@@ -278,22 +252,15 @@ impl IterIndex {
|
||||
}
|
||||
}
|
||||
let num_primary_key_column = primary_key_indices.len() + reserved_indices.len();
|
||||
let mut indices = Vec::with_capacity(num_primary_key_column + 2);
|
||||
indices.extend(reserved_indices);
|
||||
let mut label_name_hasher = TsidGenerator::default();
|
||||
for (pk_name, pk_index) in primary_key_indices {
|
||||
// primary_key_indices already sorted.
|
||||
label_name_hasher.write_str(pk_name);
|
||||
indices.push(pk_index);
|
||||
}
|
||||
let label_name_hash = label_name_hasher.finish();
|
||||
|
||||
indices.extend(ts_index);
|
||||
indices.extend(field_indices);
|
||||
let indices = reserved_indices
|
||||
.into_iter()
|
||||
.chain(primary_key_indices.values().cloned())
|
||||
.chain(ts_index)
|
||||
.chain(field_indices)
|
||||
.collect();
|
||||
IterIndex {
|
||||
indices,
|
||||
num_primary_key_column,
|
||||
label_name_hash,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -347,13 +314,6 @@ impl RowIter<'_> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns true if any label in current row is null.
|
||||
fn has_null_labels(&self) -> bool {
|
||||
self.index.indices[..self.index.num_primary_key_column]
|
||||
.iter()
|
||||
.any(|idx| self.row.values[idx.index].value_data.is_none())
|
||||
}
|
||||
|
||||
/// Returns the primary keys.
|
||||
pub fn primary_keys(&self) -> impl Iterator<Item = (ColumnId, ValueRef<'_>)> {
|
||||
self.index.indices[..self.index.num_primary_key_column]
|
||||
@@ -439,9 +399,9 @@ mod tests {
|
||||
let result = encoder.modify_rows_sparse(rows_iter, table_id).unwrap();
|
||||
assert_eq!(result.rows[0].values.len(), 1);
|
||||
let encoded_primary_key = vec![
|
||||
128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 37, 196, 242, 181, 117, 224, 7, 137, 0,
|
||||
0, 0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
|
||||
128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 131, 9, 166, 190, 173, 37, 39, 240, 0, 0,
|
||||
0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
|
||||
1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
|
||||
];
|
||||
assert_eq!(
|
||||
result.rows[0].values[0],
|
||||
@@ -517,7 +477,7 @@ mod tests {
|
||||
assert_eq!(result.rows[0].values[2], ValueData::U32Value(1025).into());
|
||||
assert_eq!(
|
||||
result.rows[0].values[3],
|
||||
ValueData::U64Value(2721566936019240841).into()
|
||||
ValueData::U64Value(9442261431637846000).into()
|
||||
);
|
||||
assert_eq!(result.schema, expected_dense_schema());
|
||||
}
|
||||
@@ -536,7 +496,7 @@ mod tests {
|
||||
let row_iter = rows_iter.iter_mut().next().unwrap();
|
||||
let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter);
|
||||
assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
|
||||
assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
|
||||
assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into());
|
||||
|
||||
// Change the column order
|
||||
let schema = vec![
|
||||
@@ -564,264 +524,6 @@ mod tests {
|
||||
let row_iter = rows_iter.iter_mut().next().unwrap();
|
||||
let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter);
|
||||
assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
|
||||
assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
|
||||
}
|
||||
|
||||
/// Helper function to create a schema with multiple label columns
|
||||
fn create_multi_label_schema(labels: &[&str]) -> Vec<ColumnSchema> {
|
||||
labels
|
||||
.iter()
|
||||
.map(|name| ColumnSchema {
|
||||
column_name: name.to_string(),
|
||||
datatype: ColumnDataType::String as i32,
|
||||
semantic_type: SemanticType::Tag as _,
|
||||
datatype_extension: None,
|
||||
options: None,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Helper function to create a name_to_column_id map
|
||||
fn create_name_to_column_id(labels: &[&str]) -> HashMap<String, ColumnId> {
|
||||
labels
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, name)| (name.to_string(), idx as ColumnId + 1))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Helper function to create a row with string values
|
||||
fn create_row_with_values(values: &[&str]) -> Row {
|
||||
Row {
|
||||
values: values
|
||||
.iter()
|
||||
.map(|v| ValueData::StringValue(v.to_string()).into())
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a row with some null values
|
||||
fn create_row_with_nulls(values: &[Option<&str>]) -> Row {
|
||||
Row {
|
||||
values: values
|
||||
.iter()
|
||||
.map(|v| {
|
||||
v.map(|s| ValueData::StringValue(s.to_string()).into())
|
||||
.unwrap_or(Value { value_data: None })
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to extract TSID from a row
|
||||
fn extract_tsid(
|
||||
schema: Vec<ColumnSchema>,
|
||||
row: Row,
|
||||
name_to_column_id: &HashMap<String, ColumnId>,
|
||||
table_id: TableId,
|
||||
) -> u64 {
|
||||
let rows = Rows {
|
||||
schema,
|
||||
rows: vec![row],
|
||||
};
|
||||
let mut rows_iter = RowsIter::new(rows, name_to_column_id);
|
||||
let row_iter = rows_iter.iter_mut().next().unwrap();
|
||||
let (_, tsid_value) = RowModifier::fill_internal_columns(table_id, &row_iter);
|
||||
match tsid_value.value_data {
|
||||
Some(ValueData::U64Value(tsid)) => tsid,
|
||||
_ => panic!("Expected U64Value for TSID"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_same_for_different_label_orders() {
|
||||
// Test that rows with the same label name-value pairs but in different orders
|
||||
// produce the same TSID
|
||||
let table_id = 1025;
|
||||
|
||||
// Schema 1: a, b, c
|
||||
let schema1 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row1 = create_row_with_values(&["A", "B", "C"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Schema 2: b, a, c (different order)
|
||||
let schema2 = create_multi_label_schema(&["b", "a", "c"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row2 = create_row_with_values(&["B", "A", "C"]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
// Schema 3: c, b, a (another different order)
|
||||
let schema3 = create_multi_label_schema(&["c", "b", "a"]);
|
||||
let name_to_column_id3 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row3 = create_row_with_values(&["C", "B", "A"]);
|
||||
let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
|
||||
|
||||
// All should have the same TSID since label names are sorted lexicographically
|
||||
// and we're using the same label name-value pairs
|
||||
assert_eq!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be same for different column orders"
|
||||
);
|
||||
assert_eq!(
|
||||
tsid2, tsid3,
|
||||
"TSID should be same for different column orders"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_same_with_null_labels() {
|
||||
// Test that rows that differ only by null label values produce the same TSID
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B (no nulls, fast path)
|
||||
let schema1 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
|
||||
let row1 = create_row_with_values(&["A", "B"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=B, c=null (has null, slow path)
|
||||
let schema2 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
// Both should have the same TSID since null labels are ignored
|
||||
assert_eq!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be same when only difference is null label values"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_same_with_multiple_null_labels() {
|
||||
// Test with multiple null labels
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B (no nulls)
|
||||
let schema1 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
|
||||
let row1 = create_row_with_values(&["A", "B"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=B, c=null, d=null (multiple nulls)
|
||||
let schema2 = create_multi_label_schema(&["a", "b", "c", "d"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c", "d"]);
|
||||
let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None, None]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
assert_eq!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be same when only difference is multiple null label values"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_different_with_different_non_null_values() {
|
||||
// Test that rows with different non-null values produce different TSIDs
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B
|
||||
let schema1 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
|
||||
let row1 = create_row_with_values(&["A", "B"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=C (different value for b)
|
||||
let schema2 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b"]);
|
||||
let row2 = create_row_with_values(&["A", "C"]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
assert_ne!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be different when label values differ"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_fast_path_vs_slow_path_consistency() {
|
||||
// Test that fast path (no nulls) and slow path (with nulls) produce
|
||||
// the same TSID for the same non-null label values
|
||||
let table_id = 1025;
|
||||
|
||||
// Fast path: a=A, b=B (no nulls)
|
||||
let schema_fast = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id_fast = create_name_to_column_id(&["a", "b"]);
|
||||
let row_fast = create_row_with_values(&["A", "B"]);
|
||||
let tsid_fast = extract_tsid(schema_fast, row_fast, &name_to_column_id_fast, table_id);
|
||||
|
||||
// Slow path: a=A, b=B, c=null (has null, triggers slow path)
|
||||
let schema_slow = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id_slow = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row_slow = create_row_with_nulls(&[Some("A"), Some("B"), None]);
|
||||
let tsid_slow = extract_tsid(schema_slow, row_slow, &name_to_column_id_slow, table_id);
|
||||
|
||||
assert_eq!(
|
||||
tsid_fast, tsid_slow,
|
||||
"Fast path and slow path should produce same TSID for same non-null values"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_with_null_in_middle() {
|
||||
// Test with null in the middle of labels
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B, c=C
|
||||
let schema1 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row1 = create_row_with_values(&["A", "B", "C"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=null, c=C (null in middle)
|
||||
let schema2 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row2 = create_row_with_nulls(&[Some("A"), None, Some("C")]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
    // Row2's null b is ignored, so row2 is effectively a=A, c=C,
    // while row1 is a=A, b=B, c=C; the TSIDs should therefore differ.
|
||||
assert_ne!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be different when a non-null value becomes null"
|
||||
);
|
||||
|
||||
// Row 3: a=A, c=C (no b at all, equivalent to row2)
|
||||
let schema3 = create_multi_label_schema(&["a", "c"]);
|
||||
let name_to_column_id3 = create_name_to_column_id(&["a", "c"]);
|
||||
let row3 = create_row_with_values(&["A", "C"]);
|
||||
let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
|
||||
|
||||
// Row2 (a=A, b=null, c=C) should be same as row3 (a=A, c=C)
|
||||
assert_eq!(
|
||||
tsid2, tsid3,
|
||||
"TSID should be same when null label is ignored"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_all_null_labels() {
|
||||
// Test with all labels being null
|
||||
let table_id = 1025;
|
||||
|
||||
// Row with all nulls
|
||||
let schema = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row = create_row_with_nulls(&[None, None, None]);
|
||||
let tsid = extract_tsid(schema.clone(), row, &name_to_column_id, table_id);
|
||||
|
||||
// Should still produce a TSID (based on label names only when all values are null)
|
||||
// This tests that the slow path handles the case where all values are null
|
||||
// The TSID will be based on the label name hash only
|
||||
// Test that it's consistent - same schema with all nulls should produce same TSID
|
||||
let row2 = create_row_with_nulls(&[None, None, None]);
|
||||
let tsid2 = extract_tsid(schema, row2, &name_to_column_id, table_id);
|
||||
assert_eq!(
|
||||
tsid, tsid2,
|
||||
"TSID should be consistent when all label values are null"
|
||||
);
|
||||
assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ use crate::error::{CleanDirSnafu, DeleteIndexSnafu, DeleteSstSnafu, OpenDalSnafu
use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED};
use crate::read::{FlatSource, Source};
use crate::region::options::IndexOptions;
use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId};
use crate::sst::file::{FileHandle, RegionFileId};
use crate::sst::index::IndexerBuilderImpl;
use crate::sst::index::intermediate::IntermediateManager;
use crate::sst::index::puffin_manager::{PuffinManagerFactory, SstPuffinManager};
@@ -216,7 +216,7 @@ impl AccessLayer {
    pub(crate) async fn delete_sst(
        &self,
        region_file_id: &RegionFileId,
        index_file_id: &RegionIndexId,
        index_file_id: &RegionFileId,
    ) -> Result<()> {
        let path = location::sst_file_path(&self.table_dir, *region_file_id, self.path_type);
        self.object_store
@@ -226,22 +226,12 @@ impl AccessLayer {
                file_id: region_file_id.file_id(),
            })?;

        // Delete all versions of the index file.
        for version in 0..=index_file_id.version {
            self.delete_index(&RegionIndexId::new(index_file_id.file_id, version))
                .await?;
        }

        Ok(())
    }

    pub(crate) async fn delete_index(&self, region_index_id: &RegionIndexId) -> Result<()> {
        let path = location::index_file_path(&self.table_dir, *region_index_id, self.path_type);
        let path = location::index_file_path(&self.table_dir, *index_file_id, self.path_type);
        self.object_store
            .delete(&path)
            .await
            .context(DeleteIndexSnafu {
                file_id: region_index_id.file_id(),
                file_id: region_file_id.file_id(),
            })?;

        Ok(())
@@ -301,7 +291,6 @@ impl AccessLayer {
            puffin_manager: self
                .puffin_manager_factory
                .build(store, path_provider.clone()),
            write_cache_enabled: false,
            intermediate_manager: self.intermediate_manager.clone(),
            index_options: request.index_options,
            inverted_index_config: request.inverted_index_config,
@@ -479,10 +468,9 @@ impl TempFileCleaner {
    }

    /// Removes the SST and index file from the local atomic dir by the file id.
    /// This only removes the initial index; since the index version is always 0 for a new SST, it is safe to target version 0 here.
    pub(crate) async fn clean_by_file_id(&self, file_id: FileId) {
        let sst_key = IndexKey::new(self.region_id, file_id, FileType::Parquet).to_string();
        let index_key = IndexKey::new(self.region_id, file_id, FileType::Puffin(0)).to_string();
        let index_key = IndexKey::new(self.region_id, file_id, FileType::Puffin).to_string();

        Self::clean_atomic_dir_files(&self.object_store, &[&sst_key, &index_key]).await;
    }
@@ -565,12 +553,9 @@ async fn clean_dir(dir: &str) -> Result<()> {

/// Path provider for SST file and index file.
pub trait FilePathProvider: Send + Sync {
    /// Creates index file path of given file id. The version defaults to 0 and is not shown in the path.
    /// Creates index file path of given file id.
    fn build_index_file_path(&self, file_id: RegionFileId) -> String;

    /// Creates index file path of given index id (with version support).
    fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String;

    /// Creates SST file path of given file id.
    fn build_sst_file_path(&self, file_id: RegionFileId) -> String;
}
@@ -590,16 +575,7 @@ impl WriteCachePathProvider {
|
||||
|
||||
impl FilePathProvider for WriteCachePathProvider {
|
||||
fn build_index_file_path(&self, file_id: RegionFileId) -> String {
|
||||
let puffin_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin(0));
|
||||
self.file_cache.cache_file_path(puffin_key)
|
||||
}
|
||||
|
||||
fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
|
||||
let puffin_key = IndexKey::new(
|
||||
index_id.region_id(),
|
||||
index_id.file_id(),
|
||||
FileType::Puffin(index_id.version),
|
||||
);
|
||||
let puffin_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin);
|
||||
self.file_cache.cache_file_path(puffin_key)
|
||||
}
|
||||
|
||||
@@ -629,11 +605,7 @@ impl RegionFilePathFactory {
|
||||
|
||||
impl FilePathProvider for RegionFilePathFactory {
|
||||
fn build_index_file_path(&self, file_id: RegionFileId) -> String {
|
||||
location::index_file_path_legacy(&self.table_dir, file_id, self.path_type)
|
||||
}
|
||||
|
||||
fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
|
||||
location::index_file_path(&self.table_dir, index_id, self.path_type)
|
||||
location::index_file_path(&self.table_dir, file_id, self.path_type)
|
||||
}
|
||||
|
||||
fn build_sst_file_path(&self, file_id: RegionFileId) -> String {
|
||||
|
||||
@@ -18,7 +18,6 @@ mod cache_size;
|
||||
|
||||
pub(crate) mod file_cache;
|
||||
pub(crate) mod index;
|
||||
pub(crate) mod manifest_cache;
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_util;
|
||||
pub(crate) mod write_cache;
|
||||
@@ -44,8 +43,7 @@ use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCache
|
||||
use crate::cache::write_cache::WriteCacheRef;
|
||||
use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
|
||||
use crate::read::Batch;
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
use crate::sst::parquet::reader::MetadataCacheMetrics;
|
||||
use crate::sst::file::RegionFileId;
|
||||
|
||||
/// Metrics type key for sst meta.
|
||||
const SST_META_TYPE: &str = "sst_meta";
|
||||
@@ -76,24 +74,19 @@ pub enum CacheStrategy {
|
||||
}
|
||||
|
||||
impl CacheStrategy {
|
||||
/// Gets parquet metadata with cache metrics tracking.
|
||||
/// Returns the metadata and updates the provided metrics.
|
||||
pub(crate) async fn get_parquet_meta_data(
|
||||
/// Calls [CacheManager::get_parquet_meta_data()].
|
||||
pub async fn get_parquet_meta_data(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
metrics: &mut MetadataCacheMetrics,
|
||||
) -> Option<Arc<ParquetMetaData>> {
|
||||
match self {
|
||||
CacheStrategy::EnableAll(cache_manager) => {
|
||||
cache_manager.get_parquet_meta_data(file_id, metrics).await
|
||||
cache_manager.get_parquet_meta_data(file_id).await
|
||||
}
|
||||
CacheStrategy::Compaction(cache_manager) => {
|
||||
cache_manager.get_parquet_meta_data(file_id, metrics).await
|
||||
}
|
||||
CacheStrategy::Disabled => {
|
||||
metrics.cache_miss += 1;
|
||||
None
|
||||
cache_manager.get_parquet_meta_data(file_id).await
|
||||
}
|
||||
CacheStrategy::Disabled => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +173,7 @@ impl CacheStrategy {
|
||||
}
|
||||
|
||||
/// Calls [CacheManager::evict_puffin_cache()].
|
||||
pub async fn evict_puffin_cache(&self, file_id: RegionIndexId) {
|
||||
pub async fn evict_puffin_cache(&self, file_id: RegionFileId) {
|
||||
match self {
|
||||
CacheStrategy::EnableAll(cache_manager) => {
|
||||
cache_manager.evict_puffin_cache(file_id).await
|
||||
@@ -298,17 +291,16 @@ impl CacheManager {
|
||||
CacheManagerBuilder::default()
|
||||
}
|
||||
|
||||
/// Gets cached [ParquetMetaData] with metrics tracking.
|
||||
/// Tries in-memory cache first, then file cache, updating metrics accordingly.
|
||||
pub(crate) async fn get_parquet_meta_data(
|
||||
/// Gets cached [ParquetMetaData] from in-memory cache first.
|
||||
/// If not found, tries to get it from write cache and fill the in-memory cache.
|
||||
pub async fn get_parquet_meta_data(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
metrics: &mut MetadataCacheMetrics,
|
||||
) -> Option<Arc<ParquetMetaData>> {
|
||||
// Try to get metadata from sst meta cache
|
||||
if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) {
|
||||
metrics.mem_cache_hit += 1;
|
||||
return Some(metadata);
|
||||
let metadata = self.get_parquet_meta_data_from_mem_cache(file_id);
|
||||
if metadata.is_some() {
|
||||
return metadata;
|
||||
}
|
||||
|
||||
// Try to get metadata from write cache
|
||||
@@ -316,13 +308,11 @@ impl CacheManager {
|
||||
if let Some(write_cache) = &self.write_cache
|
||||
&& let Some(metadata) = write_cache.file_cache().get_parquet_meta_data(key).await
|
||||
{
|
||||
metrics.file_cache_hit += 1;
|
||||
let metadata = Arc::new(metadata);
|
||||
// Put metadata into sst meta cache
|
||||
self.put_parquet_meta_data(file_id, metadata.clone());
|
||||
return Some(metadata);
|
||||
};
|
||||
metrics.cache_miss += 1;
|
||||
|
||||
None
|
||||
}
|
||||
@@ -400,7 +390,7 @@ impl CacheManager {
|
||||
}
|
||||
|
||||
/// Evicts every puffin-related cache entry for the given file.
|
||||
pub async fn evict_puffin_cache(&self, file_id: RegionIndexId) {
|
||||
pub async fn evict_puffin_cache(&self, file_id: RegionFileId) {
|
||||
if let Some(cache) = &self.bloom_filter_index_cache {
|
||||
cache.invalidate_file(file_id.file_id());
|
||||
}
|
||||
@@ -422,7 +412,7 @@ impl CacheManager {
|
||||
.remove(IndexKey::new(
|
||||
file_id.region_id(),
|
||||
file_id.file_id(),
|
||||
FileType::Puffin(file_id.version),
|
||||
FileType::Puffin,
|
||||
))
|
||||
.await;
|
||||
}
|
||||
@@ -835,14 +825,8 @@ mod tests {
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let file_id = RegionFileId::new(region_id, FileId::random());
|
||||
let metadata = parquet_meta();
|
||||
let mut metrics = MetadataCacheMetrics::default();
|
||||
cache.put_parquet_meta_data(file_id, metadata);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_none());
|
||||
|
||||
let value = Value::Int64(10);
|
||||
let vector: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10]));
|
||||
@@ -864,30 +848,14 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_parquet_meta_cache() {
|
||||
let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
|
||||
let mut metrics = MetadataCacheMetrics::default();
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let file_id = RegionFileId::new(region_id, FileId::random());
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_none());
|
||||
let metadata = parquet_meta();
|
||||
cache.put_parquet_meta_data(file_id, metadata);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_some()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_some());
|
||||
cache.remove_parquet_meta_data(file_id);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -949,7 +917,7 @@ mod tests {
|
||||
let cache = Arc::new(cache);
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let index_id = RegionIndexId::new(RegionFileId::new(region_id, FileId::random()), 0);
|
||||
let region_file_id = RegionFileId::new(region_id, FileId::random());
|
||||
let column_id: ColumnId = 1;
|
||||
|
||||
let bloom_cache = cache.bloom_filter_index_cache().unwrap().clone();
|
||||
@@ -957,21 +925,16 @@ mod tests {
|
||||
let result_cache = cache.index_result_cache().unwrap();
|
||||
let puffin_metadata_cache = cache.puffin_metadata_cache().unwrap().clone();
|
||||
|
||||
let bloom_key = (
|
||||
index_id.file_id(),
|
||||
index_id.version,
|
||||
column_id,
|
||||
Tag::Skipping,
|
||||
);
|
||||
let bloom_key = (region_file_id.file_id(), column_id, Tag::Skipping);
|
||||
bloom_cache.put_metadata(bloom_key, Arc::new(BloomFilterMeta::default()));
|
||||
inverted_cache.put_metadata(
|
||||
(index_id.file_id(), index_id.version),
|
||||
region_file_id.file_id(),
|
||||
Arc::new(InvertedIndexMetas::default()),
|
||||
);
|
||||
let predicate = PredicateKey::new_bloom(Arc::new(BTreeMap::new()));
|
||||
let selection = Arc::new(RowGroupSelection::default());
|
||||
result_cache.put(predicate.clone(), index_id.file_id(), selection);
|
||||
let file_id_str = index_id.to_string();
|
||||
result_cache.put(predicate.clone(), region_file_id.file_id(), selection);
|
||||
let file_id_str = region_file_id.to_string();
|
||||
let metadata = Arc::new(FileMetadata {
|
||||
blobs: Vec::new(),
|
||||
properties: HashMap::new(),
|
||||
@@ -981,32 +944,40 @@ mod tests {
|
||||
assert!(bloom_cache.get_metadata(bloom_key).is_some());
|
||||
assert!(
|
||||
inverted_cache
|
||||
.get_metadata((index_id.file_id(), index_id.version))
|
||||
.get_metadata(region_file_id.file_id())
|
||||
.is_some()
|
||||
);
|
||||
assert!(
|
||||
result_cache
|
||||
.get(&predicate, region_file_id.file_id())
|
||||
.is_some()
|
||||
);
|
||||
assert!(result_cache.get(&predicate, index_id.file_id()).is_some());
|
||||
assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_some());
|
||||
|
||||
cache.evict_puffin_cache(index_id).await;
|
||||
cache.evict_puffin_cache(region_file_id).await;
|
||||
|
||||
assert!(bloom_cache.get_metadata(bloom_key).is_none());
|
||||
assert!(
|
||||
inverted_cache
|
||||
.get_metadata((index_id.file_id(), index_id.version))
|
||||
.get_metadata(region_file_id.file_id())
|
||||
.is_none()
|
||||
);
|
||||
assert!(
|
||||
result_cache
|
||||
.get(&predicate, region_file_id.file_id())
|
||||
.is_none()
|
||||
);
|
||||
assert!(result_cache.get(&predicate, index_id.file_id()).is_none());
|
||||
assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none());
|
||||
|
||||
// Refill caches and evict via CacheStrategy to ensure delegation works.
|
||||
bloom_cache.put_metadata(bloom_key, Arc::new(BloomFilterMeta::default()));
|
||||
inverted_cache.put_metadata(
|
||||
(index_id.file_id(), index_id.version),
|
||||
region_file_id.file_id(),
|
||||
Arc::new(InvertedIndexMetas::default()),
|
||||
);
|
||||
result_cache.put(
|
||||
predicate.clone(),
|
||||
index_id.file_id(),
|
||||
region_file_id.file_id(),
|
||||
Arc::new(RowGroupSelection::default()),
|
||||
);
|
||||
puffin_metadata_cache.put_metadata(
|
||||
@@ -1018,15 +989,19 @@ mod tests {
|
||||
);
|
||||
|
||||
let strategy = CacheStrategy::EnableAll(cache.clone());
|
||||
strategy.evict_puffin_cache(index_id).await;
|
||||
strategy.evict_puffin_cache(region_file_id).await;
|
||||
|
||||
assert!(bloom_cache.get_metadata(bloom_key).is_none());
|
||||
assert!(
|
||||
inverted_cache
|
||||
.get_metadata((index_id.file_id(), index_id.version))
|
||||
.get_metadata(region_file_id.file_id())
|
||||
.is_none()
|
||||
);
|
||||
assert!(
|
||||
result_cache
|
||||
.get(&predicate, region_file_id.file_id())
|
||||
.is_none()
|
||||
);
|
||||
assert!(result_cache.get(&predicate, index_id.file_id()).is_none());
|
||||
assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none());
|
||||
}
|
||||
}
|
||||
|
||||
src/mito2/src/cache/file_cache.rs (758 changed lines, vendored)
@@ -55,35 +55,121 @@ pub(crate) const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20;
|
||||
/// Minimum capacity for each cache (512MB).
|
||||
const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024;
|
||||
|
||||
/// Inner struct for FileCache that can be used in spawned tasks.
|
||||
/// A file cache manages files on local store and evict files based
|
||||
/// on size.
|
||||
#[derive(Debug)]
|
||||
struct FileCacheInner {
|
||||
pub(crate) struct FileCache {
|
||||
/// Local store to cache files.
|
||||
local_store: ObjectStore,
|
||||
/// Index to track cached Parquet files.
|
||||
parquet_index: Cache<IndexKey, IndexValue>,
|
||||
/// Index to track cached Puffin files.
|
||||
puffin_index: Cache<IndexKey, IndexValue>,
|
||||
/// Capacity of the puffin (index) cache in bytes.
|
||||
puffin_capacity: u64,
|
||||
}
|
||||
|
||||
impl FileCacheInner {
|
||||
pub(crate) type FileCacheRef = Arc<FileCache>;
|
||||
|
||||
impl FileCache {
|
||||
/// Creates a new file cache.
|
||||
pub(crate) fn new(
|
||||
local_store: ObjectStore,
|
||||
capacity: ReadableSize,
|
||||
ttl: Option<Duration>,
|
||||
index_cache_percent: Option<u8>,
|
||||
) -> FileCache {
|
||||
// Validate and use the provided percent or default
|
||||
let index_percent = index_cache_percent
|
||||
.filter(|&percent| percent > 0 && percent < 100)
|
||||
.unwrap_or(DEFAULT_INDEX_CACHE_PERCENT);
|
||||
let total_capacity = capacity.as_bytes();
|
||||
|
||||
// Convert percent to ratio and calculate capacity for each cache
|
||||
let index_ratio = index_percent as f64 / 100.0;
|
||||
let puffin_capacity = (total_capacity as f64 * index_ratio) as u64;
|
||||
let parquet_capacity = total_capacity - puffin_capacity;
|
||||
|
||||
// Ensure both capacities are at least 512MB
|
||||
let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY);
|
||||
let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY);
|
||||
|
||||
info!(
|
||||
"Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}",
|
||||
index_percent,
|
||||
ReadableSize(total_capacity),
|
||||
ReadableSize(parquet_capacity),
|
||||
ReadableSize(puffin_capacity)
|
||||
);
|
||||
|
||||
let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file");
|
||||
let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index");
|
||||
|
||||
FileCache {
|
||||
local_store,
|
||||
parquet_index,
|
||||
puffin_index,
|
||||
puffin_capacity,
|
||||
}
|
||||
}
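The constructor above splits one configured capacity between the parquet and puffin caches: the validated index percentage (default 20%) goes to the puffin cache, the remainder to the parquet cache, and both shares are then raised to the 512 MB floor. A standalone sketch of that arithmetic, with the helper name invented here for illustration:

const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024;
const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20;

/// Returns (parquet_capacity, puffin_capacity) in bytes. Hypothetical helper
/// mirroring the calculation in the constructor above.
fn split_capacity(total: u64, index_cache_percent: Option<u8>) -> (u64, u64) {
    let percent = index_cache_percent
        .filter(|&p| p > 0 && p < 100)
        .unwrap_or(DEFAULT_INDEX_CACHE_PERCENT);
    let puffin = (total as f64 * (percent as f64 / 100.0)) as u64;
    let parquet = total - puffin;
    // Both shares are raised to the 512 MB floor, so for small totals the two
    // caches together may exceed the configured capacity.
    (parquet.max(MIN_CACHE_CAPACITY), puffin.max(MIN_CACHE_CAPACITY))
}

fn main() {
    // 10 GiB total with the default 20% index share: 8 GiB parquet, 2 GiB puffin.
    let (parquet, puffin) = split_capacity(10 * 1024 * 1024 * 1024, None);
    assert_eq!(parquet, 8 * 1024 * 1024 * 1024);
    assert_eq!(puffin, 2 * 1024 * 1024 * 1024);
}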
|
||||
|
||||
/// Builds a cache for a specific file type.
|
||||
fn build_cache(
|
||||
local_store: ObjectStore,
|
||||
capacity: u64,
|
||||
ttl: Option<Duration>,
|
||||
label: &'static str,
|
||||
) -> Cache<IndexKey, IndexValue> {
|
||||
let cache_store = local_store;
|
||||
let mut builder = Cache::builder()
|
||||
.eviction_policy(EvictionPolicy::lru())
|
||||
.weigher(|_key, value: &IndexValue| -> u32 {
|
||||
// We only measure space on local store.
|
||||
value.file_size
|
||||
})
|
||||
.max_capacity(capacity)
|
||||
.async_eviction_listener(move |key, value, cause| {
|
||||
let store = cache_store.clone();
|
||||
// Stores files under FILE_DIR.
|
||||
let file_path = cache_file_path(FILE_DIR, *key);
|
||||
async move {
|
||||
if let RemovalCause::Replaced = cause {
|
||||
// The cache is replaced by another file. This is unexpected, we don't remove the same
|
||||
// file but updates the metrics as the file is already replaced by users.
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id);
|
||||
return;
|
||||
}
|
||||
|
||||
match store.delete(&file_path).await {
|
||||
Ok(()) => {
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
});
|
||||
if let Some(ttl) = ttl {
|
||||
builder = builder.time_to_idle(ttl);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
/// Returns the appropriate memory index for the given file type.
|
||||
fn memory_index(&self, file_type: FileType) -> &Cache<IndexKey, IndexValue> {
|
||||
match file_type {
|
||||
FileType::Parquet => &self.parquet_index,
|
||||
FileType::Puffin { .. } => &self.puffin_index,
|
||||
FileType::Puffin => &self.puffin_index,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
fn cache_file_path(&self, key: IndexKey) -> String {
|
||||
cache_file_path(FILE_DIR, key)
|
||||
}
|
||||
|
||||
/// Puts a file into the cache index.
|
||||
///
|
||||
/// The `WriteCache` should ensure the file is in the correct path.
|
||||
async fn put(&self, key: IndexKey, value: IndexValue) {
|
||||
pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.add(value.file_size.into());
|
||||
@@ -94,8 +180,100 @@ impl FileCacheInner {
|
||||
index.run_pending_tasks().await;
|
||||
}
|
||||
|
||||
/// Recovers the index from local store.
|
||||
async fn recover(&self) -> Result<()> {
|
||||
pub(crate) async fn get(&self, key: IndexKey) -> Option<IndexValue> {
|
||||
self.memory_index(key.file_type).get(&key).await
|
||||
}
|
||||
|
||||
/// Reads a file from the cache.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn reader(&self, key: IndexKey) -> Option<Reader> {
|
||||
// We must use `get()` to update the estimator of the cache.
|
||||
// See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key
|
||||
let index = self.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.cache_file_path(key);
|
||||
match self.get_reader(&file_path).await {
|
||||
Ok(Some(reader)) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return Some(reader);
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
|
||||
/// Reads ranges from the cache.
|
||||
pub(crate) async fn read_ranges(
|
||||
&self,
|
||||
key: IndexKey,
|
||||
ranges: &[Range<u64>],
|
||||
) -> Option<Vec<Bytes>> {
|
||||
let index = self.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.cache_file_path(key);
|
||||
// In most cases, it will use blocking read,
|
||||
// because FileCache is normally based on local file system, which supports blocking read.
|
||||
let bytes_result = fetch_byte_ranges(&file_path, self.local_store.clone(), ranges).await;
|
||||
match bytes_result {
|
||||
Ok(bytes) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(bytes)
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes a file from the cache explicitly.
|
||||
/// It always tries to remove the file from the local store because we may not have the file
|
||||
/// in the memory index if upload is failed.
|
||||
pub(crate) async fn remove(&self, key: IndexKey) {
|
||||
let file_path = self.cache_file_path(key);
|
||||
self.memory_index(key.file_type).remove(&key).await;
|
||||
// Always delete the file from the local store.
|
||||
if let Err(e) = self.local_store.delete(&file_path).await {
|
||||
warn!(e; "Failed to delete a cached file {}", file_path);
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover_inner(&self) -> Result<()> {
|
||||
let now = Instant::now();
|
||||
let mut lister = self
|
||||
.local_store
|
||||
@@ -130,7 +308,7 @@ impl FileCacheInner {
|
||||
// Track sizes separately for each file type
|
||||
match key.file_type {
|
||||
FileType::Parquet => parquet_size += size,
|
||||
FileType::Puffin { .. } => puffin_size += size,
|
||||
FileType::Puffin => puffin_size += size,
|
||||
}
|
||||
}
|
||||
// The metrics is a signed int gauge so we can updates it finally.
|
||||
@@ -163,7 +341,136 @@ impl FileCacheInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Downloads a file without cleaning up on error.
|
||||
/// Recovers the index from local store.
|
||||
///
|
||||
/// If `task_receiver` is provided, spawns a background task after recovery
|
||||
/// to process `RegionLoadCacheTask` messages for loading files into the cache.
|
||||
pub(crate) async fn recover(
|
||||
self: &Arc<Self>,
|
||||
sync: bool,
|
||||
task_receiver: Option<UnboundedReceiver<RegionLoadCacheTask>>,
|
||||
) {
|
||||
let moved_self = self.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(err) = moved_self.recover_inner().await {
|
||||
error!(err; "Failed to recover file cache.")
|
||||
}
|
||||
|
||||
// Spawns background task to process region load cache tasks after recovery.
|
||||
// So it won't block the recovery when `sync` is true.
|
||||
if let Some(mut receiver) = task_receiver {
|
||||
let cache_ref = moved_self.clone();
|
||||
info!("Spawning background task for processing region load cache tasks");
|
||||
tokio::spawn(async move {
|
||||
while let Some(task) = receiver.recv().await {
|
||||
let file_cache = cache_ref.clone();
|
||||
task.fill_cache(file_cache).await;
|
||||
}
|
||||
info!("Background task for processing region load cache tasks stopped");
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if sync {
|
||||
let _ = handle.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
pub(crate) fn cache_file_path(&self, key: IndexKey) -> String {
|
||||
cache_file_path(FILE_DIR, key)
|
||||
}
|
||||
|
||||
/// Returns the local store of the file cache.
|
||||
pub(crate) fn local_store(&self) -> ObjectStore {
|
||||
self.local_store.clone()
|
||||
}
|
||||
|
||||
/// Get the parquet metadata in file cache.
|
||||
/// If the file is not in the cache or fail to load metadata, return None.
|
||||
pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option<ParquetMetaData> {
|
||||
// Check if file cache contains the key
|
||||
if let Some(index_value) = self.parquet_index.get(&key).await {
|
||||
// Load metadata from file cache
|
||||
let local_store = self.local_store();
|
||||
let file_path = self.cache_file_path(key);
|
||||
let file_size = index_value.file_size as u64;
|
||||
let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size);
|
||||
|
||||
match metadata_loader.load().await {
|
||||
Ok(metadata) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(metadata)
|
||||
}
|
||||
Err(e) => {
|
||||
if !e.is_object_not_found() {
|
||||
warn!(
|
||||
e; "Failed to get parquet metadata for key {:?}",
|
||||
key
|
||||
);
|
||||
}
|
||||
// We removes the file from the index.
|
||||
self.parquet_index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
|
||||
if self.local_store.exists(file_path).await? {
|
||||
Ok(Some(self.local_store.reader(file_path).await?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the key is in the file cache.
|
||||
pub(crate) fn contains_key(&self, key: &IndexKey) -> bool {
|
||||
self.memory_index(key.file_type).contains_key(key)
|
||||
}
|
||||
|
||||
/// Returns the capacity of the puffin (index) cache in bytes.
|
||||
pub(crate) fn puffin_cache_capacity(&self) -> u64 {
|
||||
self.puffin_capacity
|
||||
}
|
||||
|
||||
/// Returns the current weighted size (used bytes) of the puffin (index) cache.
|
||||
pub(crate) fn puffin_cache_size(&self) -> u64 {
|
||||
self.puffin_index.weighted_size()
|
||||
}
|
||||
|
||||
/// Downloads a file in `remote_path` from the remote object store to the local cache
|
||||
/// (specified by `index_key`).
|
||||
pub(crate) async fn download(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
remote_path: &str,
|
||||
remote_store: &ObjectStore,
|
||||
file_size: u64,
|
||||
) -> Result<()> {
|
||||
if let Err(e) = self
|
||||
.download_without_cleaning(index_key, remote_path, remote_store, file_size)
|
||||
.await
|
||||
{
|
||||
let filename = index_key.to_string();
|
||||
TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await;
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download_without_cleaning(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
@@ -178,7 +485,7 @@ impl FileCacheInner {
|
||||
let timer = WRITE_CACHE_DOWNLOAD_ELAPSED
|
||||
.with_label_values(&[match file_type {
|
||||
FileType::Parquet => "download_parquet",
|
||||
FileType::Puffin { .. } => "download_puffin",
|
||||
FileType::Puffin => "download_puffin",
|
||||
}])
|
||||
.start_timer();
|
||||
|
||||
@@ -230,360 +537,11 @@ impl FileCacheInner {
|
||||
self.put(index_key, index_value).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Downloads a file from remote store to local cache.
|
||||
async fn download(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
remote_path: &str,
|
||||
remote_store: &ObjectStore,
|
||||
file_size: u64,
|
||||
) -> Result<()> {
|
||||
if let Err(e) = self
|
||||
.download_without_cleaning(index_key, remote_path, remote_store, file_size)
|
||||
.await
|
||||
{
|
||||
let filename = index_key.to_string();
|
||||
TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await;
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A file cache manages files on local store and evict files based
|
||||
/// on size.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct FileCache {
|
||||
/// Inner cache state shared with background worker.
|
||||
inner: Arc<FileCacheInner>,
|
||||
/// Capacity of the puffin (index) cache in bytes.
|
||||
puffin_capacity: u64,
|
||||
}
|
||||
|
||||
pub(crate) type FileCacheRef = Arc<FileCache>;
|
||||
|
||||
impl FileCache {
|
||||
/// Creates a new file cache.
|
||||
pub(crate) fn new(
|
||||
local_store: ObjectStore,
|
||||
capacity: ReadableSize,
|
||||
ttl: Option<Duration>,
|
||||
index_cache_percent: Option<u8>,
|
||||
) -> FileCache {
|
||||
// Validate and use the provided percent or default
|
||||
let index_percent = index_cache_percent
|
||||
.filter(|&percent| percent > 0 && percent < 100)
|
||||
.unwrap_or(DEFAULT_INDEX_CACHE_PERCENT);
|
||||
let total_capacity = capacity.as_bytes();
|
||||
|
||||
// Convert percent to ratio and calculate capacity for each cache
|
||||
let index_ratio = index_percent as f64 / 100.0;
|
||||
let puffin_capacity = (total_capacity as f64 * index_ratio) as u64;
|
||||
let parquet_capacity = total_capacity - puffin_capacity;
|
||||
|
||||
// Ensure both capacities are at least 512MB
|
||||
let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY);
|
||||
let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY);
|
||||
|
||||
info!(
|
||||
"Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}",
|
||||
index_percent,
|
||||
ReadableSize(total_capacity),
|
||||
ReadableSize(parquet_capacity),
|
||||
ReadableSize(puffin_capacity)
|
||||
);
|
||||
|
||||
let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file");
|
||||
let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index");
|
||||
|
||||
// Create inner cache shared with background worker
|
||||
let inner = Arc::new(FileCacheInner {
|
||||
local_store,
|
||||
parquet_index,
|
||||
puffin_index,
|
||||
});
|
||||
|
||||
FileCache {
|
||||
inner,
|
||||
puffin_capacity,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a cache for a specific file type.
|
||||
fn build_cache(
|
||||
local_store: ObjectStore,
|
||||
capacity: u64,
|
||||
ttl: Option<Duration>,
|
||||
label: &'static str,
|
||||
) -> Cache<IndexKey, IndexValue> {
|
||||
let cache_store = local_store;
|
||||
let mut builder = Cache::builder()
|
||||
.eviction_policy(EvictionPolicy::lru())
|
||||
.weigher(|_key, value: &IndexValue| -> u32 {
|
||||
// We only measure space on local store.
|
||||
value.file_size
|
||||
})
|
||||
.max_capacity(capacity)
|
||||
.async_eviction_listener(move |key, value, cause| {
|
||||
let store = cache_store.clone();
|
||||
// Stores files under FILE_DIR.
|
||||
let file_path = cache_file_path(FILE_DIR, *key);
|
||||
async move {
|
||||
if let RemovalCause::Replaced = cause {
|
||||
// The cache is replaced by another file. This is unexpected, we don't remove the same
|
||||
// file but updates the metrics as the file is already replaced by users.
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
// TODO(yingwen): Don't log warn later.
|
||||
warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id);
|
||||
return;
|
||||
}
|
||||
|
||||
match store.delete(&file_path).await {
|
||||
Ok(()) => {
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
});
|
||||
if let Some(ttl) = ttl {
|
||||
builder = builder.time_to_idle(ttl);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
/// Puts a file into the cache index.
|
||||
///
|
||||
/// The `WriteCache` should ensure the file is in the correct path.
|
||||
pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) {
|
||||
self.inner.put(key, value).await
|
||||
}
|
||||
|
||||
pub(crate) async fn get(&self, key: IndexKey) -> Option<IndexValue> {
|
||||
self.inner.memory_index(key.file_type).get(&key).await
|
||||
}
|
||||
|
||||
/// Reads a file from the cache.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn reader(&self, key: IndexKey) -> Option<Reader> {
|
||||
// We must use `get()` to update the estimator of the cache.
|
||||
// See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key
|
||||
let index = self.inner.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
match self.get_reader(&file_path).await {
|
||||
Ok(Some(reader)) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return Some(reader);
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
|
||||
/// Reads ranges from the cache.
|
||||
pub(crate) async fn read_ranges(
|
||||
&self,
|
||||
key: IndexKey,
|
||||
ranges: &[Range<u64>],
|
||||
) -> Option<Vec<Bytes>> {
|
||||
let index = self.inner.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
// In most cases, it will use blocking read,
|
||||
// because FileCache is normally based on local file system, which supports blocking read.
|
||||
let bytes_result =
|
||||
fetch_byte_ranges(&file_path, self.inner.local_store.clone(), ranges).await;
|
||||
match bytes_result {
|
||||
Ok(bytes) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(bytes)
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes a file from the cache explicitly.
|
||||
/// It always tries to remove the file from the local store because we may not have the file
|
||||
/// in the memory index if upload is failed.
|
||||
pub(crate) async fn remove(&self, key: IndexKey) {
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
self.inner.memory_index(key.file_type).remove(&key).await;
|
||||
// Always delete the file from the local store.
|
||||
if let Err(e) = self.inner.local_store.delete(&file_path).await {
|
||||
warn!(e; "Failed to delete a cached file {}", file_path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Recovers the index from local store.
|
||||
///
|
||||
/// If `task_receiver` is provided, spawns a background task after recovery
|
||||
/// to process `RegionLoadCacheTask` messages for loading files into the cache.
|
||||
pub(crate) async fn recover(
|
||||
&self,
|
||||
sync: bool,
|
||||
task_receiver: Option<UnboundedReceiver<RegionLoadCacheTask>>,
|
||||
) {
|
||||
let moved_self = self.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(err) = moved_self.inner.recover().await {
|
||||
error!(err; "Failed to recover file cache.")
|
||||
}
|
||||
|
||||
// Spawns background task to process region load cache tasks after recovery.
|
||||
// So it won't block the recovery when `sync` is true.
|
||||
if let Some(mut receiver) = task_receiver {
|
||||
info!("Spawning background task for processing region load cache tasks");
|
||||
tokio::spawn(async move {
|
||||
while let Some(task) = receiver.recv().await {
|
||||
task.fill_cache(&moved_self).await;
|
||||
}
|
||||
info!("Background task for processing region load cache tasks stopped");
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if sync {
|
||||
let _ = handle.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
pub(crate) fn cache_file_path(&self, key: IndexKey) -> String {
|
||||
self.inner.cache_file_path(key)
|
||||
}
|
||||
|
||||
/// Returns the local store of the file cache.
|
||||
pub(crate) fn local_store(&self) -> ObjectStore {
|
||||
self.inner.local_store.clone()
|
||||
}
|
||||
|
||||
/// Get the parquet metadata in file cache.
|
||||
/// If the file is not in the cache or fail to load metadata, return None.
|
||||
pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option<ParquetMetaData> {
|
||||
// Check if file cache contains the key
|
||||
if let Some(index_value) = self.inner.parquet_index.get(&key).await {
|
||||
// Load metadata from file cache
|
||||
let local_store = self.local_store();
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
let file_size = index_value.file_size as u64;
|
||||
let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size);
|
||||
|
||||
match metadata_loader.load().await {
|
||||
Ok(metadata) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(metadata)
|
||||
}
|
||||
Err(e) => {
|
||||
if !e.is_object_not_found() {
|
||||
warn!(
|
||||
e; "Failed to get parquet metadata for key {:?}",
|
||||
key
|
||||
);
|
||||
}
|
||||
// We removes the file from the index.
|
||||
self.inner.parquet_index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
|
||||
if self.inner.local_store.exists(file_path).await? {
|
||||
Ok(Some(self.inner.local_store.reader(file_path).await?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the key is in the file cache.
|
||||
pub(crate) fn contains_key(&self, key: &IndexKey) -> bool {
|
||||
self.inner.memory_index(key.file_type).contains_key(key)
|
||||
}
|
||||
|
||||
/// Returns the capacity of the puffin (index) cache in bytes.
|
||||
pub(crate) fn puffin_cache_capacity(&self) -> u64 {
|
||||
self.puffin_capacity
|
||||
}
|
||||
|
||||
/// Returns the current weighted size (used bytes) of the puffin (index) cache.
|
||||
pub(crate) fn puffin_cache_size(&self) -> u64 {
|
||||
self.inner.puffin_index.weighted_size()
|
||||
}
|
||||
|
||||
/// Downloads a file in `remote_path` from the remote object store to the local cache
|
||||
/// (specified by `index_key`).
|
||||
pub(crate) async fn download(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
remote_path: &str,
|
||||
remote_store: &ObjectStore,
|
||||
file_size: u64,
|
||||
) -> Result<()> {
|
||||
self.inner
|
||||
.download(index_key, remote_path, remote_store, file_size)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Key of file cache index.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct IndexKey {
|
||||
pub(crate) struct IndexKey {
|
||||
pub region_id: RegionId,
|
||||
pub file_id: FileId,
|
||||
pub file_type: FileType,
|
||||
@@ -607,7 +565,7 @@ impl fmt::Display for IndexKey {
|
||||
"{}.{}.{}",
|
||||
self.region_id.as_u64(),
|
||||
self.file_id,
|
||||
self.file_type
|
||||
self.file_type.as_str()
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -618,16 +576,7 @@ pub enum FileType {
|
||||
/// Parquet file.
|
||||
Parquet,
|
||||
/// Puffin file.
|
||||
Puffin(u64),
|
||||
}
|
||||
|
||||
impl fmt::Display for FileType {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
FileType::Parquet => write!(f, "parquet"),
|
||||
FileType::Puffin(version) => write!(f, "{}.puffin", version),
|
||||
}
|
||||
}
|
||||
Puffin,
|
||||
}
|
||||
|
||||
impl FileType {
|
||||
@@ -635,16 +584,16 @@ impl FileType {
|
||||
fn parse(s: &str) -> Option<FileType> {
|
||||
match s {
|
||||
"parquet" => Some(FileType::Parquet),
|
||||
"puffin" => Some(FileType::Puffin(0)),
|
||||
_ => {
|
||||
// if post-fix with .puffin, try to parse the version
|
||||
if let Some(version_str) = s.strip_suffix(".puffin") {
|
||||
let version = version_str.parse::<u64>().ok()?;
|
||||
Some(FileType::Puffin(version))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
"puffin" => Some(FileType::Puffin),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts the file type to string.
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
FileType::Parquet => "parquet",
|
||||
FileType::Puffin => "puffin",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -652,7 +601,7 @@ impl FileType {
|
||||
fn metric_label(&self) -> &'static str {
|
||||
match self {
|
||||
FileType::Parquet => FILE_TYPE,
|
||||
FileType::Puffin(_) => INDEX_TYPE,
|
||||
FileType::Puffin => INDEX_TYPE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -734,7 +683,7 @@ mod tests {
|
||||
let exist = cache.reader(key).await;
|
||||
assert!(exist.is_some());
|
||||
tokio::time::sleep(Duration::from_millis(15)).await;
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
let non = cache.reader(key).await;
|
||||
assert!(non.is_none());
|
||||
}
|
||||
@@ -772,19 +721,19 @@ mod tests {
|
||||
assert_eq!("hello", String::from_utf8(buf).unwrap());
|
||||
|
||||
// Get weighted size.
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(5, cache.inner.parquet_index.weighted_size());
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(5, cache.parquet_index.weighted_size());
|
||||
|
||||
// Remove the file.
|
||||
cache.remove(key).await;
|
||||
assert!(cache.reader(key).await.is_none());
|
||||
|
||||
// Ensure all pending tasks of the moka cache is done before assertion.
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
|
||||
// The file also not exists.
|
||||
assert!(!local_store.exists(&file_path).await.unwrap());
|
||||
assert_eq!(0, cache.inner.parquet_index.weighted_size());
|
||||
assert_eq!(0, cache.parquet_index.weighted_size());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -817,7 +766,7 @@ mod tests {
|
||||
// Reader is none.
|
||||
assert!(cache.reader(key).await.is_none());
|
||||
// Key is removed.
|
||||
assert!(!cache.inner.parquet_index.contains_key(&key));
|
||||
assert!(!cache.parquet_index.contains_key(&key));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -850,7 +799,12 @@ mod tests {
|
||||
}
|
||||
|
||||
// Recover the cache.
|
||||
let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
|
||||
let cache = Arc::new(FileCache::new(
|
||||
local_store.clone(),
|
||||
ReadableSize::mb(10),
|
||||
None,
|
||||
None,
|
||||
));
|
||||
// No entry before recovery.
|
||||
assert!(
|
||||
cache
|
||||
@@ -861,11 +815,8 @@ mod tests {
|
||||
cache.recover(true, None).await;
|
||||
|
||||
// Check size.
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(
|
||||
total_size,
|
||||
cache.inner.parquet_index.weighted_size() as usize
|
||||
);
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(total_size, cache.parquet_index.weighted_size() as usize);
|
||||
|
||||
for (i, file_id) in file_ids.iter().enumerate() {
|
||||
let key = IndexKey::new(region_id, *file_id, file_type);
|
||||
@@ -930,15 +881,6 @@ mod tests {
|
||||
IndexKey::new(region_id, file_id, FileType::Parquet),
|
||||
parse_index_key("5299989643269.3368731b-a556-42b8-a5df-9c31ce155095.parquet").unwrap()
|
||||
);
|
||||
assert_eq!(
|
||||
IndexKey::new(region_id, file_id, FileType::Puffin(0)),
|
||||
parse_index_key("5299989643269.3368731b-a556-42b8-a5df-9c31ce155095.puffin").unwrap()
|
||||
);
|
||||
assert_eq!(
|
||||
IndexKey::new(region_id, file_id, FileType::Puffin(42)),
|
||||
parse_index_key("5299989643269.3368731b-a556-42b8-a5df-9c31ce155095.42.puffin")
|
||||
.unwrap()
|
||||
);
|
||||
assert!(parse_index_key("").is_none());
|
||||
assert!(parse_index_key(".").is_none());
|
||||
assert!(parse_index_key("5299989643269").is_none());
|
||||
|
||||
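The `parse_index_key` assertions above exercise the cache-key filename convention `<region_id>.<file_id>.<suffix>`, where the suffix is either `parquet`, `puffin` (treated as index version 0 on the versioned side), or `<version>.puffin`. A self-contained sketch of just the suffix parsing, mirroring the `FileType::parse` shown earlier in this file (the enum and helper are redeclared locally so the example compiles on its own):

/// Local, illustrative copy of the versioned file type.
#[derive(Debug, PartialEq)]
enum FileType {
    Parquet,
    Puffin(u64),
}

fn parse_file_type(s: &str) -> Option<FileType> {
    match s {
        "parquet" => Some(FileType::Parquet),
        "puffin" => Some(FileType::Puffin(0)),
        _ => {
            // A "N.puffin" suffix carries an explicit index version N.
            let version = s.strip_suffix(".puffin")?.parse::<u64>().ok()?;
            Some(FileType::Puffin(version))
        }
    }
}

fn main() {
    assert_eq!(parse_file_type("parquet"), Some(FileType::Parquet));
    assert_eq!(parse_file_type("puffin"), Some(FileType::Puffin(0)));
    assert_eq!(parse_file_type("42.puffin"), Some(FileType::Puffin(42)));
    assert_eq!(parse_file_type("x.puffin"), None);
}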
src/mito2/src/cache/index.rs (42 changed lines, vendored)
@@ -31,29 +31,6 @@ const INDEX_METADATA_TYPE: &str = "index_metadata";
/// Metrics for index content.
const INDEX_CONTENT_TYPE: &str = "index_content";

/// Metrics collected from IndexCache operations.
#[derive(Debug, Default, Clone)]
pub struct IndexCacheMetrics {
    /// Number of cache hits.
    pub cache_hit: usize,
    /// Number of cache misses.
    pub cache_miss: usize,
    /// Number of pages accessed.
    pub num_pages: usize,
    /// Total bytes from pages.
    pub page_bytes: u64,
}

impl IndexCacheMetrics {
    /// Merges another set of metrics into this one.
    pub fn merge(&mut self, other: &Self) {
        self.cache_hit += other.cache_hit;
        self.cache_miss += other.cache_miss;
        self.num_pages += other.num_pages;
        self.page_bytes += other.page_bytes;
    }
}
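When a read spans several ranges, the reader accumulates one metrics value per cached page read and folds them into a running total; this is the pattern `read_vec` in the bloom filter reader uses later via `total_cache_metrics.merge(&cache_metrics)`. A self-contained sketch of that accumulation (the struct is copied locally here so the example compiles on its own):

/// Local copy of the metrics type above, so this sketch is standalone.
#[derive(Debug, Default, Clone)]
struct IndexCacheMetrics {
    cache_hit: usize,
    cache_miss: usize,
    num_pages: usize,
    page_bytes: u64,
}

impl IndexCacheMetrics {
    fn merge(&mut self, other: &Self) {
        self.cache_hit += other.cache_hit;
        self.cache_miss += other.cache_miss;
        self.num_pages += other.num_pages;
        self.page_bytes += other.page_bytes;
    }
}

fn main() {
    // Fold per-range metrics into one total, one merge per cached page read.
    let per_range = vec![
        IndexCacheMetrics { cache_hit: 1, num_pages: 1, page_bytes: 4096, ..Default::default() },
        IndexCacheMetrics { cache_miss: 1, num_pages: 1, page_bytes: 4096, ..Default::default() },
    ];
    let mut total = IndexCacheMetrics::default();
    for m in &per_range {
        total.merge(m);
    }
    assert_eq!((total.cache_hit, total.cache_miss, total.num_pages, total.page_bytes), (1, 1, 2, 8192));
}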
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct PageKey {
|
||||
page_id: u64,
|
||||
@@ -183,20 +160,18 @@ where
|
||||
offset: u64,
|
||||
size: u32,
|
||||
load: F,
|
||||
) -> Result<(Vec<u8>, IndexCacheMetrics), E>
|
||||
) -> Result<Vec<u8>, E>
|
||||
where
|
||||
F: Fn(Vec<Range<u64>>) -> Fut,
|
||||
Fut: Future<Output = Result<Vec<Bytes>, E>>,
|
||||
E: std::error::Error,
|
||||
{
|
||||
let mut metrics = IndexCacheMetrics::default();
|
||||
let page_keys =
|
||||
PageKey::generate_page_keys(offset, size, self.page_size).collect::<Vec<_>>();
|
||||
// Size is 0, return empty data.
|
||||
if page_keys.is_empty() {
|
||||
return Ok((Vec::new(), metrics));
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
metrics.num_pages = page_keys.len();
|
||||
let mut data = Vec::with_capacity(page_keys.len());
|
||||
data.resize(page_keys.len(), Bytes::new());
|
||||
let mut cache_miss_range = vec![];
|
||||
@@ -207,13 +182,10 @@ where
|
||||
match self.get_page(key, *page_key) {
|
||||
Some(page) => {
|
||||
CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
metrics.cache_hit += 1;
|
||||
metrics.page_bytes += page.len() as u64;
|
||||
data[i] = page;
|
||||
}
|
||||
None => {
|
||||
CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
metrics.cache_miss += 1;
|
||||
let base_offset = page_key.page_id * self.page_size;
|
||||
let pruned_size = if i == last_index {
|
||||
prune_size(page_keys.iter(), file_size, self.page_size)
|
||||
@@ -229,18 +201,14 @@ where
|
||||
let pages = load(cache_miss_range).await?;
|
||||
for (i, page) in cache_miss_idx.into_iter().zip(pages.into_iter()) {
|
||||
let page_key = page_keys[i];
|
||||
metrics.page_bytes += page.len() as u64;
|
||||
data[i] = page.clone();
|
||||
self.put_page(key, page_key, page.clone());
|
||||
}
|
||||
}
|
||||
let buffer = Buffer::from_iter(data.into_iter());
|
||||
Ok((
|
||||
buffer
|
||||
.slice(PageKey::calculate_range(offset, size, self.page_size))
|
||||
.to_vec(),
|
||||
metrics,
|
||||
))
|
||||
Ok(buffer
|
||||
.slice(PageKey::calculate_range(offset, size, self.page_size))
|
||||
.to_vec())
|
||||
}
|
||||
|
||||
fn get_page(&self, key: K, page_key: PageKey) -> Option<Bytes> {
|
||||
|
||||
src/mito2/src/cache/index/bloom_filter_index.rs (105 changed lines, vendored)
@@ -14,14 +14,13 @@
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use api::v1::index::{BloomFilterLoc, BloomFilterMeta};
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use index::bloom_filter::error::Result;
|
||||
use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};
|
||||
use store_api::storage::{ColumnId, FileId, IndexVersion};
|
||||
use index::bloom_filter::reader::BloomFilterReader;
|
||||
use store_api::storage::{ColumnId, FileId};
|
||||
|
||||
use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey};
|
||||
use crate::metrics::{CACHE_HIT, CACHE_MISS};
|
||||
@@ -35,10 +34,8 @@ pub enum Tag {
|
||||
Fulltext,
|
||||
}
|
||||
|
||||
pub type BloomFilterIndexKey = (FileId, IndexVersion, ColumnId, Tag);
|
||||
|
||||
/// Cache for bloom filter index.
|
||||
pub type BloomFilterIndexCache = IndexCache<BloomFilterIndexKey, BloomFilterMeta>;
|
||||
pub type BloomFilterIndexCache = IndexCache<(FileId, ColumnId, Tag), BloomFilterMeta>;
|
||||
pub type BloomFilterIndexCacheRef = Arc<BloomFilterIndexCache>;
|
||||
|
||||
impl BloomFilterIndexCache {
|
||||
@@ -61,9 +58,11 @@ impl BloomFilterIndexCache {
|
||||
}
|
||||
|
||||
/// Calculates weight for bloom filter index metadata.
|
||||
fn bloom_filter_index_metadata_weight(k: &BloomFilterIndexKey, meta: &Arc<BloomFilterMeta>) -> u32 {
|
||||
fn bloom_filter_index_metadata_weight(
|
||||
k: &(FileId, ColumnId, Tag),
|
||||
meta: &Arc<BloomFilterMeta>,
|
||||
) -> u32 {
|
||||
let base = k.0.as_bytes().len()
|
||||
+ std::mem::size_of::<IndexVersion>()
|
||||
+ std::mem::size_of::<ColumnId>()
|
||||
+ std::mem::size_of::<Tag>()
|
||||
+ std::mem::size_of::<BloomFilterMeta>();
|
||||
@@ -75,14 +74,16 @@ fn bloom_filter_index_metadata_weight(k: &BloomFilterIndexKey, meta: &Arc<BloomF
|
||||
}
|
||||
|
||||
/// Calculates weight for bloom filter index content.
|
||||
fn bloom_filter_index_content_weight((k, _): &(BloomFilterIndexKey, PageKey), v: &Bytes) -> u32 {
|
||||
fn bloom_filter_index_content_weight(
|
||||
(k, _): &((FileId, ColumnId, Tag), PageKey),
|
||||
v: &Bytes,
|
||||
) -> u32 {
|
||||
(k.0.as_bytes().len() + std::mem::size_of::<ColumnId>() + v.len()) as u32
|
||||
}
|
||||
|
||||
/// Bloom filter index blob reader with cache.
|
||||
pub struct CachedBloomFilterIndexBlobReader<R> {
|
||||
file_id: FileId,
|
||||
index_version: IndexVersion,
|
||||
column_id: ColumnId,
|
||||
tag: Tag,
|
||||
blob_size: u64,
|
||||
@@ -94,7 +95,6 @@ impl<R> CachedBloomFilterIndexBlobReader<R> {
|
||||
/// Creates a new bloom filter index blob reader with cache.
|
||||
pub fn new(
|
||||
file_id: FileId,
|
||||
index_version: IndexVersion,
|
||||
column_id: ColumnId,
|
||||
tag: Tag,
|
||||
blob_size: u64,
|
||||
@@ -103,7 +103,6 @@ impl<R> CachedBloomFilterIndexBlobReader<R> {
|
||||
) -> Self {
|
||||
Self {
|
||||
file_id,
|
||||
index_version,
|
||||
column_id,
|
||||
tag,
|
||||
blob_size,
|
||||
@@ -115,95 +114,53 @@ impl<R> CachedBloomFilterIndexBlobReader<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: BloomFilterReader + Send> BloomFilterReader for CachedBloomFilterIndexBlobReader<R> {
|
||||
async fn range_read(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Bytes> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
|
||||
let inner = &self.inner;
|
||||
let (result, cache_metrics) = self
|
||||
.cache
|
||||
self.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.index_version, self.column_id, self.tag),
|
||||
(self.file_id, self.column_id, self.tag),
|
||||
self.blob_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_ranges += cache_metrics.num_pages;
|
||||
m.total_bytes += cache_metrics.page_bytes;
|
||||
m.cache_hit += cache_metrics.cache_hit;
|
||||
m.cache_miss += cache_metrics.cache_miss;
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result.into())
|
||||
.await
|
||||
.map(|b| b.into())
|
||||
}
|
||||
|
||||
async fn read_vec(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
let mut pages = Vec::with_capacity(ranges.len());
|
||||
let mut total_cache_metrics = crate::cache::index::IndexCacheMetrics::default();
|
||||
for range in ranges {
|
||||
let inner = &self.inner;
|
||||
let (page, cache_metrics) = self
|
||||
let page = self
|
||||
.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.index_version, self.column_id, self.tag),
|
||||
(self.file_id, self.column_id, self.tag),
|
||||
self.blob_size,
|
||||
range.start,
|
||||
(range.end - range.start) as u32,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
total_cache_metrics.merge(&cache_metrics);
|
||||
pages.push(Bytes::from(page));
|
||||
}
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_ranges += total_cache_metrics.num_pages;
|
||||
m.total_bytes += total_cache_metrics.page_bytes;
|
||||
m.cache_hit += total_cache_metrics.cache_hit;
|
||||
m.cache_miss += total_cache_metrics.cache_miss;
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
/// Reads the meta information of the bloom filter.
|
||||
async fn metadata(
|
||||
&self,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<BloomFilterMeta> {
|
||||
if let Some(cached) =
|
||||
self.cache
|
||||
.get_metadata((self.file_id, self.index_version, self.column_id, self.tag))
|
||||
async fn metadata(&self) -> Result<BloomFilterMeta> {
|
||||
if let Some(cached) = self
|
||||
.cache
|
||||
.get_metadata((self.file_id, self.column_id, self.tag))
|
||||
{
|
||||
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
if let Some(m) = metrics {
|
||||
m.cache_hit += 1;
|
||||
}
|
||||
Ok((*cached).clone())
|
||||
} else {
|
||||
let meta = self.inner.metadata(metrics).await?;
|
||||
let meta = self.inner.metadata().await?;
|
||||
self.cache.put_metadata(
|
||||
(self.file_id, self.index_version, self.column_id, self.tag),
|
||||
(self.file_id, self.column_id, self.tag),
|
||||
Arc::new(meta.clone()),
|
||||
);
|
||||
CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
@@ -223,7 +180,6 @@ mod test {
|
||||
#[test]
|
||||
fn bloom_filter_metadata_weight_counts_vec_contents() {
|
||||
let file_id = FileId::parse_str("00000000-0000-0000-0000-000000000001").unwrap();
|
||||
let version = 0;
|
||||
let column_id: ColumnId = 42;
|
||||
let tag = Tag::Skipping;
|
||||
|
||||
@@ -247,13 +203,10 @@ mod test {
|
||||
],
|
||||
};
|
||||
|
||||
let weight = bloom_filter_index_metadata_weight(
|
||||
&(file_id, version, column_id, tag),
|
||||
&Arc::new(meta.clone()),
|
||||
);
|
||||
let weight =
|
||||
bloom_filter_index_metadata_weight(&(file_id, column_id, tag), &Arc::new(meta.clone()));
|
||||
|
||||
let base = file_id.as_bytes().len()
|
||||
+ std::mem::size_of::<IndexVersion>()
|
||||
+ std::mem::size_of::<ColumnId>()
|
||||
+ std::mem::size_of::<Tag>()
|
||||
+ std::mem::size_of::<BloomFilterMeta>();
|
||||
|
||||
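Both cached index readers in this file and the next funnel their reads through the page-granular `get_or_load` shown in src/mito2/src/cache/index.rs above: the requested byte range is split into fixed-size pages, cached pages are returned directly, missing pages are fetched and inserted, and the exact range is finally sliced out of the concatenated pages. A deliberately simplified, synchronous sketch of that flow (the real code batches all missing ranges into one async load and tracks hit/miss metrics):

use std::collections::HashMap;
use std::ops::Range;

/// Hypothetical, simplified page cache over a byte blob.
struct PageCache {
    page_size: u64,
    pages: HashMap<u64, Vec<u8>>, // page id -> page bytes
}

impl PageCache {
    fn get_or_load<F>(&mut self, offset: u64, size: u64, load: F) -> Vec<u8>
    where
        F: Fn(Range<u64>) -> Vec<u8>,
    {
        let first = offset / self.page_size;
        let last = (offset + size - 1) / self.page_size;
        let mut data = Vec::new();
        for page_id in first..=last {
            let page = self.pages.entry(page_id).or_insert_with(|| {
                // Cache miss: load the whole page from the underlying blob.
                let start = page_id * self.page_size;
                load(start..start + self.page_size)
            });
            data.extend_from_slice(page);
        }
        // Slice the requested range out of the concatenated pages.
        let start = (offset - first * self.page_size) as usize;
        data[start..start + size as usize].to_vec()
    }
}

fn main() {
    let blob: Vec<u8> = (0u8..=255).collect();
    let mut cache = PageCache { page_size: 64, pages: HashMap::new() };
    let load = |r: Range<u64>| blob[r.start as usize..r.end as usize].to_vec();
    assert_eq!(cache.get_or_load(60, 8, &load), blob[60..68].to_vec());
    // A second read of the same pages is served from the cache.
    assert_eq!(cache.get_or_load(60, 8, &load), blob[60..68].to_vec());
}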
src/mito2/src/cache/index/inverted_index.rs (150 changed lines, vendored)
@@ -14,15 +14,14 @@

use core::ops::Range;
use std::sync::Arc;
use std::time::Instant;

use api::v1::index::InvertedIndexMetas;
use async_trait::async_trait;
use bytes::Bytes;
use index::inverted_index::error::Result;
use index::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
use index::inverted_index::format::reader::InvertedIndexReader;
use prost::Message;
use store_api::storage::{FileId, IndexVersion};
use store_api::storage::FileId;

use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey};
use crate::metrics::{CACHE_HIT, CACHE_MISS};
@@ -30,7 +29,7 @@ use crate::metrics::{CACHE_HIT, CACHE_MISS};
const INDEX_TYPE_INVERTED_INDEX: &str = "inverted_index";

/// Cache for inverted index.
pub type InvertedIndexCache = IndexCache<(FileId, IndexVersion), InvertedIndexMetas>;
pub type InvertedIndexCache = IndexCache<FileId, InvertedIndexMetas>;
pub type InvertedIndexCacheRef = Arc<InvertedIndexCache>;

impl InvertedIndexCache {
@@ -48,24 +47,23 @@ impl InvertedIndexCache {

/// Removes all cached entries for the given `file_id`.
pub fn invalidate_file(&self, file_id: FileId) {
self.invalidate_if(move |key| key.0 == file_id);
self.invalidate_if(move |key| *key == file_id);
}
}

/// Calculates weight for inverted index metadata.
fn inverted_index_metadata_weight(k: &(FileId, IndexVersion), v: &Arc<InvertedIndexMetas>) -> u32 {
(k.0.as_bytes().len() + size_of::<IndexVersion>() + v.encoded_len()) as u32
fn inverted_index_metadata_weight(k: &FileId, v: &Arc<InvertedIndexMetas>) -> u32 {
(k.as_bytes().len() + v.encoded_len()) as u32
}

/// Calculates weight for inverted index content.
fn inverted_index_content_weight((k, _): &((FileId, IndexVersion), PageKey), v: &Bytes) -> u32 {
(k.0.as_bytes().len() + size_of::<IndexVersion>() + v.len()) as u32
fn inverted_index_content_weight((k, _): &(FileId, PageKey), v: &Bytes) -> u32 {
(k.as_bytes().len() + v.len()) as u32
}
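Both weight helpers above estimate an entry's footprint as key bytes plus value bytes, so the cache can enforce a byte budget rather than an entry count. A hedged sketch of how such a weigher plugs into a size-bounded cache, using `moka::sync::Cache` and a made-up key shape purely for illustration (the crate's actual `IndexCache` wiring may differ):

use bytes::Bytes;
use moka::sync::Cache;

// Illustrative key shape: a 16-byte file id plus a page index.
type PageCacheKey = ([u8; 16], u64);

fn page_weight(key: &PageCacheKey, page: &Bytes) -> u32 {
    (key.0.len() + std::mem::size_of::<u64>() + page.len()) as u32
}

fn build_page_cache(capacity_bytes: u64) -> Cache<PageCacheKey, Bytes> {
    Cache::builder()
        // With a weigher installed, max_capacity is a total weight budget (bytes here).
        .weigher(|k: &PageCacheKey, v: &Bytes| page_weight(k, v))
        .max_capacity(capacity_bytes)
        .build()
}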
/// Inverted index blob reader with cache.
|
||||
pub struct CachedInvertedIndexBlobReader<R> {
|
||||
file_id: FileId,
|
||||
index_version: IndexVersion,
|
||||
blob_size: u64,
|
||||
inner: R,
|
||||
cache: InvertedIndexCacheRef,
|
||||
@@ -73,16 +71,9 @@ pub struct CachedInvertedIndexBlobReader<R> {
|
||||
|
||||
impl<R> CachedInvertedIndexBlobReader<R> {
|
||||
/// Creates a new inverted index blob reader with cache.
|
||||
pub fn new(
|
||||
file_id: FileId,
|
||||
index_version: IndexVersion,
|
||||
blob_size: u64,
|
||||
inner: R,
|
||||
cache: InvertedIndexCacheRef,
|
||||
) -> Self {
|
||||
pub fn new(file_id: FileId, blob_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self {
|
||||
Self {
|
||||
file_id,
|
||||
index_version,
|
||||
blob_size,
|
||||
inner,
|
||||
cache,
|
||||
@@ -92,88 +83,47 @@ impl<R> CachedInvertedIndexBlobReader<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobReader<R> {
|
||||
async fn range_read<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<u8>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>> {
|
||||
let inner = &self.inner;
|
||||
let (result, cache_metrics) = self
|
||||
.cache
|
||||
self.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.index_version),
|
||||
self.file_id,
|
||||
self.blob_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += cache_metrics.page_bytes;
|
||||
m.total_ranges += cache_metrics.num_pages;
|
||||
m.cache_hit += cache_metrics.cache_hit;
|
||||
m.cache_miss += cache_metrics.cache_miss;
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn read_vec<'a>(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
let mut pages = Vec::with_capacity(ranges.len());
|
||||
let mut total_cache_metrics = crate::cache::index::IndexCacheMetrics::default();
|
||||
for range in ranges {
|
||||
let inner = &self.inner;
|
||||
let (page, cache_metrics) = self
|
||||
let page = self
|
||||
.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.index_version),
|
||||
self.file_id,
|
||||
self.blob_size,
|
||||
range.start,
|
||||
(range.end - range.start) as u32,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
total_cache_metrics.merge(&cache_metrics);
|
||||
pages.push(Bytes::from(page));
|
||||
}
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += total_cache_metrics.page_bytes;
|
||||
m.total_ranges += total_cache_metrics.num_pages;
|
||||
m.cache_hit += total_cache_metrics.cache_hit;
|
||||
m.cache_miss += total_cache_metrics.cache_miss;
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
async fn metadata<'a>(
|
||||
&self,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Arc<InvertedIndexMetas>> {
|
||||
if let Some(cached) = self.cache.get_metadata((self.file_id, self.index_version)) {
|
||||
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>> {
|
||||
if let Some(cached) = self.cache.get_metadata(self.file_id) {
|
||||
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
if let Some(m) = metrics {
|
||||
m.cache_hit += 1;
|
||||
}
|
||||
Ok(cached)
|
||||
} else {
|
||||
let meta = self.inner.metadata(metrics).await?;
|
||||
self.cache
|
||||
.put_metadata((self.file_id, self.index_version), meta.clone());
|
||||
let meta = self.inner.metadata().await?;
|
||||
self.cache.put_metadata(self.file_id, meta.clone());
|
||||
CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
Ok(meta)
|
||||
}
|
||||
@@ -308,7 +258,6 @@ mod test {
|
||||
// Init a test range reader in local fs.
|
||||
let mut env = TestEnv::new().await;
|
||||
let file_size = blob.len() as u64;
|
||||
let index_version = 0;
|
||||
let store = env.init_object_store_manager();
|
||||
let temp_path = "data";
|
||||
store.write(temp_path, blob).await.unwrap();
|
||||
@@ -324,12 +273,11 @@ mod test {
|
||||
let reader = InvertedIndexBlobReader::new(range_reader);
|
||||
let cached_reader = CachedInvertedIndexBlobReader::new(
|
||||
FileId::random(),
|
||||
index_version,
|
||||
file_size,
|
||||
reader,
|
||||
Arc::new(InvertedIndexCache::new(8192, 8192, 50)),
|
||||
);
|
||||
let metadata = cached_reader.metadata(None).await.unwrap();
|
||||
let metadata = cached_reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 2);
|
||||
@@ -344,19 +292,13 @@ mod test {
|
||||
.fst(
|
||||
tag0.base_offset + tag0.relative_fst_offset as u64,
|
||||
tag0.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -365,12 +307,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -379,12 +316,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -403,19 +335,13 @@ mod test {
|
||||
.fst(
|
||||
tag1.base_offset + tag1.relative_fst_offset as u64,
|
||||
tag1.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -424,12 +350,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -438,12 +359,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -456,16 +372,16 @@ mod test {
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.random_range(0..file_size);
|
||||
let size = rng.random_range(0..file_size as u32 - offset as u32);
|
||||
let expected = cached_reader.range_read(offset, size, None).await.unwrap();
|
||||
let expected = cached_reader.range_read(offset, size).await.unwrap();
|
||||
let inner = &cached_reader.inner;
|
||||
let (read, _cache_metrics) = cached_reader
|
||||
let read = cached_reader
|
||||
.cache
|
||||
.get_or_load(
|
||||
(cached_reader.file_id, cached_reader.index_version),
|
||||
cached_reader.file_id,
|
||||
file_size,
|
||||
offset,
|
||||
size,
|
||||
|ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
|ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
574  src/mito2/src/cache/manifest_cache.rs  vendored
@@ -1,574 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! A cache for manifest files.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_telemetry::{error, info, warn};
|
||||
use futures::{FutureExt, TryStreamExt};
|
||||
use moka::future::Cache;
|
||||
use moka::notification::RemovalCause;
|
||||
use moka::policy::EvictionPolicy;
|
||||
use object_store::ObjectStore;
|
||||
use object_store::util::join_path;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{OpenDalSnafu, Result};
|
||||
use crate::metrics::{CACHE_BYTES, CACHE_HIT, CACHE_MISS};
|
||||
|
||||
/// Subdirectory of cached manifest files.
|
||||
///
|
||||
/// This must contain three layers, corresponding to [`build_prometheus_metrics_layer`](object_store::layers::build_prometheus_metrics_layer).
|
||||
const MANIFEST_DIR: &str = "cache/object/manifest/";
|
||||
|
||||
/// Metric label for manifest files.
|
||||
const MANIFEST_TYPE: &str = "manifest";
|
||||
|
||||
/// A manifest cache manages manifest files on local store and evicts files based
|
||||
/// on size.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ManifestCache {
|
||||
/// Local store to cache files.
|
||||
local_store: ObjectStore,
|
||||
/// Index to track cached manifest files.
|
||||
index: Cache<String, IndexValue>,
|
||||
}
|
||||
|
||||
impl ManifestCache {
|
||||
/// Creates a new manifest cache and recovers the index from local store.
|
||||
pub async fn new(
|
||||
local_store: ObjectStore,
|
||||
capacity: ReadableSize,
|
||||
ttl: Option<Duration>,
|
||||
) -> ManifestCache {
|
||||
let total_capacity = capacity.as_bytes();
|
||||
|
||||
info!(
|
||||
"Initializing manifest cache with capacity: {}",
|
||||
ReadableSize(total_capacity)
|
||||
);
|
||||
|
||||
let index = Self::build_cache(local_store.clone(), total_capacity, ttl);
|
||||
|
||||
let cache = ManifestCache { local_store, index };
|
||||
|
||||
// Recovers the cache index from local store asynchronously
|
||||
cache.recover(false).await;
|
||||
|
||||
cache
|
||||
}
|
||||
|
||||
/// Builds the cache.
|
||||
fn build_cache(
|
||||
local_store: ObjectStore,
|
||||
capacity: u64,
|
||||
ttl: Option<Duration>,
|
||||
) -> Cache<String, IndexValue> {
|
||||
let cache_store = local_store;
|
||||
let mut builder = Cache::builder()
|
||||
.eviction_policy(EvictionPolicy::lru())
|
||||
.weigher(|key: &String, value: &IndexValue| -> u32 {
|
||||
key.len() as u32 + value.file_size
|
||||
})
|
||||
.max_capacity(capacity)
|
||||
.async_eviction_listener(move |key: Arc<String>, value: IndexValue, cause| {
|
||||
let store = cache_store.clone();
|
||||
// Stores files under MANIFEST_DIR.
|
||||
let file_path = join_path(MANIFEST_DIR, &key);
|
||||
async move {
|
||||
if let RemovalCause::Replaced = cause {
|
||||
// The cache is replaced by another file. We don't remove the same
|
||||
// file but updates the metrics as the file is already replaced by users.
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.sub(value.file_size.into());
|
||||
return;
|
||||
}
|
||||
|
||||
match store.delete(&file_path).await {
|
||||
Ok(()) => {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.sub(value.file_size.into());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to delete cached manifest file {}", file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
});
|
||||
if let Some(ttl) = ttl {
|
||||
builder = builder.time_to_idle(ttl);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
/// Puts a file into the cache index.
|
||||
///
|
||||
/// The caller should ensure the file is in the correct path.
|
||||
pub(crate) async fn put(&self, key: String, value: IndexValue) {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.add(value.file_size.into());
|
||||
self.index.insert(key, value).await;
|
||||
|
||||
// Since files can be large items, we run the pending tasks immediately.
|
||||
self.index.run_pending_tasks().await;
|
||||
}
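The comment above notes that pending tasks are run immediately after an insert because manifest files can be large entries. A small, self-contained illustration of that behaviour with a byte-weighted `moka::future::Cache` (the key, size, and capacity are made up):

use moka::future::Cache;

#[tokio::main]
async fn main() {
    // Byte-weighted cache in the same spirit as the manifest index above.
    let index: Cache<String, u32> = Cache::builder()
        .weigher(|key: &String, file_size: &u32| key.len() as u32 + *file_size)
        .max_capacity(1024)
        .build();

    index.insert("a/b.json".to_string(), 100).await;
    // moka applies weights and evictions lazily; flushing the pending work makes
    // weighted_size() (and any required evictions) reflect the insert right away.
    index.run_pending_tasks().await;
    assert_eq!(index.weighted_size(), 108); // 8 key bytes + 100 file bytes
}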
|
||||
|
||||
/// Gets the index value for the key.
|
||||
pub(crate) async fn get(&self, key: &str) -> Option<IndexValue> {
|
||||
self.index.get(key).await
|
||||
}
|
||||
|
||||
/// Removes a file from the cache explicitly.
|
||||
pub(crate) async fn remove(&self, key: &str) {
|
||||
let file_path = self.cache_file_path(key);
|
||||
self.index.remove(key).await;
|
||||
// Always deletes the file from the local store.
|
||||
if let Err(e) = self.local_store.delete(&file_path).await {
|
||||
warn!(e; "Failed to delete a cached manifest file {}", file_path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes multiple files from the cache in batch.
|
||||
pub(crate) async fn remove_batch(&self, keys: &[String]) {
|
||||
if keys.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for key in keys {
|
||||
self.index.remove(key).await;
|
||||
}
|
||||
|
||||
let file_paths: Vec<String> = keys.iter().map(|key| self.cache_file_path(key)).collect();
|
||||
|
||||
if let Err(e) = self.local_store.delete_iter(file_paths).await {
|
||||
warn!(e; "Failed to delete cached manifest files in batch");
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover_inner(&self) -> Result<()> {
|
||||
let now = Instant::now();
|
||||
let mut lister = self
|
||||
.local_store
|
||||
.lister_with(MANIFEST_DIR)
|
||||
.recursive(true)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
let (mut total_size, mut total_keys) = (0i64, 0);
|
||||
while let Some(entry) = lister.try_next().await.context(OpenDalSnafu)? {
|
||||
let meta = entry.metadata();
|
||||
if !meta.is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let meta = self
|
||||
.local_store
|
||||
.stat(entry.path())
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
let file_size = meta.content_length() as u32;
|
||||
let key = entry.path().trim_start_matches(MANIFEST_DIR).to_string();
|
||||
common_telemetry::info!("Manifest cache recover {}, size: {}", key, file_size);
|
||||
self.index.insert(key, IndexValue { file_size }).await;
|
||||
let size = i64::from(file_size);
|
||||
total_size += size;
|
||||
total_keys += 1;
|
||||
}
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.add(total_size);
|
||||
|
||||
// Runs all pending tasks of the moka cache so that the cache size is updated
|
||||
// and the eviction policy is applied.
|
||||
self.index.run_pending_tasks().await;
|
||||
|
||||
let weight = self.index.weighted_size();
|
||||
let count = self.index.entry_count();
|
||||
info!(
|
||||
"Recovered manifest cache, num_keys: {}, num_bytes: {}, count: {}, weight: {}, cost: {:?}",
|
||||
total_keys,
|
||||
total_size,
|
||||
count,
|
||||
weight,
|
||||
now.elapsed()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Recovers the index from local store.
|
||||
pub(crate) async fn recover(&self, sync: bool) {
|
||||
let moved_self = self.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(err) = moved_self.recover_inner().await {
|
||||
error!(err; "Failed to recover manifest cache.")
|
||||
}
|
||||
|
||||
moved_self.clean_empty_dirs(true).await;
|
||||
});
|
||||
|
||||
if sync {
|
||||
let _ = handle.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
pub(crate) fn cache_file_path(&self, key: &str) -> String {
|
||||
join_path(MANIFEST_DIR, key)
|
||||
}
|
||||
|
||||
/// Gets a manifest file from cache.
|
||||
/// Returns the file data if found in cache, None otherwise.
|
||||
pub(crate) async fn get_file(&self, key: &str) -> Option<Vec<u8>> {
|
||||
if self.get(key).await.is_none() {
|
||||
CACHE_MISS.with_label_values(&[MANIFEST_TYPE]).inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let cache_file_path = self.cache_file_path(key);
|
||||
match self.local_store.read(&cache_file_path).await {
|
||||
Ok(data) => {
|
||||
CACHE_HIT.with_label_values(&[MANIFEST_TYPE]).inc();
|
||||
Some(data.to_vec())
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to read cached manifest file {}", cache_file_path);
|
||||
CACHE_MISS.with_label_values(&[MANIFEST_TYPE]).inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Puts a manifest file into cache.
|
||||
pub(crate) async fn put_file(&self, key: String, data: Vec<u8>) {
|
||||
let cache_file_path = self.cache_file_path(&key);
|
||||
|
||||
if let Err(e) = self.local_store.write(&cache_file_path, data.clone()).await {
|
||||
warn!(e; "Failed to write manifest to cache {}", cache_file_path);
|
||||
return;
|
||||
}
|
||||
|
||||
let file_size = data.len() as u32;
|
||||
self.put(key, IndexValue { file_size }).await;
|
||||
}
|
||||
|
||||
/// Removes empty directories recursively under the manifest cache directory.
|
||||
///
|
||||
/// If `check_mtime` is true, only removes directories that have not been modified
|
||||
/// for at least 1 hour.
|
||||
pub(crate) async fn clean_empty_dirs(&self, check_mtime: bool) {
|
||||
info!("Clean empty dirs start");
|
||||
|
||||
let root = self.local_store.info().root();
|
||||
let manifest_dir = PathBuf::from(root).join(MANIFEST_DIR);
|
||||
let manifest_dir_clone = manifest_dir.clone();
|
||||
|
||||
let result = tokio::task::spawn_blocking(move || {
|
||||
Self::clean_empty_dirs_sync(&manifest_dir_clone, check_mtime)
|
||||
})
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
info!("Clean empty dirs end");
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!(e; "Failed to clean empty directories under {}", manifest_dir.display());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to spawn blocking task for cleaning empty directories");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all manifest files under the given directory from cache and cleans up empty directories.
|
||||
pub(crate) async fn clean_manifests(&self, dir: &str) {
|
||||
info!("Clean manifest cache for directory: {}", dir);
|
||||
|
||||
let cache_dir = join_path(MANIFEST_DIR, dir);
|
||||
let mut lister = match self
|
||||
.local_store
|
||||
.lister_with(&cache_dir)
|
||||
.recursive(true)
|
||||
.await
|
||||
{
|
||||
Ok(lister) => lister,
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to list manifest files under {}", cache_dir);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let mut keys_to_remove = Vec::new();
|
||||
loop {
|
||||
match lister.try_next().await {
|
||||
Ok(Some(entry)) => {
|
||||
let meta = entry.metadata();
|
||||
if meta.is_file() {
|
||||
keys_to_remove
|
||||
.push(entry.path().trim_start_matches(MANIFEST_DIR).to_string());
|
||||
}
|
||||
}
|
||||
Ok(None) => break,
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to read entry while listing {}", cache_dir);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Going to remove files from manifest cache, files: {:?}",
|
||||
keys_to_remove
|
||||
);
|
||||
|
||||
// Removes all files from cache in batch
|
||||
self.remove_batch(&keys_to_remove).await;
|
||||
|
||||
// Cleans up empty directories under the given dir
|
||||
let root = self.local_store.info().root();
|
||||
let dir_path = PathBuf::from(root).join(&cache_dir);
|
||||
let dir_path_clone = dir_path.clone();
|
||||
|
||||
let result = tokio::task::spawn_blocking(move || {
|
||||
Self::clean_empty_dirs_sync(&dir_path_clone, false)
|
||||
})
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
info!("Cleaned manifest cache for directory: {}", dir);
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!(e; "Failed to clean empty directories under {}", dir_path.display());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to spawn blocking task for cleaning empty directories");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Synchronously removes empty directories recursively.
|
||||
///
|
||||
/// If `check_mtime` is true, only removes directories that have not been modified
|
||||
/// for at least 1 hour.
|
||||
fn clean_empty_dirs_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result<()> {
|
||||
Self::remove_empty_dirs_recursive_sync(dir, check_mtime)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_empty_dirs_recursive_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result<bool> {
|
||||
common_telemetry::debug!(
|
||||
"Maybe remove empty dir: {:?}, check_mtime: {}",
|
||||
dir,
|
||||
check_mtime
|
||||
);
|
||||
let entries = match std::fs::read_dir(dir) {
|
||||
Ok(entries) => entries,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// Directory doesn't exist, treat as already removed (empty)
|
||||
return Ok(true);
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
let mut is_empty = true;
|
||||
// Iterates all entries under the directory.
|
||||
// We have to check all entries to clean up all empty subdirectories.
|
||||
for entry in entries {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
let metadata = std::fs::metadata(&path)?;
|
||||
|
||||
if metadata.is_dir() {
|
||||
// Checks if we should skip this directory based on modification time
|
||||
if check_mtime
|
||||
&& let Ok(modified) = metadata.modified()
|
||||
&& let Ok(elapsed) = modified.elapsed()
|
||||
&& elapsed < Duration::from_secs(3600)
|
||||
{
|
||||
common_telemetry::debug!("Skip directory by mtime, elapsed: {:?}", elapsed);
|
||||
// Only removes if not modified for at least 1 hour.
|
||||
is_empty = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
let subdir_empty = Self::remove_empty_dirs_recursive_sync(&path, check_mtime)?;
|
||||
if subdir_empty {
|
||||
if let Err(e) = std::fs::remove_dir(&path)
|
||||
&& e.kind() != std::io::ErrorKind::NotFound
|
||||
{
|
||||
warn!(e; "Failed to remove empty directory {}", path.display());
|
||||
is_empty = false;
|
||||
} else {
|
||||
info!(
|
||||
"Removed empty directory {} from manifest cache",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
} else {
|
||||
is_empty = false;
|
||||
}
|
||||
} else {
|
||||
is_empty = false;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(is_empty)
|
||||
}
|
||||
}
|
||||
|
||||
/// An entity that describes the file in the manifest cache.
|
||||
///
|
||||
/// It should only keep minimal information needed by the cache.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct IndexValue {
|
||||
/// Size of the file in bytes.
|
||||
pub(crate) file_size: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use object_store::services::Fs;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn new_fs_store(path: &str) -> ObjectStore {
|
||||
let builder = Fs::default().root(path);
|
||||
ObjectStore::new(builder).unwrap().finish()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_manifest_cache_basic() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("");
|
||||
let local_store = new_fs_store(dir.path().to_str().unwrap());
|
||||
|
||||
let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await;
|
||||
let key = "region_1/manifest/00000000000000000007.json";
|
||||
let file_path = cache.cache_file_path(key);
|
||||
|
||||
// Get an empty file.
|
||||
assert!(cache.get(key).await.is_none());
|
||||
|
||||
// Write a file.
|
||||
local_store
|
||||
.write(&file_path, b"manifest content".as_slice())
|
||||
.await
|
||||
.unwrap();
|
||||
// Add to the cache.
|
||||
cache
|
||||
.put(key.to_string(), IndexValue { file_size: 16 })
|
||||
.await;
|
||||
|
||||
// Get the cached value.
|
||||
let value = cache.get(key).await.unwrap();
|
||||
assert_eq!(16, value.file_size);
|
||||
|
||||
// Get weighted size.
|
||||
cache.index.run_pending_tasks().await;
|
||||
assert_eq!(59, cache.index.weighted_size());
|
||||
|
||||
// Remove the file.
|
||||
cache.remove(key).await;
|
||||
cache.index.run_pending_tasks().await;
|
||||
assert!(cache.get(key).await.is_none());
|
||||
|
||||
// Ensure all pending tasks of the moka cache is done before assertion.
|
||||
cache.index.run_pending_tasks().await;
|
||||
|
||||
// The file also not exists.
|
||||
assert!(!local_store.exists(&file_path).await.unwrap());
|
||||
assert_eq!(0, cache.index.weighted_size());
|
||||
}
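For reference, the 59 asserted above comes directly from the weigher defined earlier (key length plus file size): the key is 43 bytes and the recorded file size is 16.

fn main() {
    let key = "region_1/manifest/00000000000000000007.json";
    assert_eq!(key.len(), 43);
    // weigher: key.len() + value.file_size => 43 + 16 = 59
    assert_eq!(key.len() as u32 + 16, 59);
}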
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_manifest_cache_recover() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("");
|
||||
let local_store = new_fs_store(dir.path().to_str().unwrap());
|
||||
let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await;
|
||||
|
||||
// Write some manifest files with different paths
|
||||
let keys = [
|
||||
"region_1/manifest/00000000000000000001.json",
|
||||
"region_1/manifest/00000000000000000002.json",
|
||||
"region_1/manifest/00000000000000000001.checkpoint",
|
||||
"region_2/manifest/00000000000000000001.json",
|
||||
];
|
||||
|
||||
let mut total_size = 0;
|
||||
for (i, key) in keys.iter().enumerate() {
|
||||
let file_path = cache.cache_file_path(key);
|
||||
let content = format!("manifest-{}", i).into_bytes();
|
||||
local_store
|
||||
.write(&file_path, content.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Add to the cache.
|
||||
cache
|
||||
.put(
|
||||
key.to_string(),
|
||||
IndexValue {
|
||||
file_size: content.len() as u32,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
total_size += content.len() + key.len();
|
||||
}
|
||||
|
||||
// Create a new cache instance which will automatically recover from local store
|
||||
let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await;
|
||||
|
||||
// Wait for recovery to complete synchronously
|
||||
cache.recover(true).await;
|
||||
|
||||
// Check size.
|
||||
cache.index.run_pending_tasks().await;
|
||||
let total_cached = cache.index.weighted_size() as usize;
|
||||
assert_eq!(total_size, total_cached);
|
||||
|
||||
// Verify all files
|
||||
for (i, key) in keys.iter().enumerate() {
|
||||
let value = cache.get(key).await.unwrap();
|
||||
assert_eq!(format!("manifest-{}", i).len() as u32, value.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_file_path() {
|
||||
let dir = create_temp_dir("");
|
||||
let local_store = new_fs_store(dir.path().to_str().unwrap());
|
||||
let cache = ManifestCache::new(local_store, ReadableSize::mb(10), None).await;
|
||||
|
||||
assert_eq!(
|
||||
"cache/object/manifest/region_1/manifest/00000000000000000007.json",
|
||||
cache.cache_file_path("region_1/manifest/00000000000000000007.json")
|
||||
);
|
||||
assert_eq!(
|
||||
"cache/object/manifest/region_1/manifest/00000000000000000007.checkpoint",
|
||||
cache.cache_file_path("region_1/manifest/00000000000000000007.checkpoint")
|
||||
);
|
||||
}
|
||||
}
|
||||
31  src/mito2/src/cache/write_cache.rs  vendored
@@ -30,7 +30,6 @@ use crate::access_layer::{
|
||||
TempFileCleaner, WriteCachePathProvider, WriteType, new_fs_cache_store,
|
||||
};
|
||||
use crate::cache::file_cache::{FileCache, FileCacheRef, FileType, IndexKey, IndexValue};
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
use crate::error::{self, Result};
|
||||
use crate::metrics::UPLOAD_BYTES_TOTAL;
|
||||
use crate::region::opener::RegionLoadCacheTask;
|
||||
@@ -54,8 +53,6 @@ pub struct WriteCache {
|
||||
intermediate_manager: IntermediateManager,
|
||||
/// Sender for region load cache tasks.
|
||||
task_sender: UnboundedSender<RegionLoadCacheTask>,
|
||||
/// Optional cache for manifest files.
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
}
|
||||
|
||||
pub type WriteCacheRef = Arc<WriteCache>;
|
||||
@@ -70,7 +67,6 @@ impl WriteCache {
|
||||
index_cache_percent: Option<u8>,
|
||||
puffin_manager_factory: PuffinManagerFactory,
|
||||
intermediate_manager: IntermediateManager,
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
) -> Result<Self> {
|
||||
let (task_sender, task_receiver) = unbounded_channel();
|
||||
|
||||
@@ -87,7 +83,6 @@ impl WriteCache {
|
||||
puffin_manager_factory,
|
||||
intermediate_manager,
|
||||
task_sender,
|
||||
manifest_cache,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -99,19 +94,10 @@ impl WriteCache {
|
||||
index_cache_percent: Option<u8>,
|
||||
puffin_manager_factory: PuffinManagerFactory,
|
||||
intermediate_manager: IntermediateManager,
|
||||
manifest_cache_capacity: ReadableSize,
|
||||
) -> Result<Self> {
|
||||
info!("Init write cache on {cache_dir}, capacity: {cache_capacity}");
|
||||
|
||||
let local_store = new_fs_cache_store(cache_dir).await?;
|
||||
|
||||
// Create manifest cache if capacity is non-zero
|
||||
let manifest_cache = if manifest_cache_capacity.as_bytes() > 0 {
|
||||
Some(ManifestCache::new(local_store.clone(), manifest_cache_capacity, ttl).await)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self::new(
|
||||
local_store,
|
||||
cache_capacity,
|
||||
@@ -119,7 +105,6 @@ impl WriteCache {
|
||||
index_cache_percent,
|
||||
puffin_manager_factory,
|
||||
intermediate_manager,
|
||||
manifest_cache,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -129,11 +114,6 @@ impl WriteCache {
|
||||
self.file_cache.clone()
|
||||
}
|
||||
|
||||
/// Returns the manifest cache if available.
|
||||
pub(crate) fn manifest_cache(&self) -> Option<ManifestCache> {
|
||||
self.manifest_cache.clone()
|
||||
}
|
||||
|
||||
/// Build the puffin manager
|
||||
pub(crate) fn build_puffin_manager(&self) -> SstPuffinManager {
|
||||
let store = self.file_cache.local_store();
|
||||
@@ -215,7 +195,6 @@ impl WriteCache {
|
||||
puffin_manager: self
|
||||
.puffin_manager_factory
|
||||
.build(store.clone(), path_provider.clone()),
|
||||
write_cache_enabled: true,
|
||||
intermediate_manager: self.intermediate_manager.clone(),
|
||||
index_options: write_request.index_options,
|
||||
inverted_index_config: write_request.inverted_index_config,
|
||||
@@ -267,7 +246,7 @@ impl WriteCache {
|
||||
upload_tracker.push_uploaded_file(parquet_path);
|
||||
|
||||
if sst.index_metadata.file_size > 0 {
|
||||
let puffin_key = IndexKey::new(region_id, sst.file_id, FileType::Puffin(0));
|
||||
let puffin_key = IndexKey::new(region_id, sst.file_id, FileType::Puffin);
|
||||
let puffin_path = upload_request
|
||||
.dest_path_provider
|
||||
.build_index_file_path(RegionFileId::new(region_id, sst.file_id));
|
||||
@@ -440,11 +419,7 @@ impl UploadTracker {
|
||||
file_cache.remove(parquet_key).await;
|
||||
|
||||
if sst.index_metadata.file_size > 0 {
|
||||
let puffin_key = IndexKey::new(
|
||||
self.region_id,
|
||||
sst.file_id,
|
||||
FileType::Puffin(sst.index_metadata.version),
|
||||
);
|
||||
let puffin_key = IndexKey::new(self.region_id, sst.file_id, FileType::Puffin);
|
||||
file_cache.remove(puffin_key).await;
|
||||
}
|
||||
}
|
||||
@@ -553,7 +528,7 @@ mod tests {
|
||||
assert_eq!(remote_data.to_vec(), cache_data.to_vec());
|
||||
|
||||
// Check write cache contains the index key
|
||||
let index_key = IndexKey::new(region_id, file_id, FileType::Puffin(0));
|
||||
let index_key = IndexKey::new(region_id, file_id, FileType::Puffin);
|
||||
assert!(write_cache.file_cache.contains_key(&index_key));
|
||||
|
||||
let remote_index_data = mock_store.read(&index_upload_path).await.unwrap();
|
||||
|
||||
@@ -1110,7 +1110,6 @@ mod tests {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
|
||||
@@ -399,7 +399,7 @@ impl DefaultCompactor {
|
||||
available_indexes: sst_info.index_metadata.build_available_indexes(),
|
||||
indexes: sst_info.index_metadata.build_indexes(),
|
||||
index_file_size: sst_info.index_metadata.file_size,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: sst_info.num_rows as u64,
|
||||
num_row_groups: sst_info.num_row_groups,
|
||||
sequence: max_sequence,
|
||||
|
||||
@@ -77,7 +77,7 @@ pub fn new_file_handle_with_size_and_sequence(
|
||||
available_indexes: Default::default(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
num_series: 0,
|
||||
|
||||
@@ -135,7 +135,7 @@ use crate::read::stream::ScanBatchStream;
|
||||
use crate::region::MitoRegionRef;
|
||||
use crate::region::opener::PartitionExprFetcherRef;
|
||||
use crate::request::{RegionEditRequest, WorkerRequest};
|
||||
use crate::sst::file::{FileMeta, RegionFileId, RegionIndexId};
|
||||
use crate::sst::file::{FileMeta, RegionFileId};
|
||||
use crate::sst::file_ref::FileReferenceManagerRef;
|
||||
use crate::wal::entry_distributor::{
|
||||
DEFAULT_ENTRY_RECEIVER_BUFFER_SIZE, build_wal_entry_distributor_and_receivers,
|
||||
@@ -541,23 +541,22 @@ impl MitoEngine {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let index_version = entry.index_version;
|
||||
let file_id = match FileId::parse_str(&entry.file_id) {
|
||||
let Some(index_file_id) = entry.index_file_id.as_ref() else {
|
||||
return Vec::new();
|
||||
};
|
||||
let file_id = match FileId::parse_str(index_file_id) {
|
||||
Ok(file_id) => file_id,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
err;
|
||||
"Failed to parse puffin index file id, table_dir: {}, file_id: {}",
|
||||
entry.table_dir,
|
||||
entry.file_id
|
||||
index_file_id
|
||||
);
|
||||
return Vec::new();
|
||||
}
|
||||
};
|
||||
let region_index_id = RegionIndexId::new(
|
||||
RegionFileId::new(entry.region_id, file_id),
|
||||
index_version,
|
||||
);
|
||||
let region_file_id = RegionFileId::new(entry.region_id, file_id);
|
||||
let context = IndexEntryContext {
|
||||
table_dir: &entry.table_dir,
|
||||
index_file_path: index_file_path.as_str(),
|
||||
@@ -566,7 +565,7 @@ impl MitoEngine {
|
||||
region_number: entry.region_number,
|
||||
region_group: entry.region_group,
|
||||
region_sequence: entry.region_sequence,
|
||||
file_id: &entry.file_id,
|
||||
file_id: index_file_id,
|
||||
index_file_size: entry.index_file_size,
|
||||
node_id,
|
||||
};
|
||||
@@ -577,7 +576,7 @@ impl MitoEngine {
|
||||
|
||||
collect_index_entries_from_puffin(
|
||||
manager,
|
||||
region_index_id,
|
||||
region_file_id,
|
||||
context,
|
||||
bloom_filter_cache,
|
||||
inverted_index_cache,
|
||||
|
||||
@@ -861,10 +861,9 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) {
|
||||
#[tokio::test]
|
||||
async fn test_list_ssts() {
|
||||
test_list_ssts_with_format(false, r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2513, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2513, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2513, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,
|
||||
r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2513, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2513, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2513, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#"
|
||||
StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000002/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
@@ -872,10 +871,9 @@ StorageSstEntry { file_path: "test/11_0000000002/index/<file_id>.puffin", file_s
|
||||
StorageSstEntry { file_path: "test/22_0000000042/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/22_0000000042/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await;
|
||||
test_list_ssts_with_format(true, r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_version: 0, level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#,
|
||||
r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", index_file_id: Some("<index_file_id>"), level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#"
|
||||
StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000002/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
@@ -947,13 +945,13 @@ async fn test_list_ssts_with_format(
|
||||
.index_file_path
|
||||
.map(|p| p.replace(&e.file_id, "<file_id>"));
|
||||
e.file_id = "<file_id>".to_string();
|
||||
e.index_version = 0;
|
||||
e.index_file_id = e.index_file_id.map(|_| "<index_file_id>".to_string());
|
||||
format!("\n{:?}", e)
|
||||
})
|
||||
.sorted()
|
||||
.collect::<Vec<_>>()
|
||||
.join("");
|
||||
assert_eq!(debug_format, expected_manifest_ssts, "{}", debug_format);
|
||||
assert_eq!(debug_format, expected_manifest_ssts,);
|
||||
|
||||
// list from storage
|
||||
let storage_entries = engine
|
||||
@@ -971,7 +969,7 @@ async fn test_list_ssts_with_format(
|
||||
.sorted()
|
||||
.collect::<Vec<_>>()
|
||||
.join("");
|
||||
assert_eq!(debug_format, expected_storage_ssts, "{}", debug_format);
|
||||
assert_eq!(debug_format, expected_storage_ssts,);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -515,7 +515,6 @@ async fn test_flush_workers() {
|
||||
}
|
||||
|
||||
async fn test_flush_workers_with_format(flat_format: bool) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let mut env = TestEnv::new().await;
|
||||
let write_buffer_manager = Arc::new(MockWriteBufferManager::default());
|
||||
let listener = Arc::new(FlushListener::default());
|
||||
@@ -575,7 +574,7 @@ async fn test_flush_workers_with_format(flat_format: bool) {
|
||||
put_rows(&engine, region_id0, rows).await;
|
||||
|
||||
// Waits until flush is finished.
|
||||
while listener.success_count() < 3 {
|
||||
while listener.success_count() < 2 {
|
||||
listener.wait().await;
|
||||
}
|
||||
|
||||
|
||||
@@ -55,10 +55,10 @@ async fn num_of_index_files(engine: &MitoEngine, scanner: &Scanner, region_id: R
|
||||
return 0;
|
||||
}
|
||||
let mut index_files_count: usize = 0;
|
||||
for region_index_id in scanner.index_ids() {
|
||||
for region_file_id in scanner.file_ids() {
|
||||
let index_path = location::index_file_path(
|
||||
access_layer.table_dir(),
|
||||
region_index_id,
|
||||
region_file_id,
|
||||
access_layer.path_type(),
|
||||
);
|
||||
if access_layer
|
||||
|
||||
@@ -32,7 +32,7 @@ use crate::cache::index::bloom_filter_index::{
|
||||
BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader, Tag,
|
||||
};
|
||||
use crate::cache::index::inverted_index::{CachedInvertedIndexBlobReader, InvertedIndexCacheRef};
|
||||
use crate::sst::file::RegionIndexId;
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE as BLOOM_BLOB_TYPE;
|
||||
use crate::sst::index::fulltext_index::{
|
||||
INDEX_BLOB_TYPE_BLOOM as FULLTEXT_BLOOM_BLOB_TYPE,
|
||||
@@ -66,14 +66,14 @@ pub(crate) struct IndexEntryContext<'a> {
|
||||
/// Collect index metadata entries present in the SST puffin file.
|
||||
pub(crate) async fn collect_index_entries_from_puffin(
|
||||
manager: SstPuffinManager,
|
||||
region_index_id: RegionIndexId,
|
||||
region_file_id: RegionFileId,
|
||||
context: IndexEntryContext<'_>,
|
||||
bloom_filter_cache: Option<BloomFilterIndexCacheRef>,
|
||||
inverted_index_cache: Option<InvertedIndexCacheRef>,
|
||||
) -> Vec<PuffinIndexMetaEntry> {
|
||||
let mut entries = Vec::new();
|
||||
|
||||
let reader = match manager.reader(&region_index_id).await {
|
||||
let reader = match manager.reader(&region_file_id).await {
|
||||
Ok(reader) => reader,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
@@ -104,7 +104,7 @@ pub(crate) async fn collect_index_entries_from_puffin(
|
||||
Some(BlobIndexTypeTargetKey::BloomFilter(target_key)) => {
|
||||
let bloom_meta = try_read_bloom_meta(
|
||||
&reader,
|
||||
region_index_id,
|
||||
region_file_id,
|
||||
blob.blob_type.as_str(),
|
||||
target_key,
|
||||
bloom_filter_cache.as_ref(),
|
||||
@@ -130,7 +130,7 @@ pub(crate) async fn collect_index_entries_from_puffin(
|
||||
Some(BlobIndexTypeTargetKey::FulltextBloom(target_key)) => {
|
||||
let bloom_meta = try_read_bloom_meta(
|
||||
&reader,
|
||||
region_index_id,
|
||||
region_file_id,
|
||||
blob.blob_type.as_str(),
|
||||
target_key,
|
||||
bloom_filter_cache.as_ref(),
|
||||
@@ -172,7 +172,7 @@ pub(crate) async fn collect_index_entries_from_puffin(
|
||||
Some(BlobIndexTypeTargetKey::Inverted) => {
|
||||
let mut inverted_entries = collect_inverted_entries(
|
||||
&reader,
|
||||
region_index_id,
|
||||
region_file_id,
|
||||
inverted_index_cache.as_ref(),
|
||||
&context,
|
||||
)
|
||||
@@ -188,12 +188,12 @@ pub(crate) async fn collect_index_entries_from_puffin(
|
||||
|
||||
async fn collect_inverted_entries(
|
||||
reader: &SstPuffinReader,
|
||||
region_index_id: RegionIndexId,
|
||||
region_file_id: RegionFileId,
|
||||
cache: Option<&InvertedIndexCacheRef>,
|
||||
context: &IndexEntryContext<'_>,
|
||||
) -> Vec<PuffinIndexMetaEntry> {
|
||||
// Read the inverted index blob and surface its per-column metadata entries.
|
||||
let file_id = region_index_id.file_id();
|
||||
let file_id = region_file_id.file_id();
|
||||
|
||||
let guard = match reader.blob(INVERTED_BLOB_TYPE).await {
|
||||
Ok(guard) => guard,
|
||||
@@ -229,12 +229,11 @@ async fn collect_inverted_entries(
|
||||
let metas = if let (Some(cache), Some(blob_size)) = (cache, blob_size) {
|
||||
let reader = CachedInvertedIndexBlobReader::new(
|
||||
file_id,
|
||||
region_index_id.version,
|
||||
blob_size,
|
||||
InvertedIndexBlobReader::new(blob_reader),
|
||||
cache.clone(),
|
||||
);
|
||||
match reader.metadata(None).await {
|
||||
match reader.metadata().await {
|
||||
Ok(metas) => metas,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
@@ -248,7 +247,7 @@ async fn collect_inverted_entries(
|
||||
}
|
||||
} else {
|
||||
let reader = InvertedIndexBlobReader::new(blob_reader);
|
||||
match reader.metadata(None).await {
|
||||
match reader.metadata().await {
|
||||
Ok(metas) => metas,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
@@ -290,7 +289,7 @@ fn build_inverted_entries(
|
||||
|
||||
async fn try_read_bloom_meta(
|
||||
reader: &SstPuffinReader,
|
||||
region_index_id: RegionIndexId,
|
||||
region_file_id: RegionFileId,
|
||||
blob_type: &str,
|
||||
target_key: &str,
|
||||
cache: Option<&BloomFilterIndexCacheRef>,
|
||||
@@ -312,18 +311,17 @@ async fn try_read_bloom_meta(
|
||||
let result = match (cache, column_id, blob_size) {
|
||||
(Some(cache), Some(column_id), Some(blob_size)) => {
|
||||
CachedBloomFilterIndexBlobReader::new(
|
||||
region_index_id.file_id(),
|
||||
region_index_id.version,
|
||||
region_file_id.file_id(),
|
||||
column_id,
|
||||
tag,
|
||||
blob_size,
|
||||
bloom_reader,
|
||||
cache.clone(),
|
||||
)
|
||||
.metadata(None)
|
||||
.metadata()
|
||||
.await
|
||||
}
|
||||
_ => bloom_reader.metadata(None).await,
|
||||
_ => bloom_reader.metadata().await,
|
||||
};
|
||||
|
||||
match result {
|
||||
|
||||
@@ -20,7 +20,7 @@ use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::time::Instant;
|
||||
|
||||
use common_telemetry::{debug, error, info};
|
||||
use common_telemetry::{debug, error, info, trace};
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use either::Either;
|
||||
use partition::expr::PartitionExpr;
|
||||
@@ -89,12 +89,6 @@ pub trait WriteBufferManager: Send + Sync + std::fmt::Debug {
|
||||
|
||||
/// Returns the total memory used by memtables.
|
||||
fn memory_usage(&self) -> usize;
|
||||
|
||||
/// Returns the mutable memtable memory limit.
|
||||
///
|
||||
/// The write buffer manager should flush memtables when the mutable memory usage
|
||||
/// exceeds this limit.
|
||||
fn flush_limit(&self) -> usize;
|
||||
}
|
||||
|
||||
pub type WriteBufferManagerRef = Arc<dyn WriteBufferManager>;
|
||||
@@ -151,7 +145,7 @@ impl WriteBufferManagerImpl {
impl WriteBufferManager for WriteBufferManagerImpl {
fn should_flush_engine(&self) -> bool {
let mutable_memtable_memory_usage = self.memory_active.load(Ordering::Relaxed);
if mutable_memtable_memory_usage >= self.mutable_limit {
if mutable_memtable_memory_usage > self.mutable_limit {
debug!(
"Engine should flush (over mutable limit), mutable_usage: {}, memory_usage: {}, mutable_limit: {}, global_limit: {}",
mutable_memtable_memory_usage,
@@ -163,8 +157,23 @@ impl WriteBufferManager for WriteBufferManagerImpl {
}

let memory_usage = self.memory_used.load(Ordering::Relaxed);
// If the memory exceeds the buffer size, we trigger more aggressive
// flush. But if already more than half memory is being flushed,
// triggering more flush may not help. We will hold it instead.
if memory_usage >= self.global_write_buffer_size {
return true;
if mutable_memtable_memory_usage >= self.global_write_buffer_size / 2 {
debug!(
"Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \
mutable_usage: {}.",
memory_usage, self.global_write_buffer_size, mutable_memtable_memory_usage
);
return true;
} else {
trace!(
"Engine won't flush, memory_usage: {}, global_write_buffer_size: {}, mutable_usage: {}.",
memory_usage, self.global_write_buffer_size, mutable_memtable_memory_usage
);
}
}

false
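Stripped of logging and atomics, the longer of the two variants in this hunk boils down to two checks: mutable memtable usage against its own limit, and total usage against the global write buffer size with a guard that the mutable part still accounts for at least half of that budget (otherwise another flush would not free much memory). A hedged restatement as a pure function; names are illustrative:

// Hedged sketch of the flush decision above; the real manager reads atomics
// and emits debug!/trace! logs.
struct Limits {
    mutable_limit: usize,
    global_write_buffer_size: usize,
}

fn should_flush_engine(mutable_usage: usize, total_usage: usize, limits: &Limits) -> bool {
    // 1. Mutable memtables alone exceed their budget (the two diff sides differ
    //    only on > vs >= here).
    if mutable_usage > limits.mutable_limit {
        return true;
    }
    // 2. Total usage exceeds the global budget, but trigger another flush only if
    //    at least half of that budget is still mutable; otherwise most memory is
    //    already being flushed and more flushes will not help.
    if total_usage >= limits.global_write_buffer_size
        && mutable_usage >= limits.global_write_buffer_size / 2
    {
        return true;
    }
    false
}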
@@ -196,10 +205,6 @@ impl WriteBufferManager for WriteBufferManagerImpl {
|
||||
fn memory_usage(&self) -> usize {
|
||||
self.memory_used.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn flush_limit(&self) -> usize {
|
||||
self.mutable_limit
|
||||
}
|
||||
}
|
||||
|
||||
/// Reason of a flush task.
|
||||
@@ -643,7 +648,7 @@ impl RegionFlushTask {
|
||||
available_indexes: sst_info.index_metadata.build_available_indexes(),
|
||||
indexes: sst_info.index_metadata.build_indexes(),
|
||||
index_file_size: sst_info.index_metadata.file_size,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: sst_info.num_rows as u64,
|
||||
num_row_groups: sst_info.num_row_groups,
|
||||
sequence: NonZeroU64::new(max_sequence),
|
||||
@@ -883,31 +888,6 @@ impl FlushScheduler {
|
||||
self.region_status.contains_key(&region_id)
|
||||
}
|
||||
|
||||
fn schedule_flush_task(
|
||||
&mut self,
|
||||
version_control: &VersionControlRef,
|
||||
task: RegionFlushTask,
|
||||
) -> Result<()> {
|
||||
let region_id = task.region_id;
|
||||
|
||||
// If current region doesn't have flush status, we can flush the region directly.
|
||||
if let Err(e) = version_control.freeze_mutable() {
|
||||
error!(e; "Failed to freeze the mutable memtable for region {}", region_id);
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
// Submit a flush job.
|
||||
let job = task.into_flush_job(version_control);
|
||||
if let Err(e) = self.scheduler.schedule(job) {
|
||||
// If scheduler returns error, senders in the job will be dropped and waiters
|
||||
// can get recv errors.
|
||||
error!(e; "Failed to schedule flush job for region {}", region_id);
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Schedules a flush `task` for specific `region`.
|
||||
pub(crate) fn schedule_flush(
|
||||
&mut self,
|
||||
@@ -930,21 +910,46 @@ impl FlushScheduler {
|
||||
.with_label_values(&[task.reason.as_str()])
|
||||
.inc();
|
||||
|
||||
// If current region has flush status, merge the task.
|
||||
if let Some(flush_status) = self.region_status.get_mut(&region_id) {
|
||||
// Checks whether we can flush the region now.
|
||||
debug!("Merging flush task for region {}", region_id);
|
||||
// Add this region to status map.
|
||||
let flush_status = self
|
||||
.region_status
|
||||
.entry(region_id)
|
||||
.or_insert_with(|| FlushStatus::new(region_id, version_control.clone()));
|
||||
// Checks whether we can flush the region now.
|
||||
if flush_status.flushing {
|
||||
// There is already a flush job running.
|
||||
flush_status.merge_task(task);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.schedule_flush_task(version_control, task)?;
|
||||
// TODO(yingwen): We can merge with pending and execute directly.
|
||||
// If there are pending tasks, then we should push it to pending list.
|
||||
if flush_status.pending_task.is_some() {
|
||||
flush_status.merge_task(task);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Add this region to status map.
|
||||
let _ = self.region_status.insert(
|
||||
region_id,
|
||||
FlushStatus::new(region_id, version_control.clone()),
|
||||
);
|
||||
// Now we can flush the region directly.
|
||||
if let Err(e) = version_control.freeze_mutable() {
|
||||
error!(e; "Failed to freeze the mutable memtable for region {}", region_id);
|
||||
|
||||
// Remove from region status if we can't freeze the mutable memtable.
|
||||
self.region_status.remove(&region_id);
|
||||
return Err(e);
|
||||
}
|
||||
// Submit a flush job.
|
||||
let job = task.into_flush_job(version_control);
|
||||
if let Err(e) = self.scheduler.schedule(job) {
|
||||
// If scheduler returns error, senders in the job will be dropped and waiters
|
||||
// can get recv errors.
|
||||
error!(e; "Failed to schedule flush job for region {}", region_id);
|
||||
|
||||
// Remove from region status if we can't submit the task.
|
||||
self.region_status.remove(&region_id);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
flush_status.flushing = true;
|
||||
|
||||
Ok(())
|
||||
}
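The reworked schedule_flush above folds the old two-path logic into a single FlushStatus entry per region: merge into the running or pending work when it exists, otherwise submit immediately and mark the region as flushing. A minimal sketch of that merge-or-submit decision, with made-up Task and Status types standing in for RegionFlushTask and FlushStatus:

    /// Illustrative per-region flush bookkeeping.
    #[derive(Default)]
    struct Status {
        flushing: bool,
        pending: Option<Task>,
    }

    #[derive(Debug)]
    struct Task {
        reasons: Vec<&'static str>,
    }

    impl Task {
        fn merge(&mut self, other: Task) {
            self.reasons.extend(other.reasons);
        }
    }

    impl Status {
        /// Folds a task into the pending slot, merging if one is already queued.
        fn queue(&mut self, task: Task) {
            let merged = match self.pending.take() {
                Some(mut pending) => {
                    pending.merge(task);
                    pending
                }
                None => task,
            };
            self.pending = Some(merged);
        }

        /// Returns the task to submit now, or None if it was merged instead.
        fn schedule(&mut self, task: Task) -> Option<Task> {
            if self.flushing || self.pending.is_some() {
                // A flush is already running or queued for this region:
                // merge rather than submitting a second job.
                self.queue(task);
                return None;
            }
            // Nothing running or queued: submit immediately.
            self.flushing = true;
            Some(task)
        }
    }

    fn main() {
        let mut status = Status::default();
        let first = status.schedule(Task { reasons: vec!["manual"] });
        assert!(first.is_some()); // submitted directly
        let second = status.schedule(Task { reasons: vec!["memory"] });
        assert!(second.is_none()); // merged while a flush is running
        assert!(status.pending.is_some());
    }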
|
||||
@@ -961,56 +966,48 @@ impl FlushScheduler {
|
||||
Vec<SenderBulkRequest>,
|
||||
)> {
|
||||
let flush_status = self.region_status.get_mut(&region_id)?;
|
||||
// If region doesn't have any pending flush task, we need to remove it from the status.
|
||||
if flush_status.pending_task.is_none() {
|
||||
|
||||
// This region doesn't have running flush job.
|
||||
flush_status.flushing = false;
|
||||
|
||||
let pending_requests = if flush_status.pending_task.is_none() {
|
||||
// The region doesn't have any pending flush task.
|
||||
// Safety: The flush status must exist.
|
||||
debug!(
|
||||
"Region {} doesn't have any pending flush task, removing it from the status",
|
||||
region_id
|
||||
);
|
||||
let flush_status = self.region_status.remove(&region_id).unwrap();
|
||||
return Some((
|
||||
Some((
|
||||
flush_status.pending_ddls,
|
||||
flush_status.pending_writes,
|
||||
flush_status.pending_bulk_writes,
|
||||
));
|
||||
))
|
||||
} else {
|
||||
let version_data = flush_status.version_control.current();
|
||||
if version_data.version.memtables.is_empty() {
|
||||
// The region has nothing to flush, we also need to remove it from the status.
|
||||
// Safety: The pending task is not None.
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
// The region has nothing to flush. We can notify pending task.
|
||||
task.on_success();
|
||||
// `schedule_next_flush()` may pick up the same region to flush, so we must remove
|
||||
// it from the status to avoid leaking pending requests.
|
||||
// Safety: The flush status must exist.
|
||||
let flush_status = self.region_status.remove(&region_id).unwrap();
|
||||
Some((
|
||||
flush_status.pending_ddls,
|
||||
flush_status.pending_writes,
|
||||
flush_status.pending_bulk_writes,
|
||||
))
|
||||
} else {
|
||||
// We can flush the region again, keep it in the region status.
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// Schedule next flush job.
|
||||
if let Err(e) = self.schedule_next_flush() {
|
||||
error!(e; "Flush of region {} is successful, but failed to schedule next flush", region_id);
|
||||
}
|
||||
|
||||
// If region has pending task, but has nothing to flush, we need to remove it from the status.
|
||||
let version_data = flush_status.version_control.current();
|
||||
if version_data.version.memtables.is_empty() {
|
||||
// The region has nothing to flush, we also need to remove it from the status.
|
||||
// Safety: The pending task is not None.
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
// The region has nothing to flush. We can notify pending task.
|
||||
task.on_success();
|
||||
debug!(
|
||||
"Region {} has nothing to flush, removing it from the status",
|
||||
region_id
|
||||
);
|
||||
// Safety: The flush status must exist.
|
||||
let flush_status = self.region_status.remove(&region_id).unwrap();
|
||||
return Some((
|
||||
flush_status.pending_ddls,
|
||||
flush_status.pending_writes,
|
||||
flush_status.pending_bulk_writes,
|
||||
));
|
||||
}
|
||||
|
||||
// If region has pending task and has something to flush, we need to schedule it.
|
||||
debug!("Scheduling pending flush task for region {}", region_id);
|
||||
// Safety: The flush status must exist.
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
let version_control = flush_status.version_control.clone();
|
||||
if let Err(err) = self.schedule_flush_task(&version_control, task) {
|
||||
error!(
|
||||
err;
|
||||
"Flush succeeded for region {region_id}, but failed to schedule next flush for it."
|
||||
);
|
||||
}
|
||||
// We can flush the region again, keep it in the region status.
|
||||
None
|
||||
pending_requests
|
||||
}
|
||||
|
||||
/// Notifies the scheduler that the flush job is failed.
|
||||
@@ -1026,6 +1023,11 @@ impl FlushScheduler {
|
||||
|
||||
// Fast fail: cancels all pending tasks and sends error to their waiters.
|
||||
flush_status.on_failure(err);
|
||||
|
||||
// Still tries to schedule a new flush.
|
||||
if let Err(e) = self.schedule_next_flush() {
|
||||
error!(e; "Failed to schedule next flush after region {} flush is failed", region_id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Notifies the scheduler that the region is dropped.
|
||||
@@ -1096,6 +1098,30 @@ impl FlushScheduler {
|
||||
.map(|status| !status.pending_ddls.is_empty())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Schedules a new flush task when the scheduler can submit next task.
|
||||
pub(crate) fn schedule_next_flush(&mut self) -> Result<()> {
|
||||
debug_assert!(
|
||||
self.region_status
|
||||
.values()
|
||||
.all(|status| status.flushing || status.pending_task.is_some())
|
||||
);
|
||||
|
||||
// Get the first region from status map.
|
||||
let Some(flush_status) = self
|
||||
.region_status
|
||||
.values_mut()
|
||||
.find(|status| status.pending_task.is_some())
|
||||
else {
|
||||
return Ok(());
|
||||
};
|
||||
debug_assert!(!flush_status.flushing);
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
let region_id = flush_status.region_id;
|
||||
let version_control = flush_status.version_control.clone();
|
||||
|
||||
self.schedule_flush(region_id, &version_control, task)
|
||||
}
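schedule_next_flush above simply promotes the first region whose status still holds a pending task. A small sketch of that selection step over a plain HashMap; u64 and String stand in for RegionId and RegionFlushTask, and the real method re-enters schedule_flush rather than flipping the flag directly:

    use std::collections::HashMap;

    #[derive(Debug)]
    struct Status {
        flushing: bool,
        pending: Option<String>, // stands in for RegionFlushTask
    }

    /// Picks the first region with a pending task, marks it as flushing,
    /// and returns the task to submit. Returns None when nothing is pending.
    fn schedule_next(statuses: &mut HashMap<u64, Status>) -> Option<(u64, String)> {
        let (&region_id, status) = statuses
            .iter_mut()
            .find(|(_, status)| status.pending.is_some())?;
        let task = status.pending.take().expect("checked above");
        status.flushing = true;
        Some((region_id, task))
    }

    fn main() {
        let mut statuses = HashMap::new();
        statuses.insert(1, Status { flushing: true, pending: None });
        statuses.insert(2, Status { flushing: false, pending: Some("pending".to_string()) });
        let next = schedule_next(&mut statuses);
        assert!(matches!(next, Some((2, _))));
    }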
|
||||
}
|
||||
|
||||
impl Drop for FlushScheduler {
|
||||
@@ -1115,6 +1141,11 @@ struct FlushStatus {
|
||||
region_id: RegionId,
|
||||
/// Version control of the region.
|
||||
version_control: VersionControlRef,
|
||||
/// There is a flush task running.
|
||||
///
|
||||
/// It is possible that a region is not flushing but has pending task if the scheduler
|
||||
/// doesn't schedules this region.
|
||||
flushing: bool,
|
||||
/// Task waiting for next flush.
|
||||
pending_task: Option<RegionFlushTask>,
|
||||
/// Pending ddl requests.
|
||||
@@ -1130,6 +1161,7 @@ impl FlushStatus {
|
||||
FlushStatus {
|
||||
region_id,
|
||||
version_control,
|
||||
flushing: false,
|
||||
pending_task: None,
|
||||
pending_ddls: Vec::new(),
|
||||
pending_writes: Vec::new(),
|
||||
@@ -1221,12 +1253,10 @@ mod tests {
// Global usage is still 1100.
manager.schedule_free_mem(200);
assert!(manager.should_flush_engine());
assert!(manager.should_stall());

// More than global limit, mutable (1100-200-450=450) is less than mutable limit (< 500).
// More than global limit, but mutable (1100-200-450=450) is not enough (< 500).
manager.schedule_free_mem(450);
assert!(manager.should_flush_engine());
assert!(manager.should_stall());
assert!(!manager.should_flush_engine());

// Now mutable is enough.
manager.reserve_mem(50);
|
||||
@@ -1473,92 +1503,4 @@ mod tests {
|
||||
assert_eq!(2, total_rows, "append_mode should preserve duplicates");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_schedule_pending_request_on_flush_success() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let job_scheduler = Arc::new(VecScheduler::default());
|
||||
let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone());
|
||||
let (tx, _rx) = mpsc::channel(4);
|
||||
let mut scheduler = env.mock_flush_scheduler();
|
||||
let mut builder = VersionControlBuilder::new();
|
||||
// Overwrites the empty memtable builder.
|
||||
builder.set_memtable_builder(Arc::new(TimeSeriesMemtableBuilder::default()));
|
||||
let version_control = Arc::new(builder.build());
|
||||
// Writes data to the memtable so it is not empty.
|
||||
let version_data = version_control.current();
|
||||
write_rows_to_version(&version_data.version, "host0", 0, 10);
|
||||
let manifest_ctx = env
|
||||
.mock_manifest_context(version_data.version.metadata.clone())
|
||||
.await;
|
||||
// Creates 2 tasks.
|
||||
let mut tasks: Vec<_> = (0..2)
|
||||
.map(|_| RegionFlushTask {
|
||||
region_id: builder.region_id(),
|
||||
reason: FlushReason::Others,
|
||||
senders: Vec::new(),
|
||||
request_sender: tx.clone(),
|
||||
access_layer: env.access_layer.clone(),
|
||||
listener: WorkerListener::default(),
|
||||
engine_config: Arc::new(MitoConfig::default()),
|
||||
row_group_size: None,
|
||||
cache_manager: Arc::new(CacheManager::default()),
|
||||
manifest_ctx: manifest_ctx.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
flush_semaphore: Arc::new(Semaphore::new(2)),
|
||||
is_staging: false,
|
||||
})
|
||||
.collect();
|
||||
// Schedule first task.
|
||||
let task = tasks.pop().unwrap();
|
||||
scheduler
|
||||
.schedule_flush(builder.region_id(), &version_control, task)
|
||||
.unwrap();
|
||||
// Should schedule 1 flush.
|
||||
assert_eq!(1, scheduler.region_status.len());
|
||||
assert_eq!(1, job_scheduler.num_jobs());
|
||||
// Schedule second task.
|
||||
let task = tasks.pop().unwrap();
|
||||
scheduler
|
||||
.schedule_flush(builder.region_id(), &version_control, task)
|
||||
.unwrap();
|
||||
assert!(
|
||||
scheduler
|
||||
.region_status
|
||||
.get(&builder.region_id())
|
||||
.unwrap()
|
||||
.pending_task
|
||||
.is_some()
|
||||
);
|
||||
|
||||
// Check the new version.
|
||||
let version_data = version_control.current();
|
||||
assert_eq!(0, version_data.version.memtables.immutables()[0].id());
|
||||
// Assumes the flush job is finished.
|
||||
version_control.apply_edit(
|
||||
Some(RegionEdit {
|
||||
files_to_add: Vec::new(),
|
||||
files_to_remove: Vec::new(),
|
||||
timestamp_ms: None,
|
||||
compaction_time_window: None,
|
||||
flushed_entry_id: None,
|
||||
flushed_sequence: None,
|
||||
committed_sequence: None,
|
||||
}),
|
||||
&[0],
|
||||
builder.file_purger(),
|
||||
);
|
||||
write_rows_to_version(&version_data.version, "host1", 0, 10);
|
||||
scheduler.on_flush_success(builder.region_id());
|
||||
assert_eq!(2, job_scheduler.num_jobs());
|
||||
// The pending task is cleared.
|
||||
assert!(
|
||||
scheduler
|
||||
.region_status
|
||||
.get(&builder.region_id())
|
||||
.unwrap()
|
||||
.pending_task
|
||||
.is_none()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,9 +330,10 @@ impl LocalGcWorker {
|
||||
|
||||
// TODO(discord9): for now, ignore async index file as it's design is not stable, need to be improved once
|
||||
// index file design is stable
|
||||
let file_pairs: Vec<(FileId, u64)> =
|
||||
unused_files.iter().map(|file_id| (*file_id, 0)).collect();
|
||||
// TODO(discord9): gc worker need another major refactor to support versioned index files
|
||||
let file_pairs: Vec<(FileId, FileId)> = unused_files
|
||||
.iter()
|
||||
.map(|file_id| (*file_id, *file_id))
|
||||
.collect();
|
||||
|
||||
debug!(
|
||||
"Found {} unused index files to delete for region {}",
|
||||
@@ -353,7 +354,7 @@ impl LocalGcWorker {
|
||||
Ok(unused_files)
|
||||
}
|
||||
|
||||
async fn delete_files(&self, region_id: RegionId, file_ids: &[(FileId, u64)]) -> Result<()> {
|
||||
async fn delete_files(&self, region_id: RegionId, file_ids: &[(FileId, FileId)]) -> Result<()> {
|
||||
delete_files(
|
||||
region_id,
|
||||
file_ids,
|
||||
|
||||
@@ -24,7 +24,6 @@ use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::FileId;
|
||||
use store_api::{MAX_VERSION, MIN_VERSION, ManifestVersion};
|
||||
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error::{
|
||||
self, InstallManifestToSnafu, NoCheckpointSnafu, NoManifestsSnafu, RegionStoppedSnafu, Result,
|
||||
@@ -53,8 +52,6 @@ pub struct RegionManifestOptions {
|
||||
/// Set to 0 to disable checkpoint.
|
||||
pub checkpoint_distance: u64,
|
||||
pub remove_file_options: RemoveFileOptions,
|
||||
/// Optional cache for manifest files.
|
||||
pub manifest_cache: Option<ManifestCache>,
|
||||
}
|
||||
|
||||
impl RegionManifestOptions {
|
||||
@@ -70,7 +67,6 @@ impl RegionManifestOptions {
|
||||
remove_file_options: RemoveFileOptions {
|
||||
enable_gc: config.gc.enable,
|
||||
},
|
||||
manifest_cache: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -178,7 +174,6 @@ impl RegionManifestManager {
|
||||
options.object_store.clone(),
|
||||
options.compress_type,
|
||||
stats.total_manifest_size.clone(),
|
||||
options.manifest_cache.clone(),
|
||||
);
|
||||
let manifest_version = stats.manifest_version.clone();
|
||||
|
||||
@@ -261,7 +256,6 @@ impl RegionManifestManager {
|
||||
options.object_store.clone(),
|
||||
options.compress_type,
|
||||
stats.total_manifest_size.clone(),
|
||||
options.manifest_cache.clone(),
|
||||
);
|
||||
let manifest_version = stats.manifest_version.clone();
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ use store_api::ManifestVersion;
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
use crate::error::{
|
||||
ChecksumMismatchSnafu, CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu,
|
||||
OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu,
|
||||
@@ -145,8 +144,6 @@ pub struct ManifestObjectStore {
|
||||
/// Stores the size of each manifest file.
|
||||
manifest_size_map: Arc<RwLock<HashMap<FileKey, u64>>>,
|
||||
total_manifest_size: Arc<AtomicU64>,
|
||||
/// Optional manifest cache for local caching.
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
}
|
||||
|
||||
impl ManifestObjectStore {
|
||||
@@ -155,7 +152,6 @@ impl ManifestObjectStore {
|
||||
object_store: ObjectStore,
|
||||
compress_type: CompressionType,
|
||||
total_manifest_size: Arc<AtomicU64>,
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
) -> Self {
|
||||
let path = util::normalize_dir(path);
|
||||
let staging_path = {
|
||||
@@ -170,7 +166,6 @@ impl ManifestObjectStore {
|
||||
staging_path,
|
||||
manifest_size_map: Arc::new(RwLock::new(HashMap::new())),
|
||||
total_manifest_size,
|
||||
manifest_cache,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -296,11 +291,9 @@ impl ManifestObjectStore {
|
||||
}
|
||||
|
||||
/// Common implementation for fetching manifests from entries in parallel.
|
||||
/// If `is_staging` is true, cache is skipped.
|
||||
async fn fetch_manifests_from_entries(
|
||||
&self,
|
||||
entries: Vec<(ManifestVersion, Entry)>,
|
||||
is_staging: bool,
|
||||
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
|
||||
if entries.is_empty() {
|
||||
return Ok(vec![]);
|
||||
@@ -313,13 +306,6 @@ impl ManifestObjectStore {
|
||||
// Safety: semaphore must exist.
|
||||
let _permit = semaphore.acquire().await.unwrap();
|
||||
|
||||
let cache_key = entry.path();
|
||||
// Try to get from cache first
|
||||
if let Some(data) = self.get_from_cache(cache_key, is_staging).await {
|
||||
return Ok((*v, data));
|
||||
}
|
||||
|
||||
// Fetch from remote object store
|
||||
let compress_type = file_compress_type(entry.name());
|
||||
let bytes = self
|
||||
.object_store
|
||||
@@ -333,11 +319,6 @@ impl ManifestObjectStore {
|
||||
compress_type,
|
||||
path: entry.path(),
|
||||
})?;
|
||||
|
||||
// Add to cache
|
||||
self.put_to_cache(cache_key.to_string(), &data, is_staging)
|
||||
.await;
|
||||
|
||||
Ok((*v, data))
|
||||
});
|
||||
|
||||
@@ -354,7 +335,7 @@ impl ManifestObjectStore {
|
||||
end_version: ManifestVersion,
|
||||
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
|
||||
let manifests = self.scan(start_version, end_version).await?;
|
||||
self.fetch_manifests_from_entries(manifests, false).await
|
||||
self.fetch_manifests_from_entries(manifests).await
|
||||
}
|
||||
|
||||
/// Delete manifest files that version < end.
|
||||
@@ -424,11 +405,6 @@ impl ManifestObjectStore {
|
||||
ret, self.path, end, checkpoint_version, paths,
|
||||
);
|
||||
|
||||
// Remove from cache first
|
||||
for (entry, _, _) in &del_entries {
|
||||
self.remove_from_cache(entry.path()).await;
|
||||
}
|
||||
|
||||
self.object_store
|
||||
.delete_iter(paths)
|
||||
.await
|
||||
@@ -464,10 +440,11 @@ impl ManifestObjectStore {
|
||||
path: &path,
|
||||
})?;
|
||||
let delta_size = data.len();
|
||||
|
||||
self.write_and_put_cache(&path, data, is_staging).await?;
|
||||
self.object_store
|
||||
.write(&path, data)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
self.set_delta_file_size(version, delta_size as u64);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -488,8 +465,10 @@ impl ManifestObjectStore {
|
||||
})?;
|
||||
let checkpoint_size = data.len();
|
||||
let checksum = checkpoint_checksum(bytes);
|
||||
|
||||
self.write_and_put_cache(&path, data, false).await?;
|
||||
self.object_store
|
||||
.write(&path, data)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
|
||||
// Because last checkpoint file only contain size and version, which is tiny, so we don't compress it.
|
||||
@@ -522,80 +501,60 @@ impl ManifestObjectStore {
|
||||
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
let version = metadata.version;
|
||||
let path = self.checkpoint_file_path(version);
|
||||
|
||||
// Try to get from cache first
|
||||
if let Some(data) = self.get_from_cache(&path, false).await {
|
||||
verify_checksum(&data, metadata.checksum)?;
|
||||
return Ok(Some((version, data)));
|
||||
}
|
||||
|
||||
// Due to backward compatibility, it is possible that the user's checkpoint not compressed,
|
||||
// so if we don't find file by compressed type. fall back to checkpoint not compressed find again.
|
||||
let checkpoint_data = match self.object_store.read(&path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data =
|
||||
self.compress_type
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.with_context(|_| DecompressObjectSnafu {
|
||||
let checkpoint_data =
|
||||
match self.object_store.read(&path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data = self.compress_type.decode(checkpoint).await.context(
|
||||
DecompressObjectSnafu {
|
||||
compress_type: self.compress_type,
|
||||
path: path.clone(),
|
||||
})?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
// set the checkpoint size
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
// Add to cache
|
||||
self.put_to_cache(path, &decompress_data, false).await;
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
let fall_back_path = gen_path(
|
||||
&self.path,
|
||||
&checkpoint_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
);
|
||||
debug!(
|
||||
"Failed to load checkpoint from path: {}, fall back to path: {}",
|
||||
path, fall_back_path
|
||||
);
|
||||
|
||||
// Try to get fallback from cache first
|
||||
if let Some(data) = self.get_from_cache(&fall_back_path, false).await {
|
||||
verify_checksum(&data, metadata.checksum)?;
|
||||
return Ok(Some((version, data)));
|
||||
}
|
||||
|
||||
match self.object_store.read(&fall_back_path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data = FALL_BACK_COMPRESS_TYPE
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.with_context(|_| DecompressObjectSnafu {
|
||||
compress_type: FALL_BACK_COMPRESS_TYPE,
|
||||
path: fall_back_path.clone(),
|
||||
})?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
// Add fallback to cache
|
||||
self.put_to_cache(fall_back_path, &decompress_data, false)
|
||||
.await;
|
||||
Ok(Some(decompress_data))
|
||||
path,
|
||||
},
|
||||
)?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
// set the checkpoint size
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
let fall_back_path = gen_path(
|
||||
&self.path,
|
||||
&checkpoint_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
);
|
||||
debug!(
|
||||
"Failed to load checkpoint from path: {}, fall back to path: {}",
|
||||
path, fall_back_path
|
||||
);
|
||||
match self.object_store.read(&fall_back_path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data = FALL_BACK_COMPRESS_TYPE
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.context(DecompressObjectSnafu {
|
||||
compress_type: FALL_BACK_COMPRESS_TYPE,
|
||||
path,
|
||||
})?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
|
||||
Err(e) => Err(e).context(OpenDalSnafu),
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
|
||||
Err(e) => Err(e).context(OpenDalSnafu),
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
} else {
|
||||
Ok(None)
|
||||
Err(e).context(OpenDalSnafu)
|
||||
}
|
||||
} else {
|
||||
Err(e).context(OpenDalSnafu)
|
||||
}
|
||||
}
|
||||
}?;
|
||||
}?;
|
||||
Ok(checkpoint_data.map(|data| (version, data)))
|
||||
}
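Both the old and the new bodies of this checkpoint loader share one control flow: read the path derived from the configured compression type, and if that file is missing retry the uncompressed fallback path before concluding there is no checkpoint. A stripped-down sketch of that fallback, with an invented ReadError type and a closure in place of the object store; checksum verification and the shortcut for compress_type already being the fallback type are omitted here:

    #[derive(Debug)]
    enum ReadError {
        NotFound,
        Other(String),
    }

    /// Tries the primary (compressed) path first, then the fallback
    /// (uncompressed) path; only a missing file falls through to Ok(None).
    fn load_with_fallback<F>(
        primary: &str,
        fallback: &str,
        read: F,
    ) -> Result<Option<Vec<u8>>, ReadError>
    where
        F: Fn(&str) -> Result<Vec<u8>, ReadError>,
    {
        match read(primary) {
            Ok(data) => Ok(Some(data)),
            Err(ReadError::NotFound) => match read(fallback) {
                Ok(data) => Ok(Some(data)),
                Err(ReadError::NotFound) => Ok(None),
                Err(e) => Err(e),
            },
            Err(e) => Err(e),
        }
    }

    fn main() {
        // Only the fallback path exists, mimicking an old uncompressed checkpoint.
        let read = |path: &str| {
            if path.ends_with(".checkpoint") {
                Ok(b"checkpoint bytes".to_vec())
            } else {
                Err(ReadError::NotFound)
            }
        };
        let data = load_with_fallback("10.checkpoint.gz", "10.checkpoint", read).unwrap();
        assert!(data.is_some());
    }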
|
||||
|
||||
@@ -603,10 +562,8 @@ impl ManifestObjectStore {
|
||||
/// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any
|
||||
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
let last_checkpoint_path = self.last_checkpoint_path();
|
||||
|
||||
// Fetch from remote object store without cache
|
||||
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
|
||||
Ok(data) => data.to_vec(),
|
||||
Ok(data) => data,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -615,7 +572,7 @@ impl ManifestObjectStore {
|
||||
}
|
||||
};
|
||||
|
||||
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
|
||||
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data.to_vec())?;
|
||||
|
||||
debug!(
|
||||
"Load checkpoint in path: {}, metadata: {:?}",
|
||||
@@ -745,8 +702,7 @@ impl ManifestObjectStore {
|
||||
let mut sorted_entries = manifest_entries;
|
||||
Self::sort_manifests(&mut sorted_entries);
|
||||
|
||||
self.fetch_manifests_from_entries(sorted_entries, true)
|
||||
.await
|
||||
self.fetch_manifests_from_entries(sorted_entries).await
|
||||
}
|
||||
|
||||
/// Clear all staging manifest files.
|
||||
@@ -763,63 +719,6 @@ impl ManifestObjectStore {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets a manifest file from cache.
|
||||
/// Returns the file data if found in cache, None otherwise.
|
||||
/// If `is_staging` is true, always returns None.
|
||||
async fn get_from_cache(&self, key: &str, is_staging: bool) -> Option<Vec<u8>> {
|
||||
if is_staging {
|
||||
return None;
|
||||
}
|
||||
let cache = self.manifest_cache.as_ref()?;
|
||||
cache.get_file(key).await
|
||||
}
|
||||
|
||||
/// Puts a manifest file into cache.
|
||||
/// If `is_staging` is true, does nothing.
|
||||
async fn put_to_cache(&self, key: String, data: &[u8], is_staging: bool) {
|
||||
if is_staging {
|
||||
return;
|
||||
}
|
||||
let Some(cache) = &self.manifest_cache else {
|
||||
return;
|
||||
};
|
||||
|
||||
cache.put_file(key, data.to_vec()).await;
|
||||
}
|
||||
|
||||
/// Writes data to object store and puts it into cache.
|
||||
/// If `is_staging` is true, cache is skipped.
|
||||
async fn write_and_put_cache(&self, path: &str, data: Vec<u8>, is_staging: bool) -> Result<()> {
|
||||
// Clone data for cache before writing, only if cache is enabled and not staging
|
||||
let cache_data = if !is_staging && self.manifest_cache.is_some() {
|
||||
Some(data.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Write to object store
|
||||
self.object_store
|
||||
.write(path, data)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
|
||||
// Put to cache if we cloned the data
|
||||
if let Some(data) = cache_data {
|
||||
self.put_to_cache(path.to_string(), &data, is_staging).await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes a manifest file from cache.
|
||||
async fn remove_from_cache(&self, key: &str) {
|
||||
let Some(cache) = &self.manifest_cache else {
|
||||
return;
|
||||
};
|
||||
|
||||
cache.remove(key).await;
|
||||
}
|
||||
}
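The helpers this diff removes (get_from_cache, put_to_cache, write_and_put_cache, remove_from_cache) implement a cache-aside pattern in which staging manifests bypass the local cache entirely. A synchronous sketch of the same idea, using HashMaps as stand-ins for the manifest cache and the object store; all names here are illustrative, not the crate's API:

    use std::collections::HashMap;

    /// Minimal cache-aside store: writes go to the "object store" and,
    /// unless the file is a staging file, also into the local cache.
    #[derive(Default)]
    struct ManifestStore {
        object_store: HashMap<String, Vec<u8>>,
        cache: Option<HashMap<String, Vec<u8>>>,
    }

    impl ManifestStore {
        fn get(&self, key: &str, is_staging: bool) -> Option<Vec<u8>> {
            if is_staging {
                // Staging manifests never hit the cache.
                return None;
            }
            self.cache.as_ref()?.get(key).cloned()
        }

        fn write_and_put_cache(&mut self, key: &str, data: Vec<u8>, is_staging: bool) {
            // Clone for the cache only when the cache will actually be used.
            let cache_data = (!is_staging && self.cache.is_some()).then(|| data.clone());
            self.object_store.insert(key.to_string(), data);
            if let (Some(cache), Some(data)) = (self.cache.as_mut(), cache_data) {
                cache.insert(key.to_string(), data);
            }
        }
    }

    fn main() {
        let mut store = ManifestStore {
            cache: Some(HashMap::new()),
            ..Default::default()
        };
        store.write_and_put_cache("manifest/00000001.json", b"{}".to_vec(), false);
        assert!(store.get("manifest/00000001.json", false).is_some());
        // Staging writes reach the object store but skip the cache.
        store.write_and_put_cache("staging/00000002.json", b"{}".to_vec(), true);
        assert!(store.get("staging/00000002.json", true).is_none());
        assert!(store.object_store.contains_key("staging/00000002.json"));
    }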
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -863,7 +762,6 @@ mod tests {
|
||||
object_store,
|
||||
CompressionType::Uncompressed,
|
||||
Default::default(),
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -247,7 +247,7 @@ async fn checkpoint_with_different_compression_types() {
|
||||
available_indexes: Default::default(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
@@ -312,7 +312,7 @@ fn generate_action_lists(num: usize) -> (Vec<FileId>, Vec<RegionMetaActionList>)
|
||||
available_indexes: Default::default(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
|
||||
@@ -84,14 +84,6 @@ impl ProjectionMapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the projection includes any tag columns.
|
||||
pub(crate) fn has_tags(&self) -> bool {
|
||||
match self {
|
||||
ProjectionMapper::PrimaryKey(m) => m.has_tags(),
|
||||
ProjectionMapper::Flat(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns ids of projected columns that we need to read
|
||||
/// from memtables and SSTs.
|
||||
pub(crate) fn column_ids(&self) -> &[ColumnId] {
|
||||
@@ -265,11 +257,6 @@ impl PrimaryKeyProjectionMapper {
|
||||
&self.metadata
|
||||
}
|
||||
|
||||
/// Returns true if the projection includes any tag columns.
|
||||
pub(crate) fn has_tags(&self) -> bool {
|
||||
self.has_tags
|
||||
}
|
||||
|
||||
/// Returns ids of projected columns that we need to read
|
||||
/// from memtables and SSTs.
|
||||
pub(crate) fn column_ids(&self) -> &[ColumnId] {
|
||||
|
||||
@@ -135,14 +135,6 @@ impl Scanner {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn index_ids(&self) -> Vec<crate::sst::file::RegionIndexId> {
|
||||
match self {
|
||||
Scanner::Seq(seq_scan) => seq_scan.input().index_ids(),
|
||||
Scanner::Unordered(unordered_scan) => unordered_scan.input().index_ids(),
|
||||
Scanner::Series(series_scan) => series_scan.input().index_ids(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the target partitions for the scanner. It can controls the parallelism of the scanner.
|
||||
pub(crate) fn set_target_partitions(&mut self, target_partitions: usize) {
|
||||
use store_api::region_engine::{PrepareRequest, RegionScanner};
|
||||
@@ -966,7 +958,6 @@ impl ScanInput {
|
||||
) -> Result<FileRangeBuilder> {
|
||||
let predicate = self.predicate_for_file(file);
|
||||
let filter_mode = pre_filter_mode(self.append_mode, self.merge_mode);
|
||||
let decode_pk_values = !self.compaction && self.mapper.has_tags();
|
||||
let res = self
|
||||
.access_layer
|
||||
.read_sst(file.clone())
|
||||
@@ -980,7 +971,6 @@ impl ScanInput {
|
||||
.flat_format(self.flat_format)
|
||||
.compaction(self.compaction)
|
||||
.pre_filter_mode(filter_mode)
|
||||
.decode_primary_key_values(decode_pk_values)
|
||||
.build_reader_input(reader_metrics)
|
||||
.await;
|
||||
let (mut file_range_ctx, selection) = match res {
|
||||
@@ -1170,10 +1160,6 @@ impl ScanInput {
|
||||
pub(crate) fn file_ids(&self) -> Vec<crate::sst::file::RegionFileId> {
|
||||
self.files.iter().map(|file| file.file_id()).collect()
|
||||
}
|
||||
|
||||
pub(crate) fn index_ids(&self) -> Vec<crate::sst::file::RegionIndexId> {
|
||||
self.files.iter().map(|file| file.index_id()).collect()
|
||||
}
|
||||
}
|
||||
|
||||
fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
|
||||
|
||||
@@ -41,14 +41,10 @@ use crate::read::range::{RangeBuilderList, RangeMeta, RowGroupIndex};
|
||||
use crate::read::scan_region::StreamContext;
|
||||
use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source};
|
||||
use crate::sst::file::FileTimeRange;
|
||||
use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics;
|
||||
use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics;
|
||||
use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics;
|
||||
use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
|
||||
use crate::sst::parquet::file_range::FileRange;
|
||||
use crate::sst::parquet::flat_format::time_index_column_index;
|
||||
use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics};
|
||||
use crate::sst::parquet::row_group::ParquetFetchMetrics;
|
||||
use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics};
|
||||
|
||||
/// Verbose scan metrics for a partition.
|
||||
#[derive(Default)]
|
||||
@@ -85,8 +81,6 @@ pub(crate) struct ScanMetricsSet {
|
||||
// SST related metrics:
|
||||
/// Duration to build file ranges.
|
||||
build_parts_cost: Duration,
|
||||
/// Duration to scan SST files.
|
||||
sst_scan_cost: Duration,
|
||||
/// Number of row groups before filtering.
|
||||
rg_total: usize,
|
||||
/// Number of row groups filtered by fulltext index.
|
||||
@@ -132,18 +126,6 @@ pub(crate) struct ScanMetricsSet {
|
||||
|
||||
/// The stream reached EOF
|
||||
stream_eof: bool,
|
||||
|
||||
// Optional verbose metrics:
|
||||
/// Inverted index apply metrics.
|
||||
inverted_index_apply_metrics: Option<InvertedIndexApplyMetrics>,
|
||||
/// Bloom filter index apply metrics.
|
||||
bloom_filter_apply_metrics: Option<BloomFilterIndexApplyMetrics>,
|
||||
/// Fulltext index apply metrics.
|
||||
fulltext_index_apply_metrics: Option<FulltextIndexApplyMetrics>,
|
||||
/// Parquet fetch metrics.
|
||||
fetch_metrics: Option<ParquetFetchMetrics>,
|
||||
/// Metadata cache metrics.
|
||||
metadata_cache_metrics: Option<MetadataCacheMetrics>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for ScanMetricsSet {
|
||||
@@ -159,7 +141,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
num_mem_ranges,
|
||||
num_file_ranges,
|
||||
build_parts_cost,
|
||||
sst_scan_cost,
|
||||
rg_total,
|
||||
rg_fulltext_filtered,
|
||||
rg_inverted_filtered,
|
||||
@@ -185,11 +166,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
mem_rows,
|
||||
mem_batches,
|
||||
mem_series,
|
||||
inverted_index_apply_metrics,
|
||||
bloom_filter_apply_metrics,
|
||||
fulltext_index_apply_metrics,
|
||||
fetch_metrics,
|
||||
metadata_cache_metrics,
|
||||
} = self;
|
||||
|
||||
// Write core metrics
|
||||
@@ -205,7 +181,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
\"num_mem_ranges\":{num_mem_ranges}, \
|
||||
\"num_file_ranges\":{num_file_ranges}, \
|
||||
\"build_parts_cost\":\"{build_parts_cost:?}\", \
|
||||
\"sst_scan_cost\":\"{sst_scan_cost:?}\", \
|
||||
\"rg_total\":{rg_total}, \
|
||||
\"rows_before_filter\":{rows_before_filter}, \
|
||||
\"num_sst_record_batches\":{num_sst_record_batches}, \
|
||||
@@ -280,33 +255,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
write!(f, ", \"mem_scan_cost\":\"{mem_scan_cost:?}\"")?;
|
||||
}
|
||||
|
||||
// Write optional verbose metrics if they are not empty
|
||||
if let Some(metrics) = inverted_index_apply_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"inverted_index_apply_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = bloom_filter_apply_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"bloom_filter_apply_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = fulltext_index_apply_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"fulltext_index_apply_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = fetch_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"fetch_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = metadata_cache_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"metadata_cache_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
|
||||
write!(f, ", \"stream_eof\":{stream_eof}}}")
|
||||
}
|
||||
}
|
||||
@@ -356,20 +304,14 @@ impl ScanMetricsSet {
|
||||
rows_inverted_filtered,
|
||||
rows_bloom_filtered,
|
||||
rows_precise_filtered,
|
||||
inverted_index_apply_metrics,
|
||||
bloom_filter_apply_metrics,
|
||||
fulltext_index_apply_metrics,
|
||||
},
|
||||
num_record_batches,
|
||||
num_batches,
|
||||
num_rows,
|
||||
scan_cost,
|
||||
metadata_cache_metrics,
|
||||
fetch_metrics,
|
||||
scan_cost: _,
|
||||
} = other;
|
||||
|
||||
self.build_parts_cost += *build_cost;
|
||||
self.sst_scan_cost += *scan_cost;
|
||||
|
||||
self.rg_total += *rg_total;
|
||||
self.rg_fulltext_filtered += *rg_fulltext_filtered;
|
||||
@@ -386,31 +328,6 @@ impl ScanMetricsSet {
|
||||
self.num_sst_record_batches += *num_record_batches;
|
||||
self.num_sst_batches += *num_batches;
|
||||
self.num_sst_rows += *num_rows;
|
||||
|
||||
// Merge optional verbose metrics
|
||||
if let Some(metrics) = inverted_index_apply_metrics {
|
||||
self.inverted_index_apply_metrics
|
||||
.get_or_insert_with(InvertedIndexApplyMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
if let Some(metrics) = bloom_filter_apply_metrics {
|
||||
self.bloom_filter_apply_metrics
|
||||
.get_or_insert_with(BloomFilterIndexApplyMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
if let Some(metrics) = fulltext_index_apply_metrics {
|
||||
self.fulltext_index_apply_metrics
|
||||
.get_or_insert_with(FulltextIndexApplyMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
if let Some(metrics) = fetch_metrics {
|
||||
self.fetch_metrics
|
||||
.get_or_insert_with(ParquetFetchMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
self.metadata_cache_metrics
|
||||
.get_or_insert_with(MetadataCacheMetrics::default)
|
||||
.merge_from(metadata_cache_metrics);
|
||||
}
|
||||
|
||||
/// Sets distributor metrics.
|
||||
@@ -698,11 +615,6 @@ impl PartitionMetrics {
|
||||
let mut metrics_set = self.0.metrics.lock().unwrap();
|
||||
metrics_set.set_distributor_metrics(metrics);
|
||||
}
|
||||
|
||||
/// Returns whether verbose explain is enabled.
|
||||
pub(crate) fn explain_verbose(&self) -> bool {
|
||||
self.0.explain_verbose
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PartitionMetrics {
|
||||
@@ -856,21 +768,6 @@ fn can_split_series(num_rows: u64, num_series: u64) -> bool {
num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD
}
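can_split_series allows splitting either when there are few series or when each series is, on average, large enough to fill a batch. A worked sketch with assumed threshold values; the real NUM_SERIES_THRESHOLD and BATCH_SIZE_THRESHOLD are defined elsewhere in this module and may differ:

    /// Assumed values for illustration only.
    const NUM_SERIES_THRESHOLD: u64 = 32;
    const BATCH_SIZE_THRESHOLD: u64 = 1024;

    fn can_split_series(num_rows: u64, num_series: u64) -> bool {
        // Few series, or enough rows per series on average to build full batches.
        num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD
    }

    fn main() {
        assert!(can_split_series(10, 4)); // few series
        assert!(can_split_series(1_000_000, 100)); // 10_000 rows per series on average
        assert!(!can_split_series(10_000, 100)); // only 100 rows per series
    }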
|
||||
|
||||
/// Creates a new [ReaderFilterMetrics] with optional apply metrics initialized
|
||||
/// based on the `explain_verbose` flag.
|
||||
fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics {
|
||||
if explain_verbose {
|
||||
ReaderFilterMetrics {
|
||||
inverted_index_apply_metrics: Some(InvertedIndexApplyMetrics::default()),
|
||||
bloom_filter_apply_metrics: Some(BloomFilterIndexApplyMetrics::default()),
|
||||
fulltext_index_apply_metrics: Some(FulltextIndexApplyMetrics::default()),
|
||||
..Default::default()
|
||||
}
|
||||
} else {
|
||||
ReaderFilterMetrics::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans file ranges at `index`.
|
||||
pub(crate) async fn scan_file_ranges(
|
||||
stream_ctx: Arc<StreamContext>,
|
||||
@@ -879,10 +776,7 @@ pub(crate) async fn scan_file_ranges(
|
||||
read_type: &'static str,
|
||||
range_builder: Arc<RangeBuilderList>,
|
||||
) -> Result<impl Stream<Item = Result<Batch>>> {
|
||||
let mut reader_metrics = ReaderMetrics {
|
||||
filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
|
||||
..Default::default()
|
||||
};
|
||||
let mut reader_metrics = ReaderMetrics::default();
|
||||
let ranges = range_builder
|
||||
.build_file_ranges(&stream_ctx.input, index, &mut reader_metrics)
|
||||
.await?;
|
||||
@@ -905,10 +799,7 @@ pub(crate) async fn scan_flat_file_ranges(
|
||||
read_type: &'static str,
|
||||
range_builder: Arc<RangeBuilderList>,
|
||||
) -> Result<impl Stream<Item = Result<RecordBatch>>> {
|
||||
let mut reader_metrics = ReaderMetrics {
|
||||
filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
|
||||
..Default::default()
|
||||
};
|
||||
let mut reader_metrics = ReaderMetrics::default();
|
||||
let ranges = range_builder
|
||||
.build_file_ranges(&stream_ctx.input, index, &mut reader_metrics)
|
||||
.await?;
|
||||
@@ -931,18 +822,10 @@ pub fn build_file_range_scan_stream(
|
||||
ranges: SmallVec<[FileRange; 2]>,
|
||||
) -> impl Stream<Item = Result<Batch>> {
|
||||
try_stream! {
|
||||
let fetch_metrics = if part_metrics.explain_verbose() {
|
||||
Some(Arc::new(ParquetFetchMetrics::default()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics {
|
||||
fetch_metrics: fetch_metrics.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics::default();
|
||||
for range in ranges {
|
||||
let build_reader_start = Instant::now();
|
||||
let reader = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await?;
|
||||
let reader = range.reader(stream_ctx.input.series_row_selector).await?;
|
||||
let build_cost = build_reader_start.elapsed();
|
||||
part_metrics.inc_build_reader_cost(build_cost);
|
||||
let compat_batch = range.compat_batch();
|
||||
@@ -974,18 +857,10 @@ pub fn build_flat_file_range_scan_stream(
|
||||
ranges: SmallVec<[FileRange; 2]>,
|
||||
) -> impl Stream<Item = Result<RecordBatch>> {
|
||||
try_stream! {
|
||||
let fetch_metrics = if part_metrics.explain_verbose() {
|
||||
Some(Arc::new(ParquetFetchMetrics::default()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics {
|
||||
fetch_metrics: fetch_metrics.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics::default();
|
||||
for range in ranges {
|
||||
let build_reader_start = Instant::now();
|
||||
let mut reader = range.flat_reader(fetch_metrics.as_deref()).await?;
|
||||
let mut reader = range.flat_reader().await?;
|
||||
let build_cost = build_reader_start.elapsed();
|
||||
part_metrics.inc_build_reader_cost(build_cost);
|
||||
|
||||
|
||||
@@ -617,16 +617,17 @@ impl MitoRegion {
|
||||
.map(|meta| {
|
||||
let region_id = self.region_id;
|
||||
let origin_region_id = meta.region_id;
|
||||
let (index_version, index_file_path, index_file_size) = if meta.index_file_size > 0
|
||||
let (index_file_id, index_file_path, index_file_size) = if meta.index_file_size > 0
|
||||
{
|
||||
let index_file_path = index_file_path(table_dir, meta.index_id(), path_type);
|
||||
let index_file_path =
|
||||
index_file_path(table_dir, meta.index_file_id(), path_type);
|
||||
(
|
||||
meta.index_version,
|
||||
Some(meta.index_file_id().file_id().to_string()),
|
||||
Some(index_file_path),
|
||||
Some(meta.index_file_size),
|
||||
)
|
||||
} else {
|
||||
(0, None, None)
|
||||
(None, None, None)
|
||||
};
|
||||
let visible = visible_ssts.contains(&meta.file_id);
|
||||
ManifestSstEntry {
|
||||
@@ -637,7 +638,7 @@ impl MitoRegion {
|
||||
region_group: region_id.region_group(),
|
||||
region_sequence: region_id.region_sequence(),
|
||||
file_id: meta.file_id.to_string(),
|
||||
index_version,
|
||||
index_file_id,
|
||||
level: meta.level,
|
||||
file_path: sst_file_path(table_dir, meta.file_id(), path_type),
|
||||
file_size: meta.file_size,
|
||||
@@ -1354,7 +1355,6 @@ mod tests {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
@@ -1421,7 +1421,6 @@ mod tests {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
|
||||
@@ -41,7 +41,7 @@ use store_api::storage::{ColumnId, RegionId};
|
||||
|
||||
use crate::access_layer::AccessLayer;
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::cache::file_cache::{FileCache, FileType, IndexKey};
|
||||
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error;
|
||||
use crate::error::{
|
||||
@@ -63,7 +63,7 @@ use crate::region_write_ctx::RegionWriteCtx;
|
||||
use crate::request::OptionOutputTx;
|
||||
use crate::schedule::scheduler::SchedulerRef;
|
||||
use crate::sst::FormatType;
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::file_purger::{FilePurgerRef, create_file_purger};
|
||||
use crate::sst::file_ref::FileReferenceManagerRef;
|
||||
use crate::sst::index::intermediate::IntermediateManager;
|
||||
@@ -270,14 +270,8 @@ impl RegionOpener {
|
||||
FormatType::PrimaryKey
|
||||
};
|
||||
// Create a manifest manager for this region and writes regions to the manifest file.
|
||||
let mut region_manifest_options =
|
||||
let region_manifest_options =
|
||||
RegionManifestOptions::new(config, ®ion_dir, &object_store);
|
||||
// Set manifest cache if available
|
||||
region_manifest_options.manifest_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|cm| cm.write_cache())
|
||||
.and_then(|wc| wc.manifest_cache());
|
||||
// For remote WAL, we need to set flushed_entry_id to current topic's latest entry id.
|
||||
let flushed_entry_id = provider.initial_flushed_entry_id::<S>(wal.store());
|
||||
let manifest_manager = RegionManifestManager::new(
|
||||
@@ -413,14 +407,8 @@ impl RegionOpener {
|
||||
let now = Instant::now();
|
||||
let mut region_options = self.options.as_ref().unwrap().clone();
|
||||
let object_storage = get_object_store(®ion_options.storage, &self.object_store_manager)?;
|
||||
let mut region_manifest_options =
|
||||
let region_manifest_options =
|
||||
RegionManifestOptions::new(config, &self.region_dir(), &object_storage);
|
||||
// Set manifest cache if available
|
||||
region_manifest_options.manifest_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|cm| cm.write_cache())
|
||||
.and_then(|wc| wc.manifest_cache());
|
||||
let Some(manifest_manager) =
|
||||
RegionManifestManager::open(region_manifest_options, &self.stats).await?
|
||||
else {
|
||||
@@ -848,7 +836,7 @@ impl RegionLoadCacheTask {
|
||||
}
|
||||
|
||||
/// Fills the file cache with index files from the region.
|
||||
pub(crate) async fn fill_cache(&self, file_cache: &FileCache) {
|
||||
pub(crate) async fn fill_cache(&self, file_cache: FileCacheRef) {
|
||||
let region_id = self.region.region_id;
|
||||
let table_dir = self.region.access_layer.table_dir();
|
||||
let path_type = self.region.access_layer.path_type();
|
||||
@@ -867,8 +855,8 @@ impl RegionLoadCacheTask {
|
||||
if file_meta.exists_index() {
|
||||
let puffin_key = IndexKey::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
FileType::Puffin(file_meta.index_version),
|
||||
file_meta.index_file_id().file_id(),
|
||||
FileType::Puffin,
|
||||
);
|
||||
|
||||
if !file_cache.contains_key(&puffin_key) {
|
||||
@@ -925,18 +913,12 @@ impl RegionLoadCacheTask {
|
||||
break;
|
||||
}
|
||||
|
||||
let index_version = if let FileType::Puffin(version) = puffin_key.file_type {
|
||||
version
|
||||
} else {
|
||||
unreachable!("`files_to_download` should only contains Puffin files");
|
||||
};
|
||||
let index_id = RegionIndexId::new(
|
||||
let index_remote_path = location::index_file_path(
|
||||
table_dir,
|
||||
RegionFileId::new(puffin_key.region_id, puffin_key.file_id),
|
||||
index_version,
|
||||
path_type,
|
||||
);
|
||||
|
||||
let index_remote_path = location::index_file_path(table_dir, index_id, path_type);
|
||||
|
||||
match file_cache
|
||||
.download(puffin_key, &index_remote_path, object_store, file_size)
|
||||
.await
|
||||
|
||||
@@ -428,7 +428,7 @@ mod tests {
|
||||
available_indexes: SmallVec::new(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 100,
|
||||
num_row_groups: 1,
|
||||
sequence: NonZeroU64::new(1),
|
||||
|
||||
@@ -28,7 +28,7 @@ use serde::{Deserialize, Serialize};
|
||||
use smallvec::SmallVec;
|
||||
use store_api::metadata::ColumnMetadata;
|
||||
use store_api::region_request::PathType;
|
||||
use store_api::storage::{ColumnId, FileId, IndexVersion, RegionId};
|
||||
use store_api::storage::{ColumnId, FileId, RegionId};
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::cache::CacheManagerRef;
|
||||
@@ -117,41 +117,6 @@ impl fmt::Display for RegionFileId {
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique identifier for an index file, combining the SST file ID and the index version.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct RegionIndexId {
|
||||
pub file_id: RegionFileId,
|
||||
pub version: IndexVersion,
|
||||
}
|
||||
|
||||
impl RegionIndexId {
|
||||
pub fn new(file_id: RegionFileId, version: IndexVersion) -> Self {
|
||||
Self { file_id, version }
|
||||
}
|
||||
|
||||
pub fn region_id(&self) -> RegionId {
|
||||
self.file_id.region_id
|
||||
}
|
||||
|
||||
pub fn file_id(&self) -> FileId {
|
||||
self.file_id.file_id
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for RegionIndexId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.version == 0 {
|
||||
write!(f, "{}/{}", self.file_id.region_id, self.file_id.file_id)
|
||||
} else {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}.{}",
|
||||
self.file_id.region_id, self.file_id.file_id, self.version
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Time range (min and max timestamps) of a SST file.
|
||||
/// Both min and max are inclusive.
|
||||
pub type FileTimeRange = (Timestamp, Timestamp);
|
||||
@@ -194,10 +159,12 @@ pub struct FileMeta {
|
||||
pub indexes: Vec<ColumnIndexMetadata>,
|
||||
/// Size of the index file.
|
||||
pub index_file_size: u64,
|
||||
/// Version of the index file.
|
||||
/// Used to generate the index file name: "{file_id}.{index_version}.puffin".
|
||||
/// Default is 0 (which maps to "{file_id}.puffin" for compatibility).
|
||||
pub index_version: IndexVersion,
|
||||
/// File ID of the index file.
|
||||
///
|
||||
/// When this field is None, it means the index file id is the same as the file id.
|
||||
/// Only meaningful when index_file_size > 0.
|
||||
/// Used for rebuilding index files.
|
||||
pub index_file_id: Option<FileId>,
|
||||
/// Number of rows in the file.
|
||||
///
|
||||
/// For historical reasons, this field might be missing in old files. Thus
|
||||
@@ -365,9 +332,14 @@ impl FileMeta {
|
||||
RegionFileId::new(self.region_id, self.file_id)
|
||||
}
|
||||
|
||||
/// Returns the RegionIndexId for this file.
|
||||
pub fn index_id(&self) -> RegionIndexId {
|
||||
RegionIndexId::new(self.file_id(), self.index_version)
|
||||
/// Returns the cross-region index file id.
|
||||
/// If the index file id is not set, returns the file id.
|
||||
pub fn index_file_id(&self) -> RegionFileId {
|
||||
if let Some(index_file_id) = self.index_file_id {
|
||||
RegionFileId::new(self.region_id, index_file_id)
|
||||
} else {
|
||||
self.file_id()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -404,9 +376,14 @@ impl FileHandle {
RegionFileId::new(self.inner.meta.region_id, self.inner.meta.file_id)
}

/// Returns the RegionIndexId for this file.
pub fn index_id(&self) -> RegionIndexId {
RegionIndexId::new(self.file_id(), self.inner.meta.index_version)
/// Returns the cross-region index file id.
/// If the index file id is not set, returns the file id.
pub fn index_file_id(&self) -> RegionFileId {
if let Some(index_file_id) = self.inner.meta.index_file_id {
RegionFileId::new(self.inner.meta.region_id, index_file_id)
} else {
self.file_id()
}
}
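Both index_file_id accessors in this hunk apply the same fallback: an SST's index file reuses the SST's own file id unless a dedicated index_file_id was recorded (for example after an index rebuild). A tiny sketch of that rule, with a string standing in for FileId:

    /// Illustrative file metadata with an optional dedicated index file id.
    struct Meta {
        file_id: &'static str,
        index_file_id: Option<&'static str>,
    }

    impl Meta {
        /// The index file id falls back to the SST file id when unset.
        fn index_file_id(&self) -> &'static str {
            self.index_file_id.unwrap_or(self.file_id)
        }
    }

    fn main() {
        let plain = Meta { file_id: "a1b2", index_file_id: None };
        let rebuilt = Meta { file_id: "a1b2", index_file_id: Some("c3d4") };
        assert_eq!("a1b2", plain.index_file_id());
        assert_eq!("c3d4", rebuilt.index_file_id());
    }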
|
||||
|
||||
/// Returns the complete file path of the file.
|
||||
@@ -491,15 +468,10 @@ impl FileHandleInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete files for a region.
|
||||
/// - `region_id`: Region id.
|
||||
/// - `file_ids`: List of (file id, index version) tuples to delete.
|
||||
/// - `delete_index`: Whether to delete the index file from the cache.
|
||||
/// - `access_layer`: Access layer to delete files.
|
||||
/// - `cache_manager`: Cache manager to remove files from cache.
|
||||
/// Delete
|
||||
pub async fn delete_files(
|
||||
region_id: RegionId,
|
||||
file_ids: &[(FileId, u64)],
|
||||
file_ids: &[(FileId, FileId)],
|
||||
delete_index: bool,
|
||||
access_layer: &AccessLayerRef,
|
||||
cache_manager: &Option<CacheManagerRef>,
|
||||
@@ -512,12 +484,12 @@ pub async fn delete_files(
|
||||
}
|
||||
let mut deleted_files = Vec::with_capacity(file_ids.len());
|
||||
|
||||
for (file_id, index_version) in file_ids {
|
||||
for (file_id, index_file_id) in file_ids {
|
||||
let region_file_id = RegionFileId::new(region_id, *file_id);
|
||||
match access_layer
|
||||
.delete_sst(
|
||||
®ion_file_id,
|
||||
&RegionIndexId::new(region_file_id, *index_version),
|
||||
&RegionFileId::new(region_id, *file_id),
|
||||
&RegionFileId::new(region_id, *index_file_id),
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -537,90 +509,32 @@ pub async fn delete_files(
|
||||
deleted_files
|
||||
);
|
||||
|
||||
for (file_id, index_version) in file_ids {
|
||||
purge_index_write_cache_stager(
|
||||
region_id,
|
||||
delete_index,
|
||||
access_layer,
|
||||
cache_manager,
|
||||
file_id,
|
||||
index_version,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
for (file_id, index_file_id) in file_ids {
|
||||
if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) {
|
||||
// Removes index file from the cache.
|
||||
if delete_index {
|
||||
write_cache
|
||||
.remove(IndexKey::new(region_id, *index_file_id, FileType::Puffin))
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Delete index file for a given SST file&index version.
|
||||
pub async fn delete_index(
|
||||
region_id: RegionId,
|
||||
file_id: FileId,
|
||||
index_version: u64,
|
||||
access_layer: &AccessLayerRef,
|
||||
cache_manager: &Option<CacheManagerRef>,
|
||||
) -> crate::error::Result<()> {
|
||||
if let Err(err) = access_layer
|
||||
.delete_index(&RegionIndexId::new(
|
||||
RegionFileId::new(region_id, file_id),
|
||||
index_version,
|
||||
))
|
||||
.await
|
||||
{
|
||||
error!(err; "Failed to delete index file for {}/{}.{}",
|
||||
region_id, file_id, index_version);
|
||||
}
|
||||
|
||||
purge_index_write_cache_stager(
|
||||
region_id,
|
||||
true,
|
||||
access_layer,
|
||||
cache_manager,
|
||||
&file_id,
|
||||
&index_version,
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn purge_index_write_cache_stager(
|
||||
region_id: RegionId,
|
||||
delete_index: bool,
|
||||
access_layer: &Arc<crate::access_layer::AccessLayer>,
|
||||
cache_manager: &Option<Arc<crate::cache::CacheManager>>,
|
||||
file_id: &FileId,
|
||||
index_version: &u64,
|
||||
) {
|
||||
if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) {
|
||||
// Removes index file from the cache.
|
||||
if delete_index {
|
||||
// Remove the SST file from the cache.
|
||||
write_cache
|
||||
.remove(IndexKey::new(
|
||||
region_id,
|
||||
*file_id,
|
||||
FileType::Puffin(*index_version),
|
||||
))
|
||||
.remove(IndexKey::new(region_id, *file_id, FileType::Parquet))
|
||||
.await;
|
||||
}
|
||||
|
||||
// Remove the SST file from the cache.
|
||||
write_cache
|
||||
.remove(IndexKey::new(region_id, *file_id, FileType::Parquet))
|
||||
.await;
|
||||
}
|
||||
|
||||
// Purges index content in the stager.
|
||||
if let Err(e) = access_layer
|
||||
.puffin_manager_factory()
|
||||
.purge_stager(RegionIndexId::new(
|
||||
RegionFileId::new(region_id, *file_id),
|
||||
*index_version,
|
||||
))
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to purge stager with index file, file_id: {}, index_version: {}, region: {}",
|
||||
file_id, index_version, region_id);
|
||||
// Purges index content in the stager.
|
||||
if let Err(e) = access_layer
|
||||
.puffin_manager_factory()
|
||||
.purge_stager(RegionFileId::new(region_id, *index_file_id))
|
||||
.await
|
||||
{
|
||||
error!(e; "Failed to purge stager with index file, file_id: {}, region: {}",
|
||||
index_file_id, region_id);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -649,7 +563,7 @@ mod tests {
|
||||
created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]),
|
||||
}],
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
@@ -700,7 +614,7 @@ mod tests {
|
||||
created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]),
|
||||
}],
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
|
||||
File diff suppressed because it is too large
@@ -132,11 +132,7 @@ impl FileReferenceManager {
|
||||
let region_id = file_meta.region_id;
|
||||
let mut is_new = false;
|
||||
{
|
||||
let file_ref = FileRef::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
file_meta.index_version,
|
||||
);
|
||||
let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id);
|
||||
self.files_per_region
|
||||
.entry(region_id)
|
||||
.and_modify(|refs| {
|
||||
@@ -161,7 +157,7 @@ impl FileReferenceManager {
|
||||
/// If the reference count reaches zero, the file reference will be removed from the manager.
|
||||
pub fn remove_file(&self, file_meta: &FileMeta) {
|
||||
let region_id = file_meta.region_id;
|
||||
let file_ref = FileRef::new(region_id, file_meta.file_id, file_meta.index_version);
|
||||
let file_ref = FileRef::new(region_id, file_meta.file_id);
|
||||
|
||||
let mut remove_table_entry = false;
|
||||
let mut remove_file_ref = false;
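add_file and remove_file above keep a per-region reference count keyed by FileRef: adding bumps the count, removing decrements it, and entries disappear once the count hits zero (and the region entry once it holds no files). A minimal sketch of that bookkeeping with plain HashMaps, where u64 and a string stand in for RegionId and FileId:

    use std::collections::HashMap;

    /// Illustrative per-region reference counts keyed by file id.
    #[derive(Default)]
    struct FileRefManager {
        files_per_region: HashMap<u64, HashMap<String, usize>>,
    }

    impl FileRefManager {
        fn add_file(&mut self, region_id: u64, file_id: &str) {
            *self
                .files_per_region
                .entry(region_id)
                .or_default()
                .entry(file_id.to_string())
                .or_insert(0) += 1;
        }

        /// Drops one reference; removes the file entry at zero and the
        /// region entry once it holds no files.
        fn remove_file(&mut self, region_id: u64, file_id: &str) {
            let Some(files) = self.files_per_region.get_mut(&region_id) else {
                return;
            };
            if let Some(count) = files.get_mut(file_id) {
                *count -= 1;
                if *count == 0 {
                    files.remove(file_id);
                }
            }
            if files.is_empty() {
                self.files_per_region.remove(&region_id);
            }
        }
    }

    fn main() {
        let mut mgr = FileRefManager::default();
        mgr.add_file(1, "a1b2");
        mgr.add_file(1, "a1b2");
        mgr.remove_file(1, "a1b2");
        assert_eq!(Some(&1), mgr.files_per_region[&1].get("a1b2"));
        mgr.remove_file(1, "a1b2");
        assert!(mgr.files_per_region.is_empty());
    }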
|
||||
@@ -234,7 +230,7 @@ mod tests {
|
||||
created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]),
|
||||
}],
|
||||
index_file_size: 4096,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 1024,
|
||||
num_row_groups: 1,
|
||||
sequence: NonZeroU64::new(4096),
|
||||
@@ -250,13 +246,13 @@ mod tests {
|
||||
.get(&file_meta.region_id)
|
||||
.unwrap()
|
||||
.files,
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id, 0), 1)])
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)])
|
||||
);
|
||||
|
||||
file_ref_mgr.add_file(&file_meta);
|
||||
|
||||
let expected_region_ref_manifest =
|
||||
HashSet::from_iter([FileRef::new(file_meta.region_id, file_meta.file_id, 0)]);
|
||||
HashSet::from_iter([FileRef::new(file_meta.region_id, file_meta.file_id)]);
|
||||
|
||||
assert_eq!(
|
||||
file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(),
|
||||
@@ -269,7 +265,7 @@ mod tests {
|
||||
.get(&file_meta.region_id)
|
||||
.unwrap()
|
||||
.files,
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id, 0), 2)])
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 2)])
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
@@ -285,7 +281,7 @@ mod tests {
|
||||
.get(&file_meta.region_id)
|
||||
.unwrap()
|
||||
.files,
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id, 0), 1)])
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)])
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
|
||||
@@ -61,7 +61,7 @@ use crate::request::{
|
||||
};
|
||||
use crate::schedule::scheduler::{Job, SchedulerRef};
|
||||
use crate::sst::file::{
|
||||
ColumnIndexMetadata, FileHandle, FileMeta, IndexType, IndexTypes, RegionFileId, RegionIndexId,
|
||||
ColumnIndexMetadata, FileHandle, FileMeta, IndexType, IndexTypes, RegionFileId,
|
||||
};
|
||||
use crate::sst::file_purger::FilePurgerRef;
|
||||
use crate::sst::index::fulltext_index::creator::FulltextIndexer;
|
||||
@@ -81,8 +81,6 @@ pub(crate) const TYPE_BLOOM_FILTER_INDEX: &str = "bloom_filter_index";
|
||||
pub struct IndexOutput {
|
||||
/// Size of the file.
|
||||
pub file_size: u64,
|
||||
/// Index version.
|
||||
pub version: u64,
|
||||
/// Inverted index output.
|
||||
pub inverted_index: InvertedIndexOutput,
|
||||
/// Fulltext index output.
|
||||
@@ -165,9 +163,7 @@ pub type BloomFilterOutput = IndexBaseOutput;
|
||||
pub struct Indexer {
|
||||
file_id: FileId,
|
||||
region_id: RegionId,
|
||||
index_version: u64,
|
||||
puffin_manager: Option<SstPuffinManager>,
|
||||
write_cache_enabled: bool,
|
||||
inverted_indexer: Option<InvertedIndexer>,
|
||||
last_mem_inverted_index: usize,
|
||||
fulltext_indexer: Option<FulltextIndexer>,
|
||||
@@ -240,7 +236,7 @@ impl Indexer {
#[async_trait::async_trait]
pub trait IndexerBuilder {
/// Builds indexer of given file id to [index_file_path].
async fn build(&self, file_id: FileId, index_version: u64) -> Indexer;
async fn build(&self, file_id: FileId) -> Indexer;
}
#[derive(Clone)]
pub(crate) struct IndexerBuilderImpl {
|
||||
@@ -248,7 +244,6 @@ pub(crate) struct IndexerBuilderImpl {
|
||||
pub(crate) metadata: RegionMetadataRef,
|
||||
pub(crate) row_group_size: usize,
|
||||
pub(crate) puffin_manager: SstPuffinManager,
|
||||
pub(crate) write_cache_enabled: bool,
|
||||
pub(crate) intermediate_manager: IntermediateManager,
|
||||
pub(crate) index_options: IndexOptions,
|
||||
pub(crate) inverted_index_config: InvertedIndexConfig,
|
||||
@@ -259,12 +254,10 @@ pub(crate) struct IndexerBuilderImpl {
|
||||
#[async_trait::async_trait]
|
||||
impl IndexerBuilder for IndexerBuilderImpl {
|
||||
/// Sanity check for arguments and create a new [Indexer] if arguments are valid.
|
||||
async fn build(&self, file_id: FileId, index_version: u64) -> Indexer {
|
||||
async fn build(&self, file_id: FileId) -> Indexer {
|
||||
let mut indexer = Indexer {
|
||||
file_id,
|
||||
region_id: self.metadata.region_id,
|
||||
index_version,
|
||||
write_cache_enabled: self.write_cache_enabled,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
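The `IndexerBuilder` trait in this hunk is the seam being simplified: callers ask for an `Indexer` for a file id, and the implementation sanity-checks its configuration before wiring up the concrete index creators. A hedged, synchronous sketch of that shape (the real trait is async via `async_trait`, and the fields here are illustrative only):

```rust
/// Illustrative stand-ins; the real code uses FileId/RegionId and many more fields.
type FileId = u128;

#[derive(Default)]
struct Indexer {
    file_id: FileId,
    row_group_size: usize,
    inverted_enabled: bool,
}

/// Builds an indexer for a given file id (sync here; the real trait is async).
trait IndexerBuilder {
    fn build(&self, file_id: FileId) -> Indexer;
}

struct IndexerBuilderImpl {
    row_group_size: usize,
    inverted_index_enabled: bool,
}

impl IndexerBuilder for IndexerBuilderImpl {
    /// Sanity-checks the configuration and returns a no-op indexer when it is invalid.
    fn build(&self, file_id: FileId) -> Indexer {
        if self.row_group_size == 0 {
            // An invalid configuration yields an indexer that does nothing.
            return Indexer::default();
        }
        Indexer {
            file_id,
            row_group_size: self.row_group_size,
            inverted_enabled: self.inverted_index_enabled,
        }
    }
}

fn main() {
    let builder = IndexerBuilderImpl { row_group_size: 1024, inverted_index_enabled: true };
    let indexer = builder.build(7);
    assert_eq!(indexer.file_id, 7);
    assert_eq!(indexer.row_group_size, 1024);
    assert!(indexer.inverted_enabled);
}
```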
@@ -618,20 +611,13 @@ impl IndexBuildTask {
|
||||
&mut self,
|
||||
version_control: VersionControlRef,
|
||||
) -> Result<IndexBuildOutcome> {
|
||||
// Determine the new index version
|
||||
let new_index_version = if self.file_meta.index_file_size > 0 {
|
||||
// Increment version if index file exists to avoid overwrite.
|
||||
self.file_meta.index_version + 1
|
||||
let index_file_id = if self.file_meta.index_file_size > 0 {
|
||||
// Generate new file ID if index file exists to avoid overwrite.
|
||||
FileId::random()
|
||||
} else {
|
||||
0 // Default version for new index files
|
||||
self.file_meta.file_id
|
||||
};
|
||||
|
||||
// Use the same file_id but with new version for index file
|
||||
let index_file_id = self.file_meta.file_id;
|
||||
let mut indexer = self
|
||||
.indexer_builder
|
||||
.build(index_file_id, new_index_version)
|
||||
.await;
|
||||
let mut indexer = self.indexer_builder.build(index_file_id).await;
|
||||
|
||||
// Check SST file existence before building index to avoid failure of parquet reader.
|
||||
if !self.check_sst_file_exists(&version_control).await {
|
||||
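The hunk above replaces the old "bump the index version" rule with "allocate a fresh file id when an index file already exists", so a rebuild never overwrites the previous index in place. A small sketch of that decision, with a hypothetical `FileMeta` and a stand-in for `FileId::random()`:

```rust
/// Illustrative metadata; the real FileMeta carries much more.
struct FileMeta {
    file_id: u64,
    index_file_size: u64,
}

/// Stand-in for FileId::random(); the real code generates a new unique id.
fn random_file_id() -> u64 {
    0xDEAD_BEEF
}

/// Chooses the id to write the rebuilt index under: reuse the SST's id when no
/// index exists yet, otherwise allocate a fresh id to avoid overwriting the old file.
fn pick_index_file_id(meta: &FileMeta) -> u64 {
    if meta.index_file_size > 0 {
        random_file_id()
    } else {
        meta.file_id
    }
}

fn main() {
    let fresh = FileMeta { file_id: 1, index_file_size: 0 };
    let rebuilt = FileMeta { file_id: 1, index_file_size: 4096 };
    assert_eq!(pick_index_file_id(&fresh), 1);
    assert_ne!(pick_index_file_id(&rebuilt), 1);
}
```

Keeping the old index file on disk until the purger drops it is what makes the rebuild safe against concurrent readers.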
@@ -691,10 +677,10 @@ impl IndexBuildTask {
|
||||
}
|
||||
|
||||
// Upload index file if write cache is enabled.
|
||||
self.maybe_upload_index_file(index_output.clone(), index_file_id, new_index_version)
|
||||
self.maybe_upload_index_file(index_output.clone(), index_file_id)
|
||||
.await?;
|
||||
|
||||
let worker_request = match self.update_manifest(index_output, new_index_version).await {
|
||||
let worker_request = match self.update_manifest(index_output, index_file_id).await {
|
||||
Ok(edit) => {
|
||||
let index_build_finished = IndexBuildFinished {
|
||||
region_id: self.file_meta.region_id,
|
||||
@@ -726,7 +712,6 @@ impl IndexBuildTask {
|
||||
&self,
|
||||
output: IndexOutput,
|
||||
index_file_id: FileId,
|
||||
index_version: u64,
|
||||
) -> Result<()> {
|
||||
if let Some(write_cache) = &self.write_cache {
|
||||
let file_id = self.file_meta.file_id;
|
||||
@@ -734,14 +719,12 @@ impl IndexBuildTask {
|
||||
let remote_store = self.access_layer.object_store();
|
||||
let mut upload_tracker = UploadTracker::new(region_id);
|
||||
let mut err = None;
|
||||
let puffin_key =
|
||||
IndexKey::new(region_id, index_file_id, FileType::Puffin(output.version));
|
||||
let index_id = RegionIndexId::new(RegionFileId::new(region_id, file_id), index_version);
|
||||
let puffin_key = IndexKey::new(region_id, index_file_id, FileType::Puffin);
|
||||
let puffin_path = RegionFilePathFactory::new(
|
||||
self.access_layer.table_dir().to_string(),
|
||||
self.access_layer.path_type(),
|
||||
)
|
||||
.build_index_file_path_with_version(index_id);
|
||||
.build_index_file_path(RegionFileId::new(region_id, file_id));
|
||||
if let Err(e) = write_cache
|
||||
.upload(puffin_key, &puffin_path, remote_store)
|
||||
.await
|
||||
@@ -773,13 +756,12 @@ impl IndexBuildTask {
|
||||
async fn update_manifest(
|
||||
&mut self,
|
||||
output: IndexOutput,
|
||||
new_index_version: u64,
|
||||
index_file_id: FileId,
|
||||
) -> Result<RegionEdit> {
|
||||
self.file_meta.available_indexes = output.build_available_indexes();
|
||||
self.file_meta.indexes = output.build_indexes();
|
||||
self.file_meta.index_file_size = output.file_size;
|
||||
let old_index_version = self.file_meta.index_version;
|
||||
self.file_meta.index_version = new_index_version;
|
||||
self.file_meta.index_file_id = Some(index_file_id);
|
||||
let edit = RegionEdit {
|
||||
files_to_add: vec![self.file_meta.clone()],
|
||||
files_to_remove: vec![],
|
||||
@@ -802,11 +784,6 @@ impl IndexBuildTask {
|
||||
self.file_meta.region_id,
|
||||
self.reason.as_str()
|
||||
);
|
||||
// notify the file purger to remove the old index files if any
|
||||
if new_index_version > 0 {
|
||||
self.file_purger
|
||||
.update_index(self.file_meta.clone(), old_index_version);
|
||||
}
|
||||
Ok(edit)
|
||||
}
|
||||
}
|
||||
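`update_manifest` above folds the build output into the file's metadata, records the id the index was written under, and emits a region edit that re-adds the file so the new index becomes visible. A hedged sketch of that fold, using simplified stand-ins for `IndexOutput`, `FileMeta`, and `RegionEdit`:

```rust
/// Simplified stand-ins for the real types.
#[derive(Clone, Debug, PartialEq)]
struct FileMeta {
    file_id: u64,
    index_file_id: Option<u64>,
    index_file_size: u64,
}

struct IndexOutput {
    file_size: u64,
}

#[derive(Debug)]
struct RegionEdit {
    files_to_add: Vec<FileMeta>,
    files_to_remove: Vec<FileMeta>,
}

/// Applies the build output to the metadata and returns the edit to persist.
fn update_manifest(meta: &mut FileMeta, output: IndexOutput, index_file_id: u64) -> RegionEdit {
    meta.index_file_size = output.file_size;
    meta.index_file_id = Some(index_file_id);
    RegionEdit {
        files_to_add: vec![meta.clone()],
        files_to_remove: vec![],
    }
}

fn main() {
    let mut meta = FileMeta { file_id: 7, index_file_id: None, index_file_size: 0 };
    let edit = update_manifest(&mut meta, IndexOutput { file_size: 4096 }, 7);
    assert_eq!(edit.files_to_add[0].file_id, 7);
    assert_eq!(edit.files_to_add[0].index_file_id, Some(7));
    assert_eq!(edit.files_to_add[0].index_file_size, 4096);
    assert!(edit.files_to_remove.is_empty());
}
```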
@@ -1186,10 +1163,6 @@ mod tests {
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
fn build_index_file_path_with_version(&self, _index_id: RegionIndexId) -> String {
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
fn build_sst_file_path(&self, _file_id: RegionFileId) -> String {
|
||||
unreachable!()
|
||||
}
|
||||
@@ -1263,7 +1236,6 @@ mod tests {
|
||||
metadata,
|
||||
row_group_size: 1024,
|
||||
puffin_manager,
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager,
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
@@ -1288,14 +1260,13 @@ mod tests {
|
||||
metadata,
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager,
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
fulltext_index_config: FulltextIndexConfig::default(),
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_some());
|
||||
@@ -1319,7 +1290,6 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig {
|
||||
@@ -1329,7 +1299,7 @@ mod tests {
|
||||
fulltext_index_config: FulltextIndexConfig::default(),
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_none());
|
||||
@@ -1341,7 +1311,6 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
@@ -1351,7 +1320,7 @@ mod tests {
|
||||
},
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_some());
|
||||
@@ -1363,7 +1332,6 @@ mod tests {
|
||||
metadata,
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager,
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
@@ -1373,7 +1341,7 @@ mod tests {
|
||||
..Default::default()
|
||||
},
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_some());
|
||||
@@ -1397,14 +1365,13 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
fulltext_index_config: FulltextIndexConfig::default(),
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_none());
|
||||
@@ -1421,14 +1388,13 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
fulltext_index_config: FulltextIndexConfig::default(),
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_some());
|
||||
@@ -1445,14 +1411,13 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size: 1024,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager,
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
fulltext_index_config: FulltextIndexConfig::default(),
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_some());
|
||||
@@ -1476,14 +1441,13 @@ mod tests {
|
||||
metadata,
|
||||
row_group_size: 0,
|
||||
puffin_manager: factory.build(mock_object_store(), NoopPathProvider),
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager: intm_manager,
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
fulltext_index_config: FulltextIndexConfig::default(),
|
||||
bloom_filter_index_config: BloomFilterConfig::default(),
|
||||
}
|
||||
.build(FileId::random(), 0)
|
||||
.build(FileId::random())
|
||||
.await;
|
||||
|
||||
assert!(indexer.inverted_indexer.is_none());
|
||||
@@ -1655,7 +1619,7 @@ mod tests {
|
||||
|
||||
let puffin_path = location::index_file_path(
|
||||
env.access_layer.table_dir(),
|
||||
RegionIndexId::new(RegionFileId::new(region_id, file_meta.file_id), 0),
|
||||
RegionFileId::new(region_id, file_meta.file_id),
|
||||
env.access_layer.path_type(),
|
||||
);
|
||||
|
||||
@@ -1786,7 +1750,6 @@ mod tests {
|
||||
None,
|
||||
factory,
|
||||
intm_manager,
|
||||
ReadableSize::mb(10),
|
||||
)
|
||||
.await
|
||||
.unwrap(),
|
||||
@@ -1797,7 +1760,6 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size: 1024,
|
||||
puffin_manager: write_cache.build_puffin_manager().clone(),
|
||||
write_cache_enabled: true,
|
||||
intermediate_manager: write_cache.intermediate_manager().clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
inverted_index_config: InvertedIndexConfig::default(),
|
||||
@@ -1849,11 +1811,7 @@ mod tests {
|
||||
}
|
||||
|
||||
// The write cache should contain the uploaded index file.
|
||||
let index_key = IndexKey::new(
|
||||
region_id,
|
||||
file_meta.file_id,
|
||||
FileType::Puffin(sst_info.index_metadata.version),
|
||||
);
|
||||
let index_key = IndexKey::new(region_id, file_meta.file_id, FileType::Puffin);
|
||||
assert!(write_cache.file_cache().contains_key(&index_key));
|
||||
}
|
||||
|
||||
|
||||
@@ -17,14 +17,11 @@ mod builder;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use common_telemetry::warn;
|
||||
use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
|
||||
use index::bloom_filter::reader::{
|
||||
BloomFilterReadMetrics, BloomFilterReader, BloomFilterReaderImpl,
|
||||
};
|
||||
use index::bloom_filter::reader::{BloomFilterReader, BloomFilterReaderImpl};
|
||||
use index::target::IndexTarget;
|
||||
use object_store::ObjectStore;
|
||||
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
|
||||
@@ -44,68 +41,12 @@ use crate::error::{
|
||||
Result,
|
||||
};
|
||||
use crate::metrics::INDEX_APPLY_ELAPSED;
|
||||
use crate::sst::file::RegionIndexId;
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::TYPE_BLOOM_FILTER_INDEX;
|
||||
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
|
||||
pub use crate::sst::index::bloom_filter::applier::builder::BloomFilterIndexApplierBuilder;
|
||||
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
|
||||
|
||||
/// Metrics for tracking bloom filter index apply operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct BloomFilterIndexApplyMetrics {
|
||||
/// Total time spent applying the index.
|
||||
pub apply_elapsed: std::time::Duration,
|
||||
/// Number of blob cache misses.
|
||||
pub blob_cache_miss: usize,
|
||||
/// Total size of blobs read (in bytes).
|
||||
pub blob_read_bytes: u64,
|
||||
/// Metrics for bloom filter read operations.
|
||||
pub read_metrics: BloomFilterReadMetrics,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for BloomFilterIndexApplyMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
apply_elapsed,
|
||||
blob_cache_miss,
|
||||
blob_read_bytes,
|
||||
read_metrics,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?;
|
||||
|
||||
if *blob_cache_miss > 0 {
|
||||
write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?;
|
||||
}
|
||||
if *blob_read_bytes > 0 {
|
||||
write!(f, ", \"blob_read_bytes\":{}", blob_read_bytes)?;
|
||||
}
|
||||
write!(f, ", \"read_metrics\":{:?}", read_metrics)?;
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl BloomFilterIndexApplyMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.apply_elapsed.is_zero()
|
||||
}
|
||||
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.apply_elapsed += other.apply_elapsed;
|
||||
self.blob_cache_miss += other.blob_cache_miss;
|
||||
self.blob_read_bytes += other.blob_read_bytes;
|
||||
self.read_metrics.merge_from(&other.read_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
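`BloomFilterIndexApplyMetrics` shown above follows a pattern this diff touches in several appliers: plain counters plus an `is_empty`/`merge_from` pair, and a `Debug` impl that prints only non-zero fields. A compact, self-contained sketch of the same pattern (field names are illustrative):

```rust
use std::fmt;
use std::time::Duration;

#[derive(Default, Clone)]
struct ApplyMetrics {
    apply_elapsed: Duration,
    blob_cache_miss: usize,
    blob_read_bytes: u64,
}

impl ApplyMetrics {
    /// Metrics are "empty" when nothing was measured.
    fn is_empty(&self) -> bool {
        self.apply_elapsed.is_zero()
    }

    /// Accumulates another measurement into this one.
    fn merge_from(&mut self, other: &Self) {
        self.apply_elapsed += other.apply_elapsed;
        self.blob_cache_miss += other.blob_cache_miss;
        self.blob_read_bytes += other.blob_read_bytes;
    }
}

impl fmt::Debug for ApplyMetrics {
    /// Prints a compact JSON-like view, omitting zero-valued fields.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.is_empty() {
            return write!(f, "{{}}");
        }
        write!(f, "{{\"apply_elapsed\":\"{:?}\"", self.apply_elapsed)?;
        if self.blob_cache_miss > 0 {
            write!(f, ", \"blob_cache_miss\":{}", self.blob_cache_miss)?;
        }
        if self.blob_read_bytes > 0 {
            write!(f, ", \"blob_read_bytes\":{}", self.blob_read_bytes)?;
        }
        write!(f, "}}")
    }
}

fn main() {
    let mut total = ApplyMetrics::default();
    total.merge_from(&ApplyMetrics {
        apply_elapsed: Duration::from_millis(3),
        blob_cache_miss: 1,
        blob_read_bytes: 0,
    });
    println!("{:?}", total); // {"apply_elapsed":"3ms", "blob_cache_miss":1}
}
```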
pub(crate) type BloomFilterIndexApplierRef = Arc<BloomFilterIndexApplier>;
|
||||
|
||||
/// `BloomFilterIndexApplier` applies bloom filter predicates to the SST file.
|
||||
@@ -192,20 +133,15 @@ impl BloomFilterIndexApplier {
|
||||
///
|
||||
/// Row group id existing in the returned result means that the row group is searched.
|
||||
/// Empty ranges means that the row group is searched but no rows are found.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `row_groups` - Iterator of row group lengths and whether to search in the row group
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
pub async fn apply(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
row_groups: impl Iterator<Item = (usize, bool)>,
|
||||
mut metrics: Option<&mut BloomFilterIndexApplyMetrics>,
|
||||
) -> Result<Vec<(usize, Vec<Range<usize>>)>> {
|
||||
let apply_start = Instant::now();
|
||||
let _timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_BLOOM_FILTER_INDEX])
|
||||
.start_timer();
|
||||
|
||||
// Calculates row groups' ranges based on start of the file.
|
||||
let mut input = Vec::with_capacity(row_groups.size_hint().0);
|
||||
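Per the doc comment above, `apply` returns, for each searched row group, its id and the in-group row ranges that may still match; a missing id means the group was skipped, and an empty range list means it was searched and fully pruned. A small sketch of consuming a result of that shape to count candidate rows (only the output type mirrors the diff, the rest is illustrative):

```rust
use std::ops::Range;

/// Sums the number of candidate rows across searched row groups.
/// Row groups missing from `output` were not searched at all; an entry with an
/// empty range list was searched and fully pruned.
fn count_candidate_rows(output: &[(usize, Vec<Range<usize>>)]) -> usize {
    output
        .iter()
        .map(|(_, ranges)| ranges.iter().map(|r| r.len()).sum::<usize>())
        .sum()
}

fn main() {
    // Row group 0: rows 10..20 may match; row group 2: searched, nothing left.
    let output = vec![(0usize, vec![10..20]), (2, vec![])];
    assert_eq!(count_candidate_rows(&output), 10);
}
```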
@@ -227,7 +163,7 @@ impl BloomFilterIndexApplier {
|
||||
|
||||
for (column_id, predicates) in self.predicates.iter() {
|
||||
let blob = match self
|
||||
.blob_reader(file_id, *column_id, file_size_hint, metrics.as_deref_mut())
|
||||
.blob_reader(file_id, *column_id, file_size_hint)
|
||||
.await?
|
||||
{
|
||||
Some(blob) => blob,
|
||||
@@ -237,24 +173,20 @@ impl BloomFilterIndexApplier {
|
||||
// Create appropriate reader based on whether we have caching enabled
|
||||
if let Some(bloom_filter_cache) = &self.bloom_filter_index_cache {
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
if let Some(m) = &mut metrics {
|
||||
m.blob_read_bytes += blob_size;
|
||||
}
|
||||
let reader = CachedBloomFilterIndexBlobReader::new(
|
||||
file_id.file_id(),
|
||||
file_id.version,
|
||||
*column_id,
|
||||
Tag::Skipping,
|
||||
blob_size,
|
||||
BloomFilterReaderImpl::new(blob),
|
||||
bloom_filter_cache.clone(),
|
||||
);
|
||||
self.apply_predicates(reader, predicates, &mut output, metrics.as_deref_mut())
|
||||
self.apply_predicates(reader, predicates, &mut output)
|
||||
.await
|
||||
.context(ApplyBloomFilterIndexSnafu)?;
|
||||
} else {
|
||||
let reader = BloomFilterReaderImpl::new(blob);
|
||||
self.apply_predicates(reader, predicates, &mut output, metrics.as_deref_mut())
|
||||
self.apply_predicates(reader, predicates, &mut output)
|
||||
.await
|
||||
.context(ApplyBloomFilterIndexSnafu)?;
|
||||
}
|
||||
@@ -269,16 +201,6 @@ impl BloomFilterIndexApplier {
|
||||
}
|
||||
}
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = apply_start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_BLOOM_FILTER_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.apply_elapsed += elapsed;
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
@@ -287,10 +209,9 @@ impl BloomFilterIndexApplier {
|
||||
/// Returns `None` if the column does not have an index.
|
||||
async fn blob_reader(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
column_id: ColumnId,
|
||||
file_size_hint: Option<u64>,
|
||||
metrics: Option<&mut BloomFilterIndexApplyMetrics>,
|
||||
) -> Result<Option<BlobReader>> {
|
||||
let reader = match self
|
||||
.cached_blob_reader(file_id, column_id, file_size_hint)
|
||||
@@ -298,9 +219,6 @@ impl BloomFilterIndexApplier {
|
||||
{
|
||||
Ok(Some(puffin_reader)) => puffin_reader,
|
||||
other => {
|
||||
if let Some(m) = metrics {
|
||||
m.blob_cache_miss += 1;
|
||||
}
|
||||
if let Err(err) = other {
|
||||
// Blob not found means no index for this column
|
||||
if is_blob_not_found(&err) {
|
||||
@@ -329,7 +247,7 @@ impl BloomFilterIndexApplier {
|
||||
/// Creates a blob reader from the cached index file
|
||||
async fn cached_blob_reader(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
column_id: ColumnId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<Option<BlobReader>> {
|
||||
@@ -337,11 +255,7 @@ impl BloomFilterIndexApplier {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let index_key = IndexKey::new(
|
||||
file_id.region_id(),
|
||||
file_id.file_id(),
|
||||
FileType::Puffin(file_id.version),
|
||||
);
|
||||
let index_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin);
|
||||
if file_cache.get(index_key).await.is_none() {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -374,7 +288,7 @@ impl BloomFilterIndexApplier {
|
||||
/// Creates a blob reader from the remote index file
|
||||
async fn remote_blob_reader(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
column_id: ColumnId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<BlobReader> {
|
||||
@@ -406,7 +320,6 @@ impl BloomFilterIndexApplier {
|
||||
reader: R,
|
||||
predicates: &[InListPredicate],
|
||||
output: &mut [(usize, Vec<Range<usize>>)],
|
||||
mut metrics: Option<&mut BloomFilterIndexApplyMetrics>,
|
||||
) -> std::result::Result<(), index::bloom_filter::error::Error> {
|
||||
let mut applier = BloomFilterApplier::new(Box::new(reader)).await?;
|
||||
|
||||
@@ -416,10 +329,7 @@ impl BloomFilterIndexApplier {
|
||||
continue;
|
||||
}
|
||||
|
||||
let read_metrics = metrics.as_deref_mut().map(|m| &mut m.read_metrics);
|
||||
*row_group_output = applier
|
||||
.search(predicates, row_group_output, read_metrics)
|
||||
.await?;
|
||||
*row_group_output = applier.search(predicates, row_group_output).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -451,7 +361,6 @@ mod tests {
|
||||
use store_api::storage::FileId;
|
||||
|
||||
use super::*;
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::bloom_filter::creator::BloomFilterIndexer;
|
||||
use crate::sst::index::bloom_filter::creator::tests::{
|
||||
mock_object_store, mock_region_metadata, new_batch, new_intm_mgr,
|
||||
@@ -463,7 +372,7 @@ mod tests {
|
||||
object_store: ObjectStore,
|
||||
metadata: &RegionMetadata,
|
||||
puffin_manager_factory: PuffinManagerFactory,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
) -> impl Fn(&[Expr], Vec<(usize, bool)>) -> BoxFuture<'static, Vec<(usize, Vec<Range<usize>>)>>
|
||||
+ use<'_> {
|
||||
move |exprs, row_groups| {
|
||||
@@ -484,7 +393,7 @@ mod tests {
|
||||
|
||||
let applier = builder.build(&exprs).unwrap().unwrap();
|
||||
applier
|
||||
.apply(file_id, None, row_groups.into_iter(), None)
|
||||
.apply(file_id, None, row_groups.into_iter())
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
@@ -520,7 +429,6 @@ mod tests {
|
||||
let intm_mgr = new_intm_mgr(d.path().to_string_lossy()).await;
|
||||
let memory_usage_threshold = Some(1024);
|
||||
let file_id = RegionFileId::new(region_metadata.region_id, FileId::random());
|
||||
let file_id = RegionIndexId::new(file_id, 0);
|
||||
let table_dir = "table_dir".to_string();
|
||||
|
||||
let mut indexer = BloomFilterIndexer::new(
|
||||
|
||||
@@ -481,7 +481,7 @@ pub(crate) mod tests {
|
||||
use super::*;
|
||||
use crate::access_layer::FilePathProvider;
|
||||
use crate::read::BatchColumn;
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::puffin_manager::PuffinManagerFactory;
|
||||
|
||||
pub fn mock_object_store() -> ObjectStore {
|
||||
@@ -499,10 +499,6 @@ pub(crate) mod tests {
|
||||
file_id.file_id().to_string()
|
||||
}
|
||||
|
||||
fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
|
||||
index_id.file_id.file_id().to_string()
|
||||
}
|
||||
|
||||
fn build_sst_file_path(&self, file_id: RegionFileId) -> String {
|
||||
file_id.file_id().to_string()
|
||||
}
|
||||
@@ -625,7 +621,6 @@ pub(crate) mod tests {
|
||||
let puffin_manager = factory.build(object_store, TestPathProvider);
|
||||
|
||||
let file_id = RegionFileId::new(region_metadata.region_id, file_id);
|
||||
let file_id = RegionIndexId::new(file_id, 0);
|
||||
let mut puffin_writer = puffin_manager.writer(&file_id).await.unwrap();
|
||||
let (row_count, byte_count) = indexer.finish(&mut puffin_writer).await.unwrap();
|
||||
assert_eq!(row_count, 20);
|
||||
@@ -642,17 +637,17 @@ pub(crate) mod tests {
|
||||
.unwrap();
|
||||
let reader = blob_guard.reader().await.unwrap();
|
||||
let bloom_filter = BloomFilterReaderImpl::new(reader);
|
||||
let metadata = bloom_filter.metadata(None).await.unwrap();
|
||||
let metadata = bloom_filter.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(metadata.segment_count, 10);
|
||||
for i in 0..5 {
|
||||
let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[i] as usize];
|
||||
let bf = bloom_filter.bloom_filter(loc, None).await.unwrap();
|
||||
let bf = bloom_filter.bloom_filter(loc).await.unwrap();
|
||||
assert!(bf.contains(b"tag1"));
|
||||
}
|
||||
for i in 5..10 {
|
||||
let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[i] as usize];
|
||||
let bf = bloom_filter.bloom_filter(loc, None).await.unwrap();
|
||||
let bf = bloom_filter.bloom_filter(loc).await.unwrap();
|
||||
assert!(bf.contains(b"tag2"));
|
||||
}
|
||||
}
|
||||
@@ -667,13 +662,13 @@ pub(crate) mod tests {
|
||||
.unwrap();
|
||||
let reader = blob_guard.reader().await.unwrap();
|
||||
let bloom_filter = BloomFilterReaderImpl::new(reader);
|
||||
let metadata = bloom_filter.metadata(None).await.unwrap();
|
||||
let metadata = bloom_filter.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(metadata.segment_count, 5);
|
||||
for i in 0u64..20 {
|
||||
let idx = i as usize / 4;
|
||||
let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[idx] as usize];
|
||||
let bf = bloom_filter.bloom_filter(loc, None).await.unwrap();
|
||||
let bf = bloom_filter.bloom_filter(loc).await.unwrap();
|
||||
let mut buf = vec![];
|
||||
IndexValueCodec::encode_nonnull_value(ValueRef::UInt64(i), &sort_field, &mut buf)
|
||||
.unwrap();
|
||||
|
||||
@@ -16,12 +16,11 @@ use std::collections::{BTreeMap, BTreeSet, HashSet};
|
||||
use std::iter;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use common_telemetry::warn;
|
||||
use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
|
||||
use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReaderImpl};
|
||||
use index::bloom_filter::reader::BloomFilterReaderImpl;
|
||||
use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
|
||||
use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
|
||||
use index::fulltext_index::{Analyzer, Config};
|
||||
@@ -44,7 +43,7 @@ use crate::error::{
|
||||
PuffinReadBlobSnafu, Result,
|
||||
};
|
||||
use crate::metrics::INDEX_APPLY_ELAPSED;
|
||||
use crate::sst::file::RegionIndexId;
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::TYPE_FULLTEXT_INDEX;
|
||||
use crate::sst::index::fulltext_index::applier::builder::{FulltextRequest, FulltextTerm};
|
||||
use crate::sst::index::fulltext_index::{INDEX_BLOB_TYPE_BLOOM, INDEX_BLOB_TYPE_TANTIVY};
|
||||
@@ -54,95 +53,6 @@ use crate::sst::index::puffin_manager::{
|
||||
|
||||
pub mod builder;
|
||||
|
||||
/// Metrics for tracking fulltext index apply operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct FulltextIndexApplyMetrics {
|
||||
/// Total time spent applying the index.
|
||||
pub apply_elapsed: std::time::Duration,
|
||||
/// Number of blob cache misses.
|
||||
pub blob_cache_miss: usize,
|
||||
/// Number of directory cache hits.
|
||||
pub dir_cache_hit: usize,
|
||||
/// Number of directory cache misses.
|
||||
pub dir_cache_miss: usize,
|
||||
/// Elapsed time to initialize directory data.
|
||||
pub dir_init_elapsed: std::time::Duration,
|
||||
/// Metrics for bloom filter reads.
|
||||
pub bloom_filter_read_metrics: BloomFilterReadMetrics,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for FulltextIndexApplyMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
apply_elapsed,
|
||||
blob_cache_miss,
|
||||
dir_cache_hit,
|
||||
dir_cache_miss,
|
||||
dir_init_elapsed,
|
||||
bloom_filter_read_metrics,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?;
|
||||
|
||||
if *blob_cache_miss > 0 {
|
||||
write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?;
|
||||
}
|
||||
if *dir_cache_hit > 0 {
|
||||
write!(f, ", \"dir_cache_hit\":{}", dir_cache_hit)?;
|
||||
}
|
||||
if *dir_cache_miss > 0 {
|
||||
write!(f, ", \"dir_cache_miss\":{}", dir_cache_miss)?;
|
||||
}
|
||||
if !dir_init_elapsed.is_zero() {
|
||||
write!(f, ", \"dir_init_elapsed\":\"{:?}\"", dir_init_elapsed)?;
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
", \"bloom_filter_read_metrics\":{:?}",
|
||||
bloom_filter_read_metrics
|
||||
)?;
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl FulltextIndexApplyMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.apply_elapsed.is_zero()
|
||||
}
|
||||
|
||||
/// Collects metrics from a directory read operation.
|
||||
pub fn collect_dir_metrics(
|
||||
&mut self,
|
||||
elapsed: std::time::Duration,
|
||||
dir_metrics: puffin::puffin_manager::DirMetrics,
|
||||
) {
|
||||
self.dir_init_elapsed += elapsed;
|
||||
if dir_metrics.cache_hit {
|
||||
self.dir_cache_hit += 1;
|
||||
} else {
|
||||
self.dir_cache_miss += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.apply_elapsed += other.apply_elapsed;
|
||||
self.blob_cache_miss += other.blob_cache_miss;
|
||||
self.dir_cache_hit += other.dir_cache_hit;
|
||||
self.dir_cache_miss += other.dir_cache_miss;
|
||||
self.dir_init_elapsed += other.dir_init_elapsed;
|
||||
self.bloom_filter_read_metrics
|
||||
.merge_from(&other.bloom_filter_read_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
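The metrics plumbing in this hunk threads an `Option<&mut ...Metrics>` through the call chain and re-borrows it with `as_deref_mut()` so each callee can record into it without consuming it. A minimal sketch of that borrowing pattern with a toy metrics type:

```rust
#[derive(Default, Debug)]
struct Metrics {
    blob_cache_miss: usize,
    rows_scanned: usize,
}

/// A step that optionally records how many rows it scanned.
fn scan_step(rows: usize, metrics: Option<&mut Metrics>) {
    if let Some(m) = metrics {
        m.rows_scanned += rows;
    }
}

/// A driver that re-borrows the optional metrics for each step via `as_deref_mut`,
/// so the same `Option<&mut Metrics>` can be handed to several callees in sequence.
fn apply(mut metrics: Option<&mut Metrics>) {
    // Pretend the cached blob was missing.
    if let Some(m) = metrics.as_deref_mut() {
        m.blob_cache_miss += 1;
    }
    for rows in [10, 20, 30] {
        scan_step(rows, metrics.as_deref_mut());
    }
}

fn main() {
    let mut metrics = Metrics::default();
    apply(Some(&mut metrics));
    assert_eq!(metrics.blob_cache_miss, 1);
    assert_eq!(metrics.rows_scanned, 60);

    // Callers that don't need metrics simply pass `None`.
    apply(None);
}
```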
/// `FulltextIndexApplier` is responsible for applying fulltext index to the provided SST files
|
||||
pub struct FulltextIndexApplier {
|
||||
/// Requests to be applied.
|
||||
@@ -214,18 +124,14 @@ impl FulltextIndexApplier {
|
||||
impl FulltextIndexApplier {
|
||||
/// Applies fine-grained fulltext index to the specified SST file.
|
||||
/// Returns the row ids that match the queries.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
pub async fn apply_fine(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<BTreeSet<RowId>>> {
|
||||
let apply_start = Instant::now();
|
||||
let timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.start_timer();
|
||||
|
||||
let mut row_ids: Option<BTreeSet<RowId>> = None;
|
||||
for (column_id, request) in self.requests.iter() {
|
||||
@@ -234,13 +140,7 @@ impl FulltextIndexApplier {
|
||||
}
|
||||
|
||||
let Some(result) = self
|
||||
.apply_fine_one_column(
|
||||
file_size_hint,
|
||||
file_id,
|
||||
*column_id,
|
||||
request,
|
||||
metrics.as_deref_mut(),
|
||||
)
|
||||
.apply_fine_one_column(file_size_hint, file_id, *column_id, request)
|
||||
.await?
|
||||
else {
|
||||
continue;
|
||||
@@ -259,26 +159,18 @@ impl FulltextIndexApplier {
|
||||
}
|
||||
}
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = apply_start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.apply_elapsed += elapsed;
|
||||
if row_ids.is_none() {
|
||||
timer.stop_and_discard();
|
||||
}
|
||||
|
||||
Ok(row_ids)
|
||||
}
|
||||
|
||||
async fn apply_fine_one_column(
|
||||
&self,
|
||||
file_size_hint: Option<u64>,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
column_id: ColumnId,
|
||||
request: &FulltextRequest,
|
||||
metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<BTreeSet<RowId>>> {
|
||||
let blob_key = format!(
|
||||
"{INDEX_BLOB_TYPE_TANTIVY}-{}",
|
||||
@@ -286,7 +178,7 @@ impl FulltextIndexApplier {
|
||||
);
|
||||
let dir = self
|
||||
.index_source
|
||||
.dir(file_id, &blob_key, file_size_hint, metrics)
|
||||
.dir(file_id, &blob_key, file_size_hint)
|
||||
.await?;
|
||||
|
||||
let dir = match &dir {
|
||||
@@ -348,20 +240,15 @@ impl FulltextIndexApplier {
|
||||
///
|
||||
/// Row group id existing in the returned result means that the row group is searched.
|
||||
/// Empty ranges means that the row group is searched but no rows are found.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `row_groups` - Iterator of row group lengths and whether to search in the row group
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
pub async fn apply_coarse(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
row_groups: impl Iterator<Item = (usize, bool)>,
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<Vec<(usize, Vec<Range<usize>>)>>> {
|
||||
let apply_start = Instant::now();
|
||||
let timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.start_timer();
|
||||
|
||||
let (input, mut output) = Self::init_coarse_output(row_groups);
|
||||
let mut applied = false;
|
||||
@@ -379,38 +266,26 @@ impl FulltextIndexApplier {
|
||||
*column_id,
|
||||
&request.terms,
|
||||
&mut output,
|
||||
metrics.as_deref_mut(),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if !applied {
|
||||
timer.stop_and_discard();
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Self::adjust_coarse_output(input, &mut output);
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = apply_start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.apply_elapsed += elapsed;
|
||||
}
|
||||
|
||||
Ok(Some(output))
|
||||
}
|
||||
|
||||
async fn apply_coarse_one_column(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
column_id: ColumnId,
|
||||
terms: &[FulltextTerm],
|
||||
output: &mut [(usize, Vec<Range<usize>>)],
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<bool> {
|
||||
let blob_key = format!(
|
||||
"{INDEX_BLOB_TYPE_BLOOM}-{}",
|
||||
@@ -418,7 +293,7 @@ impl FulltextIndexApplier {
|
||||
);
|
||||
let Some(reader) = self
|
||||
.index_source
|
||||
.blob(file_id, &blob_key, file_size_hint, metrics.as_deref_mut())
|
||||
.blob(file_id, &blob_key, file_size_hint)
|
||||
.await?
|
||||
else {
|
||||
return Ok(false);
|
||||
@@ -440,7 +315,6 @@ impl FulltextIndexApplier {
|
||||
.content_length;
|
||||
let reader = CachedBloomFilterIndexBlobReader::new(
|
||||
file_id.file_id(),
|
||||
file_id.version,
|
||||
column_id,
|
||||
Tag::Fulltext,
|
||||
blob_size,
|
||||
@@ -462,13 +336,7 @@ impl FulltextIndexApplier {
|
||||
}
|
||||
|
||||
*row_group_output = applier
|
||||
.search(
|
||||
&predicates,
|
||||
row_group_output,
|
||||
metrics
|
||||
.as_deref_mut()
|
||||
.map(|m| &mut m.bloom_filter_read_metrics),
|
||||
)
|
||||
.search(&predicates, row_group_output)
|
||||
.await
|
||||
.context(ApplyBloomFilterIndexSnafu)?;
|
||||
}
|
||||
@@ -612,18 +480,11 @@ impl IndexSource {
|
||||
/// Returns `None` if the blob is not found.
|
||||
async fn blob(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
key: &str,
|
||||
file_size_hint: Option<u64>,
|
||||
metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<GuardWithMetadata<SstPuffinBlob>>> {
|
||||
let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?;
|
||||
|
||||
// Track cache miss if fallbacked to remote
|
||||
if fallbacked && let Some(m) = metrics {
|
||||
m.blob_cache_miss += 1;
|
||||
}
|
||||
|
||||
let res = reader.blob(key).await;
|
||||
match res {
|
||||
Ok(blob) => Ok(Some(blob)),
|
||||
@@ -650,28 +511,14 @@ impl IndexSource {
|
||||
/// Returns `None` if the directory is not found.
|
||||
async fn dir(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
key: &str,
|
||||
file_size_hint: Option<u64>,
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<GuardWithMetadata<SstPuffinDir>>> {
|
||||
let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?;
|
||||
|
||||
// Track cache miss if fallbacked to remote
|
||||
if fallbacked && let Some(m) = &mut metrics {
|
||||
m.blob_cache_miss += 1;
|
||||
}
|
||||
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let res = reader.dir(key).await;
|
||||
match res {
|
||||
Ok((dir, dir_metrics)) => {
|
||||
if let Some(m) = metrics {
|
||||
// Safety: start is Some when metrics is Some
|
||||
m.collect_dir_metrics(start.unwrap().elapsed(), dir_metrics);
|
||||
}
|
||||
Ok(Some(dir))
|
||||
}
|
||||
Ok(dir) => Ok(Some(dir)),
|
||||
Err(err) if err.is_blob_not_found() => Ok(None),
|
||||
Err(err) => {
|
||||
if fallbacked {
|
||||
@@ -679,16 +526,9 @@ impl IndexSource {
|
||||
} else {
|
||||
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.");
|
||||
let reader = self.build_remote(file_id, file_size_hint).await?;
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let res = reader.dir(key).await;
|
||||
match res {
|
||||
Ok((dir, dir_metrics)) => {
|
||||
if let Some(m) = metrics {
|
||||
// Safety: start is Some when metrics is Some
|
||||
m.collect_dir_metrics(start.unwrap().elapsed(), dir_metrics);
|
||||
}
|
||||
Ok(Some(dir))
|
||||
}
|
||||
Ok(dir) => Ok(Some(dir)),
|
||||
Err(err) if err.is_blob_not_found() => Ok(None),
|
||||
Err(err) => Err(err).context(PuffinReadBlobSnafu),
|
||||
}
|
||||
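`IndexSource::dir` and `blob` above try the locally cached puffin file first and fall back to the remote object store when the cache is unavailable or the cached copy turns out to be unreadable; a "blob not found" result is translated into `Ok(None)` because it simply means there is no index for that key. A hedged sketch of that control flow with toy reader types in place of the real puffin readers:

```rust
#[derive(Debug, PartialEq)]
#[allow(dead_code)]
enum ReadError {
    BlobNotFound,
    Io(String),
}

/// Toy reader: a lookup table standing in for a puffin reader.
struct Reader {
    blobs: Vec<(&'static str, &'static str)>,
}

impl Reader {
    fn blob(&self, key: &str) -> Result<&'static str, ReadError> {
        self.blobs
            .iter()
            .find(|(k, _)| *k == key)
            .map(|(_, v)| *v)
            .ok_or(ReadError::BlobNotFound)
    }
}

struct IndexSource {
    cached: Option<Reader>,
    remote: Reader,
}

impl IndexSource {
    /// Reads `key`, preferring the cache; falls back to the remote reader when the
    /// cache is absent or unreadable. `Ok(None)` means the blob does not exist.
    fn blob(&self, key: &str) -> Result<Option<&'static str>, ReadError> {
        let (reader, fallbacked) = match &self.cached {
            Some(r) => (r, false),
            None => (&self.remote, true),
        };
        match reader.blob(key) {
            Ok(b) => Ok(Some(b)),
            Err(ReadError::BlobNotFound) => Ok(None),
            Err(err) if !fallbacked => {
                // Cached copy is unreadable: retry against the remote store.
                eprintln!("cached index unreadable ({err:?}), falling back to remote");
                match self.remote.blob(key) {
                    Ok(b) => Ok(Some(b)),
                    Err(ReadError::BlobNotFound) => Ok(None),
                    Err(e) => Err(e),
                }
            }
            Err(err) => Err(err),
        }
    }
}

fn main() {
    let source = IndexSource {
        cached: None,
        remote: Reader { blobs: vec![("bloom-filter-v1", "blob-bytes")] },
    };
    assert_eq!(source.blob("bloom-filter-v1").unwrap(), Some("blob-bytes"));
    assert_eq!(source.blob("missing").unwrap(), None);
}
```

The same shape covers the directory variant: only the returned guard type differs, while the cache-miss bookkeeping and the remote fallback stay identical.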
@@ -700,7 +540,7 @@ impl IndexSource {
|
||||
/// Return reader and whether it is fallbacked to remote store.
|
||||
async fn ensure_reader(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<(SstPuffinReader, bool)> {
|
||||
match self.build_local_cache(file_id, file_size_hint).await {
|
||||
@@ -712,18 +552,14 @@ impl IndexSource {
|
||||
|
||||
async fn build_local_cache(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<Option<SstPuffinReader>> {
|
||||
let Some(file_cache) = &self.file_cache else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let index_key = IndexKey::new(
|
||||
file_id.region_id(),
|
||||
file_id.file_id(),
|
||||
FileType::Puffin(file_id.version),
|
||||
);
|
||||
let index_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin);
|
||||
if file_cache.get(index_key).await.is_none() {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -745,7 +581,7 @@ impl IndexSource {
|
||||
|
||||
async fn build_remote(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<SstPuffinReader> {
|
||||
let puffin_manager = self
|
||||
|
||||
@@ -481,7 +481,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::access_layer::RegionFilePathFactory;
|
||||
use crate::read::{Batch, BatchColumn};
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::fulltext_index::applier::FulltextIndexApplier;
|
||||
use crate::sst::index::fulltext_index::applier::builder::{
|
||||
FulltextQuery, FulltextRequest, FulltextTerm,
|
||||
@@ -672,8 +672,7 @@ mod tests {
|
||||
RegionFilePathFactory::new(table_dir.clone(), PathType::Bare),
|
||||
);
|
||||
let region_file_id = RegionFileId::new(region_metadata.region_id, sst_file_id);
|
||||
let index_id = RegionIndexId::new(region_file_id, 0);
|
||||
let mut writer = puffin_manager.writer(&index_id).await.unwrap();
|
||||
let mut writer = puffin_manager.writer(®ion_file_id).await.unwrap();
|
||||
let _ = indexer.finish(&mut writer).await.unwrap();
|
||||
writer.finish().await.unwrap();
|
||||
|
||||
@@ -725,14 +724,14 @@ mod tests {
|
||||
async move {
|
||||
match backend {
|
||||
FulltextBackend::Tantivy => {
|
||||
applier.apply_fine(index_id, None, None).await.unwrap()
|
||||
applier.apply_fine(region_file_id, None).await.unwrap()
|
||||
}
|
||||
FulltextBackend::Bloom => {
|
||||
let coarse_mask = coarse_mask.unwrap_or_default();
|
||||
let row_groups = (0..coarse_mask.len()).map(|i| (1, coarse_mask[i]));
|
||||
// row group id == row id
|
||||
let resp = applier
|
||||
.apply_coarse(index_id, None, row_groups, None)
|
||||
.apply_coarse(region_file_id, None, row_groups)
|
||||
.await
|
||||
.unwrap();
|
||||
resp.map(|r| {
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
|
||||
use common_telemetry::warn;
|
||||
|
||||
use crate::access_layer::TempFileCleaner;
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
use crate::sst::index::Indexer;
|
||||
|
||||
impl Indexer {
|
||||
@@ -24,9 +22,6 @@ impl Indexer {
|
||||
self.do_abort_fulltext_index().await;
|
||||
self.do_abort_bloom_filter().await;
|
||||
self.do_prune_intm_sst_dir().await;
|
||||
if self.write_cache_enabled {
|
||||
self.do_abort_clean_fs_temp_dir().await;
|
||||
}
|
||||
self.puffin_manager = None;
|
||||
}
|
||||
|
||||
@@ -92,18 +87,4 @@ impl Indexer {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async fn do_abort_clean_fs_temp_dir(&mut self) {
|
||||
let Some(puffin_manager) = &self.puffin_manager else {
|
||||
return;
|
||||
};
|
||||
let fs_accessor = puffin_manager.file_accessor();
|
||||
|
||||
let fs_handle = RegionIndexId::new(
|
||||
RegionFileId::new(self.region_id, self.file_id),
|
||||
self.index_version,
|
||||
)
|
||||
.to_string();
|
||||
TempFileCleaner::clean_atomic_dir_files(fs_accessor.store().store(), &[&fs_handle]).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ use common_telemetry::{debug, warn};
|
||||
use puffin::puffin_manager::{PuffinManager, PuffinWriter};
|
||||
use store_api::storage::ColumnId;
|
||||
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::puffin_manager::SstPuffinWriter;
|
||||
use crate::sst::index::statistics::{ByteCount, RowCount};
|
||||
use crate::sst::index::{
|
||||
@@ -56,18 +56,14 @@ impl Indexer {
|
||||
|
||||
self.do_prune_intm_sst_dir().await;
|
||||
output.file_size = self.do_finish_puffin_writer(writer).await;
|
||||
output.version = self.index_version;
|
||||
output
|
||||
}
|
||||
|
||||
async fn build_puffin_writer(&mut self) -> Option<SstPuffinWriter> {
|
||||
let puffin_manager = self.puffin_manager.clone()?;
|
||||
let puffin_manager = self.puffin_manager.take()?;
|
||||
|
||||
let err = match puffin_manager
|
||||
.writer(&RegionIndexId::new(
|
||||
RegionFileId::new(self.region_id, self.file_id),
|
||||
self.index_version,
|
||||
))
|
||||
.writer(&RegionFileId::new(self.region_id, self.file_id))
|
||||
.await
|
||||
{
|
||||
Ok(writer) => return Some(writer),
|
||||
|
||||
@@ -16,11 +16,10 @@ pub mod builder;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use common_telemetry::warn;
|
||||
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReadMetrics};
|
||||
use index::inverted_index::format::reader::InvertedIndexBlobReader;
|
||||
use index::inverted_index::search::index_apply::{
|
||||
ApplyOutput, IndexApplier, IndexNotFoundStrategy, SearchContext,
|
||||
};
|
||||
@@ -40,72 +39,11 @@ use crate::error::{
|
||||
ApplyInvertedIndexSnafu, MetadataSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu, Result,
|
||||
};
|
||||
use crate::metrics::{INDEX_APPLY_ELAPSED, INDEX_APPLY_MEMORY_USAGE};
|
||||
use crate::sst::file::RegionIndexId;
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::index::TYPE_INVERTED_INDEX;
|
||||
use crate::sst::index::inverted_index::INDEX_BLOB_TYPE;
|
||||
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
|
||||
|
||||
/// Metrics for tracking inverted index apply operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct InvertedIndexApplyMetrics {
|
||||
/// Total time spent applying the index.
|
||||
pub apply_elapsed: std::time::Duration,
|
||||
/// Number of blob cache misses (0 or 1).
|
||||
pub blob_cache_miss: usize,
|
||||
/// Total size of blobs read (in bytes).
|
||||
pub blob_read_bytes: u64,
|
||||
/// Metrics for inverted index reads.
|
||||
pub inverted_index_read_metrics: InvertedIndexReadMetrics,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InvertedIndexApplyMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
apply_elapsed,
|
||||
blob_cache_miss,
|
||||
blob_read_bytes,
|
||||
inverted_index_read_metrics,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?;
|
||||
|
||||
if *blob_cache_miss > 0 {
|
||||
write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?;
|
||||
}
|
||||
if *blob_read_bytes > 0 {
|
||||
write!(f, ", \"blob_read_bytes\":{}", blob_read_bytes)?;
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
", \"inverted_index_read_metrics\":{:?}",
|
||||
inverted_index_read_metrics
|
||||
)?;
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl InvertedIndexApplyMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.apply_elapsed.is_zero()
|
||||
}
|
||||
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.apply_elapsed += other.apply_elapsed;
|
||||
self.blob_cache_miss += other.blob_cache_miss;
|
||||
self.blob_read_bytes += other.blob_read_bytes;
|
||||
self.inverted_index_read_metrics
|
||||
.merge_from(&other.inverted_index_read_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
/// `InvertedIndexApplier` is responsible for applying predicates to the provided SST files
|
||||
/// and returning the relevant row group ids for further scan.
|
||||
pub(crate) struct InvertedIndexApplier {
|
||||
@@ -186,30 +124,24 @@ impl InvertedIndexApplier {
|
||||
self
|
||||
}
|
||||
|
||||
/// Applies predicates to the provided SST file id and returns the relevant row group ids.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
/// Applies predicates to the provided SST file id and returns the relevant row group ids
|
||||
pub async fn apply(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
mut metrics: Option<&mut InvertedIndexApplyMetrics>,
|
||||
) -> Result<ApplyOutput> {
|
||||
let start = Instant::now();
|
||||
let _timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_INVERTED_INDEX])
|
||||
.start_timer();
|
||||
|
||||
let context = SearchContext {
|
||||
// Encountering a non-existing column indicates that it doesn't match predicates.
|
||||
index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
|
||||
};
|
||||
|
||||
let mut cache_miss = 0;
|
||||
let blob = match self.cached_blob_reader(file_id, file_size_hint).await {
|
||||
Ok(Some(puffin_reader)) => puffin_reader,
|
||||
other => {
|
||||
cache_miss += 1;
|
||||
if let Err(err) = other {
|
||||
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.")
|
||||
}
|
||||
@@ -217,70 +149,38 @@ impl InvertedIndexApplier {
|
||||
}
|
||||
};
|
||||
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
|
||||
let result = if let Some(index_cache) = &self.inverted_index_cache {
|
||||
if let Some(index_cache) = &self.inverted_index_cache {
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
let mut index_reader = CachedInvertedIndexBlobReader::new(
|
||||
file_id.file_id(),
|
||||
file_id.version,
|
||||
blob_size,
|
||||
InvertedIndexBlobReader::new(blob),
|
||||
index_cache.clone(),
|
||||
);
|
||||
self.index_applier
|
||||
.apply(
|
||||
context,
|
||||
&mut index_reader,
|
||||
metrics
|
||||
.as_deref_mut()
|
||||
.map(|m| &mut m.inverted_index_read_metrics),
|
||||
)
|
||||
.apply(context, &mut index_reader)
|
||||
.await
|
||||
.context(ApplyInvertedIndexSnafu)
|
||||
} else {
|
||||
let mut index_reader = InvertedIndexBlobReader::new(blob);
|
||||
self.index_applier
|
||||
.apply(
|
||||
context,
|
||||
&mut index_reader,
|
||||
metrics
|
||||
.as_deref_mut()
|
||||
.map(|m| &mut m.inverted_index_read_metrics),
|
||||
)
|
||||
.apply(context, &mut index_reader)
|
||||
.await
|
||||
.context(ApplyInvertedIndexSnafu)
|
||||
};
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_INVERTED_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(metrics) = metrics {
|
||||
metrics.apply_elapsed = elapsed;
|
||||
metrics.blob_cache_miss = cache_miss;
|
||||
metrics.blob_read_bytes = blob_size;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Creates a blob reader from the cached index file.
|
||||
async fn cached_blob_reader(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<Option<BlobReader>> {
|
||||
let Some(file_cache) = &self.file_cache else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let index_key = IndexKey::new(
|
||||
file_id.region_id(),
|
||||
file_id.file_id(),
|
||||
FileType::Puffin(file_id.version),
|
||||
);
|
||||
let index_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin);
|
||||
if file_cache.get(index_key).await.is_none() {
|
||||
return Ok(None);
|
||||
};
|
||||
@@ -308,7 +208,7 @@ impl InvertedIndexApplier {
|
||||
/// Creates a blob reader from the remote index file.
|
||||
async fn remote_blob_reader(
|
||||
&self,
|
||||
file_id: RegionIndexId,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
) -> Result<BlobReader> {
|
||||
let puffin_manager = self
|
||||
@@ -354,7 +254,6 @@ mod tests {
|
||||
use store_api::storage::FileId;
|
||||
|
||||
use super::*;
|
||||
use crate::sst::index::RegionFileId;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_index_applier_apply_basic() {
|
||||
@@ -362,14 +261,13 @@ mod tests {
|
||||
PuffinManagerFactory::new_for_test_async("test_index_applier_apply_basic_").await;
|
||||
let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
|
||||
let file_id = RegionFileId::new(0.into(), FileId::random());
|
||||
let index_id = RegionIndexId::new(file_id, 0);
|
||||
let table_dir = "table_dir".to_string();
|
||||
|
||||
let puffin_manager = puffin_manager_factory.build(
|
||||
object_store.clone(),
|
||||
RegionFilePathFactory::new(table_dir.clone(), PathType::Bare),
|
||||
);
|
||||
let mut writer = puffin_manager.writer(&index_id).await.unwrap();
|
||||
let mut writer = puffin_manager.writer(&file_id).await.unwrap();
|
||||
writer
|
||||
.put_blob(
|
||||
INDEX_BLOB_TYPE,
|
||||
@@ -383,7 +281,7 @@ mod tests {
|
||||
|
||||
let mut mock_index_applier = MockIndexApplier::new();
|
||||
mock_index_applier.expect_memory_usage().returning(|| 100);
|
||||
mock_index_applier.expect_apply().returning(|_, _, _| {
|
||||
mock_index_applier.expect_apply().returning(|_, _| {
|
||||
Ok(ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
total_row_count: 100,
|
||||
@@ -399,7 +297,7 @@ mod tests {
|
||||
puffin_manager_factory,
|
||||
Default::default(),
|
||||
);
|
||||
let output = sst_index_applier.apply(index_id, None, None).await.unwrap();
|
||||
let output = sst_index_applier.apply(file_id, None).await.unwrap();
|
||||
assert_eq!(
|
||||
output,
|
||||
ApplyOutput {
|
||||
@@ -417,14 +315,13 @@ mod tests {
|
||||
.await;
|
||||
let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
|
||||
let file_id = RegionFileId::new(0.into(), FileId::random());
|
||||
let index_id = RegionIndexId::new(file_id, 0);
|
||||
let table_dir = "table_dir".to_string();
|
||||
|
||||
let puffin_manager = puffin_manager_factory.build(
|
||||
object_store.clone(),
|
||||
RegionFilePathFactory::new(table_dir.clone(), PathType::Bare),
|
||||
);
|
||||
let mut writer = puffin_manager.writer(&index_id).await.unwrap();
|
||||
let mut writer = puffin_manager.writer(&file_id).await.unwrap();
|
||||
writer
|
||||
.put_blob(
|
||||
"invalid_blob_type",
|
||||
@@ -448,7 +345,7 @@ mod tests {
|
||||
puffin_manager_factory,
|
||||
Default::default(),
|
||||
);
|
||||
let res = sst_index_applier.apply(index_id, None, None).await;
|
||||
let res = sst_index_applier.apply(file_id, None).await;
|
||||
assert!(format!("{:?}", res.unwrap_err()).contains("Blob not found"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -466,7 +466,7 @@ mod tests {
    use crate::cache::index::inverted_index::InvertedIndexCache;
    use crate::metrics::CACHE_BYTES;
    use crate::read::BatchColumn;
    use crate::sst::file::{RegionFileId, RegionIndexId};
    use crate::sst::file::RegionFileId;
    use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
    use crate::sst::index::puffin_manager::PuffinManagerFactory;

@@ -591,8 +591,7 @@ mod tests {
        );

        let sst_file_id = RegionFileId::new(region_metadata.region_id, sst_file_id);
        let index_id = RegionIndexId::new(sst_file_id, 0);
        let mut writer = puffin_manager.writer(&index_id).await.unwrap();
        let mut writer = puffin_manager.writer(&sst_file_id).await.unwrap();
        let (row_count, _) = creator.finish(&mut writer).await.unwrap();
        assert_eq!(row_count, rows.len() * segment_row_count);
        writer.finish().await.unwrap();
@@ -616,7 +615,7 @@ mod tests {
            .unwrap();
        Box::pin(async move {
            applier
                .apply(index_id, None, None)
                .apply(sst_file_id, None)
                .await
                .unwrap()
                .matched_segment_ids

@@ -32,14 +32,14 @@ use crate::metrics::{
    INDEX_PUFFIN_FLUSH_OP_TOTAL, INDEX_PUFFIN_READ_BYTES_TOTAL, INDEX_PUFFIN_READ_OP_TOTAL,
    INDEX_PUFFIN_WRITE_BYTES_TOTAL, INDEX_PUFFIN_WRITE_OP_TOTAL, StagerMetrics,
};
use crate::sst::file::RegionIndexId;
use crate::sst::file::RegionFileId;
use crate::sst::index::store::{self, InstrumentedStore};

type InstrumentedRangeReader = store::InstrumentedRangeReader<'static>;
type InstrumentedAsyncWrite = store::InstrumentedAsyncWrite<'static, FuturesAsyncWriter>;

pub(crate) type SstPuffinManager =
    FsPuffinManager<Arc<BoundedStager<RegionIndexId>>, ObjectStorePuffinFileAccessor>;
    FsPuffinManager<Arc<BoundedStager<RegionFileId>>, ObjectStorePuffinFileAccessor>;
pub(crate) type SstPuffinReader = <SstPuffinManager as PuffinManager>::Reader;
pub(crate) type SstPuffinWriter = <SstPuffinManager as PuffinManager>::Writer;
pub(crate) type SstPuffinBlob = <SstPuffinReader as PuffinReader>::Blob;
@@ -52,7 +52,7 @@ const STAGING_DIR: &str = "staging";
#[derive(Clone)]
pub struct PuffinManagerFactory {
    /// The stager used by the puffin manager.
    stager: Arc<BoundedStager<RegionIndexId>>,
    stager: Arc<BoundedStager<RegionFileId>>,

    /// The size of the write buffer used to create object store.
    write_buffer_size: Option<usize>,
@@ -92,7 +92,7 @@ impl PuffinManagerFactory {
        SstPuffinManager::new(self.stager.clone(), puffin_file_accessor)
    }

    pub(crate) async fn purge_stager(&self, file_id: RegionIndexId) -> Result<()> {
    pub(crate) async fn purge_stager(&self, file_id: RegionFileId) -> Result<()> {
        self.stager
            .purge(&file_id)
            .await
@@ -136,22 +136,16 @@ impl ObjectStorePuffinFileAccessor {
            path_provider,
        }
    }

    pub fn store(&self) -> &InstrumentedStore {
        &self.object_store
    }
}

#[async_trait]
impl PuffinFileAccessor for ObjectStorePuffinFileAccessor {
    type Reader = InstrumentedRangeReader;
    type Writer = InstrumentedAsyncWrite;
    type FileHandle = RegionIndexId;
    type FileHandle = RegionFileId;

    async fn reader(&self, handle: &RegionIndexId) -> PuffinResult<Self::Reader> {
        let file_path = self
            .path_provider
            .build_index_file_path_with_version(*handle);
    async fn reader(&self, handle: &RegionFileId) -> PuffinResult<Self::Reader> {
        let file_path = self.path_provider.build_index_file_path(*handle);
        self.object_store
            .range_reader(
                &file_path,
@@ -163,10 +157,8 @@ impl PuffinFileAccessor for ObjectStorePuffinFileAccessor {
            .context(puffin_error::ExternalSnafu)
    }

    async fn writer(&self, handle: &RegionIndexId) -> PuffinResult<Self::Writer> {
        let file_path = self
            .path_provider
            .build_index_file_path_with_version(*handle);
    async fn writer(&self, handle: &RegionFileId) -> PuffinResult<Self::Writer> {
        let file_path = self.path_provider.build_index_file_path(*handle);
        self.object_store
            .writer(
                &file_path,
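The hunks above swap the accessor's associated FileHandle type between RegionFileId and RegionIndexId, which in turn changes the signature of every reader/writer and how storage paths are derived. A tiny sketch of that mechanism, under purely hypothetical names and handle types (plain numeric ids instead of the crate's types), might look like this.

// Sketch: the associated type on the trait decides both the handle shape and
// the path layout; swapping the implementation swaps every call site's handle.
trait FileAccessor {
    type FileHandle;
    fn path(&self, handle: &Self::FileHandle) -> String;
}

struct PlainAccessor;
struct VersionedAccessor;

impl FileAccessor for PlainAccessor {
    type FileHandle = u64; // stands in for a file id
    fn path(&self, handle: &u64) -> String {
        format!("index/{handle}.puffin")
    }
}

impl FileAccessor for VersionedAccessor {
    type FileHandle = (u64, u64); // (file id, index version)
    fn path(&self, &(file_id, version): &(u64, u64)) -> String {
        if version == 0 {
            format!("index/{file_id}.puffin")
        } else {
            format!("index/{file_id}.{version}.puffin")
        }
    }
}

fn main() {
    assert_eq!(PlainAccessor.path(&7), "index/7.puffin");
    assert_eq!(VersionedAccessor.path(&(7, 2)), "index/7.2.puffin");
}
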
@@ -192,7 +184,7 @@ mod tests {
    use store_api::storage::FileId;

    use super::*;
    use crate::sst::file::{RegionFileId, RegionIndexId};
    use crate::sst::file::RegionFileId;

    struct TestFilePathProvider;

@@ -201,10 +193,6 @@ mod tests {
            file_id.file_id().to_string()
        }

        fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
            index_id.file_id.file_id().to_string()
        }

        fn build_sst_file_path(&self, file_id: RegionFileId) -> String {
            file_id.file_id().to_string()
        }
@@ -218,7 +206,7 @@ mod tests {
        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
        let manager = factory.build(object_store, TestFilePathProvider);

        let file_id = RegionIndexId::new(RegionFileId::new(0.into(), FileId::random()), 0);
        let file_id = RegionFileId::new(0.into(), FileId::random());
        let blob_key = "blob-key";
        let dir_key = "dir-key";
        let raw_data = b"hello world!";
@@ -257,7 +245,7 @@ mod tests {
        let bs = blob_reader.read(0..meta.content_length).await.unwrap();
        assert_eq!(&*bs, raw_data);

        let (dir_guard, _metrics) = reader.dir(dir_key).await.unwrap();
        let dir_guard = reader.dir(dir_key).await.unwrap();
        let file = dir_guard.path().join("hello");
        let data = tokio::fs::read(file).await.unwrap();
        assert_eq!(data, raw_data);

@@ -49,10 +49,6 @@ impl InstrumentedStore {
        }
    }

    pub fn store(&self) -> &ObjectStore {
        &self.object_store
    }

    /// Set the size of the write buffer.
    pub fn with_write_buffer_size(mut self, write_buffer_size: Option<usize>) -> Self {
        self.write_buffer_size = write_buffer_size.filter(|&size| size > 0);

@@ -20,7 +20,7 @@ use store_api::region_request::PathType;
use store_api::storage::{FileId, RegionId};

use crate::error::UnexpectedSnafu;
use crate::sst::file::{RegionFileId, RegionIndexId};
use crate::sst::file::RegionFileId;

/// Generate region dir from table_dir, region_id and path_type
pub fn region_dir_from_table_dir(
@@ -46,68 +46,14 @@ pub fn sst_file_path(table_dir: &str, region_file_id: RegionFileId, path_type: P
    )
}

pub fn index_file_path(table_dir: &str, index_id: RegionIndexId, path_type: PathType) -> String {
    let region_dir = region_dir_from_table_dir(table_dir, index_id.file_id.region_id(), path_type);
    let index_dir = util::join_dir(&region_dir, "index");

    let filename = if index_id.version == 0 {
        format!("{}.puffin", index_id.file_id.file_id())
    } else {
        format!("{}.{}.puffin", index_id.file_id.file_id(), index_id.version)
    };

    util::join_path(&index_dir, &filename)
}

/// Legacy function for backward compatibility - creates index file path using RegionFileId with version 0
pub fn index_file_path_legacy(
pub fn index_file_path(
    table_dir: &str,
    region_file_id: RegionFileId,
    path_type: PathType,
) -> String {
    let index_id = RegionIndexId::new(region_file_id, 0);
    index_file_path(table_dir, index_id, path_type)
}

/// Parse file ID and version from index filename
pub fn parse_index_file_info(filepath: &str) -> crate::error::Result<(FileId, u64)> {
    let filename = filepath.rsplit('/').next().context(UnexpectedSnafu {
        reason: format!("invalid file path: {}", filepath),
    })?;
    let parts: Vec<&str> = filename.split('.').collect();

    if parts.len() == 2 && parts[1] == "puffin" {
        // Legacy format: {file_id}.puffin (version 0)
        let file_id = parts[0];
        FileId::parse_str(file_id).map(|id| (id, 0)).map_err(|e| {
            UnexpectedSnafu {
                reason: format!("invalid file id: {}, err: {}", file_id, e),
            }
            .build()
        })
    } else if parts.len() == 3 && parts[2] == "puffin" {
        // New format: {file_id}.{version}.puffin
        let file_id = parts[0];
        let version = parts[1].parse::<u64>().map_err(|_| {
            UnexpectedSnafu {
                reason: format!("invalid version in file name: {}", filename),
            }
            .build()
        })?;
        FileId::parse_str(file_id)
            .map(|id| (id, version))
            .map_err(|e| {
                UnexpectedSnafu {
                    reason: format!("invalid file id: {}, err: {}", file_id, e),
                }
                .build()
            })
    } else {
        UnexpectedSnafu {
            reason: format!("invalid index file name: {}", filename),
        }
        .fail()
    }
    let region_dir = region_dir_from_table_dir(table_dir, region_file_id.region_id(), path_type);
    let index_dir = util::join_dir(&region_dir, "index");
    util::join_path(&index_dir, &format!("{}.puffin", region_file_id.file_id()))
}

/// Get RegionFileId from sst or index filename
@@ -165,59 +111,17 @@ mod tests {
    fn test_index_file_path() {
        let file_id = FileId::random();
        let region_file_id = RegionFileId::new(RegionId::new(1, 2), file_id);
        let index_id = RegionIndexId::new(region_file_id, 0);
        assert_eq!(
            index_file_path("table_dir", index_id, PathType::Bare),
            index_file_path("table_dir", region_file_id, PathType::Bare),
            format!("table_dir/1_0000000002/index/{}.puffin", file_id)
        );
        assert_eq!(
            index_file_path("table_dir", index_id, PathType::Data),
            index_file_path("table_dir", region_file_id, PathType::Data),
            format!("table_dir/1_0000000002/data/index/{}.puffin", file_id)
        );
        assert_eq!(
            index_file_path("table_dir", index_id, PathType::Metadata),
            index_file_path("table_dir", region_file_id, PathType::Metadata),
            format!("table_dir/1_0000000002/metadata/index/{}.puffin", file_id)
        );
    }

    #[test]
    fn test_index_file_path_versioned() {
        let file_id = FileId::random();
        let region_file_id = RegionFileId::new(RegionId::new(1, 2), file_id);
        let index_id_v1 = RegionIndexId::new(region_file_id, 1);
        let index_id_v2 = RegionIndexId::new(region_file_id, 2);

        assert_eq!(
            index_file_path("table_dir", index_id_v1, PathType::Bare),
            format!("table_dir/1_0000000002/index/{}.1.puffin", file_id)
        );
        assert_eq!(
            index_file_path("table_dir", index_id_v2, PathType::Bare),
            format!("table_dir/1_0000000002/index/{}.2.puffin", file_id)
        );
    }

    #[test]
    fn test_parse_index_file_info() {
        // Test legacy format
        let file_id = FileId::random();
        let result =
            parse_index_file_info(&format!("table_dir/1_0000000002/index/{file_id}.puffin"))
                .unwrap();
        assert_eq!(result.0.to_string(), file_id.to_string());
        assert_eq!(result.1, 0);

        // Test versioned format
        let result =
            parse_index_file_info(&format!("table_dir/1_0000000002/index/{file_id}.1.puffin"))
                .unwrap();
        assert_eq!(result.0.to_string(), file_id.to_string());
        assert_eq!(result.1, 1);

        let result =
            parse_index_file_info(&format!("table_dir/1_0000000002/index/{file_id}.42.puffin"))
                .unwrap();
        assert_eq!(result.0.to_string(), file_id.to_string());
        assert_eq!(result.1, 42);
    }
}

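The naming convention exercised by the tests above is the core of the versioned-index change: version 0 keeps the legacy "<file_id>.puffin" name, while non-zero versions become "<file_id>.<version>.puffin". Below is a minimal, self-contained round-trip sketch of that convention; it uses plain strings as a stand-in for the crate's FileId type and is not the repository's implementation.

// Sketch: build a versioned index filename and parse it back.
fn index_file_name(file_id: &str, version: u64) -> String {
    if version == 0 {
        format!("{file_id}.puffin")
    } else {
        format!("{file_id}.{version}.puffin")
    }
}

/// Parses "<file_id>[.<version>].puffin" back into (file_id, version).
fn parse_index_file_name(name: &str) -> Option<(String, u64)> {
    let parts: Vec<&str> = name.split('.').collect();
    match parts.as_slice() {
        [file_id, "puffin"] => Some((file_id.to_string(), 0)),
        [file_id, version, "puffin"] => {
            version.parse().ok().map(|v| (file_id.to_string(), v))
        }
        _ => None,
    }
}

fn main() {
    assert_eq!(index_file_name("abc", 0), "abc.puffin");
    assert_eq!(index_file_name("abc", 2), "abc.2.puffin");
    assert_eq!(parse_index_file_name("abc.puffin"), Some(("abc".to_string(), 0)));
    assert_eq!(parse_index_file_name("abc.42.puffin"), Some(("abc".to_string(), 42)));
    assert_eq!(parse_index_file_name("abc.not-a-number.puffin"), None);
}
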
@@ -117,7 +117,7 @@ mod tests {
|
||||
use crate::config::IndexConfig;
|
||||
use crate::read::{BatchBuilder, BatchReader, FlatSource};
|
||||
use crate::region::options::{IndexOptions, InvertedIndexOptions};
|
||||
use crate::sst::file::{FileHandle, FileMeta, RegionFileId, RegionIndexId};
|
||||
use crate::sst::file::{FileHandle, FileMeta, RegionFileId};
|
||||
use crate::sst::file_purger::NoopFilePurger;
|
||||
use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplierBuilder;
|
||||
use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
|
||||
@@ -144,11 +144,7 @@ mod tests {
|
||||
|
||||
impl FilePathProvider for FixedPathProvider {
|
||||
fn build_index_file_path(&self, _file_id: RegionFileId) -> String {
|
||||
location::index_file_path_legacy(FILE_DIR, self.region_file_id, PathType::Bare)
|
||||
}
|
||||
|
||||
fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
|
||||
location::index_file_path(FILE_DIR, index_id, PathType::Bare)
|
||||
location::index_file_path(FILE_DIR, self.region_file_id, PathType::Bare)
|
||||
}
|
||||
|
||||
fn build_sst_file_path(&self, _file_id: RegionFileId) -> String {
|
||||
@@ -160,7 +156,7 @@ mod tests {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl IndexerBuilder for NoopIndexBuilder {
|
||||
async fn build(&self, _file_id: FileId, _index_version: u64) -> Indexer {
|
||||
async fn build(&self, _file_id: FileId) -> Indexer {
|
||||
Indexer::default()
|
||||
}
|
||||
}
|
||||
@@ -715,7 +711,6 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size,
|
||||
puffin_manager,
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager,
|
||||
index_options: IndexOptions {
|
||||
inverted_index: InvertedIndexOptions {
|
||||
@@ -774,7 +769,7 @@ mod tests {
|
||||
available_indexes: info.index_metadata.build_available_indexes(),
|
||||
indexes: info.index_metadata.build_indexes(),
|
||||
index_file_size: info.index_metadata.file_size,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_row_groups: info.num_row_groups,
|
||||
num_rows: info.num_rows as u64,
|
||||
sequence: None,
|
||||
@@ -1095,7 +1090,6 @@ mod tests {
|
||||
metadata: metadata.clone(),
|
||||
row_group_size,
|
||||
puffin_manager,
|
||||
write_cache_enabled: false,
|
||||
intermediate_manager,
|
||||
index_options: IndexOptions {
|
||||
inverted_index: InvertedIndexOptions {
|
||||
|
||||
@@ -45,7 +45,6 @@ use crate::sst::parquet::format::ReadFormat;
|
||||
use crate::sst::parquet::reader::{
|
||||
FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext,
|
||||
};
|
||||
use crate::sst::parquet::row_group::ParquetFetchMetrics;
|
||||
|
||||
/// Checks if a row group contains delete operations by examining the min value of op_type column.
|
||||
///
|
||||
@@ -118,16 +117,11 @@ impl FileRange {
|
||||
pub(crate) async fn reader(
|
||||
&self,
|
||||
selector: Option<TimeSeriesRowSelector>,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<PruneReader> {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder
|
||||
.build(
|
||||
self.row_group_idx,
|
||||
self.row_selection.clone(),
|
||||
fetch_metrics,
|
||||
)
|
||||
.build(self.row_group_idx, self.row_selection.clone())
|
||||
.await?;
|
||||
|
||||
let use_last_row_reader = if selector
|
||||
@@ -174,18 +168,11 @@ impl FileRange {
|
||||
}
|
||||
|
||||
/// Creates a flat reader that returns RecordBatch.
|
||||
pub(crate) async fn flat_reader(
|
||||
&self,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<FlatPruneReader> {
|
||||
pub(crate) async fn flat_reader(&self) -> Result<FlatPruneReader> {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder
|
||||
.build(
|
||||
self.row_group_idx,
|
||||
self.row_selection.clone(),
|
||||
fetch_metrics,
|
||||
)
|
||||
.build(self.row_group_idx, self.row_selection.clone())
|
||||
.await?;
|
||||
|
||||
// Compute skip_fields once for this row group
|
||||
|
||||
@@ -40,10 +40,7 @@ use datatypes::arrow::datatypes::{SchemaRef, UInt32Type};
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::prelude::DataType;
|
||||
use datatypes::vectors::{Helper, Vector};
|
||||
use mito_codec::row_converter::{
|
||||
CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec,
|
||||
build_primary_key_codec_with_fields,
|
||||
};
|
||||
use mito_codec::row_converter::{SortField, build_primary_key_codec_with_fields};
|
||||
use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
|
||||
use parquet::file::statistics::Statistics;
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
@@ -51,8 +48,7 @@ use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
|
||||
use store_api::storage::{ColumnId, SequenceNumber};
|
||||
|
||||
use crate::error::{
|
||||
ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu,
|
||||
NewRecordBatchSnafu, Result,
|
||||
ConvertVectorSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result,
|
||||
};
|
||||
use crate::read::{Batch, BatchBuilder, BatchColumn};
|
||||
use crate::sst::file::{FileMeta, FileTimeRange};
|
||||
@@ -390,13 +386,6 @@ impl ReadFormat {
|
||||
}
|
||||
}
|
||||
|
||||
/// Enables or disables eager decoding of primary key values into batches.
|
||||
pub(crate) fn set_decode_primary_key_values(&mut self, decode: bool) {
|
||||
if let ReadFormat::PrimaryKey(format) = self {
|
||||
format.set_decode_primary_key_values(decode);
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a sequence array to override.
|
||||
pub(crate) fn new_override_sequence_array(&self, length: usize) -> Option<ArrayRef> {
|
||||
match self {
|
||||
@@ -422,8 +411,6 @@ pub struct PrimaryKeyReadFormat {
|
||||
field_id_to_projected_index: HashMap<ColumnId, usize>,
|
||||
/// Sequence number to override the sequence read from the SST.
|
||||
override_sequence: Option<SequenceNumber>,
|
||||
/// Codec used to decode primary key values if eager decoding is enabled.
|
||||
primary_key_codec: Option<Arc<dyn PrimaryKeyCodec>>,
|
||||
}
|
||||
|
||||
impl PrimaryKeyReadFormat {
|
||||
@@ -452,7 +439,6 @@ impl PrimaryKeyReadFormat {
|
||||
projection_indices: format_projection.projection_indices,
|
||||
field_id_to_projected_index: format_projection.column_id_to_projected_index,
|
||||
override_sequence: None,
|
||||
primary_key_codec: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -461,15 +447,6 @@ impl PrimaryKeyReadFormat {
|
||||
self.override_sequence = sequence;
|
||||
}
|
||||
|
||||
/// Enables or disables eager decoding of primary key values into batches.
|
||||
pub(crate) fn set_decode_primary_key_values(&mut self, decode: bool) {
|
||||
self.primary_key_codec = if decode {
|
||||
Some(build_primary_key_codec(&self.metadata))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
|
||||
/// Gets the arrow schema of the SST file.
|
||||
///
|
||||
/// This schema is computed from the region metadata but should be the same
|
||||
@@ -584,12 +561,7 @@ impl PrimaryKeyReadFormat {
|
||||
});
|
||||
}
|
||||
|
||||
let mut batch = builder.build()?;
|
||||
if let Some(codec) = &self.primary_key_codec {
|
||||
let pk_values: CompositeValues =
|
||||
codec.decode(batch.primary_key()).context(DecodeSnafu)?;
|
||||
batch.set_pk_values(pk_values);
|
||||
}
|
||||
let batch = builder.build()?;
|
||||
batches.push_back(batch);
|
||||
}
|
||||
|
||||
|
||||
@@ -52,21 +52,15 @@ use crate::metrics::{
|
||||
use crate::read::prune::{PruneReader, Source};
|
||||
use crate::read::{Batch, BatchReader};
|
||||
use crate::sst::file::FileHandle;
|
||||
use crate::sst::index::bloom_filter::applier::{
|
||||
BloomFilterIndexApplierRef, BloomFilterIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::index::fulltext_index::applier::{
|
||||
FulltextIndexApplierRef, FulltextIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::index::inverted_index::applier::{
|
||||
InvertedIndexApplierRef, InvertedIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplierRef;
|
||||
use crate::sst::index::fulltext_index::applier::FulltextIndexApplierRef;
|
||||
use crate::sst::index::inverted_index::applier::InvertedIndexApplierRef;
|
||||
use crate::sst::parquet::file_range::{
|
||||
FileRangeContext, FileRangeContextRef, PreFilterMode, row_group_contains_delete,
|
||||
};
|
||||
use crate::sst::parquet::format::{ReadFormat, need_override_sequence};
|
||||
use crate::sst::parquet::metadata::MetadataLoader;
|
||||
use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics};
|
||||
use crate::sst::parquet::row_group::InMemoryRowGroup;
|
||||
use crate::sst::parquet::row_selection::RowGroupSelection;
|
||||
use crate::sst::parquet::stats::RowGroupPruningStats;
|
||||
use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY};
|
||||
@@ -127,8 +121,6 @@ pub struct ParquetReaderBuilder {
|
||||
compaction: bool,
|
||||
/// Mode to pre-filter columns.
|
||||
pre_filter_mode: PreFilterMode,
|
||||
/// Whether to decode primary key values eagerly when reading primary key format SSTs.
|
||||
decode_primary_key_values: bool,
|
||||
}
|
||||
|
||||
impl ParquetReaderBuilder {
|
||||
@@ -154,7 +146,6 @@ impl ParquetReaderBuilder {
|
||||
flat_format: false,
|
||||
compaction: false,
|
||||
pre_filter_mode: PreFilterMode::All,
|
||||
decode_primary_key_values: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,13 +230,6 @@ impl ParquetReaderBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Decodes primary key values eagerly when reading primary key format SSTs.
|
||||
#[must_use]
|
||||
pub(crate) fn decode_primary_key_values(mut self, decode: bool) -> Self {
|
||||
self.decode_primary_key_values = decode;
|
||||
self
|
||||
}
|
||||
|
||||
/// Builds a [ParquetReader].
|
||||
///
|
||||
/// This needs to perform IO operation.
|
||||
@@ -269,9 +253,7 @@ impl ParquetReaderBuilder {
|
||||
let file_size = self.file_handle.meta_ref().file_size;
|
||||
|
||||
// Loads parquet metadata of the file.
|
||||
let parquet_meta = self
|
||||
.read_parquet_metadata(&file_path, file_size, &mut metrics.metadata_cache_metrics)
|
||||
.await?;
|
||||
let parquet_meta = self.read_parquet_metadata(&file_path, file_size).await?;
|
||||
// Decodes region metadata.
|
||||
let key_value_meta = parquet_meta.file_metadata().key_value_metadata();
|
||||
// Gets the metadata stored in the SST.
|
||||
@@ -302,9 +284,6 @@ impl ParquetReaderBuilder {
|
||||
self.compaction,
|
||||
)?
|
||||
};
|
||||
if self.decode_primary_key_values {
|
||||
read_format.set_decode_primary_key_values(true);
|
||||
}
|
||||
if need_override_sequence(&parquet_meta) {
|
||||
read_format
|
||||
.set_override_sequence(self.file_handle.meta_ref().sequence.map(|x| x.get()));
|
||||
@@ -399,34 +378,25 @@ impl ParquetReaderBuilder {
|
||||
&self,
|
||||
file_path: &str,
|
||||
file_size: u64,
|
||||
cache_metrics: &mut MetadataCacheMetrics,
|
||||
) -> Result<Arc<ParquetMetaData>> {
|
||||
let start = Instant::now();
|
||||
let _t = READ_STAGE_ELAPSED
|
||||
.with_label_values(&["read_parquet_metadata"])
|
||||
.start_timer();
|
||||
|
||||
let file_id = self.file_handle.file_id();
|
||||
// Tries to get from cache with metrics tracking.
|
||||
if let Some(metadata) = self
|
||||
.cache_strategy
|
||||
.get_parquet_meta_data(file_id, cache_metrics)
|
||||
.await
|
||||
{
|
||||
cache_metrics.metadata_load_cost += start.elapsed();
|
||||
// Tries to get from global cache.
|
||||
if let Some(metadata) = self.cache_strategy.get_parquet_meta_data(file_id).await {
|
||||
return Ok(metadata);
|
||||
}
|
||||
|
||||
// Cache miss, load metadata directly.
|
||||
let metadata_loader = MetadataLoader::new(self.object_store.clone(), file_path, file_size);
|
||||
let metadata = metadata_loader.load().await?;
|
||||
|
||||
let metadata = Arc::new(metadata);
|
||||
// Cache the metadata.
|
||||
self.cache_strategy
|
||||
.put_parquet_meta_data(file_id, metadata.clone());
|
||||
|
||||
cache_metrics.metadata_load_cost += start.elapsed();
|
||||
Ok(metadata)
|
||||
}
|
||||
|
||||
@@ -557,11 +527,7 @@ impl ParquetReaderBuilder {
|
||||
// Slow path: apply the index from the file.
|
||||
let file_size_hint = self.file_handle.meta_ref().index_file_size();
|
||||
let apply_res = index_applier
|
||||
.apply_fine(
|
||||
self.file_handle.index_id(),
|
||||
Some(file_size_hint),
|
||||
metrics.fulltext_index_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply_fine(self.file_handle.file_id(), Some(file_size_hint))
|
||||
.await;
|
||||
let selection = match apply_res {
|
||||
Ok(Some(res)) => {
|
||||
@@ -629,17 +595,13 @@ impl ParquetReaderBuilder {
|
||||
// Slow path: apply the index from the file.
|
||||
let file_size_hint = self.file_handle.meta_ref().index_file_size();
|
||||
let apply_res = index_applier
|
||||
.apply(
|
||||
self.file_handle.index_id(),
|
||||
Some(file_size_hint),
|
||||
metrics.inverted_index_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply(self.file_handle.file_id(), Some(file_size_hint))
|
||||
.await;
|
||||
let selection = match apply_res {
|
||||
Ok(apply_output) => RowGroupSelection::from_inverted_index_apply_output(
|
||||
Ok(output) => RowGroupSelection::from_inverted_index_apply_output(
|
||||
row_group_size,
|
||||
num_row_groups,
|
||||
apply_output,
|
||||
output,
|
||||
),
|
||||
Err(err) => {
|
||||
handle_index_error!(err, self.file_handle, INDEX_TYPE_INVERTED);
|
||||
@@ -708,12 +670,7 @@ impl ParquetReaderBuilder {
|
||||
)
|
||||
});
|
||||
let apply_res = index_applier
|
||||
.apply(
|
||||
self.file_handle.index_id(),
|
||||
Some(file_size_hint),
|
||||
rgs,
|
||||
metrics.bloom_filter_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply(self.file_handle.file_id(), Some(file_size_hint), rgs)
|
||||
.await;
|
||||
let mut selection = match apply_res {
|
||||
Ok(apply_output) => {
|
||||
@@ -791,12 +748,7 @@ impl ParquetReaderBuilder {
|
||||
)
|
||||
});
|
||||
let apply_res = index_applier
|
||||
.apply_coarse(
|
||||
self.file_handle.index_id(),
|
||||
Some(file_size_hint),
|
||||
rgs,
|
||||
metrics.fulltext_index_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply_coarse(self.file_handle.file_id(), Some(file_size_hint), rgs)
|
||||
.await;
|
||||
let mut selection = match apply_res {
|
||||
Ok(Some(apply_output)) => {
|
||||
@@ -940,7 +892,7 @@ fn all_required_row_groups_searched(
|
||||
}
|
||||
|
||||
/// Metrics of filtering rows groups and rows.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub(crate) struct ReaderFilterMetrics {
|
||||
/// Number of row groups before filtering.
|
||||
pub(crate) rg_total: usize,
|
||||
@@ -963,13 +915,6 @@ pub(crate) struct ReaderFilterMetrics {
|
||||
pub(crate) rows_bloom_filtered: usize,
|
||||
/// Number of rows filtered by precise filter.
|
||||
pub(crate) rows_precise_filtered: usize,
|
||||
|
||||
/// Optional metrics for inverted index applier.
|
||||
pub(crate) inverted_index_apply_metrics: Option<InvertedIndexApplyMetrics>,
|
||||
/// Optional metrics for bloom filter index applier.
|
||||
pub(crate) bloom_filter_apply_metrics: Option<BloomFilterIndexApplyMetrics>,
|
||||
/// Optional metrics for fulltext index applier.
|
||||
pub(crate) fulltext_index_apply_metrics: Option<FulltextIndexApplyMetrics>,
|
||||
}
|
||||
|
||||
impl ReaderFilterMetrics {
|
||||
@@ -986,23 +931,6 @@ impl ReaderFilterMetrics {
|
||||
self.rows_inverted_filtered += other.rows_inverted_filtered;
|
||||
self.rows_bloom_filtered += other.rows_bloom_filtered;
|
||||
self.rows_precise_filtered += other.rows_precise_filtered;
|
||||
|
||||
// Merge optional applier metrics
|
||||
if let Some(other_metrics) = &other.inverted_index_apply_metrics {
|
||||
self.inverted_index_apply_metrics
|
||||
.get_or_insert_with(Default::default)
|
||||
.merge_from(other_metrics);
|
||||
}
|
||||
if let Some(other_metrics) = &other.bloom_filter_apply_metrics {
|
||||
self.bloom_filter_apply_metrics
|
||||
.get_or_insert_with(Default::default)
|
||||
.merge_from(other_metrics);
|
||||
}
|
||||
if let Some(other_metrics) = &other.fulltext_index_apply_metrics {
|
||||
self.fulltext_index_apply_metrics
|
||||
.get_or_insert_with(Default::default)
|
||||
.merge_from(other_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports metrics.
|
||||
@@ -1059,64 +987,6 @@ impl ReaderFilterMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for parquet metadata cache operations.
|
||||
#[derive(Default, Clone, Copy)]
|
||||
pub(crate) struct MetadataCacheMetrics {
|
||||
/// Number of memory cache hits for parquet metadata.
|
||||
pub(crate) mem_cache_hit: usize,
|
||||
/// Number of file cache hits for parquet metadata.
|
||||
pub(crate) file_cache_hit: usize,
|
||||
/// Number of cache misses for parquet metadata.
|
||||
pub(crate) cache_miss: usize,
|
||||
/// Duration to load parquet metadata.
|
||||
pub(crate) metadata_load_cost: Duration,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MetadataCacheMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
mem_cache_hit,
|
||||
file_cache_hit,
|
||||
cache_miss,
|
||||
metadata_load_cost,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"metadata_load_cost\":\"{:?}\"", metadata_load_cost)?;
|
||||
|
||||
if *mem_cache_hit > 0 {
|
||||
write!(f, ", \"mem_cache_hit\":{}", mem_cache_hit)?;
|
||||
}
|
||||
if *file_cache_hit > 0 {
|
||||
write!(f, ", \"file_cache_hit\":{}", file_cache_hit)?;
|
||||
}
|
||||
if *cache_miss > 0 {
|
||||
write!(f, ", \"cache_miss\":{}", cache_miss)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataCacheMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.metadata_load_cost.is_zero()
|
||||
}
|
||||
|
||||
/// Adds `other` metrics to this metrics.
|
||||
pub(crate) fn merge_from(&mut self, other: &MetadataCacheMetrics) {
|
||||
self.mem_cache_hit += other.mem_cache_hit;
|
||||
self.file_cache_hit += other.file_cache_hit;
|
||||
self.cache_miss += other.cache_miss;
|
||||
self.metadata_load_cost += other.metadata_load_cost;
|
||||
}
|
||||
}
|
||||
|
||||
/// Parquet reader metrics.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct ReaderMetrics {
|
||||
@@ -1132,10 +1002,6 @@ pub struct ReaderMetrics {
|
||||
pub(crate) num_batches: usize,
|
||||
/// Number of rows read.
|
||||
pub(crate) num_rows: usize,
|
||||
/// Metrics for parquet metadata cache.
|
||||
pub(crate) metadata_cache_metrics: MetadataCacheMetrics,
|
||||
/// Optional metrics for page/row group fetch operations.
|
||||
pub(crate) fetch_metrics: Option<Arc<ParquetFetchMetrics>>,
|
||||
}
|
||||
|
||||
impl ReaderMetrics {
|
||||
@@ -1147,15 +1013,6 @@ impl ReaderMetrics {
|
||||
self.num_record_batches += other.num_record_batches;
|
||||
self.num_batches += other.num_batches;
|
||||
self.num_rows += other.num_rows;
|
||||
self.metadata_cache_metrics
|
||||
.merge_from(&other.metadata_cache_metrics);
|
||||
if let Some(other_fetch) = &other.fetch_metrics {
|
||||
if let Some(self_fetch) = &self.fetch_metrics {
|
||||
self_fetch.merge_from(other_fetch);
|
||||
} else {
|
||||
self.fetch_metrics = Some(other_fetch.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports total rows.
|
||||
@@ -1210,10 +1067,7 @@ impl RowGroupReaderBuilder {
|
||||
&self,
|
||||
row_group_idx: usize,
|
||||
row_selection: Option<RowSelection>,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<ParquetRecordBatchReader> {
|
||||
let fetch_start = Instant::now();
|
||||
|
||||
let mut row_group = InMemoryRowGroup::create(
|
||||
self.file_handle.region_id(),
|
||||
self.file_handle.file_id().file_id(),
|
||||
@@ -1225,17 +1079,12 @@ impl RowGroupReaderBuilder {
|
||||
);
|
||||
// Fetches data into memory.
|
||||
row_group
|
||||
.fetch(&self.projection, row_selection.as_ref(), fetch_metrics)
|
||||
.fetch(&self.projection, row_selection.as_ref())
|
||||
.await
|
||||
.context(ReadParquetSnafu {
|
||||
path: &self.file_path,
|
||||
})?;
|
||||
|
||||
// Record total fetch elapsed time.
|
||||
if let Some(metrics) = fetch_metrics {
|
||||
metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed();
|
||||
}
|
||||
|
||||
// Builds the parquet reader.
|
||||
// Now the row selection is None.
|
||||
ParquetRecordBatchReader::try_new_with_row_groups(
|
||||
@@ -1379,8 +1228,6 @@ pub struct ParquetReader {
|
||||
selection: RowGroupSelection,
|
||||
/// Reader of current row group.
|
||||
reader_state: ReaderState,
|
||||
/// Metrics for tracking row group fetch operations.
|
||||
fetch_metrics: ParquetFetchMetrics,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -1400,11 +1247,7 @@ impl BatchReader for ParquetReader {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder()
|
||||
.build(
|
||||
row_group_idx,
|
||||
Some(row_selection),
|
||||
Some(&self.fetch_metrics),
|
||||
)
|
||||
.build(row_group_idx, Some(row_selection))
|
||||
.await?;
|
||||
|
||||
// Resets the parquet reader.
|
||||
@@ -1460,12 +1303,11 @@ impl ParquetReader {
|
||||
context: FileRangeContextRef,
|
||||
mut selection: RowGroupSelection,
|
||||
) -> Result<Self> {
|
||||
let fetch_metrics = ParquetFetchMetrics::default();
|
||||
// No more items in current row group, reads next row group.
|
||||
let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() {
|
||||
let parquet_reader = context
|
||||
.reader_builder()
|
||||
.build(row_group_idx, Some(row_selection), Some(&fetch_metrics))
|
||||
.build(row_group_idx, Some(row_selection))
|
||||
.await?;
|
||||
// Compute skip_fields once for this row group
|
||||
let skip_fields = context.should_skip_fields(row_group_idx);
|
||||
@@ -1482,7 +1324,6 @@ impl ParquetReader {
|
||||
context,
|
||||
selection,
|
||||
reader_state,
|
||||
fetch_metrics,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -35,175 +35,6 @@ use crate::cache::{CacheStrategy, PageKey, PageValue};
|
||||
use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
|
||||
use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges};
|
||||
|
||||
/// Inner data for ParquetFetchMetrics.
|
||||
#[derive(Default, Debug, Clone)]
|
||||
pub struct ParquetFetchMetricsData {
|
||||
/// Number of page cache hits.
|
||||
pub page_cache_hit: usize,
|
||||
/// Number of write cache hits.
|
||||
pub write_cache_hit: usize,
|
||||
/// Number of cache misses.
|
||||
pub cache_miss: usize,
|
||||
/// Number of pages to fetch from mem cache.
|
||||
pub pages_to_fetch_mem: usize,
|
||||
/// Total size in bytes of pages to fetch from mem cache.
|
||||
pub page_size_to_fetch_mem: u64,
|
||||
/// Number of pages to fetch from write cache.
|
||||
pub pages_to_fetch_write_cache: usize,
|
||||
/// Total size in bytes of pages to fetch from write cache.
|
||||
pub page_size_to_fetch_write_cache: u64,
|
||||
/// Number of pages to fetch from store.
|
||||
pub pages_to_fetch_store: usize,
|
||||
/// Total size in bytes of pages to fetch from store.
|
||||
pub page_size_to_fetch_store: u64,
|
||||
/// Total size in bytes of pages actually returned.
|
||||
pub page_size_needed: u64,
|
||||
/// Elapsed time fetching from write cache.
|
||||
pub write_cache_fetch_elapsed: std::time::Duration,
|
||||
/// Elapsed time fetching from object store.
|
||||
pub store_fetch_elapsed: std::time::Duration,
|
||||
/// Total elapsed time for fetching row groups.
|
||||
pub total_fetch_elapsed: std::time::Duration,
|
||||
}
|
||||
|
||||
impl ParquetFetchMetricsData {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
fn is_empty(&self) -> bool {
|
||||
self.total_fetch_elapsed.is_zero()
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for tracking page/row group fetch operations.
|
||||
#[derive(Default)]
|
||||
pub struct ParquetFetchMetrics {
|
||||
pub data: std::sync::Mutex<ParquetFetchMetricsData>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ParquetFetchMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let data = self.data.lock().unwrap();
|
||||
if data.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
|
||||
let ParquetFetchMetricsData {
|
||||
page_cache_hit,
|
||||
write_cache_hit,
|
||||
cache_miss,
|
||||
pages_to_fetch_mem,
|
||||
page_size_to_fetch_mem,
|
||||
pages_to_fetch_write_cache,
|
||||
page_size_to_fetch_write_cache,
|
||||
pages_to_fetch_store,
|
||||
page_size_to_fetch_store,
|
||||
page_size_needed,
|
||||
write_cache_fetch_elapsed,
|
||||
store_fetch_elapsed,
|
||||
total_fetch_elapsed,
|
||||
} = *data;
|
||||
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"total_fetch_elapsed\":\"{:?}\"", total_fetch_elapsed)?;
|
||||
|
||||
if page_cache_hit > 0 {
|
||||
write!(f, ", \"page_cache_hit\":{}", page_cache_hit)?;
|
||||
}
|
||||
if write_cache_hit > 0 {
|
||||
write!(f, ", \"write_cache_hit\":{}", write_cache_hit)?;
|
||||
}
|
||||
if cache_miss > 0 {
|
||||
write!(f, ", \"cache_miss\":{}", cache_miss)?;
|
||||
}
|
||||
if pages_to_fetch_mem > 0 {
|
||||
write!(f, ", \"pages_to_fetch_mem\":{}", pages_to_fetch_mem)?;
|
||||
}
|
||||
if page_size_to_fetch_mem > 0 {
|
||||
write!(f, ", \"page_size_to_fetch_mem\":{}", page_size_to_fetch_mem)?;
|
||||
}
|
||||
if pages_to_fetch_write_cache > 0 {
|
||||
write!(
|
||||
f,
|
||||
", \"pages_to_fetch_write_cache\":{}",
|
||||
pages_to_fetch_write_cache
|
||||
)?;
|
||||
}
|
||||
if page_size_to_fetch_write_cache > 0 {
|
||||
write!(
|
||||
f,
|
||||
", \"page_size_to_fetch_write_cache\":{}",
|
||||
page_size_to_fetch_write_cache
|
||||
)?;
|
||||
}
|
||||
if pages_to_fetch_store > 0 {
|
||||
write!(f, ", \"pages_to_fetch_store\":{}", pages_to_fetch_store)?;
|
||||
}
|
||||
if page_size_to_fetch_store > 0 {
|
||||
write!(
|
||||
f,
|
||||
", \"page_size_to_fetch_store\":{}",
|
||||
page_size_to_fetch_store
|
||||
)?;
|
||||
}
|
||||
if page_size_needed > 0 {
|
||||
write!(f, ", \"page_size_needed\":{}", page_size_needed)?;
|
||||
}
|
||||
if !write_cache_fetch_elapsed.is_zero() {
|
||||
write!(
|
||||
f,
|
||||
", \"write_cache_fetch_elapsed\":\"{:?}\"",
|
||||
write_cache_fetch_elapsed
|
||||
)?;
|
||||
}
|
||||
if !store_fetch_elapsed.is_zero() {
|
||||
write!(f, ", \"store_fetch_elapsed\":\"{:?}\"", store_fetch_elapsed)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl ParquetFetchMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.lock().unwrap().is_empty()
|
||||
}
|
||||
|
||||
/// Merges metrics from another [ParquetFetchMetrics].
|
||||
pub fn merge_from(&self, other: &ParquetFetchMetrics) {
|
||||
let ParquetFetchMetricsData {
|
||||
page_cache_hit,
|
||||
write_cache_hit,
|
||||
cache_miss,
|
||||
pages_to_fetch_mem,
|
||||
page_size_to_fetch_mem,
|
||||
pages_to_fetch_write_cache,
|
||||
page_size_to_fetch_write_cache,
|
||||
pages_to_fetch_store,
|
||||
page_size_to_fetch_store,
|
||||
page_size_needed,
|
||||
write_cache_fetch_elapsed,
|
||||
store_fetch_elapsed,
|
||||
total_fetch_elapsed,
|
||||
} = *other.data.lock().unwrap();
|
||||
|
||||
let mut data = self.data.lock().unwrap();
|
||||
data.page_cache_hit += page_cache_hit;
|
||||
data.write_cache_hit += write_cache_hit;
|
||||
data.cache_miss += cache_miss;
|
||||
data.pages_to_fetch_mem += pages_to_fetch_mem;
|
||||
data.page_size_to_fetch_mem += page_size_to_fetch_mem;
|
||||
data.pages_to_fetch_write_cache += pages_to_fetch_write_cache;
|
||||
data.page_size_to_fetch_write_cache += page_size_to_fetch_write_cache;
|
||||
data.pages_to_fetch_store += pages_to_fetch_store;
|
||||
data.page_size_to_fetch_store += page_size_to_fetch_store;
|
||||
data.page_size_needed += page_size_needed;
|
||||
data.write_cache_fetch_elapsed += write_cache_fetch_elapsed;
|
||||
data.store_fetch_elapsed += store_fetch_elapsed;
|
||||
data.total_fetch_elapsed += total_fetch_elapsed;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RowGroupBase<'a> {
|
||||
metadata: &'a RowGroupMetaData,
|
||||
pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>,
|
||||
@@ -413,14 +244,13 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
&mut self,
|
||||
projection: &ProjectionMask,
|
||||
selection: Option<&RowSelection>,
|
||||
metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<()> {
|
||||
if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) {
|
||||
let (fetch_ranges, page_start_offsets) =
|
||||
self.base
|
||||
.calc_sparse_read_ranges(projection, offset_index, selection);
|
||||
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges).await?;
|
||||
// Assign sparse chunk data to base.
|
||||
self.base
|
||||
.assign_sparse_chunk(projection, chunk_data, page_start_offsets);
|
||||
@@ -438,7 +268,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
}
|
||||
|
||||
// Fetch data with ranges
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges).await?;
|
||||
|
||||
// Assigns fetched data to base.
|
||||
self.base.assign_dense_chunk(projection, chunk_data);
|
||||
@@ -449,74 +279,31 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
|
||||
/// Try to fetch data from the memory cache or the WriteCache,
|
||||
/// if not in WriteCache, fetch data from object store directly.
|
||||
async fn fetch_bytes(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
async fn fetch_bytes(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
// Now fetch page timer includes the whole time to read pages.
|
||||
let _timer = READ_STAGE_FETCH_PAGES.start_timer();
|
||||
|
||||
let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec());
|
||||
if let Some(pages) = self.cache_strategy.get_pages(&page_key) {
|
||||
if let Some(metrics) = metrics {
|
||||
let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum();
|
||||
let mut metrics_data = metrics.data.lock().unwrap();
|
||||
metrics_data.page_cache_hit += 1;
|
||||
metrics_data.pages_to_fetch_mem += ranges.len();
|
||||
metrics_data.page_size_to_fetch_mem += total_size;
|
||||
metrics_data.page_size_needed += total_size;
|
||||
}
|
||||
return Ok(pages.compressed.clone());
|
||||
}
|
||||
|
||||
// Calculate total range size for metrics.
|
||||
let (total_range_size, unaligned_size) = compute_total_range_size(ranges);
|
||||
|
||||
let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet);
|
||||
let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now());
|
||||
let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await;
|
||||
let pages = match write_cache_result {
|
||||
Some(data) => {
|
||||
if let Some(metrics) = metrics {
|
||||
let elapsed = fetch_write_cache_start
|
||||
.map(|start| start.elapsed())
|
||||
.unwrap_or_default();
|
||||
let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
|
||||
let mut metrics_data = metrics.data.lock().unwrap();
|
||||
metrics_data.write_cache_fetch_elapsed += elapsed;
|
||||
metrics_data.write_cache_hit += 1;
|
||||
metrics_data.pages_to_fetch_write_cache += ranges.len();
|
||||
metrics_data.page_size_to_fetch_write_cache += unaligned_size;
|
||||
metrics_data.page_size_needed += range_size_needed;
|
||||
}
|
||||
data
|
||||
}
|
||||
let pages = match self.fetch_ranges_from_write_cache(key, ranges).await {
|
||||
Some(data) => data,
|
||||
None => {
|
||||
// Fetch data from object store.
|
||||
let _timer = READ_STAGE_ELAPSED
|
||||
.with_label_values(&["cache_miss_read"])
|
||||
.start_timer();
|
||||
|
||||
let start = metrics.map(|_| std::time::Instant::now());
|
||||
let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges)
|
||||
fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges)
|
||||
.await
|
||||
.map_err(|e| ParquetError::External(Box::new(e)))?;
|
||||
if let Some(metrics) = metrics {
|
||||
let elapsed = start.map(|start| start.elapsed()).unwrap_or_default();
|
||||
let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
|
||||
let mut metrics_data = metrics.data.lock().unwrap();
|
||||
metrics_data.store_fetch_elapsed += elapsed;
|
||||
metrics_data.cache_miss += 1;
|
||||
metrics_data.pages_to_fetch_store += ranges.len();
|
||||
metrics_data.page_size_to_fetch_store += unaligned_size;
|
||||
metrics_data.page_size_needed += range_size_needed;
|
||||
}
|
||||
data
|
||||
.map_err(|e| ParquetError::External(Box::new(e)))?
|
||||
}
|
||||
};
|
||||
|
||||
// Put pages back to the cache.
|
||||
let total_range_size = compute_total_range_size(ranges);
|
||||
let page_value = PageValue::new(pages.clone(), total_range_size);
|
||||
self.cache_strategy
|
||||
.put_pages(page_key, Arc::new(page_value));
|
||||
@@ -539,21 +326,17 @@ impl<'a> InMemoryRowGroup<'a> {
}

/// Computes the max possible buffer size to read the given `ranges`.
/// Returns (aligned_size, unaligned_size) where:
/// - aligned_size: total size aligned to pooled buffer size
/// - unaligned_size: actual total size without alignment
// See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192
fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
fn compute_total_range_size(ranges: &[Range<u64>]) -> u64 {
    if ranges.is_empty() {
        return (0, 0);
        return 0;
    }

    let gap = MERGE_GAP as u64;
    let mut sorted_ranges = ranges.to_vec();
    sorted_ranges.sort_unstable_by(|a, b| a.start.cmp(&b.start));

    let mut total_size_aligned = 0;
    let mut total_size_unaligned = 0;
    let mut total_size = 0;
    let mut cur = sorted_ranges[0].clone();

    for range in sorted_ranges.into_iter().skip(1) {
@@ -562,19 +345,15 @@ fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
            cur.end = cur.end.max(range.end);
        } else {
            // No overlap and the gap is too large, add current range to total and start a new one
            let range_size = cur.end - cur.start;
            total_size_aligned += align_to_pooled_buf_size(range_size);
            total_size_unaligned += range_size;
            total_size += align_to_pooled_buf_size(cur.end - cur.start);
            cur = range;
        }
    }

    // Add the last range
    let range_size = cur.end - cur.start;
    total_size_aligned += align_to_pooled_buf_size(range_size);
    total_size_unaligned += range_size;
    total_size += align_to_pooled_buf_size(cur.end - cur.start);

    (total_size_aligned, total_size_unaligned)
    total_size
}

/// Aligns the given size to the multiple of the pooled buffer size.

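The diff fragment above is not self-contained, so here is a runnable sketch of the same idea: merge sorted byte ranges whose gap is within a threshold, then report both the pool-aligned total and the exact total, mirroring the (aligned, unaligned) return value. The MERGE_GAP value, the 8 KiB pooled buffer size, and the merge condition are illustrative assumptions, not the crate's constants or exact logic.

// Sketch: gap-based range merging plus aligned/unaligned size accounting.
use std::ops::Range;

const MERGE_GAP: u64 = 512; // assumed gap threshold
const POOLED_BUF_SIZE: u64 = 8 * 1024; // assumed pooled buffer size

fn align_to_pooled_buf_size(size: u64) -> u64 {
    size.div_ceil(POOLED_BUF_SIZE) * POOLED_BUF_SIZE
}

fn total_range_sizes(ranges: &[Range<u64>]) -> (u64, u64) {
    if ranges.is_empty() {
        return (0, 0);
    }
    let mut sorted = ranges.to_vec();
    sorted.sort_unstable_by_key(|r| r.start);

    let (mut aligned, mut unaligned) = (0, 0);
    let mut cur = sorted[0].clone();
    for range in sorted.into_iter().skip(1) {
        if range.start <= cur.end + MERGE_GAP {
            // Close enough: extend the current merged range.
            cur.end = cur.end.max(range.end);
        } else {
            // Too far apart: account for the current range and start a new one.
            let size = cur.end - cur.start;
            aligned += align_to_pooled_buf_size(size);
            unaligned += size;
            cur = range;
        }
    }
    let size = cur.end - cur.start;
    aligned += align_to_pooled_buf_size(size);
    unaligned += size;
    (aligned, unaligned)
}

fn main() {
    // Two nearby ranges merge into one read; the distant one stays separate.
    let (aligned, unaligned) = total_range_sizes(&[0..100, 200..300, 10_000..10_050]);
    assert_eq!(unaligned, 300 + 50);
    assert_eq!(aligned, align_to_pooled_buf_size(300) + align_to_pooled_buf_size(50));
}
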
@@ -153,7 +153,7 @@ where
|
||||
metrics: &'a mut Metrics,
|
||||
) -> ParquetWriter<'a, F, I, P> {
|
||||
let init_file = FileId::random();
|
||||
let indexer = indexer_builder.build(init_file, 0).await;
|
||||
let indexer = indexer_builder.build(init_file).await;
|
||||
|
||||
ParquetWriter {
|
||||
path_provider,
|
||||
@@ -482,7 +482,7 @@ where
|
||||
.context(WriteParquetSnafu)?;
|
||||
self.writer = Some(arrow_writer);
|
||||
|
||||
let indexer = self.indexer_builder.build(self.current_file, 0).await;
|
||||
let indexer = self.indexer_builder.build(self.current_file).await;
|
||||
self.current_indexer = Some(indexer);
|
||||
|
||||
// safety: self.writer is assigned above
|
||||
|
||||
@@ -626,7 +626,6 @@ impl TestEnv {
|
||||
compress_type,
|
||||
checkpoint_distance,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
};
|
||||
|
||||
if let Some(metadata) = initial_metadata {
|
||||
@@ -657,7 +656,6 @@ impl TestEnv {
|
||||
None,
|
||||
self.puffin_manager.clone(),
|
||||
self.intermediate_manager.clone(),
|
||||
None, // manifest_cache
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -678,7 +676,6 @@ impl TestEnv {
|
||||
None,
|
||||
self.puffin_manager.clone(),
|
||||
self.intermediate_manager.clone(),
|
||||
ReadableSize::mb(0), // manifest_cache_capacity
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -1023,15 +1020,9 @@ pub struct MockWriteBufferManager {
|
||||
should_stall: AtomicBool,
|
||||
memory_used: AtomicUsize,
|
||||
memory_active: AtomicUsize,
|
||||
flush_limit: usize,
|
||||
}
|
||||
|
||||
impl MockWriteBufferManager {
|
||||
/// Set flush limit.
|
||||
pub fn set_flush_limit(&mut self, flush_limit: usize) {
|
||||
self.flush_limit = flush_limit;
|
||||
}
|
||||
|
||||
/// Set whether to flush the engine.
|
||||
pub fn set_should_flush(&self, value: bool) {
|
||||
self.should_flush.store(value, Ordering::Relaxed);
|
||||
@@ -1073,10 +1064,6 @@ impl WriteBufferManager for MockWriteBufferManager {
|
||||
fn memory_usage(&self) -> usize {
|
||||
self.memory_used.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn flush_limit(&self) -> usize {
|
||||
self.flush_limit
|
||||
}
|
||||
}
|
||||
|
||||
pub fn column_metadata_to_column_schema(metadata: &ColumnMetadata) -> api::v1::ColumnSchema {
|
||||
|
||||
@@ -132,7 +132,6 @@ impl SchedulerEnv {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
|
||||
@@ -126,7 +126,7 @@ pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64)
|
||||
available_indexes: Default::default(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
num_series: 0,
|
||||
|
||||
@@ -104,7 +104,7 @@ impl VersionControlBuilder {
|
||||
available_indexes: Default::default(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
num_series: 0,
|
||||
@@ -195,7 +195,7 @@ pub(crate) fn apply_edit(
|
||||
available_indexes: Default::default(),
|
||||
indexes: Default::default(),
|
||||
index_file_size: 0,
|
||||
index_version: 0,
|
||||
index_file_id: None,
|
||||
num_rows: 0,
|
||||
num_row_groups: 0,
|
||||
num_series: 0,
|
||||
|
||||
@@ -37,7 +37,6 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::Duration;
|
||||
|
||||
use common_base::Plugins;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::key::SchemaMetadataManagerRef;
|
||||
use common_runtime::JoinHandle;
|
||||
@@ -456,8 +455,6 @@ pub async fn write_cache_from_config(
|
||||
Some(config.index_cache_percent),
|
||||
puffin_manager_factory,
|
||||
intermediate_manager,
|
||||
// TODO(yingwen): Enable manifest cache after removing read cache.
|
||||
ReadableSize(0),
|
||||
)
|
||||
.await?;
|
||||
Ok(Some(Arc::new(cache)))
|
||||
|
||||
@@ -22,7 +22,6 @@ use common_telemetry::info;
|
||||
use common_telemetry::tracing::warn;
|
||||
use humantime_serde::re::humantime;
|
||||
use snafu::{ResultExt, ensure};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::metadata::{
|
||||
InvalidSetRegionOptionRequestSnafu, MetadataError, RegionMetadata, RegionMetadataBuilder,
|
||||
RegionMetadataRef,
|
||||
@@ -42,7 +41,7 @@ use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest};
|
||||
use crate::sst::FormatType;
|
||||
use crate::worker::RegionWorkerLoop;
|
||||
|
||||
impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
impl<S> RegionWorkerLoop<S> {
|
||||
pub(crate) async fn handle_alter_request(
|
||||
&mut self,
|
||||
region_id: RegionId,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.