mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2025-12-23 22:49:58 +00:00
Compare commits
2 Commits
docs/vecto
...
feature/df
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef80503454 | ||
|
|
30ca2d7652 |
64
AUTHOR.md
64
AUTHOR.md
@@ -2,41 +2,41 @@
|
||||
|
||||
## Individual Committers (in alphabetical order)
|
||||
|
||||
- [apdong2022](https://github.com/apdong2022)
|
||||
- [beryl678](https://github.com/beryl678)
|
||||
- [CookiePieWw](https://github.com/CookiePieWw)
|
||||
- [etolbakov](https://github.com/etolbakov)
|
||||
- [irenjj](https://github.com/irenjj)
|
||||
- [KKould](https://github.com/KKould)
|
||||
- [Lanqing Yang](https://github.com/lyang24)
|
||||
- [nicecui](https://github.com/nicecui)
|
||||
- [NiwakaDev](https://github.com/NiwakaDev)
|
||||
- [paomian](https://github.com/paomian)
|
||||
- [tisonkun](https://github.com/tisonkun)
|
||||
- [Wenjie0329](https://github.com/Wenjie0329)
|
||||
- [zhaoyingnan01](https://github.com/zhaoyingnan01)
|
||||
- [zhongzc](https://github.com/zhongzc)
|
||||
- [ZonaHex](https://github.com/ZonaHex)
|
||||
- [zyy17](https://github.com/zyy17)
|
||||
* [CookiePieWw](https://github.com/CookiePieWw)
|
||||
* [etolbakov](https://github.com/etolbakov)
|
||||
* [irenjj](https://github.com/irenjj)
|
||||
* [KKould](https://github.com/KKould)
|
||||
* [Lanqing Yang](https://github.com/lyang24)
|
||||
* [NiwakaDev](https://github.com/NiwakaDev)
|
||||
* [tisonkun](https://github.com/tisonkun)
|
||||
|
||||
## Team Members (in alphabetical order)
|
||||
|
||||
- [daviderli614](https://github.com/daviderli614)
|
||||
- [discord9](https://github.com/discord9)
|
||||
- [evenyag](https://github.com/evenyag)
|
||||
- [fengjiachun](https://github.com/fengjiachun)
|
||||
- [fengys1996](https://github.com/fengys1996)
|
||||
- [GrepTime](https://github.com/GrepTime)
|
||||
- [holalengyu](https://github.com/holalengyu)
|
||||
- [killme2008](https://github.com/killme2008)
|
||||
- [MichaelScofield](https://github.com/MichaelScofield)
|
||||
- [shuiyisong](https://github.com/shuiyisong)
|
||||
- [sunchanglong](https://github.com/sunchanglong)
|
||||
- [sunng87](https://github.com/sunng87)
|
||||
- [v0y4g3r](https://github.com/v0y4g3r)
|
||||
- [waynexia](https://github.com/waynexia)
|
||||
- [WenyXu](https://github.com/WenyXu)
|
||||
- [xtang](https://github.com/xtang)
|
||||
* [apdong2022](https://github.com/apdong2022)
|
||||
* [beryl678](https://github.com/beryl678)
|
||||
* [daviderli614](https://github.com/daviderli614)
|
||||
* [discord9](https://github.com/discord9)
|
||||
* [evenyag](https://github.com/evenyag)
|
||||
* [fengjiachun](https://github.com/fengjiachun)
|
||||
* [fengys1996](https://github.com/fengys1996)
|
||||
* [GrepTime](https://github.com/GrepTime)
|
||||
* [holalengyu](https://github.com/holalengyu)
|
||||
* [killme2008](https://github.com/killme2008)
|
||||
* [MichaelScofield](https://github.com/MichaelScofield)
|
||||
* [nicecui](https://github.com/nicecui)
|
||||
* [paomian](https://github.com/paomian)
|
||||
* [shuiyisong](https://github.com/shuiyisong)
|
||||
* [sunchanglong](https://github.com/sunchanglong)
|
||||
* [sunng87](https://github.com/sunng87)
|
||||
* [v0y4g3r](https://github.com/v0y4g3r)
|
||||
* [waynexia](https://github.com/waynexia)
|
||||
* [Wenjie0329](https://github.com/Wenjie0329)
|
||||
* [WenyXu](https://github.com/WenyXu)
|
||||
* [xtang](https://github.com/xtang)
|
||||
* [zhaoyingnan01](https://github.com/zhaoyingnan01)
|
||||
* [zhongzc](https://github.com/zhongzc)
|
||||
* [ZonaHex](https://github.com/ZonaHex)
|
||||
* [zyy17](https://github.com/zyy17)
|
||||
|
||||
## All Contributors
|
||||
|
||||
|
||||
68
Cargo.lock
generated
68
Cargo.lock
generated
@@ -3274,7 +3274,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-ipc",
|
||||
@@ -3329,7 +3329,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-catalog"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3353,7 +3353,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-catalog-listing"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3375,7 +3375,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-common"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"arrow",
|
||||
@@ -3398,7 +3398,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-common-runtime"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"log",
|
||||
@@ -3408,7 +3408,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-datasource"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-compression 0.4.19",
|
||||
@@ -3442,7 +3442,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-datasource-csv"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3464,7 +3464,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-datasource-json"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3485,7 +3485,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-datasource-parquet"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3514,12 +3514,12 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-doc"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
|
||||
[[package]]
|
||||
name = "datafusion-execution"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3538,7 +3538,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-expr"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3560,7 +3560,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-expr-common"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"datafusion-common",
|
||||
@@ -3572,7 +3572,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-buffer",
|
||||
@@ -3600,7 +3600,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions-aggregate"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"arrow",
|
||||
@@ -3620,7 +3620,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions-aggregate-common"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"arrow",
|
||||
@@ -3632,7 +3632,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions-nested"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-ord",
|
||||
@@ -3654,7 +3654,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions-table"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
@@ -3669,7 +3669,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions-window"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"datafusion-common",
|
||||
@@ -3686,7 +3686,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-functions-window-common"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"datafusion-common",
|
||||
"datafusion-physical-expr-common",
|
||||
@@ -3695,7 +3695,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-macros"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"datafusion-doc",
|
||||
"quote",
|
||||
@@ -3705,7 +3705,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-optimizer"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"chrono",
|
||||
@@ -3756,7 +3756,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-expr"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"arrow",
|
||||
@@ -3777,7 +3777,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-expr-adapter"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"datafusion-common",
|
||||
@@ -3791,7 +3791,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-expr-common"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"arrow",
|
||||
@@ -3804,7 +3804,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-optimizer"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"datafusion-common",
|
||||
@@ -3822,7 +3822,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-physical-plan"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"arrow",
|
||||
@@ -3852,7 +3852,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-pruning"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"datafusion-common",
|
||||
@@ -3868,7 +3868,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-session"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"datafusion-common",
|
||||
@@ -3881,7 +3881,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-sql"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"bigdecimal 0.4.8",
|
||||
@@ -3898,7 +3898,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "datafusion-substrait"
|
||||
version = "50.1.0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=fd4b2abcf3c3e43e94951bda452c9fd35243aab0#fd4b2abcf3c3e43e94951bda452c9fd35243aab0"
|
||||
source = "git+https://github.com/GreptimeTeam/datafusion.git?rev=7f8ea0a45748ed32695757368f847ab9ac7b6c82#7f8ea0a45748ed32695757368f847ab9ac7b6c82"
|
||||
dependencies = [
|
||||
"async-recursion",
|
||||
"async-trait",
|
||||
@@ -7514,11 +7514,9 @@ dependencies = [
|
||||
"common-test-util",
|
||||
"common-time",
|
||||
"common-wal",
|
||||
"criterion 0.4.0",
|
||||
"datafusion",
|
||||
"datatypes",
|
||||
"futures-util",
|
||||
"fxhash",
|
||||
"humantime-serde",
|
||||
"itertools 0.14.0",
|
||||
"lazy_static",
|
||||
@@ -9203,9 +9201,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pgwire"
|
||||
version = "0.36.3"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99"
|
||||
checksum = "d331bb0eef5bc83a221c0a85b1f205bccf094d4f72a26ae1d68a1b1c535123b7"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.22.1",
|
||||
|
||||
24
Cargo.toml
24
Cargo.toml
@@ -316,18 +316,18 @@ git = "https://github.com/GreptimeTeam/greptime-meter.git"
|
||||
rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"
|
||||
|
||||
[patch.crates-io]
|
||||
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7f8ea0a45748ed32695757368f847ab9ac7b6c82" }
|
||||
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
|
||||
|
||||
[profile.release]
|
||||
|
||||
@@ -294,6 +294,7 @@
|
||||
| `meta_client` | -- | -- | The metasrv client options. |
|
||||
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
|
||||
| `meta_client.timeout` | String | `3s` | Operation timeout. |
|
||||
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
|
||||
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
|
||||
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
|
||||
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
|
||||
@@ -456,6 +457,7 @@
|
||||
| `meta_client` | -- | -- | The metasrv client options. |
|
||||
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
|
||||
| `meta_client.timeout` | String | `3s` | Operation timeout. |
|
||||
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
|
||||
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
|
||||
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
|
||||
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
|
||||
@@ -627,6 +629,7 @@
|
||||
| `meta_client` | -- | -- | The metasrv client options. |
|
||||
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
|
||||
| `meta_client.timeout` | String | `3s` | Operation timeout. |
|
||||
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
|
||||
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
|
||||
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
|
||||
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
|
||||
|
||||
@@ -99,6 +99,9 @@ metasrv_addrs = ["127.0.0.1:3002"]
|
||||
## Operation timeout.
|
||||
timeout = "3s"
|
||||
|
||||
## Heartbeat timeout.
|
||||
heartbeat_timeout = "500ms"
|
||||
|
||||
## DDL timeout.
|
||||
ddl_timeout = "10s"
|
||||
|
||||
|
||||
@@ -78,6 +78,9 @@ metasrv_addrs = ["127.0.0.1:3002"]
|
||||
## Operation timeout.
|
||||
timeout = "3s"
|
||||
|
||||
## Heartbeat timeout.
|
||||
heartbeat_timeout = "500ms"
|
||||
|
||||
## DDL timeout.
|
||||
ddl_timeout = "10s"
|
||||
|
||||
|
||||
@@ -226,6 +226,9 @@ metasrv_addrs = ["127.0.0.1:3002"]
|
||||
## Operation timeout.
|
||||
timeout = "3s"
|
||||
|
||||
## Heartbeat timeout.
|
||||
heartbeat_timeout = "500ms"
|
||||
|
||||
## DDL timeout.
|
||||
ddl_timeout = "10s"
|
||||
|
||||
|
||||
@@ -1,94 +0,0 @@
|
||||
---
|
||||
Feature Name: Vector Index
|
||||
Tracking Issue: TBD
|
||||
Date: 2025-12-04
|
||||
Author: "TBD"
|
||||
---
|
||||
|
||||
# Summary
|
||||
Introduce a per-SST approximate nearest neighbor (ANN) index for `VECTOR(dim)` columns with a pluggable engine. USearch HNSW is the initial engine, while the design keeps VSAG (default when linked) and future engines selectable at DDL or alter time and encoded in the index metadata. The index is built alongside SST creation and accelerates `ORDER BY vec_*_distance(column, <literal vector>) LIMIT k` queries, falling back to the existing brute-force path when an index is unavailable or ineligible.
|
||||
|
||||
# Motivation
|
||||
Vector distances are currently computed with nalgebra across all rows (O(N)) before sorting, which does not scale to millions of vectors. An on-disk ANN index with sub-linear search reduces latency and compute cost for common RAG, semantic search, and recommendation workloads without changing SQL.
|
||||
|
||||
# Details
|
||||
|
||||
## Current Behavior
|
||||
`VECTOR(dim)` values are stored as binary blobs. Queries call `vec_cos_distance`/`vec_l2sq_distance`/`vec_dot_product` via nalgebra for every row and then sort; there is no indexing or caching.
|
||||
|
||||
## Index Eligibility and Configuration
|
||||
Only `VECTOR(dim)` columns can be indexed. A column metadata flag follows the existing column-option pattern with an intentionally small surface area:
|
||||
- `engine`: `vsag` (default when the binding is built) or `usearch`. If a configured engine is unavailable at runtime, the builder logs and falls back to `usearch` while leaving the option intact for future rebuilds.
|
||||
- `metric`: `cosine` (default), `l2sq`, or `dot`; mismatches with query functions force brute-force execution.
|
||||
- `m`: HNSW graph connectivity (higher = denser graph, more memory, better recall), default `16`.
|
||||
- `ef_construct`: build-time expansion, default `128`.
|
||||
- `ef_search`: query-time expansion, default `64`; engines may clamp values.
|
||||
|
||||
Option semantics mirror HNSW defaults so both USearch and VSAG can honor them; engine-specific tunables stay in reserved key-value pairs inside the blob header for forward compatibility.
|
||||
|
||||
DDL reuses column extensions similar to inverted/fulltext indexes:
|
||||
|
||||
```sql
|
||||
CREATE TABLE embeddings (
|
||||
ts TIMESTAMP TIME INDEX,
|
||||
id STRING PRIMARY KEY,
|
||||
vec VECTOR(384) VECTOR INDEX WITH (engine = 'vsag', metric = 'cosine', ef_search = 64)
|
||||
);
|
||||
```
|
||||
|
||||
Altering column options toggles the flag, can switch engines (for example `usearch` -> `vsag`), and triggers rebuilds through the existing alter/compaction flow. Engine choice stays in table metadata and each blob header; new SSTs use the configured engine while older SSTs remain readable under their recorded engine until compaction or a manual rebuild rewrites them.
|
||||
|
||||
## Storage and Format
|
||||
- One vector index per indexed column per SST, stored as a Puffin blob with type `greptime-vector-index-v1`.
|
||||
- Each blob records the engine (`usearch`, `vsag`, future values) and engine parameters in the header so readers can select the matching decoder. Mixed-engine SSTs remain readable because the engine id travels with the blob.
|
||||
- USearch uses `f32` vectors and SST row offsets (`u64`) as keys; nulls and `OpType::Delete` rows are skipped. Row ids are the absolute SST ordinal so readers can derive `RowSelection` directly from parquet row group lengths without extra side tables.
|
||||
- Blob layout:
|
||||
- Header: version, column id, dimension, engine id, metric, `m`, `ef_construct`, `ef_search`, and reserved engine-specific key-value pairs.
|
||||
- Counts: total rows written and indexed rows.
|
||||
- Payload: USearch binary produced by `save_to_buffer`.
|
||||
- An empty index (no eligible vectors) results in no available index entry for that column.
|
||||
- `puffin_manager` registers the blob type so caches and readers discover it alongside inverted/fulltext/bloom blobs in the same index file.
|
||||
|
||||
## Row Visibility and Duplicates
|
||||
- The indexer increments `row_offset` for every incoming row (including skipped/null/delete rows) so offsets stay aligned with parquet ordering across row groups.
|
||||
- Only `OpType::Put` rows with the expected dimension are inserted; `OpType::Delete` and malformed rows are skipped but still advance `row_offset`, matching the data plane’s visibility rules.
|
||||
- Multiple versions of the same primary key remain in the graph; the read path intersects search hits with the standard mito2 deduplication/visibility pipeline (sequence-aware dedup, delete filtering, projection) before returning results.
|
||||
- Searches overfetch beyond `k` to compensate for rows discarded by visibility checks and to avoid reissuing index reads.
|
||||
|

## Build Path (mito2 write)

Extend `sst::index::Indexer` to optionally create a `VectorIndexer` when region metadata marks a column as vector-indexed, mirroring how inverted/fulltext/bloom filters attach to `IndexerBuilderImpl` in `mito2`.

The indexer consumes `Batch`/`RecordBatch` data and shares memory tracking and abort semantics with existing indexers:

- Maintain a running `row_offset` that follows SST write order and spans row groups so the search result can be turned into `RowSelection`.
- For each `OpType::Put`, if the vector is non-null and matches the declared dimension, insert into USearch with `row_offset` as the key; otherwise skip.
- Track memory with existing index build metrics; on failure, abort only the vector index while keeping SST writing unaffected.

Engine selection is table-driven: the builder picks the configured engine (default `vsag`, fallback `usearch` if `vsag` is not compiled in) and dispatches to the matching implementation. Unknown engines skip index build with a warning.

On `finish`, serialize the engine-tagged index into the Puffin writer and record `IndexType::Vector` metadata for the column. `IndexOutput` and `FileMeta::indexes/available_indexes` gain a vector entry so manifest updates and `RegionVersion` surface per-column presence, following patterns used by inverted/fulltext/bloom indexes. Planner/metadata validation ensures that mismatched dimensions only reduce the indexed-row count and do not break reads.
## Read Path (mito2 query)
|
||||
A planner rule in `query` identifies eligible plans on mito2 tables: a single `ORDER BY vec_cos_distance|vec_l2sq_distance|vec_dot_product(<vector column>, <literal vector>)` in ascending order plus a `LIMIT`/`TopK`. The rule rejects plans with multiple sort keys, non-literal query vectors, or additional projections that would change the distance expression and falls back to brute-force in those cases.
|
||||
|
||||
For eligible scans, build a `VectorIndexScan` execution node that:
|
||||
- Consults SST metadata for `IndexType::Vector`, loads the index via Puffin using the existing `mito2::cache::index` infrastructure, and dispatches to the engine declared in the blob header (USearch/VSAG/etc.).
|
||||
- Runs the engine’s `search` with an overfetch (for example 2×k) to tolerate rows filtered by deletes, dimension mismatches, or late-stage dedup; keys already match SST row offsets produced by the writer.
|
||||
- Converts hits to `RowSelection` using parquet row group lengths and reuses the parquet reader so visibility, projection, and deduplication logic stay unchanged; distances are recomputed with `vec_*_distance` before the final trim to k to guarantee ordering and to merge distributed partial results deterministically.
|
||||
|
||||
Any unsupported shape, load error, or cache miss falls back to the current brute-force execution path.
|
||||
|
||||
## Lifecycle and Maintenance
|
||||
Lifecycle piggybacks on the existing SST/index flow: rebuilds run where other secondary indexes do, graphs are always rebuilt from source rows (no HNSW merge), and cleanup/versioning/caching reuse the existing Puffin and index cache paths.
|
||||
|
||||
# Implementation Plan
|
||||
1. Add the `usearch` dependency (wrapper module in `index` or `mito2`) and map minimal HNSW options; keep an engine trait that allows plugging VSAG without changing the rest of the pipeline.
|
||||
2. Introduce `IndexType::Vector` and a column metadata key for vector index options (including `engine`); add SQL parser and `SHOW CREATE TABLE` support for `VECTOR INDEX WITH (...)`.
|
||||
3. Implement `vector_index` build/read modules under `mito2` (and `index` if shared), including Puffin serialization that records engine id, blob-type registration with `puffin_manager`, and integration with the `Indexer` builder, `IndexOutput`, manifest updates, and compaction rebuild.
|
||||
4. Extend the query planner/execution to detect eligible plans and drive a `RowSelection`-based ANN scan with a fallback path, dispatching by engine at read time and using existing Puffin and index caches.
|
||||
5. Add unit tests for serialization/search correctness and an end-to-end test covering plan rewrite, cache usage, engine selection, and fallback; add a mixed-engine test to confirm old USearch blobs still serve after a VSAG switch.
|
||||
6. Follow up with an optional VSAG engine binding (feature flag), validate parity with USearch on dense vectors, exercise alternative algorithms (for example PQ), and flip the default `engine` to `vsag` when the binding is present.
|
||||
|
||||
# Alternatives
|
||||
- **VSAG (follow-up engine):** C++ library with HNSW and additional algorithms (for example SINDI for sparse vectors and PQ) targeting in-memory and disk-friendly search. Provides parameter generators and a roadmap for GPU-assisted build and graph compression. Compared to FAISS it is newer with fewer integrations but bundles sparse/dense coverage and out-of-core focus in one engine. Fits the pluggable-engine design and would become the default `engine = 'vsag'` when linked; USearch remains available for lighter dependencies.
|
||||
- **FAISS:** Broad feature set (IVF/IVFPQ/PQ/HNSW, GPU acceleration, scalar filtering, pre/post filters) and battle-tested performance across datasets, but it requires a heavier C++/GPU toolchain, has no official Rust binding, and is less disk-centric than VSAG; integrating it would add more build/distribution burden than USearch/VSAG.
|
||||
- **Do nothing:** Keep brute-force evaluation, which remains O(N) and unacceptable at scale.
|
||||
@@ -52,6 +52,7 @@ fn test_load_datanode_example_config() {
|
||||
meta_client: Some(MetaClientOptions {
|
||||
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
|
||||
timeout: Duration::from_secs(3),
|
||||
heartbeat_timeout: Duration::from_millis(500),
|
||||
ddl_timeout: Duration::from_secs(10),
|
||||
connect_timeout: Duration::from_secs(1),
|
||||
tcp_nodelay: true,
|
||||
@@ -117,6 +118,7 @@ fn test_load_frontend_example_config() {
|
||||
meta_client: Some(MetaClientOptions {
|
||||
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
|
||||
timeout: Duration::from_secs(3),
|
||||
heartbeat_timeout: Duration::from_millis(500),
|
||||
ddl_timeout: Duration::from_secs(10),
|
||||
connect_timeout: Duration::from_secs(1),
|
||||
tcp_nodelay: true,
|
||||
@@ -239,6 +241,7 @@ fn test_load_flownode_example_config() {
|
||||
meta_client: Some(MetaClientOptions {
|
||||
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
|
||||
timeout: Duration::from_secs(3),
|
||||
heartbeat_timeout: Duration::from_millis(500),
|
||||
ddl_timeout: Duration::from_secs(10),
|
||||
connect_timeout: Duration::from_secs(1),
|
||||
tcp_nodelay: true,
|
||||
|
||||
@@ -46,16 +46,13 @@ pub struct DoPutResponse {
|
||||
request_id: i64,
|
||||
/// The successfully ingested rows number.
|
||||
affected_rows: AffectedRows,
|
||||
/// The elapsed time in seconds for handling the bulk insert.
|
||||
elapsed_secs: f64,
|
||||
}
|
||||
|
||||
impl DoPutResponse {
|
||||
pub fn new(request_id: i64, affected_rows: AffectedRows, elapsed_secs: f64) -> Self {
|
||||
pub fn new(request_id: i64, affected_rows: AffectedRows) -> Self {
|
||||
Self {
|
||||
request_id,
|
||||
affected_rows,
|
||||
elapsed_secs,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,10 +63,6 @@ impl DoPutResponse {
|
||||
pub fn affected_rows(&self) -> AffectedRows {
|
||||
self.affected_rows
|
||||
}
|
||||
|
||||
pub fn elapsed_secs(&self) -> f64 {
|
||||
self.elapsed_secs
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<PutResult> for DoPutResponse {
|
||||
@@ -93,11 +86,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_serde_do_put_response() {
|
||||
let x = DoPutResponse::new(42, 88, 0.123);
|
||||
let x = DoPutResponse::new(42, 88);
|
||||
let serialized = serde_json::to_string(&x).unwrap();
|
||||
assert_eq!(
|
||||
serialized,
|
||||
r#"{"request_id":42,"affected_rows":88,"elapsed_secs":0.123}"#
|
||||
);
|
||||
assert_eq!(serialized, r#"{"request_id":42,"affected_rows":88}"#);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,17 +41,6 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30;
|
||||
/// In a lease, there are two opportunities for renewal.
|
||||
pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2;
|
||||
|
||||
/// The timeout of the heartbeat request.
|
||||
pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
|
||||
|
||||
/// The keep-alive interval of the heartbeat channel.
|
||||
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration =
|
||||
Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
|
||||
|
||||
/// The keep-alive timeout of the heartbeat channel.
|
||||
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration =
|
||||
Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
|
||||
|
||||
/// The default mailbox round-trip timeout.
|
||||
pub const MAILBOX_RTT_SECS: u64 = 1;
|
||||
|
||||
|
||||
@@ -1261,6 +1261,7 @@ impl RegionServerInner {
|
||||
.with_context(|_| HandleRegionRequestSnafu { region_id })?
|
||||
.new_opened_logical_region_ids()
|
||||
else {
|
||||
warn!("No new opened logical regions");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
|
||||
@@ -24,8 +24,8 @@ use common_query::Output;
|
||||
use common_runtime::Runtime;
|
||||
use common_runtime::runtime::{BuilderBuild, RuntimeTrait};
|
||||
use datafusion::catalog::TableFunction;
|
||||
use datafusion::dataframe::DataFrame;
|
||||
use datafusion_expr::{AggregateUDF, LogicalPlan};
|
||||
use query::dataframe::DataFrame;
|
||||
use query::planner::LogicalPlanner;
|
||||
use query::query_engine::{DescribeResult, QueryEngineState};
|
||||
use query::{QueryEngine, QueryEngineContext};
|
||||
|
||||
@@ -12,9 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use api::helper::from_pb_time_ranges;
|
||||
use api::v1::ddl_request::{Expr as DdlExpr, Expr};
|
||||
@@ -24,18 +22,16 @@ use api::v1::{
|
||||
DeleteRequests, DropFlowExpr, InsertIntoPlan, InsertRequests, RowDeleteRequests,
|
||||
RowInsertRequests,
|
||||
};
|
||||
use async_stream::try_stream;
|
||||
use async_trait::async_trait;
|
||||
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
|
||||
use common_base::AffectedRows;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_grpc::flight::do_put::DoPutResponse;
|
||||
use common_grpc::FlightData;
|
||||
use common_grpc::flight::FlightDecoder;
|
||||
use common_query::Output;
|
||||
use common_query::logical_plan::add_insert_to_logical_plan;
|
||||
use common_telemetry::tracing::{self};
|
||||
use datafusion::datasource::DefaultTableSource;
|
||||
use futures::Stream;
|
||||
use futures::stream::StreamExt;
|
||||
use query::parser::PromQuery;
|
||||
use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
|
||||
use servers::query_handler::grpc::GrpcQueryHandler;
|
||||
@@ -244,8 +240,10 @@ impl GrpcQueryHandler for Instance {
|
||||
|
||||
async fn put_record_batch(
|
||||
&self,
|
||||
request: servers::grpc::flight::PutRecordBatchRequest,
|
||||
table_name: &TableName,
|
||||
table_ref: &mut Option<TableRef>,
|
||||
decoder: &mut FlightDecoder,
|
||||
data: FlightData,
|
||||
ctx: QueryContextRef,
|
||||
) -> Result<AffectedRows> {
|
||||
let table = if let Some(table) = table_ref {
|
||||
@@ -254,15 +252,15 @@ impl GrpcQueryHandler for Instance {
|
||||
let table = self
|
||||
.catalog_manager()
|
||||
.table(
|
||||
&request.table_name.catalog_name,
|
||||
&request.table_name.schema_name,
|
||||
&request.table_name.table_name,
|
||||
&table_name.catalog_name,
|
||||
&table_name.schema_name,
|
||||
&table_name.table_name,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
table_name: request.table_name.to_string(),
|
||||
table_name: table_name.to_string(),
|
||||
})?;
|
||||
*table_ref = Some(table.clone());
|
||||
table
|
||||
@@ -281,77 +279,10 @@ impl GrpcQueryHandler for Instance {
|
||||
// do we check limit for bulk insert?
|
||||
|
||||
self.inserter
|
||||
.handle_bulk_insert(
|
||||
table,
|
||||
request.flight_data,
|
||||
request.record_batch,
|
||||
request.schema_bytes,
|
||||
)
|
||||
.handle_bulk_insert(table, decoder, data)
|
||||
.await
|
||||
.context(TableOperationSnafu)
|
||||
}
|
||||
|
||||
fn handle_put_record_batch_stream(
|
||||
&self,
|
||||
mut stream: servers::grpc::flight::PutRecordBatchRequestStream,
|
||||
ctx: QueryContextRef,
|
||||
) -> Pin<Box<dyn Stream<Item = Result<DoPutResponse>> + Send>> {
|
||||
// Resolve table once for the stream
|
||||
// Clone all necessary data to make it 'static
|
||||
let catalog_manager = self.catalog_manager().clone();
|
||||
let plugins = self.plugins.clone();
|
||||
let inserter = self.inserter.clone();
|
||||
let table_name = stream.table_name().clone();
|
||||
let ctx = ctx.clone();
|
||||
|
||||
Box::pin(try_stream! {
|
||||
plugins
|
||||
.get::<PermissionCheckerRef>()
|
||||
.as_ref()
|
||||
.check_permission(ctx.current_user(), PermissionReq::BulkInsert)
|
||||
.context(PermissionSnafu)?;
|
||||
// Cache for resolved table reference - resolve once and reuse
|
||||
let table_ref = catalog_manager
|
||||
.table(
|
||||
&table_name.catalog_name,
|
||||
&table_name.schema_name,
|
||||
&table_name.table_name,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
table_name: table_name.to_string(),
|
||||
})?;
|
||||
|
||||
// Check permissions once for the stream
|
||||
let interceptor_ref = plugins.get::<GrpcQueryInterceptorRef<Error>>();
|
||||
let interceptor = interceptor_ref.as_ref();
|
||||
interceptor.pre_bulk_insert(table_ref.clone(), ctx.clone())?;
|
||||
|
||||
// Process each request in the stream
|
||||
while let Some(request_result) = stream.next().await {
|
||||
let request = request_result.map_err(|e| {
|
||||
let error_msg = format!("Stream error: {:?}", e);
|
||||
IncompleteGrpcRequestSnafu { err_msg: error_msg }.build()
|
||||
})?;
|
||||
|
||||
let request_id = request.request_id;
|
||||
let start = Instant::now();
|
||||
let rows = inserter
|
||||
.handle_bulk_insert(
|
||||
table_ref.clone(),
|
||||
request.flight_data,
|
||||
request.record_batch,
|
||||
request.schema_bytes,
|
||||
)
|
||||
.await
|
||||
.context(TableOperationSnafu)?;
|
||||
let elapsed_secs = start.elapsed().as_secs_f64();
|
||||
yield DoPutResponse::new(request_id, rows, elapsed_secs);
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryContextRef) {
|
||||
|
||||
@@ -136,7 +136,7 @@ impl Instance {
|
||||
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
|
||||
})?;
|
||||
|
||||
let scan_plan = dataframe.into_unoptimized_plan();
|
||||
let scan_plan = dataframe.into_logical_plan();
|
||||
let filter_conditions =
|
||||
PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
|
||||
.context(PrometheusLabelValuesQueryPlanSnafu)?;
|
||||
|
||||
@@ -22,7 +22,6 @@ use common_telemetry::info;
|
||||
use meta_client::MetaClientOptions;
|
||||
use servers::error::Error as ServerError;
|
||||
use servers::grpc::builder::GrpcServerBuilder;
|
||||
use servers::grpc::flight::FlightCraftRef;
|
||||
use servers::grpc::frontend_grpc_handler::FrontendGrpcHandler;
|
||||
use servers::grpc::greptime_handler::GreptimeRequestHandler;
|
||||
use servers::grpc::{GrpcOptions, GrpcServer};
|
||||
@@ -53,7 +52,6 @@ where
|
||||
grpc_server_builder: Option<GrpcServerBuilder>,
|
||||
http_server_builder: Option<HttpServerBuilder>,
|
||||
plugins: Plugins,
|
||||
flight_handler: Option<FlightCraftRef>,
|
||||
}
|
||||
|
||||
impl<T> Services<T>
|
||||
@@ -67,7 +65,6 @@ where
|
||||
grpc_server_builder: None,
|
||||
http_server_builder: None,
|
||||
plugins,
|
||||
flight_handler: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -142,13 +139,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_flight_handler(self, flight_handler: FlightCraftRef) -> Self {
|
||||
Self {
|
||||
flight_handler: Some(flight_handler),
|
||||
..self
|
||||
}
|
||||
}
|
||||
|
||||
fn build_grpc_server(
|
||||
&mut self,
|
||||
grpc: &GrpcOptions,
|
||||
@@ -183,12 +173,6 @@ where
|
||||
grpc.flight_compression,
|
||||
);
|
||||
|
||||
// Use custom flight handler if provided, otherwise use the default GreptimeRequestHandler
|
||||
let flight_handler = self
|
||||
.flight_handler
|
||||
.clone()
|
||||
.unwrap_or_else(|| Arc::new(greptime_request_handler.clone()) as FlightCraftRef);
|
||||
|
||||
let grpc_server = builder
|
||||
.name(name)
|
||||
.database_handler(greptime_request_handler.clone())
|
||||
@@ -197,7 +181,7 @@ where
|
||||
self.instance.clone(),
|
||||
user_provider.clone(),
|
||||
))
|
||||
.flight_handler(flight_handler);
|
||||
.flight_handler(Arc::new(greptime_request_handler));
|
||||
|
||||
let grpc_server = if !external {
|
||||
let frontend_grpc_handler =
|
||||
|
||||
@@ -21,7 +21,7 @@ use itertools::Itertools;
|
||||
|
||||
use crate::Bytes;
|
||||
use crate::bloom_filter::error::Result;
|
||||
use crate::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};
|
||||
use crate::bloom_filter::reader::BloomFilterReader;
|
||||
|
||||
/// `InListPredicate` contains a list of acceptable values. A value needs to match at least
|
||||
/// one of the elements (logical OR semantic) for the predicate to be satisfied.
|
||||
@@ -38,7 +38,7 @@ pub struct BloomFilterApplier {
|
||||
|
||||
impl BloomFilterApplier {
|
||||
pub async fn new(reader: Box<dyn BloomFilterReader + Send>) -> Result<Self> {
|
||||
let meta = reader.metadata(None).await?;
|
||||
let meta = reader.metadata().await?;
|
||||
|
||||
Ok(Self { reader, meta })
|
||||
}
|
||||
@@ -50,7 +50,6 @@ impl BloomFilterApplier {
|
||||
&mut self,
|
||||
predicates: &[InListPredicate],
|
||||
search_ranges: &[Range<usize>],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Vec<Range<usize>>> {
|
||||
if predicates.is_empty() {
|
||||
// If no predicates, return empty result
|
||||
@@ -58,7 +57,7 @@ impl BloomFilterApplier {
|
||||
}
|
||||
|
||||
let segments = self.row_ranges_to_segments(search_ranges);
|
||||
let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments, metrics).await?;
|
||||
let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments).await?;
|
||||
let matching_row_ranges = self.find_matching_rows(seg_locations, bloom_filters, predicates);
|
||||
Ok(intersect_ranges(search_ranges, &matching_row_ranges))
|
||||
}
|
||||
@@ -96,7 +95,6 @@ impl BloomFilterApplier {
|
||||
async fn load_bloom_filters(
|
||||
&mut self,
|
||||
segments: &[usize],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<(Vec<(u64, usize)>, Vec<BloomFilter>)> {
|
||||
let segment_locations = segments
|
||||
.iter()
|
||||
@@ -110,10 +108,7 @@ impl BloomFilterApplier {
|
||||
.map(|i| self.meta.bloom_filter_locs[i as usize])
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let bloom_filters = self
|
||||
.reader
|
||||
.bloom_filter_vec(&bloom_filter_locs, metrics)
|
||||
.await?;
|
||||
let bloom_filters = self.reader.bloom_filter_vec(&bloom_filter_locs).await?;
|
||||
|
||||
Ok((segment_locations, bloom_filters))
|
||||
}
|
||||
@@ -427,10 +422,7 @@ mod tests {
|
||||
];
|
||||
|
||||
for (predicates, search_range, expected) in cases {
|
||||
let result = applier
|
||||
.search(&predicates, &[search_range], None)
|
||||
.await
|
||||
.unwrap();
|
||||
let result = applier.search(&predicates, &[search_range]).await.unwrap();
|
||||
assert_eq!(
|
||||
result, expected,
|
||||
"Expected {:?}, got {:?}",
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::ops::{Range, Rem};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytemuck::try_cast_slice;
|
||||
@@ -35,72 +34,6 @@ const BLOOM_META_LEN_SIZE: u64 = 4;
|
||||
/// Default prefetch size of bloom filter meta.
|
||||
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB
|
||||
|
||||
/// Metrics for bloom filter read operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct BloomFilterReadMetrics {
|
||||
/// Total byte size to read.
|
||||
pub total_bytes: u64,
|
||||
/// Total number of ranges to read.
|
||||
pub total_ranges: usize,
|
||||
/// Elapsed time to fetch data.
|
||||
pub fetch_elapsed: Duration,
|
||||
/// Number of cache hits.
|
||||
pub cache_hit: usize,
|
||||
/// Number of cache misses.
|
||||
pub cache_miss: usize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for BloomFilterReadMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
total_bytes,
|
||||
total_ranges,
|
||||
fetch_elapsed,
|
||||
cache_hit,
|
||||
cache_miss,
|
||||
} = self;
|
||||
|
||||
// If both total_bytes and cache_hit are 0, we didn't read anything.
|
||||
if *total_bytes == 0 && *cache_hit == 0 {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
if *total_bytes > 0 {
|
||||
write!(f, "\"total_bytes\":{}", total_bytes)?;
|
||||
}
|
||||
if *cache_hit > 0 {
|
||||
if *total_bytes > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
write!(f, "\"cache_hit\":{}", cache_hit)?;
|
||||
}
|
||||
|
||||
if *total_ranges > 0 {
|
||||
write!(f, ", \"total_ranges\":{}", total_ranges)?;
|
||||
}
|
||||
if !fetch_elapsed.is_zero() {
|
||||
write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
|
||||
}
|
||||
if *cache_miss > 0 {
|
||||
write!(f, ", \"cache_miss\":{}", cache_miss)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl BloomFilterReadMetrics {
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.total_bytes += other.total_bytes;
|
||||
self.total_ranges += other.total_ranges;
|
||||
self.fetch_elapsed += other.fetch_elapsed;
|
||||
self.cache_hit += other.cache_hit;
|
||||
self.cache_miss += other.cache_miss;
|
||||
}
|
||||
}
|
||||
|
||||
/// Safely converts bytes to Vec<u64> using bytemuck for optimal performance.
|
||||
/// Faster than chunking and converting each piece individually.
|
||||
///
|
||||
@@ -146,33 +79,25 @@ pub fn bytes_to_u64_vec(bytes: &Bytes) -> Vec<u64> {
|
||||
#[async_trait]
|
||||
pub trait BloomFilterReader: Sync {
|
||||
/// Reads range of bytes from the file.
|
||||
async fn range_read(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Bytes>;
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes>;
|
||||
|
||||
/// Reads bunch of ranges from the file.
|
||||
async fn read_vec(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Vec<Bytes>>;
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
let mut results = Vec::with_capacity(ranges.len());
|
||||
for range in ranges {
|
||||
let size = (range.end - range.start) as u32;
|
||||
let data = self.range_read(range.start, size).await?;
|
||||
results.push(data);
|
||||
}
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Reads the meta information of the bloom filter.
|
||||
async fn metadata(
|
||||
&self,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<BloomFilterMeta>;
|
||||
async fn metadata(&self) -> Result<BloomFilterMeta>;
|
||||
|
||||
/// Reads a bloom filter with the given location.
|
||||
async fn bloom_filter(
|
||||
&self,
|
||||
loc: &BloomFilterLoc,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<BloomFilter> {
|
||||
let bytes = self.range_read(loc.offset, loc.size as _, metrics).await?;
|
||||
async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result<BloomFilter> {
|
||||
let bytes = self.range_read(loc.offset, loc.size as _).await?;
|
||||
let vec = bytes_to_u64_vec(&bytes);
|
||||
let bm = BloomFilter::from_vec(vec)
|
||||
.seed(&SEED)
|
||||
@@ -180,16 +105,12 @@ pub trait BloomFilterReader: Sync {
|
||||
Ok(bm)
|
||||
}
|
||||
|
||||
async fn bloom_filter_vec(
|
||||
&self,
|
||||
locs: &[BloomFilterLoc],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Vec<BloomFilter>> {
|
||||
async fn bloom_filter_vec(&self, locs: &[BloomFilterLoc]) -> Result<Vec<BloomFilter>> {
|
||||
let ranges = locs
|
||||
.iter()
|
||||
.map(|l| l.offset..l.offset + l.size)
|
||||
.collect::<Vec<_>>();
|
||||
let bss = self.read_vec(&ranges, metrics).await?;
|
||||
let bss = self.read_vec(&ranges).await?;
|
||||
|
||||
let mut result = Vec::with_capacity(bss.len());
|
||||
for (bs, loc) in bss.into_iter().zip(locs.iter()) {
|
||||
@@ -219,59 +140,24 @@ impl<R: RangeReader> BloomFilterReaderImpl<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: RangeReader> BloomFilterReader for BloomFilterReaderImpl<R> {
|
||||
async fn range_read(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Bytes> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let result = self
|
||||
.reader
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
|
||||
self.reader
|
||||
.read(offset..offset + size as u64)
|
||||
.await
|
||||
.context(IoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_ranges += 1;
|
||||
m.total_bytes += size as u64;
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
.context(IoSnafu)
|
||||
}
|
||||
|
||||
async fn read_vec(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let result = self.reader.read_vec(ranges).await.context(IoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_ranges += ranges.len();
|
||||
m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
self.reader.read_vec(ranges).await.context(IoSnafu)
|
||||
}
|
||||
|
||||
async fn metadata(
|
||||
&self,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<BloomFilterMeta> {
|
||||
async fn metadata(&self) -> Result<BloomFilterMeta> {
|
||||
let metadata = self.reader.metadata().await.context(IoSnafu)?;
|
||||
let file_size = metadata.content_length;
|
||||
|
||||
let mut meta_reader =
|
||||
BloomFilterMetaReader::new(&self.reader, file_size, Some(DEFAULT_PREFETCH_SIZE));
|
||||
meta_reader.metadata(metrics).await
|
||||
meta_reader.metadata().await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,10 +183,7 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
|
||||
///
|
||||
/// It will first prefetch some bytes from the end of the file,
|
||||
/// then parse the metadata from the prefetch bytes.
|
||||
pub async fn metadata(
|
||||
&mut self,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<BloomFilterMeta> {
|
||||
pub async fn metadata(&mut self) -> Result<BloomFilterMeta> {
|
||||
ensure!(
|
||||
self.file_size >= BLOOM_META_LEN_SIZE,
|
||||
FileSizeTooSmallSnafu {
|
||||
@@ -308,7 +191,6 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
|
||||
}
|
||||
);
|
||||
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let meta_start = self.file_size.saturating_sub(self.prefetch_size);
|
||||
let suffix = self
|
||||
.reader
|
||||
@@ -326,28 +208,8 @@ impl<R: RangeReader> BloomFilterMetaReader<R> {
|
||||
.read(metadata_start..self.file_size - BLOOM_META_LEN_SIZE)
|
||||
.await
|
||||
.context(IoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
// suffix read + meta read
|
||||
m.total_ranges += 2;
|
||||
// Ignores the meta length size to simplify the calculation.
|
||||
m.total_bytes += self.file_size.min(self.prefetch_size) + length;
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
|
||||
} else {
|
||||
if let Some(m) = metrics {
|
||||
// suffix read only
|
||||
m.total_ranges += 1;
|
||||
m.total_bytes += self.file_size.min(self.prefetch_size);
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
let metadata_start = self.file_size - length - BLOOM_META_LEN_SIZE - meta_start;
|
||||
let meta = &suffix[metadata_start as usize..suffix_len - BLOOM_META_LEN_SIZE as usize];
|
||||
BloomFilterMeta::decode(meta).context(DecodeProtoSnafu)
|
||||
@@ -428,7 +290,7 @@ mod tests {
|
||||
for prefetch in [0u64, file_size / 2, file_size, file_size + 10] {
|
||||
let mut reader =
|
||||
BloomFilterMetaReader::new(bytes.clone(), file_size as _, Some(prefetch));
|
||||
let meta = reader.metadata(None).await.unwrap();
|
||||
let meta = reader.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(meta.rows_per_segment, 2);
|
||||
assert_eq!(meta.segment_count, 2);
|
||||
@@ -450,11 +312,11 @@ mod tests {
|
||||
let bytes = mock_bloom_filter_bytes().await;
|
||||
|
||||
let reader = BloomFilterReaderImpl::new(bytes);
|
||||
let meta = reader.metadata(None).await.unwrap();
|
||||
let meta = reader.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(meta.bloom_filter_locs.len(), 2);
|
||||
let bf = reader
|
||||
.bloom_filter(&meta.bloom_filter_locs[0], None)
|
||||
.bloom_filter(&meta.bloom_filter_locs[0])
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(bf.contains(&b"a"));
|
||||
@@ -463,7 +325,7 @@ mod tests {
|
||||
assert!(bf.contains(&b"d"));
|
||||
|
||||
let bf = reader
|
||||
.bloom_filter(&meta.bloom_filter_locs[1], None)
|
||||
.bloom_filter(&meta.bloom_filter_locs[1])
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(bf.contains(&b"e"));
|
||||
|
||||
@@ -74,7 +74,7 @@ async fn test_search(
|
||||
writer.finish().await.unwrap();
|
||||
|
||||
let reader = puffin_manager.reader(&file_name).await.unwrap();
|
||||
let (index_dir, _metrics) = reader.dir(&blob_key).await.unwrap();
|
||||
let index_dir = reader.dir(&blob_key).await.unwrap();
|
||||
let searcher = TantivyFulltextIndexSearcher::new(index_dir.path(), config).unwrap();
|
||||
for (query, expected) in query_expected {
|
||||
let results = searcher.search(query).await.unwrap();
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
@@ -30,115 +29,37 @@ pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
|
||||
mod blob;
|
||||
mod footer;
|
||||
|
||||
/// Metrics for inverted index read operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct InvertedIndexReadMetrics {
|
||||
/// Total byte size to read.
|
||||
pub total_bytes: u64,
|
||||
/// Total number of ranges to read.
|
||||
pub total_ranges: usize,
|
||||
/// Elapsed time to fetch data.
|
||||
pub fetch_elapsed: Duration,
|
||||
/// Number of cache hits.
|
||||
pub cache_hit: usize,
|
||||
/// Number of cache misses.
|
||||
pub cache_miss: usize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InvertedIndexReadMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
total_bytes,
|
||||
total_ranges,
|
||||
fetch_elapsed,
|
||||
cache_hit,
|
||||
cache_miss,
|
||||
} = self;
|
||||
|
||||
// If both total_bytes and cache_hit are 0, we didn't read anything.
|
||||
if *total_bytes == 0 && *cache_hit == 0 {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
if *total_bytes > 0 {
|
||||
write!(f, "\"total_bytes\":{}", total_bytes)?;
|
||||
}
|
||||
if *cache_hit > 0 {
|
||||
if *total_bytes > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
write!(f, "\"cache_hit\":{}", cache_hit)?;
|
||||
}
|
||||
|
||||
if *total_ranges > 0 {
|
||||
write!(f, ", \"total_ranges\":{}", total_ranges)?;
|
||||
}
|
||||
if !fetch_elapsed.is_zero() {
|
||||
write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?;
|
||||
}
|
||||
if *cache_miss > 0 {
|
||||
write!(f, ", \"cache_miss\":{}", cache_miss)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl InvertedIndexReadMetrics {
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.total_bytes += other.total_bytes;
|
||||
self.total_ranges += other.total_ranges;
|
||||
self.fetch_elapsed += other.fetch_elapsed;
|
||||
self.cache_hit += other.cache_hit;
|
||||
self.cache_miss += other.cache_miss;
|
||||
}
|
||||
}
|
||||
|
||||
/// InvertedIndexReader defines an asynchronous reader of inverted index data
|
||||
#[mockall::automock]
|
||||
#[async_trait]
|
||||
pub trait InvertedIndexReader: Send + Sync {
|
||||
/// Seeks to given offset and reads data with exact size as provided.
|
||||
async fn range_read<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<u8>>;
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>>;
|
||||
|
||||
/// Reads the bytes in the given ranges.
|
||||
async fn read_vec<'a>(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<Bytes>>;
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
let mut result = Vec::with_capacity(ranges.len());
|
||||
for range in ranges {
|
||||
let data = self
|
||||
.range_read(range.start, (range.end - range.start) as u32)
|
||||
.await?;
|
||||
result.push(Bytes::from(data));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Retrieves metadata of all inverted indices stored within the blob.
|
||||
async fn metadata<'a>(
|
||||
&self,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Arc<InvertedIndexMetas>>;
|
||||
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>>;
|
||||
|
||||
/// Retrieves the finite state transducer (FST) map from the given offset and size.
|
||||
async fn fst<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<FstMap> {
|
||||
let fst_data = self.range_read(offset, size, metrics).await?;
|
||||
async fn fst(&self, offset: u64, size: u32) -> Result<FstMap> {
|
||||
let fst_data = self.range_read(offset, size).await?;
|
||||
FstMap::new(fst_data).context(DecodeFstSnafu)
|
||||
}
|
||||
|
||||
/// Retrieves the multiple finite state transducer (FST) maps from the given ranges.
|
||||
async fn fst_vec<'a>(
|
||||
&mut self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<FstMap>> {
|
||||
self.read_vec(ranges, metrics)
|
||||
async fn fst_vec(&mut self, ranges: &[Range<u64>]) -> Result<Vec<FstMap>> {
|
||||
self.read_vec(ranges)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|bytes| FstMap::new(bytes.to_vec()).context(DecodeFstSnafu))
|
||||
@@ -146,28 +67,19 @@ pub trait InvertedIndexReader: Send + Sync {
|
||||
}
|
||||
|
||||
/// Retrieves the bitmap from the given offset and size.
|
||||
async fn bitmap<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
bitmap_type: BitmapType,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Bitmap> {
|
||||
self.range_read(offset, size, metrics)
|
||||
.await
|
||||
.and_then(|bytes| {
|
||||
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
|
||||
})
|
||||
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
|
||||
self.range_read(offset, size).await.and_then(|bytes| {
|
||||
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
|
||||
})
|
||||
}
|
||||
|
||||
/// Retrieves the multiple bitmaps from the given ranges.
|
||||
async fn bitmap_deque<'a>(
|
||||
async fn bitmap_deque(
|
||||
&mut self,
|
||||
ranges: &[(Range<u64>, BitmapType)],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<VecDeque<Bitmap>> {
|
||||
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
|
||||
let bytes = self.read_vec(&ranges, metrics).await?;
|
||||
let bytes = self.read_vec(&ranges).await?;
|
||||
bytes
|
||||
.into_iter()
|
||||
.zip(types)
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
@@ -24,10 +23,10 @@ use snafu::{ResultExt, ensure};
|
||||
|
||||
use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu};
|
||||
use crate::inverted_index::format::MIN_BLOB_SIZE;
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
use crate::inverted_index::format::reader::footer::{
|
||||
DEFAULT_PREFETCH_SIZE, InvertedIndexFooterReader,
|
||||
};
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
|
||||
/// Inverted index blob reader, implements [`InvertedIndexReader`]
|
||||
pub struct InvertedIndexBlobReader<R> {
|
||||
@@ -54,58 +53,27 @@ impl<R> InvertedIndexBlobReader<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
|
||||
async fn range_read<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<u8>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>> {
|
||||
let buf = self
|
||||
.source
|
||||
.read(offset..offset + size as u64)
|
||||
.await
|
||||
.context(CommonIoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += size as u64;
|
||||
m.total_ranges += 1;
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(buf.into())
|
||||
}
|
||||
|
||||
async fn read_vec<'a>(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
let result = self.source.read_vec(ranges).await.context(CommonIoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::<u64>();
|
||||
m.total_ranges += ranges.len();
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
self.source.read_vec(ranges).await.context(CommonIoSnafu)
|
||||
}
|
||||
|
||||
async fn metadata<'a>(
|
||||
&self,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Arc<InvertedIndexMetas>> {
|
||||
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>> {
|
||||
let metadata = self.source.metadata().await.context(CommonIoSnafu)?;
|
||||
let blob_size = metadata.content_length;
|
||||
Self::validate_blob_size(blob_size)?;
|
||||
|
||||
let mut footer_reader = InvertedIndexFooterReader::new(&self.source, blob_size)
|
||||
.with_prefetch_size(DEFAULT_PREFETCH_SIZE);
|
||||
footer_reader.metadata(metrics).await.map(Arc::new)
|
||||
footer_reader.metadata().await.map(Arc::new)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -205,7 +173,7 @@ mod tests {
|
||||
let blob = create_inverted_index_blob();
|
||||
let blob_reader = InvertedIndexBlobReader::new(blob);
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
assert_eq!(metas.metas.len(), 2);
|
||||
|
||||
let meta0 = metas.metas.get("tag0").unwrap();
|
||||
@@ -232,14 +200,13 @@ mod tests {
|
||||
let blob = create_inverted_index_blob();
|
||||
let blob_reader = InvertedIndexBlobReader::new(blob);
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag0").unwrap();
|
||||
|
||||
let fst_map = blob_reader
|
||||
.fst(
|
||||
meta.base_offset + meta.relative_fst_offset as u64,
|
||||
meta.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -252,7 +219,6 @@ mod tests {
|
||||
.fst(
|
||||
meta.base_offset + meta.relative_fst_offset as u64,
|
||||
meta.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -266,30 +232,30 @@ mod tests {
|
||||
let blob = create_inverted_index_blob();
|
||||
let blob_reader = InvertedIndexBlobReader::new(blob);
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag0").unwrap();
|
||||
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
|
||||
let metas = blob_reader.metadata(None).await.unwrap();
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag1").unwrap();
|
||||
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None)
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
|
||||
use prost::Message;
|
||||
@@ -25,7 +23,6 @@ use crate::inverted_index::error::{
|
||||
UnexpectedZeroSegmentRowCountSnafu,
|
||||
};
|
||||
use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE;
|
||||
use crate::inverted_index::format::reader::InvertedIndexReadMetrics;
|
||||
|
||||
pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB
|
||||
|
||||
@@ -57,17 +54,12 @@ impl<R> InvertedIndexFooterReader<R> {
|
||||
}
|
||||
|
||||
impl<R: RangeReader> InvertedIndexFooterReader<R> {
|
||||
pub async fn metadata(
|
||||
&mut self,
|
||||
mut metrics: Option<&mut InvertedIndexReadMetrics>,
|
||||
) -> Result<InvertedIndexMetas> {
|
||||
pub async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
|
||||
ensure!(
|
||||
self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE,
|
||||
BlobSizeTooSmallSnafu
|
||||
);
|
||||
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
let footer_start = self.blob_size.saturating_sub(self.prefetch_size());
|
||||
let suffix = self
|
||||
.source
|
||||
@@ -81,36 +73,19 @@ impl<R: RangeReader> InvertedIndexFooterReader<R> {
|
||||
let footer_size = FOOTER_PAYLOAD_SIZE_SIZE;
|
||||
|
||||
// Did not fetch the entire file metadata in the initial read, need to make a second request.
|
||||
let result = if length > suffix_len as u64 - footer_size {
|
||||
if length > suffix_len as u64 - footer_size {
|
||||
let metadata_start = self.blob_size - length - footer_size;
|
||||
let meta = self
|
||||
.source
|
||||
.read(metadata_start..self.blob_size - footer_size)
|
||||
.await
|
||||
.context(CommonIoSnafu)?;
|
||||
|
||||
if let Some(m) = metrics.as_deref_mut() {
|
||||
m.total_bytes += self.blob_size.min(self.prefetch_size()) + length;
|
||||
m.total_ranges += 2;
|
||||
}
|
||||
|
||||
self.parse_payload(&meta, length)
|
||||
} else {
|
||||
if let Some(m) = metrics.as_deref_mut() {
|
||||
m.total_bytes += self.blob_size.min(self.prefetch_size());
|
||||
m.total_ranges += 1;
|
||||
}
|
||||
|
||||
let metadata_start = self.blob_size - length - footer_size - footer_start;
|
||||
let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize];
|
||||
self.parse_payload(meta, length)
|
||||
};
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> {
|
||||
@@ -211,7 +186,7 @@ mod tests {
|
||||
reader = reader.with_prefetch_size(prefetch);
|
||||
}
|
||||
|
||||
let metas = reader.metadata(None).await.unwrap();
|
||||
let metas = reader.metadata().await.unwrap();
|
||||
assert_eq!(metas.metas.len(), 1);
|
||||
let index_meta = &metas.metas.get("test").unwrap();
|
||||
assert_eq!(index_meta.name, "test");
|
||||
@@ -235,7 +210,7 @@ mod tests {
|
||||
reader = reader.with_prefetch_size(prefetch);
|
||||
}
|
||||
|
||||
let result = reader.metadata(None).await;
|
||||
let result = reader.metadata().await;
|
||||
assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. }));
|
||||
}
|
||||
}
|
||||
@@ -258,7 +233,7 @@ mod tests {
|
||||
reader = reader.with_prefetch_size(prefetch);
|
||||
}
|
||||
|
||||
let result = reader.metadata(None).await;
|
||||
let result = reader.metadata().await;
|
||||
assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. }));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let reader = InvertedIndexBlobReader::new(blob);
|
||||
let metadata = reader.metadata(None).await.unwrap();
|
||||
let metadata = reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 0);
|
||||
@@ -182,7 +182,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let reader = InvertedIndexBlobReader::new(blob);
|
||||
let metadata = reader.metadata(None).await.unwrap();
|
||||
let metadata = reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 2);
|
||||
@@ -198,19 +198,13 @@ mod tests {
|
||||
.fst(
|
||||
tag0.base_offset + tag0.relative_fst_offset as u64,
|
||||
tag0.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -219,12 +213,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -233,12 +222,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -257,19 +241,13 @@ mod tests {
|
||||
.fst(
|
||||
tag1.base_offset + tag1.relative_fst_offset as u64,
|
||||
tag1.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -278,12 +256,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -292,12 +265,7 @@ mod tests {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -16,7 +16,7 @@ use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
|
||||
/// `ParallelFstValuesMapper` enables parallel mapping of multiple FST value groups to their
|
||||
/// corresponding bitmaps within an inverted index.
|
||||
@@ -35,8 +35,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
|
||||
pub async fn map_values_vec(
|
||||
&mut self,
|
||||
value_and_meta_vec: &[(Vec<u64>, &InvertedIndexMeta)],
|
||||
metrics: Option<&mut InvertedIndexReadMetrics>,
|
||||
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
|
||||
) -> Result<Vec<Bitmap>> {
|
||||
let groups = value_and_meta_vec
|
||||
.iter()
|
||||
@@ -65,7 +64,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
}
|
||||
|
||||
common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
|
||||
let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges, metrics).await?;
|
||||
let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges).await?;
|
||||
let mut output = Vec::with_capacity(groups.len());
|
||||
|
||||
for counter in groups {
|
||||
@@ -96,25 +95,23 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_map_values_vec() {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_bitmap_deque()
|
||||
.returning(|ranges, _metrics| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
mock_reader.expect_bitmap_deque().returning(|ranges| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
|
||||
let meta = InvertedIndexMeta {
|
||||
bitmap_type: BitmapType::Roaring.into(),
|
||||
@@ -123,13 +120,13 @@ mod tests {
|
||||
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![], &meta)], None)
|
||||
.map_values_vec(&[(vec![], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(result[0].count_ones(), 0);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(1, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -138,7 +135,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -147,7 +144,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -156,7 +153,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)], None)
|
||||
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -165,10 +162,7 @@ mod tests {
|
||||
);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(
|
||||
&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)],
|
||||
None,
|
||||
)
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -180,13 +174,10 @@ mod tests {
|
||||
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
|
||||
);
|
||||
let result = values_mapper
|
||||
.map_values_vec(
|
||||
&[
|
||||
(vec![value(2, 1), value(1, 1)], &meta),
|
||||
(vec![value(1, 1)], &meta),
|
||||
],
|
||||
None,
|
||||
)
|
||||
.map_values_vec(&[
|
||||
(vec![value(2, 1), value(1, 1)], &meta),
|
||||
(vec![value(1, 1)], &meta),
|
||||
])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
|
||||
@@ -19,7 +19,7 @@ pub use predicates_apply::PredicatesIndexApplier;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
|
||||
/// The output of an apply operation.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
@@ -44,11 +44,10 @@ pub trait IndexApplier: Send + Sync {
|
||||
/// Applies the predefined predicates to the data read by the given index reader, returning
|
||||
/// a list of relevant indices (e.g., post IDs, group IDs, row IDs).
|
||||
#[allow(unused_parens)]
|
||||
async fn apply<'a, 'b>(
|
||||
async fn apply<'a>(
|
||||
&self,
|
||||
context: SearchContext,
|
||||
reader: &mut (dyn InvertedIndexReader + 'a),
|
||||
metrics: Option<&'b mut InvertedIndexReadMetrics>,
|
||||
) -> Result<ApplyOutput>;
|
||||
|
||||
/// Returns the memory usage of the applier.
|
||||
|
||||
@@ -19,7 +19,7 @@ use greptime_proto::v1::index::InvertedIndexMetas;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
|
||||
use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
use crate::inverted_index::search::fst_apply::{
|
||||
FstApplier, IntersectionFstApplier, KeysFstApplier,
|
||||
};
|
||||
@@ -43,14 +43,12 @@ pub struct PredicatesIndexApplier {
|
||||
impl IndexApplier for PredicatesIndexApplier {
|
||||
/// Applies all `FstApplier`s to the data in the inverted index reader, intersecting the individual
|
||||
/// bitmaps obtained for each index to result in a final set of indices.
|
||||
async fn apply<'a, 'b>(
|
||||
async fn apply<'a>(
|
||||
&self,
|
||||
context: SearchContext,
|
||||
reader: &mut (dyn InvertedIndexReader + 'a),
|
||||
metrics: Option<&'b mut InvertedIndexReadMetrics>,
|
||||
) -> Result<ApplyOutput> {
|
||||
let mut metrics = metrics;
|
||||
let metadata = reader.metadata(metrics.as_deref_mut()).await?;
|
||||
let metadata = reader.metadata().await?;
|
||||
let mut output = ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
total_row_count: metadata.total_row_count as _,
|
||||
@@ -86,7 +84,7 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
return Ok(output);
|
||||
}
|
||||
|
||||
let fsts = reader.fst_vec(&fst_ranges, metrics.as_deref_mut()).await?;
|
||||
let fsts = reader.fst_vec(&fst_ranges).await?;
|
||||
let value_and_meta_vec = fsts
|
||||
.into_iter()
|
||||
.zip(appliers)
|
||||
@@ -94,7 +92,7 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut mapper = ParallelFstValuesMapper::new(reader);
|
||||
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?;
|
||||
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
|
||||
|
||||
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
|
||||
for bm in bm_vec {
|
||||
@@ -223,28 +221,26 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_ranges, _metrics| {
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_ranges| {
|
||||
Ok(vec![
|
||||
FstMap::from_iter([(b"tag-0_value-0", fst_value(2, 1))]).unwrap(),
|
||||
])
|
||||
});
|
||||
|
||||
mock_reader
|
||||
.expect_bitmap_deque()
|
||||
.returning(|arg, _metrics| {
|
||||
assert_eq!(arg.len(), 1);
|
||||
let range = &arg[0].0;
|
||||
let bitmap_type = arg[0].1;
|
||||
assert_eq!(*range, 2..3);
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
|
||||
&[0b10101010],
|
||||
bitmap_type,
|
||||
)]))
|
||||
});
|
||||
mock_reader.expect_bitmap_deque().returning(|arg| {
|
||||
assert_eq!(arg.len(), 1);
|
||||
let range = &arg[0].0;
|
||||
let bitmap_type = arg[0].1;
|
||||
assert_eq!(*range, 2..3);
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
|
||||
&[0b10101010],
|
||||
bitmap_type,
|
||||
)]))
|
||||
});
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -256,14 +252,14 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_range, _metrics| {
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0)])));
|
||||
mock_reader.expect_fst_vec().returning(|_range| {
|
||||
Ok(vec![
|
||||
FstMap::from_iter([(b"tag-0_value-1", fst_value(2, 1))]).unwrap(),
|
||||
])
|
||||
});
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(output.matched_segment_ids.count_ones(), 0);
|
||||
@@ -283,8 +279,8 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
|
||||
mock_reader.expect_fst_vec().returning(|ranges, _metrics| {
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)])));
|
||||
mock_reader.expect_fst_vec().returning(|ranges| {
|
||||
let mut output = vec![];
|
||||
for range in ranges {
|
||||
match range.start {
|
||||
@@ -297,29 +293,27 @@ mod tests {
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
mock_reader
|
||||
.expect_bitmap_deque()
|
||||
.returning(|ranges, _metrics| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
mock_reader.expect_bitmap_deque().returning(|ranges| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
});
|
||||
Ok(output)
|
||||
});
|
||||
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -337,10 +331,10 @@ mod tests {
|
||||
let mut mock_reader: MockInvertedIndexReader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas([("tag-0", 0)])));
|
||||
.returning(|| Ok(mock_metas([("tag-0", 0)])));
|
||||
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
|
||||
@@ -349,7 +343,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_index_applier_with_empty_index() {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader.expect_metadata().returning(move |_| {
|
||||
mock_reader.expect_metadata().returning(move || {
|
||||
Ok(Arc::new(InvertedIndexMetas {
|
||||
total_row_count: 0, // No rows
|
||||
segment_row_count: 1,
|
||||
@@ -365,7 +359,7 @@ mod tests {
|
||||
};
|
||||
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader, None)
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(output.matched_segment_ids.is_empty());
|
||||
@@ -376,7 +370,7 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader
|
||||
.expect_metadata()
|
||||
.returning(|_| Ok(mock_metas(vec![])));
|
||||
.returning(|| Ok(mock_metas(vec![])));
|
||||
|
||||
let mut mock_fst_applier = MockFstApplier::new();
|
||||
mock_fst_applier.expect_apply().never();
|
||||
@@ -391,7 +385,6 @@ mod tests {
|
||||
index_not_found_strategy: IndexNotFoundStrategy::ThrowError,
|
||||
},
|
||||
&mut mock_reader,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
assert!(matches!(result, Err(Error::IndexNotFound { .. })));
|
||||
@@ -402,7 +395,6 @@ mod tests {
|
||||
index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
|
||||
},
|
||||
&mut mock_reader,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -414,7 +406,6 @@ mod tests {
|
||||
index_not_found_strategy: IndexNotFoundStrategy::Ignore,
|
||||
},
|
||||
&mut mock_reader,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -189,9 +189,6 @@ impl MetaClientBuilder {
|
||||
let mgr = client.channel_manager.clone();
|
||||
|
||||
if self.enable_heartbeat {
|
||||
if self.heartbeat_channel_manager.is_some() {
|
||||
info!("Enable heartbeat channel using the heartbeat channel manager.");
|
||||
}
|
||||
let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone());
|
||||
client.heartbeat = Some(HeartbeatClient::new(
|
||||
self.id,
|
||||
|
||||
@@ -24,7 +24,7 @@ use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
|
||||
use common_telemetry::tracing_context::TracingContext;
|
||||
use common_telemetry::warn;
|
||||
use rand::seq::SliceRandom;
|
||||
use snafu::ResultExt;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use tokio::time::timeout;
|
||||
use tonic::transport::Channel;
|
||||
|
||||
@@ -101,14 +101,12 @@ impl AskLeader {
|
||||
};
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(peers.len());
|
||||
let channel_manager = self.channel_manager.clone();
|
||||
|
||||
for addr in &peers {
|
||||
let mut client = self.create_asker(addr)?;
|
||||
let tx_clone = tx.clone();
|
||||
let req = req.clone();
|
||||
let addr = addr.clone();
|
||||
let channel_manager = channel_manager.clone();
|
||||
tokio::spawn(async move {
|
||||
match client.ask_leader(req).await {
|
||||
Ok(res) => {
|
||||
@@ -119,19 +117,13 @@ impl AskLeader {
|
||||
};
|
||||
}
|
||||
Err(status) => {
|
||||
// Reset cached channel even on generic errors: the VIP may keep us on a dead
|
||||
// backend, so forcing a reconnect gives us a chance to hit a healthy peer.
|
||||
Self::reset_channels_with_manager(
|
||||
&channel_manager,
|
||||
std::slice::from_ref(&addr),
|
||||
);
|
||||
warn!("Failed to ask leader from: {addr}, {status}");
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let leader = match timeout(
|
||||
let leader = timeout(
|
||||
self.channel_manager
|
||||
.config()
|
||||
.timeout
|
||||
@@ -139,16 +131,8 @@ impl AskLeader {
|
||||
rx.recv(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Some(leader)) => leader,
|
||||
Ok(None) => return error::NoLeaderSnafu.fail(),
|
||||
Err(e) => {
|
||||
// All peers timed out. Reset channels to force reconnection,
|
||||
// which may help escape dead backends in VIP/LB scenarios.
|
||||
Self::reset_channels_with_manager(&self.channel_manager, &peers);
|
||||
return Err(e).context(error::AskLeaderTimeoutSnafu);
|
||||
}
|
||||
};
|
||||
.context(error::AskLeaderTimeoutSnafu)?
|
||||
.context(error::NoLeaderSnafu)?;
|
||||
|
||||
let mut leadership_group = self.leadership_group.write().unwrap();
|
||||
leadership_group.leader = Some(leader.clone());
|
||||
@@ -185,15 +169,6 @@ impl AskLeader {
|
||||
.context(error::CreateChannelSnafu)?,
|
||||
))
|
||||
}
|
||||
|
||||
/// Drop cached channels for the given peers so a fresh connection is used next time.
|
||||
fn reset_channels_with_manager(channel_manager: &ChannelManager, peers: &[String]) {
|
||||
if peers.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
channel_manager.retain_channel(|addr, _| !peers.iter().any(|peer| peer == addr));
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -18,10 +18,6 @@ use std::time::Duration;
|
||||
use client::RegionFollowerClientRef;
|
||||
use common_base::Plugins;
|
||||
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
|
||||
use common_meta::distributed_time_constants::{
|
||||
HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS,
|
||||
HEARTBEAT_TIMEOUT,
|
||||
};
|
||||
use common_telemetry::{debug, info};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -38,6 +34,8 @@ pub struct MetaClientOptions {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heartbeat_timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub ddl_timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub connect_timeout: Duration,
|
||||
@@ -54,6 +52,7 @@ impl Default for MetaClientOptions {
|
||||
Self {
|
||||
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
|
||||
timeout: Duration::from_millis(3_000u64),
|
||||
heartbeat_timeout: Duration::from_millis(500u64),
|
||||
ddl_timeout: Duration::from_millis(10_000u64),
|
||||
connect_timeout: Duration::from_millis(1_000u64),
|
||||
tcp_nodelay: true,
|
||||
@@ -98,11 +97,7 @@ pub async fn create_meta_client(
|
||||
.timeout(meta_client_options.timeout)
|
||||
.connect_timeout(meta_client_options.connect_timeout)
|
||||
.tcp_nodelay(meta_client_options.tcp_nodelay);
|
||||
let heartbeat_config = base_config
|
||||
.clone()
|
||||
.timeout(HEARTBEAT_TIMEOUT)
|
||||
.http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS)
|
||||
.http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS);
|
||||
let heartbeat_config = base_config.clone();
|
||||
|
||||
if let MetaClientType::Frontend = client_type {
|
||||
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
|
||||
|
||||
@@ -23,8 +23,6 @@ use store_api::storage::RegionId;
|
||||
mod candidate;
|
||||
mod ctx;
|
||||
mod handler;
|
||||
#[cfg(test)]
|
||||
mod mock;
|
||||
mod options;
|
||||
mod procedure;
|
||||
mod scheduler;
|
||||
|
||||
@@ -1,458 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
mod basic;
|
||||
mod candidate_select;
|
||||
mod con;
|
||||
mod config;
|
||||
mod err_handle;
|
||||
mod full_list;
|
||||
mod integration;
|
||||
mod misc;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_meta::datanode::{RegionManifestInfo, RegionStat};
|
||||
use common_meta::key::table_route::PhysicalTableRouteValue;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::router::{Region, RegionRoute};
|
||||
use common_telemetry::debug;
|
||||
use ordered_float::OrderedFloat;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileRefsManifest, GcReport, RegionId};
|
||||
use table::metadata::TableId;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::error::{Result, UnexpectedSnafu};
|
||||
use crate::gc::candidate::GcCandidate;
|
||||
use crate::gc::ctx::SchedulerCtx;
|
||||
use crate::gc::handler::Region2Peers;
|
||||
use crate::gc::options::GcSchedulerOptions;
|
||||
use crate::gc::scheduler::{Event, GcScheduler};
|
||||
|
||||
pub const TEST_REGION_SIZE_200MB: u64 = 200_000_000;
|
||||
|
||||
/// Helper function to create an empty GcReport for the given region IDs
|
||||
pub fn new_empty_report_with(region_ids: impl IntoIterator<Item = RegionId>) -> GcReport {
|
||||
let mut deleted_files = HashMap::new();
|
||||
for region_id in region_ids {
|
||||
deleted_files.insert(region_id, vec![]);
|
||||
}
|
||||
GcReport {
|
||||
deleted_files,
|
||||
need_retry_regions: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[derive(Debug, Default)]
|
||||
pub struct MockSchedulerCtx {
|
||||
pub table_to_region_stats: Arc<Mutex<Option<HashMap<TableId, Vec<RegionStat>>>>>,
|
||||
pub table_routes: Arc<Mutex<HashMap<TableId, (TableId, PhysicalTableRouteValue)>>>,
|
||||
pub file_refs: Arc<Mutex<Option<FileRefsManifest>>>,
|
||||
pub gc_reports: Arc<Mutex<HashMap<RegionId, GcReport>>>,
|
||||
pub candidates: Arc<Mutex<Option<HashMap<TableId, Vec<GcCandidate>>>>>,
|
||||
pub get_table_to_region_stats_calls: Arc<Mutex<usize>>,
|
||||
pub get_file_references_calls: Arc<Mutex<usize>>,
|
||||
pub gc_regions_calls: Arc<Mutex<usize>>,
|
||||
// Error injection fields for testing
|
||||
pub get_table_to_region_stats_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
pub get_table_route_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
pub get_file_references_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
pub gc_regions_error: Arc<Mutex<Option<crate::error::Error>>>,
|
||||
// Retry testing fields
|
||||
pub gc_regions_retry_count: Arc<Mutex<HashMap<RegionId, usize>>>,
|
||||
pub gc_regions_error_sequence: Arc<Mutex<Vec<crate::error::Error>>>,
|
||||
pub gc_regions_success_after_retries: Arc<Mutex<HashMap<RegionId, usize>>>,
|
||||
// Per-region error injection
|
||||
pub gc_regions_per_region_errors: Arc<Mutex<HashMap<RegionId, crate::error::Error>>>,
|
||||
}
|
||||
|
||||
impl MockSchedulerCtx {
|
||||
pub fn with_table_routes(
|
||||
self,
|
||||
table_routes: HashMap<TableId, (TableId, Vec<(RegionId, Peer)>)>,
|
||||
) -> Self {
|
||||
*self.table_routes.lock().unwrap() = table_routes
|
||||
.into_iter()
|
||||
.map(|(k, (phy_id, region2peer))| {
|
||||
let phy = PhysicalTableRouteValue::new(
|
||||
region2peer
|
||||
.into_iter()
|
||||
.map(|(region_id, peer)| RegionRoute {
|
||||
region: Region::new_test(region_id),
|
||||
leader_peer: Some(peer),
|
||||
..Default::default()
|
||||
})
|
||||
.collect(),
|
||||
);
|
||||
|
||||
(k, (phy_id, phy))
|
||||
})
|
||||
.collect();
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `get_table_to_region_stats`
|
||||
#[allow(dead_code)]
|
||||
pub fn with_get_table_to_region_stats_error(self, error: crate::error::Error) -> Self {
|
||||
*self.get_table_to_region_stats_error.lock().unwrap() = Some(error);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `get_table_route`
|
||||
pub fn set_table_route_error(&self, error: crate::error::Error) {
|
||||
*self.get_table_route_error.lock().unwrap() = Some(error);
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `get_file_references`
|
||||
#[allow(dead_code)]
|
||||
pub fn with_get_file_references_error(self, error: crate::error::Error) -> Self {
|
||||
*self.get_file_references_error.lock().unwrap() = Some(error);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set an error to be returned by `gc_regions`
|
||||
pub fn with_gc_regions_error(self, error: crate::error::Error) -> Self {
|
||||
*self.gc_regions_error.lock().unwrap() = Some(error);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set a sequence of errors to be returned by `gc_regions` for retry testing
|
||||
pub fn set_gc_regions_error_sequence(&self, errors: Vec<crate::error::Error>) {
|
||||
*self.gc_regions_error_sequence.lock().unwrap() = errors;
|
||||
}
|
||||
|
||||
/// Set success after a specific number of retries for a region
|
||||
pub fn set_gc_regions_success_after_retries(&self, region_id: RegionId, retries: usize) {
|
||||
self.gc_regions_success_after_retries
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(region_id, retries);
|
||||
}
|
||||
|
||||
/// Get the retry count for a specific region
|
||||
pub fn get_retry_count(&self, region_id: RegionId) -> usize {
|
||||
self.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(®ion_id)
|
||||
.copied()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Reset all retry tracking
|
||||
pub fn reset_retry_tracking(&self) {
|
||||
*self.gc_regions_retry_count.lock().unwrap() = HashMap::new();
|
||||
*self.gc_regions_error_sequence.lock().unwrap() = Vec::new();
|
||||
*self.gc_regions_success_after_retries.lock().unwrap() = HashMap::new();
|
||||
}
|
||||
|
||||
/// Set an error to be returned for a specific region
|
||||
pub fn set_gc_regions_error_for_region(&self, region_id: RegionId, error: crate::error::Error) {
|
||||
self.gc_regions_per_region_errors
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(region_id, error);
|
||||
}
|
||||
|
||||
/// Clear per-region errors
|
||||
#[allow(unused)]
|
||||
pub fn clear_gc_regions_per_region_errors(&self) {
|
||||
self.gc_regions_per_region_errors.lock().unwrap().clear();
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SchedulerCtx for MockSchedulerCtx {
|
||||
async fn get_table_to_region_stats(&self) -> Result<HashMap<TableId, Vec<RegionStat>>> {
|
||||
*self.get_table_to_region_stats_calls.lock().unwrap() += 1;
|
||||
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.get_table_to_region_stats_error.lock().unwrap().take() {
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
Ok(self
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
async fn get_table_route(
|
||||
&self,
|
||||
table_id: TableId,
|
||||
) -> Result<(TableId, PhysicalTableRouteValue)> {
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.get_table_route_error.lock().unwrap().take() {
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
Ok(self
|
||||
.table_routes
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&table_id)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| (table_id, PhysicalTableRouteValue::default())))
|
||||
}
|
||||
|
||||
async fn get_file_references(
|
||||
&self,
|
||||
query_regions: &[RegionId],
|
||||
_related_regions: HashMap<RegionId, Vec<RegionId>>,
|
||||
region_to_peer: &Region2Peers,
|
||||
_timeout: Duration,
|
||||
) -> Result<FileRefsManifest> {
|
||||
*self.get_file_references_calls.lock().unwrap() += 1;
|
||||
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.get_file_references_error.lock().unwrap().take() {
|
||||
return Err(error);
|
||||
}
|
||||
if query_regions
|
||||
.iter()
|
||||
.any(|region_id| !region_to_peer.contains_key(region_id))
|
||||
{
|
||||
UnexpectedSnafu {
|
||||
violated: format!(
|
||||
"region_to_peer{region_to_peer:?} does not contain all region_ids requested: {:?}",
|
||||
query_regions
|
||||
),
|
||||
}.fail()?;
|
||||
}
|
||||
|
||||
Ok(self.file_refs.lock().unwrap().clone().unwrap_or_default())
|
||||
}
|
||||
|
||||
async fn gc_regions(
|
||||
&self,
|
||||
_peer: Peer,
|
||||
region_ids: &[RegionId],
|
||||
_file_refs_manifest: &FileRefsManifest,
|
||||
_full_file_listing: bool,
|
||||
_timeout: Duration,
|
||||
) -> Result<GcReport> {
|
||||
*self.gc_regions_calls.lock().unwrap() += 1;
|
||||
|
||||
// Check per-region error injection first (for any region)
|
||||
for ®ion_id in region_ids {
|
||||
if let Some(error) = self
|
||||
.gc_regions_per_region_errors
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(®ion_id)
|
||||
{
|
||||
*self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.entry(region_id)
|
||||
.or_insert(0) += 1;
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we should return an injected error
|
||||
if let Some(error) = self.gc_regions_error.lock().unwrap().take() {
|
||||
for region_id in region_ids {
|
||||
*self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.entry(*region_id)
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
return Err(error);
|
||||
}
|
||||
|
||||
// Handle error sequence for retry testing
|
||||
{
|
||||
let mut error_sequence = self.gc_regions_error_sequence.lock().unwrap();
|
||||
if !error_sequence.is_empty() {
|
||||
let error = error_sequence.remove(0);
|
||||
for region_id in region_ids {
|
||||
*self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.entry(*region_id)
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
return Err(error);
|
||||
}
|
||||
}
|
||||
|
||||
// Build the final report by processing each region individually
|
||||
let mut final_report = GcReport::default();
|
||||
let gc_reports = self.gc_reports.lock().unwrap();
|
||||
let success_after_retries = self.gc_regions_success_after_retries.lock().unwrap();
|
||||
|
||||
for ®ion_id in region_ids {
|
||||
// Get current retry count for this region
|
||||
let retry_count = self
|
||||
.gc_regions_retry_count
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(®ion_id)
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
|
||||
// Check if this region should succeed or need retry
|
||||
if let Some(&required_retries) = success_after_retries.get(®ion_id) {
|
||||
if retry_count < required_retries {
|
||||
debug!(
|
||||
"Region {} needs retry (attempt {}/{})",
|
||||
region_id,
|
||||
retry_count + 1,
|
||||
required_retries
|
||||
);
|
||||
// This region needs more retries - add to need_retry_regions
|
||||
final_report.need_retry_regions.insert(region_id);
|
||||
// Track the retry attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
} else {
|
||||
debug!(
|
||||
"Region {} has completed retries - succeeding now",
|
||||
region_id
|
||||
);
|
||||
// This region has completed all required retries - succeed
|
||||
if let Some(report) = gc_reports.get(®ion_id) {
|
||||
final_report.merge(report.clone());
|
||||
}
|
||||
// Track the success attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
}
|
||||
} else {
|
||||
// No retry requirement - check if we have a GC report for this region
|
||||
if let Some(report) = gc_reports.get(®ion_id) {
|
||||
// We have a GC report - succeed immediately
|
||||
final_report.merge(report.clone());
|
||||
// Track the success attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
} else {
|
||||
// No GC report available - this region should be marked for retry
|
||||
final_report.need_retry_regions.insert(region_id);
|
||||
// Track the attempt
|
||||
let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap();
|
||||
*retry_count_map.entry(region_id).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the report with need_retry_regions populated - let the caller handle retry logic
|
||||
Ok(final_report)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TestEnv {
|
||||
pub scheduler: GcScheduler,
|
||||
pub ctx: Arc<MockSchedulerCtx>,
|
||||
#[allow(dead_code)]
|
||||
tx: Sender<Event>,
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
impl TestEnv {
|
||||
pub fn new() -> Self {
|
||||
let ctx = Arc::new(MockSchedulerCtx::default());
|
||||
let (tx, rx) = GcScheduler::channel();
|
||||
let config = GcSchedulerOptions::default();
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: rx,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
Self { scheduler, ctx, tx }
|
||||
}
|
||||
|
||||
pub fn with_candidates(self, candidates: HashMap<TableId, Vec<GcCandidate>>) -> Self {
|
||||
*self.ctx.candidates.lock().unwrap() = Some(candidates);
|
||||
self
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn run_scheduler(mut self) {
|
||||
self.scheduler.run().await;
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn tick(&self) {
|
||||
self.tx.send(Event::Tick).await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a mock GC candidate that will pass the GC threshold
|
||||
fn new_candidate(region_id: RegionId, score: f64) -> GcCandidate {
|
||||
// will pass threshold for gc
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, 10_000, 10);
|
||||
|
||||
GcCandidate {
|
||||
region_id,
|
||||
score: OrderedFloat(score),
|
||||
region_stat,
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a mock GC candidate
|
||||
fn mock_candidate(region_id: RegionId) -> GcCandidate {
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10);
|
||||
GcCandidate {
|
||||
region_id,
|
||||
score: ordered_float::OrderedFloat(1.0),
|
||||
region_stat,
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a mock RegionStat
|
||||
fn mock_region_stat(
|
||||
id: RegionId,
|
||||
role: RegionRole,
|
||||
approximate_bytes: u64,
|
||||
sst_num: u64,
|
||||
) -> RegionStat {
|
||||
RegionStat {
|
||||
id,
|
||||
role,
|
||||
approximate_bytes,
|
||||
sst_num,
|
||||
region_manifest: RegionManifestInfo::Mito {
|
||||
manifest_version: 0,
|
||||
flushed_entry_id: 0,
|
||||
file_removed_cnt: 0,
|
||||
},
|
||||
rcus: 0,
|
||||
wcus: 0,
|
||||
engine: "mito".to_string(),
|
||||
num_rows: 0,
|
||||
memtable_size: 0,
|
||||
manifest_size: 0,
|
||||
sst_size: 0,
|
||||
index_size: 0,
|
||||
data_topic_latest_entry_id: 0,
|
||||
metadata_topic_latest_entry_id: 0,
|
||||
written_bytes: 0,
|
||||
}
|
||||
}
|
||||
@@ -1,164 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use common_meta::peer::Peer;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{
|
||||
MockSchedulerCtx, TEST_REGION_SIZE_200MB, TestEnv, mock_region_stat, new_candidate,
|
||||
};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parallel_process_datanodes_empty() {
|
||||
let env = TestEnv::new();
|
||||
let report = env
|
||||
.scheduler
|
||||
.parallel_process_datanodes(HashMap::new())
|
||||
.await;
|
||||
|
||||
assert_eq!(report.per_datanode_reports.len(), 0);
|
||||
assert_eq!(report.failed_datanodes.len(), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parallel_process_datanodes_with_candidates() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);
|
||||
|
||||
let mut gc_reports = HashMap::new();
|
||||
let deleted_files = vec![FileId::random()];
|
||||
gc_reports.insert(
|
||||
region_id,
|
||||
GcReport {
|
||||
deleted_files: HashMap::from([(region_id, deleted_files.clone())]),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
let ctx = MockSchedulerCtx {
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer.clone())]),
|
||||
)]));
|
||||
|
||||
let env = TestEnv::new();
|
||||
// We need to replace the ctx with the one with gc_reports
|
||||
let mut scheduler = env.scheduler;
|
||||
scheduler.ctx = Arc::new(ctx);
|
||||
|
||||
// Convert table-based candidates to datanode-based candidates
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer,
|
||||
candidates
|
||||
.into_iter()
|
||||
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
|
||||
.collect(),
|
||||
)]);
|
||||
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
assert_eq!(report.per_datanode_reports.len(), 1);
|
||||
assert_eq!(report.failed_datanodes.len(), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_handle_tick() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);
|
||||
|
||||
let mut gc_reports = HashMap::new();
|
||||
gc_reports.insert(region_id, GcReport::default());
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(HashMap::from([(
|
||||
table_id,
|
||||
vec![mock_region_stat(
|
||||
region_id,
|
||||
RegionRole::Leader,
|
||||
TEST_REGION_SIZE_200MB,
|
||||
10,
|
||||
)],
|
||||
)])))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
candidates: Arc::new(Mutex::new(Some(candidates))),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer)]),
|
||||
)])),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let report = scheduler.handle_tick().await.unwrap();
|
||||
|
||||
// Validate the returned GcJobReport
|
||||
assert_eq!(
|
||||
report.per_datanode_reports.len(),
|
||||
1,
|
||||
"Should process 1 datanode"
|
||||
);
|
||||
assert_eq!(
|
||||
report.failed_datanodes.len(),
|
||||
0,
|
||||
"Should have 0 failed datanodes"
|
||||
);
|
||||
|
||||
assert_eq!(*ctx.get_table_to_region_stats_calls.lock().unwrap(), 1);
|
||||
assert_eq!(*ctx.get_file_references_calls.lock().unwrap(), 1);
|
||||
assert_eq!(*ctx.gc_regions_calls.lock().unwrap(), 1);
|
||||
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
assert!(
|
||||
tracker.contains_key(®ion_id),
|
||||
"Tracker should have one region: {:?}",
|
||||
tracker
|
||||
);
|
||||
}
|
||||
@@ -1,390 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use common_meta::datanode::RegionManifestInfo;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
/// Candidate Selection Tests
|
||||
#[tokio::test]
|
||||
async fn test_gc_candidate_filtering_by_role() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let leader_region = RegionId::new(table_id, 1);
|
||||
let follower_region = RegionId::new(table_id, 2);
|
||||
|
||||
let mut leader_stat = mock_region_stat(
|
||||
leader_region,
|
||||
RegionRole::Leader,
|
||||
TEST_REGION_SIZE_200MB,
|
||||
10,
|
||||
); // 200MB
|
||||
|
||||
let mut follower_stat = mock_region_stat(
|
||||
follower_region,
|
||||
RegionRole::Follower,
|
||||
TEST_REGION_SIZE_200MB,
|
||||
10,
|
||||
); // 200MB
|
||||
|
||||
// Set up manifest info for scoring
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut leader_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut follower_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![leader_stat.clone(), follower_stat.clone()])]);
|
||||
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
|
||||
|
||||
// Should only select leader regions
|
||||
assert_eq!(
|
||||
candidates.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates, got {}",
|
||||
candidates.len()
|
||||
);
|
||||
if let Some(table_candidates) = candidates.get(&table_id) {
|
||||
assert_eq!(
|
||||
table_candidates.len(),
|
||||
1,
|
||||
"Expected 1 candidate for table {}, got {}",
|
||||
table_id,
|
||||
table_candidates.len()
|
||||
);
|
||||
assert_eq!(
|
||||
table_candidates[0].region_id, leader_region,
|
||||
"Expected leader region {}, got {}",
|
||||
leader_region, table_candidates[0].region_id
|
||||
);
|
||||
} else {
|
||||
panic!("Expected table {} to have candidates", table_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gc_candidate_size_threshold() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let small_region = RegionId::new(table_id, 1);
|
||||
let large_region = RegionId::new(table_id, 2);
|
||||
|
||||
let mut small_stat = mock_region_stat(small_region, RegionRole::Leader, 50_000_000, 5); // 50MB
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut small_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 3;
|
||||
}
|
||||
|
||||
let mut large_stat =
|
||||
mock_region_stat(large_region, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut large_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![small_stat, large_stat])]);
|
||||
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
let config = GcSchedulerOptions {
|
||||
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
|
||||
|
||||
// Should only select large region
|
||||
assert_eq!(
|
||||
candidates.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates, got {}",
|
||||
candidates.len()
|
||||
);
|
||||
if let Some(table_candidates) = candidates.get(&table_id) {
|
||||
assert_eq!(
|
||||
table_candidates.len(),
|
||||
1,
|
||||
"Expected 1 candidate for table {}, got {}",
|
||||
table_id,
|
||||
table_candidates.len()
|
||||
);
|
||||
assert_eq!(
|
||||
table_candidates[0].region_id, large_region,
|
||||
"Expected large region {}, got {}",
|
||||
large_region, table_candidates[0].region_id
|
||||
);
|
||||
} else {
|
||||
panic!("Expected table {} to have candidates", table_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gc_candidate_scoring() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let low_score_region = RegionId::new(table_id, 1);
|
||||
let high_score_region = RegionId::new(table_id, 2);
|
||||
|
||||
let mut low_stat = mock_region_stat(
|
||||
low_score_region,
|
||||
RegionRole::Leader,
|
||||
TEST_REGION_SIZE_200MB,
|
||||
5,
|
||||
); // 200MB
|
||||
// Set low file removal rate for low_score_region
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut low_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 2;
|
||||
}
|
||||
|
||||
let mut high_stat = mock_region_stat(
|
||||
high_score_region,
|
||||
RegionRole::Leader,
|
||||
TEST_REGION_SIZE_200MB,
|
||||
50,
|
||||
); // 200MB
|
||||
// Set high file removal rate for high_score_region
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut high_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 20;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![low_stat, high_stat])]);
|
||||
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
let config = GcSchedulerOptions {
|
||||
sst_count_weight: 1.0,
|
||||
file_removed_count_weight: 0.5,
|
||||
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
|
||||
|
||||
// Should select both regions but high score region should be first
|
||||
assert_eq!(
|
||||
candidates.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates, got {}",
|
||||
candidates.len()
|
||||
);
|
||||
if let Some(table_candidates) = candidates.get(&table_id) {
|
||||
assert_eq!(
|
||||
table_candidates.len(),
|
||||
2,
|
||||
"Expected 2 candidates for table {}, got {}",
|
||||
table_id,
|
||||
table_candidates.len()
|
||||
);
|
||||
// Higher score region should come first (sorted by score descending)
|
||||
assert_eq!(
|
||||
table_candidates[0].region_id, high_score_region,
|
||||
"High score region should be first"
|
||||
);
|
||||
assert!(
|
||||
table_candidates[0].score > table_candidates[1].score,
|
||||
"High score region should have higher score: {} > {}",
|
||||
table_candidates[0].score,
|
||||
table_candidates[1].score
|
||||
);
|
||||
} else {
|
||||
panic!("Expected table {} to have candidates", table_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gc_candidate_regions_per_table_threshold() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
// Create 10 regions for the same table
|
||||
let mut region_stats = Vec::new();
|
||||
|
||||
for i in 0..10 {
|
||||
let region_id = RegionId::new(table_id, i + 1);
|
||||
let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB
|
||||
|
||||
// Set different file removal rates to create different scores
|
||||
// Higher region IDs get higher scores (better GC candidates)
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = (i as u64 + 1) * 2; // Region 1: 2, Region 2: 4, ..., Region 10: 20
|
||||
}
|
||||
|
||||
region_stats.push(stat);
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, region_stats)]);
|
||||
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
// Set regions_per_table_threshold to 3
|
||||
let config = GcSchedulerOptions {
|
||||
regions_per_table_threshold: 3,
|
||||
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
|
||||
|
||||
// Should have 1 table with candidates
|
||||
assert_eq!(
|
||||
candidates.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates, got {}",
|
||||
candidates.len()
|
||||
);
|
||||
|
||||
if let Some(table_candidates) = candidates.get(&table_id) {
|
||||
// Should only have 3 candidates due to regions_per_table_threshold
|
||||
assert_eq!(
|
||||
table_candidates.len(),
|
||||
3,
|
||||
"Expected 3 candidates for table {} due to regions_per_table_threshold, got {}",
|
||||
table_id,
|
||||
table_candidates.len()
|
||||
);
|
||||
|
||||
// Verify that the top 3 scoring regions are selected
|
||||
// Regions 8, 9, 10 should have the highest scores (file_removed_cnt: 16, 18, 20)
|
||||
// They should be returned in descending order by score
|
||||
let expected_regions = vec![10, 9, 8];
|
||||
let actual_regions: Vec<u32> = table_candidates
|
||||
.iter()
|
||||
.map(|c| c.region_id.region_number())
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
actual_regions, expected_regions,
|
||||
"Expected regions {:?} to be selected, got {:?}",
|
||||
expected_regions, actual_regions
|
||||
);
|
||||
|
||||
// Verify they are sorted by score in descending order
|
||||
for i in 0..table_candidates.len() - 1 {
|
||||
assert!(
|
||||
table_candidates[i].score >= table_candidates[i + 1].score,
|
||||
"Candidates should be sorted by score descending: {} >= {}",
|
||||
table_candidates[i].score,
|
||||
table_candidates[i + 1].score
|
||||
);
|
||||
}
|
||||
} else {
|
||||
panic!("Expected table {} to have candidates", table_id);
|
||||
}
|
||||
}
|
||||
@@ -1,516 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_meta::key::table_route::PhysicalTableRouteValue;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::router::{Region, RegionRoute};
|
||||
use common_telemetry::{info, init_default_ut_logging};
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{
|
||||
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat, new_candidate,
|
||||
};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
/// Concurrent Processing Tests
|
||||
#[tokio::test]
|
||||
async fn test_concurrent_table_processing_limits() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let mut candidates = HashMap::new();
|
||||
let mut gc_reports = HashMap::new();
|
||||
|
||||
// Create many tables with candidates
|
||||
for table_id in 1..=10 {
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
candidates.insert(table_id, vec![new_candidate(region_id, 1.0)]);
|
||||
gc_reports.insert(
|
||||
region_id,
|
||||
GcReport {
|
||||
deleted_files: HashMap::from([(region_id, vec![FileId::random()])]),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let ctx = MockSchedulerCtx {
|
||||
candidates: Arc::new(Mutex::new(Some(candidates))),
|
||||
file_refs: Arc::new(Mutex::new(Some(FileRefsManifest {
|
||||
manifest_version: (1..=10).map(|i| (RegionId::new(i, 1), 1)).collect(),
|
||||
..Default::default()
|
||||
}))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(
|
||||
(1..=10)
|
||||
.map(|table_id| {
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
(table_id, (table_id, vec![(region_id, Peer::new(1, ""))]))
|
||||
})
|
||||
.collect(),
|
||||
);
|
||||
|
||||
let ctx = Arc::new(ctx);
|
||||
|
||||
let config = GcSchedulerOptions {
|
||||
max_concurrent_tables: 3, // Set a low limit
|
||||
retry_backoff_duration: Duration::from_millis(50), // for faster test
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
|
||||
|
||||
// Convert table-based candidates to datanode-based candidates
|
||||
let peer = Peer::new(1, "");
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer,
|
||||
candidates
|
||||
.into_iter()
|
||||
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
|
||||
.collect(),
|
||||
)]);
|
||||
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
// Should process all datanodes
|
||||
assert_eq!(report.per_datanode_reports.len(), 1);
|
||||
assert_eq!(report.failed_datanodes.len(), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_datanode_processes_tables_with_partial_gc_failures() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table1 = 1;
|
||||
let region1 = RegionId::new(table1, 1);
|
||||
let table2 = 2;
|
||||
let region2 = RegionId::new(table2, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
let mut candidates = HashMap::new();
|
||||
candidates.insert(table1, vec![new_candidate(region1, 1.0)]);
|
||||
candidates.insert(table2, vec![new_candidate(region2, 1.0)]);
|
||||
|
||||
// Set up GC reports for success and failure
|
||||
let mut gc_reports = HashMap::new();
|
||||
gc_reports.insert(
|
||||
region1,
|
||||
GcReport {
|
||||
deleted_files: HashMap::from([(region1, vec![])]),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
// region2 will have no GC report, simulating failure
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region1, 1), (region2, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
candidates: Arc::new(Mutex::new(Some(candidates))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([
|
||||
(table1, (table1, vec![(region1, peer.clone())])),
|
||||
(table2, (table2, vec![(region2, peer.clone())])),
|
||||
])),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
|
||||
|
||||
// Convert table-based candidates to datanode-based candidates
|
||||
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer,
|
||||
candidates
|
||||
.into_iter()
|
||||
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
|
||||
.collect(),
|
||||
)]);
|
||||
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
// Should have one datanode with mixed results
|
||||
assert_eq!(report.per_datanode_reports.len(), 1);
|
||||
// also check one failed region (region2 has no GC report, so it should be in need_retry_regions)
|
||||
let datanode_report = report.per_datanode_reports.values().next().unwrap();
|
||||
assert_eq!(datanode_report.need_retry_regions.len(), 1);
|
||||
assert_eq!(report.failed_datanodes.len(), 0);
|
||||
}
|
||||
|
||||
// Region Concurrency Tests
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_region_gc_concurrency_limit() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
// Create multiple regions for the same table
|
||||
let mut region_stats = Vec::new();
|
||||
let mut candidates = Vec::new();
|
||||
let mut gc_reports = HashMap::new();
|
||||
|
||||
for i in 1..=10 {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
let region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
region_stats.push(region_stat);
|
||||
|
||||
candidates.push(mock_candidate(region_id));
|
||||
|
||||
gc_reports.insert(
|
||||
region_id,
|
||||
GcReport {
|
||||
deleted_files: HashMap::from([(
|
||||
region_id,
|
||||
vec![FileId::random(), FileId::random()],
|
||||
)]),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, region_stats)]);
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: (1..=10)
|
||||
.map(|i| (RegionId::new(table_id, i as u32), 1))
|
||||
.collect(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(
|
||||
table_id,
|
||||
(1..=10)
|
||||
.map(|i| (RegionId::new(table_id, i as u32), peer.clone()))
|
||||
.collect(),
|
||||
),
|
||||
)])),
|
||||
);
|
||||
|
||||
// Configure low concurrency limit
|
||||
let config = GcSchedulerOptions {
|
||||
region_gc_concurrency: 3, // Only 3 regions can be processed concurrently
|
||||
retry_backoff_duration: Duration::from_millis(50), // for faster test
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let start_time = Instant::now();
|
||||
let report = scheduler
|
||||
.process_datanode_gc(
|
||||
peer,
|
||||
candidates.into_iter().map(|c| (table_id, c)).collect(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let duration = start_time.elapsed();
|
||||
|
||||
// All regions should be processed successfully
|
||||
// Check that all 10 regions have deleted files
|
||||
assert_eq!(report.deleted_files.len(), 10);
|
||||
for i in 1..=10 {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
assert!(report.deleted_files.contains_key(®ion_id));
|
||||
assert_eq!(report.deleted_files[®ion_id].len(), 2); // Each region has 2 deleted files
|
||||
}
|
||||
assert!(report.need_retry_regions.is_empty());
|
||||
|
||||
// Verify that concurrency limit was respected (this is hard to test directly,
|
||||
// but we can verify that the processing completed successfully)
|
||||
info!(
|
||||
"Processed 10 regions with concurrency limit 3 in {:?}",
|
||||
duration
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_region_gc_concurrency_with_partial_failures() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
// Create multiple regions with mixed success/failure
|
||||
let mut region_stats = Vec::new();
|
||||
let mut candidates = Vec::new();
|
||||
let mut gc_reports = HashMap::new();
|
||||
|
||||
// Create the context first so we can set errors on it
|
||||
let ctx = Arc::new(MockSchedulerCtx::default());
|
||||
|
||||
for i in 1..=6 {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
let region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
region_stats.push(region_stat);
|
||||
|
||||
candidates.push(mock_candidate(region_id));
|
||||
|
||||
if i % 2 == 0 {
|
||||
// Even regions will succeed
|
||||
gc_reports.insert(
|
||||
region_id,
|
||||
GcReport {
|
||||
deleted_files: HashMap::from([(
|
||||
region_id,
|
||||
vec![FileId::random(), FileId::random()],
|
||||
)]),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
} else {
|
||||
// Odd regions will fail - don't add them to gc_reports
|
||||
// This will cause them to be marked as needing retry
|
||||
}
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, region_stats)]);
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: (1..=6)
|
||||
.map(|i| (RegionId::new(table_id, i as u32), 1))
|
||||
.collect(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Update the context with the data
|
||||
*ctx.table_to_region_stats.lock().unwrap() = Some(table_stats);
|
||||
*ctx.gc_reports.lock().unwrap() = gc_reports;
|
||||
*ctx.file_refs.lock().unwrap() = Some(file_refs);
|
||||
let region_routes = (1..=6)
|
||||
.map(|i| RegionRoute {
|
||||
region: Region::new_test(RegionId::new(table_id, i as u32)),
|
||||
leader_peer: Some(peer.clone()),
|
||||
..Default::default()
|
||||
})
|
||||
.collect();
|
||||
|
||||
*ctx.table_routes.lock().unwrap() = HashMap::from([(
|
||||
table_id,
|
||||
(table_id, PhysicalTableRouteValue::new(region_routes)),
|
||||
)]);
|
||||
|
||||
// Configure concurrency limit
|
||||
let config = GcSchedulerOptions {
|
||||
region_gc_concurrency: 2, // Process 2 regions concurrently
|
||||
retry_backoff_duration: Duration::from_millis(50), // for faster test
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer.clone(),
|
||||
candidates.into_iter().map(|c| (table_id, c)).collect(),
|
||||
)]);
|
||||
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
let report = report.per_datanode_reports.get(&peer.id).unwrap();
|
||||
|
||||
// Should have 3 successful and 3 failed regions
|
||||
// Even regions (2, 4, 6) should succeed, odd regions (1, 3, 5) should fail
|
||||
let mut successful_regions = 0;
|
||||
let mut failed_regions = 0;
|
||||
|
||||
for i in 1..=6 {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
if i % 2 == 0 {
|
||||
// Even regions should succeed
|
||||
if report.deleted_files.contains_key(®ion_id) {
|
||||
successful_regions += 1;
|
||||
}
|
||||
} else {
|
||||
// Odd regions should fail - they should be in need_retry_regions
|
||||
if report.need_retry_regions.contains(®ion_id) {
|
||||
failed_regions += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// In the new implementation, regions that cause gc_regions to return an error
|
||||
// are added to need_retry_regions. Let's check if we have the expected mix.
|
||||
info!(
|
||||
"Successful regions: {}, Failed regions: {}",
|
||||
successful_regions, failed_regions
|
||||
);
|
||||
info!(
|
||||
"Deleted files: {:?}",
|
||||
report.deleted_files.keys().collect::<Vec<_>>()
|
||||
);
|
||||
info!("Need retry regions: {:?}", report.need_retry_regions);
|
||||
|
||||
// The exact count might vary depending on how the mock handles errors,
|
||||
// but we should have some successful and some failed regions
|
||||
assert!(
|
||||
successful_regions > 0,
|
||||
"Should have at least some successful regions"
|
||||
);
|
||||
assert!(
|
||||
failed_regions > 0,
|
||||
"Should have at least some failed regions"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_region_gc_concurrency_with_retryable_errors() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
// Create multiple regions
|
||||
let mut region_stats = Vec::new();
|
||||
let mut candidates = Vec::new();
|
||||
|
||||
for i in 1..=5 {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
let region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
region_stats.push(region_stat);
|
||||
candidates.push(mock_candidate(region_id));
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, region_stats)]);
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: (1..=5)
|
||||
.map(|i| (RegionId::new(table_id, i as u32), 1))
|
||||
.collect(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let gc_report = (1..=5)
|
||||
.map(|i| {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
(
|
||||
region_id,
|
||||
// mock the actual gc report with deleted files when succeeded(even no files to delete)
|
||||
GcReport::new(HashMap::from([(region_id, vec![])]), HashSet::new()),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_report)),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(
|
||||
table_id,
|
||||
(1..=5)
|
||||
.map(|i| (RegionId::new(table_id, i as u32), peer.clone()))
|
||||
.collect(),
|
||||
),
|
||||
)])),
|
||||
);
|
||||
|
||||
// Configure concurrency limit
|
||||
let config = GcSchedulerOptions {
|
||||
region_gc_concurrency: 2, // Process 2 regions concurrently
|
||||
retry_backoff_duration: Duration::from_millis(50),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer.clone(),
|
||||
candidates.into_iter().map(|c| (table_id, c)).collect(),
|
||||
)]);
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
let report = report.per_datanode_reports.get(&peer.id).unwrap();
|
||||
|
||||
// In the new implementation without retry logic, all regions should be processed
|
||||
// The exact behavior depends on how the mock handles the regions
|
||||
info!(
|
||||
"Deleted files: {:?}",
|
||||
report.deleted_files.keys().collect::<Vec<_>>()
|
||||
);
|
||||
info!("Need retry regions: {:?}", report.need_retry_regions);
|
||||
|
||||
// We should have processed all 5 regions in some way
|
||||
let total_processed = report.deleted_files.len() + report.need_retry_regions.len();
|
||||
assert_eq!(total_processed, 5, "Should have processed all 5 regions");
|
||||
}
|
||||
@@ -1,197 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use common_meta::datanode::RegionManifestInfo;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
/// Configuration Tests
|
||||
#[tokio::test]
|
||||
async fn test_different_gc_weights() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
|
||||
let mut region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB to pass size threshold
|
||||
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut region_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
// Test with different weights
|
||||
let config1 = GcSchedulerOptions {
|
||||
sst_count_weight: 2.0,
|
||||
file_removed_count_weight: 0.5,
|
||||
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler1 = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: config1,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
let candidates1 = scheduler1.select_gc_candidates(&stats).await.unwrap();
|
||||
|
||||
let config2 = GcSchedulerOptions {
|
||||
sst_count_weight: 0.5,
|
||||
file_removed_count_weight: 2.0,
|
||||
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler2 = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: config2,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = &ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
let candidates2 = scheduler2.select_gc_candidates(stats).await.unwrap();
|
||||
|
||||
// Both should select the region but with different scores
|
||||
assert_eq!(
|
||||
candidates1.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates for config1, got {}",
|
||||
candidates1.len()
|
||||
);
|
||||
assert_eq!(
|
||||
candidates2.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates for config2, got {}",
|
||||
candidates2.len()
|
||||
);
|
||||
|
||||
// Verify the region is actually selected
|
||||
assert!(
|
||||
candidates1.contains_key(&table_id),
|
||||
"Config1 should contain table_id {}",
|
||||
table_id
|
||||
);
|
||||
assert!(
|
||||
candidates2.contains_key(&table_id),
|
||||
"Config2 should contain table_id {}",
|
||||
table_id
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_regions_per_table_threshold() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let mut region_stats = Vec::new();
|
||||
|
||||
// Create many regions
|
||||
for i in 1..=10 {
|
||||
let region_id = RegionId::new(table_id, i as u32);
|
||||
let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
region_stats.push(stat);
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, region_stats)]);
|
||||
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
let config = GcSchedulerOptions {
|
||||
regions_per_table_threshold: 3, // Limit to 3 regions per table
|
||||
min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default)
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let stats = ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
|
||||
let candidates = scheduler.select_gc_candidates(&stats).await.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
candidates.len(),
|
||||
1,
|
||||
"Expected 1 table with candidates, got {}",
|
||||
candidates.len()
|
||||
);
|
||||
if let Some(table_candidates) = candidates.get(&table_id) {
|
||||
// Should be limited to 3 regions
|
||||
assert_eq!(
|
||||
table_candidates.len(),
|
||||
3,
|
||||
"Expected 3 candidates for table {}, got {}",
|
||||
table_id,
|
||||
table_candidates.len()
|
||||
);
|
||||
} else {
|
||||
panic!("Expected table {} to have candidates", table_id);
|
||||
}
|
||||
}
|
||||
@@ -1,293 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_meta::datanode::RegionManifestInfo;
|
||||
use common_meta::peer::Peer;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{
|
||||
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
|
||||
};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
/// Error Handling Tests
|
||||
#[tokio::test]
|
||||
async fn test_gc_regions_failure_handling() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
// Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate
|
||||
let mut region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut region_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
// Create a context that will return an error for gc_regions
|
||||
let mut gc_reports = HashMap::new();
|
||||
gc_reports.insert(region_id, GcReport::default());
|
||||
|
||||
// Inject an error for gc_regions method
|
||||
let gc_error = crate::error::UnexpectedSnafu {
|
||||
violated: "Simulated GC failure for testing".to_string(),
|
||||
}
|
||||
.build();
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
file_refs: HashMap::from([(region_id, HashSet::from([FileId::random()]))]),
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer)]),
|
||||
)]))
|
||||
.with_gc_regions_error(gc_error),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
// This should handle the failure gracefully
|
||||
let report = scheduler.handle_tick().await.unwrap();
|
||||
|
||||
// Validate the report shows the failure handling
|
||||
assert_eq!(
|
||||
report.per_datanode_reports.len(),
|
||||
1,
|
||||
"Should process 1 datanode despite failure"
|
||||
);
|
||||
assert_eq!(
|
||||
report.failed_datanodes.len(),
|
||||
0,
|
||||
"Should have 0 failed datanodes (failure handled via need_retry_regions)"
|
||||
);
|
||||
|
||||
// Check that the region is in need_retry_regions due to the failure
|
||||
let datanode_report = report.per_datanode_reports.values().next().unwrap();
|
||||
assert_eq!(
|
||||
datanode_report.need_retry_regions.len(),
|
||||
1,
|
||||
"Should have 1 region in need_retry_regions due to failure"
|
||||
);
|
||||
assert!(
|
||||
datanode_report.need_retry_regions.contains(®ion_id),
|
||||
"Region should be in need_retry_regions"
|
||||
);
|
||||
|
||||
// Verify that calls were made despite potential failures
|
||||
assert_eq!(
|
||||
*ctx.get_table_to_region_stats_calls.lock().unwrap(),
|
||||
1,
|
||||
"Expected 1 call to get_table_to_region_stats"
|
||||
);
|
||||
assert!(
|
||||
*ctx.get_file_references_calls.lock().unwrap() >= 1,
|
||||
"Expected at least 1 call to get_file_references"
|
||||
);
|
||||
assert!(
|
||||
*ctx.gc_regions_calls.lock().unwrap() >= 1,
|
||||
"Expected at least 1 call to gc_regions"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_file_references_failure() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
// Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate
|
||||
let mut region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut region_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
// Create context with empty file refs (simulating failure)
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
file_refs: Arc::new(Mutex::new(Some(FileRefsManifest::default()))),
|
||||
gc_reports: Arc::new(Mutex::new(HashMap::from([(
|
||||
region_id,
|
||||
new_empty_report_with([region_id]),
|
||||
)]))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer)]),
|
||||
)])),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions {
|
||||
retry_backoff_duration: Duration::from_millis(10), // shorten for test
|
||||
..Default::default()
|
||||
},
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let report = scheduler.handle_tick().await.unwrap();
|
||||
|
||||
// Validate the report shows the expected results
|
||||
// In the new implementation, even if get_file_references fails, we still create a datanode report
|
||||
assert_eq!(
|
||||
report.per_datanode_reports.len(),
|
||||
1,
|
||||
"Should process 1 datanode"
|
||||
);
|
||||
assert_eq!(
|
||||
report.failed_datanodes.len(),
|
||||
0,
|
||||
"Should have 0 failed datanodes (failure handled gracefully)"
|
||||
);
|
||||
|
||||
// The region should be processed but may have empty results due to file refs failure
|
||||
let datanode_report = report.per_datanode_reports.values().next().unwrap();
|
||||
// The current implementation still processes the region even with file refs failure
|
||||
// and creates an empty entry in deleted_files
|
||||
assert!(
|
||||
datanode_report.deleted_files.contains_key(®ion_id),
|
||||
"Should have region in deleted_files (even if empty)"
|
||||
);
|
||||
assert!(
|
||||
datanode_report.deleted_files[®ion_id].is_empty(),
|
||||
"Should have empty deleted files due to file refs failure"
|
||||
);
|
||||
|
||||
// Should still attempt to get file references (may be called multiple times due to retry logic)
|
||||
assert!(
|
||||
*ctx.get_file_references_calls.lock().unwrap() >= 1,
|
||||
"Expected at least 1 call to get_file_references, got {}",
|
||||
*ctx.get_file_references_calls.lock().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_table_route_failure() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
|
||||
// Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate
|
||||
let mut region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut region_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
// Inject an error for get_table_route method to simulate failure
|
||||
let route_error = crate::error::UnexpectedSnafu {
|
||||
violated: "Simulated table route failure for testing".to_string(),
|
||||
}
|
||||
.build();
|
||||
|
||||
// Create context with table route error injection
|
||||
let ctx = Arc::new(MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
..Default::default()
|
||||
});
|
||||
ctx.set_table_route_error(route_error);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
// Get candidates first
|
||||
let stats = &ctx
|
||||
.table_to_region_stats
|
||||
.lock()
|
||||
.unwrap()
|
||||
.clone()
|
||||
.unwrap_or_default();
|
||||
let candidates = scheduler.select_gc_candidates(stats).await.unwrap();
|
||||
|
||||
// Convert table-based candidates to datanode-based candidates
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
Peer::new(1, ""),
|
||||
candidates
|
||||
.into_iter()
|
||||
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
|
||||
.collect(),
|
||||
)]);
|
||||
|
||||
// This should handle table route failure gracefully
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
// Should process the datanode but handle route error gracefully
|
||||
assert_eq!(
|
||||
report.per_datanode_reports.len(),
|
||||
0,
|
||||
"Expected 0 datanode report"
|
||||
);
|
||||
assert_eq!(
|
||||
report.failed_datanodes.len(),
|
||||
1,
|
||||
"Expected 1 failed datanodes (route error handled gracefully)"
|
||||
);
|
||||
assert!(
|
||||
report.failed_datanodes.contains_key(&1),
|
||||
"Failed datanodes should contain the datanode with route error"
|
||||
);
|
||||
}
|
||||
@@ -1,272 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_meta::peer::Peer;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
// Full File Listing Tests
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_file_listing_first_time_gc() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
let gc_report = GcReport {
|
||||
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer.clone())]),
|
||||
)])),
|
||||
);
|
||||
|
||||
// Configure short full file listing interval for testing
|
||||
let config = GcSchedulerOptions {
|
||||
full_file_listing_interval: Duration::from_secs(3600), // 1 hour
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
// First GC - should use full listing since region has never been GC'd
|
||||
let reports = scheduler
|
||||
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(reports.deleted_files.len(), 1);
|
||||
|
||||
// Verify that full listing was used by checking the tracker
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
let gc_info = tracker
|
||||
.get(®ion_id)
|
||||
.expect("Region should be in tracker");
|
||||
assert!(
|
||||
gc_info.last_full_listing_time.is_some(),
|
||||
"First GC should use full listing"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_file_listing_interval_enforcement() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
let gc_report = GcReport {
|
||||
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer.clone())]),
|
||||
)])),
|
||||
);
|
||||
|
||||
// Configure very short full file listing interval for testing
|
||||
let config = GcSchedulerOptions {
|
||||
full_file_listing_interval: Duration::from_millis(100), // 100ms
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
// First GC - should use full listing
|
||||
let reports1 = scheduler
|
||||
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(reports1.deleted_files.len(), 1);
|
||||
|
||||
// Get the first full listing time
|
||||
let first_full_listing_time = {
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
let gc_info = tracker
|
||||
.get(®ion_id)
|
||||
.expect("Region should be in tracker");
|
||||
gc_info
|
||||
.last_full_listing_time
|
||||
.expect("Should have full listing time")
|
||||
};
|
||||
|
||||
// Wait for interval to pass
|
||||
tokio::time::sleep(Duration::from_millis(150)).await;
|
||||
|
||||
// Second GC - should use full listing again since interval has passed
|
||||
let _reports2 = scheduler
|
||||
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify that full listing was used again
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
let gc_info = tracker
|
||||
.get(®ion_id)
|
||||
.expect("Region should be in tracker");
|
||||
let second_full_listing_time = gc_info
|
||||
.last_full_listing_time
|
||||
.expect("Should have full listing time");
|
||||
|
||||
assert!(
|
||||
second_full_listing_time > first_full_listing_time,
|
||||
"Second GC should update full listing time"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_file_listing_no_interval_passed() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
let gc_report = GcReport {
|
||||
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer.clone())]),
|
||||
)])),
|
||||
);
|
||||
|
||||
// Configure long full file listing interval
|
||||
let config = GcSchedulerOptions {
|
||||
full_file_listing_interval: Duration::from_secs(3600), // 1 hour
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
// First GC - should use full listing
|
||||
let reports1 = scheduler
|
||||
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(reports1.deleted_files.len(), 1);
|
||||
|
||||
// Get the first full listing time
|
||||
let first_full_listing_time = {
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
let gc_info = tracker
|
||||
.get(®ion_id)
|
||||
.expect("Region should be in tracker");
|
||||
gc_info
|
||||
.last_full_listing_time
|
||||
.expect("Should have full listing time")
|
||||
};
|
||||
|
||||
// Second GC immediately - should NOT use full listing since interval hasn't passed
|
||||
let reports2 = scheduler
|
||||
.process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(reports2.deleted_files.len(), 1);
|
||||
|
||||
// Verify that full listing time was NOT updated
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
let gc_info = tracker
|
||||
.get(®ion_id)
|
||||
.expect("Region should be in tracker");
|
||||
let second_full_listing_time = gc_info
|
||||
.last_full_listing_time
|
||||
.expect("Should have full listing time");
|
||||
|
||||
assert_eq!(
|
||||
second_full_listing_time, first_full_listing_time,
|
||||
"Second GC should not update full listing time when interval hasn't passed"
|
||||
);
|
||||
}
|
||||
@@ -1,252 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_meta::datanode::RegionManifestInfo;
|
||||
use common_meta::peer::Peer;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{
|
||||
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
|
||||
};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
// Integration Flow Tests
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_gc_workflow() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
let mut region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut region_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
let mut gc_reports = HashMap::new();
|
||||
gc_reports.insert(
|
||||
region_id,
|
||||
GcReport {
|
||||
deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer)]),
|
||||
)])),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
// Run the full workflow
|
||||
let report = scheduler.handle_tick().await.unwrap();
|
||||
|
||||
// Validate the returned GcJobReport - should have 1 datanode report
|
||||
assert_eq!(
|
||||
report.per_datanode_reports.len(),
|
||||
1,
|
||||
"Should process 1 datanode"
|
||||
);
|
||||
assert_eq!(
|
||||
report.failed_datanodes.len(),
|
||||
0,
|
||||
"Should have no failed datanodes"
|
||||
);
|
||||
|
||||
// Get the datanode report
|
||||
let datanode_report = report.per_datanode_reports.values().next().unwrap();
|
||||
|
||||
// Check that the region was processed successfully
|
||||
assert!(
|
||||
datanode_report.deleted_files.contains_key(®ion_id),
|
||||
"Should have deleted files for region"
|
||||
);
|
||||
assert_eq!(
|
||||
datanode_report.deleted_files[®ion_id].len(),
|
||||
2,
|
||||
"Should have 2 deleted files"
|
||||
);
|
||||
assert!(
|
||||
datanode_report.need_retry_regions.is_empty(),
|
||||
"Should have no retry regions"
|
||||
);
|
||||
|
||||
// Verify all steps were executed
|
||||
assert_eq!(
|
||||
*ctx.get_table_to_region_stats_calls.lock().unwrap(),
|
||||
1,
|
||||
"Expected 1 call to get_table_to_region_stats"
|
||||
);
|
||||
assert_eq!(
|
||||
*ctx.get_file_references_calls.lock().unwrap(),
|
||||
1,
|
||||
"Expected 1 call to get_file_references"
|
||||
);
|
||||
assert_eq!(
|
||||
*ctx.gc_regions_calls.lock().unwrap(),
|
||||
1,
|
||||
"Expected 1 call to gc_regions"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_tracker_cleanup() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
// Create region stat with proper file_removed_cnt to ensure it gets selected as candidate
|
||||
let mut region_stat =
|
||||
mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB
|
||||
if let RegionManifestInfo::Mito {
|
||||
file_removed_cnt, ..
|
||||
} = &mut region_stat.region_manifest
|
||||
{
|
||||
*file_removed_cnt = 5;
|
||||
}
|
||||
|
||||
let table_stats = HashMap::from([(table_id, vec![region_stat])]);
|
||||
|
||||
let mut gc_reports = HashMap::new();
|
||||
gc_reports.insert(region_id, new_empty_report_with([region_id]));
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))),
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer)]),
|
||||
)])),
|
||||
);
|
||||
|
||||
let old_region_gc_tracker = {
|
||||
let mut tracker = HashMap::new();
|
||||
tracker.insert(
|
||||
region_id,
|
||||
crate::gc::tracker::RegionGcInfo {
|
||||
last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago
|
||||
last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago
|
||||
},
|
||||
);
|
||||
// also insert a different table that should also be cleaned up
|
||||
tracker.insert(
|
||||
RegionId::new(2, 1),
|
||||
crate::gc::tracker::RegionGcInfo {
|
||||
last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago
|
||||
last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago
|
||||
},
|
||||
);
|
||||
tracker
|
||||
};
|
||||
|
||||
// Use a custom config with shorter cleanup interval to trigger cleanup
|
||||
let config = GcSchedulerOptions {
|
||||
// 30 minutes
|
||||
tracker_cleanup_interval: Duration::from_secs(1800),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config,
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(old_region_gc_tracker)),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(
|
||||
Instant::now() - Duration::from_secs(3600), // Old cleanup time (1 hour ago)
|
||||
)),
|
||||
};
|
||||
|
||||
let report = scheduler.handle_tick().await.unwrap();
|
||||
|
||||
// Validate the returned GcJobReport - should have 1 datanode report
|
||||
assert_eq!(
|
||||
report.per_datanode_reports.len(),
|
||||
1,
|
||||
"Should process 1 datanode"
|
||||
);
|
||||
assert_eq!(
|
||||
report.failed_datanodes.len(),
|
||||
0,
|
||||
"Should have no failed datanodes"
|
||||
);
|
||||
|
||||
// Get the datanode report
|
||||
let datanode_report = report.per_datanode_reports.values().next().unwrap();
|
||||
|
||||
// Check that the region was processed successfully
|
||||
assert!(
|
||||
datanode_report.deleted_files.contains_key(®ion_id),
|
||||
"Should have deleted files for region"
|
||||
);
|
||||
assert!(
|
||||
datanode_report.need_retry_regions.is_empty(),
|
||||
"Should have no retry regions"
|
||||
);
|
||||
|
||||
// Verify tracker was updated
|
||||
let tracker = scheduler.region_gc_tracker.lock().await;
|
||||
assert!(
|
||||
tracker.contains_key(®ion_id),
|
||||
"Tracker should contain region {}",
|
||||
region_id
|
||||
);
|
||||
// only one entry
|
||||
assert_eq!(tracker.len(), 1, "Tracker should only have 1 entry");
|
||||
}
|
||||
@@ -1,155 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use common_meta::peer::Peer;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::storage::{FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{MockSchedulerCtx, new_candidate};
|
||||
use crate::gc::{GcScheduler, GcSchedulerOptions};
|
||||
|
||||
/// Edge Case Tests
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_empty_file_refs_manifest() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region_id = RegionId::new(table_id, 1);
|
||||
let peer = Peer::new(1, "");
|
||||
let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]);
|
||||
|
||||
// Empty file refs manifest
|
||||
let file_refs = FileRefsManifest::default();
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
candidates: Arc::new(Mutex::new(Some(candidates))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(table_id, vec![(region_id, peer)]),
|
||||
)])),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
|
||||
|
||||
// Convert table-based candidates to datanode-based candidates
|
||||
let peer = Peer::new(1, "");
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer,
|
||||
candidates
|
||||
.into_iter()
|
||||
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
|
||||
.collect(),
|
||||
)]);
|
||||
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
assert_eq!(report.per_datanode_reports.len(), 1);
|
||||
assert_eq!(report.failed_datanodes.len(), 0);
|
||||
// Should handle empty file refs gracefully
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multiple_regions_per_table() {
|
||||
init_default_ut_logging();
|
||||
|
||||
let table_id = 1;
|
||||
let region1 = RegionId::new(table_id, 1);
|
||||
let region2 = RegionId::new(table_id, 2);
|
||||
let region3 = RegionId::new(table_id, 3);
|
||||
let peer = Peer::new(1, "");
|
||||
|
||||
let candidates = HashMap::from([(
|
||||
table_id,
|
||||
vec![
|
||||
new_candidate(region1, 1.0),
|
||||
new_candidate(region2, 2.0),
|
||||
new_candidate(region3, 3.0),
|
||||
],
|
||||
)]);
|
||||
|
||||
let mut gc_reports = HashMap::new();
|
||||
gc_reports.insert(region1, GcReport::default());
|
||||
gc_reports.insert(region2, GcReport::default());
|
||||
gc_reports.insert(region3, GcReport::default());
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region1, 1), (region2, 1), (region3, 1)]),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
MockSchedulerCtx {
|
||||
gc_reports: Arc::new(Mutex::new(gc_reports)),
|
||||
file_refs: Arc::new(Mutex::new(Some(file_refs))),
|
||||
candidates: Arc::new(Mutex::new(Some(candidates))),
|
||||
..Default::default()
|
||||
}
|
||||
.with_table_routes(HashMap::from([(
|
||||
table_id,
|
||||
(
|
||||
table_id,
|
||||
vec![
|
||||
(region1, peer.clone()),
|
||||
(region2, peer.clone()),
|
||||
(region3, peer.clone()),
|
||||
],
|
||||
),
|
||||
)])),
|
||||
);
|
||||
|
||||
let scheduler = GcScheduler {
|
||||
ctx: ctx.clone(),
|
||||
receiver: GcScheduler::channel().1,
|
||||
config: GcSchedulerOptions::default(),
|
||||
region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())),
|
||||
last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())),
|
||||
};
|
||||
|
||||
let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default();
|
||||
|
||||
// Convert table-based candidates to datanode-based candidates
|
||||
let datanode_to_candidates = HashMap::from([(
|
||||
peer.clone(),
|
||||
candidates
|
||||
.into_iter()
|
||||
.flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c)))
|
||||
.collect(),
|
||||
)]);
|
||||
|
||||
let report = scheduler
|
||||
.parallel_process_datanodes(datanode_to_candidates)
|
||||
.await;
|
||||
|
||||
assert_eq!(report.per_datanode_reports.len(), 1);
|
||||
assert_eq!(report.failed_datanodes.len(), 0);
|
||||
}
|
||||
@@ -14,7 +14,6 @@ async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
base64.workspace = true
|
||||
bytes.workspace = true
|
||||
fxhash = "0.2"
|
||||
common-base.workspace = true
|
||||
common-error.workspace = true
|
||||
common-macro.workspace = true
|
||||
@@ -32,6 +31,7 @@ lazy_static = "1.4"
|
||||
mito-codec.workspace = true
|
||||
mito2.workspace = true
|
||||
moka.workspace = true
|
||||
mur3 = "0.1"
|
||||
object-store.workspace = true
|
||||
prometheus.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -47,12 +47,6 @@ common-meta = { workspace = true, features = ["testing"] }
|
||||
common-test-util.workspace = true
|
||||
mito2 = { workspace = true, features = ["test"] }
|
||||
common-wal = { workspace = true }
|
||||
criterion = { version = "0.4", features = ["async", "async_tokio"] }
|
||||
mur3 = "0.1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_tsid_generator"
|
||||
harness = false
|
||||
|
||||
[package.metadata.cargo-udeps.ignore]
|
||||
normal = ["aquamarine"]
|
||||
|
||||
@@ -1,273 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::hash::Hasher;
|
||||
|
||||
use criterion::{Criterion, black_box, criterion_group, criterion_main};
|
||||
use fxhash::FxHasher;
|
||||
use mur3::Hasher128;
|
||||
|
||||
// A random number (from original implementation)
|
||||
const TSID_HASH_SEED: u32 = 846793005;
|
||||
|
||||
/// Original TSID generator using mur3::Hasher128
|
||||
/// Hashes both label name and value for each label pair
|
||||
struct OriginalTsidGenerator {
|
||||
hasher: Hasher128,
|
||||
}
|
||||
|
||||
impl OriginalTsidGenerator {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
hasher: Hasher128::with_seed(TSID_HASH_SEED),
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes a label pair (name and value) to the generator.
|
||||
fn write_label(&mut self, name: &str, value: &str) {
|
||||
use std::hash::Hash;
|
||||
name.hash(&mut self.hasher);
|
||||
value.hash(&mut self.hasher);
|
||||
}
|
||||
|
||||
/// Generates a new TSID.
|
||||
fn finish(&mut self) -> u64 {
|
||||
// TSID is 64 bits, simply truncate the 128 bits hash
|
||||
let (hash, _) = self.hasher.finish128();
|
||||
hash
|
||||
}
|
||||
}
|
||||
|
||||
/// Current TSID generator using fxhash::FxHasher
|
||||
/// Fast path: pre-computes label name hash, only hashes values
|
||||
struct CurrentTsidGenerator {
|
||||
hasher: FxHasher,
|
||||
}
|
||||
|
||||
impl CurrentTsidGenerator {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
hasher: FxHasher::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn new_with_label_name_hash(label_name_hash: u64) -> Self {
|
||||
let mut hasher = FxHasher::default();
|
||||
hasher.write_u64(label_name_hash);
|
||||
Self { hasher }
|
||||
}
|
||||
|
||||
/// Writes a label value to the generator.
|
||||
fn write_str(&mut self, value: &str) {
|
||||
self.hasher.write(value.as_bytes());
|
||||
self.hasher.write_u8(0xff);
|
||||
}
|
||||
|
||||
/// Generates a new TSID.
|
||||
fn finish(&mut self) -> u64 {
|
||||
self.hasher.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Pre-computes label name hash (used in fast path)
|
||||
fn compute_label_name_hash(labels: &[(&str, &str)]) -> u64 {
|
||||
let mut hasher = FxHasher::default();
|
||||
for (name, _) in labels {
|
||||
hasher.write(name.as_bytes());
|
||||
hasher.write_u8(0xff);
|
||||
}
|
||||
hasher.finish()
|
||||
}
|
||||
|
||||
fn bench_tsid_generator_small(c: &mut Criterion) {
|
||||
let labels = vec![("namespace", "greptimedb"), ("host", "127.0.0.1")];
|
||||
|
||||
let mut group = c.benchmark_group("tsid_generator_small_2_labels");
|
||||
group.bench_function("original_mur3", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen = OriginalTsidGenerator::new();
|
||||
for (name, value) in &labels {
|
||||
tsid_gen.write_label(black_box(name), black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
let label_name_hash = compute_label_name_hash(&labels);
|
||||
group.bench_function("current_fxhash_fast_path", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen =
|
||||
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
|
||||
for (_, value) in &labels {
|
||||
tsid_gen.write_str(black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_tsid_generator_medium(c: &mut Criterion) {
|
||||
let labels = vec![
|
||||
("namespace", "greptimedb"),
|
||||
("host", "127.0.0.1"),
|
||||
("region", "us-west-2"),
|
||||
("env", "production"),
|
||||
("service", "api"),
|
||||
];
|
||||
|
||||
let mut group = c.benchmark_group("tsid_generator_medium_5_labels");
|
||||
group.bench_function("original_mur3", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen = OriginalTsidGenerator::new();
|
||||
for (name, value) in &labels {
|
||||
tsid_gen.write_label(black_box(name), black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
let label_name_hash = compute_label_name_hash(&labels);
|
||||
group.bench_function("current_fxhash_fast_path", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen =
|
||||
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
|
||||
for (_, value) in &labels {
|
||||
tsid_gen.write_str(black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_tsid_generator_large(c: &mut Criterion) {
|
||||
let labels = vec![
|
||||
("namespace", "greptimedb"),
|
||||
("host", "127.0.0.1"),
|
||||
("region", "us-west-2"),
|
||||
("env", "production"),
|
||||
("service", "api"),
|
||||
("version", "v1.0.0"),
|
||||
("cluster", "cluster-1"),
|
||||
("dc", "dc1"),
|
||||
("rack", "rack-1"),
|
||||
("pod", "pod-123"),
|
||||
];
|
||||
|
||||
let mut group = c.benchmark_group("tsid_generator_large_10_labels");
|
||||
group.bench_function("original_mur3", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen = OriginalTsidGenerator::new();
|
||||
for (name, value) in &labels {
|
||||
tsid_gen.write_label(black_box(name), black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
let label_name_hash = compute_label_name_hash(&labels);
|
||||
group.bench_function("current_fxhash_fast_path", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen =
|
||||
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
|
||||
for (_, value) in &labels {
|
||||
tsid_gen.write_str(black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_tsid_generator_slow_path(c: &mut Criterion) {
|
||||
// Simulate slow path: some labels have null values (empty strings)
|
||||
let labels_with_nulls = vec![
|
||||
("namespace", "greptimedb"),
|
||||
("host", "127.0.0.1"),
|
||||
("region", ""), // null
|
||||
("env", "production"),
|
||||
];
|
||||
|
||||
let labels_all_non_null = vec![
|
||||
("namespace", "greptimedb"),
|
||||
("host", "127.0.0.1"),
|
||||
("env", "production"),
|
||||
];
|
||||
|
||||
let mut group = c.benchmark_group("tsid_generator_slow_path_with_nulls");
|
||||
|
||||
// Original: always hashes name and value
|
||||
group.bench_function("original_mur3_with_nulls", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen = OriginalTsidGenerator::new();
|
||||
for (name, value) in &labels_with_nulls {
|
||||
if !value.is_empty() {
|
||||
tsid_gen.write_label(black_box(name), black_box(value));
|
||||
}
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
// Current slow path: recomputes label name hash
|
||||
group.bench_function("current_fxhash_slow_path", |b| {
|
||||
b.iter(|| {
|
||||
// Step 1: Compute label name hash for non-null labels
|
||||
let mut name_hasher = CurrentTsidGenerator::new();
|
||||
for (name, value) in &labels_with_nulls {
|
||||
if !value.is_empty() {
|
||||
name_hasher.write_str(black_box(name));
|
||||
}
|
||||
}
|
||||
let label_name_hash = name_hasher.finish();
|
||||
|
||||
// Step 2: Use label name hash and hash values
|
||||
let mut tsid_gen = CurrentTsidGenerator::new_with_label_name_hash(label_name_hash);
|
||||
for (_, value) in &labels_with_nulls {
|
||||
if !value.is_empty() {
|
||||
tsid_gen.write_str(black_box(value));
|
||||
}
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
// Current fast path: pre-computed (for comparison)
|
||||
let label_name_hash = compute_label_name_hash(&labels_all_non_null);
|
||||
group.bench_function("current_fxhash_fast_path_no_nulls", |b| {
|
||||
b.iter(|| {
|
||||
let mut tsid_gen =
|
||||
CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash));
|
||||
for (_, value) in &labels_all_non_null {
|
||||
tsid_gen.write_str(black_box(value));
|
||||
}
|
||||
black_box(tsid_gen.finish())
|
||||
})
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(
|
||||
benches,
|
||||
bench_tsid_generator_small,
|
||||
bench_tsid_generator_medium,
|
||||
bench_tsid_generator_large,
|
||||
bench_tsid_generator_slow_path
|
||||
);
|
||||
criterion_main!(benches);
|
||||
@@ -272,15 +272,15 @@ mod tests {
|
||||
.unwrap();
|
||||
let batches = RecordBatches::try_collect(stream).await.unwrap();
|
||||
let expected = "\
|
||||
+-------------------------+----------------+------------+---------------------+-------+
|
||||
| greptime_timestamp | greptime_value | __table_id | __tsid | job |
|
||||
+-------------------------+----------------+------------+---------------------+-------+
|
||||
| 1970-01-01T00:00:00 | 0.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.001 | 1.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.002 | 2.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.003 | 3.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
| 1970-01-01T00:00:00.004 | 4.0 | 3 | 2955007454552897459 | tag_0 |
|
||||
+-------------------------+----------------+------------+---------------------+-------+";
|
||||
+-------------------------+----------------+------------+----------------------+-------+
|
||||
| greptime_timestamp | greptime_value | __table_id | __tsid | job |
|
||||
+-------------------------+----------------+------------+----------------------+-------+
|
||||
| 1970-01-01T00:00:00 | 0.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.001 | 1.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.002 | 2.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.003 | 3.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
| 1970-01-01T00:00:00.004 | 4.0 | 3 | 12881218023286672757 | tag_0 |
|
||||
+-------------------------+----------------+------------+----------------------+-------+";
|
||||
assert_eq!(expected, batches.pretty_print().unwrap(), "physical region");
|
||||
|
||||
// read data from logical region
|
||||
|
||||
@@ -13,12 +13,11 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::hash::Hasher;
|
||||
use std::hash::Hash;
|
||||
|
||||
use api::v1::value::ValueData;
|
||||
use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
|
||||
use datatypes::value::ValueRef;
|
||||
use fxhash::FxHasher;
|
||||
use mito_codec::row_converter::SparsePrimaryKeyCodec;
|
||||
use smallvec::SmallVec;
|
||||
use snafu::ResultExt;
|
||||
@@ -31,6 +30,9 @@ use store_api::storage::{ColumnId, TableId};
|
||||
|
||||
use crate::error::{EncodePrimaryKeySnafu, Result};
|
||||
|
||||
// A random number
|
||||
const TSID_HASH_SEED: u32 = 846793005;
|
||||
|
||||
/// A row modifier modifies [`Rows`].
|
||||
///
|
||||
/// - For [`PrimaryKeyEncoding::Sparse`] encoding,
|
||||
@@ -73,7 +75,6 @@ impl RowModifier {
|
||||
let num_output_column = num_column - num_primary_key_column + 1;
|
||||
|
||||
let mut buffer = vec![];
|
||||
|
||||
for mut iter in iter.iter_mut() {
|
||||
let (table_id, tsid) = Self::fill_internal_columns(table_id, &iter);
|
||||
let mut values = Vec::with_capacity(num_output_column);
|
||||
@@ -146,72 +147,47 @@ impl RowModifier {
|
||||
|
||||
/// Fills internal columns of a row with table name and a hash of tag values.
|
||||
pub fn fill_internal_columns(table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
|
||||
let ts_id = if !iter.has_null_labels() {
|
||||
// No null labels in row, we can safely reuse the precomputed label name hash.
|
||||
let mut ts_id_gen = TsidGenerator::new(iter.index.label_name_hash);
|
||||
for (_, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(string)) = &value.value_data {
|
||||
ts_id_gen.write_str(string);
|
||||
} else {
|
||||
unreachable!(
|
||||
"Should not contain null or non-string value: {:?}, table id: {}",
|
||||
value, table_id
|
||||
);
|
||||
}
|
||||
let mut hasher = TsidGenerator::default();
|
||||
for (name, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(string)) = &value.value_data {
|
||||
hasher.write_label(name, string);
|
||||
}
|
||||
ts_id_gen.finish()
|
||||
} else {
|
||||
// Slow path: row contains null, recompute label hash
|
||||
let mut hasher = TsidGenerator::default();
|
||||
// 1. Find out label names with non-null values and get the hash.
|
||||
for (name, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(_)) = &value.value_data {
|
||||
hasher.write_str(name);
|
||||
}
|
||||
}
|
||||
let label_name_hash = hasher.finish();
|
||||
|
||||
// 2. Use label name hash as seed and continue with label values.
|
||||
let mut final_hasher = TsidGenerator::new(label_name_hash);
|
||||
for (_, value) in iter.primary_keys_with_name() {
|
||||
if let Some(ValueData::StringValue(value)) = &value.value_data {
|
||||
final_hasher.write_str(value);
|
||||
}
|
||||
}
|
||||
final_hasher.finish()
|
||||
};
|
||||
}
|
||||
let hash = hasher.finish();
|
||||
|
||||
(
|
||||
ValueData::U32Value(table_id).into(),
|
||||
ValueData::U64Value(ts_id).into(),
|
||||
ValueData::U64Value(hash).into(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Tsid generator.
|
||||
#[derive(Default)]
|
||||
pub struct TsidGenerator {
|
||||
hasher: FxHasher,
|
||||
hasher: mur3::Hasher128,
|
||||
}
|
||||
|
||||
impl Default for TsidGenerator {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TsidGenerator {
|
||||
pub fn new(label_name_hash: u64) -> Self {
|
||||
let mut hasher = FxHasher::default();
|
||||
hasher.write_u64(label_name_hash);
|
||||
Self { hasher }
|
||||
}
|
||||
|
||||
/// Writes a label pair to the generator.
|
||||
pub fn write_str(&mut self, value: &str) {
|
||||
self.hasher.write(value.as_bytes());
|
||||
self.hasher.write_u8(0xff);
|
||||
pub fn write_label(&mut self, name: &str, value: &str) {
|
||||
name.hash(&mut self.hasher);
|
||||
value.hash(&mut self.hasher);
|
||||
}
|
||||
|
||||
/// Generates a new TSID.
|
||||
pub fn finish(&mut self) -> u64 {
|
||||
self.hasher.finish()
|
||||
// TSID is 64 bits, simply truncate the 128 bits hash
|
||||
let (hash, _) = self.hasher.finish128();
|
||||
hash
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,8 +202,6 @@ struct ValueIndex {
|
||||
struct IterIndex {
|
||||
indices: Vec<ValueIndex>,
|
||||
num_primary_key_column: usize,
|
||||
/// Precomputed hash for label names.
|
||||
label_name_hash: u64,
|
||||
}
|
||||
|
||||
impl IterIndex {
|
||||
@@ -278,22 +252,15 @@ impl IterIndex {
|
||||
}
|
||||
}
|
||||
let num_primary_key_column = primary_key_indices.len() + reserved_indices.len();
|
||||
let mut indices = Vec::with_capacity(num_primary_key_column + 2);
|
||||
indices.extend(reserved_indices);
|
||||
let mut label_name_hasher = TsidGenerator::default();
|
||||
for (pk_name, pk_index) in primary_key_indices {
|
||||
// primary_key_indices already sorted.
|
||||
label_name_hasher.write_str(pk_name);
|
||||
indices.push(pk_index);
|
||||
}
|
||||
let label_name_hash = label_name_hasher.finish();
|
||||
|
||||
indices.extend(ts_index);
|
||||
indices.extend(field_indices);
|
||||
let indices = reserved_indices
|
||||
.into_iter()
|
||||
.chain(primary_key_indices.values().cloned())
|
||||
.chain(ts_index)
|
||||
.chain(field_indices)
|
||||
.collect();
|
||||
IterIndex {
|
||||
indices,
|
||||
num_primary_key_column,
|
||||
label_name_hash,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -347,13 +314,6 @@ impl RowIter<'_> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns true if any label in current row is null.
|
||||
fn has_null_labels(&self) -> bool {
|
||||
self.index.indices[..self.index.num_primary_key_column]
|
||||
.iter()
|
||||
.any(|idx| self.row.values[idx.index].value_data.is_none())
|
||||
}
|
||||
|
||||
/// Returns the primary keys.
|
||||
pub fn primary_keys(&self) -> impl Iterator<Item = (ColumnId, ValueRef<'_>)> {
|
||||
self.index.indices[..self.index.num_primary_key_column]
|
||||
@@ -439,9 +399,9 @@ mod tests {
|
||||
let result = encoder.modify_rows_sparse(rows_iter, table_id).unwrap();
|
||||
assert_eq!(result.rows[0].values.len(), 1);
|
||||
let encoded_primary_key = vec![
|
||||
128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 37, 196, 242, 181, 117, 224, 7, 137, 0,
|
||||
0, 0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
1, 1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
|
||||
128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 131, 9, 166, 190, 173, 37, 39, 240, 0, 0,
|
||||
0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
|
||||
1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
|
||||
];
|
||||
assert_eq!(
|
||||
result.rows[0].values[0],
|
||||
@@ -517,7 +477,7 @@ mod tests {
|
||||
assert_eq!(result.rows[0].values[2], ValueData::U32Value(1025).into());
|
||||
assert_eq!(
|
||||
result.rows[0].values[3],
|
||||
ValueData::U64Value(2721566936019240841).into()
|
||||
ValueData::U64Value(9442261431637846000).into()
|
||||
);
|
||||
assert_eq!(result.schema, expected_dense_schema());
|
||||
}
|
||||
@@ -536,7 +496,7 @@ mod tests {
|
||||
let row_iter = rows_iter.iter_mut().next().unwrap();
|
||||
let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter);
|
||||
assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
|
||||
assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
|
||||
assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into());
|
||||
|
||||
// Change the column order
|
||||
let schema = vec![
|
||||
@@ -564,264 +524,6 @@ mod tests {
|
||||
let row_iter = rows_iter.iter_mut().next().unwrap();
|
||||
let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter);
|
||||
assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
|
||||
assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
|
||||
}
|
||||
|
||||
/// Helper function to create a schema with multiple label columns
|
||||
fn create_multi_label_schema(labels: &[&str]) -> Vec<ColumnSchema> {
|
||||
labels
|
||||
.iter()
|
||||
.map(|name| ColumnSchema {
|
||||
column_name: name.to_string(),
|
||||
datatype: ColumnDataType::String as i32,
|
||||
semantic_type: SemanticType::Tag as _,
|
||||
datatype_extension: None,
|
||||
options: None,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Helper function to create a name_to_column_id map
|
||||
fn create_name_to_column_id(labels: &[&str]) -> HashMap<String, ColumnId> {
|
||||
labels
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, name)| (name.to_string(), idx as ColumnId + 1))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Helper function to create a row with string values
|
||||
fn create_row_with_values(values: &[&str]) -> Row {
|
||||
Row {
|
||||
values: values
|
||||
.iter()
|
||||
.map(|v| ValueData::StringValue(v.to_string()).into())
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to create a row with some null values
|
||||
fn create_row_with_nulls(values: &[Option<&str>]) -> Row {
|
||||
Row {
|
||||
values: values
|
||||
.iter()
|
||||
.map(|v| {
|
||||
v.map(|s| ValueData::StringValue(s.to_string()).into())
|
||||
.unwrap_or(Value { value_data: None })
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to extract TSID from a row
|
||||
fn extract_tsid(
|
||||
schema: Vec<ColumnSchema>,
|
||||
row: Row,
|
||||
name_to_column_id: &HashMap<String, ColumnId>,
|
||||
table_id: TableId,
|
||||
) -> u64 {
|
||||
let rows = Rows {
|
||||
schema,
|
||||
rows: vec![row],
|
||||
};
|
||||
let mut rows_iter = RowsIter::new(rows, name_to_column_id);
|
||||
let row_iter = rows_iter.iter_mut().next().unwrap();
|
||||
let (_, tsid_value) = RowModifier::fill_internal_columns(table_id, &row_iter);
|
||||
match tsid_value.value_data {
|
||||
Some(ValueData::U64Value(tsid)) => tsid,
|
||||
_ => panic!("Expected U64Value for TSID"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_same_for_different_label_orders() {
|
||||
// Test that rows with the same label name-value pairs but in different orders
|
||||
// produce the same TSID
|
||||
let table_id = 1025;
|
||||
|
||||
// Schema 1: a, b, c
|
||||
let schema1 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row1 = create_row_with_values(&["A", "B", "C"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Schema 2: b, a, c (different order)
|
||||
let schema2 = create_multi_label_schema(&["b", "a", "c"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row2 = create_row_with_values(&["B", "A", "C"]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
// Schema 3: c, b, a (another different order)
|
||||
let schema3 = create_multi_label_schema(&["c", "b", "a"]);
|
||||
let name_to_column_id3 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row3 = create_row_with_values(&["C", "B", "A"]);
|
||||
let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
|
||||
|
||||
// All should have the same TSID since label names are sorted lexicographically
|
||||
// and we're using the same label name-value pairs
|
||||
assert_eq!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be same for different column orders"
|
||||
);
|
||||
assert_eq!(
|
||||
tsid2, tsid3,
|
||||
"TSID should be same for different column orders"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_same_with_null_labels() {
|
||||
// Test that rows that differ only by null label values produce the same TSID
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B (no nulls, fast path)
|
||||
let schema1 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
|
||||
let row1 = create_row_with_values(&["A", "B"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=B, c=null (has null, slow path)
|
||||
let schema2 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
// Both should have the same TSID since null labels are ignored
|
||||
assert_eq!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be same when only difference is null label values"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_same_with_multiple_null_labels() {
|
||||
// Test with multiple null labels
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B (no nulls)
|
||||
let schema1 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
|
||||
let row1 = create_row_with_values(&["A", "B"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=B, c=null, d=null (multiple nulls)
|
||||
let schema2 = create_multi_label_schema(&["a", "b", "c", "d"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c", "d"]);
|
||||
let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None, None]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
assert_eq!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be same when only difference is multiple null label values"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_different_with_different_non_null_values() {
|
||||
// Test that rows with different non-null values produce different TSIDs
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B
|
||||
let schema1 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b"]);
|
||||
let row1 = create_row_with_values(&["A", "B"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=C (different value for b)
|
||||
let schema2 = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b"]);
|
||||
let row2 = create_row_with_values(&["A", "C"]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
assert_ne!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be different when label values differ"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_fast_path_vs_slow_path_consistency() {
|
||||
// Test that fast path (no nulls) and slow path (with nulls) produce
|
||||
// the same TSID for the same non-null label values
|
||||
let table_id = 1025;
|
||||
|
||||
// Fast path: a=A, b=B (no nulls)
|
||||
let schema_fast = create_multi_label_schema(&["a", "b"]);
|
||||
let name_to_column_id_fast = create_name_to_column_id(&["a", "b"]);
|
||||
let row_fast = create_row_with_values(&["A", "B"]);
|
||||
let tsid_fast = extract_tsid(schema_fast, row_fast, &name_to_column_id_fast, table_id);
|
||||
|
||||
// Slow path: a=A, b=B, c=null (has null, triggers slow path)
|
||||
let schema_slow = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id_slow = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row_slow = create_row_with_nulls(&[Some("A"), Some("B"), None]);
|
||||
let tsid_slow = extract_tsid(schema_slow, row_slow, &name_to_column_id_slow, table_id);
|
||||
|
||||
assert_eq!(
|
||||
tsid_fast, tsid_slow,
|
||||
"Fast path and slow path should produce same TSID for same non-null values"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_with_null_in_middle() {
|
||||
// Test with null in the middle of labels
|
||||
let table_id = 1025;
|
||||
|
||||
// Row 1: a=A, b=B, c=C
|
||||
let schema1 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row1 = create_row_with_values(&["A", "B", "C"]);
|
||||
let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
|
||||
|
||||
// Row 2: a=A, b=null, c=C (null in middle)
|
||||
let schema2 = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row2 = create_row_with_nulls(&[Some("A"), None, Some("C")]);
|
||||
let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
|
||||
|
||||
// Should be different because b is null in row2 but B in row1
|
||||
// Actually wait, let me reconsider - if b is null, it should be ignored
|
||||
// So row2 should be equivalent to a=A, c=C
|
||||
// But row1 is a=A, b=B, c=C, so they should be different
|
||||
assert_ne!(
|
||||
tsid1, tsid2,
|
||||
"TSID should be different when a non-null value becomes null"
|
||||
);
|
||||
|
||||
// Row 3: a=A, c=C (no b at all, equivalent to row2)
|
||||
let schema3 = create_multi_label_schema(&["a", "c"]);
|
||||
let name_to_column_id3 = create_name_to_column_id(&["a", "c"]);
|
||||
let row3 = create_row_with_values(&["A", "C"]);
|
||||
let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
|
||||
|
||||
// Row2 (a=A, b=null, c=C) should be same as row3 (a=A, c=C)
|
||||
assert_eq!(
|
||||
tsid2, tsid3,
|
||||
"TSID should be same when null label is ignored"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tsid_all_null_labels() {
|
||||
// Test with all labels being null
|
||||
let table_id = 1025;
|
||||
|
||||
// Row with all nulls
|
||||
let schema = create_multi_label_schema(&["a", "b", "c"]);
|
||||
let name_to_column_id = create_name_to_column_id(&["a", "b", "c"]);
|
||||
let row = create_row_with_nulls(&[None, None, None]);
|
||||
let tsid = extract_tsid(schema.clone(), row, &name_to_column_id, table_id);
|
||||
|
||||
// Should still produce a TSID (based on label names only when all values are null)
|
||||
// This tests that the slow path handles the case where all values are null
|
||||
// The TSID will be based on the label name hash only
|
||||
// Test that it's consistent - same schema with all nulls should produce same TSID
|
||||
let row2 = create_row_with_nulls(&[None, None, None]);
|
||||
let tsid2 = extract_tsid(schema, row2, &name_to_column_id, table_id);
|
||||
assert_eq!(
|
||||
tsid, tsid2,
|
||||
"TSID should be consistent when all label values are null"
|
||||
);
|
||||
assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,6 @@ mod cache_size;
|
||||
|
||||
pub(crate) mod file_cache;
|
||||
pub(crate) mod index;
|
||||
pub(crate) mod manifest_cache;
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_util;
|
||||
pub(crate) mod write_cache;
|
||||
@@ -45,7 +44,6 @@ use crate::cache::write_cache::WriteCacheRef;
|
||||
use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
|
||||
use crate::read::Batch;
|
||||
use crate::sst::file::RegionFileId;
|
||||
use crate::sst::parquet::reader::MetadataCacheMetrics;
|
||||
|
||||
/// Metrics type key for sst meta.
|
||||
const SST_META_TYPE: &str = "sst_meta";
|
||||
@@ -76,24 +74,19 @@ pub enum CacheStrategy {
|
||||
}
|
||||
|
||||
impl CacheStrategy {
|
||||
/// Gets parquet metadata with cache metrics tracking.
|
||||
/// Returns the metadata and updates the provided metrics.
|
||||
pub(crate) async fn get_parquet_meta_data(
|
||||
/// Calls [CacheManager::get_parquet_meta_data()].
|
||||
pub async fn get_parquet_meta_data(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
metrics: &mut MetadataCacheMetrics,
|
||||
) -> Option<Arc<ParquetMetaData>> {
|
||||
match self {
|
||||
CacheStrategy::EnableAll(cache_manager) => {
|
||||
cache_manager.get_parquet_meta_data(file_id, metrics).await
|
||||
cache_manager.get_parquet_meta_data(file_id).await
|
||||
}
|
||||
CacheStrategy::Compaction(cache_manager) => {
|
||||
cache_manager.get_parquet_meta_data(file_id, metrics).await
|
||||
}
|
||||
CacheStrategy::Disabled => {
|
||||
metrics.cache_miss += 1;
|
||||
None
|
||||
cache_manager.get_parquet_meta_data(file_id).await
|
||||
}
|
||||
CacheStrategy::Disabled => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -298,17 +291,16 @@ impl CacheManager {
|
||||
CacheManagerBuilder::default()
|
||||
}
|
||||
|
||||
/// Gets cached [ParquetMetaData] with metrics tracking.
|
||||
/// Tries in-memory cache first, then file cache, updating metrics accordingly.
|
||||
pub(crate) async fn get_parquet_meta_data(
|
||||
/// Gets cached [ParquetMetaData] from in-memory cache first.
|
||||
/// If not found, tries to get it from write cache and fill the in-memory cache.
|
||||
pub async fn get_parquet_meta_data(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
metrics: &mut MetadataCacheMetrics,
|
||||
) -> Option<Arc<ParquetMetaData>> {
|
||||
// Try to get metadata from sst meta cache
|
||||
if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) {
|
||||
metrics.mem_cache_hit += 1;
|
||||
return Some(metadata);
|
||||
let metadata = self.get_parquet_meta_data_from_mem_cache(file_id);
|
||||
if metadata.is_some() {
|
||||
return metadata;
|
||||
}
|
||||
|
||||
// Try to get metadata from write cache
|
||||
@@ -316,13 +308,11 @@ impl CacheManager {
|
||||
if let Some(write_cache) = &self.write_cache
|
||||
&& let Some(metadata) = write_cache.file_cache().get_parquet_meta_data(key).await
|
||||
{
|
||||
metrics.file_cache_hit += 1;
|
||||
let metadata = Arc::new(metadata);
|
||||
// Put metadata into sst meta cache
|
||||
self.put_parquet_meta_data(file_id, metadata.clone());
|
||||
return Some(metadata);
|
||||
};
|
||||
metrics.cache_miss += 1;
|
||||
|
||||
None
|
||||
}
|
||||
@@ -835,14 +825,8 @@ mod tests {
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let file_id = RegionFileId::new(region_id, FileId::random());
|
||||
let metadata = parquet_meta();
|
||||
let mut metrics = MetadataCacheMetrics::default();
|
||||
cache.put_parquet_meta_data(file_id, metadata);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_none());
|
||||
|
||||
let value = Value::Int64(10);
|
||||
let vector: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10]));
|
||||
@@ -864,30 +848,14 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_parquet_meta_cache() {
|
||||
let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
|
||||
let mut metrics = MetadataCacheMetrics::default();
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let file_id = RegionFileId::new(region_id, FileId::random());
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_none());
|
||||
let metadata = parquet_meta();
|
||||
cache.put_parquet_meta_data(file_id, metadata);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_some()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_some());
|
||||
cache.remove_parquet_meta_data(file_id);
|
||||
assert!(
|
||||
cache
|
||||
.get_parquet_meta_data(file_id, &mut metrics)
|
||||
.await
|
||||
.is_none()
|
||||
);
|
||||
assert!(cache.get_parquet_meta_data(file_id).await.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
708
src/mito2/src/cache/file_cache.rs
vendored
708
src/mito2/src/cache/file_cache.rs
vendored
@@ -55,18 +55,109 @@ pub(crate) const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20;
|
||||
/// Minimum capacity for each cache (512MB).
|
||||
const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024;
|
||||
|
||||
/// Inner struct for FileCache that can be used in spawned tasks.
|
||||
/// A file cache manages files on local store and evict files based
|
||||
/// on size.
|
||||
#[derive(Debug)]
|
||||
struct FileCacheInner {
|
||||
pub(crate) struct FileCache {
|
||||
/// Local store to cache files.
|
||||
local_store: ObjectStore,
|
||||
/// Index to track cached Parquet files.
|
||||
parquet_index: Cache<IndexKey, IndexValue>,
|
||||
/// Index to track cached Puffin files.
|
||||
puffin_index: Cache<IndexKey, IndexValue>,
|
||||
/// Capacity of the puffin (index) cache in bytes.
|
||||
puffin_capacity: u64,
|
||||
}
|
||||
|
||||
impl FileCacheInner {
|
||||
pub(crate) type FileCacheRef = Arc<FileCache>;
|
||||
|
||||
impl FileCache {
|
||||
/// Creates a new file cache.
|
||||
pub(crate) fn new(
|
||||
local_store: ObjectStore,
|
||||
capacity: ReadableSize,
|
||||
ttl: Option<Duration>,
|
||||
index_cache_percent: Option<u8>,
|
||||
) -> FileCache {
|
||||
// Validate and use the provided percent or default
|
||||
let index_percent = index_cache_percent
|
||||
.filter(|&percent| percent > 0 && percent < 100)
|
||||
.unwrap_or(DEFAULT_INDEX_CACHE_PERCENT);
|
||||
let total_capacity = capacity.as_bytes();
|
||||
|
||||
// Convert percent to ratio and calculate capacity for each cache
|
||||
let index_ratio = index_percent as f64 / 100.0;
|
||||
let puffin_capacity = (total_capacity as f64 * index_ratio) as u64;
|
||||
let parquet_capacity = total_capacity - puffin_capacity;
|
||||
|
||||
// Ensure both capacities are at least 512MB
|
||||
let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY);
|
||||
let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY);
|
||||
|
||||
info!(
|
||||
"Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}",
|
||||
index_percent,
|
||||
ReadableSize(total_capacity),
|
||||
ReadableSize(parquet_capacity),
|
||||
ReadableSize(puffin_capacity)
|
||||
);
|
||||
|
||||
let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file");
|
||||
let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index");
|
||||
|
||||
FileCache {
|
||||
local_store,
|
||||
parquet_index,
|
||||
puffin_index,
|
||||
puffin_capacity,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a cache for a specific file type.
|
||||
fn build_cache(
|
||||
local_store: ObjectStore,
|
||||
capacity: u64,
|
||||
ttl: Option<Duration>,
|
||||
label: &'static str,
|
||||
) -> Cache<IndexKey, IndexValue> {
|
||||
let cache_store = local_store;
|
||||
let mut builder = Cache::builder()
|
||||
.eviction_policy(EvictionPolicy::lru())
|
||||
.weigher(|_key, value: &IndexValue| -> u32 {
|
||||
// We only measure space on local store.
|
||||
value.file_size
|
||||
})
|
||||
.max_capacity(capacity)
|
||||
.async_eviction_listener(move |key, value, cause| {
|
||||
let store = cache_store.clone();
|
||||
// Stores files under FILE_DIR.
|
||||
let file_path = cache_file_path(FILE_DIR, *key);
|
||||
async move {
|
||||
if let RemovalCause::Replaced = cause {
|
||||
// The cache is replaced by another file. This is unexpected, we don't remove the same
|
||||
// file but updates the metrics as the file is already replaced by users.
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id);
|
||||
return;
|
||||
}
|
||||
|
||||
match store.delete(&file_path).await {
|
||||
Ok(()) => {
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
});
|
||||
if let Some(ttl) = ttl {
|
||||
builder = builder.time_to_idle(ttl);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
/// Returns the appropriate memory index for the given file type.
|
||||
fn memory_index(&self, file_type: FileType) -> &Cache<IndexKey, IndexValue> {
|
||||
match file_type {
|
||||
@@ -75,15 +166,10 @@ impl FileCacheInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
fn cache_file_path(&self, key: IndexKey) -> String {
|
||||
cache_file_path(FILE_DIR, key)
|
||||
}
|
||||
|
||||
/// Puts a file into the cache index.
|
||||
///
|
||||
/// The `WriteCache` should ensure the file is in the correct path.
|
||||
async fn put(&self, key: IndexKey, value: IndexValue) {
|
||||
pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.add(value.file_size.into());
|
||||
@@ -94,8 +180,100 @@ impl FileCacheInner {
|
||||
index.run_pending_tasks().await;
|
||||
}
|
||||
|
||||
/// Recovers the index from local store.
|
||||
async fn recover(&self) -> Result<()> {
|
||||
pub(crate) async fn get(&self, key: IndexKey) -> Option<IndexValue> {
|
||||
self.memory_index(key.file_type).get(&key).await
|
||||
}
|
||||
|
||||
/// Reads a file from the cache.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn reader(&self, key: IndexKey) -> Option<Reader> {
|
||||
// We must use `get()` to update the estimator of the cache.
|
||||
// See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key
|
||||
let index = self.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.cache_file_path(key);
|
||||
match self.get_reader(&file_path).await {
|
||||
Ok(Some(reader)) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return Some(reader);
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
|
||||
/// Reads ranges from the cache.
|
||||
pub(crate) async fn read_ranges(
|
||||
&self,
|
||||
key: IndexKey,
|
||||
ranges: &[Range<u64>],
|
||||
) -> Option<Vec<Bytes>> {
|
||||
let index = self.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.cache_file_path(key);
|
||||
// In most cases, it will use blocking read,
|
||||
// because FileCache is normally based on local file system, which supports blocking read.
|
||||
let bytes_result = fetch_byte_ranges(&file_path, self.local_store.clone(), ranges).await;
|
||||
match bytes_result {
|
||||
Ok(bytes) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(bytes)
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes a file from the cache explicitly.
|
||||
/// It always tries to remove the file from the local store because we may not have the file
|
||||
/// in the memory index if upload is failed.
|
||||
pub(crate) async fn remove(&self, key: IndexKey) {
|
||||
let file_path = self.cache_file_path(key);
|
||||
self.memory_index(key.file_type).remove(&key).await;
|
||||
// Always delete the file from the local store.
|
||||
if let Err(e) = self.local_store.delete(&file_path).await {
|
||||
warn!(e; "Failed to delete a cached file {}", file_path);
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover_inner(&self) -> Result<()> {
|
||||
let now = Instant::now();
|
||||
let mut lister = self
|
||||
.local_store
|
||||
@@ -163,7 +341,136 @@ impl FileCacheInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Downloads a file without cleaning up on error.
|
||||
/// Recovers the index from local store.
|
||||
///
|
||||
/// If `task_receiver` is provided, spawns a background task after recovery
|
||||
/// to process `RegionLoadCacheTask` messages for loading files into the cache.
|
||||
pub(crate) async fn recover(
|
||||
self: &Arc<Self>,
|
||||
sync: bool,
|
||||
task_receiver: Option<UnboundedReceiver<RegionLoadCacheTask>>,
|
||||
) {
|
||||
let moved_self = self.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(err) = moved_self.recover_inner().await {
|
||||
error!(err; "Failed to recover file cache.")
|
||||
}
|
||||
|
||||
// Spawns background task to process region load cache tasks after recovery.
|
||||
// So it won't block the recovery when `sync` is true.
|
||||
if let Some(mut receiver) = task_receiver {
|
||||
let cache_ref = moved_self.clone();
|
||||
info!("Spawning background task for processing region load cache tasks");
|
||||
tokio::spawn(async move {
|
||||
while let Some(task) = receiver.recv().await {
|
||||
let file_cache = cache_ref.clone();
|
||||
task.fill_cache(file_cache).await;
|
||||
}
|
||||
info!("Background task for processing region load cache tasks stopped");
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if sync {
|
||||
let _ = handle.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
pub(crate) fn cache_file_path(&self, key: IndexKey) -> String {
|
||||
cache_file_path(FILE_DIR, key)
|
||||
}
|
||||
|
||||
/// Returns the local store of the file cache.
|
||||
pub(crate) fn local_store(&self) -> ObjectStore {
|
||||
self.local_store.clone()
|
||||
}
|
||||
|
||||
/// Get the parquet metadata in file cache.
|
||||
/// If the file is not in the cache or fail to load metadata, return None.
|
||||
pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option<ParquetMetaData> {
|
||||
// Check if file cache contains the key
|
||||
if let Some(index_value) = self.parquet_index.get(&key).await {
|
||||
// Load metadata from file cache
|
||||
let local_store = self.local_store();
|
||||
let file_path = self.cache_file_path(key);
|
||||
let file_size = index_value.file_size as u64;
|
||||
let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size);
|
||||
|
||||
match metadata_loader.load().await {
|
||||
Ok(metadata) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(metadata)
|
||||
}
|
||||
Err(e) => {
|
||||
if !e.is_object_not_found() {
|
||||
warn!(
|
||||
e; "Failed to get parquet metadata for key {:?}",
|
||||
key
|
||||
);
|
||||
}
|
||||
// We removes the file from the index.
|
||||
self.parquet_index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
|
||||
if self.local_store.exists(file_path).await? {
|
||||
Ok(Some(self.local_store.reader(file_path).await?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the key is in the file cache.
|
||||
pub(crate) fn contains_key(&self, key: &IndexKey) -> bool {
|
||||
self.memory_index(key.file_type).contains_key(key)
|
||||
}
|
||||
|
||||
/// Returns the capacity of the puffin (index) cache in bytes.
|
||||
pub(crate) fn puffin_cache_capacity(&self) -> u64 {
|
||||
self.puffin_capacity
|
||||
}
|
||||
|
||||
/// Returns the current weighted size (used bytes) of the puffin (index) cache.
|
||||
pub(crate) fn puffin_cache_size(&self) -> u64 {
|
||||
self.puffin_index.weighted_size()
|
||||
}
|
||||
|
||||
/// Downloads a file in `remote_path` from the remote object store to the local cache
|
||||
/// (specified by `index_key`).
|
||||
pub(crate) async fn download(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
remote_path: &str,
|
||||
remote_store: &ObjectStore,
|
||||
file_size: u64,
|
||||
) -> Result<()> {
|
||||
if let Err(e) = self
|
||||
.download_without_cleaning(index_key, remote_path, remote_store, file_size)
|
||||
.await
|
||||
{
|
||||
let filename = index_key.to_string();
|
||||
TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await;
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download_without_cleaning(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
@@ -230,360 +537,11 @@ impl FileCacheInner {
|
||||
self.put(index_key, index_value).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Downloads a file from remote store to local cache.
|
||||
async fn download(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
remote_path: &str,
|
||||
remote_store: &ObjectStore,
|
||||
file_size: u64,
|
||||
) -> Result<()> {
|
||||
if let Err(e) = self
|
||||
.download_without_cleaning(index_key, remote_path, remote_store, file_size)
|
||||
.await
|
||||
{
|
||||
let filename = index_key.to_string();
|
||||
TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await;
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A file cache manages files on local store and evict files based
|
||||
/// on size.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct FileCache {
|
||||
/// Inner cache state shared with background worker.
|
||||
inner: Arc<FileCacheInner>,
|
||||
/// Capacity of the puffin (index) cache in bytes.
|
||||
puffin_capacity: u64,
|
||||
}
|
||||
|
||||
pub(crate) type FileCacheRef = Arc<FileCache>;
|
||||
|
||||
impl FileCache {
|
||||
/// Creates a new file cache.
|
||||
pub(crate) fn new(
|
||||
local_store: ObjectStore,
|
||||
capacity: ReadableSize,
|
||||
ttl: Option<Duration>,
|
||||
index_cache_percent: Option<u8>,
|
||||
) -> FileCache {
|
||||
// Validate and use the provided percent or default
|
||||
let index_percent = index_cache_percent
|
||||
.filter(|&percent| percent > 0 && percent < 100)
|
||||
.unwrap_or(DEFAULT_INDEX_CACHE_PERCENT);
|
||||
let total_capacity = capacity.as_bytes();
|
||||
|
||||
// Convert percent to ratio and calculate capacity for each cache
|
||||
let index_ratio = index_percent as f64 / 100.0;
|
||||
let puffin_capacity = (total_capacity as f64 * index_ratio) as u64;
|
||||
let parquet_capacity = total_capacity - puffin_capacity;
|
||||
|
||||
// Ensure both capacities are at least 512MB
|
||||
let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY);
|
||||
let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY);
|
||||
|
||||
info!(
|
||||
"Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}",
|
||||
index_percent,
|
||||
ReadableSize(total_capacity),
|
||||
ReadableSize(parquet_capacity),
|
||||
ReadableSize(puffin_capacity)
|
||||
);
|
||||
|
||||
let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file");
|
||||
let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index");
|
||||
|
||||
// Create inner cache shared with background worker
|
||||
let inner = Arc::new(FileCacheInner {
|
||||
local_store,
|
||||
parquet_index,
|
||||
puffin_index,
|
||||
});
|
||||
|
||||
FileCache {
|
||||
inner,
|
||||
puffin_capacity,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a cache for a specific file type.
|
||||
fn build_cache(
|
||||
local_store: ObjectStore,
|
||||
capacity: u64,
|
||||
ttl: Option<Duration>,
|
||||
label: &'static str,
|
||||
) -> Cache<IndexKey, IndexValue> {
|
||||
let cache_store = local_store;
|
||||
let mut builder = Cache::builder()
|
||||
.eviction_policy(EvictionPolicy::lru())
|
||||
.weigher(|_key, value: &IndexValue| -> u32 {
|
||||
// We only measure space on local store.
|
||||
value.file_size
|
||||
})
|
||||
.max_capacity(capacity)
|
||||
.async_eviction_listener(move |key, value, cause| {
|
||||
let store = cache_store.clone();
|
||||
// Stores files under FILE_DIR.
|
||||
let file_path = cache_file_path(FILE_DIR, *key);
|
||||
async move {
|
||||
if let RemovalCause::Replaced = cause {
|
||||
// The cache is replaced by another file. This is unexpected, we don't remove the same
|
||||
// file but updates the metrics as the file is already replaced by users.
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
// TODO(yingwen): Don't log warn later.
|
||||
warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id);
|
||||
return;
|
||||
}
|
||||
|
||||
match store.delete(&file_path).await {
|
||||
Ok(()) => {
|
||||
CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
});
|
||||
if let Some(ttl) = ttl {
|
||||
builder = builder.time_to_idle(ttl);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
/// Puts a file into the cache index.
|
||||
///
|
||||
/// The `WriteCache` should ensure the file is in the correct path.
|
||||
pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) {
|
||||
self.inner.put(key, value).await
|
||||
}
|
||||
|
||||
pub(crate) async fn get(&self, key: IndexKey) -> Option<IndexValue> {
|
||||
self.inner.memory_index(key.file_type).get(&key).await
|
||||
}
|
||||
|
||||
/// Reads a file from the cache.
|
||||
#[allow(unused)]
|
||||
pub(crate) async fn reader(&self, key: IndexKey) -> Option<Reader> {
|
||||
// We must use `get()` to update the estimator of the cache.
|
||||
// See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key
|
||||
let index = self.inner.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
match self.get_reader(&file_path).await {
|
||||
Ok(Some(reader)) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return Some(reader);
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
|
||||
/// Reads ranges from the cache.
|
||||
pub(crate) async fn read_ranges(
|
||||
&self,
|
||||
key: IndexKey,
|
||||
ranges: &[Range<u64>],
|
||||
) -> Option<Vec<Bytes>> {
|
||||
let index = self.inner.memory_index(key.file_type);
|
||||
if index.get(&key).await.is_none() {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
// In most cases, it will use blocking read,
|
||||
// because FileCache is normally based on local file system, which supports blocking read.
|
||||
let bytes_result =
|
||||
fetch_byte_ranges(&file_path, self.inner.local_store.clone(), ranges).await;
|
||||
match bytes_result {
|
||||
Ok(bytes) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(bytes)
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() != ErrorKind::NotFound {
|
||||
warn!(e; "Failed to get file for key {:?}", key);
|
||||
}
|
||||
|
||||
// We removes the file from the index.
|
||||
index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes a file from the cache explicitly.
|
||||
/// It always tries to remove the file from the local store because we may not have the file
|
||||
/// in the memory index if upload is failed.
|
||||
pub(crate) async fn remove(&self, key: IndexKey) {
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
self.inner.memory_index(key.file_type).remove(&key).await;
|
||||
// Always delete the file from the local store.
|
||||
if let Err(e) = self.inner.local_store.delete(&file_path).await {
|
||||
warn!(e; "Failed to delete a cached file {}", file_path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Recovers the index from local store.
|
||||
///
|
||||
/// If `task_receiver` is provided, spawns a background task after recovery
|
||||
/// to process `RegionLoadCacheTask` messages for loading files into the cache.
|
||||
pub(crate) async fn recover(
|
||||
&self,
|
||||
sync: bool,
|
||||
task_receiver: Option<UnboundedReceiver<RegionLoadCacheTask>>,
|
||||
) {
|
||||
let moved_self = self.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(err) = moved_self.inner.recover().await {
|
||||
error!(err; "Failed to recover file cache.")
|
||||
}
|
||||
|
||||
// Spawns background task to process region load cache tasks after recovery.
|
||||
// So it won't block the recovery when `sync` is true.
|
||||
if let Some(mut receiver) = task_receiver {
|
||||
info!("Spawning background task for processing region load cache tasks");
|
||||
tokio::spawn(async move {
|
||||
while let Some(task) = receiver.recv().await {
|
||||
task.fill_cache(&moved_self).await;
|
||||
}
|
||||
info!("Background task for processing region load cache tasks stopped");
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if sync {
|
||||
let _ = handle.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
pub(crate) fn cache_file_path(&self, key: IndexKey) -> String {
|
||||
self.inner.cache_file_path(key)
|
||||
}
|
||||
|
||||
/// Returns the local store of the file cache.
|
||||
pub(crate) fn local_store(&self) -> ObjectStore {
|
||||
self.inner.local_store.clone()
|
||||
}
|
||||
|
||||
/// Get the parquet metadata in file cache.
|
||||
/// If the file is not in the cache or fail to load metadata, return None.
|
||||
pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option<ParquetMetaData> {
|
||||
// Check if file cache contains the key
|
||||
if let Some(index_value) = self.inner.parquet_index.get(&key).await {
|
||||
// Load metadata from file cache
|
||||
let local_store = self.local_store();
|
||||
let file_path = self.inner.cache_file_path(key);
|
||||
let file_size = index_value.file_size as u64;
|
||||
let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size);
|
||||
|
||||
match metadata_loader.load().await {
|
||||
Ok(metadata) => {
|
||||
CACHE_HIT
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
Some(metadata)
|
||||
}
|
||||
Err(e) => {
|
||||
if !e.is_object_not_found() {
|
||||
warn!(
|
||||
e; "Failed to get parquet metadata for key {:?}",
|
||||
key
|
||||
);
|
||||
}
|
||||
// We removes the file from the index.
|
||||
self.inner.parquet_index.remove(&key).await;
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
CACHE_MISS
|
||||
.with_label_values(&[key.file_type.metric_label()])
|
||||
.inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
|
||||
if self.inner.local_store.exists(file_path).await? {
|
||||
Ok(Some(self.inner.local_store.reader(file_path).await?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the key is in the file cache.
|
||||
pub(crate) fn contains_key(&self, key: &IndexKey) -> bool {
|
||||
self.inner.memory_index(key.file_type).contains_key(key)
|
||||
}
|
||||
|
||||
/// Returns the capacity of the puffin (index) cache in bytes.
|
||||
pub(crate) fn puffin_cache_capacity(&self) -> u64 {
|
||||
self.puffin_capacity
|
||||
}
|
||||
|
||||
/// Returns the current weighted size (used bytes) of the puffin (index) cache.
|
||||
pub(crate) fn puffin_cache_size(&self) -> u64 {
|
||||
self.inner.puffin_index.weighted_size()
|
||||
}
|
||||
|
||||
/// Downloads a file in `remote_path` from the remote object store to the local cache
|
||||
/// (specified by `index_key`).
|
||||
pub(crate) async fn download(
|
||||
&self,
|
||||
index_key: IndexKey,
|
||||
remote_path: &str,
|
||||
remote_store: &ObjectStore,
|
||||
file_size: u64,
|
||||
) -> Result<()> {
|
||||
self.inner
|
||||
.download(index_key, remote_path, remote_store, file_size)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Key of file cache index.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct IndexKey {
|
||||
pub(crate) struct IndexKey {
|
||||
pub region_id: RegionId,
|
||||
pub file_id: FileId,
|
||||
pub file_type: FileType,
|
||||
@@ -725,7 +683,7 @@ mod tests {
|
||||
let exist = cache.reader(key).await;
|
||||
assert!(exist.is_some());
|
||||
tokio::time::sleep(Duration::from_millis(15)).await;
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
let non = cache.reader(key).await;
|
||||
assert!(non.is_none());
|
||||
}
|
||||
@@ -763,19 +721,19 @@ mod tests {
|
||||
assert_eq!("hello", String::from_utf8(buf).unwrap());
|
||||
|
||||
// Get weighted size.
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(5, cache.inner.parquet_index.weighted_size());
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(5, cache.parquet_index.weighted_size());
|
||||
|
||||
// Remove the file.
|
||||
cache.remove(key).await;
|
||||
assert!(cache.reader(key).await.is_none());
|
||||
|
||||
// Ensure all pending tasks of the moka cache is done before assertion.
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
|
||||
// The file also not exists.
|
||||
assert!(!local_store.exists(&file_path).await.unwrap());
|
||||
assert_eq!(0, cache.inner.parquet_index.weighted_size());
|
||||
assert_eq!(0, cache.parquet_index.weighted_size());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -808,7 +766,7 @@ mod tests {
|
||||
// Reader is none.
|
||||
assert!(cache.reader(key).await.is_none());
|
||||
// Key is removed.
|
||||
assert!(!cache.inner.parquet_index.contains_key(&key));
|
||||
assert!(!cache.parquet_index.contains_key(&key));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -841,7 +799,12 @@ mod tests {
|
||||
}
|
||||
|
||||
// Recover the cache.
|
||||
let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None);
|
||||
let cache = Arc::new(FileCache::new(
|
||||
local_store.clone(),
|
||||
ReadableSize::mb(10),
|
||||
None,
|
||||
None,
|
||||
));
|
||||
// No entry before recovery.
|
||||
assert!(
|
||||
cache
|
||||
@@ -852,11 +815,8 @@ mod tests {
|
||||
cache.recover(true, None).await;
|
||||
|
||||
// Check size.
|
||||
cache.inner.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(
|
||||
total_size,
|
||||
cache.inner.parquet_index.weighted_size() as usize
|
||||
);
|
||||
cache.parquet_index.run_pending_tasks().await;
|
||||
assert_eq!(total_size, cache.parquet_index.weighted_size() as usize);
|
||||
|
||||
for (i, file_id) in file_ids.iter().enumerate() {
|
||||
let key = IndexKey::new(region_id, *file_id, file_type);
|
||||
|
||||
42
src/mito2/src/cache/index.rs
vendored
42
src/mito2/src/cache/index.rs
vendored
@@ -31,29 +31,6 @@ const INDEX_METADATA_TYPE: &str = "index_metadata";
|
||||
/// Metrics for index content.
|
||||
const INDEX_CONTENT_TYPE: &str = "index_content";
|
||||
|
||||
/// Metrics collected from IndexCache operations.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct IndexCacheMetrics {
|
||||
/// Number of cache hits.
|
||||
pub cache_hit: usize,
|
||||
/// Number of cache misses.
|
||||
pub cache_miss: usize,
|
||||
/// Number of pages accessed.
|
||||
pub num_pages: usize,
|
||||
/// Total bytes from pages.
|
||||
pub page_bytes: u64,
|
||||
}
|
||||
|
||||
impl IndexCacheMetrics {
|
||||
/// Merges another set of metrics into this one.
|
||||
pub fn merge(&mut self, other: &Self) {
|
||||
self.cache_hit += other.cache_hit;
|
||||
self.cache_miss += other.cache_miss;
|
||||
self.num_pages += other.num_pages;
|
||||
self.page_bytes += other.page_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct PageKey {
|
||||
page_id: u64,
|
||||
@@ -183,20 +160,18 @@ where
|
||||
offset: u64,
|
||||
size: u32,
|
||||
load: F,
|
||||
) -> Result<(Vec<u8>, IndexCacheMetrics), E>
|
||||
) -> Result<Vec<u8>, E>
|
||||
where
|
||||
F: Fn(Vec<Range<u64>>) -> Fut,
|
||||
Fut: Future<Output = Result<Vec<Bytes>, E>>,
|
||||
E: std::error::Error,
|
||||
{
|
||||
let mut metrics = IndexCacheMetrics::default();
|
||||
let page_keys =
|
||||
PageKey::generate_page_keys(offset, size, self.page_size).collect::<Vec<_>>();
|
||||
// Size is 0, return empty data.
|
||||
if page_keys.is_empty() {
|
||||
return Ok((Vec::new(), metrics));
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
metrics.num_pages = page_keys.len();
|
||||
let mut data = Vec::with_capacity(page_keys.len());
|
||||
data.resize(page_keys.len(), Bytes::new());
|
||||
let mut cache_miss_range = vec![];
|
||||
@@ -207,13 +182,10 @@ where
|
||||
match self.get_page(key, *page_key) {
|
||||
Some(page) => {
|
||||
CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
metrics.cache_hit += 1;
|
||||
metrics.page_bytes += page.len() as u64;
|
||||
data[i] = page;
|
||||
}
|
||||
None => {
|
||||
CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc();
|
||||
metrics.cache_miss += 1;
|
||||
let base_offset = page_key.page_id * self.page_size;
|
||||
let pruned_size = if i == last_index {
|
||||
prune_size(page_keys.iter(), file_size, self.page_size)
|
||||
@@ -229,18 +201,14 @@ where
|
||||
let pages = load(cache_miss_range).await?;
|
||||
for (i, page) in cache_miss_idx.into_iter().zip(pages.into_iter()) {
|
||||
let page_key = page_keys[i];
|
||||
metrics.page_bytes += page.len() as u64;
|
||||
data[i] = page.clone();
|
||||
self.put_page(key, page_key, page.clone());
|
||||
}
|
||||
}
|
||||
let buffer = Buffer::from_iter(data.into_iter());
|
||||
Ok((
|
||||
buffer
|
||||
.slice(PageKey::calculate_range(offset, size, self.page_size))
|
||||
.to_vec(),
|
||||
metrics,
|
||||
))
|
||||
Ok(buffer
|
||||
.slice(PageKey::calculate_range(offset, size, self.page_size))
|
||||
.to_vec())
|
||||
}
|
||||
|
||||
fn get_page(&self, key: K, page_key: PageKey) -> Option<Bytes> {
|
||||
|
||||
65
src/mito2/src/cache/index/bloom_filter_index.rs
vendored
65
src/mito2/src/cache/index/bloom_filter_index.rs
vendored
@@ -14,13 +14,12 @@
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use api::v1::index::{BloomFilterLoc, BloomFilterMeta};
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use index::bloom_filter::error::Result;
|
||||
use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader};
|
||||
use index::bloom_filter::reader::BloomFilterReader;
|
||||
use store_api::storage::{ColumnId, FileId};
|
||||
|
||||
use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey};
|
||||
@@ -115,93 +114,51 @@ impl<R> CachedBloomFilterIndexBlobReader<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: BloomFilterReader + Send> BloomFilterReader for CachedBloomFilterIndexBlobReader<R> {
|
||||
async fn range_read(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Bytes> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Bytes> {
|
||||
let inner = &self.inner;
|
||||
let (result, cache_metrics) = self
|
||||
.cache
|
||||
self.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.column_id, self.tag),
|
||||
self.blob_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_ranges += cache_metrics.num_pages;
|
||||
m.total_bytes += cache_metrics.page_bytes;
|
||||
m.cache_hit += cache_metrics.cache_hit;
|
||||
m.cache_miss += cache_metrics.cache_miss;
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result.into())
|
||||
.await
|
||||
.map(|b| b.into())
|
||||
}
|
||||
|
||||
async fn read_vec(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
let mut pages = Vec::with_capacity(ranges.len());
|
||||
let mut total_cache_metrics = crate::cache::index::IndexCacheMetrics::default();
|
||||
for range in ranges {
|
||||
let inner = &self.inner;
|
||||
let (page, cache_metrics) = self
|
||||
let page = self
|
||||
.cache
|
||||
.get_or_load(
|
||||
(self.file_id, self.column_id, self.tag),
|
||||
self.blob_size,
|
||||
range.start,
|
||||
(range.end - range.start) as u32,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
total_cache_metrics.merge(&cache_metrics);
|
||||
pages.push(Bytes::from(page));
|
||||
}
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_ranges += total_cache_metrics.num_pages;
|
||||
m.total_bytes += total_cache_metrics.page_bytes;
|
||||
m.cache_hit += total_cache_metrics.cache_hit;
|
||||
m.cache_miss += total_cache_metrics.cache_miss;
|
||||
if let Some(start) = start {
|
||||
m.fetch_elapsed += start.elapsed();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
/// Reads the meta information of the bloom filter.
|
||||
async fn metadata(
|
||||
&self,
|
||||
metrics: Option<&mut BloomFilterReadMetrics>,
|
||||
) -> Result<BloomFilterMeta> {
|
||||
async fn metadata(&self) -> Result<BloomFilterMeta> {
|
||||
if let Some(cached) = self
|
||||
.cache
|
||||
.get_metadata((self.file_id, self.column_id, self.tag))
|
||||
{
|
||||
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
if let Some(m) = metrics {
|
||||
m.cache_hit += 1;
|
||||
}
|
||||
Ok((*cached).clone())
|
||||
} else {
|
||||
let meta = self.inner.metadata(metrics).await?;
|
||||
let meta = self.inner.metadata().await?;
|
||||
self.cache.put_metadata(
|
||||
(self.file_id, self.column_id, self.tag),
|
||||
Arc::new(meta.clone()),
|
||||
|
||||
113
src/mito2/src/cache/index/inverted_index.rs
vendored
113
src/mito2/src/cache/index/inverted_index.rs
vendored
@@ -14,13 +14,12 @@
|
||||
|
||||
use core::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use api::v1::index::InvertedIndexMetas;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use index::inverted_index::error::Result;
|
||||
use index::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader};
|
||||
use index::inverted_index::format::reader::InvertedIndexReader;
|
||||
use prost::Message;
|
||||
use store_api::storage::FileId;
|
||||
|
||||
@@ -84,86 +83,46 @@ impl<R> CachedInvertedIndexBlobReader<R> {
|
||||
|
||||
#[async_trait]
|
||||
impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobReader<R> {
|
||||
async fn range_read<'a>(
|
||||
&self,
|
||||
offset: u64,
|
||||
size: u32,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<u8>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn range_read(&self, offset: u64, size: u32) -> Result<Vec<u8>> {
|
||||
let inner = &self.inner;
|
||||
let (result, cache_metrics) = self
|
||||
.cache
|
||||
self.cache
|
||||
.get_or_load(
|
||||
self.file_id,
|
||||
self.blob_size,
|
||||
offset,
|
||||
size,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += cache_metrics.page_bytes;
|
||||
m.total_ranges += cache_metrics.num_pages;
|
||||
m.cache_hit += cache_metrics.cache_hit;
|
||||
m.cache_miss += cache_metrics.cache_miss;
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn read_vec<'a>(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
|
||||
async fn read_vec(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
let mut pages = Vec::with_capacity(ranges.len());
|
||||
let mut total_cache_metrics = crate::cache::index::IndexCacheMetrics::default();
|
||||
for range in ranges {
|
||||
let inner = &self.inner;
|
||||
let (page, cache_metrics) = self
|
||||
let page = self
|
||||
.cache
|
||||
.get_or_load(
|
||||
self.file_id,
|
||||
self.blob_size,
|
||||
range.start,
|
||||
(range.end - range.start) as u32,
|
||||
move |ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
move |ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await?;
|
||||
|
||||
total_cache_metrics.merge(&cache_metrics);
|
||||
pages.push(Bytes::from(page));
|
||||
}
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.total_bytes += total_cache_metrics.page_bytes;
|
||||
m.total_ranges += total_cache_metrics.num_pages;
|
||||
m.cache_hit += total_cache_metrics.cache_hit;
|
||||
m.cache_miss += total_cache_metrics.cache_miss;
|
||||
m.fetch_elapsed += start.unwrap().elapsed();
|
||||
}
|
||||
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
async fn metadata<'a>(
|
||||
&self,
|
||||
metrics: Option<&'a mut InvertedIndexReadMetrics>,
|
||||
) -> Result<Arc<InvertedIndexMetas>> {
|
||||
async fn metadata(&self) -> Result<Arc<InvertedIndexMetas>> {
|
||||
if let Some(cached) = self.cache.get_metadata(self.file_id) {
|
||||
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
if let Some(m) = metrics {
|
||||
m.cache_hit += 1;
|
||||
}
|
||||
Ok(cached)
|
||||
} else {
|
||||
let meta = self.inner.metadata(metrics).await?;
|
||||
let meta = self.inner.metadata().await?;
|
||||
self.cache.put_metadata(self.file_id, meta.clone());
|
||||
CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc();
|
||||
Ok(meta)
|
||||
@@ -318,7 +277,7 @@ mod test {
|
||||
reader,
|
||||
Arc::new(InvertedIndexCache::new(8192, 8192, 50)),
|
||||
);
|
||||
let metadata = cached_reader.metadata(None).await.unwrap();
|
||||
let metadata = cached_reader.metadata().await.unwrap();
|
||||
assert_eq!(metadata.total_row_count, 8);
|
||||
assert_eq!(metadata.segment_row_count, 1);
|
||||
assert_eq!(metadata.metas.len(), 2);
|
||||
@@ -333,19 +292,13 @@ mod test {
|
||||
.fst(
|
||||
tag0.base_offset + tag0.relative_fst_offset as u64,
|
||||
tag0.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -354,12 +307,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -368,12 +316,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag0.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -392,19 +335,13 @@ mod test {
|
||||
.fst(
|
||||
tag1.base_offset + tag1.relative_fst_offset as u64,
|
||||
tag1.fst_size,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -413,12 +350,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -427,12 +359,7 @@ mod test {
|
||||
);
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(
|
||||
tag1.base_offset + offset as u64,
|
||||
size,
|
||||
BitmapType::Roaring,
|
||||
None,
|
||||
)
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
@@ -445,16 +372,16 @@ mod test {
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.random_range(0..file_size);
|
||||
let size = rng.random_range(0..file_size as u32 - offset as u32);
|
||||
let expected = cached_reader.range_read(offset, size, None).await.unwrap();
|
||||
let expected = cached_reader.range_read(offset, size).await.unwrap();
|
||||
let inner = &cached_reader.inner;
|
||||
let (read, _cache_metrics) = cached_reader
|
||||
let read = cached_reader
|
||||
.cache
|
||||
.get_or_load(
|
||||
cached_reader.file_id,
|
||||
file_size,
|
||||
offset,
|
||||
size,
|
||||
|ranges| async move { inner.read_vec(&ranges, None).await },
|
||||
|ranges| async move { inner.read_vec(&ranges).await },
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
574
src/mito2/src/cache/manifest_cache.rs
vendored
574
src/mito2/src/cache/manifest_cache.rs
vendored
@@ -1,574 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! A cache for manifest files.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_telemetry::{error, info, warn};
|
||||
use futures::{FutureExt, TryStreamExt};
|
||||
use moka::future::Cache;
|
||||
use moka::notification::RemovalCause;
|
||||
use moka::policy::EvictionPolicy;
|
||||
use object_store::ObjectStore;
|
||||
use object_store::util::join_path;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{OpenDalSnafu, Result};
|
||||
use crate::metrics::{CACHE_BYTES, CACHE_HIT, CACHE_MISS};
|
||||
|
||||
/// Subdirectory of cached manifest files.
|
||||
///
|
||||
/// This must contain three layers, corresponding to [`build_prometheus_metrics_layer`](object_store::layers::build_prometheus_metrics_layer).
|
||||
const MANIFEST_DIR: &str = "cache/object/manifest/";
|
||||
|
||||
/// Metric label for manifest files.
|
||||
const MANIFEST_TYPE: &str = "manifest";
|
||||
|
||||
/// A manifest cache manages manifest files on local store and evicts files based
|
||||
/// on size.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ManifestCache {
|
||||
/// Local store to cache files.
|
||||
local_store: ObjectStore,
|
||||
/// Index to track cached manifest files.
|
||||
index: Cache<String, IndexValue>,
|
||||
}
|
||||
|
||||
impl ManifestCache {
|
||||
/// Creates a new manifest cache and recovers the index from local store.
|
||||
pub async fn new(
|
||||
local_store: ObjectStore,
|
||||
capacity: ReadableSize,
|
||||
ttl: Option<Duration>,
|
||||
) -> ManifestCache {
|
||||
let total_capacity = capacity.as_bytes();
|
||||
|
||||
info!(
|
||||
"Initializing manifest cache with capacity: {}",
|
||||
ReadableSize(total_capacity)
|
||||
);
|
||||
|
||||
let index = Self::build_cache(local_store.clone(), total_capacity, ttl);
|
||||
|
||||
let cache = ManifestCache { local_store, index };
|
||||
|
||||
// Recovers the cache index from local store asynchronously
|
||||
cache.recover(false).await;
|
||||
|
||||
cache
|
||||
}
|
||||
|
||||
/// Builds the cache.
|
||||
fn build_cache(
|
||||
local_store: ObjectStore,
|
||||
capacity: u64,
|
||||
ttl: Option<Duration>,
|
||||
) -> Cache<String, IndexValue> {
|
||||
let cache_store = local_store;
|
||||
let mut builder = Cache::builder()
|
||||
.eviction_policy(EvictionPolicy::lru())
|
||||
.weigher(|key: &String, value: &IndexValue| -> u32 {
|
||||
key.len() as u32 + value.file_size
|
||||
})
|
||||
.max_capacity(capacity)
|
||||
.async_eviction_listener(move |key: Arc<String>, value: IndexValue, cause| {
|
||||
let store = cache_store.clone();
|
||||
// Stores files under MANIFEST_DIR.
|
||||
let file_path = join_path(MANIFEST_DIR, &key);
|
||||
async move {
|
||||
if let RemovalCause::Replaced = cause {
|
||||
// The cache is replaced by another file. We don't remove the same
|
||||
// file but updates the metrics as the file is already replaced by users.
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.sub(value.file_size.into());
|
||||
return;
|
||||
}
|
||||
|
||||
match store.delete(&file_path).await {
|
||||
Ok(()) => {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.sub(value.file_size.into());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to delete cached manifest file {}", file_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
});
|
||||
if let Some(ttl) = ttl {
|
||||
builder = builder.time_to_idle(ttl);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
/// Puts a file into the cache index.
|
||||
///
|
||||
/// The caller should ensure the file is in the correct path.
|
||||
pub(crate) async fn put(&self, key: String, value: IndexValue) {
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.add(value.file_size.into());
|
||||
self.index.insert(key, value).await;
|
||||
|
||||
// Since files can be large items, we run the pending tasks immediately.
|
||||
self.index.run_pending_tasks().await;
|
||||
}
|
||||
|
||||
/// Gets the index value for the key.
|
||||
pub(crate) async fn get(&self, key: &str) -> Option<IndexValue> {
|
||||
self.index.get(key).await
|
||||
}
|
||||
|
||||
/// Removes a file from the cache explicitly.
|
||||
pub(crate) async fn remove(&self, key: &str) {
|
||||
let file_path = self.cache_file_path(key);
|
||||
self.index.remove(key).await;
|
||||
// Always deletes the file from the local store.
|
||||
if let Err(e) = self.local_store.delete(&file_path).await {
|
||||
warn!(e; "Failed to delete a cached manifest file {}", file_path);
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes multiple files from the cache in batch.
|
||||
pub(crate) async fn remove_batch(&self, keys: &[String]) {
|
||||
if keys.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for key in keys {
|
||||
self.index.remove(key).await;
|
||||
}
|
||||
|
||||
let file_paths: Vec<String> = keys.iter().map(|key| self.cache_file_path(key)).collect();
|
||||
|
||||
if let Err(e) = self.local_store.delete_iter(file_paths).await {
|
||||
warn!(e; "Failed to delete cached manifest files in batch");
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover_inner(&self) -> Result<()> {
|
||||
let now = Instant::now();
|
||||
let mut lister = self
|
||||
.local_store
|
||||
.lister_with(MANIFEST_DIR)
|
||||
.recursive(true)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
let (mut total_size, mut total_keys) = (0i64, 0);
|
||||
while let Some(entry) = lister.try_next().await.context(OpenDalSnafu)? {
|
||||
let meta = entry.metadata();
|
||||
if !meta.is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let meta = self
|
||||
.local_store
|
||||
.stat(entry.path())
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
let file_size = meta.content_length() as u32;
|
||||
let key = entry.path().trim_start_matches(MANIFEST_DIR).to_string();
|
||||
common_telemetry::info!("Manifest cache recover {}, size: {}", key, file_size);
|
||||
self.index.insert(key, IndexValue { file_size }).await;
|
||||
let size = i64::from(file_size);
|
||||
total_size += size;
|
||||
total_keys += 1;
|
||||
}
|
||||
CACHE_BYTES
|
||||
.with_label_values(&[MANIFEST_TYPE])
|
||||
.add(total_size);
|
||||
|
||||
// Runs all pending tasks of the moka cache so that the cache size is updated
|
||||
// and the eviction policy is applied.
|
||||
self.index.run_pending_tasks().await;
|
||||
|
||||
let weight = self.index.weighted_size();
|
||||
let count = self.index.entry_count();
|
||||
info!(
|
||||
"Recovered manifest cache, num_keys: {}, num_bytes: {}, count: {}, weight: {}, cost: {:?}",
|
||||
total_keys,
|
||||
total_size,
|
||||
count,
|
||||
weight,
|
||||
now.elapsed()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Recovers the index from local store.
|
||||
pub(crate) async fn recover(&self, sync: bool) {
|
||||
let moved_self = self.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
if let Err(err) = moved_self.recover_inner().await {
|
||||
error!(err; "Failed to recover manifest cache.")
|
||||
}
|
||||
|
||||
moved_self.clean_empty_dirs(true).await;
|
||||
});
|
||||
|
||||
if sync {
|
||||
let _ = handle.await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the cache file path for the key.
|
||||
pub(crate) fn cache_file_path(&self, key: &str) -> String {
|
||||
join_path(MANIFEST_DIR, key)
|
||||
}
|
||||
|
||||
/// Gets a manifest file from cache.
|
||||
/// Returns the file data if found in cache, None otherwise.
|
||||
pub(crate) async fn get_file(&self, key: &str) -> Option<Vec<u8>> {
|
||||
if self.get(key).await.is_none() {
|
||||
CACHE_MISS.with_label_values(&[MANIFEST_TYPE]).inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let cache_file_path = self.cache_file_path(key);
|
||||
match self.local_store.read(&cache_file_path).await {
|
||||
Ok(data) => {
|
||||
CACHE_HIT.with_label_values(&[MANIFEST_TYPE]).inc();
|
||||
Some(data.to_vec())
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to read cached manifest file {}", cache_file_path);
|
||||
CACHE_MISS.with_label_values(&[MANIFEST_TYPE]).inc();
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Puts a manifest file into cache.
|
||||
pub(crate) async fn put_file(&self, key: String, data: Vec<u8>) {
|
||||
let cache_file_path = self.cache_file_path(&key);
|
||||
|
||||
if let Err(e) = self.local_store.write(&cache_file_path, data.clone()).await {
|
||||
warn!(e; "Failed to write manifest to cache {}", cache_file_path);
|
||||
return;
|
||||
}
|
||||
|
||||
let file_size = data.len() as u32;
|
||||
self.put(key, IndexValue { file_size }).await;
|
||||
}
|
||||
|
||||
/// Removes empty directories recursively under the manifest cache directory.
|
||||
///
|
||||
/// If `check_mtime` is true, only removes directories that have not been modified
|
||||
/// for at least 1 hour.
|
||||
pub(crate) async fn clean_empty_dirs(&self, check_mtime: bool) {
|
||||
info!("Clean empty dirs start");
|
||||
|
||||
let root = self.local_store.info().root();
|
||||
let manifest_dir = PathBuf::from(root).join(MANIFEST_DIR);
|
||||
let manifest_dir_clone = manifest_dir.clone();
|
||||
|
||||
let result = tokio::task::spawn_blocking(move || {
|
||||
Self::clean_empty_dirs_sync(&manifest_dir_clone, check_mtime)
|
||||
})
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
info!("Clean empty dirs end");
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!(e; "Failed to clean empty directories under {}", manifest_dir.display());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to spawn blocking task for cleaning empty directories");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all manifest files under the given directory from cache and cleans up empty directories.
|
||||
pub(crate) async fn clean_manifests(&self, dir: &str) {
|
||||
info!("Clean manifest cache for directory: {}", dir);
|
||||
|
||||
let cache_dir = join_path(MANIFEST_DIR, dir);
|
||||
let mut lister = match self
|
||||
.local_store
|
||||
.lister_with(&cache_dir)
|
||||
.recursive(true)
|
||||
.await
|
||||
{
|
||||
Ok(lister) => lister,
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to list manifest files under {}", cache_dir);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let mut keys_to_remove = Vec::new();
|
||||
loop {
|
||||
match lister.try_next().await {
|
||||
Ok(Some(entry)) => {
|
||||
let meta = entry.metadata();
|
||||
if meta.is_file() {
|
||||
keys_to_remove
|
||||
.push(entry.path().trim_start_matches(MANIFEST_DIR).to_string());
|
||||
}
|
||||
}
|
||||
Ok(None) => break,
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to read entry while listing {}", cache_dir);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Going to remove files from manifest cache, files: {:?}",
|
||||
keys_to_remove
|
||||
);
|
||||
|
||||
// Removes all files from cache in batch
|
||||
self.remove_batch(&keys_to_remove).await;
|
||||
|
||||
// Cleans up empty directories under the given dir
|
||||
let root = self.local_store.info().root();
|
||||
let dir_path = PathBuf::from(root).join(&cache_dir);
|
||||
let dir_path_clone = dir_path.clone();
|
||||
|
||||
let result = tokio::task::spawn_blocking(move || {
|
||||
Self::clean_empty_dirs_sync(&dir_path_clone, false)
|
||||
})
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
info!("Cleaned manifest cache for directory: {}", dir);
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!(e; "Failed to clean empty directories under {}", dir_path.display());
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(e; "Failed to spawn blocking task for cleaning empty directories");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Synchronously removes empty directories recursively.
|
||||
///
|
||||
/// If `check_mtime` is true, only removes directories that have not been modified
|
||||
/// for at least 1 hour.
|
||||
fn clean_empty_dirs_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result<()> {
|
||||
Self::remove_empty_dirs_recursive_sync(dir, check_mtime)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_empty_dirs_recursive_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result<bool> {
|
||||
common_telemetry::debug!(
|
||||
"Maybe remove empty dir: {:?}, check_mtime: {}",
|
||||
dir,
|
||||
check_mtime
|
||||
);
|
||||
let entries = match std::fs::read_dir(dir) {
|
||||
Ok(entries) => entries,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// Directory doesn't exist, treat as already removed (empty)
|
||||
return Ok(true);
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
let mut is_empty = true;
|
||||
// Iterates all entries under the directory.
|
||||
// We have to check all entries to clean up all empty subdirectories.
|
||||
for entry in entries {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
let metadata = std::fs::metadata(&path)?;
|
||||
|
||||
if metadata.is_dir() {
|
||||
// Checks if we should skip this directory based on modification time
|
||||
if check_mtime
|
||||
&& let Ok(modified) = metadata.modified()
|
||||
&& let Ok(elapsed) = modified.elapsed()
|
||||
&& elapsed < Duration::from_secs(3600)
|
||||
{
|
||||
common_telemetry::debug!("Skip directory by mtime, elapsed: {:?}", elapsed);
|
||||
// Only removes if not modified for at least 1 hour.
|
||||
is_empty = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
let subdir_empty = Self::remove_empty_dirs_recursive_sync(&path, check_mtime)?;
|
||||
if subdir_empty {
|
||||
if let Err(e) = std::fs::remove_dir(&path)
|
||||
&& e.kind() != std::io::ErrorKind::NotFound
|
||||
{
|
||||
warn!(e; "Failed to remove empty directory {}", path.display());
|
||||
is_empty = false;
|
||||
} else {
|
||||
info!(
|
||||
"Removed empty directory {} from manifest cache",
|
||||
path.display()
|
||||
);
|
||||
}
|
||||
} else {
|
||||
is_empty = false;
|
||||
}
|
||||
} else {
|
||||
is_empty = false;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(is_empty)
|
||||
}
|
||||
}
|
||||
|
||||
/// An entity that describes the file in the manifest cache.
|
||||
///
|
||||
/// It should only keep minimal information needed by the cache.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct IndexValue {
|
||||
/// Size of the file in bytes.
|
||||
pub(crate) file_size: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use object_store::services::Fs;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn new_fs_store(path: &str) -> ObjectStore {
|
||||
let builder = Fs::default().root(path);
|
||||
ObjectStore::new(builder).unwrap().finish()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_manifest_cache_basic() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("");
|
||||
let local_store = new_fs_store(dir.path().to_str().unwrap());
|
||||
|
||||
let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await;
|
||||
let key = "region_1/manifest/00000000000000000007.json";
|
||||
let file_path = cache.cache_file_path(key);
|
||||
|
||||
// Get an empty file.
|
||||
assert!(cache.get(key).await.is_none());
|
||||
|
||||
// Write a file.
|
||||
local_store
|
||||
.write(&file_path, b"manifest content".as_slice())
|
||||
.await
|
||||
.unwrap();
|
||||
// Add to the cache.
|
||||
cache
|
||||
.put(key.to_string(), IndexValue { file_size: 16 })
|
||||
.await;
|
||||
|
||||
// Get the cached value.
|
||||
let value = cache.get(key).await.unwrap();
|
||||
assert_eq!(16, value.file_size);
|
||||
|
||||
// Get weighted size.
|
||||
cache.index.run_pending_tasks().await;
|
||||
assert_eq!(59, cache.index.weighted_size());
|
||||
|
||||
// Remove the file.
|
||||
cache.remove(key).await;
|
||||
cache.index.run_pending_tasks().await;
|
||||
assert!(cache.get(key).await.is_none());
|
||||
|
||||
// Ensure all pending tasks of the moka cache is done before assertion.
|
||||
cache.index.run_pending_tasks().await;
|
||||
|
||||
// The file also not exists.
|
||||
assert!(!local_store.exists(&file_path).await.unwrap());
|
||||
assert_eq!(0, cache.index.weighted_size());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_manifest_cache_recover() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let dir = create_temp_dir("");
|
||||
let local_store = new_fs_store(dir.path().to_str().unwrap());
|
||||
let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await;
|
||||
|
||||
// Write some manifest files with different paths
|
||||
let keys = [
|
||||
"region_1/manifest/00000000000000000001.json",
|
||||
"region_1/manifest/00000000000000000002.json",
|
||||
"region_1/manifest/00000000000000000001.checkpoint",
|
||||
"region_2/manifest/00000000000000000001.json",
|
||||
];
|
||||
|
||||
let mut total_size = 0;
|
||||
for (i, key) in keys.iter().enumerate() {
|
||||
let file_path = cache.cache_file_path(key);
|
||||
let content = format!("manifest-{}", i).into_bytes();
|
||||
local_store
|
||||
.write(&file_path, content.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Add to the cache.
|
||||
cache
|
||||
.put(
|
||||
key.to_string(),
|
||||
IndexValue {
|
||||
file_size: content.len() as u32,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
total_size += content.len() + key.len();
|
||||
}
|
||||
|
||||
// Create a new cache instance which will automatically recover from local store
|
||||
let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await;
|
||||
|
||||
// Wait for recovery to complete synchronously
|
||||
cache.recover(true).await;
|
||||
|
||||
// Check size.
|
||||
cache.index.run_pending_tasks().await;
|
||||
let total_cached = cache.index.weighted_size() as usize;
|
||||
assert_eq!(total_size, total_cached);
|
||||
|
||||
// Verify all files
|
||||
for (i, key) in keys.iter().enumerate() {
|
||||
let value = cache.get(key).await.unwrap();
|
||||
assert_eq!(format!("manifest-{}", i).len() as u32, value.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cache_file_path() {
|
||||
let dir = create_temp_dir("");
|
||||
let local_store = new_fs_store(dir.path().to_str().unwrap());
|
||||
let cache = ManifestCache::new(local_store, ReadableSize::mb(10), None).await;
|
||||
|
||||
assert_eq!(
|
||||
"cache/object/manifest/region_1/manifest/00000000000000000007.json",
|
||||
cache.cache_file_path("region_1/manifest/00000000000000000007.json")
|
||||
);
|
||||
assert_eq!(
|
||||
"cache/object/manifest/region_1/manifest/00000000000000000007.checkpoint",
|
||||
cache.cache_file_path("region_1/manifest/00000000000000000007.checkpoint")
|
||||
);
|
||||
}
|
||||
}
|
||||
20
src/mito2/src/cache/write_cache.rs
vendored
20
src/mito2/src/cache/write_cache.rs
vendored
@@ -30,7 +30,6 @@ use crate::access_layer::{
|
||||
TempFileCleaner, WriteCachePathProvider, WriteType, new_fs_cache_store,
|
||||
};
|
||||
use crate::cache::file_cache::{FileCache, FileCacheRef, FileType, IndexKey, IndexValue};
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
use crate::error::{self, Result};
|
||||
use crate::metrics::UPLOAD_BYTES_TOTAL;
|
||||
use crate::region::opener::RegionLoadCacheTask;
|
||||
@@ -54,8 +53,6 @@ pub struct WriteCache {
|
||||
intermediate_manager: IntermediateManager,
|
||||
/// Sender for region load cache tasks.
|
||||
task_sender: UnboundedSender<RegionLoadCacheTask>,
|
||||
/// Optional cache for manifest files.
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
}
|
||||
|
||||
pub type WriteCacheRef = Arc<WriteCache>;
|
||||
@@ -70,7 +67,6 @@ impl WriteCache {
|
||||
index_cache_percent: Option<u8>,
|
||||
puffin_manager_factory: PuffinManagerFactory,
|
||||
intermediate_manager: IntermediateManager,
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
) -> Result<Self> {
|
||||
let (task_sender, task_receiver) = unbounded_channel();
|
||||
|
||||
@@ -87,7 +83,6 @@ impl WriteCache {
|
||||
puffin_manager_factory,
|
||||
intermediate_manager,
|
||||
task_sender,
|
||||
manifest_cache,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -99,19 +94,10 @@ impl WriteCache {
|
||||
index_cache_percent: Option<u8>,
|
||||
puffin_manager_factory: PuffinManagerFactory,
|
||||
intermediate_manager: IntermediateManager,
|
||||
manifest_cache_capacity: ReadableSize,
|
||||
) -> Result<Self> {
|
||||
info!("Init write cache on {cache_dir}, capacity: {cache_capacity}");
|
||||
|
||||
let local_store = new_fs_cache_store(cache_dir).await?;
|
||||
|
||||
// Create manifest cache if capacity is non-zero
|
||||
let manifest_cache = if manifest_cache_capacity.as_bytes() > 0 {
|
||||
Some(ManifestCache::new(local_store.clone(), manifest_cache_capacity, ttl).await)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self::new(
|
||||
local_store,
|
||||
cache_capacity,
|
||||
@@ -119,7 +105,6 @@ impl WriteCache {
|
||||
index_cache_percent,
|
||||
puffin_manager_factory,
|
||||
intermediate_manager,
|
||||
manifest_cache,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -129,11 +114,6 @@ impl WriteCache {
|
||||
self.file_cache.clone()
|
||||
}
|
||||
|
||||
/// Returns the manifest cache if available.
|
||||
pub(crate) fn manifest_cache(&self) -> Option<ManifestCache> {
|
||||
self.manifest_cache.clone()
|
||||
}
|
||||
|
||||
/// Build the puffin manager
|
||||
pub(crate) fn build_puffin_manager(&self) -> SstPuffinManager {
|
||||
let store = self.file_cache.local_store();
|
||||
|
||||
@@ -1110,7 +1110,6 @@ mod tests {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
|
||||
@@ -515,7 +515,6 @@ async fn test_flush_workers() {
|
||||
}
|
||||
|
||||
async fn test_flush_workers_with_format(flat_format: bool) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let mut env = TestEnv::new().await;
|
||||
let write_buffer_manager = Arc::new(MockWriteBufferManager::default());
|
||||
let listener = Arc::new(FlushListener::default());
|
||||
@@ -575,7 +574,7 @@ async fn test_flush_workers_with_format(flat_format: bool) {
|
||||
put_rows(&engine, region_id0, rows).await;
|
||||
|
||||
// Waits until flush is finished.
|
||||
while listener.success_count() < 3 {
|
||||
while listener.success_count() < 2 {
|
||||
listener.wait().await;
|
||||
}
|
||||
|
||||
|
||||
@@ -233,7 +233,7 @@ async fn collect_inverted_entries(
|
||||
InvertedIndexBlobReader::new(blob_reader),
|
||||
cache.clone(),
|
||||
);
|
||||
match reader.metadata(None).await {
|
||||
match reader.metadata().await {
|
||||
Ok(metas) => metas,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
@@ -247,7 +247,7 @@ async fn collect_inverted_entries(
|
||||
}
|
||||
} else {
|
||||
let reader = InvertedIndexBlobReader::new(blob_reader);
|
||||
match reader.metadata(None).await {
|
||||
match reader.metadata().await {
|
||||
Ok(metas) => metas,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
@@ -318,10 +318,10 @@ async fn try_read_bloom_meta(
|
||||
bloom_reader,
|
||||
cache.clone(),
|
||||
)
|
||||
.metadata(None)
|
||||
.metadata()
|
||||
.await
|
||||
}
|
||||
_ => bloom_reader.metadata(None).await,
|
||||
_ => bloom_reader.metadata().await,
|
||||
};
|
||||
|
||||
match result {
|
||||
|
||||
@@ -20,7 +20,7 @@ use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::time::Instant;
|
||||
|
||||
use common_telemetry::{debug, error, info};
|
||||
use common_telemetry::{debug, error, info, trace};
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use either::Either;
|
||||
use partition::expr::PartitionExpr;
|
||||
@@ -89,12 +89,6 @@ pub trait WriteBufferManager: Send + Sync + std::fmt::Debug {
|
||||
|
||||
/// Returns the total memory used by memtables.
|
||||
fn memory_usage(&self) -> usize;
|
||||
|
||||
/// Returns the mutable memtable memory limit.
|
||||
///
|
||||
/// The write buffer manager should flush memtables when the mutable memory usage
|
||||
/// exceeds this limit.
|
||||
fn flush_limit(&self) -> usize;
|
||||
}
|
||||
|
||||
pub type WriteBufferManagerRef = Arc<dyn WriteBufferManager>;
|
||||
@@ -151,7 +145,7 @@ impl WriteBufferManagerImpl {
|
||||
impl WriteBufferManager for WriteBufferManagerImpl {
|
||||
fn should_flush_engine(&self) -> bool {
|
||||
let mutable_memtable_memory_usage = self.memory_active.load(Ordering::Relaxed);
|
||||
if mutable_memtable_memory_usage >= self.mutable_limit {
|
||||
if mutable_memtable_memory_usage > self.mutable_limit {
|
||||
debug!(
|
||||
"Engine should flush (over mutable limit), mutable_usage: {}, memory_usage: {}, mutable_limit: {}, global_limit: {}",
|
||||
mutable_memtable_memory_usage,
|
||||
@@ -163,8 +157,23 @@ impl WriteBufferManager for WriteBufferManagerImpl {
|
||||
}
|
||||
|
||||
let memory_usage = self.memory_used.load(Ordering::Relaxed);
|
||||
// If the memory exceeds the buffer size, we trigger more aggressive
|
||||
// flush. But if already more than half memory is being flushed,
|
||||
// triggering more flush may not help. We will hold it instead.
|
||||
if memory_usage >= self.global_write_buffer_size {
|
||||
return true;
|
||||
if mutable_memtable_memory_usage >= self.global_write_buffer_size / 2 {
|
||||
debug!(
|
||||
"Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \
|
||||
mutable_usage: {}.",
|
||||
memory_usage, self.global_write_buffer_size, mutable_memtable_memory_usage
|
||||
);
|
||||
return true;
|
||||
} else {
|
||||
trace!(
|
||||
"Engine won't flush, memory_usage: {}, global_write_buffer_size: {}, mutable_usage: {}.",
|
||||
memory_usage, self.global_write_buffer_size, mutable_memtable_memory_usage
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
@@ -196,10 +205,6 @@ impl WriteBufferManager for WriteBufferManagerImpl {
|
||||
fn memory_usage(&self) -> usize {
|
||||
self.memory_used.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn flush_limit(&self) -> usize {
|
||||
self.mutable_limit
|
||||
}
|
||||
}
|
||||
|
||||
/// Reason of a flush task.
|
||||
@@ -883,31 +888,6 @@ impl FlushScheduler {
|
||||
self.region_status.contains_key(®ion_id)
|
||||
}
|
||||
|
||||
fn schedule_flush_task(
|
||||
&mut self,
|
||||
version_control: &VersionControlRef,
|
||||
task: RegionFlushTask,
|
||||
) -> Result<()> {
|
||||
let region_id = task.region_id;
|
||||
|
||||
// If current region doesn't have flush status, we can flush the region directly.
|
||||
if let Err(e) = version_control.freeze_mutable() {
|
||||
error!(e; "Failed to freeze the mutable memtable for region {}", region_id);
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
// Submit a flush job.
|
||||
let job = task.into_flush_job(version_control);
|
||||
if let Err(e) = self.scheduler.schedule(job) {
|
||||
// If scheduler returns error, senders in the job will be dropped and waiters
|
||||
// can get recv errors.
|
||||
error!(e; "Failed to schedule flush job for region {}", region_id);
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Schedules a flush `task` for specific `region`.
|
||||
pub(crate) fn schedule_flush(
|
||||
&mut self,
|
||||
@@ -930,21 +910,46 @@ impl FlushScheduler {
|
||||
.with_label_values(&[task.reason.as_str()])
|
||||
.inc();
|
||||
|
||||
// If current region has flush status, merge the task.
|
||||
if let Some(flush_status) = self.region_status.get_mut(®ion_id) {
|
||||
// Checks whether we can flush the region now.
|
||||
debug!("Merging flush task for region {}", region_id);
|
||||
// Add this region to status map.
|
||||
let flush_status = self
|
||||
.region_status
|
||||
.entry(region_id)
|
||||
.or_insert_with(|| FlushStatus::new(region_id, version_control.clone()));
|
||||
// Checks whether we can flush the region now.
|
||||
if flush_status.flushing {
|
||||
// There is already a flush job running.
|
||||
flush_status.merge_task(task);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.schedule_flush_task(version_control, task)?;
|
||||
// TODO(yingwen): We can merge with pending and execute directly.
|
||||
// If there are pending tasks, then we should push it to pending list.
|
||||
if flush_status.pending_task.is_some() {
|
||||
flush_status.merge_task(task);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Add this region to status map.
|
||||
let _ = self.region_status.insert(
|
||||
region_id,
|
||||
FlushStatus::new(region_id, version_control.clone()),
|
||||
);
|
||||
// Now we can flush the region directly.
|
||||
if let Err(e) = version_control.freeze_mutable() {
|
||||
error!(e; "Failed to freeze the mutable memtable for region {}", region_id);
|
||||
|
||||
// Remove from region status if we can't freeze the mutable memtable.
|
||||
self.region_status.remove(®ion_id);
|
||||
return Err(e);
|
||||
}
|
||||
// Submit a flush job.
|
||||
let job = task.into_flush_job(version_control);
|
||||
if let Err(e) = self.scheduler.schedule(job) {
|
||||
// If scheduler returns error, senders in the job will be dropped and waiters
|
||||
// can get recv errors.
|
||||
error!(e; "Failed to schedule flush job for region {}", region_id);
|
||||
|
||||
// Remove from region status if we can't submit the task.
|
||||
self.region_status.remove(®ion_id);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
flush_status.flushing = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -961,56 +966,48 @@ impl FlushScheduler {
|
||||
Vec<SenderBulkRequest>,
|
||||
)> {
|
||||
let flush_status = self.region_status.get_mut(®ion_id)?;
|
||||
// If region doesn't have any pending flush task, we need to remove it from the status.
|
||||
if flush_status.pending_task.is_none() {
|
||||
|
||||
// This region doesn't have running flush job.
|
||||
flush_status.flushing = false;
|
||||
|
||||
let pending_requests = if flush_status.pending_task.is_none() {
|
||||
// The region doesn't have any pending flush task.
|
||||
// Safety: The flush status must exist.
|
||||
debug!(
|
||||
"Region {} doesn't have any pending flush task, removing it from the status",
|
||||
region_id
|
||||
);
|
||||
let flush_status = self.region_status.remove(®ion_id).unwrap();
|
||||
return Some((
|
||||
Some((
|
||||
flush_status.pending_ddls,
|
||||
flush_status.pending_writes,
|
||||
flush_status.pending_bulk_writes,
|
||||
));
|
||||
))
|
||||
} else {
|
||||
let version_data = flush_status.version_control.current();
|
||||
if version_data.version.memtables.is_empty() {
|
||||
// The region has nothing to flush, we also need to remove it from the status.
|
||||
// Safety: The pending task is not None.
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
// The region has nothing to flush. We can notify pending task.
|
||||
task.on_success();
|
||||
// `schedule_next_flush()` may pick up the same region to flush, so we must remove
|
||||
// it from the status to avoid leaking pending requests.
|
||||
// Safety: The flush status must exist.
|
||||
let flush_status = self.region_status.remove(®ion_id).unwrap();
|
||||
Some((
|
||||
flush_status.pending_ddls,
|
||||
flush_status.pending_writes,
|
||||
flush_status.pending_bulk_writes,
|
||||
))
|
||||
} else {
|
||||
// We can flush the region again, keep it in the region status.
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// Schedule next flush job.
|
||||
if let Err(e) = self.schedule_next_flush() {
|
||||
error!(e; "Flush of region {} is successful, but failed to schedule next flush", region_id);
|
||||
}
|
||||
|
||||
// If region has pending task, but has nothing to flush, we need to remove it from the status.
|
||||
let version_data = flush_status.version_control.current();
|
||||
if version_data.version.memtables.is_empty() {
|
||||
// The region has nothing to flush, we also need to remove it from the status.
|
||||
// Safety: The pending task is not None.
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
// The region has nothing to flush. We can notify pending task.
|
||||
task.on_success();
|
||||
debug!(
|
||||
"Region {} has nothing to flush, removing it from the status",
|
||||
region_id
|
||||
);
|
||||
// Safety: The flush status must exist.
|
||||
let flush_status = self.region_status.remove(®ion_id).unwrap();
|
||||
return Some((
|
||||
flush_status.pending_ddls,
|
||||
flush_status.pending_writes,
|
||||
flush_status.pending_bulk_writes,
|
||||
));
|
||||
}
|
||||
|
||||
// If region has pending task and has something to flush, we need to schedule it.
|
||||
debug!("Scheduling pending flush task for region {}", region_id);
|
||||
// Safety: The flush status must exist.
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
let version_control = flush_status.version_control.clone();
|
||||
if let Err(err) = self.schedule_flush_task(&version_control, task) {
|
||||
error!(
|
||||
err;
|
||||
"Flush succeeded for region {region_id}, but failed to schedule next flush for it."
|
||||
);
|
||||
}
|
||||
// We can flush the region again, keep it in the region status.
|
||||
None
|
||||
pending_requests
|
||||
}
|
||||
|
||||
/// Notifies the scheduler that the flush job is failed.
|
||||
@@ -1026,6 +1023,11 @@ impl FlushScheduler {
|
||||
|
||||
// Fast fail: cancels all pending tasks and sends error to their waiters.
|
||||
flush_status.on_failure(err);
|
||||
|
||||
// Still tries to schedule a new flush.
|
||||
if let Err(e) = self.schedule_next_flush() {
|
||||
error!(e; "Failed to schedule next flush after region {} flush is failed", region_id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Notifies the scheduler that the region is dropped.
|
||||
@@ -1096,6 +1098,30 @@ impl FlushScheduler {
|
||||
.map(|status| !status.pending_ddls.is_empty())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Schedules a new flush task when the scheduler can submit next task.
|
||||
pub(crate) fn schedule_next_flush(&mut self) -> Result<()> {
|
||||
debug_assert!(
|
||||
self.region_status
|
||||
.values()
|
||||
.all(|status| status.flushing || status.pending_task.is_some())
|
||||
);
|
||||
|
||||
// Get the first region from status map.
|
||||
let Some(flush_status) = self
|
||||
.region_status
|
||||
.values_mut()
|
||||
.find(|status| status.pending_task.is_some())
|
||||
else {
|
||||
return Ok(());
|
||||
};
|
||||
debug_assert!(!flush_status.flushing);
|
||||
let task = flush_status.pending_task.take().unwrap();
|
||||
let region_id = flush_status.region_id;
|
||||
let version_control = flush_status.version_control.clone();
|
||||
|
||||
self.schedule_flush(region_id, &version_control, task)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for FlushScheduler {
|
||||
@@ -1115,6 +1141,11 @@ struct FlushStatus {
|
||||
region_id: RegionId,
|
||||
/// Version control of the region.
|
||||
version_control: VersionControlRef,
|
||||
/// There is a flush task running.
|
||||
///
|
||||
/// It is possible that a region is not flushing but has pending task if the scheduler
|
||||
/// doesn't schedules this region.
|
||||
flushing: bool,
|
||||
/// Task waiting for next flush.
|
||||
pending_task: Option<RegionFlushTask>,
|
||||
/// Pending ddl requests.
|
||||
@@ -1130,6 +1161,7 @@ impl FlushStatus {
|
||||
FlushStatus {
|
||||
region_id,
|
||||
version_control,
|
||||
flushing: false,
|
||||
pending_task: None,
|
||||
pending_ddls: Vec::new(),
|
||||
pending_writes: Vec::new(),
|
||||
@@ -1221,12 +1253,10 @@ mod tests {
|
||||
// Global usage is still 1100.
|
||||
manager.schedule_free_mem(200);
|
||||
assert!(manager.should_flush_engine());
|
||||
assert!(manager.should_stall());
|
||||
|
||||
// More than global limit, mutable (1100-200-450=450) is less than mutable limit (< 500).
|
||||
// More than global limit, but mutable (1100-200-450=450) is not enough (< 500).
|
||||
manager.schedule_free_mem(450);
|
||||
assert!(manager.should_flush_engine());
|
||||
assert!(manager.should_stall());
|
||||
assert!(!manager.should_flush_engine());
|
||||
|
||||
// Now mutable is enough.
|
||||
manager.reserve_mem(50);
|
||||
@@ -1473,92 +1503,4 @@ mod tests {
|
||||
assert_eq!(2, total_rows, "append_mode should preserve duplicates");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_schedule_pending_request_on_flush_success() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let job_scheduler = Arc::new(VecScheduler::default());
|
||||
let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone());
|
||||
let (tx, _rx) = mpsc::channel(4);
|
||||
let mut scheduler = env.mock_flush_scheduler();
|
||||
let mut builder = VersionControlBuilder::new();
|
||||
// Overwrites the empty memtable builder.
|
||||
builder.set_memtable_builder(Arc::new(TimeSeriesMemtableBuilder::default()));
|
||||
let version_control = Arc::new(builder.build());
|
||||
// Writes data to the memtable so it is not empty.
|
||||
let version_data = version_control.current();
|
||||
write_rows_to_version(&version_data.version, "host0", 0, 10);
|
||||
let manifest_ctx = env
|
||||
.mock_manifest_context(version_data.version.metadata.clone())
|
||||
.await;
|
||||
// Creates 2 tasks.
|
||||
let mut tasks: Vec<_> = (0..2)
|
||||
.map(|_| RegionFlushTask {
|
||||
region_id: builder.region_id(),
|
||||
reason: FlushReason::Others,
|
||||
senders: Vec::new(),
|
||||
request_sender: tx.clone(),
|
||||
access_layer: env.access_layer.clone(),
|
||||
listener: WorkerListener::default(),
|
||||
engine_config: Arc::new(MitoConfig::default()),
|
||||
row_group_size: None,
|
||||
cache_manager: Arc::new(CacheManager::default()),
|
||||
manifest_ctx: manifest_ctx.clone(),
|
||||
index_options: IndexOptions::default(),
|
||||
flush_semaphore: Arc::new(Semaphore::new(2)),
|
||||
is_staging: false,
|
||||
})
|
||||
.collect();
|
||||
// Schedule first task.
|
||||
let task = tasks.pop().unwrap();
|
||||
scheduler
|
||||
.schedule_flush(builder.region_id(), &version_control, task)
|
||||
.unwrap();
|
||||
// Should schedule 1 flush.
|
||||
assert_eq!(1, scheduler.region_status.len());
|
||||
assert_eq!(1, job_scheduler.num_jobs());
|
||||
// Schedule second task.
|
||||
let task = tasks.pop().unwrap();
|
||||
scheduler
|
||||
.schedule_flush(builder.region_id(), &version_control, task)
|
||||
.unwrap();
|
||||
assert!(
|
||||
scheduler
|
||||
.region_status
|
||||
.get(&builder.region_id())
|
||||
.unwrap()
|
||||
.pending_task
|
||||
.is_some()
|
||||
);
|
||||
|
||||
// Check the new version.
|
||||
let version_data = version_control.current();
|
||||
assert_eq!(0, version_data.version.memtables.immutables()[0].id());
|
||||
// Assumes the flush job is finished.
|
||||
version_control.apply_edit(
|
||||
Some(RegionEdit {
|
||||
files_to_add: Vec::new(),
|
||||
files_to_remove: Vec::new(),
|
||||
timestamp_ms: None,
|
||||
compaction_time_window: None,
|
||||
flushed_entry_id: None,
|
||||
flushed_sequence: None,
|
||||
committed_sequence: None,
|
||||
}),
|
||||
&[0],
|
||||
builder.file_purger(),
|
||||
);
|
||||
write_rows_to_version(&version_data.version, "host1", 0, 10);
|
||||
scheduler.on_flush_success(builder.region_id());
|
||||
assert_eq!(2, job_scheduler.num_jobs());
|
||||
// The pending task is cleared.
|
||||
assert!(
|
||||
scheduler
|
||||
.region_status
|
||||
.get(&builder.region_id())
|
||||
.unwrap()
|
||||
.pending_task
|
||||
.is_none()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,6 @@ use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::FileId;
|
||||
use store_api::{MAX_VERSION, MIN_VERSION, ManifestVersion};
|
||||
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error::{
|
||||
self, InstallManifestToSnafu, NoCheckpointSnafu, NoManifestsSnafu, RegionStoppedSnafu, Result,
|
||||
@@ -53,8 +52,6 @@ pub struct RegionManifestOptions {
|
||||
/// Set to 0 to disable checkpoint.
|
||||
pub checkpoint_distance: u64,
|
||||
pub remove_file_options: RemoveFileOptions,
|
||||
/// Optional cache for manifest files.
|
||||
pub manifest_cache: Option<ManifestCache>,
|
||||
}
|
||||
|
||||
impl RegionManifestOptions {
|
||||
@@ -70,7 +67,6 @@ impl RegionManifestOptions {
|
||||
remove_file_options: RemoveFileOptions {
|
||||
enable_gc: config.gc.enable,
|
||||
},
|
||||
manifest_cache: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -178,7 +174,6 @@ impl RegionManifestManager {
|
||||
options.object_store.clone(),
|
||||
options.compress_type,
|
||||
stats.total_manifest_size.clone(),
|
||||
options.manifest_cache.clone(),
|
||||
);
|
||||
let manifest_version = stats.manifest_version.clone();
|
||||
|
||||
@@ -261,7 +256,6 @@ impl RegionManifestManager {
|
||||
options.object_store.clone(),
|
||||
options.compress_type,
|
||||
stats.total_manifest_size.clone(),
|
||||
options.manifest_cache.clone(),
|
||||
);
|
||||
let manifest_version = stats.manifest_version.clone();
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ use store_api::ManifestVersion;
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
use crate::error::{
|
||||
ChecksumMismatchSnafu, CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu,
|
||||
OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu,
|
||||
@@ -145,8 +144,6 @@ pub struct ManifestObjectStore {
|
||||
/// Stores the size of each manifest file.
|
||||
manifest_size_map: Arc<RwLock<HashMap<FileKey, u64>>>,
|
||||
total_manifest_size: Arc<AtomicU64>,
|
||||
/// Optional manifest cache for local caching.
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
}
|
||||
|
||||
impl ManifestObjectStore {
|
||||
@@ -155,7 +152,6 @@ impl ManifestObjectStore {
|
||||
object_store: ObjectStore,
|
||||
compress_type: CompressionType,
|
||||
total_manifest_size: Arc<AtomicU64>,
|
||||
manifest_cache: Option<ManifestCache>,
|
||||
) -> Self {
|
||||
let path = util::normalize_dir(path);
|
||||
let staging_path = {
|
||||
@@ -170,7 +166,6 @@ impl ManifestObjectStore {
|
||||
staging_path,
|
||||
manifest_size_map: Arc::new(RwLock::new(HashMap::new())),
|
||||
total_manifest_size,
|
||||
manifest_cache,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -296,11 +291,9 @@ impl ManifestObjectStore {
|
||||
}
|
||||
|
||||
/// Common implementation for fetching manifests from entries in parallel.
|
||||
/// If `is_staging` is true, cache is skipped.
|
||||
async fn fetch_manifests_from_entries(
|
||||
&self,
|
||||
entries: Vec<(ManifestVersion, Entry)>,
|
||||
is_staging: bool,
|
||||
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
|
||||
if entries.is_empty() {
|
||||
return Ok(vec![]);
|
||||
@@ -313,13 +306,6 @@ impl ManifestObjectStore {
|
||||
// Safety: semaphore must exist.
|
||||
let _permit = semaphore.acquire().await.unwrap();
|
||||
|
||||
let cache_key = entry.path();
|
||||
// Try to get from cache first
|
||||
if let Some(data) = self.get_from_cache(cache_key, is_staging).await {
|
||||
return Ok((*v, data));
|
||||
}
|
||||
|
||||
// Fetch from remote object store
|
||||
let compress_type = file_compress_type(entry.name());
|
||||
let bytes = self
|
||||
.object_store
|
||||
@@ -333,11 +319,6 @@ impl ManifestObjectStore {
|
||||
compress_type,
|
||||
path: entry.path(),
|
||||
})?;
|
||||
|
||||
// Add to cache
|
||||
self.put_to_cache(cache_key.to_string(), &data, is_staging)
|
||||
.await;
|
||||
|
||||
Ok((*v, data))
|
||||
});
|
||||
|
||||
@@ -354,7 +335,7 @@ impl ManifestObjectStore {
|
||||
end_version: ManifestVersion,
|
||||
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
|
||||
let manifests = self.scan(start_version, end_version).await?;
|
||||
self.fetch_manifests_from_entries(manifests, false).await
|
||||
self.fetch_manifests_from_entries(manifests).await
|
||||
}
|
||||
|
||||
/// Delete manifest files that version < end.
|
||||
@@ -424,11 +405,6 @@ impl ManifestObjectStore {
|
||||
ret, self.path, end, checkpoint_version, paths,
|
||||
);
|
||||
|
||||
// Remove from cache first
|
||||
for (entry, _, _) in &del_entries {
|
||||
self.remove_from_cache(entry.path()).await;
|
||||
}
|
||||
|
||||
self.object_store
|
||||
.delete_iter(paths)
|
||||
.await
|
||||
@@ -464,10 +440,11 @@ impl ManifestObjectStore {
|
||||
path: &path,
|
||||
})?;
|
||||
let delta_size = data.len();
|
||||
|
||||
self.write_and_put_cache(&path, data, is_staging).await?;
|
||||
self.object_store
|
||||
.write(&path, data)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
self.set_delta_file_size(version, delta_size as u64);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -488,8 +465,10 @@ impl ManifestObjectStore {
|
||||
})?;
|
||||
let checkpoint_size = data.len();
|
||||
let checksum = checkpoint_checksum(bytes);
|
||||
|
||||
self.write_and_put_cache(&path, data, false).await?;
|
||||
self.object_store
|
||||
.write(&path, data)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
|
||||
// Because last checkpoint file only contain size and version, which is tiny, so we don't compress it.
|
||||
@@ -522,80 +501,60 @@ impl ManifestObjectStore {
|
||||
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
let version = metadata.version;
|
||||
let path = self.checkpoint_file_path(version);
|
||||
|
||||
// Try to get from cache first
|
||||
if let Some(data) = self.get_from_cache(&path, false).await {
|
||||
verify_checksum(&data, metadata.checksum)?;
|
||||
return Ok(Some((version, data)));
|
||||
}
|
||||
|
||||
// Due to backward compatibility, it is possible that the user's checkpoint not compressed,
|
||||
// so if we don't find file by compressed type. fall back to checkpoint not compressed find again.
|
||||
let checkpoint_data = match self.object_store.read(&path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data =
|
||||
self.compress_type
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.with_context(|_| DecompressObjectSnafu {
|
||||
let checkpoint_data =
|
||||
match self.object_store.read(&path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data = self.compress_type.decode(checkpoint).await.context(
|
||||
DecompressObjectSnafu {
|
||||
compress_type: self.compress_type,
|
||||
path: path.clone(),
|
||||
})?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
// set the checkpoint size
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
// Add to cache
|
||||
self.put_to_cache(path, &decompress_data, false).await;
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
let fall_back_path = gen_path(
|
||||
&self.path,
|
||||
&checkpoint_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
);
|
||||
debug!(
|
||||
"Failed to load checkpoint from path: {}, fall back to path: {}",
|
||||
path, fall_back_path
|
||||
);
|
||||
|
||||
// Try to get fallback from cache first
|
||||
if let Some(data) = self.get_from_cache(&fall_back_path, false).await {
|
||||
verify_checksum(&data, metadata.checksum)?;
|
||||
return Ok(Some((version, data)));
|
||||
}
|
||||
|
||||
match self.object_store.read(&fall_back_path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data = FALL_BACK_COMPRESS_TYPE
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.with_context(|_| DecompressObjectSnafu {
|
||||
compress_type: FALL_BACK_COMPRESS_TYPE,
|
||||
path: fall_back_path.clone(),
|
||||
})?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
// Add fallback to cache
|
||||
self.put_to_cache(fall_back_path, &decompress_data, false)
|
||||
.await;
|
||||
Ok(Some(decompress_data))
|
||||
path,
|
||||
},
|
||||
)?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
// set the checkpoint size
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
|
||||
let fall_back_path = gen_path(
|
||||
&self.path,
|
||||
&checkpoint_file(version),
|
||||
FALL_BACK_COMPRESS_TYPE,
|
||||
);
|
||||
debug!(
|
||||
"Failed to load checkpoint from path: {}, fall back to path: {}",
|
||||
path, fall_back_path
|
||||
);
|
||||
match self.object_store.read(&fall_back_path).await {
|
||||
Ok(checkpoint) => {
|
||||
let checkpoint_size = checkpoint.len();
|
||||
let decompress_data = FALL_BACK_COMPRESS_TYPE
|
||||
.decode(checkpoint)
|
||||
.await
|
||||
.context(DecompressObjectSnafu {
|
||||
compress_type: FALL_BACK_COMPRESS_TYPE,
|
||||
path,
|
||||
})?;
|
||||
verify_checksum(&decompress_data, metadata.checksum)?;
|
||||
self.set_checkpoint_file_size(version, checkpoint_size as u64);
|
||||
Ok(Some(decompress_data))
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
|
||||
Err(e) => Err(e).context(OpenDalSnafu),
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
|
||||
Err(e) => Err(e).context(OpenDalSnafu),
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
} else {
|
||||
Ok(None)
|
||||
Err(e).context(OpenDalSnafu)
|
||||
}
|
||||
} else {
|
||||
Err(e).context(OpenDalSnafu)
|
||||
}
|
||||
}
|
||||
}?;
|
||||
}?;
|
||||
Ok(checkpoint_data.map(|data| (version, data)))
|
||||
}
|
||||
|
||||
@@ -603,10 +562,8 @@ impl ManifestObjectStore {
|
||||
/// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any
|
||||
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
|
||||
let last_checkpoint_path = self.last_checkpoint_path();
|
||||
|
||||
// Fetch from remote object store without cache
|
||||
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
|
||||
Ok(data) => data.to_vec(),
|
||||
Ok(data) => data,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
return Ok(None);
|
||||
}
|
||||
@@ -615,7 +572,7 @@ impl ManifestObjectStore {
|
||||
}
|
||||
};
|
||||
|
||||
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
|
||||
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data.to_vec())?;
|
||||
|
||||
debug!(
|
||||
"Load checkpoint in path: {}, metadata: {:?}",
|
||||
@@ -745,8 +702,7 @@ impl ManifestObjectStore {
|
||||
let mut sorted_entries = manifest_entries;
|
||||
Self::sort_manifests(&mut sorted_entries);
|
||||
|
||||
self.fetch_manifests_from_entries(sorted_entries, true)
|
||||
.await
|
||||
self.fetch_manifests_from_entries(sorted_entries).await
|
||||
}
|
||||
|
||||
/// Clear all staging manifest files.
|
||||
@@ -763,63 +719,6 @@ impl ManifestObjectStore {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets a manifest file from cache.
|
||||
/// Returns the file data if found in cache, None otherwise.
|
||||
/// If `is_staging` is true, always returns None.
|
||||
async fn get_from_cache(&self, key: &str, is_staging: bool) -> Option<Vec<u8>> {
|
||||
if is_staging {
|
||||
return None;
|
||||
}
|
||||
let cache = self.manifest_cache.as_ref()?;
|
||||
cache.get_file(key).await
|
||||
}
|
||||
|
||||
/// Puts a manifest file into cache.
|
||||
/// If `is_staging` is true, does nothing.
|
||||
async fn put_to_cache(&self, key: String, data: &[u8], is_staging: bool) {
|
||||
if is_staging {
|
||||
return;
|
||||
}
|
||||
let Some(cache) = &self.manifest_cache else {
|
||||
return;
|
||||
};
|
||||
|
||||
cache.put_file(key, data.to_vec()).await;
|
||||
}
|
||||
|
||||
/// Writes data to object store and puts it into cache.
|
||||
/// If `is_staging` is true, cache is skipped.
|
||||
async fn write_and_put_cache(&self, path: &str, data: Vec<u8>, is_staging: bool) -> Result<()> {
|
||||
// Clone data for cache before writing, only if cache is enabled and not staging
|
||||
let cache_data = if !is_staging && self.manifest_cache.is_some() {
|
||||
Some(data.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Write to object store
|
||||
self.object_store
|
||||
.write(path, data)
|
||||
.await
|
||||
.context(OpenDalSnafu)?;
|
||||
|
||||
// Put to cache if we cloned the data
|
||||
if let Some(data) = cache_data {
|
||||
self.put_to_cache(path.to_string(), &data, is_staging).await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes a manifest file from cache.
|
||||
async fn remove_from_cache(&self, key: &str) {
|
||||
let Some(cache) = &self.manifest_cache else {
|
||||
return;
|
||||
};
|
||||
|
||||
cache.remove(key).await;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -863,7 +762,6 @@ mod tests {
|
||||
object_store,
|
||||
CompressionType::Uncompressed,
|
||||
Default::default(),
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -41,14 +41,10 @@ use crate::read::range::{RangeBuilderList, RangeMeta, RowGroupIndex};
|
||||
use crate::read::scan_region::StreamContext;
|
||||
use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source};
|
||||
use crate::sst::file::FileTimeRange;
|
||||
use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics;
|
||||
use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics;
|
||||
use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics;
|
||||
use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
|
||||
use crate::sst::parquet::file_range::FileRange;
|
||||
use crate::sst::parquet::flat_format::time_index_column_index;
|
||||
use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics};
|
||||
use crate::sst::parquet::row_group::ParquetFetchMetrics;
|
||||
use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics};
|
||||
|
||||
/// Verbose scan metrics for a partition.
|
||||
#[derive(Default)]
|
||||
@@ -85,8 +81,6 @@ pub(crate) struct ScanMetricsSet {
|
||||
// SST related metrics:
|
||||
/// Duration to build file ranges.
|
||||
build_parts_cost: Duration,
|
||||
/// Duration to scan SST files.
|
||||
sst_scan_cost: Duration,
|
||||
/// Number of row groups before filtering.
|
||||
rg_total: usize,
|
||||
/// Number of row groups filtered by fulltext index.
|
||||
@@ -132,18 +126,6 @@ pub(crate) struct ScanMetricsSet {
|
||||
|
||||
/// The stream reached EOF
|
||||
stream_eof: bool,
|
||||
|
||||
// Optional verbose metrics:
|
||||
/// Inverted index apply metrics.
|
||||
inverted_index_apply_metrics: Option<InvertedIndexApplyMetrics>,
|
||||
/// Bloom filter index apply metrics.
|
||||
bloom_filter_apply_metrics: Option<BloomFilterIndexApplyMetrics>,
|
||||
/// Fulltext index apply metrics.
|
||||
fulltext_index_apply_metrics: Option<FulltextIndexApplyMetrics>,
|
||||
/// Parquet fetch metrics.
|
||||
fetch_metrics: Option<ParquetFetchMetrics>,
|
||||
/// Metadata cache metrics.
|
||||
metadata_cache_metrics: Option<MetadataCacheMetrics>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for ScanMetricsSet {
|
||||
@@ -159,7 +141,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
num_mem_ranges,
|
||||
num_file_ranges,
|
||||
build_parts_cost,
|
||||
sst_scan_cost,
|
||||
rg_total,
|
||||
rg_fulltext_filtered,
|
||||
rg_inverted_filtered,
|
||||
@@ -185,11 +166,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
mem_rows,
|
||||
mem_batches,
|
||||
mem_series,
|
||||
inverted_index_apply_metrics,
|
||||
bloom_filter_apply_metrics,
|
||||
fulltext_index_apply_metrics,
|
||||
fetch_metrics,
|
||||
metadata_cache_metrics,
|
||||
} = self;
|
||||
|
||||
// Write core metrics
|
||||
@@ -205,7 +181,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
\"num_mem_ranges\":{num_mem_ranges}, \
|
||||
\"num_file_ranges\":{num_file_ranges}, \
|
||||
\"build_parts_cost\":\"{build_parts_cost:?}\", \
|
||||
\"sst_scan_cost\":\"{sst_scan_cost:?}\", \
|
||||
\"rg_total\":{rg_total}, \
|
||||
\"rows_before_filter\":{rows_before_filter}, \
|
||||
\"num_sst_record_batches\":{num_sst_record_batches}, \
|
||||
@@ -280,33 +255,6 @@ impl fmt::Debug for ScanMetricsSet {
|
||||
write!(f, ", \"mem_scan_cost\":\"{mem_scan_cost:?}\"")?;
|
||||
}
|
||||
|
||||
// Write optional verbose metrics if they are not empty
|
||||
if let Some(metrics) = inverted_index_apply_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"inverted_index_apply_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = bloom_filter_apply_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"bloom_filter_apply_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = fulltext_index_apply_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"fulltext_index_apply_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = fetch_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"fetch_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
if let Some(metrics) = metadata_cache_metrics
|
||||
&& !metrics.is_empty()
|
||||
{
|
||||
write!(f, ", \"metadata_cache_metrics\":{:?}", metrics)?;
|
||||
}
|
||||
|
||||
write!(f, ", \"stream_eof\":{stream_eof}}}")
|
||||
}
|
||||
}
|
||||
@@ -356,20 +304,14 @@ impl ScanMetricsSet {
|
||||
rows_inverted_filtered,
|
||||
rows_bloom_filtered,
|
||||
rows_precise_filtered,
|
||||
inverted_index_apply_metrics,
|
||||
bloom_filter_apply_metrics,
|
||||
fulltext_index_apply_metrics,
|
||||
},
|
||||
num_record_batches,
|
||||
num_batches,
|
||||
num_rows,
|
||||
scan_cost,
|
||||
metadata_cache_metrics,
|
||||
fetch_metrics,
|
||||
scan_cost: _,
|
||||
} = other;
|
||||
|
||||
self.build_parts_cost += *build_cost;
|
||||
self.sst_scan_cost += *scan_cost;
|
||||
|
||||
self.rg_total += *rg_total;
|
||||
self.rg_fulltext_filtered += *rg_fulltext_filtered;
|
||||
@@ -386,31 +328,6 @@ impl ScanMetricsSet {
|
||||
self.num_sst_record_batches += *num_record_batches;
|
||||
self.num_sst_batches += *num_batches;
|
||||
self.num_sst_rows += *num_rows;
|
||||
|
||||
// Merge optional verbose metrics
|
||||
if let Some(metrics) = inverted_index_apply_metrics {
|
||||
self.inverted_index_apply_metrics
|
||||
.get_or_insert_with(InvertedIndexApplyMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
if let Some(metrics) = bloom_filter_apply_metrics {
|
||||
self.bloom_filter_apply_metrics
|
||||
.get_or_insert_with(BloomFilterIndexApplyMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
if let Some(metrics) = fulltext_index_apply_metrics {
|
||||
self.fulltext_index_apply_metrics
|
||||
.get_or_insert_with(FulltextIndexApplyMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
if let Some(metrics) = fetch_metrics {
|
||||
self.fetch_metrics
|
||||
.get_or_insert_with(ParquetFetchMetrics::default)
|
||||
.merge_from(metrics);
|
||||
}
|
||||
self.metadata_cache_metrics
|
||||
.get_or_insert_with(MetadataCacheMetrics::default)
|
||||
.merge_from(metadata_cache_metrics);
|
||||
}
|
||||
|
||||
/// Sets distributor metrics.
|
||||
@@ -698,11 +615,6 @@ impl PartitionMetrics {
|
||||
let mut metrics_set = self.0.metrics.lock().unwrap();
|
||||
metrics_set.set_distributor_metrics(metrics);
|
||||
}
|
||||
|
||||
/// Returns whether verbose explain is enabled.
|
||||
pub(crate) fn explain_verbose(&self) -> bool {
|
||||
self.0.explain_verbose
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PartitionMetrics {
|
||||
@@ -856,21 +768,6 @@ fn can_split_series(num_rows: u64, num_series: u64) -> bool {
|
||||
num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD
|
||||
}
|
||||
|
||||
/// Creates a new [ReaderFilterMetrics] with optional apply metrics initialized
|
||||
/// based on the `explain_verbose` flag.
|
||||
fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics {
|
||||
if explain_verbose {
|
||||
ReaderFilterMetrics {
|
||||
inverted_index_apply_metrics: Some(InvertedIndexApplyMetrics::default()),
|
||||
bloom_filter_apply_metrics: Some(BloomFilterIndexApplyMetrics::default()),
|
||||
fulltext_index_apply_metrics: Some(FulltextIndexApplyMetrics::default()),
|
||||
..Default::default()
|
||||
}
|
||||
} else {
|
||||
ReaderFilterMetrics::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans file ranges at `index`.
|
||||
pub(crate) async fn scan_file_ranges(
|
||||
stream_ctx: Arc<StreamContext>,
|
||||
@@ -879,10 +776,7 @@ pub(crate) async fn scan_file_ranges(
|
||||
read_type: &'static str,
|
||||
range_builder: Arc<RangeBuilderList>,
|
||||
) -> Result<impl Stream<Item = Result<Batch>>> {
|
||||
let mut reader_metrics = ReaderMetrics {
|
||||
filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
|
||||
..Default::default()
|
||||
};
|
||||
let mut reader_metrics = ReaderMetrics::default();
|
||||
let ranges = range_builder
|
||||
.build_file_ranges(&stream_ctx.input, index, &mut reader_metrics)
|
||||
.await?;
|
||||
@@ -905,10 +799,7 @@ pub(crate) async fn scan_flat_file_ranges(
|
||||
read_type: &'static str,
|
||||
range_builder: Arc<RangeBuilderList>,
|
||||
) -> Result<impl Stream<Item = Result<RecordBatch>>> {
|
||||
let mut reader_metrics = ReaderMetrics {
|
||||
filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
|
||||
..Default::default()
|
||||
};
|
||||
let mut reader_metrics = ReaderMetrics::default();
|
||||
let ranges = range_builder
|
||||
.build_file_ranges(&stream_ctx.input, index, &mut reader_metrics)
|
||||
.await?;
|
||||
@@ -931,18 +822,10 @@ pub fn build_file_range_scan_stream(
|
||||
ranges: SmallVec<[FileRange; 2]>,
|
||||
) -> impl Stream<Item = Result<Batch>> {
|
||||
try_stream! {
|
||||
let fetch_metrics = if part_metrics.explain_verbose() {
|
||||
Some(Arc::new(ParquetFetchMetrics::default()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics {
|
||||
fetch_metrics: fetch_metrics.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics::default();
|
||||
for range in ranges {
|
||||
let build_reader_start = Instant::now();
|
||||
let reader = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await?;
|
||||
let reader = range.reader(stream_ctx.input.series_row_selector).await?;
|
||||
let build_cost = build_reader_start.elapsed();
|
||||
part_metrics.inc_build_reader_cost(build_cost);
|
||||
let compat_batch = range.compat_batch();
|
||||
@@ -974,18 +857,10 @@ pub fn build_flat_file_range_scan_stream(
|
||||
ranges: SmallVec<[FileRange; 2]>,
|
||||
) -> impl Stream<Item = Result<RecordBatch>> {
|
||||
try_stream! {
|
||||
let fetch_metrics = if part_metrics.explain_verbose() {
|
||||
Some(Arc::new(ParquetFetchMetrics::default()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics {
|
||||
fetch_metrics: fetch_metrics.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
let reader_metrics = &mut ReaderMetrics::default();
|
||||
for range in ranges {
|
||||
let build_reader_start = Instant::now();
|
||||
let mut reader = range.flat_reader(fetch_metrics.as_deref()).await?;
|
||||
let mut reader = range.flat_reader().await?;
|
||||
let build_cost = build_reader_start.elapsed();
|
||||
part_metrics.inc_build_reader_cost(build_cost);
|
||||
|
||||
|
||||
@@ -1355,7 +1355,6 @@ mod tests {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
@@ -1422,7 +1421,6 @@ mod tests {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
|
||||
@@ -41,7 +41,7 @@ use store_api::storage::{ColumnId, RegionId};
|
||||
|
||||
use crate::access_layer::AccessLayer;
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::cache::file_cache::{FileCache, FileType, IndexKey};
|
||||
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error;
|
||||
use crate::error::{
|
||||
@@ -270,14 +270,8 @@ impl RegionOpener {
|
||||
FormatType::PrimaryKey
|
||||
};
|
||||
// Create a manifest manager for this region and writes regions to the manifest file.
|
||||
let mut region_manifest_options =
|
||||
let region_manifest_options =
|
||||
RegionManifestOptions::new(config, ®ion_dir, &object_store);
|
||||
// Set manifest cache if available
|
||||
region_manifest_options.manifest_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|cm| cm.write_cache())
|
||||
.and_then(|wc| wc.manifest_cache());
|
||||
// For remote WAL, we need to set flushed_entry_id to current topic's latest entry id.
|
||||
let flushed_entry_id = provider.initial_flushed_entry_id::<S>(wal.store());
|
||||
let manifest_manager = RegionManifestManager::new(
|
||||
@@ -413,14 +407,8 @@ impl RegionOpener {
|
||||
let now = Instant::now();
|
||||
let mut region_options = self.options.as_ref().unwrap().clone();
|
||||
let object_storage = get_object_store(®ion_options.storage, &self.object_store_manager)?;
|
||||
let mut region_manifest_options =
|
||||
let region_manifest_options =
|
||||
RegionManifestOptions::new(config, &self.region_dir(), &object_storage);
|
||||
// Set manifest cache if available
|
||||
region_manifest_options.manifest_cache = self
|
||||
.cache_manager
|
||||
.as_ref()
|
||||
.and_then(|cm| cm.write_cache())
|
||||
.and_then(|wc| wc.manifest_cache());
|
||||
let Some(manifest_manager) =
|
||||
RegionManifestManager::open(region_manifest_options, &self.stats).await?
|
||||
else {
|
||||
@@ -848,7 +836,7 @@ impl RegionLoadCacheTask {
|
||||
}
|
||||
|
||||
/// Fills the file cache with index files from the region.
|
||||
pub(crate) async fn fill_cache(&self, file_cache: &FileCache) {
|
||||
pub(crate) async fn fill_cache(&self, file_cache: FileCacheRef) {
|
||||
let region_id = self.region.region_id;
|
||||
let table_dir = self.region.access_layer.table_dir();
|
||||
let path_type = self.region.access_layer.path_type();
|
||||
|
||||
@@ -1750,7 +1750,6 @@ mod tests {
|
||||
None,
|
||||
factory,
|
||||
intm_manager,
|
||||
ReadableSize::mb(10),
|
||||
)
|
||||
.await
|
||||
.unwrap(),
|
||||
|
||||
@@ -17,14 +17,11 @@ mod builder;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use common_telemetry::warn;
|
||||
use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
|
||||
use index::bloom_filter::reader::{
|
||||
BloomFilterReadMetrics, BloomFilterReader, BloomFilterReaderImpl,
|
||||
};
|
||||
use index::bloom_filter::reader::{BloomFilterReader, BloomFilterReaderImpl};
|
||||
use index::target::IndexTarget;
|
||||
use object_store::ObjectStore;
|
||||
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
|
||||
@@ -50,62 +47,6 @@ use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
|
||||
pub use crate::sst::index::bloom_filter::applier::builder::BloomFilterIndexApplierBuilder;
|
||||
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
|
||||
|
||||
/// Metrics for tracking bloom filter index apply operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct BloomFilterIndexApplyMetrics {
|
||||
/// Total time spent applying the index.
|
||||
pub apply_elapsed: std::time::Duration,
|
||||
/// Number of blob cache misses.
|
||||
pub blob_cache_miss: usize,
|
||||
/// Total size of blobs read (in bytes).
|
||||
pub blob_read_bytes: u64,
|
||||
/// Metrics for bloom filter read operations.
|
||||
pub read_metrics: BloomFilterReadMetrics,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for BloomFilterIndexApplyMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
apply_elapsed,
|
||||
blob_cache_miss,
|
||||
blob_read_bytes,
|
||||
read_metrics,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?;
|
||||
|
||||
if *blob_cache_miss > 0 {
|
||||
write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?;
|
||||
}
|
||||
if *blob_read_bytes > 0 {
|
||||
write!(f, ", \"blob_read_bytes\":{}", blob_read_bytes)?;
|
||||
}
|
||||
write!(f, ", \"read_metrics\":{:?}", read_metrics)?;
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl BloomFilterIndexApplyMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.apply_elapsed.is_zero()
|
||||
}
|
||||
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.apply_elapsed += other.apply_elapsed;
|
||||
self.blob_cache_miss += other.blob_cache_miss;
|
||||
self.blob_read_bytes += other.blob_read_bytes;
|
||||
self.read_metrics.merge_from(&other.read_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) type BloomFilterIndexApplierRef = Arc<BloomFilterIndexApplier>;
|
||||
|
||||
/// `BloomFilterIndexApplier` applies bloom filter predicates to the SST file.
|
||||
@@ -192,20 +133,15 @@ impl BloomFilterIndexApplier {
|
||||
///
|
||||
/// Row group id existing in the returned result means that the row group is searched.
|
||||
/// Empty ranges means that the row group is searched but no rows are found.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `row_groups` - Iterator of row group lengths and whether to search in the row group
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
pub async fn apply(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
row_groups: impl Iterator<Item = (usize, bool)>,
|
||||
mut metrics: Option<&mut BloomFilterIndexApplyMetrics>,
|
||||
) -> Result<Vec<(usize, Vec<Range<usize>>)>> {
|
||||
let apply_start = Instant::now();
|
||||
let _timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_BLOOM_FILTER_INDEX])
|
||||
.start_timer();
|
||||
|
||||
// Calculates row groups' ranges based on start of the file.
|
||||
let mut input = Vec::with_capacity(row_groups.size_hint().0);
|
||||
@@ -227,7 +163,7 @@ impl BloomFilterIndexApplier {
|
||||
|
||||
for (column_id, predicates) in self.predicates.iter() {
|
||||
let blob = match self
|
||||
.blob_reader(file_id, *column_id, file_size_hint, metrics.as_deref_mut())
|
||||
.blob_reader(file_id, *column_id, file_size_hint)
|
||||
.await?
|
||||
{
|
||||
Some(blob) => blob,
|
||||
@@ -237,9 +173,6 @@ impl BloomFilterIndexApplier {
|
||||
// Create appropriate reader based on whether we have caching enabled
|
||||
if let Some(bloom_filter_cache) = &self.bloom_filter_index_cache {
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
if let Some(m) = &mut metrics {
|
||||
m.blob_read_bytes += blob_size;
|
||||
}
|
||||
let reader = CachedBloomFilterIndexBlobReader::new(
|
||||
file_id.file_id(),
|
||||
*column_id,
|
||||
@@ -248,12 +181,12 @@ impl BloomFilterIndexApplier {
|
||||
BloomFilterReaderImpl::new(blob),
|
||||
bloom_filter_cache.clone(),
|
||||
);
|
||||
self.apply_predicates(reader, predicates, &mut output, metrics.as_deref_mut())
|
||||
self.apply_predicates(reader, predicates, &mut output)
|
||||
.await
|
||||
.context(ApplyBloomFilterIndexSnafu)?;
|
||||
} else {
|
||||
let reader = BloomFilterReaderImpl::new(blob);
|
||||
self.apply_predicates(reader, predicates, &mut output, metrics.as_deref_mut())
|
||||
self.apply_predicates(reader, predicates, &mut output)
|
||||
.await
|
||||
.context(ApplyBloomFilterIndexSnafu)?;
|
||||
}
|
||||
@@ -268,16 +201,6 @@ impl BloomFilterIndexApplier {
|
||||
}
|
||||
}
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = apply_start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_BLOOM_FILTER_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.apply_elapsed += elapsed;
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
@@ -289,7 +212,6 @@ impl BloomFilterIndexApplier {
|
||||
file_id: RegionFileId,
|
||||
column_id: ColumnId,
|
||||
file_size_hint: Option<u64>,
|
||||
metrics: Option<&mut BloomFilterIndexApplyMetrics>,
|
||||
) -> Result<Option<BlobReader>> {
|
||||
let reader = match self
|
||||
.cached_blob_reader(file_id, column_id, file_size_hint)
|
||||
@@ -297,9 +219,6 @@ impl BloomFilterIndexApplier {
|
||||
{
|
||||
Ok(Some(puffin_reader)) => puffin_reader,
|
||||
other => {
|
||||
if let Some(m) = metrics {
|
||||
m.blob_cache_miss += 1;
|
||||
}
|
||||
if let Err(err) = other {
|
||||
// Blob not found means no index for this column
|
||||
if is_blob_not_found(&err) {
|
||||
@@ -401,7 +320,6 @@ impl BloomFilterIndexApplier {
|
||||
reader: R,
|
||||
predicates: &[InListPredicate],
|
||||
output: &mut [(usize, Vec<Range<usize>>)],
|
||||
mut metrics: Option<&mut BloomFilterIndexApplyMetrics>,
|
||||
) -> std::result::Result<(), index::bloom_filter::error::Error> {
|
||||
let mut applier = BloomFilterApplier::new(Box::new(reader)).await?;
|
||||
|
||||
@@ -411,10 +329,7 @@ impl BloomFilterIndexApplier {
|
||||
continue;
|
||||
}
|
||||
|
||||
let read_metrics = metrics.as_deref_mut().map(|m| &mut m.read_metrics);
|
||||
*row_group_output = applier
|
||||
.search(predicates, row_group_output, read_metrics)
|
||||
.await?;
|
||||
*row_group_output = applier.search(predicates, row_group_output).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -478,7 +393,7 @@ mod tests {
|
||||
|
||||
let applier = builder.build(&exprs).unwrap().unwrap();
|
||||
applier
|
||||
.apply(file_id, None, row_groups.into_iter(), None)
|
||||
.apply(file_id, None, row_groups.into_iter())
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
|
||||
@@ -637,17 +637,17 @@ pub(crate) mod tests {
|
||||
.unwrap();
|
||||
let reader = blob_guard.reader().await.unwrap();
|
||||
let bloom_filter = BloomFilterReaderImpl::new(reader);
|
||||
let metadata = bloom_filter.metadata(None).await.unwrap();
|
||||
let metadata = bloom_filter.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(metadata.segment_count, 10);
|
||||
for i in 0..5 {
|
||||
let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[i] as usize];
|
||||
let bf = bloom_filter.bloom_filter(loc, None).await.unwrap();
|
||||
let bf = bloom_filter.bloom_filter(loc).await.unwrap();
|
||||
assert!(bf.contains(b"tag1"));
|
||||
}
|
||||
for i in 5..10 {
|
||||
let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[i] as usize];
|
||||
let bf = bloom_filter.bloom_filter(loc, None).await.unwrap();
|
||||
let bf = bloom_filter.bloom_filter(loc).await.unwrap();
|
||||
assert!(bf.contains(b"tag2"));
|
||||
}
|
||||
}
|
||||
@@ -662,13 +662,13 @@ pub(crate) mod tests {
|
||||
.unwrap();
|
||||
let reader = blob_guard.reader().await.unwrap();
|
||||
let bloom_filter = BloomFilterReaderImpl::new(reader);
|
||||
let metadata = bloom_filter.metadata(None).await.unwrap();
|
||||
let metadata = bloom_filter.metadata().await.unwrap();
|
||||
|
||||
assert_eq!(metadata.segment_count, 5);
|
||||
for i in 0u64..20 {
|
||||
let idx = i as usize / 4;
|
||||
let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[idx] as usize];
|
||||
let bf = bloom_filter.bloom_filter(loc, None).await.unwrap();
|
||||
let bf = bloom_filter.bloom_filter(loc).await.unwrap();
|
||||
let mut buf = vec![];
|
||||
IndexValueCodec::encode_nonnull_value(ValueRef::UInt64(i), &sort_field, &mut buf)
|
||||
.unwrap();
|
||||
|
||||
@@ -16,12 +16,11 @@ use std::collections::{BTreeMap, BTreeSet, HashSet};
|
||||
use std::iter;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use common_telemetry::warn;
|
||||
use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
|
||||
use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReaderImpl};
|
||||
use index::bloom_filter::reader::BloomFilterReaderImpl;
|
||||
use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
|
||||
use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
|
||||
use index::fulltext_index::{Analyzer, Config};
|
||||
@@ -54,95 +53,6 @@ use crate::sst::index::puffin_manager::{
|
||||
|
||||
pub mod builder;
|
||||
|
||||
/// Metrics for tracking fulltext index apply operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct FulltextIndexApplyMetrics {
|
||||
/// Total time spent applying the index.
|
||||
pub apply_elapsed: std::time::Duration,
|
||||
/// Number of blob cache misses.
|
||||
pub blob_cache_miss: usize,
|
||||
/// Number of directory cache hits.
|
||||
pub dir_cache_hit: usize,
|
||||
/// Number of directory cache misses.
|
||||
pub dir_cache_miss: usize,
|
||||
/// Elapsed time to initialize directory data.
|
||||
pub dir_init_elapsed: std::time::Duration,
|
||||
/// Metrics for bloom filter reads.
|
||||
pub bloom_filter_read_metrics: BloomFilterReadMetrics,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for FulltextIndexApplyMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
apply_elapsed,
|
||||
blob_cache_miss,
|
||||
dir_cache_hit,
|
||||
dir_cache_miss,
|
||||
dir_init_elapsed,
|
||||
bloom_filter_read_metrics,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?;
|
||||
|
||||
if *blob_cache_miss > 0 {
|
||||
write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?;
|
||||
}
|
||||
if *dir_cache_hit > 0 {
|
||||
write!(f, ", \"dir_cache_hit\":{}", dir_cache_hit)?;
|
||||
}
|
||||
if *dir_cache_miss > 0 {
|
||||
write!(f, ", \"dir_cache_miss\":{}", dir_cache_miss)?;
|
||||
}
|
||||
if !dir_init_elapsed.is_zero() {
|
||||
write!(f, ", \"dir_init_elapsed\":\"{:?}\"", dir_init_elapsed)?;
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
", \"bloom_filter_read_metrics\":{:?}",
|
||||
bloom_filter_read_metrics
|
||||
)?;
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl FulltextIndexApplyMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.apply_elapsed.is_zero()
|
||||
}
|
||||
|
||||
/// Collects metrics from a directory read operation.
|
||||
pub fn collect_dir_metrics(
|
||||
&mut self,
|
||||
elapsed: std::time::Duration,
|
||||
dir_metrics: puffin::puffin_manager::DirMetrics,
|
||||
) {
|
||||
self.dir_init_elapsed += elapsed;
|
||||
if dir_metrics.cache_hit {
|
||||
self.dir_cache_hit += 1;
|
||||
} else {
|
||||
self.dir_cache_miss += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.apply_elapsed += other.apply_elapsed;
|
||||
self.blob_cache_miss += other.blob_cache_miss;
|
||||
self.dir_cache_hit += other.dir_cache_hit;
|
||||
self.dir_cache_miss += other.dir_cache_miss;
|
||||
self.dir_init_elapsed += other.dir_init_elapsed;
|
||||
self.bloom_filter_read_metrics
|
||||
.merge_from(&other.bloom_filter_read_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
/// `FulltextIndexApplier` is responsible for applying fulltext index to the provided SST files
|
||||
pub struct FulltextIndexApplier {
|
||||
/// Requests to be applied.
|
||||
@@ -214,18 +124,14 @@ impl FulltextIndexApplier {
|
||||
impl FulltextIndexApplier {
|
||||
/// Applies fine-grained fulltext index to the specified SST file.
|
||||
/// Returns the row ids that match the queries.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
pub async fn apply_fine(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<BTreeSet<RowId>>> {
|
||||
let apply_start = Instant::now();
|
||||
let timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.start_timer();
|
||||
|
||||
let mut row_ids: Option<BTreeSet<RowId>> = None;
|
||||
for (column_id, request) in self.requests.iter() {
|
||||
@@ -234,13 +140,7 @@ impl FulltextIndexApplier {
|
||||
}
|
||||
|
||||
let Some(result) = self
|
||||
.apply_fine_one_column(
|
||||
file_size_hint,
|
||||
file_id,
|
||||
*column_id,
|
||||
request,
|
||||
metrics.as_deref_mut(),
|
||||
)
|
||||
.apply_fine_one_column(file_size_hint, file_id, *column_id, request)
|
||||
.await?
|
||||
else {
|
||||
continue;
|
||||
@@ -259,16 +159,9 @@ impl FulltextIndexApplier {
|
||||
}
|
||||
}
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = apply_start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.apply_elapsed += elapsed;
|
||||
if row_ids.is_none() {
|
||||
timer.stop_and_discard();
|
||||
}
|
||||
|
||||
Ok(row_ids)
|
||||
}
|
||||
|
||||
@@ -278,7 +171,6 @@ impl FulltextIndexApplier {
|
||||
file_id: RegionFileId,
|
||||
column_id: ColumnId,
|
||||
request: &FulltextRequest,
|
||||
metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<BTreeSet<RowId>>> {
|
||||
let blob_key = format!(
|
||||
"{INDEX_BLOB_TYPE_TANTIVY}-{}",
|
||||
@@ -286,7 +178,7 @@ impl FulltextIndexApplier {
|
||||
);
|
||||
let dir = self
|
||||
.index_source
|
||||
.dir(file_id, &blob_key, file_size_hint, metrics)
|
||||
.dir(file_id, &blob_key, file_size_hint)
|
||||
.await?;
|
||||
|
||||
let dir = match &dir {
|
||||
@@ -348,20 +240,15 @@ impl FulltextIndexApplier {
|
||||
///
|
||||
/// Row group id existing in the returned result means that the row group is searched.
|
||||
/// Empty ranges means that the row group is searched but no rows are found.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `row_groups` - Iterator of row group lengths and whether to search in the row group
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
pub async fn apply_coarse(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
row_groups: impl Iterator<Item = (usize, bool)>,
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<Vec<(usize, Vec<Range<usize>>)>>> {
|
||||
let apply_start = Instant::now();
|
||||
let timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.start_timer();
|
||||
|
||||
let (input, mut output) = Self::init_coarse_output(row_groups);
|
||||
let mut applied = false;
|
||||
@@ -379,27 +266,16 @@ impl FulltextIndexApplier {
|
||||
*column_id,
|
||||
&request.terms,
|
||||
&mut output,
|
||||
metrics.as_deref_mut(),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if !applied {
|
||||
timer.stop_and_discard();
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Self::adjust_coarse_output(input, &mut output);
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = apply_start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_FULLTEXT_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(m) = metrics {
|
||||
m.apply_elapsed += elapsed;
|
||||
}
|
||||
|
||||
Ok(Some(output))
|
||||
}
|
||||
|
||||
@@ -410,7 +286,6 @@ impl FulltextIndexApplier {
|
||||
column_id: ColumnId,
|
||||
terms: &[FulltextTerm],
|
||||
output: &mut [(usize, Vec<Range<usize>>)],
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<bool> {
|
||||
let blob_key = format!(
|
||||
"{INDEX_BLOB_TYPE_BLOOM}-{}",
|
||||
@@ -418,7 +293,7 @@ impl FulltextIndexApplier {
|
||||
);
|
||||
let Some(reader) = self
|
||||
.index_source
|
||||
.blob(file_id, &blob_key, file_size_hint, metrics.as_deref_mut())
|
||||
.blob(file_id, &blob_key, file_size_hint)
|
||||
.await?
|
||||
else {
|
||||
return Ok(false);
|
||||
@@ -461,13 +336,7 @@ impl FulltextIndexApplier {
|
||||
}
|
||||
|
||||
*row_group_output = applier
|
||||
.search(
|
||||
&predicates,
|
||||
row_group_output,
|
||||
metrics
|
||||
.as_deref_mut()
|
||||
.map(|m| &mut m.bloom_filter_read_metrics),
|
||||
)
|
||||
.search(&predicates, row_group_output)
|
||||
.await
|
||||
.context(ApplyBloomFilterIndexSnafu)?;
|
||||
}
|
||||
@@ -614,15 +483,8 @@ impl IndexSource {
|
||||
file_id: RegionFileId,
|
||||
key: &str,
|
||||
file_size_hint: Option<u64>,
|
||||
metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<GuardWithMetadata<SstPuffinBlob>>> {
|
||||
let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?;
|
||||
|
||||
// Track cache miss if fallbacked to remote
|
||||
if fallbacked && let Some(m) = metrics {
|
||||
m.blob_cache_miss += 1;
|
||||
}
|
||||
|
||||
let res = reader.blob(key).await;
|
||||
match res {
|
||||
Ok(blob) => Ok(Some(blob)),
|
||||
@@ -652,25 +514,11 @@ impl IndexSource {
|
||||
file_id: RegionFileId,
|
||||
key: &str,
|
||||
file_size_hint: Option<u64>,
|
||||
mut metrics: Option<&mut FulltextIndexApplyMetrics>,
|
||||
) -> Result<Option<GuardWithMetadata<SstPuffinDir>>> {
|
||||
let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?;
|
||||
|
||||
// Track cache miss if fallbacked to remote
|
||||
if fallbacked && let Some(m) = &mut metrics {
|
||||
m.blob_cache_miss += 1;
|
||||
}
|
||||
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let res = reader.dir(key).await;
|
||||
match res {
|
||||
Ok((dir, dir_metrics)) => {
|
||||
if let Some(m) = metrics {
|
||||
// Safety: start is Some when metrics is Some
|
||||
m.collect_dir_metrics(start.unwrap().elapsed(), dir_metrics);
|
||||
}
|
||||
Ok(Some(dir))
|
||||
}
|
||||
Ok(dir) => Ok(Some(dir)),
|
||||
Err(err) if err.is_blob_not_found() => Ok(None),
|
||||
Err(err) => {
|
||||
if fallbacked {
|
||||
@@ -678,16 +526,9 @@ impl IndexSource {
|
||||
} else {
|
||||
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.");
|
||||
let reader = self.build_remote(file_id, file_size_hint).await?;
|
||||
let start = metrics.as_ref().map(|_| Instant::now());
|
||||
let res = reader.dir(key).await;
|
||||
match res {
|
||||
Ok((dir, dir_metrics)) => {
|
||||
if let Some(m) = metrics {
|
||||
// Safety: start is Some when metrics is Some
|
||||
m.collect_dir_metrics(start.unwrap().elapsed(), dir_metrics);
|
||||
}
|
||||
Ok(Some(dir))
|
||||
}
|
||||
Ok(dir) => Ok(Some(dir)),
|
||||
Err(err) if err.is_blob_not_found() => Ok(None),
|
||||
Err(err) => Err(err).context(PuffinReadBlobSnafu),
|
||||
}
|
||||
|
||||
@@ -723,16 +723,15 @@ mod tests {
|
||||
let backend = backend.clone();
|
||||
async move {
|
||||
match backend {
|
||||
FulltextBackend::Tantivy => applier
|
||||
.apply_fine(region_file_id, None, None)
|
||||
.await
|
||||
.unwrap(),
|
||||
FulltextBackend::Tantivy => {
|
||||
applier.apply_fine(region_file_id, None).await.unwrap()
|
||||
}
|
||||
FulltextBackend::Bloom => {
|
||||
let coarse_mask = coarse_mask.unwrap_or_default();
|
||||
let row_groups = (0..coarse_mask.len()).map(|i| (1, coarse_mask[i]));
|
||||
// row group id == row id
|
||||
let resp = applier
|
||||
.apply_coarse(region_file_id, None, row_groups, None)
|
||||
.apply_coarse(region_file_id, None, row_groups)
|
||||
.await
|
||||
.unwrap();
|
||||
resp.map(|r| {
|
||||
|
||||
@@ -16,11 +16,10 @@ pub mod builder;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use common_base::range_read::RangeReader;
|
||||
use common_telemetry::warn;
|
||||
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReadMetrics};
|
||||
use index::inverted_index::format::reader::InvertedIndexBlobReader;
|
||||
use index::inverted_index::search::index_apply::{
|
||||
ApplyOutput, IndexApplier, IndexNotFoundStrategy, SearchContext,
|
||||
};
|
||||
@@ -45,67 +44,6 @@ use crate::sst::index::TYPE_INVERTED_INDEX;
|
||||
use crate::sst::index::inverted_index::INDEX_BLOB_TYPE;
|
||||
use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory};
|
||||
|
||||
/// Metrics for tracking inverted index apply operations.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct InvertedIndexApplyMetrics {
|
||||
/// Total time spent applying the index.
|
||||
pub apply_elapsed: std::time::Duration,
|
||||
/// Number of blob cache misses (0 or 1).
|
||||
pub blob_cache_miss: usize,
|
||||
/// Total size of blobs read (in bytes).
|
||||
pub blob_read_bytes: u64,
|
||||
/// Metrics for inverted index reads.
|
||||
pub inverted_index_read_metrics: InvertedIndexReadMetrics,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InvertedIndexApplyMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
apply_elapsed,
|
||||
blob_cache_miss,
|
||||
blob_read_bytes,
|
||||
inverted_index_read_metrics,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?;
|
||||
|
||||
if *blob_cache_miss > 0 {
|
||||
write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?;
|
||||
}
|
||||
if *blob_read_bytes > 0 {
|
||||
write!(f, ", \"blob_read_bytes\":{}", blob_read_bytes)?;
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
", \"inverted_index_read_metrics\":{:?}",
|
||||
inverted_index_read_metrics
|
||||
)?;
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl InvertedIndexApplyMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.apply_elapsed.is_zero()
|
||||
}
|
||||
|
||||
/// Merges another metrics into this one.
|
||||
pub fn merge_from(&mut self, other: &Self) {
|
||||
self.apply_elapsed += other.apply_elapsed;
|
||||
self.blob_cache_miss += other.blob_cache_miss;
|
||||
self.blob_read_bytes += other.blob_read_bytes;
|
||||
self.inverted_index_read_metrics
|
||||
.merge_from(&other.inverted_index_read_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
/// `InvertedIndexApplier` is responsible for applying predicates to the provided SST files
|
||||
/// and returning the relevant row group ids for further scan.
|
||||
pub(crate) struct InvertedIndexApplier {
|
||||
@@ -186,30 +124,24 @@ impl InvertedIndexApplier {
|
||||
self
|
||||
}
|
||||
|
||||
/// Applies predicates to the provided SST file id and returns the relevant row group ids.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `file_id` - The region file ID to apply predicates to
|
||||
/// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads
|
||||
/// * `metrics` - Optional mutable reference to collect metrics on demand
|
||||
/// Applies predicates to the provided SST file id and returns the relevant row group ids
|
||||
pub async fn apply(
|
||||
&self,
|
||||
file_id: RegionFileId,
|
||||
file_size_hint: Option<u64>,
|
||||
mut metrics: Option<&mut InvertedIndexApplyMetrics>,
|
||||
) -> Result<ApplyOutput> {
|
||||
let start = Instant::now();
|
||||
let _timer = INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_INVERTED_INDEX])
|
||||
.start_timer();
|
||||
|
||||
let context = SearchContext {
|
||||
// Encountering a non-existing column indicates that it doesn't match predicates.
|
||||
index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
|
||||
};
|
||||
|
||||
let mut cache_miss = 0;
|
||||
let blob = match self.cached_blob_reader(file_id, file_size_hint).await {
|
||||
Ok(Some(puffin_reader)) => puffin_reader,
|
||||
other => {
|
||||
cache_miss += 1;
|
||||
if let Err(err) = other {
|
||||
warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.")
|
||||
}
|
||||
@@ -217,9 +149,8 @@ impl InvertedIndexApplier {
|
||||
}
|
||||
};
|
||||
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
|
||||
let result = if let Some(index_cache) = &self.inverted_index_cache {
|
||||
if let Some(index_cache) = &self.inverted_index_cache {
|
||||
let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length;
|
||||
let mut index_reader = CachedInvertedIndexBlobReader::new(
|
||||
file_id.file_id(),
|
||||
blob_size,
|
||||
@@ -227,42 +158,16 @@ impl InvertedIndexApplier {
|
||||
index_cache.clone(),
|
||||
);
|
||||
self.index_applier
|
||||
.apply(
|
||||
context,
|
||||
&mut index_reader,
|
||||
metrics
|
||||
.as_deref_mut()
|
||||
.map(|m| &mut m.inverted_index_read_metrics),
|
||||
)
|
||||
.apply(context, &mut index_reader)
|
||||
.await
|
||||
.context(ApplyInvertedIndexSnafu)
|
||||
} else {
|
||||
let mut index_reader = InvertedIndexBlobReader::new(blob);
|
||||
self.index_applier
|
||||
.apply(
|
||||
context,
|
||||
&mut index_reader,
|
||||
metrics
|
||||
.as_deref_mut()
|
||||
.map(|m| &mut m.inverted_index_read_metrics),
|
||||
)
|
||||
.apply(context, &mut index_reader)
|
||||
.await
|
||||
.context(ApplyInvertedIndexSnafu)
|
||||
};
|
||||
|
||||
// Record elapsed time to histogram and collect metrics if requested
|
||||
let elapsed = start.elapsed();
|
||||
INDEX_APPLY_ELAPSED
|
||||
.with_label_values(&[TYPE_INVERTED_INDEX])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if let Some(metrics) = metrics {
|
||||
metrics.apply_elapsed = elapsed;
|
||||
metrics.blob_cache_miss = cache_miss;
|
||||
metrics.blob_read_bytes = blob_size;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Creates a blob reader from the cached index file.
|
||||
@@ -376,7 +281,7 @@ mod tests {
|
||||
|
||||
let mut mock_index_applier = MockIndexApplier::new();
|
||||
mock_index_applier.expect_memory_usage().returning(|| 100);
|
||||
mock_index_applier.expect_apply().returning(|_, _, _| {
|
||||
mock_index_applier.expect_apply().returning(|_, _| {
|
||||
Ok(ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
total_row_count: 100,
|
||||
@@ -392,7 +297,7 @@ mod tests {
|
||||
puffin_manager_factory,
|
||||
Default::default(),
|
||||
);
|
||||
let output = sst_index_applier.apply(file_id, None, None).await.unwrap();
|
||||
let output = sst_index_applier.apply(file_id, None).await.unwrap();
|
||||
assert_eq!(
|
||||
output,
|
||||
ApplyOutput {
|
||||
@@ -440,7 +345,7 @@ mod tests {
|
||||
puffin_manager_factory,
|
||||
Default::default(),
|
||||
);
|
||||
let res = sst_index_applier.apply(file_id, None, None).await;
|
||||
let res = sst_index_applier.apply(file_id, None).await;
|
||||
assert!(format!("{:?}", res.unwrap_err()).contains("Blob not found"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -615,7 +615,7 @@ mod tests {
|
||||
.unwrap();
|
||||
Box::pin(async move {
|
||||
applier
|
||||
.apply(sst_file_id, None, None)
|
||||
.apply(sst_file_id, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.matched_segment_ids
|
||||
|
||||
@@ -245,7 +245,7 @@ mod tests {
|
||||
let bs = blob_reader.read(0..meta.content_length).await.unwrap();
|
||||
assert_eq!(&*bs, raw_data);
|
||||
|
||||
let (dir_guard, _metrics) = reader.dir(dir_key).await.unwrap();
|
||||
let dir_guard = reader.dir(dir_key).await.unwrap();
|
||||
let file = dir_guard.path().join("hello");
|
||||
let data = tokio::fs::read(file).await.unwrap();
|
||||
assert_eq!(data, raw_data);
|
||||
|
||||
@@ -45,7 +45,6 @@ use crate::sst::parquet::format::ReadFormat;
|
||||
use crate::sst::parquet::reader::{
|
||||
FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext,
|
||||
};
|
||||
use crate::sst::parquet::row_group::ParquetFetchMetrics;
|
||||
|
||||
/// Checks if a row group contains delete operations by examining the min value of op_type column.
|
||||
///
|
||||
@@ -118,16 +117,11 @@ impl FileRange {
|
||||
pub(crate) async fn reader(
|
||||
&self,
|
||||
selector: Option<TimeSeriesRowSelector>,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<PruneReader> {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder
|
||||
.build(
|
||||
self.row_group_idx,
|
||||
self.row_selection.clone(),
|
||||
fetch_metrics,
|
||||
)
|
||||
.build(self.row_group_idx, self.row_selection.clone())
|
||||
.await?;
|
||||
|
||||
let use_last_row_reader = if selector
|
||||
@@ -174,18 +168,11 @@ impl FileRange {
|
||||
}
|
||||
|
||||
/// Creates a flat reader that returns RecordBatch.
|
||||
pub(crate) async fn flat_reader(
|
||||
&self,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<FlatPruneReader> {
|
||||
pub(crate) async fn flat_reader(&self) -> Result<FlatPruneReader> {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder
|
||||
.build(
|
||||
self.row_group_idx,
|
||||
self.row_selection.clone(),
|
||||
fetch_metrics,
|
||||
)
|
||||
.build(self.row_group_idx, self.row_selection.clone())
|
||||
.await?;
|
||||
|
||||
// Compute skip_fields once for this row group
|
||||
|
||||
@@ -52,21 +52,15 @@ use crate::metrics::{
|
||||
use crate::read::prune::{PruneReader, Source};
|
||||
use crate::read::{Batch, BatchReader};
|
||||
use crate::sst::file::FileHandle;
|
||||
use crate::sst::index::bloom_filter::applier::{
|
||||
BloomFilterIndexApplierRef, BloomFilterIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::index::fulltext_index::applier::{
|
||||
FulltextIndexApplierRef, FulltextIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::index::inverted_index::applier::{
|
||||
InvertedIndexApplierRef, InvertedIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplierRef;
|
||||
use crate::sst::index::fulltext_index::applier::FulltextIndexApplierRef;
|
||||
use crate::sst::index::inverted_index::applier::InvertedIndexApplierRef;
|
||||
use crate::sst::parquet::file_range::{
|
||||
FileRangeContext, FileRangeContextRef, PreFilterMode, row_group_contains_delete,
|
||||
};
|
||||
use crate::sst::parquet::format::{ReadFormat, need_override_sequence};
|
||||
use crate::sst::parquet::metadata::MetadataLoader;
|
||||
use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics};
|
||||
use crate::sst::parquet::row_group::InMemoryRowGroup;
|
||||
use crate::sst::parquet::row_selection::RowGroupSelection;
|
||||
use crate::sst::parquet::stats::RowGroupPruningStats;
|
||||
use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY};
|
||||
@@ -259,9 +253,7 @@ impl ParquetReaderBuilder {
|
||||
let file_size = self.file_handle.meta_ref().file_size;
|
||||
|
||||
// Loads parquet metadata of the file.
|
||||
let parquet_meta = self
|
||||
.read_parquet_metadata(&file_path, file_size, &mut metrics.metadata_cache_metrics)
|
||||
.await?;
|
||||
let parquet_meta = self.read_parquet_metadata(&file_path, file_size).await?;
|
||||
// Decodes region metadata.
|
||||
let key_value_meta = parquet_meta.file_metadata().key_value_metadata();
|
||||
// Gets the metadata stored in the SST.
|
||||
@@ -386,34 +378,25 @@ impl ParquetReaderBuilder {
|
||||
&self,
|
||||
file_path: &str,
|
||||
file_size: u64,
|
||||
cache_metrics: &mut MetadataCacheMetrics,
|
||||
) -> Result<Arc<ParquetMetaData>> {
|
||||
let start = Instant::now();
|
||||
let _t = READ_STAGE_ELAPSED
|
||||
.with_label_values(&["read_parquet_metadata"])
|
||||
.start_timer();
|
||||
|
||||
let file_id = self.file_handle.file_id();
|
||||
// Tries to get from cache with metrics tracking.
|
||||
if let Some(metadata) = self
|
||||
.cache_strategy
|
||||
.get_parquet_meta_data(file_id, cache_metrics)
|
||||
.await
|
||||
{
|
||||
cache_metrics.metadata_load_cost += start.elapsed();
|
||||
// Tries to get from global cache.
|
||||
if let Some(metadata) = self.cache_strategy.get_parquet_meta_data(file_id).await {
|
||||
return Ok(metadata);
|
||||
}
|
||||
|
||||
// Cache miss, load metadata directly.
|
||||
let metadata_loader = MetadataLoader::new(self.object_store.clone(), file_path, file_size);
|
||||
let metadata = metadata_loader.load().await?;
|
||||
|
||||
let metadata = Arc::new(metadata);
|
||||
// Cache the metadata.
|
||||
self.cache_strategy
|
||||
.put_parquet_meta_data(file_id, metadata.clone());
|
||||
|
||||
cache_metrics.metadata_load_cost += start.elapsed();
|
||||
Ok(metadata)
|
||||
}
|
||||
|
||||
@@ -544,11 +527,7 @@ impl ParquetReaderBuilder {
|
||||
// Slow path: apply the index from the file.
|
||||
let file_size_hint = self.file_handle.meta_ref().index_file_size();
|
||||
let apply_res = index_applier
|
||||
.apply_fine(
|
||||
self.file_handle.file_id(),
|
||||
Some(file_size_hint),
|
||||
metrics.fulltext_index_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply_fine(self.file_handle.file_id(), Some(file_size_hint))
|
||||
.await;
|
||||
let selection = match apply_res {
|
||||
Ok(Some(res)) => {
|
||||
@@ -616,17 +595,13 @@ impl ParquetReaderBuilder {
|
||||
// Slow path: apply the index from the file.
|
||||
let file_size_hint = self.file_handle.meta_ref().index_file_size();
|
||||
let apply_res = index_applier
|
||||
.apply(
|
||||
self.file_handle.file_id(),
|
||||
Some(file_size_hint),
|
||||
metrics.inverted_index_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply(self.file_handle.file_id(), Some(file_size_hint))
|
||||
.await;
|
||||
let selection = match apply_res {
|
||||
Ok(apply_output) => RowGroupSelection::from_inverted_index_apply_output(
|
||||
Ok(output) => RowGroupSelection::from_inverted_index_apply_output(
|
||||
row_group_size,
|
||||
num_row_groups,
|
||||
apply_output,
|
||||
output,
|
||||
),
|
||||
Err(err) => {
|
||||
handle_index_error!(err, self.file_handle, INDEX_TYPE_INVERTED);
|
||||
@@ -695,12 +670,7 @@ impl ParquetReaderBuilder {
|
||||
)
|
||||
});
|
||||
let apply_res = index_applier
|
||||
.apply(
|
||||
self.file_handle.file_id(),
|
||||
Some(file_size_hint),
|
||||
rgs,
|
||||
metrics.bloom_filter_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply(self.file_handle.file_id(), Some(file_size_hint), rgs)
|
||||
.await;
|
||||
let mut selection = match apply_res {
|
||||
Ok(apply_output) => {
|
||||
@@ -778,12 +748,7 @@ impl ParquetReaderBuilder {
|
||||
)
|
||||
});
|
||||
let apply_res = index_applier
|
||||
.apply_coarse(
|
||||
self.file_handle.file_id(),
|
||||
Some(file_size_hint),
|
||||
rgs,
|
||||
metrics.fulltext_index_apply_metrics.as_mut(),
|
||||
)
|
||||
.apply_coarse(self.file_handle.file_id(), Some(file_size_hint), rgs)
|
||||
.await;
|
||||
let mut selection = match apply_res {
|
||||
Ok(Some(apply_output)) => {
|
||||
@@ -927,7 +892,7 @@ fn all_required_row_groups_searched(
|
||||
}
|
||||
|
||||
/// Metrics of filtering rows groups and rows.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub(crate) struct ReaderFilterMetrics {
|
||||
/// Number of row groups before filtering.
|
||||
pub(crate) rg_total: usize,
|
||||
@@ -950,13 +915,6 @@ pub(crate) struct ReaderFilterMetrics {
|
||||
pub(crate) rows_bloom_filtered: usize,
|
||||
/// Number of rows filtered by precise filter.
|
||||
pub(crate) rows_precise_filtered: usize,
|
||||
|
||||
/// Optional metrics for inverted index applier.
|
||||
pub(crate) inverted_index_apply_metrics: Option<InvertedIndexApplyMetrics>,
|
||||
/// Optional metrics for bloom filter index applier.
|
||||
pub(crate) bloom_filter_apply_metrics: Option<BloomFilterIndexApplyMetrics>,
|
||||
/// Optional metrics for fulltext index applier.
|
||||
pub(crate) fulltext_index_apply_metrics: Option<FulltextIndexApplyMetrics>,
|
||||
}
|
||||
|
||||
impl ReaderFilterMetrics {
|
||||
@@ -973,23 +931,6 @@ impl ReaderFilterMetrics {
|
||||
self.rows_inverted_filtered += other.rows_inverted_filtered;
|
||||
self.rows_bloom_filtered += other.rows_bloom_filtered;
|
||||
self.rows_precise_filtered += other.rows_precise_filtered;
|
||||
|
||||
// Merge optional applier metrics
|
||||
if let Some(other_metrics) = &other.inverted_index_apply_metrics {
|
||||
self.inverted_index_apply_metrics
|
||||
.get_or_insert_with(Default::default)
|
||||
.merge_from(other_metrics);
|
||||
}
|
||||
if let Some(other_metrics) = &other.bloom_filter_apply_metrics {
|
||||
self.bloom_filter_apply_metrics
|
||||
.get_or_insert_with(Default::default)
|
||||
.merge_from(other_metrics);
|
||||
}
|
||||
if let Some(other_metrics) = &other.fulltext_index_apply_metrics {
|
||||
self.fulltext_index_apply_metrics
|
||||
.get_or_insert_with(Default::default)
|
||||
.merge_from(other_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports metrics.
|
||||
@@ -1046,64 +987,6 @@ impl ReaderFilterMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for parquet metadata cache operations.
|
||||
#[derive(Default, Clone, Copy)]
|
||||
pub(crate) struct MetadataCacheMetrics {
|
||||
/// Number of memory cache hits for parquet metadata.
|
||||
pub(crate) mem_cache_hit: usize,
|
||||
/// Number of file cache hits for parquet metadata.
|
||||
pub(crate) file_cache_hit: usize,
|
||||
/// Number of cache misses for parquet metadata.
|
||||
pub(crate) cache_miss: usize,
|
||||
/// Duration to load parquet metadata.
|
||||
pub(crate) metadata_load_cost: Duration,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MetadataCacheMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let Self {
|
||||
mem_cache_hit,
|
||||
file_cache_hit,
|
||||
cache_miss,
|
||||
metadata_load_cost,
|
||||
} = self;
|
||||
|
||||
if self.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"metadata_load_cost\":\"{:?}\"", metadata_load_cost)?;
|
||||
|
||||
if *mem_cache_hit > 0 {
|
||||
write!(f, ", \"mem_cache_hit\":{}", mem_cache_hit)?;
|
||||
}
|
||||
if *file_cache_hit > 0 {
|
||||
write!(f, ", \"file_cache_hit\":{}", file_cache_hit)?;
|
||||
}
|
||||
if *cache_miss > 0 {
|
||||
write!(f, ", \"cache_miss\":{}", cache_miss)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataCacheMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.metadata_load_cost.is_zero()
|
||||
}
|
||||
|
||||
/// Adds `other` metrics to this metrics.
|
||||
pub(crate) fn merge_from(&mut self, other: &MetadataCacheMetrics) {
|
||||
self.mem_cache_hit += other.mem_cache_hit;
|
||||
self.file_cache_hit += other.file_cache_hit;
|
||||
self.cache_miss += other.cache_miss;
|
||||
self.metadata_load_cost += other.metadata_load_cost;
|
||||
}
|
||||
}
|
||||
|
||||
/// Parquet reader metrics.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct ReaderMetrics {
|
||||
@@ -1119,10 +1002,6 @@ pub struct ReaderMetrics {
|
||||
pub(crate) num_batches: usize,
|
||||
/// Number of rows read.
|
||||
pub(crate) num_rows: usize,
|
||||
/// Metrics for parquet metadata cache.
|
||||
pub(crate) metadata_cache_metrics: MetadataCacheMetrics,
|
||||
/// Optional metrics for page/row group fetch operations.
|
||||
pub(crate) fetch_metrics: Option<Arc<ParquetFetchMetrics>>,
|
||||
}
|
||||
|
||||
impl ReaderMetrics {
|
||||
@@ -1134,15 +1013,6 @@ impl ReaderMetrics {
|
||||
self.num_record_batches += other.num_record_batches;
|
||||
self.num_batches += other.num_batches;
|
||||
self.num_rows += other.num_rows;
|
||||
self.metadata_cache_metrics
|
||||
.merge_from(&other.metadata_cache_metrics);
|
||||
if let Some(other_fetch) = &other.fetch_metrics {
|
||||
if let Some(self_fetch) = &self.fetch_metrics {
|
||||
self_fetch.merge_from(other_fetch);
|
||||
} else {
|
||||
self.fetch_metrics = Some(other_fetch.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports total rows.
|
||||
@@ -1197,10 +1067,7 @@ impl RowGroupReaderBuilder {
|
||||
&self,
|
||||
row_group_idx: usize,
|
||||
row_selection: Option<RowSelection>,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<ParquetRecordBatchReader> {
|
||||
let fetch_start = Instant::now();
|
||||
|
||||
let mut row_group = InMemoryRowGroup::create(
|
||||
self.file_handle.region_id(),
|
||||
self.file_handle.file_id().file_id(),
|
||||
@@ -1212,17 +1079,12 @@ impl RowGroupReaderBuilder {
|
||||
);
|
||||
// Fetches data into memory.
|
||||
row_group
|
||||
.fetch(&self.projection, row_selection.as_ref(), fetch_metrics)
|
||||
.fetch(&self.projection, row_selection.as_ref())
|
||||
.await
|
||||
.context(ReadParquetSnafu {
|
||||
path: &self.file_path,
|
||||
})?;
|
||||
|
||||
// Record total fetch elapsed time.
|
||||
if let Some(metrics) = fetch_metrics {
|
||||
metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed();
|
||||
}
|
||||
|
||||
// Builds the parquet reader.
|
||||
// Now the row selection is None.
|
||||
ParquetRecordBatchReader::try_new_with_row_groups(
|
||||
@@ -1366,8 +1228,6 @@ pub struct ParquetReader {
|
||||
selection: RowGroupSelection,
|
||||
/// Reader of current row group.
|
||||
reader_state: ReaderState,
|
||||
/// Metrics for tracking row group fetch operations.
|
||||
fetch_metrics: ParquetFetchMetrics,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -1387,11 +1247,7 @@ impl BatchReader for ParquetReader {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder()
|
||||
.build(
|
||||
row_group_idx,
|
||||
Some(row_selection),
|
||||
Some(&self.fetch_metrics),
|
||||
)
|
||||
.build(row_group_idx, Some(row_selection))
|
||||
.await?;
|
||||
|
||||
// Resets the parquet reader.
|
||||
@@ -1447,12 +1303,11 @@ impl ParquetReader {
|
||||
context: FileRangeContextRef,
|
||||
mut selection: RowGroupSelection,
|
||||
) -> Result<Self> {
|
||||
let fetch_metrics = ParquetFetchMetrics::default();
|
||||
// No more items in current row group, reads next row group.
|
||||
let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() {
|
||||
let parquet_reader = context
|
||||
.reader_builder()
|
||||
.build(row_group_idx, Some(row_selection), Some(&fetch_metrics))
|
||||
.build(row_group_idx, Some(row_selection))
|
||||
.await?;
|
||||
// Compute skip_fields once for this row group
|
||||
let skip_fields = context.should_skip_fields(row_group_idx);
|
||||
@@ -1469,7 +1324,6 @@ impl ParquetReader {
|
||||
context,
|
||||
selection,
|
||||
reader_state,
|
||||
fetch_metrics,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -35,175 +35,6 @@ use crate::cache::{CacheStrategy, PageKey, PageValue};
|
||||
use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
|
||||
use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges};
|
||||
|
||||
/// Inner data for ParquetFetchMetrics.
|
||||
#[derive(Default, Debug, Clone)]
|
||||
pub struct ParquetFetchMetricsData {
|
||||
/// Number of page cache hits.
|
||||
pub page_cache_hit: usize,
|
||||
/// Number of write cache hits.
|
||||
pub write_cache_hit: usize,
|
||||
/// Number of cache misses.
|
||||
pub cache_miss: usize,
|
||||
/// Number of pages to fetch from mem cache.
|
||||
pub pages_to_fetch_mem: usize,
|
||||
/// Total size in bytes of pages to fetch from mem cache.
|
||||
pub page_size_to_fetch_mem: u64,
|
||||
/// Number of pages to fetch from write cache.
|
||||
pub pages_to_fetch_write_cache: usize,
|
||||
/// Total size in bytes of pages to fetch from write cache.
|
||||
pub page_size_to_fetch_write_cache: u64,
|
||||
/// Number of pages to fetch from store.
|
||||
pub pages_to_fetch_store: usize,
|
||||
/// Total size in bytes of pages to fetch from store.
|
||||
pub page_size_to_fetch_store: u64,
|
||||
/// Total size in bytes of pages actually returned.
|
||||
pub page_size_needed: u64,
|
||||
/// Elapsed time fetching from write cache.
|
||||
pub write_cache_fetch_elapsed: std::time::Duration,
|
||||
/// Elapsed time fetching from object store.
|
||||
pub store_fetch_elapsed: std::time::Duration,
|
||||
/// Total elapsed time for fetching row groups.
|
||||
pub total_fetch_elapsed: std::time::Duration,
|
||||
}
|
||||
|
||||
impl ParquetFetchMetricsData {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
fn is_empty(&self) -> bool {
|
||||
self.total_fetch_elapsed.is_zero()
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for tracking page/row group fetch operations.
|
||||
#[derive(Default)]
|
||||
pub struct ParquetFetchMetrics {
|
||||
pub data: std::sync::Mutex<ParquetFetchMetricsData>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ParquetFetchMetrics {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let data = self.data.lock().unwrap();
|
||||
if data.is_empty() {
|
||||
return write!(f, "{{}}");
|
||||
}
|
||||
|
||||
let ParquetFetchMetricsData {
|
||||
page_cache_hit,
|
||||
write_cache_hit,
|
||||
cache_miss,
|
||||
pages_to_fetch_mem,
|
||||
page_size_to_fetch_mem,
|
||||
pages_to_fetch_write_cache,
|
||||
page_size_to_fetch_write_cache,
|
||||
pages_to_fetch_store,
|
||||
page_size_to_fetch_store,
|
||||
page_size_needed,
|
||||
write_cache_fetch_elapsed,
|
||||
store_fetch_elapsed,
|
||||
total_fetch_elapsed,
|
||||
} = *data;
|
||||
|
||||
write!(f, "{{")?;
|
||||
|
||||
write!(f, "\"total_fetch_elapsed\":\"{:?}\"", total_fetch_elapsed)?;
|
||||
|
||||
if page_cache_hit > 0 {
|
||||
write!(f, ", \"page_cache_hit\":{}", page_cache_hit)?;
|
||||
}
|
||||
if write_cache_hit > 0 {
|
||||
write!(f, ", \"write_cache_hit\":{}", write_cache_hit)?;
|
||||
}
|
||||
if cache_miss > 0 {
|
||||
write!(f, ", \"cache_miss\":{}", cache_miss)?;
|
||||
}
|
||||
if pages_to_fetch_mem > 0 {
|
||||
write!(f, ", \"pages_to_fetch_mem\":{}", pages_to_fetch_mem)?;
|
||||
}
|
||||
if page_size_to_fetch_mem > 0 {
|
||||
write!(f, ", \"page_size_to_fetch_mem\":{}", page_size_to_fetch_mem)?;
|
||||
}
|
||||
if pages_to_fetch_write_cache > 0 {
|
||||
write!(
|
||||
f,
|
||||
", \"pages_to_fetch_write_cache\":{}",
|
||||
pages_to_fetch_write_cache
|
||||
)?;
|
||||
}
|
||||
if page_size_to_fetch_write_cache > 0 {
|
||||
write!(
|
||||
f,
|
||||
", \"page_size_to_fetch_write_cache\":{}",
|
||||
page_size_to_fetch_write_cache
|
||||
)?;
|
||||
}
|
||||
if pages_to_fetch_store > 0 {
|
||||
write!(f, ", \"pages_to_fetch_store\":{}", pages_to_fetch_store)?;
|
||||
}
|
||||
if page_size_to_fetch_store > 0 {
|
||||
write!(
|
||||
f,
|
||||
", \"page_size_to_fetch_store\":{}",
|
||||
page_size_to_fetch_store
|
||||
)?;
|
||||
}
|
||||
if page_size_needed > 0 {
|
||||
write!(f, ", \"page_size_needed\":{}", page_size_needed)?;
|
||||
}
|
||||
if !write_cache_fetch_elapsed.is_zero() {
|
||||
write!(
|
||||
f,
|
||||
", \"write_cache_fetch_elapsed\":\"{:?}\"",
|
||||
write_cache_fetch_elapsed
|
||||
)?;
|
||||
}
|
||||
if !store_fetch_elapsed.is_zero() {
|
||||
write!(f, ", \"store_fetch_elapsed\":\"{:?}\"", store_fetch_elapsed)?;
|
||||
}
|
||||
|
||||
write!(f, "}}")
|
||||
}
|
||||
}
|
||||
|
||||
impl ParquetFetchMetrics {
|
||||
/// Returns true if the metrics are empty (contain no meaningful data).
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.lock().unwrap().is_empty()
|
||||
}
|
||||
|
||||
/// Merges metrics from another [ParquetFetchMetrics].
|
||||
pub fn merge_from(&self, other: &ParquetFetchMetrics) {
|
||||
let ParquetFetchMetricsData {
|
||||
page_cache_hit,
|
||||
write_cache_hit,
|
||||
cache_miss,
|
||||
pages_to_fetch_mem,
|
||||
page_size_to_fetch_mem,
|
||||
pages_to_fetch_write_cache,
|
||||
page_size_to_fetch_write_cache,
|
||||
pages_to_fetch_store,
|
||||
page_size_to_fetch_store,
|
||||
page_size_needed,
|
||||
write_cache_fetch_elapsed,
|
||||
store_fetch_elapsed,
|
||||
total_fetch_elapsed,
|
||||
} = *other.data.lock().unwrap();
|
||||
|
||||
let mut data = self.data.lock().unwrap();
|
||||
data.page_cache_hit += page_cache_hit;
|
||||
data.write_cache_hit += write_cache_hit;
|
||||
data.cache_miss += cache_miss;
|
||||
data.pages_to_fetch_mem += pages_to_fetch_mem;
|
||||
data.page_size_to_fetch_mem += page_size_to_fetch_mem;
|
||||
data.pages_to_fetch_write_cache += pages_to_fetch_write_cache;
|
||||
data.page_size_to_fetch_write_cache += page_size_to_fetch_write_cache;
|
||||
data.pages_to_fetch_store += pages_to_fetch_store;
|
||||
data.page_size_to_fetch_store += page_size_to_fetch_store;
|
||||
data.page_size_needed += page_size_needed;
|
||||
data.write_cache_fetch_elapsed += write_cache_fetch_elapsed;
|
||||
data.store_fetch_elapsed += store_fetch_elapsed;
|
||||
data.total_fetch_elapsed += total_fetch_elapsed;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RowGroupBase<'a> {
|
||||
metadata: &'a RowGroupMetaData,
|
||||
pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>,
|
||||
@@ -413,14 +244,13 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
&mut self,
|
||||
projection: &ProjectionMask,
|
||||
selection: Option<&RowSelection>,
|
||||
metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<()> {
|
||||
if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) {
|
||||
let (fetch_ranges, page_start_offsets) =
|
||||
self.base
|
||||
.calc_sparse_read_ranges(projection, offset_index, selection);
|
||||
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges).await?;
|
||||
// Assign sparse chunk data to base.
|
||||
self.base
|
||||
.assign_sparse_chunk(projection, chunk_data, page_start_offsets);
|
||||
@@ -438,7 +268,7 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
}
|
||||
|
||||
// Fetch data with ranges
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
|
||||
let chunk_data = self.fetch_bytes(&fetch_ranges).await?;
|
||||
|
||||
// Assigns fetched data to base.
|
||||
self.base.assign_dense_chunk(projection, chunk_data);
|
||||
@@ -449,74 +279,31 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
|
||||
/// Try to fetch data from the memory cache or the WriteCache,
|
||||
/// if not in WriteCache, fetch data from object store directly.
|
||||
async fn fetch_bytes(
|
||||
&self,
|
||||
ranges: &[Range<u64>],
|
||||
metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
async fn fetch_bytes(&self, ranges: &[Range<u64>]) -> Result<Vec<Bytes>> {
|
||||
// Now fetch page timer includes the whole time to read pages.
|
||||
let _timer = READ_STAGE_FETCH_PAGES.start_timer();
|
||||
|
||||
let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec());
|
||||
if let Some(pages) = self.cache_strategy.get_pages(&page_key) {
|
||||
if let Some(metrics) = metrics {
|
||||
let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum();
|
||||
let mut metrics_data = metrics.data.lock().unwrap();
|
||||
metrics_data.page_cache_hit += 1;
|
||||
metrics_data.pages_to_fetch_mem += ranges.len();
|
||||
metrics_data.page_size_to_fetch_mem += total_size;
|
||||
metrics_data.page_size_needed += total_size;
|
||||
}
|
||||
return Ok(pages.compressed.clone());
|
||||
}
|
||||
|
||||
// Calculate total range size for metrics.
|
||||
let (total_range_size, unaligned_size) = compute_total_range_size(ranges);
|
||||
|
||||
let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet);
|
||||
let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now());
|
||||
let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await;
|
||||
let pages = match write_cache_result {
|
||||
Some(data) => {
|
||||
if let Some(metrics) = metrics {
|
||||
let elapsed = fetch_write_cache_start
|
||||
.map(|start| start.elapsed())
|
||||
.unwrap_or_default();
|
||||
let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
|
||||
let mut metrics_data = metrics.data.lock().unwrap();
|
||||
metrics_data.write_cache_fetch_elapsed += elapsed;
|
||||
metrics_data.write_cache_hit += 1;
|
||||
metrics_data.pages_to_fetch_write_cache += ranges.len();
|
||||
metrics_data.page_size_to_fetch_write_cache += unaligned_size;
|
||||
metrics_data.page_size_needed += range_size_needed;
|
||||
}
|
||||
data
|
||||
}
|
||||
let pages = match self.fetch_ranges_from_write_cache(key, ranges).await {
|
||||
Some(data) => data,
|
||||
None => {
|
||||
// Fetch data from object store.
|
||||
let _timer = READ_STAGE_ELAPSED
|
||||
.with_label_values(&["cache_miss_read"])
|
||||
.start_timer();
|
||||
|
||||
let start = metrics.map(|_| std::time::Instant::now());
|
||||
let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges)
|
||||
fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges)
|
||||
.await
|
||||
.map_err(|e| ParquetError::External(Box::new(e)))?;
|
||||
if let Some(metrics) = metrics {
|
||||
let elapsed = start.map(|start| start.elapsed()).unwrap_or_default();
|
||||
let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
|
||||
let mut metrics_data = metrics.data.lock().unwrap();
|
||||
metrics_data.store_fetch_elapsed += elapsed;
|
||||
metrics_data.cache_miss += 1;
|
||||
metrics_data.pages_to_fetch_store += ranges.len();
|
||||
metrics_data.page_size_to_fetch_store += unaligned_size;
|
||||
metrics_data.page_size_needed += range_size_needed;
|
||||
}
|
||||
data
|
||||
.map_err(|e| ParquetError::External(Box::new(e)))?
|
||||
}
|
||||
};
|
||||
|
||||
// Put pages back to the cache.
|
||||
let total_range_size = compute_total_range_size(ranges);
|
||||
let page_value = PageValue::new(pages.clone(), total_range_size);
|
||||
self.cache_strategy
|
||||
.put_pages(page_key, Arc::new(page_value));
|
||||
@@ -539,21 +326,17 @@ impl<'a> InMemoryRowGroup<'a> {
|
||||
}
|
||||
|
||||
/// Computes the max possible buffer size to read the given `ranges`.
|
||||
/// Returns (aligned_size, unaligned_size) where:
|
||||
/// - aligned_size: total size aligned to pooled buffer size
|
||||
/// - unaligned_size: actual total size without alignment
|
||||
// See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192
|
||||
fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
|
||||
fn compute_total_range_size(ranges: &[Range<u64>]) -> u64 {
|
||||
if ranges.is_empty() {
|
||||
return (0, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
let gap = MERGE_GAP as u64;
|
||||
let mut sorted_ranges = ranges.to_vec();
|
||||
sorted_ranges.sort_unstable_by(|a, b| a.start.cmp(&b.start));
|
||||
|
||||
let mut total_size_aligned = 0;
|
||||
let mut total_size_unaligned = 0;
|
||||
let mut total_size = 0;
|
||||
let mut cur = sorted_ranges[0].clone();
|
||||
|
||||
for range in sorted_ranges.into_iter().skip(1) {
|
||||
@@ -562,19 +345,15 @@ fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
|
||||
cur.end = cur.end.max(range.end);
|
||||
} else {
|
||||
// No overlap and the gap is too large, add current range to total and start a new one
|
||||
let range_size = cur.end - cur.start;
|
||||
total_size_aligned += align_to_pooled_buf_size(range_size);
|
||||
total_size_unaligned += range_size;
|
||||
total_size += align_to_pooled_buf_size(cur.end - cur.start);
|
||||
cur = range;
|
||||
}
|
||||
}
|
||||
|
||||
// Add the last range
|
||||
let range_size = cur.end - cur.start;
|
||||
total_size_aligned += align_to_pooled_buf_size(range_size);
|
||||
total_size_unaligned += range_size;
|
||||
total_size += align_to_pooled_buf_size(cur.end - cur.start);
|
||||
|
||||
(total_size_aligned, total_size_unaligned)
|
||||
total_size
|
||||
}
|
||||
|
||||
/// Aligns the given size to the multiple of the pooled buffer size.
|
||||
|
||||
@@ -626,7 +626,6 @@ impl TestEnv {
|
||||
compress_type,
|
||||
checkpoint_distance,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
};
|
||||
|
||||
if let Some(metadata) = initial_metadata {
|
||||
@@ -657,7 +656,6 @@ impl TestEnv {
|
||||
None,
|
||||
self.puffin_manager.clone(),
|
||||
self.intermediate_manager.clone(),
|
||||
None, // manifest_cache
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -678,7 +676,6 @@ impl TestEnv {
|
||||
None,
|
||||
self.puffin_manager.clone(),
|
||||
self.intermediate_manager.clone(),
|
||||
ReadableSize::mb(0), // manifest_cache_capacity
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -1023,15 +1020,9 @@ pub struct MockWriteBufferManager {
|
||||
should_stall: AtomicBool,
|
||||
memory_used: AtomicUsize,
|
||||
memory_active: AtomicUsize,
|
||||
flush_limit: usize,
|
||||
}
|
||||
|
||||
impl MockWriteBufferManager {
|
||||
/// Set flush limit.
|
||||
pub fn set_flush_limit(&mut self, flush_limit: usize) {
|
||||
self.flush_limit = flush_limit;
|
||||
}
|
||||
|
||||
/// Set whether to flush the engine.
|
||||
pub fn set_should_flush(&self, value: bool) {
|
||||
self.should_flush.store(value, Ordering::Relaxed);
|
||||
@@ -1073,10 +1064,6 @@ impl WriteBufferManager for MockWriteBufferManager {
|
||||
fn memory_usage(&self) -> usize {
|
||||
self.memory_used.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn flush_limit(&self) -> usize {
|
||||
self.flush_limit
|
||||
}
|
||||
}
|
||||
|
||||
pub fn column_metadata_to_column_schema(metadata: &ColumnMetadata) -> api::v1::ColumnSchema {
|
||||
|
||||
@@ -132,7 +132,6 @@ impl SchedulerEnv {
|
||||
compress_type: CompressionType::Uncompressed,
|
||||
checkpoint_distance: 10,
|
||||
remove_file_options: Default::default(),
|
||||
manifest_cache: None,
|
||||
},
|
||||
FormatType::PrimaryKey,
|
||||
&Default::default(),
|
||||
|
||||
@@ -37,7 +37,6 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::Duration;
|
||||
|
||||
use common_base::Plugins;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::key::SchemaMetadataManagerRef;
|
||||
use common_runtime::JoinHandle;
|
||||
@@ -456,8 +455,6 @@ pub async fn write_cache_from_config(
|
||||
Some(config.index_cache_percent),
|
||||
puffin_manager_factory,
|
||||
intermediate_manager,
|
||||
// TODO(yingwen): Enable manifest cache after removing read cache.
|
||||
ReadableSize(0),
|
||||
)
|
||||
.await?;
|
||||
Ok(Some(Arc::new(cache)))
|
||||
|
||||
@@ -22,7 +22,6 @@ use common_telemetry::info;
|
||||
use common_telemetry::tracing::warn;
|
||||
use humantime_serde::re::humantime;
|
||||
use snafu::{ResultExt, ensure};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::metadata::{
|
||||
InvalidSetRegionOptionRequestSnafu, MetadataError, RegionMetadata, RegionMetadataBuilder,
|
||||
RegionMetadataRef,
|
||||
@@ -42,7 +41,7 @@ use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest};
|
||||
use crate::sst::FormatType;
|
||||
use crate::worker::RegionWorkerLoop;
|
||||
|
||||
impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
impl<S> RegionWorkerLoop<S> {
|
||||
pub(crate) async fn handle_alter_request(
|
||||
&mut self,
|
||||
region_id: RegionId,
|
||||
|
||||
@@ -51,7 +51,6 @@ where
|
||||
// Writes dropping marker
|
||||
// We rarely drop a region so we still operate in the worker loop.
|
||||
let region_dir = region.access_layer.build_region_dir(region_id);
|
||||
let table_dir = region.access_layer.table_dir().to_string();
|
||||
let marker_path = join_path(®ion_dir, DROPPING_MARKER_FILE);
|
||||
region
|
||||
.access_layer
|
||||
@@ -103,14 +102,13 @@ where
|
||||
let dropping_regions = self.dropping_regions.clone();
|
||||
let listener = self.listener.clone();
|
||||
let intm_manager = self.intermediate_manager.clone();
|
||||
let cache_manager = self.cache_manager.clone();
|
||||
common_runtime::spawn_global(async move {
|
||||
let gc_duration = listener
|
||||
.on_later_drop_begin(region_id)
|
||||
.unwrap_or(Duration::from_secs(GC_TASK_INTERVAL_SEC));
|
||||
let removed = later_drop_task(
|
||||
region_id,
|
||||
region_dir.clone(),
|
||||
region_dir,
|
||||
object_store,
|
||||
dropping_regions,
|
||||
gc_duration,
|
||||
@@ -119,16 +117,6 @@ where
|
||||
if let Err(err) = intm_manager.prune_region_dir(®ion_id).await {
|
||||
warn!(err; "Failed to prune intermediate region directory, region_id: {}", region_id);
|
||||
}
|
||||
|
||||
// Clean manifest cache for the region
|
||||
if let Some(write_cache) = cache_manager.write_cache()
|
||||
&& let Some(manifest_cache) = write_cache.manifest_cache()
|
||||
{
|
||||
// We pass the table dir so we can remove the table dir in manifest cache
|
||||
// when the last region in the same host is dropped.
|
||||
manifest_cache.clean_manifests(&table_dir).await;
|
||||
}
|
||||
|
||||
listener.on_later_drop_end(region_id, removed);
|
||||
});
|
||||
|
||||
|
||||
@@ -30,26 +30,16 @@ use crate::request::{BuildIndexRequest, FlushFailed, FlushFinished, OnFailure, O
|
||||
use crate::sst::index::IndexBuildType;
|
||||
use crate::worker::RegionWorkerLoop;
|
||||
|
||||
impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
impl<S> RegionWorkerLoop<S> {
|
||||
/// On region flush job failed.
|
||||
pub(crate) async fn handle_flush_failed(&mut self, region_id: RegionId, request: FlushFailed) {
|
||||
self.flush_scheduler.on_flush_failed(region_id, request.err);
|
||||
debug!(
|
||||
"Flush failed for region {}, handling stalled requests",
|
||||
region_id
|
||||
);
|
||||
// Maybe flush worker again.
|
||||
self.maybe_flush_worker();
|
||||
|
||||
// Handle stalled requests.
|
||||
self.handle_stalled_requests().await;
|
||||
}
|
||||
|
||||
/// Checks whether the engine reaches flush threshold. If so, finds regions in this
|
||||
/// worker to flush.
|
||||
pub(crate) fn maybe_flush_worker(&mut self) {
|
||||
if !self.write_buffer_manager.should_flush_engine() {
|
||||
debug!("No need to flush worker");
|
||||
// No need to flush worker.
|
||||
return;
|
||||
}
|
||||
@@ -66,7 +56,9 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
let regions = self.regions.list_regions();
|
||||
let now = self.time_provider.current_time_millis();
|
||||
let min_last_flush_time = now - self.config.auto_flush_interval.as_millis() as i64;
|
||||
let mut pending_regions = vec![];
|
||||
let mut max_mutable_size = 0;
|
||||
// Region with max mutable memtable size.
|
||||
let mut max_mem_region = None;
|
||||
|
||||
for region in ®ions {
|
||||
if self.flush_scheduler.is_flush_requested(region.region_id) || !region.is_writable() {
|
||||
@@ -75,8 +67,12 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
}
|
||||
|
||||
let version = region.version();
|
||||
let region_memtable_size =
|
||||
version.memtables.mutable_usage() + version.memtables.immutables_usage();
|
||||
let region_mutable_size = version.memtables.mutable_usage();
|
||||
// Tracks region with max mutable memtable size.
|
||||
if region_mutable_size > max_mutable_size {
|
||||
max_mem_region = Some(region);
|
||||
max_mutable_size = region_mutable_size;
|
||||
}
|
||||
|
||||
if region.last_flush_millis() < min_last_flush_time {
|
||||
// If flush time of this region is earlier than `min_last_flush_time`, we can flush this region.
|
||||
@@ -92,38 +88,14 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
®ion.version_control,
|
||||
task,
|
||||
)?;
|
||||
} else if region_memtable_size > 0 {
|
||||
// We should only consider regions with memtable size > 0 to flush.
|
||||
pending_regions.push((region, region_memtable_size));
|
||||
}
|
||||
}
|
||||
pending_regions.sort_unstable_by_key(|(_, size)| std::cmp::Reverse(*size));
|
||||
// The flush target is the mutable memtable limit (half of the global buffer).
|
||||
// When memory is full, we aggressively flush regions until usage drops below this target,
|
||||
// not just below the full limit.
|
||||
let target_memory_usage = self.write_buffer_manager.flush_limit();
|
||||
let mut memory_usage = self.write_buffer_manager.memory_usage();
|
||||
|
||||
#[cfg(test)]
|
||||
// Flush memtable with max mutable memtable.
|
||||
// TODO(yingwen): Maybe flush more tables to reduce write buffer size.
|
||||
if let Some(region) = max_mem_region
|
||||
&& !self.flush_scheduler.is_flush_requested(region.region_id)
|
||||
{
|
||||
debug!(
|
||||
"Flushing regions on engine full, target memory usage: {}, memory usage: {}, pending regions: {:?}",
|
||||
target_memory_usage,
|
||||
memory_usage,
|
||||
pending_regions
|
||||
.iter()
|
||||
.map(|(region, mem_size)| (region.region_id, mem_size))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
// Iterate over pending regions in descending order of their memory size and schedule flush tasks
|
||||
// for each region until the overall memory usage drops below the flush limit.
|
||||
for (region, region_mem_size) in pending_regions.into_iter() {
|
||||
// Make sure the first region is always flushed.
|
||||
if memory_usage < target_memory_usage {
|
||||
// Stop flushing regions if memory usage is already below the flush limit
|
||||
break;
|
||||
}
|
||||
let task = self.new_flush_task(
|
||||
region,
|
||||
FlushReason::EngineFull,
|
||||
@@ -131,12 +103,8 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
debug!("Scheduling flush task for region {}", region.region_id);
|
||||
// Schedule a flush task for the current region
|
||||
self.flush_scheduler
|
||||
.schedule_flush(region.region_id, ®ion.version_control, task)?;
|
||||
// Reduce memory usage by the region's size, ensuring it doesn't go negative
|
||||
memory_usage = memory_usage.saturating_sub(region_mem_size);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -323,9 +291,6 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
.await;
|
||||
}
|
||||
|
||||
// Maybe flush worker again.
|
||||
self.maybe_flush_worker();
|
||||
|
||||
// Handle stalled requests.
|
||||
self.handle_stalled_requests().await;
|
||||
|
||||
|
||||
@@ -22,10 +22,9 @@ use api::v1::region::{
|
||||
};
|
||||
use arrow::array::Array;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use bytes::Bytes;
|
||||
use common_base::AffectedRows;
|
||||
use common_grpc::FlightData;
|
||||
use common_grpc::flight::{FlightEncoder, FlightMessage};
|
||||
use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage};
|
||||
use common_telemetry::error;
|
||||
use common_telemetry::tracing_context::TracingContext;
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
@@ -41,20 +40,33 @@ impl Inserter {
|
||||
pub async fn handle_bulk_insert(
|
||||
&self,
|
||||
table: TableRef,
|
||||
raw_flight_data: FlightData,
|
||||
record_batch: RecordBatch,
|
||||
schema_bytes: Bytes,
|
||||
decoder: &mut FlightDecoder,
|
||||
data: FlightData,
|
||||
) -> error::Result<AffectedRows> {
|
||||
let table_info = table.table_info();
|
||||
let table_id = table_info.table_id();
|
||||
let db_name = table_info.get_db_string();
|
||||
let decode_timer = metrics::HANDLE_BULK_INSERT_ELAPSED
|
||||
.with_label_values(&["decode_request"])
|
||||
.start_timer();
|
||||
let body_size = data.data_body.len();
|
||||
// Build region server requests
|
||||
let message = decoder
|
||||
.try_decode(&data)
|
||||
.context(error::DecodeFlightDataSnafu)?
|
||||
.context(error::NotSupportedSnafu {
|
||||
feat: "bulk insert RecordBatch with dictionary arrays",
|
||||
})?;
|
||||
let FlightMessage::RecordBatch(record_batch) = message else {
|
||||
return Ok(0);
|
||||
};
|
||||
decode_timer.observe_duration();
|
||||
|
||||
if record_batch.num_rows() == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let body_size = raw_flight_data.data_body.len();
|
||||
// TODO(yingwen): Fill record batch impure default values. Note that we should override `raw_flight_data` if we have to fill defaults.
|
||||
// TODO(yingwen): Fill record batch impure default values.
|
||||
// notify flownode to update dirty timestamps if flow is configured.
|
||||
self.maybe_update_flow_dirty_window(table_info.clone(), record_batch.clone());
|
||||
|
||||
@@ -63,6 +75,8 @@ impl Inserter {
|
||||
.with_label_values(&["raw"])
|
||||
.observe(record_batch.num_rows() as f64);
|
||||
|
||||
// safety: when reach here schema must be present.
|
||||
let schema_bytes = decoder.schema_bytes().unwrap();
|
||||
let partition_timer = metrics::HANDLE_BULK_INSERT_ELAPSED
|
||||
.with_label_values(&["partition"])
|
||||
.start_timer();
|
||||
@@ -92,7 +106,6 @@ impl Inserter {
|
||||
.find_region_leader(region_id)
|
||||
.await
|
||||
.context(error::FindRegionLeaderSnafu)?;
|
||||
|
||||
let request = RegionRequest {
|
||||
header: Some(RegionRequestHeader {
|
||||
tracing_context: TracingContext::from_current_span().to_w3c(),
|
||||
@@ -101,9 +114,9 @@ impl Inserter {
|
||||
body: Some(region_request::Body::BulkInsert(BulkInsertRequest {
|
||||
region_id: region_id.as_u64(),
|
||||
body: Some(bulk_insert_request::Body::ArrowIpc(ArrowIpc {
|
||||
schema: schema_bytes.clone(),
|
||||
data_header: raw_flight_data.data_header,
|
||||
payload: raw_flight_data.data_body,
|
||||
schema: schema_bytes,
|
||||
data_header: data.data_header,
|
||||
payload: data.data_body,
|
||||
})),
|
||||
})),
|
||||
};
|
||||
@@ -145,6 +158,8 @@ impl Inserter {
|
||||
|
||||
let mut handles = Vec::with_capacity(mask_per_datanode.len());
|
||||
|
||||
// raw daya header and payload bytes.
|
||||
let mut raw_data_bytes = None;
|
||||
for (peer, masks) in mask_per_datanode {
|
||||
for (region_id, mask) in masks {
|
||||
if mask.select_none() {
|
||||
@@ -155,10 +170,13 @@ impl Inserter {
|
||||
let node_manager = self.node_manager.clone();
|
||||
let peer = peer.clone();
|
||||
let raw_header_and_data = if mask.select_all() {
|
||||
Some((
|
||||
raw_flight_data.data_header.clone(),
|
||||
raw_flight_data.data_body.clone(),
|
||||
))
|
||||
Some(
|
||||
raw_data_bytes
|
||||
.get_or_insert_with(|| {
|
||||
(data.data_header.clone(), data.data_body.clone())
|
||||
})
|
||||
.clone(),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
@@ -34,6 +34,7 @@ use itertools::Itertools;
|
||||
use operator::insert::InserterRef;
|
||||
use operator::statement::StatementExecutorRef;
|
||||
use query::QueryEngineRef;
|
||||
use query::dataframe::DataFrame;
|
||||
use session::context::{QueryContextBuilder, QueryContextRef};
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
use table::TableRef;
|
||||
@@ -412,6 +413,7 @@ impl PipelineTable {
|
||||
.query_engine
|
||||
.read_table(self.table.clone())
|
||||
.context(DataFrameSnafu)?;
|
||||
let DataFrame::DataFusion(dataframe) = dataframe;
|
||||
|
||||
let dataframe = dataframe
|
||||
.filter(prepare_dataframe_conditions(name, version))
|
||||
@@ -472,6 +474,7 @@ impl PipelineTable {
|
||||
.query_engine
|
||||
.read_table(self.table.clone())
|
||||
.context(DataFrameSnafu)?;
|
||||
let DataFrame::DataFusion(dataframe) = dataframe;
|
||||
|
||||
// select all pipelines with name and version
|
||||
let dataframe = dataframe
|
||||
|
||||
@@ -32,15 +32,6 @@ use crate::blob_metadata::{BlobMetadata, CompressionCodec};
|
||||
use crate::error::Result;
|
||||
use crate::file_metadata::FileMetadata;
|
||||
|
||||
/// Metrics returned by `PuffinReader::dir` operations.
|
||||
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
|
||||
pub struct DirMetrics {
|
||||
/// Whether this was a cache hit (true) or cache miss (false).
|
||||
pub cache_hit: bool,
|
||||
/// Size of the directory in bytes.
|
||||
pub dir_size: u64,
|
||||
}
|
||||
|
||||
/// The `PuffinManager` trait provides a unified interface for creating `PuffinReader` and `PuffinWriter`.
|
||||
#[async_trait]
|
||||
pub trait PuffinManager {
|
||||
@@ -115,10 +106,9 @@ pub trait PuffinReader {
|
||||
|
||||
/// Reads a directory from the Puffin file.
|
||||
///
|
||||
/// The returned tuple contains `GuardWithMetadata` and `DirMetrics`.
|
||||
/// The `GuardWithMetadata` is used to access the directory data and its metadata.
|
||||
/// The returned `GuardWithMetadata` is used to access the directory data and its metadata.
|
||||
/// Users should hold the `GuardWithMetadata` until they are done with the directory data.
|
||||
async fn dir(&self, key: &str) -> Result<(GuardWithMetadata<Self::Dir>, DirMetrics)>;
|
||||
async fn dir(&self, key: &str) -> Result<GuardWithMetadata<Self::Dir>>;
|
||||
}
|
||||
|
||||
/// `BlobGuard` is provided by the `PuffinReader` to access the blob data.
|
||||
|
||||
@@ -36,7 +36,7 @@ use crate::puffin_manager::file_accessor::PuffinFileAccessor;
|
||||
use crate::puffin_manager::fs_puffin_manager::PuffinMetadataCacheRef;
|
||||
use crate::puffin_manager::fs_puffin_manager::dir_meta::DirMetadata;
|
||||
use crate::puffin_manager::stager::{BoxWriter, DirWriterProviderRef, Stager};
|
||||
use crate::puffin_manager::{BlobGuard, DirMetrics, GuardWithMetadata, PuffinReader};
|
||||
use crate::puffin_manager::{BlobGuard, GuardWithMetadata, PuffinReader};
|
||||
|
||||
/// `FsPuffinReader` is a `PuffinReader` that provides fs readers for puffin files.
|
||||
pub struct FsPuffinReader<S, F>
|
||||
@@ -130,10 +130,10 @@ where
|
||||
Ok(GuardWithMetadata::new(blob, blob_metadata))
|
||||
}
|
||||
|
||||
async fn dir(&self, key: &str) -> Result<(GuardWithMetadata<Self::Dir>, DirMetrics)> {
|
||||
async fn dir(&self, key: &str) -> Result<GuardWithMetadata<Self::Dir>> {
|
||||
let mut file = self.puffin_reader().await?;
|
||||
let blob_metadata = self.get_blob_metadata(key, &mut file).await?;
|
||||
let (dir, metrics) = self
|
||||
let dir = self
|
||||
.stager
|
||||
.get_dir(
|
||||
&self.handle,
|
||||
@@ -153,7 +153,7 @@ where
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok((GuardWithMetadata::new(dir, blob_metadata), metrics))
|
||||
Ok(GuardWithMetadata::new(dir, blob_metadata))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ use futures::AsyncWrite;
|
||||
use futures::future::BoxFuture;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::puffin_manager::{BlobGuard, DirGuard, DirMetrics};
|
||||
use crate::puffin_manager::{BlobGuard, DirGuard};
|
||||
|
||||
pub type BoxWriter = Box<dyn AsyncWrite + Unpin + Send>;
|
||||
|
||||
@@ -72,15 +72,14 @@ pub trait Stager: Send + Sync {
|
||||
|
||||
/// Retrieves a directory, initializing it if necessary using the provided `init_fn`.
|
||||
///
|
||||
/// The returned tuple contains the `DirGuard` and `DirMetrics`.
|
||||
/// The `DirGuard` is used to access the directory in the filesystem.
|
||||
/// The returned `DirGuard` is used to access the directory in the filesystem.
|
||||
/// The caller is responsible for holding the `DirGuard` until they are done with the directory.
|
||||
async fn get_dir<'a>(
|
||||
&self,
|
||||
handle: &Self::FileHandle,
|
||||
key: &str,
|
||||
init_fn: Box<dyn InitDirFn + Send + Sync + 'a>,
|
||||
) -> Result<(Self::Dir, DirMetrics)>;
|
||||
) -> Result<Self::Dir>;
|
||||
|
||||
/// Stores a directory in the staging area.
|
||||
async fn put_dir(
|
||||
|
||||
@@ -41,7 +41,7 @@ use crate::error::{
|
||||
use crate::puffin_manager::stager::{
|
||||
BoxWriter, DirWriterProvider, InitBlobFn, InitDirFn, Stager, StagerNotifier,
|
||||
};
|
||||
use crate::puffin_manager::{BlobGuard, DirGuard, DirMetrics};
|
||||
use crate::puffin_manager::{BlobGuard, DirGuard};
|
||||
|
||||
const DELETE_QUEUE_SIZE: usize = 10240;
|
||||
const TMP_EXTENSION: &str = "tmp";
|
||||
@@ -203,7 +203,7 @@ impl<H: ToString + Clone + Send + Sync> Stager for BoundedStager<H> {
|
||||
handle: &Self::FileHandle,
|
||||
key: &str,
|
||||
init_fn: Box<dyn InitDirFn + Send + Sync + 'a>,
|
||||
) -> Result<(Self::Dir, DirMetrics)> {
|
||||
) -> Result<Self::Dir> {
|
||||
let handle_str = handle.to_string();
|
||||
|
||||
let cache_key = Self::encode_cache_key(&handle_str, key);
|
||||
@@ -242,22 +242,15 @@ impl<H: ToString + Clone + Send + Sync> Stager for BoundedStager<H> {
|
||||
.await
|
||||
.context(CacheGetSnafu)?;
|
||||
|
||||
let dir_size = v.size();
|
||||
if let Some(notifier) = self.notifier.as_ref() {
|
||||
if miss {
|
||||
notifier.on_cache_miss(dir_size);
|
||||
notifier.on_cache_miss(v.size());
|
||||
} else {
|
||||
notifier.on_cache_hit(dir_size);
|
||||
notifier.on_cache_hit(v.size());
|
||||
}
|
||||
}
|
||||
|
||||
let metrics = DirMetrics {
|
||||
cache_hit: !miss,
|
||||
dir_size,
|
||||
};
|
||||
|
||||
match v {
|
||||
CacheValue::Dir(guard) => Ok((guard, metrics)),
|
||||
CacheValue::Dir(guard) => Ok(guard),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
@@ -889,7 +882,7 @@ mod tests {
|
||||
|
||||
let puffin_file_name = "test_get_dir".to_string();
|
||||
let key = "key";
|
||||
let (dir_path, metrics) = stager
|
||||
let dir_path = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
key,
|
||||
@@ -908,9 +901,6 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(!metrics.cache_hit);
|
||||
assert!(metrics.dir_size > 0);
|
||||
|
||||
for (rel_path, content) in &files_in_dir {
|
||||
let file_path = dir_path.path().join(rel_path);
|
||||
let mut file = tokio::fs::File::open(&file_path).await.unwrap();
|
||||
@@ -984,7 +974,7 @@ mod tests {
|
||||
];
|
||||
|
||||
let dir_key = "dir_key";
|
||||
let (guard, _metrics) = stager
|
||||
let guard = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
dir_key,
|
||||
@@ -1026,7 +1016,7 @@ mod tests {
|
||||
let buf = reader.read(0..m.content_length).await.unwrap();
|
||||
assert_eq!(&*buf, b"hello world");
|
||||
|
||||
let (dir_path, metrics) = stager
|
||||
let dir_path = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
dir_key,
|
||||
@@ -1034,9 +1024,6 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(metrics.cache_hit);
|
||||
assert!(metrics.dir_size > 0);
|
||||
for (rel_path, content) in &files_in_dir {
|
||||
let file_path = dir_path.path().join(rel_path);
|
||||
let mut file = tokio::fs::File::open(&file_path).await.unwrap();
|
||||
@@ -1164,7 +1151,7 @@ mod tests {
|
||||
];
|
||||
|
||||
// First time to get the directory
|
||||
let (guard_0, _metrics) = stager
|
||||
let guard_0 = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
dir_key,
|
||||
@@ -1211,7 +1198,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// Second time to get the directory
|
||||
let (guard_1, _metrics) = stager
|
||||
let guard_1 = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
dir_key,
|
||||
@@ -1250,7 +1237,7 @@ mod tests {
|
||||
// Third time to get the directory and all guards are dropped
|
||||
drop(guard_0);
|
||||
drop(guard_1);
|
||||
let (guard_2, _metrics) = stager
|
||||
let guard_2 = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
dir_key,
|
||||
@@ -1403,7 +1390,7 @@ mod tests {
|
||||
];
|
||||
|
||||
let dir_key = "dir_key";
|
||||
let (guard, _metrics) = stager
|
||||
let guard = stager
|
||||
.get_dir(
|
||||
&puffin_file_name,
|
||||
dir_key,
|
||||
|
||||
@@ -356,7 +356,7 @@ async fn check_dir(
|
||||
stager: &BoundedStager<String>,
|
||||
puffin_reader: &impl PuffinReader,
|
||||
) {
|
||||
let (res_dir, _metrics) = puffin_reader.dir(key).await.unwrap();
|
||||
let res_dir = puffin_reader.dir(key).await.unwrap();
|
||||
let metadata = res_dir.metadata();
|
||||
assert_eq!(
|
||||
metadata.properties,
|
||||
|
||||
31
src/query/src/dataframe.rs
Normal file
31
src/query/src/dataframe.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use datafusion::dataframe::DataFrame as DfDataFrame;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
|
||||
/// DataFrame represents a logical set of rows with the same named columns.
|
||||
/// Similar to a Pandas DataFrame or Spark DataFrame
|
||||
#[derive(Clone)]
|
||||
pub enum DataFrame {
|
||||
DataFusion(DfDataFrame),
|
||||
}
|
||||
|
||||
impl DataFrame {
|
||||
pub fn into_logical_plan(self) -> LogicalPlan {
|
||||
match self {
|
||||
Self::DataFusion(dataframe) => dataframe.into_parts().1,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -32,7 +32,6 @@ use common_recordbatch::adapter::RecordBatchStreamAdapter;
|
||||
use common_recordbatch::{EmptyRecordBatchStream, SendableRecordBatchStream};
|
||||
use common_telemetry::tracing;
|
||||
use datafusion::catalog::TableFunction;
|
||||
use datafusion::dataframe::DataFrame;
|
||||
use datafusion::physical_plan::ExecutionPlan;
|
||||
use datafusion::physical_plan::analyze::AnalyzeExec;
|
||||
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
|
||||
@@ -50,6 +49,7 @@ use table::TableRef;
|
||||
use table::requests::{DeleteRequest, InsertRequest};
|
||||
|
||||
use crate::analyze::DistAnalyzeExec;
|
||||
use crate::dataframe::DataFrame;
|
||||
pub use crate::datafusion::planner::DfContextProviderAdapter;
|
||||
use crate::dist_plan::{DistPlannerOptions, MergeScanLogicalPlan};
|
||||
use crate::error::{
|
||||
@@ -515,11 +515,13 @@ impl QueryEngine for DatafusionQueryEngine {
|
||||
}
|
||||
|
||||
fn read_table(&self, table: TableRef) -> Result<DataFrame> {
|
||||
self.state
|
||||
.read_table(table)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)
|
||||
Ok(DataFrame::DataFusion(
|
||||
self.state
|
||||
.read_table(table)
|
||||
.context(error::DatafusionSnafu)
|
||||
.map_err(BoxedError::new)
|
||||
.context(QueryExecutionSnafu)?,
|
||||
))
|
||||
}
|
||||
|
||||
fn engine_context(&self, query_ctx: QueryContextRef) -> QueryEngineContext {
|
||||
@@ -797,7 +799,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let df = engine.read_table(table).unwrap();
|
||||
let DataFrame::DataFusion(df) = engine.read_table(table).unwrap();
|
||||
let df = df
|
||||
.select_columns(&["number"])
|
||||
.unwrap()
|
||||
|
||||
@@ -187,7 +187,7 @@ impl Categorizer {
|
||||
if partition_cols.is_empty() {
|
||||
Commutativity::Commutative
|
||||
} else {
|
||||
Commutativity::PartialCommutative
|
||||
Commutativity::Unimplemented
|
||||
}
|
||||
}
|
||||
LogicalPlan::Unnest(_) => Commutativity::Commutative,
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#![feature(box_patterns)]
|
||||
|
||||
mod analyze;
|
||||
pub mod dataframe;
|
||||
pub mod datafusion;
|
||||
pub mod dist_plan;
|
||||
pub mod dummy_catalog;
|
||||
|
||||
@@ -29,7 +29,6 @@ use common_function::handlers::{
|
||||
};
|
||||
use common_query::Output;
|
||||
use datafusion::catalog::TableFunction;
|
||||
use datafusion::dataframe::DataFrame;
|
||||
use datafusion_expr::{AggregateUDF, LogicalPlan};
|
||||
use datatypes::schema::Schema;
|
||||
pub use default_serializer::{DefaultPlanDecoder, DefaultSerializer};
|
||||
@@ -37,6 +36,7 @@ use partition::manager::PartitionRuleManagerRef;
|
||||
use session::context::QueryContextRef;
|
||||
use table::TableRef;
|
||||
|
||||
use crate::dataframe::DataFrame;
|
||||
use crate::datafusion::DatafusionQueryEngine;
|
||||
use crate::error::Result;
|
||||
use crate::options::QueryOptions;
|
||||
|
||||
@@ -68,6 +68,7 @@ use table::TableRef;
|
||||
use table::requests::{FILE_TABLE_LOCATION_KEY, FILE_TABLE_PATTERN_KEY};
|
||||
|
||||
use crate::QueryEngineRef;
|
||||
use crate::dataframe::DataFrame;
|
||||
use crate::error::{self, Result, UnsupportedVariableSnafu};
|
||||
use crate::planner::DfLogicalPlanner;
|
||||
|
||||
@@ -269,7 +270,7 @@ async fn query_from_information_schema_table(
|
||||
),
|
||||
})?;
|
||||
|
||||
let dataframe = query_engine.read_table(table)?;
|
||||
let DataFrame::DataFusion(dataframe) = query_engine.read_table(table)?;
|
||||
|
||||
// Apply filters
|
||||
let dataframe = filters.into_iter().try_fold(dataframe, |df, expr| {
|
||||
|
||||
@@ -87,7 +87,7 @@ operator.workspace = true
|
||||
otel-arrow-rust.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pg_interval = "0.4"
|
||||
pgwire = { version = "0.36.3", default-features = false, features = [
|
||||
pgwire = { version = "0.36.1", default-features = false, features = [
|
||||
"server-api-ring",
|
||||
"pg-ext-types",
|
||||
] }
|
||||
|
||||
@@ -25,15 +25,12 @@ use arrow_flight::{
|
||||
HandshakeRequest, HandshakeResponse, PollInfo, PutResult, SchemaResult, Ticket,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use bytes;
|
||||
use bytes::Bytes;
|
||||
use common_grpc::flight::do_put::{DoPutMetadata, DoPutResponse};
|
||||
use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage};
|
||||
use common_grpc::flight::{FlightEncoder, FlightMessage};
|
||||
use common_query::{Output, OutputData};
|
||||
use common_recordbatch::DfRecordBatch;
|
||||
use common_telemetry::tracing::info_span;
|
||||
use common_telemetry::tracing_context::{FutureExt, TracingContext};
|
||||
use datatypes::arrow::datatypes::SchemaRef;
|
||||
use futures::{Stream, future, ready};
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use prost::Message;
|
||||
@@ -44,7 +41,7 @@ use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tonic::{Request, Response, Status, Streaming};
|
||||
|
||||
use crate::error::{InvalidParameterSnafu, Result, ToJsonSnafu};
|
||||
use crate::error::{InvalidParameterSnafu, ParseJsonSnafu, Result, ToJsonSnafu};
|
||||
pub use crate::grpc::flight::stream::FlightRecordBatchStream;
|
||||
use crate::grpc::greptime_handler::{GreptimeRequestHandler, get_request_type};
|
||||
use crate::grpc::{FlightCompression, TonicResult, context_auth};
|
||||
@@ -226,15 +223,14 @@ impl FlightCraft for GreptimeRequestHandler {
|
||||
const MAX_PENDING_RESPONSES: usize = 32;
|
||||
let (tx, rx) = mpsc::channel::<TonicResult<DoPutResponse>>(MAX_PENDING_RESPONSES);
|
||||
|
||||
let stream = PutRecordBatchRequestStream::new(
|
||||
stream,
|
||||
query_ctx.current_catalog().to_string(),
|
||||
query_ctx.current_schema(),
|
||||
let stream = PutRecordBatchRequestStream {
|
||||
flight_data_stream: stream,
|
||||
state: PutRecordBatchRequestStreamState::Init(
|
||||
query_ctx.current_catalog().to_string(),
|
||||
query_ctx.current_schema(),
|
||||
),
|
||||
limiter,
|
||||
)
|
||||
.await?;
|
||||
// Ack to the first schema message when we successfully built the stream.
|
||||
let _ = tx.send(Ok(DoPutResponse::new(0, 0, 0.0))).await;
|
||||
};
|
||||
self.put_record_batches(stream, tx, query_ctx).await;
|
||||
|
||||
let response = ReceiverStream::new(rx)
|
||||
@@ -253,33 +249,33 @@ impl FlightCraft for GreptimeRequestHandler {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PutRecordBatchRequest {
|
||||
pub table_name: TableName,
|
||||
pub request_id: i64,
|
||||
pub record_batch: DfRecordBatch,
|
||||
pub schema_bytes: Bytes,
|
||||
pub flight_data: FlightData,
|
||||
pub(crate) struct PutRecordBatchRequest {
|
||||
pub(crate) table_name: TableName,
|
||||
pub(crate) request_id: i64,
|
||||
pub(crate) data: FlightData,
|
||||
pub(crate) _guard: Option<RequestMemoryGuard>,
|
||||
}
|
||||
|
||||
impl PutRecordBatchRequest {
|
||||
fn try_new(
|
||||
table_name: TableName,
|
||||
record_batch: DfRecordBatch,
|
||||
request_id: i64,
|
||||
schema_bytes: Bytes,
|
||||
flight_data: FlightData,
|
||||
limiter: Option<&RequestMemoryLimiter>,
|
||||
) -> Result<Self> {
|
||||
let memory_usage = flight_data.data_body.len()
|
||||
+ flight_data.app_metadata.len()
|
||||
+ flight_data.data_header.len();
|
||||
let request_id = if !flight_data.app_metadata.is_empty() {
|
||||
let metadata: DoPutMetadata =
|
||||
serde_json::from_slice(&flight_data.app_metadata).context(ParseJsonSnafu)?;
|
||||
metadata.request_id()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let _guard = limiter
|
||||
.filter(|limiter| limiter.is_enabled())
|
||||
.map(|limiter| {
|
||||
let message_size = flight_data.encoded_len();
|
||||
limiter
|
||||
.try_acquire(memory_usage)
|
||||
.try_acquire(message_size)
|
||||
.map(|guard| {
|
||||
guard.inspect(|g| {
|
||||
METRIC_GRPC_MEMORY_USAGE_BYTES.set(g.current_usage() as i64);
|
||||
@@ -295,32 +291,27 @@ impl PutRecordBatchRequest {
|
||||
Ok(Self {
|
||||
table_name,
|
||||
request_id,
|
||||
record_batch,
|
||||
schema_bytes,
|
||||
flight_data,
|
||||
data: flight_data,
|
||||
_guard,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PutRecordBatchRequestStream {
|
||||
pub(crate) struct PutRecordBatchRequestStream {
|
||||
flight_data_stream: Streaming<FlightData>,
|
||||
table_name: TableName,
|
||||
schema: SchemaRef,
|
||||
schema_bytes: Bytes,
|
||||
decoder: FlightDecoder,
|
||||
state: PutRecordBatchRequestStreamState,
|
||||
limiter: Option<RequestMemoryLimiter>,
|
||||
}
|
||||
|
||||
impl PutRecordBatchRequestStream {
|
||||
/// Creates a new `PutRecordBatchRequestStream` by waiting for the first message,
|
||||
/// extracting the table name from the flight descriptor, and decoding the schema.
|
||||
pub async fn new(
|
||||
mut flight_data_stream: Streaming<FlightData>,
|
||||
catalog: String,
|
||||
schema: String,
|
||||
limiter: Option<RequestMemoryLimiter>,
|
||||
) -> TonicResult<Self> {
|
||||
enum PutRecordBatchRequestStreamState {
|
||||
Init(String, String),
|
||||
Started(TableName),
|
||||
}
|
||||
|
||||
impl Stream for PutRecordBatchRequestStream {
|
||||
type Item = TonicResult<PutRecordBatchRequest>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
fn extract_table_name(mut descriptor: FlightDescriptor) -> Result<String> {
|
||||
ensure!(
|
||||
descriptor.r#type == arrow_flight::flight_descriptor::DescriptorType::Path as i32,
|
||||
@@ -337,131 +328,56 @@ impl PutRecordBatchRequestStream {
|
||||
Ok(descriptor.path.remove(0))
|
||||
}
|
||||
|
||||
// Wait for the first message which must be a Schema message
|
||||
let first_message = flight_data_stream.next().await.ok_or_else(|| {
|
||||
Status::failed_precondition("flight data stream ended unexpectedly")
|
||||
})??;
|
||||
let poll = ready!(self.flight_data_stream.poll_next_unpin(cx));
|
||||
let limiter = self.limiter.clone();
|
||||
|
||||
let flight_descriptor = first_message
|
||||
.flight_descriptor
|
||||
.as_ref()
|
||||
.ok_or_else(|| {
|
||||
Status::failed_precondition("table to put is not found in flight descriptor")
|
||||
})?
|
||||
.clone();
|
||||
let result = match &mut self.state {
|
||||
PutRecordBatchRequestStreamState::Init(catalog, schema) => match poll {
|
||||
Some(Ok(mut flight_data)) => {
|
||||
let flight_descriptor = flight_data.flight_descriptor.take();
|
||||
let result = if let Some(descriptor) = flight_descriptor {
|
||||
let table_name = extract_table_name(descriptor)
|
||||
.map(|x| TableName::new(catalog.clone(), schema.clone(), x));
|
||||
let table_name = match table_name {
|
||||
Ok(table_name) => table_name,
|
||||
Err(e) => return Poll::Ready(Some(Err(e.into()))),
|
||||
};
|
||||
|
||||
let table_name_str = extract_table_name(flight_descriptor)
|
||||
.map_err(|e| Status::invalid_argument(e.to_string()))?;
|
||||
let table_name = TableName::new(catalog, schema, table_name_str);
|
||||
let request = PutRecordBatchRequest::try_new(
|
||||
table_name.clone(),
|
||||
flight_data,
|
||||
limiter.as_ref(),
|
||||
);
|
||||
let request = match request {
|
||||
Ok(request) => request,
|
||||
Err(e) => return Poll::Ready(Some(Err(e.into()))),
|
||||
};
|
||||
|
||||
// Decode the first message as schema
|
||||
let mut decoder = FlightDecoder::default();
|
||||
let schema_message = decoder
|
||||
.try_decode(&first_message)
|
||||
.map_err(|e| Status::invalid_argument(format!("Failed to decode schema: {}", e)))?;
|
||||
self.state = PutRecordBatchRequestStreamState::Started(table_name);
|
||||
|
||||
let (schema, schema_bytes) = match schema_message {
|
||||
Some(FlightMessage::Schema(schema)) => {
|
||||
let schema_bytes = decoder.schema_bytes().ok_or_else(|| {
|
||||
Status::internal("decoder should have schema bytes after decoding schema")
|
||||
})?;
|
||||
(schema, schema_bytes)
|
||||
}
|
||||
_ => {
|
||||
return Err(Status::failed_precondition(
|
||||
"first message must be a Schema message",
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
flight_data_stream,
|
||||
table_name,
|
||||
schema,
|
||||
schema_bytes,
|
||||
decoder,
|
||||
limiter,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the table name extracted from the flight descriptor.
|
||||
pub fn table_name(&self) -> &TableName {
|
||||
&self.table_name
|
||||
}
|
||||
|
||||
/// Returns the Arrow schema decoded from the first flight message.
|
||||
pub fn schema(&self) -> &SchemaRef {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Returns the raw schema bytes in IPC format.
|
||||
pub fn schema_bytes(&self) -> &Bytes {
|
||||
&self.schema_bytes
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for PutRecordBatchRequestStream {
|
||||
type Item = TonicResult<PutRecordBatchRequest>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
loop {
|
||||
let poll = ready!(self.flight_data_stream.poll_next_unpin(cx));
|
||||
|
||||
match poll {
|
||||
Some(Ok(flight_data)) => {
|
||||
// Extract request_id and body_size from FlightData before decoding
|
||||
let request_id = if !flight_data.app_metadata.is_empty() {
|
||||
match serde_json::from_slice::<DoPutMetadata>(&flight_data.app_metadata) {
|
||||
Ok(metadata) => metadata.request_id(),
|
||||
Err(_) => 0,
|
||||
}
|
||||
Ok(request)
|
||||
} else {
|
||||
0
|
||||
Err(Status::failed_precondition(
|
||||
"table to put is not found in flight descriptor",
|
||||
))
|
||||
};
|
||||
|
||||
// Decode FlightData to RecordBatch
|
||||
match self.decoder.try_decode(&flight_data) {
|
||||
Ok(Some(FlightMessage::RecordBatch(record_batch))) => {
|
||||
let limiter = self.limiter.clone();
|
||||
let table_name = self.table_name.clone();
|
||||
let schema_bytes = self.schema_bytes.clone();
|
||||
return Poll::Ready(Some(
|
||||
PutRecordBatchRequest::try_new(
|
||||
table_name,
|
||||
record_batch,
|
||||
request_id,
|
||||
schema_bytes,
|
||||
flight_data,
|
||||
limiter.as_ref(),
|
||||
)
|
||||
.map_err(|e| Status::invalid_argument(e.to_string())),
|
||||
));
|
||||
}
|
||||
Ok(Some(_)) => {
|
||||
return Poll::Ready(Some(Err(Status::invalid_argument(
|
||||
"Expected RecordBatch message, got other message type",
|
||||
))));
|
||||
}
|
||||
Ok(None) => {
|
||||
// Dictionary batch - processed internally by decoder, continue polling
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
return Poll::Ready(Some(Err(Status::invalid_argument(format!(
|
||||
"Failed to decode RecordBatch: {}",
|
||||
e
|
||||
)))));
|
||||
}
|
||||
}
|
||||
Some(result)
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
return Poll::Ready(Some(Err(e)));
|
||||
}
|
||||
None => {
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
},
|
||||
PutRecordBatchRequestStreamState::Started(table_name) => poll.map(|x| {
|
||||
x.and_then(|flight_data| {
|
||||
PutRecordBatchRequest::try_new(
|
||||
table_name.clone(),
|
||||
flight_data,
|
||||
limiter.as_ref(),
|
||||
)
|
||||
.map_err(Into::into)
|
||||
})
|
||||
}),
|
||||
};
|
||||
Poll::Ready(result)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
|
||||
use common_catalog::parse_catalog_and_schema_from_db_string;
|
||||
use common_error::ext::ErrorExt;
|
||||
use common_error::status_code::StatusCode;
|
||||
use common_grpc::flight::FlightDecoder;
|
||||
use common_grpc::flight::do_put::DoPutResponse;
|
||||
use common_query::Output;
|
||||
use common_runtime::Runtime;
|
||||
@@ -36,14 +37,15 @@ use futures_util::StreamExt;
|
||||
use session::context::{Channel, QueryContextBuilder, QueryContextRef};
|
||||
use session::hints::READ_PREFERENCE_HINT;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use table::TableRef;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::sync::mpsc::error::TrySendError;
|
||||
use tonic::Status;
|
||||
|
||||
use crate::error::{InvalidQuerySnafu, JoinTaskSnafu, Result, UnknownHintSnafu};
|
||||
use crate::grpc::flight::PutRecordBatchRequestStream;
|
||||
use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream};
|
||||
use crate::grpc::{FlightCompression, TonicResult, context_auth};
|
||||
use crate::metrics::{self, METRIC_SERVER_GRPC_DB_REQUEST_TIMER};
|
||||
use crate::metrics;
|
||||
use crate::metrics::METRIC_SERVER_GRPC_DB_REQUEST_TIMER;
|
||||
use crate::query_handler::grpc::ServerGrpcQueryHandlerRef;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -132,7 +134,7 @@ impl GreptimeRequestHandler {
|
||||
|
||||
pub(crate) async fn put_record_batches(
|
||||
&self,
|
||||
stream: PutRecordBatchRequestStream,
|
||||
mut stream: PutRecordBatchRequestStream,
|
||||
result_sender: mpsc::Sender<TonicResult<DoPutResponse>>,
|
||||
query_ctx: QueryContextRef,
|
||||
) {
|
||||
@@ -142,24 +144,37 @@ impl GreptimeRequestHandler {
|
||||
.clone()
|
||||
.unwrap_or_else(common_runtime::global_runtime);
|
||||
runtime.spawn(async move {
|
||||
let mut result_stream = handler.handle_put_record_batch_stream(stream, query_ctx);
|
||||
// Cached table ref
|
||||
let mut table_ref: Option<TableRef> = None;
|
||||
|
||||
while let Some(result) = result_stream.next().await {
|
||||
match &result {
|
||||
Ok(response) => {
|
||||
// Record the elapsed time metric from the response
|
||||
metrics::GRPC_BULK_INSERT_ELAPSED.observe(response.elapsed_secs());
|
||||
}
|
||||
let mut decoder = FlightDecoder::default();
|
||||
while let Some(request) = stream.next().await {
|
||||
let request = match request {
|
||||
Ok(request) => request,
|
||||
Err(e) => {
|
||||
error!(e; "Failed to handle flight record batches");
|
||||
let _ = result_sender.try_send(Err(e));
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
let PutRecordBatchRequest {
|
||||
table_name,
|
||||
request_id,
|
||||
data,
|
||||
_guard,
|
||||
} = request;
|
||||
|
||||
if let Err(e) =
|
||||
result_sender.try_send(result.map_err(|e| Status::from_error(Box::new(e))))
|
||||
&& let TrySendError::Closed(_) = e
|
||||
{
|
||||
warn!(r#""DoPut" client maybe unreachable, abort handling its message"#);
|
||||
let timer = metrics::GRPC_BULK_INSERT_ELAPSED.start_timer();
|
||||
let result = handler
|
||||
.put_record_batch(&table_name, &mut table_ref, &mut decoder, data, query_ctx.clone())
|
||||
.await
|
||||
.inspect_err(|e| error!(e; "Failed to handle flight record batches"));
|
||||
timer.observe_duration();
|
||||
let result = result
|
||||
.map(|x| DoPutResponse::new(request_id, x))
|
||||
.map_err(Into::into);
|
||||
if let Err(e)= result_sender.try_send(result)
|
||||
&& let TrySendError::Closed(_) = e {
|
||||
warn!(r#""DoPut" client with request_id {} maybe unreachable, abort handling its message"#, request_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -461,18 +461,8 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner {
|
||||
// we will not support other show statements for extended query protocol at least for now.
|
||||
// because the return columns is not predictable at this stage
|
||||
_ => {
|
||||
// test if query caught by fixture
|
||||
if let Some(mut resp) =
|
||||
fixtures::process(&sql_plan.query, self.session.new_query_context())
|
||||
&& let Response::Query(query_response) = resp.remove(0)
|
||||
{
|
||||
Ok(DescribePortalResponse::new(
|
||||
(*query_response.row_schema()).clone(),
|
||||
))
|
||||
} else {
|
||||
// fallback to NoData
|
||||
Ok(DescribePortalResponse::new(vec![]))
|
||||
}
|
||||
// fallback to NoData
|
||||
Ok(DescribePortalResponse::new(vec![]))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1260,204 +1260,6 @@ pub(super) fn parameters_to_scalar_values(
|
||||
ScalarValue::Null
|
||||
}
|
||||
}
|
||||
&Type::TIMESTAMP_ARRAY => {
|
||||
let data = portal.parameter::<Vec<NaiveDateTime>>(idx, &client_type)?;
|
||||
if let Some(data) = data {
|
||||
if let Some(ConcreteDataType::List(list_type)) = &server_type {
|
||||
match list_type.item_type() {
|
||||
ConcreteDataType::Timestamp(unit) => match *unit {
|
||||
TimestampType::Second(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampSecond(
|
||||
Some(ts.and_utc().timestamp()),
|
||||
None,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Second, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
TimestampType::Millisecond(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampMillisecond(
|
||||
Some(ts.and_utc().timestamp_millis()),
|
||||
None,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
TimestampType::Microsecond(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampMicrosecond(
|
||||
Some(ts.and_utc().timestamp_micros()),
|
||||
None,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Microsecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
TimestampType::Nanosecond(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.filter_map(|ts| {
|
||||
ts.and_utc().timestamp_nanos_opt().map(|nanos| {
|
||||
ScalarValue::TimestampNanosecond(Some(nanos), None)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
},
|
||||
_ => {
|
||||
return Err(invalid_parameter_error(
|
||||
"invalid_parameter_type",
|
||||
Some(format!(
|
||||
"Expected: {}, found: {}",
|
||||
list_type.item_type(),
|
||||
client_type
|
||||
)),
|
||||
));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Default to millisecond when no server type is specified
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampMillisecond(
|
||||
Some(ts.and_utc().timestamp_millis()),
|
||||
None,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
ScalarValue::Null
|
||||
}
|
||||
}
|
||||
&Type::TIMESTAMPTZ_ARRAY => {
|
||||
let data = portal.parameter::<Vec<DateTime<FixedOffset>>>(idx, &client_type)?;
|
||||
if let Some(data) = data {
|
||||
if let Some(ConcreteDataType::List(list_type)) = &server_type {
|
||||
match list_type.item_type() {
|
||||
ConcreteDataType::Timestamp(unit) => match *unit {
|
||||
TimestampType::Second(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampSecond(Some(ts.timestamp()), None)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Second, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
TimestampType::Millisecond(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampMillisecond(
|
||||
Some(ts.timestamp_millis()),
|
||||
None,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
TimestampType::Microsecond(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampMicrosecond(
|
||||
Some(ts.timestamp_micros()),
|
||||
None,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Microsecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
TimestampType::Nanosecond(_) => {
|
||||
let values = data
|
||||
.into_iter()
|
||||
.filter_map(|ts| {
|
||||
ts.timestamp_nanos_opt().map(|nanos| {
|
||||
ScalarValue::TimestampNanosecond(Some(nanos), None)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
},
|
||||
_ => {
|
||||
return Err(invalid_parameter_error(
|
||||
"invalid_parameter_type",
|
||||
Some(format!(
|
||||
"Expected: {}, found: {}",
|
||||
list_type.item_type(),
|
||||
client_type
|
||||
)),
|
||||
));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Default to millisecond when no server type is specified
|
||||
let values = data
|
||||
.into_iter()
|
||||
.map(|ts| {
|
||||
ScalarValue::TimestampMillisecond(Some(ts.timestamp_millis()), None)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
ScalarValue::List(ScalarValue::new_list(
|
||||
&values,
|
||||
&ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
|
||||
true,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
ScalarValue::Null
|
||||
}
|
||||
}
|
||||
_ => Err(invalid_parameter_error(
|
||||
"unsupported_parameter_value",
|
||||
Some(format!("Found type: {}", client_type)),
|
||||
|
||||
@@ -27,11 +27,11 @@ use common_grpc::precision::Precision;
|
||||
use common_query::prelude::{greptime_timestamp, greptime_value};
|
||||
use common_recordbatch::{RecordBatch, RecordBatches};
|
||||
use common_telemetry::tracing;
|
||||
use datafusion::dataframe::DataFrame;
|
||||
use datafusion::prelude::{Expr, col, lit, regexp_match};
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use openmetrics_parser::{MetricsExposition, PrometheusType, PrometheusValue};
|
||||
use query::dataframe::DataFrame;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use snap::raw::{Decoder, Encoder};
|
||||
|
||||
@@ -102,6 +102,8 @@ pub fn extract_schema_from_read_request(request: &ReadRequest) -> Option<String>
|
||||
/// Create a DataFrame from a remote Query
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub fn query_to_plan(dataframe: DataFrame, q: &Query) -> Result<LogicalPlan> {
|
||||
let DataFrame::DataFusion(dataframe) = dataframe;
|
||||
|
||||
let start_timestamp_ms = q.start_timestamp_ms;
|
||||
let end_timestamp_ms = q.end_timestamp_ms;
|
||||
|
||||
@@ -652,7 +654,7 @@ mod tests {
|
||||
let table_provider = Arc::new(DfTableProviderAdapter::new(table));
|
||||
|
||||
let dataframe = ctx.read_table(table_provider.clone()).unwrap();
|
||||
let plan = query_to_plan(dataframe, &q).unwrap();
|
||||
let plan = query_to_plan(DataFrame::DataFusion(dataframe), &q).unwrap();
|
||||
let display_string = format!("{}", plan.display_indent());
|
||||
|
||||
let ts_col = greptime_timestamp();
|
||||
@@ -686,7 +688,7 @@ mod tests {
|
||||
};
|
||||
|
||||
let dataframe = ctx.read_table(table_provider).unwrap();
|
||||
let plan = query_to_plan(dataframe, &q).unwrap();
|
||||
let plan = query_to_plan(DataFrame::DataFusion(dataframe), &q).unwrap();
|
||||
let display_string = format!("{}", plan.display_indent());
|
||||
|
||||
let ts_col = greptime_timestamp();
|
||||
|
||||
@@ -16,10 +16,10 @@ use catalog::system_schema::information_schema::tables::{
|
||||
ENGINE as TABLE_ENGINE, TABLE_CATALOG, TABLE_NAME, TABLE_SCHEMA,
|
||||
};
|
||||
use common_telemetry::tracing;
|
||||
use datafusion::dataframe::DataFrame;
|
||||
use datafusion::prelude::{Expr, col, lit, regexp_match};
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use promql_parser::label::{MatchOp, Matcher};
|
||||
use query::dataframe::DataFrame;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ResultExt;
|
||||
|
||||
@@ -71,6 +71,7 @@ pub fn metric_name_matchers_to_plan(
|
||||
// Safety: conditions MUST not be empty, reduce always return Some(expr).
|
||||
let conditions = conditions.into_iter().reduce(Expr::and).unwrap();
|
||||
|
||||
let DataFrame::DataFusion(dataframe) = dataframe;
|
||||
let dataframe = dataframe
|
||||
.filter(conditions)
|
||||
.context(error::DataFrameSnafu)?
|
||||
|
||||
@@ -12,22 +12,21 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::greptime_request::Request;
|
||||
use arrow_flight::FlightData;
|
||||
use async_trait::async_trait;
|
||||
use common_base::AffectedRows;
|
||||
use common_error::ext::{BoxedError, ErrorExt};
|
||||
use common_grpc::flight::do_put::DoPutResponse;
|
||||
use common_grpc::flight::FlightDecoder;
|
||||
use common_query::Output;
|
||||
use futures::Stream;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ResultExt;
|
||||
use table::TableRef;
|
||||
use table::table_name::TableName;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream};
|
||||
|
||||
pub type GrpcQueryHandlerRef<E> = Arc<dyn GrpcQueryHandler<Error = E> + Send + Sync>;
|
||||
pub type ServerGrpcQueryHandlerRef = GrpcQueryHandlerRef<error::Error>;
|
||||
@@ -46,16 +45,12 @@ pub trait GrpcQueryHandler {
|
||||
|
||||
async fn put_record_batch(
|
||||
&self,
|
||||
request: PutRecordBatchRequest,
|
||||
table_name: &TableName,
|
||||
table_ref: &mut Option<TableRef>,
|
||||
decoder: &mut FlightDecoder,
|
||||
flight_data: FlightData,
|
||||
ctx: QueryContextRef,
|
||||
) -> std::result::Result<AffectedRows, Self::Error>;
|
||||
|
||||
fn handle_put_record_batch_stream(
|
||||
&self,
|
||||
stream: PutRecordBatchRequestStream,
|
||||
ctx: QueryContextRef,
|
||||
) -> Pin<Box<dyn Stream<Item = std::result::Result<DoPutResponse, Self::Error>> + Send>>;
|
||||
}
|
||||
|
||||
pub struct ServerGrpcQueryHandlerAdapter<E>(GrpcQueryHandlerRef<E>);
|
||||
@@ -83,31 +78,16 @@ where
|
||||
|
||||
async fn put_record_batch(
|
||||
&self,
|
||||
request: PutRecordBatchRequest,
|
||||
table_name: &TableName,
|
||||
table_ref: &mut Option<TableRef>,
|
||||
decoder: &mut FlightDecoder,
|
||||
data: FlightData,
|
||||
ctx: QueryContextRef,
|
||||
) -> Result<AffectedRows> {
|
||||
self.0
|
||||
.put_record_batch(request, table_ref, ctx)
|
||||
.put_record_batch(table_name, table_ref, decoder, data, ctx)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(error::ExecuteGrpcRequestSnafu)
|
||||
}
|
||||
|
||||
fn handle_put_record_batch_stream(
|
||||
&self,
|
||||
stream: PutRecordBatchRequestStream,
|
||||
ctx: QueryContextRef,
|
||||
) -> Pin<Box<dyn Stream<Item = Result<DoPutResponse>> + Send>> {
|
||||
use futures_util::StreamExt;
|
||||
Box::pin(
|
||||
self.0
|
||||
.handle_put_record_batch_stream(stream, ctx)
|
||||
.map(|result| {
|
||||
result
|
||||
.map_err(|e| BoxedError::new(e))
|
||||
.context(error::ExecuteGrpcRequestSnafu)
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,11 +16,12 @@ use std::sync::Arc;
|
||||
|
||||
use api::v1::greptime_request::Request;
|
||||
use api::v1::query_request::Query;
|
||||
use arrow_flight::FlightData;
|
||||
use async_trait::async_trait;
|
||||
use catalog::memory::MemoryCatalogManager;
|
||||
use common_base::AffectedRows;
|
||||
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
|
||||
use common_grpc::flight::do_put::DoPutResponse;
|
||||
use common_grpc::flight::FlightDecoder;
|
||||
use common_query::Output;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use query::options::QueryOptions;
|
||||
@@ -34,6 +35,7 @@ use session::context::QueryContextRef;
|
||||
use snafu::ensure;
|
||||
use sql::statements::statement::Statement;
|
||||
use table::TableRef;
|
||||
use table::table_name::TableName;
|
||||
|
||||
mod http;
|
||||
mod interceptor;
|
||||
@@ -163,22 +165,14 @@ impl GrpcQueryHandler for DummyInstance {
|
||||
|
||||
async fn put_record_batch(
|
||||
&self,
|
||||
_request: servers::grpc::flight::PutRecordBatchRequest,
|
||||
_table_name: &TableName,
|
||||
_table_ref: &mut Option<TableRef>,
|
||||
_decoder: &mut FlightDecoder,
|
||||
_data: FlightData,
|
||||
_ctx: QueryContextRef,
|
||||
) -> std::result::Result<AffectedRows, Self::Error> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn handle_put_record_batch_stream(
|
||||
&self,
|
||||
_stream: servers::grpc::flight::PutRecordBatchRequestStream,
|
||||
_ctx: QueryContextRef,
|
||||
) -> std::pin::Pin<
|
||||
Box<dyn futures::Stream<Item = std::result::Result<DoPutResponse, Self::Error>> + Send>,
|
||||
> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
fn create_testing_instance(table: TableRef) -> DummyInstance {
|
||||
|
||||
@@ -4327,7 +4327,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
|
||||
.await;
|
||||
|
||||
// select metrics data
|
||||
let expected = "[[1753780559836,2.244618,\"arm64\",\"claude-code\",\"claude-sonnet-4-20250514\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,0.0052544,\"arm64\",\"claude-code\",\"claude-3-5-haiku-20241022\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]";
|
||||
let expected = "[[1753780559836,0.0052544,\"arm64\",\"claude-code\",\"claude-3-5-haiku-20241022\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,2.244618,\"arm64\",\"claude-code\",\"claude-sonnet-4-20250514\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]";
|
||||
validate_data(
|
||||
"otlp_metrics_all_select",
|
||||
&client,
|
||||
@@ -4399,7 +4399,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
|
||||
.await;
|
||||
|
||||
// select metrics data
|
||||
let expected = "[[1753780559836,0.0052544,\"claude-code\",\"claude-3-5-haiku-20241022\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,2.244618,\"claude-code\",\"claude-sonnet-4-20250514\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]";
|
||||
let expected = "[[1753780559836,2.244618,\"claude-code\",\"claude-sonnet-4-20250514\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,0.0052544,\"claude-code\",\"claude-3-5-haiku-20241022\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]";
|
||||
validate_data(
|
||||
"otlp_metrics_select",
|
||||
&client,
|
||||
|
||||
@@ -363,7 +363,7 @@ pub async fn test_metric_table_region_migration_by_sql(
|
||||
let result = cluster
|
||||
.frontend
|
||||
.instance
|
||||
.do_query("select * from t1 order by host desc", query_ctx.clone())
|
||||
.do_query("select * from t1", query_ctx.clone())
|
||||
.await
|
||||
.remove(0);
|
||||
|
||||
@@ -379,7 +379,7 @@ pub async fn test_metric_table_region_migration_by_sql(
|
||||
let result = cluster
|
||||
.frontend
|
||||
.instance
|
||||
.do_query("select * from t2 order by job desc", query_ctx)
|
||||
.do_query("select * from t2", query_ctx)
|
||||
.await
|
||||
.remove(0);
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user