Mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2026-01-10 07:12:54 +00:00)

Compare commits: flow_rule_...feat/sst-c
39 commits (SHA1):

- d5760a7348
- bc9614e22c
- 7dd9e98ff6
- fb6b7f7801
- 87d7c316df
- c80a73bc20
- dd9d13e7df
- 79d249f5fa
- 63bc544514
- 30c29539a3
- 359da62d9e
- c9f4b36360
- 85c346b16a
- 738c23beb0
- 8aadd1e59a
- cbd58291da
- e522e8959b
- 7183a93e5a
- 8c538622e2
- 142dacb2c8
- 371afc458f
- 0751cd74c0
- ec34e8739a
- b650743785
- 80a8b2e1bd
- ec8a15cadd
- f929d751a5
- fad3621a7a
- 87723effc7
- 62a333ad09
- 6ad186a13e
- 77dee84a75
- a57e263e5a
- 8796ddaf31
- 7fa3fbdfef
- 457d2a620c
- 9f14edbb28
- cb3fad0c2d
- 2d1e7c2441
.github/workflows/grafana.yml (vendored): file deleted, 52 lines

@@ -1,52 +0,0 @@
-name: Check Grafana Panels
-
-on:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'grafana/**' # Trigger only when files under the grafana/ directory change
-
-jobs:
-  check-panels:
-    runs-on: ubuntu-latest
-
-    steps:
-      # Check out the repository
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      # Install jq (required for the script)
-      - name: Install jq
-        run: sudo apt-get install -y jq
-
-      # Make the check.sh script executable
-      - name: Make check.sh executable
-        run: chmod +x grafana/check.sh
-
-      # Run the check.sh script
-      - name: Run check.sh
-        run: ./grafana/check.sh
-
-      # Only run summary.sh for pull_request events (not for merge queues or final pushes)
-      - name: Check if this is a pull request
-        id: check-pr
-        run: |
-          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
-            echo "is_pull_request=true" >> $GITHUB_OUTPUT
-          else
-            echo "is_pull_request=false" >> $GITHUB_OUTPUT
-          fi
-
-      # Make the summary.sh script executable
-      - name: Make summary.sh executable
-        if: steps.check-pr.outputs.is_pull_request == 'true'
-        run: chmod +x grafana/summary.sh
-
-      # Run the summary.sh script and add its output to the GitHub Job Summary
-      - name: Run summary.sh and add to Job Summary
-        if: steps.check-pr.outputs.is_pull_request == 'true'
-        run: |
-          SUMMARY=$(./grafana/summary.sh)
-          echo "### Summary of Grafana Panels" >> $GITHUB_STEP_SUMMARY
-          echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY
Cargo.lock (generated): 144 changed lines
@@ -1594,7 +1594,7 @@ dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"strsim 0.8.0",
|
||||
"textwrap 0.11.0",
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
"vec_map",
|
||||
]
|
||||
|
||||
@@ -1876,7 +1876,7 @@ checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
|
||||
dependencies = [
|
||||
"strum 0.26.3",
|
||||
"strum_macros 0.26.4",
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2469,6 +2469,7 @@ dependencies = [
|
||||
"encode_unicode",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"unicode-width 0.1.14",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
@@ -4167,7 +4168,6 @@ dependencies = [
|
||||
"bytes",
|
||||
"cache",
|
||||
"catalog",
|
||||
"chrono",
|
||||
"client",
|
||||
"common-base",
|
||||
"common-catalog",
|
||||
@@ -4646,7 +4646,7 @@ version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5567,7 +5567,6 @@ dependencies = [
|
||||
"rand",
|
||||
"regex",
|
||||
"regex-automata 0.4.8",
|
||||
"roaring",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"snafu 0.8.5",
|
||||
@@ -5601,6 +5600,19 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indicatif"
|
||||
version = "0.17.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
|
||||
dependencies = [
|
||||
"console",
|
||||
"number_prefix",
|
||||
"portable-atomic",
|
||||
"unicode-width 0.2.0",
|
||||
"web-time 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inferno"
|
||||
version = "0.11.21"
|
||||
@@ -5630,6 +5642,25 @@ dependencies = [
|
||||
"snafu 0.7.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ingester"
|
||||
version = "0.13.0"
|
||||
dependencies = [
|
||||
"clap 4.5.19",
|
||||
"common-telemetry",
|
||||
"common-time",
|
||||
"datanode",
|
||||
"meta-client",
|
||||
"mito2",
|
||||
"object-store",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sst-convert",
|
||||
"tokio",
|
||||
"toml 0.8.19",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify"
|
||||
version = "0.9.6"
|
||||
@@ -5899,15 +5930,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "jsonpath-rust"
|
||||
version = "0.7.5"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c00ae348f9f8fd2d09f82a98ca381c60df9e0820d8d79fce43e649b4dc3128b"
|
||||
checksum = "69a61b87f6a55cc6c28fed5739dd36b9642321ce63e4a5e4a4715d69106f4a10"
|
||||
dependencies = [
|
||||
"pest",
|
||||
"pest_derive",
|
||||
"regex",
|
||||
"serde_json",
|
||||
"thiserror 2.0.12",
|
||||
"thiserror 1.0.64",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7519,6 +7550,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "number_prefix"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||
|
||||
[[package]]
|
||||
name = "objc"
|
||||
version = "0.2.7"
|
||||
@@ -7975,7 +8012,7 @@ version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d2ad9b889f1b12e0b9ee24db044b5129150d5eada288edc800f789928dc8c0e3"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -8071,6 +8108,19 @@ dependencies = [
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parquet_opendal"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4140ae96f37c170f8d684a544711fabdac1d94adcbd97e8b033329bd37f40446"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"opendal",
|
||||
"parquet",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parse-zoneinfo"
|
||||
version = "0.3.1"
|
||||
@@ -8272,7 +8322,7 @@ dependencies = [
|
||||
"rand",
|
||||
"ring",
|
||||
"rust_decimal",
|
||||
"thiserror 2.0.12",
|
||||
"thiserror 2.0.6",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-util",
|
||||
@@ -8384,7 +8434,7 @@ dependencies = [
|
||||
"greptime-proto",
|
||||
"itertools 0.10.5",
|
||||
"jsonb",
|
||||
"jsonpath-rust 0.7.5",
|
||||
"jsonpath-rust 0.7.3",
|
||||
"lazy_static",
|
||||
"moka",
|
||||
"once_cell",
|
||||
@@ -8762,7 +8812,6 @@ dependencies = [
|
||||
"common-recordbatch",
|
||||
"common-telemetry",
|
||||
"datafusion",
|
||||
"datafusion-common",
|
||||
"datafusion-expr",
|
||||
"datatypes",
|
||||
"futures",
|
||||
@@ -8776,9 +8825,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "promql-parser"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c6b1429bdd199d53bd58b745075c1652efedbe2746e5d4f0d56d3184dda48ec"
|
||||
version = "0.4.3"
|
||||
source = "git+https://github.com/GreptimeTeam/promql-parser.git?rev=27abb8e16003a50c720f00d6c85f41f5fa2a2a8e#27abb8e16003a50c720f00d6c85f41f5fa2a2a8e"
|
||||
dependencies = [
|
||||
"cfgrammar",
|
||||
"chrono",
|
||||
@@ -9636,16 +9684,6 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "roaring"
|
||||
version = "0.10.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "robust"
|
||||
version = "1.1.0"
|
||||
@@ -10070,7 +10108,7 @@ dependencies = [
|
||||
"radix_trie",
|
||||
"scopeguard",
|
||||
"unicode-segmentation",
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
"utf8parse",
|
||||
"winapi",
|
||||
]
|
||||
@@ -11065,7 +11103,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"thiserror 2.0.12",
|
||||
"thiserror 2.0.6",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
@@ -11150,7 +11188,7 @@ dependencies = [
|
||||
"smallvec",
|
||||
"sqlx-core",
|
||||
"stringprep",
|
||||
"thiserror 2.0.12",
|
||||
"thiserror 2.0.6",
|
||||
"tracing",
|
||||
"whoami",
|
||||
]
|
||||
@@ -11188,7 +11226,7 @@ dependencies = [
|
||||
"smallvec",
|
||||
"sqlx-core",
|
||||
"stringprep",
|
||||
"thiserror 2.0.12",
|
||||
"thiserror 2.0.6",
|
||||
"tracing",
|
||||
"whoami",
|
||||
]
|
||||
@@ -11217,6 +11255,36 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sst-convert"
|
||||
version = "0.13.0"
|
||||
dependencies = [
|
||||
"api",
|
||||
"arrow-array",
|
||||
"async-trait",
|
||||
"catalog",
|
||||
"common-error",
|
||||
"common-macro",
|
||||
"common-meta",
|
||||
"common-recordbatch",
|
||||
"common-telemetry",
|
||||
"datanode",
|
||||
"datatypes",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"indicatif",
|
||||
"meta-client",
|
||||
"metric-engine",
|
||||
"mito2",
|
||||
"object-store",
|
||||
"parquet",
|
||||
"parquet_opendal",
|
||||
"prost 0.13.3",
|
||||
"snafu 0.8.5",
|
||||
"store-api",
|
||||
"table",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
@@ -11949,7 +12017,7 @@ version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
"unicode-width 0.1.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -11969,11 +12037,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.12"
|
||||
version = "2.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
|
||||
checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47"
|
||||
dependencies = [
|
||||
"thiserror-impl 2.0.12",
|
||||
"thiserror-impl 2.0.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -11989,9 +12057,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.12"
|
||||
version = "2.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
|
||||
checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -13052,6 +13120,12 @@ version = "0.1.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.6"
|
||||
|
||||
@@ -41,6 +41,7 @@ members = [
|
||||
"src/flow",
|
||||
"src/frontend",
|
||||
"src/index",
|
||||
"src/ingester",
|
||||
"src/log-query",
|
||||
"src/log-store",
|
||||
"src/meta-client",
|
||||
@@ -58,6 +59,7 @@ members = [
|
||||
"src/servers",
|
||||
"src/session",
|
||||
"src/sql",
|
||||
"src/sst-convert",
|
||||
"src/store-api",
|
||||
"src/table",
|
||||
"tests-fuzz",
|
||||
@@ -160,7 +162,9 @@ parquet = { version = "53.0.0", default-features = false, features = ["arrow", "
|
||||
paste = "1.0"
|
||||
pin-project = "1.0"
|
||||
prometheus = { version = "0.13.3", features = ["process"] }
|
||||
promql-parser = { version = "0.5", features = ["ser"] }
|
||||
promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", features = [
|
||||
"ser",
|
||||
], rev = "27abb8e16003a50c720f00d6c85f41f5fa2a2a8e" }
|
||||
prost = "0.13"
|
||||
raft-engine = { version = "0.4.1", default-features = false }
|
||||
rand = "0.8"
|
||||
@@ -269,6 +273,7 @@ query = { path = "src/query" }
|
||||
servers = { path = "src/servers" }
|
||||
session = { path = "src/session" }
|
||||
sql = { path = "src/sql" }
|
||||
sst-convert = { path = "src/sst-convert" }
|
||||
store-api = { path = "src/store-api" }
|
||||
substrait = { path = "src/common/substrait" }
|
||||
table = { path = "src/table" }
|
||||
|
||||
chore.md (new file)

@@ -0,0 +1,76 @@
+# Log
+
+## First, create the table
+
+```bash
+mysql --host=127.0.0.1 --port=19195 --database=public
+```
+
+```sql
+CREATE DATABASE IF NOT EXISTS `cluster1`;
+USE `cluster1`;
+CREATE TABLE IF NOT EXISTS `app1` (
+    `greptime_timestamp` TimestampNanosecond NOT NULL TIME INDEX,
+    `app` STRING NULL INVERTED INDEX,
+    `cluster` STRING NULL INVERTED INDEX,
+    `message` STRING NULL,
+    `region` STRING NULL,
+    `cloud-provider` STRING NULL,
+    `environment` STRING NULL,
+    `product` STRING NULL,
+    `sub-product` STRING NULL,
+    `service` STRING NULL
+) WITH (
+    append_mode = 'true',
+    'compaction.type' = 'twcs',
+    'compaction.twcs.max_output_file_size' = '500MB',
+    'compaction.twcs.max_active_window_files' = '16',
+    'compaction.twcs.max_active_window_runs' = '4',
+    'compaction.twcs.max_inactive_window_files' = '4',
+    'compaction.twcs.max_inactive_window_runs' = '2'
+);
+
+select count(*) from app1;
+
+SELECT * FROM app1 ORDER BY greptime_timestamp DESC LIMIT 10\G
+```
+
+## Then ingest
+
+```bash
+RUST_LOG="debug" cargo run --bin=ingester -- --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --parquet-dir="parquet_store/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
+```
+
+# Metrics
+
+```bash
+mysql --host=127.0.0.1 --port=19195 --database=public < public.greptime_physical_table-create-tables.sql
+```
+
+## Then ingest
+
+```bash
+RUST_LOG="debug" cargo run --bin=ingester -- --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --remote-write-dir="metrics_parquet/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
+
+# To profile it:
+cargo build --release --bin=ingester
+samply record target/release/ingester --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --remote-write-dir="metrics_parquet/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
+```
+
+## Check data
+
+```sql
+select count(*) from greptime_physical_table;
++----------+
+| count(*) |
++----------+
+|    36200 |
++----------+
+1 row in set (0.06 sec)
+
+select count(*) from storage_operation_errors_total;
++----------+
+| count(*) |
++----------+
+|       10 |
++----------+
+1 row in set (0.03 sec)
+```
+
+# With OSS
+
+Same as above; the only difference is the storage config in `ingester.toml`.
Deleted script (19 lines; from the workflow above this is presumably grafana/check.sh):

@@ -1,19 +0,0 @@
-#!/usr/bin/env bash
-
-BASEDIR=$(dirname "$0")
-
-# Use jq to check for panels with empty or missing descriptions
-invalid_panels=$(cat $BASEDIR/greptimedb-cluster.json | jq -r '
-  .panels[]
-  | select((.type == "stats" or .type == "timeseries") and (.description == "" or .description == null))
-')
-
-# Check if any invalid panels were found
-if [[ -n "$invalid_panels" ]]; then
-  echo "Error: The following panels have empty or missing descriptions:"
-  echo "$invalid_panels"
-  exit 1
-else
-  echo "All panels with type 'stats' or 'timeseries' have valid descriptions."
-  exit 0
-fi

(File diff suppressed because it is too large.)
@@ -1,11 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
BASEDIR=$(dirname "$0")
|
||||
echo '| Title | Description | Expressions |
|
||||
|---|---|---|'
|
||||
|
||||
cat $BASEDIR/greptimedb-cluster.json | jq -r '
|
||||
.panels |
|
||||
map(select(.type == "stat" or .type == "timeseries")) |
|
||||
.[] | "| \(.title) | \(.description | gsub("\n"; "<br>")) | \(.targets | map(.expr // .rawSql | "`\(.|gsub("\n"; "<br>"))`") | join("<br>")) |"
|
||||
'
|
||||
ingester.toml (new file)

@@ -0,0 +1,35 @@
+## The metasrv client options.
+[meta_client]
+## The addresses of the metasrv.
+metasrv_addrs = ["127.0.0.1:3002", "127.0.0.1:3003"]
+
+## Operation timeout.
+timeout = "3s"
+
+## Heartbeat timeout.
+heartbeat_timeout = "500ms"
+
+## DDL timeout.
+ddl_timeout = "10s"
+
+## Connect server timeout.
+connect_timeout = "1s"
+
+## `TCP_NODELAY` option for accepted connections.
+tcp_nodelay = true
+
+## The configuration about the cache of the metadata.
+metadata_cache_max_capacity = 100000
+
+## TTL of the metadata cache.
+metadata_cache_ttl = "10m"
+
+## TTI of the metadata cache.
+metadata_cache_tti = "5m"
+
+## The data storage options.
+[storage]
+## The working home directory.
+data_home = "/tmp/greptimedb-cluster/datanode0"
+type = "File"
+[mito]

@@ -16,6 +16,7 @@
|
||||
|
||||
mod client;
|
||||
pub mod client_manager;
|
||||
#[cfg(feature = "testing")]
|
||||
mod database;
|
||||
pub mod error;
|
||||
pub mod flow;
|
||||
@@ -33,6 +34,7 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
|
||||
use snafu::OptionExt;
|
||||
|
||||
pub use self::client::Client;
|
||||
#[cfg(feature = "testing")]
|
||||
pub use self::database::Database;
|
||||
pub use self::error::{Error, Result};
|
||||
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};
|
||||
|
||||
@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
|
||||
use common_telemetry::info;
|
||||
use common_telemetry::logging::TracingOptions;
|
||||
use common_version::{short_version, version};
|
||||
use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
|
||||
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
|
||||
use meta_client::{MetaClientOptions, MetaClientType};
|
||||
use servers::Mode;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
@@ -311,8 +311,6 @@ impl StartCommand {
|
||||
Arc::new(executor),
|
||||
);
|
||||
|
||||
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
|
||||
|
||||
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
|
||||
let flownode_builder = FlownodeBuilder::new(
|
||||
opts,
|
||||
@@ -320,7 +318,6 @@ impl StartCommand {
|
||||
table_metadata_manager,
|
||||
catalog_manager.clone(),
|
||||
flow_metadata_manager,
|
||||
Arc::new(frontend_client),
|
||||
)
|
||||
.with_heartbeat_task(heartbeat_task);
|
||||
|
||||
|
||||
@@ -54,10 +54,7 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
|
||||
use datanode::datanode::{Datanode, DatanodeBuilder};
|
||||
use datanode::region_server::RegionServer;
|
||||
use file_engine::config::EngineConfig as FileEngineConfig;
|
||||
use flow::{
|
||||
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
|
||||
FrontendInvoker,
|
||||
};
|
||||
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
|
||||
use frontend::frontend::FrontendOptions;
|
||||
use frontend::instance::builder::FrontendBuilder;
|
||||
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
|
||||
@@ -536,16 +533,12 @@ impl StartCommand {
|
||||
flow: opts.flow.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
|
||||
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
|
||||
let flow_builder = FlownodeBuilder::new(
|
||||
flownode_options,
|
||||
plugins.clone(),
|
||||
table_metadata_manager.clone(),
|
||||
catalog_manager.clone(),
|
||||
flow_metadata_manager.clone(),
|
||||
Arc::new(frontend_client),
|
||||
);
|
||||
let flownode = Arc::new(
|
||||
flow_builder
|
||||
|
||||
@@ -445,20 +445,10 @@ impl Pool {
|
||||
|
||||
async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
|
||||
// use weak ref here to prevent pool being leaked
|
||||
let pool_weak = {
|
||||
let weak = Arc::downgrade(&pool);
|
||||
drop(pool);
|
||||
weak
|
||||
};
|
||||
|
||||
loop {
|
||||
let _ = interval.tick().await;
|
||||
if let Some(pool) = pool_weak.upgrade() {
|
||||
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
|
||||
} else {
|
||||
// no one is using this pool, so we can also let go
|
||||
break;
|
||||
}
|
||||
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
|
||||
}
|
||||
}
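
The hunk above reworks `recycle_channel_in_loop`, whose comment explains the idea of holding the pool through a `Weak` reference so the background recycle task alone cannot keep it alive. A minimal standalone sketch of that pattern, with `std::thread` standing in for the tokio interval and a made-up `Pool` type:

```rust
use std::sync::{Arc, Weak};
use std::thread;
use std::time::Duration;

struct Pool {
    name: String,
}

impl Pool {
    fn recycle(&self) {
        println!("recycling idle channels of {}", self.name);
    }
}

/// Spawn a maintenance loop that only holds a `Weak` handle, so dropping the
/// last strong `Arc` elsewhere frees the pool and ends the loop.
fn spawn_recycle_loop(pool: &Arc<Pool>, interval: Duration) -> thread::JoinHandle<()> {
    let weak: Weak<Pool> = Arc::downgrade(pool);
    thread::spawn(move || loop {
        thread::sleep(interval);
        match weak.upgrade() {
            // Pool is still alive somewhere: do the periodic cleanup.
            Some(pool) => pool.recycle(),
            // Every strong reference is gone: stop instead of keeping the pool alive forever.
            None => break,
        }
    })
}

fn main() {
    let pool = Arc::new(Pool { name: "grpc".to_string() });
    let recycler = spawn_recycle_loop(&pool, Duration::from_millis(50));
    thread::sleep(Duration::from_millis(120));
    drop(pool); // the next upgrade() fails and the loop exits
    recycler.join().unwrap();
}
```

Dropping the last strong `Arc` makes `upgrade()` return `None`, which is what lets the loop exit instead of leaking the pool.
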
|
||||
|
||||
|
||||
@@ -337,7 +337,6 @@ pub enum FlowType {
|
||||
impl FlowType {
|
||||
pub const RECORDING_RULE: &str = "recording_rule";
|
||||
pub const STREAMING: &str = "streaming";
|
||||
pub const FLOW_TYPE_KEY: &str = "flow_type";
|
||||
}
|
||||
|
||||
impl Default for FlowType {
|
||||
@@ -392,8 +391,7 @@ impl From<&CreateFlowData> for CreateRequest {
|
||||
};
|
||||
|
||||
let flow_type = value.flow_type.unwrap_or_default().to_string();
|
||||
req.flow_options
|
||||
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
|
||||
req.flow_options.insert("flow_type".to_string(), flow_type);
|
||||
req
|
||||
}
|
||||
}
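
This hunk switches the key inserted into the flow options map between the string literal `"flow_type"` and the named constant `FlowType::FLOW_TYPE_KEY`. The idiom at stake is using one associated constant so the writer and every reader of the option share a single definition; a tiny standalone sketch with simplified stand-in types:

```rust
use std::collections::HashMap;

#[allow(dead_code)]
enum FlowType {
    RecordingRule,
    Streaming,
}

impl FlowType {
    const FLOW_TYPE_KEY: &'static str = "flow_type";
    const RECORDING_RULE: &'static str = "recording_rule";
    const STREAMING: &'static str = "streaming";

    fn as_str(&self) -> &'static str {
        match self {
            FlowType::RecordingRule => Self::RECORDING_RULE,
            FlowType::Streaming => Self::STREAMING,
        }
    }
}

fn main() {
    let mut options: HashMap<String, String> = HashMap::new();
    // Writer and reader refer to the same constant instead of repeating a literal key.
    options.insert(
        FlowType::FLOW_TYPE_KEY.to_string(),
        FlowType::RecordingRule.as_str().to_string(),
    );
    assert_eq!(
        options.get(FlowType::FLOW_TYPE_KEY).map(String::as_str),
        Some("recording_rule")
    );
}
```
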
|
||||
@@ -425,7 +423,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let flow_type = value.flow_type.unwrap_or_default().to_string();
|
||||
options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
|
||||
options.insert("flow_type".to_string(), flow_type);
|
||||
|
||||
let flow_info = FlowInfoValue {
|
||||
source_table_ids: value.source_table_ids.clone(),
|
||||
|
||||
@@ -25,6 +25,6 @@ pub mod heartbeat;
|
||||
pub mod metrics;
|
||||
pub mod region_server;
|
||||
pub mod service;
|
||||
mod store;
|
||||
pub mod store;
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
pub mod tests;
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
//! object storage utilities
|
||||
|
||||
mod azblob;
|
||||
mod fs;
|
||||
pub mod fs;
|
||||
mod gcs;
|
||||
mod oss;
|
||||
mod s3;
|
||||
|
||||
@@ -24,7 +24,8 @@ use crate::config::FileConfig;
|
||||
use crate::error::{self, Result};
|
||||
use crate::store;
|
||||
|
||||
pub(crate) async fn new_fs_object_store(
|
||||
/// A helper function to create a file system object store.
|
||||
pub async fn new_fs_object_store(
|
||||
data_home: &str,
|
||||
_file_config: &FileConfig,
|
||||
) -> Result<ObjectStore> {
|
||||
|
||||
@@ -16,7 +16,6 @@ async-trait.workspace = true
|
||||
bytes.workspace = true
|
||||
cache.workspace = true
|
||||
catalog.workspace = true
|
||||
chrono.workspace = true
|
||||
client.workspace = true
|
||||
common-base.workspace = true
|
||||
common-config.workspace = true
|
||||
|
||||
@@ -49,13 +49,12 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
|
||||
use crate::adapter::refill::RefillTask;
|
||||
use crate::adapter::table_source::ManagedTableSource;
|
||||
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
|
||||
pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
|
||||
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
|
||||
use crate::compute::ErrCollector;
|
||||
use crate::df_optimizer::sql_to_flow_plan;
|
||||
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
|
||||
use crate::expr::Batch;
|
||||
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
|
||||
use crate::recording_rules::RecordingRuleEngine;
|
||||
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
|
||||
|
||||
mod flownode_impl;
|
||||
@@ -64,7 +63,7 @@ pub(crate) mod refill;
|
||||
mod stat;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
pub(crate) mod util;
|
||||
mod util;
|
||||
mod worker;
|
||||
|
||||
pub(crate) mod node_context;
|
||||
@@ -170,8 +169,6 @@ pub struct FlowWorkerManager {
|
||||
flush_lock: RwLock<()>,
|
||||
/// receive a oneshot sender to send state size report
|
||||
state_report_handler: RwLock<Option<StateReportHandler>>,
|
||||
/// engine for recording rule
|
||||
rule_engine: RecordingRuleEngine,
|
||||
}
|
||||
|
||||
/// Building FlownodeManager
|
||||
@@ -186,7 +183,6 @@ impl FlowWorkerManager {
|
||||
node_id: Option<u32>,
|
||||
query_engine: Arc<dyn QueryEngine>,
|
||||
table_meta: TableMetadataManagerRef,
|
||||
rule_engine: RecordingRuleEngine,
|
||||
) -> Self {
|
||||
let srv_map = ManagedTableSource::new(
|
||||
table_meta.table_info_manager().clone(),
|
||||
@@ -209,7 +205,6 @@ impl FlowWorkerManager {
|
||||
node_id,
|
||||
flush_lock: RwLock::new(()),
|
||||
state_report_handler: RwLock::new(None),
|
||||
rule_engine,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,6 +213,25 @@ impl FlowWorkerManager {
|
||||
self
|
||||
}
|
||||
|
||||
/// Create a flownode manager with one worker
|
||||
pub fn new_with_workers<'s>(
|
||||
node_id: Option<u32>,
|
||||
query_engine: Arc<dyn QueryEngine>,
|
||||
table_meta: TableMetadataManagerRef,
|
||||
num_workers: usize,
|
||||
) -> (Self, Vec<Worker<'s>>) {
|
||||
let mut zelf = Self::new(node_id, query_engine, table_meta);
|
||||
|
||||
let workers: Vec<_> = (0..num_workers)
|
||||
.map(|_| {
|
||||
let (handle, worker) = create_worker();
|
||||
zelf.add_worker_handle(handle);
|
||||
worker
|
||||
})
|
||||
.collect();
|
||||
(zelf, workers)
|
||||
}
|
||||
|
||||
/// add a worker handler to manager, meaning this corresponding worker is under it's manage
|
||||
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
|
||||
self.worker_handles.push(handle);
|
||||
@@ -735,11 +749,7 @@ pub struct CreateFlowArgs {
|
||||
/// Create&Remove flow
|
||||
impl FlowWorkerManager {
|
||||
/// remove a flow by it's id
|
||||
#[allow(unreachable_code)]
|
||||
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
|
||||
// TODO(discord9): reroute some back to streaming engine later
|
||||
return self.rule_engine.remove_flow(flow_id).await;
|
||||
|
||||
for handle in self.worker_handles.iter() {
|
||||
if handle.contains_flow(flow_id).await? {
|
||||
handle.remove_flow(flow_id).await?;
|
||||
@@ -755,10 +765,8 @@ impl FlowWorkerManager {
|
||||
/// steps to create task:
|
||||
/// 1. parse query into typed plan(and optional parse expire_after expr)
|
||||
/// 2. render source/sink with output table id and used input table id
|
||||
#[allow(clippy::too_many_arguments, unreachable_code)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
|
||||
// TODO(discord9): reroute some back to streaming engine later
|
||||
return self.rule_engine.create_flow(args).await;
|
||||
let CreateFlowArgs {
|
||||
flow_id,
|
||||
sink_table_name,
|
||||
|
||||
@@ -153,13 +153,7 @@ impl Flownode for FlowWorkerManager {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unreachable_code, unused)]
|
||||
async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
|
||||
return self
|
||||
.rule_engine
|
||||
.handle_inserts(request)
|
||||
.await
|
||||
.map_err(to_meta_err(snafu::location!()));
|
||||
// using try_read to ensure two things:
|
||||
// 1. flush wouldn't happen until inserts before it is inserted
|
||||
// 2. inserts happening concurrently with flush wouldn't be block by flush
|
||||
@@ -212,15 +206,15 @@ impl Flownode for FlowWorkerManager {
|
||||
.collect_vec();
|
||||
let table_col_names = table_schema.relation_desc.names;
|
||||
let table_col_names = table_col_names
|
||||
.iter().enumerate()
|
||||
.map(|(idx,name)| match name {
|
||||
Some(name) => Ok(name.clone()),
|
||||
None => InternalSnafu {
|
||||
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
|
||||
}
|
||||
.fail().map_err(BoxedError::new).context(ExternalSnafu),
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
.iter().enumerate()
|
||||
.map(|(idx,name)| match name {
|
||||
Some(name) => Ok(name.clone()),
|
||||
None => InternalSnafu {
|
||||
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
|
||||
}
|
||||
.fail().map_err(BoxedError::new).context(ExternalSnafu),
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
let name_to_col = HashMap::<_, _>::from_iter(
|
||||
insert_schema
|
||||
.iter()
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Some utility functions
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::helper::ColumnDataTypeWrapper;
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
use std::any::Any;
|
||||
|
||||
use arrow_schema::ArrowError;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
|
||||
use common_macro::stack_trace_debug;
|
||||
@@ -54,13 +53,6 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Time error"))]
|
||||
Time {
|
||||
source: common_time::error::Error,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("External error"))]
|
||||
External {
|
||||
source: BoxedError,
|
||||
@@ -164,15 +156,6 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Arrow error: {raw:?} in context: {context}"))]
|
||||
Arrow {
|
||||
#[snafu(source)]
|
||||
raw: ArrowError,
|
||||
context: String,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
|
||||
Datafusion {
|
||||
#[snafu(source)]
|
||||
@@ -247,7 +230,6 @@ impl ErrorExt for Error {
|
||||
match self {
|
||||
Self::Eval { .. }
|
||||
| Self::JoinTask { .. }
|
||||
| Self::Arrow { .. }
|
||||
| Self::Datafusion { .. }
|
||||
| Self::InsertIntoFlow { .. } => StatusCode::Internal,
|
||||
Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
|
||||
@@ -256,9 +238,7 @@ impl ErrorExt for Error {
|
||||
| Self::FlowNotFound { .. }
|
||||
| Self::ListFlows { .. } => StatusCode::TableNotFound,
|
||||
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
|
||||
Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
|
||||
StatusCode::EngineExecuteQuery
|
||||
}
|
||||
Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
|
||||
Self::Unexpected { .. } => StatusCode::Unexpected,
|
||||
Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
|
||||
StatusCode::Unsupported
|
||||
|
||||
@@ -238,7 +238,6 @@ mod test {
|
||||
|
||||
for (sql, current, expected) in &testcases {
|
||||
let plan = sql_to_substrait(engine.clone(), sql).await;
|
||||
|
||||
let mut ctx = create_test_ctx();
|
||||
let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
|
||||
.await
|
||||
|
||||
@@ -130,6 +130,13 @@ impl HeartbeatTask {
|
||||
|
||||
pub fn shutdown(&self) {
|
||||
info!("Close heartbeat task for flownode");
|
||||
if self
|
||||
.running
|
||||
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
|
||||
.is_err()
|
||||
{
|
||||
warn!("Call close heartbeat task multiple times");
|
||||
}
|
||||
}
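
The `shutdown` added above flips an `AtomicBool` with `compare_exchange`, so only the first caller performs the `true -> false` transition and later callers merely see a warning. A small sketch of that idempotent-shutdown idiom (the `Task` type is illustrative, not the flownode's heartbeat task):

```rust
use std::sync::atomic::{AtomicBool, Ordering};

struct Task {
    running: AtomicBool,
}

impl Task {
    fn new() -> Self {
        Self { running: AtomicBool::new(true) }
    }

    /// Returns true only for the caller that actually stopped the task;
    /// every later call observes `false` and becomes a no-op.
    fn shutdown(&self) -> bool {
        self.running
            .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
            .is_ok()
    }
}

fn main() {
    let task = Task::new();
    assert!(task.shutdown());  // first call performs the transition
    assert!(!task.shutdown()); // second call sees it already stopped
}
```
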
|
||||
|
||||
fn new_heartbeat_request(
|
||||
|
||||
@@ -33,7 +33,6 @@ mod expr;
|
||||
pub mod heartbeat;
|
||||
mod metrics;
|
||||
mod plan;
|
||||
mod recording_rules;
|
||||
mod repr;
|
||||
mod server;
|
||||
mod transform;
|
||||
@@ -44,5 +43,4 @@ mod test_utils;
|
||||
|
||||
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
|
||||
pub use error::{Error, Result};
|
||||
pub use recording_rules::FrontendClient;
|
||||
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};
|
||||
|
||||
@@ -28,32 +28,6 @@ lazy_static! {
|
||||
&["table_id"]
|
||||
)
|
||||
.unwrap();
|
||||
pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
|
||||
"greptime_flow_rule_engine_query_time",
|
||||
"flow rule engine query time",
|
||||
&["flow_id"],
|
||||
vec![
|
||||
0.0,
|
||||
1.,
|
||||
3.,
|
||||
5.,
|
||||
10.,
|
||||
20.,
|
||||
30.,
|
||||
60.,
|
||||
2. * 60.,
|
||||
5. * 60.,
|
||||
10. * 60.
|
||||
]
|
||||
)
|
||||
.unwrap();
|
||||
pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
|
||||
"greptime_flow_rule_engine_slow_query",
|
||||
"flow rule engine slow query",
|
||||
&["flow_id", "sql", "peer"],
|
||||
vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
|
||||
)
|
||||
.unwrap();
|
||||
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
|
||||
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
|
||||
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(
|
||||
|
||||
@@ -1,940 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Run flow as recording rule which is time-window-aware normal query triggered every tick set by user
|
||||
|
||||
mod engine;
|
||||
mod frontend_client;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::helper::pb_value_to_value_ref;
|
||||
use catalog::CatalogManagerRef;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_recordbatch::DfRecordBatch;
|
||||
use common_telemetry::warn;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::Timestamp;
|
||||
use datafusion::error::Result as DfResult;
|
||||
use datafusion::logical_expr::Expr;
|
||||
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
|
||||
use datafusion::prelude::SessionContext;
|
||||
use datafusion::sql::unparser::Unparser;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
|
||||
use datafusion_common::{DFSchema, TableReference};
|
||||
use datafusion_expr::{ColumnarValue, LogicalPlan};
|
||||
use datafusion_physical_expr::PhysicalExprRef;
|
||||
use datatypes::prelude::{ConcreteDataType, DataType};
|
||||
use datatypes::scalars::ScalarVector;
|
||||
use datatypes::schema::TIME_INDEX_KEY;
|
||||
use datatypes::value::Value;
|
||||
use datatypes::vectors::{
|
||||
TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
|
||||
TimestampSecondVector, Vector,
|
||||
};
|
||||
pub use engine::RecordingRuleEngine;
|
||||
pub use frontend_client::FrontendClient;
|
||||
use itertools::Itertools;
|
||||
use query::parser::QueryLanguageParser;
|
||||
use query::QueryEngineRef;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
|
||||
use crate::adapter::util::from_proto_to_data_type;
|
||||
use crate::df_optimizer::apply_df_optimizer;
|
||||
use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
|
||||
use crate::expr::error::DataTypeSnafu;
|
||||
use crate::Error;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TimeWindowExpr {
|
||||
phy_expr: PhysicalExprRef,
|
||||
column_name: String,
|
||||
logical_expr: Expr,
|
||||
df_schema: DFSchema,
|
||||
}
|
||||
|
||||
impl TimeWindowExpr {
|
||||
pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
|
||||
let phy_planner = DefaultPhysicalPlanner::default();
|
||||
|
||||
let phy_expr: PhysicalExprRef = phy_planner
|
||||
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!(
|
||||
"Failed to create physical expression from {expr:?} using {df_schema:?}"
|
||||
),
|
||||
})?;
|
||||
Ok(Self {
|
||||
phy_expr,
|
||||
column_name: column_name.to_string(),
|
||||
logical_expr: expr.clone(),
|
||||
df_schema: df_schema.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn eval(
|
||||
&self,
|
||||
current: Timestamp,
|
||||
) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
|
||||
let lower_bound =
|
||||
find_expr_time_window_lower_bound(&self.logical_expr, &self.df_schema, current)?;
|
||||
let upper_bound =
|
||||
find_expr_time_window_upper_bound(&self.logical_expr, &self.df_schema, current)?;
|
||||
Ok((lower_bound, upper_bound))
|
||||
}
|
||||
|
||||
/// Find timestamps from rows using time window expr
|
||||
pub async fn handle_rows(
|
||||
&self,
|
||||
rows_list: Vec<api::v1::Rows>,
|
||||
) -> Result<BTreeSet<Timestamp>, Error> {
|
||||
let mut time_windows = BTreeSet::new();
|
||||
|
||||
for rows in rows_list {
|
||||
// pick the time index column and use it to eval on `self.expr`
|
||||
let ts_col_index = rows
|
||||
.schema
|
||||
.iter()
|
||||
.map(|col| col.column_name.clone())
|
||||
.position(|name| name == self.column_name);
|
||||
let Some(ts_col_index) = ts_col_index else {
|
||||
warn!("can't found time index column in schema: {:?}", rows.schema);
|
||||
continue;
|
||||
};
|
||||
let col_schema = &rows.schema[ts_col_index];
|
||||
let cdt = from_proto_to_data_type(col_schema)?;
|
||||
|
||||
let column_values = rows
|
||||
.rows
|
||||
.iter()
|
||||
.map(|row| &row.values[ts_col_index])
|
||||
.collect_vec();
|
||||
|
||||
let mut vector = cdt.create_mutable_vector(column_values.len());
|
||||
for value in column_values {
|
||||
let value = pb_value_to_value_ref(value, &None);
|
||||
vector.try_push_value_ref(value).context(DataTypeSnafu {
|
||||
msg: "Failed to convert rows to columns",
|
||||
})?;
|
||||
}
|
||||
let vector = vector.to_vector();
|
||||
|
||||
let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
|
||||
|
||||
let rb =
|
||||
DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
|
||||
.with_context(|_e| ArrowSnafu {
|
||||
context: format!(
|
||||
"Failed to create record batch from {df_schema:?} and {vector:?}"
|
||||
),
|
||||
})?;
|
||||
|
||||
let eval_res = self
|
||||
.phy_expr
|
||||
.evaluate(&rb)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!(
|
||||
"Failed to evaluate physical expression {:?} on {rb:?}",
|
||||
self.phy_expr
|
||||
),
|
||||
})?;
|
||||
|
||||
let res = columnar_to_ts_vector(&eval_res)?;
|
||||
|
||||
for ts in res.into_iter().flatten() {
|
||||
time_windows.insert(ts);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(time_windows)
|
||||
}
|
||||
}
|
||||
|
||||
fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
|
||||
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
|
||||
name,
|
||||
cdt.as_arrow_type(),
|
||||
false,
|
||||
)]));
|
||||
|
||||
let df_schema = DFSchema::from_field_specific_qualified_schema(
|
||||
vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
|
||||
&arrow_schema,
|
||||
)
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
|
||||
})?;
|
||||
|
||||
Ok(df_schema)
|
||||
}
|
||||
|
||||
/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
|
||||
fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
|
||||
let val = match columnar {
|
||||
datafusion_expr::ColumnarValue::Array(array) => {
|
||||
let ty = array.data_type();
|
||||
let ty = ConcreteDataType::from_arrow_type(ty);
|
||||
let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
|
||||
ty.unit()
|
||||
} else {
|
||||
return UnexpectedSnafu {
|
||||
reason: format!("Non-timestamp type: {ty:?}"),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
|
||||
match time_unit {
|
||||
TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
|
||||
.with_context(|_| DatatypesSnafu {
|
||||
extra: format!("Failed to create vector from arrow array {array:?}"),
|
||||
})?
|
||||
.iter_data()
|
||||
.map(|d| d.map(|d| d.0))
|
||||
.collect_vec(),
|
||||
TimeUnit::Millisecond => {
|
||||
TimestampMillisecondVector::try_from_arrow_array(array.clone())
|
||||
.with_context(|_| DatatypesSnafu {
|
||||
extra: format!("Failed to create vector from arrow array {array:?}"),
|
||||
})?
|
||||
.iter_data()
|
||||
.map(|d| d.map(|d| d.0))
|
||||
.collect_vec()
|
||||
}
|
||||
TimeUnit::Microsecond => {
|
||||
TimestampMicrosecondVector::try_from_arrow_array(array.clone())
|
||||
.with_context(|_| DatatypesSnafu {
|
||||
extra: format!("Failed to create vector from arrow array {array:?}"),
|
||||
})?
|
||||
.iter_data()
|
||||
.map(|d| d.map(|d| d.0))
|
||||
.collect_vec()
|
||||
}
|
||||
TimeUnit::Nanosecond => {
|
||||
TimestampNanosecondVector::try_from_arrow_array(array.clone())
|
||||
.with_context(|_| DatatypesSnafu {
|
||||
extra: format!("Failed to create vector from arrow array {array:?}"),
|
||||
})?
|
||||
.iter_data()
|
||||
.map(|d| d.map(|d| d.0))
|
||||
.collect_vec()
|
||||
}
|
||||
}
|
||||
}
|
||||
datafusion_expr::ColumnarValue::Scalar(scalar) => {
|
||||
let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
|
||||
extra: format!("Failed to convert scalar {scalar:?} to value"),
|
||||
})?;
|
||||
let ts = value.as_timestamp().context(UnexpectedSnafu {
|
||||
reason: format!("Expect Timestamp, found {:?}", value),
|
||||
})?;
|
||||
vec![Some(ts)]
|
||||
}
|
||||
};
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
/// Convert sql to datafusion logical plan
|
||||
pub async fn sql_to_df_plan(
|
||||
query_ctx: QueryContextRef,
|
||||
engine: QueryEngineRef,
|
||||
sql: &str,
|
||||
optimize: bool,
|
||||
) -> Result<LogicalPlan, Error> {
|
||||
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let plan = engine
|
||||
.planner()
|
||||
.plan(&stmt, query_ctx)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let plan = if optimize {
|
||||
apply_df_optimizer(plan).await?
|
||||
} else {
|
||||
plan
|
||||
};
|
||||
Ok(plan)
|
||||
}
|
||||
|
||||
/// Return (the column name of time index column, the time window expr, the expected time unit of time index column, the expr's schema for evaluating the time window)
|
||||
async fn find_time_window_expr(
|
||||
plan: &LogicalPlan,
|
||||
catalog_man: CatalogManagerRef,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
|
||||
// TODO(discord9): find the expr that do time window
|
||||
|
||||
let mut table_name = None;
|
||||
|
||||
// first find the table source in the logical plan
|
||||
plan.apply(|plan| {
|
||||
let LogicalPlan::TableScan(table_scan) = plan else {
|
||||
return Ok(TreeNodeRecursion::Continue);
|
||||
};
|
||||
table_name = Some(table_scan.table_name.clone());
|
||||
Ok(TreeNodeRecursion::Stop)
|
||||
})
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Can't find table source in plan {plan:?}"),
|
||||
})?;
|
||||
let Some(table_name) = table_name else {
|
||||
UnexpectedSnafu {
|
||||
reason: format!("Can't find table source in plan {plan:?}"),
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
|
||||
let current_schema = query_ctx.current_schema();
|
||||
|
||||
let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
|
||||
let schema_name = table_name.schema().unwrap_or(&current_schema);
|
||||
let table_name = table_name.table();
|
||||
|
||||
let Some(table_ref) = catalog_man
|
||||
.table(catalog_name, schema_name, table_name, Some(&query_ctx))
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?
|
||||
else {
|
||||
UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
|
||||
),
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
|
||||
let schema = &table_ref.table_info().meta.schema;
|
||||
|
||||
let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
|
||||
reason: format!("Can't find timestamp column in table {table_name:?}"),
|
||||
})?;
|
||||
|
||||
let ts_col_name = ts_index.name.clone();
|
||||
|
||||
let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
|
||||
),
|
||||
})?.unit();
|
||||
|
||||
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
|
||||
ts_col_name.clone(),
|
||||
ts_index.data_type.as_arrow_type(),
|
||||
false,
|
||||
)]));
|
||||
|
||||
let df_schema = DFSchema::from_field_specific_qualified_schema(
|
||||
vec![Some(TableReference::bare(table_name))],
|
||||
&arrow_schema,
|
||||
)
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
|
||||
})?;
|
||||
|
||||
// find the time window expr which refers to the time index column
|
||||
let mut aggr_expr = None;
|
||||
let mut time_window_expr: Option<Expr> = None;
|
||||
|
||||
let find_inner_aggr_expr = |plan: &LogicalPlan| {
|
||||
if let LogicalPlan::Aggregate(aggregate) = plan {
|
||||
aggr_expr = Some(aggregate.clone());
|
||||
};
|
||||
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
};
|
||||
plan.apply(find_inner_aggr_expr)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Can't find aggr expr in plan {plan:?}"),
|
||||
})?;
|
||||
|
||||
if let Some(aggregate) = aggr_expr {
|
||||
for group_expr in &aggregate.group_expr {
|
||||
let refs = group_expr.column_refs();
|
||||
if refs.len() != 1 {
|
||||
continue;
|
||||
}
|
||||
let ref_col = refs.iter().next().unwrap();
|
||||
|
||||
let index = aggregate.input.schema().maybe_index_of_column(ref_col);
|
||||
let Some(index) = index else {
|
||||
continue;
|
||||
};
|
||||
let field = aggregate.input.schema().field(index);
|
||||
|
||||
let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
|
||||
|
||||
if is_time_index {
|
||||
let rewrite_column = group_expr.clone();
|
||||
let rewritten = rewrite_column
|
||||
.rewrite(&mut RewriteColumn {
|
||||
table_name: table_name.to_string(),
|
||||
})
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Rewrite expr failed, expr={:?}", group_expr),
|
||||
})?
|
||||
.data;
|
||||
struct RewriteColumn {
|
||||
table_name: String,
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for RewriteColumn {
|
||||
type Node = Expr;
|
||||
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
|
||||
let Expr::Column(mut column) = node else {
|
||||
return Ok(Transformed::no(node));
|
||||
};
|
||||
|
||||
column.relation = Some(TableReference::bare(self.table_name.clone()));
|
||||
|
||||
Ok(Transformed::yes(Expr::Column(column)))
|
||||
}
|
||||
}
|
||||
|
||||
time_window_expr = Some(rewritten);
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
|
||||
} else {
|
||||
// can't found time window expr, return None
|
||||
Ok((ts_col_name, None, expected_time_unit, df_schema))
|
||||
}
|
||||
}
|
||||
|
||||
/// Find nearest lower bound for time `current` in given `plan` for the time window expr.
|
||||
/// i.e. for time window expr being `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
|
||||
/// return `Some("2021-07-01 00:00:00.000")`
|
||||
/// if `plan` doesn't contain a `TIME INDEX` column, return `None`
|
||||
///
|
||||
/// Time window expr is a expr that:
|
||||
/// 1. ref only to a time index column
|
||||
/// 2. is monotonic increasing
|
||||
/// 3. show up in GROUP BY clause
|
||||
///
|
||||
/// note this plan should only contain one TableScan
|
||||
pub async fn find_plan_time_window_bound(
|
||||
plan: &LogicalPlan,
|
||||
current: Timestamp,
|
||||
query_ctx: QueryContextRef,
|
||||
engine: QueryEngineRef,
|
||||
) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
|
||||
// TODO(discord9): find the expr that do time window
|
||||
let catalog_man = engine.engine_state().catalog_manager();
|
||||
|
||||
let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
|
||||
find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
|
||||
// cast current to ts_index's type
|
||||
let new_current = current
|
||||
.convert_to(expected_time_unit)
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
|
||||
})?;
|
||||
|
||||
// if no time_window_expr is found, return None
|
||||
if let Some(time_window_expr) = time_window_expr {
|
||||
let lower_bound =
|
||||
find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
|
||||
let upper_bound =
|
||||
find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
|
||||
Ok((ts_col_name, lower_bound, upper_bound))
|
||||
} else {
|
||||
Ok((ts_col_name, None, None))
|
||||
}
|
||||
}
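
The doc comment above pins this down with a concrete case: for `date_bin(INTERVAL '5 minutes', ts)` and `current = 2021-07-01 00:01:01`, the lower bound is `2021-07-01 00:00:00`, i.e. the timestamp floored to the bin width. A tiny sketch of that arithmetic on Unix seconds (the helper name and epoch math are illustrative, not the crate's `Timestamp` API):

```rust
/// Floor a Unix timestamp (seconds) to the start of its `bin_secs`-wide window.
fn time_window_lower_bound(ts_secs: i64, bin_secs: i64) -> i64 {
    ts_secs - ts_secs.rem_euclid(bin_secs)
}

fn main() {
    // 2021-07-01 00:00:00 UTC as Unix seconds; 00:01:01 is 61 seconds later.
    let window_start = 1_625_097_600;
    let current = window_start + 61;
    // A 5-minute bin (300 s) floors the timestamp back to the window start.
    assert_eq!(time_window_lower_bound(current, 300), window_start);
}
```
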
|
||||
|
||||
/// Find the lower bound of time window in given `expr` and `current` timestamp.
|
||||
///
|
||||
/// i.e. for `current="2021-07-01 00:01:01.000"` and `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
|
||||
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
|
||||
/// of current time window given the current timestamp
|
||||
///
|
||||
/// if return None, meaning this time window have no lower bound
|
||||
fn find_expr_time_window_lower_bound(
|
||||
expr: &Expr,
|
||||
df_schema: &DFSchema,
|
||||
current: Timestamp,
|
||||
) -> Result<Option<Timestamp>, Error> {
|
||||
let phy_planner = DefaultPhysicalPlanner::default();
|
||||
|
||||
let phy_expr: PhysicalExprRef = phy_planner
|
||||
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!(
|
||||
"Failed to create physical expression from {expr:?} using {df_schema:?}"
|
||||
),
|
||||
})?;
|
||||
|
||||
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
|
||||
let input_time_unit = cur_time_window.unit();
|
||||
Ok(cur_time_window.convert_to(input_time_unit))
|
||||
}
|
||||
|
||||
/// Find the upper bound for time window expression
|
||||
fn find_expr_time_window_upper_bound(
|
||||
expr: &Expr,
|
||||
df_schema: &DFSchema,
|
||||
current: Timestamp,
|
||||
) -> Result<Option<Timestamp>, Error> {
|
||||
use std::cmp::Ordering;
|
||||
|
||||
let phy_planner = DefaultPhysicalPlanner::default();
|
||||
|
||||
let phy_expr: PhysicalExprRef = phy_planner
|
||||
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!(
|
||||
"Failed to create physical expression from {expr:?} using {df_schema:?}"
|
||||
),
|
||||
})?;
|
||||
|
||||
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
|
||||
|
||||
// search to find the lower bound
|
||||
let mut offset: i64 = 1;
|
||||
let mut lower_bound = Some(current);
|
||||
let upper_bound;
|
||||
// first do an exponential probe to find a range for the binary search
|
||||
loop {
|
||||
let Some(next_val) = current.value().checked_add(offset) else {
|
||||
// no upper bound if overflow
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
|
||||
|
||||
let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
|
||||
|
||||
match next_time_window.cmp(&cur_time_window) {
|
||||
Ordering::Less => {UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
|
||||
),
|
||||
}
|
||||
.fail()?
|
||||
}
|
||||
Ordering::Equal => {
|
||||
lower_bound = Some(next_time_probe);
|
||||
}
|
||||
Ordering::Greater => {
|
||||
upper_bound = Some(next_time_probe);
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
let Some(new_offset) = offset.checked_mul(2) else {
|
||||
// no upper bound if overflow
|
||||
return Ok(None);
|
||||
};
|
||||
offset = new_offset;
|
||||
}
|
||||
|
||||
// binary search for the exact upper bound
|
||||
|
||||
ensure!(lower_bound.map(|v|v.unit())==upper_bound.map(|v|v.unit()), UnexpectedSnafu{
|
||||
reason: format!(" unit mismatch for time window expression {expr:?}, found {lower_bound:?} and {upper_bound:?}"),
|
||||
});
|
||||
|
||||
let output_unit = upper_bound
|
||||
.context(UnexpectedSnafu {
|
||||
reason: "should have lower bound",
|
||||
})?
|
||||
.unit();
|
||||
|
||||
let mut low = lower_bound
|
||||
.context(UnexpectedSnafu {
|
||||
reason: "should have lower bound",
|
||||
})?
|
||||
.value();
|
||||
let mut high = upper_bound
|
||||
.context(UnexpectedSnafu {
|
||||
reason: "should have upper bound",
|
||||
})?
|
||||
.value();
|
||||
while low < high {
|
||||
let mid = (low + high) / 2;
|
||||
let mid_probe = common_time::Timestamp::new(mid, output_unit);
|
||||
let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
|
||||
|
||||
match mid_time_window.cmp(&cur_time_window) {
|
||||
Ordering::Less => UnexpectedSnafu {
|
||||
reason: format!("Binary search failed for time window expression {expr:?}"),
|
||||
}
|
||||
.fail()?,
|
||||
Ordering::Equal => low = mid + 1,
|
||||
Ordering::Greater => high = mid,
|
||||
}
|
||||
}
|
||||
|
||||
let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
|
||||
|
||||
Ok(Some(final_upper_bound_for_time_window))
|
||||
}
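
The removed `find_expr_time_window_upper_bound` treats the time window expression as a monotonic step function of the timestamp: it probes exponentially growing offsets until the evaluated window changes, then binary-searches inside that bracket for the first timestamp that lands in the next window. A self-contained sketch of the same two-phase search, with a plain 300-second bucketing function standing in for evaluating the expression:

```rust
/// Stand-in for evaluating the time window expression: a monotonic step
/// function mapping a timestamp (in seconds) to the start of its 300 s window.
fn bucket(ts: i64) -> i64 {
    ts.div_euclid(300) * 300
}

/// Exclusive upper bound of the window containing `current`: the first
/// timestamp that evaluates to a later window, or None on overflow.
fn window_upper_bound(current: i64) -> Option<i64> {
    let cur = bucket(current);

    // Phase 1: exponential probe until the evaluated window changes.
    let mut offset: i64 = 1;
    let mut probe = current.checked_add(offset)?;
    while bucket(probe) == cur {
        offset = offset.checked_mul(2)?;
        probe = current.checked_add(offset)?;
    }

    // Phase 2: binary search for the smallest timestamp past the boundary.
    let (mut low, mut high) = (current, probe);
    while low < high {
        let mid = low + (high - low) / 2;
        if bucket(mid) > cur {
            high = mid;
        } else {
            low = mid + 1;
        }
    }
    Some(high)
}

fn main() {
    // 61 s into a 5-minute window: the next window starts at 300 s.
    assert_eq!(window_upper_bound(61), Some(300));
}
```

The exponential phase needs O(log d) evaluations to bracket a boundary at distance d, and the binary search narrows it down with roughly another O(log d) evaluations of the expression.
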
|
||||
|
||||
fn eval_ts_to_ts(
|
||||
phy: &PhysicalExprRef,
|
||||
df_schema: &DFSchema,
|
||||
input_value: Timestamp,
|
||||
) -> Result<Timestamp, Error> {
|
||||
let schema_ty = df_schema.field(0).data_type();
|
||||
let schema_cdt = ConcreteDataType::from_arrow_type(schema_ty);
|
||||
let schema_unit = if let ConcreteDataType::Timestamp(ts) = schema_cdt {
|
||||
ts.unit()
|
||||
} else {
|
||||
return UnexpectedSnafu {
|
||||
reason: format!("Expect Timestamp, found {:?}", schema_cdt),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
let input_value = input_value
|
||||
.convert_to(schema_unit)
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Failed to convert timestamp {input_value:?} to {schema_unit}"),
|
||||
})?;
|
||||
let ts_vector = match schema_unit {
|
||||
TimeUnit::Second => {
|
||||
TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
|
||||
}
|
||||
TimeUnit::Millisecond => {
|
||||
TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
|
||||
}
|
||||
TimeUnit::Microsecond => {
|
||||
TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
|
||||
}
|
||||
TimeUnit::Nanosecond => {
|
||||
TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
|
||||
}
|
||||
};
|
||||
|
||||
let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
|
||||
.with_context(|_| ArrowSnafu {
|
||||
context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
|
||||
})?;
|
||||
|
||||
let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
|
||||
context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
|
||||
})?;
|
||||
|
||||
if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
|
||||
Ok(*ts)
|
||||
} else {
|
||||
UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Expected timestamp in expression {phy:?} but got {:?}",
|
||||
eval_res
|
||||
),
|
||||
}
|
||||
.fail()?
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(discord9): a method to find out the precise time window
|
||||
|
||||
/// Find the `Filter` node corresponding to the outermost `WHERE` and add a new filter expression to it
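///
/// For example, rewriting the plan of `SELECT number FROM numbers_with_ts WHERE number < 2 OR number > 10`
/// with the extra filter `number > 4` ANDs the filter into the existing `WHERE` clause
/// (see the tests below for the full unparsed output).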
|
||||
#[derive(Debug)]
|
||||
pub struct AddFilterRewriter {
|
||||
extra_filter: Expr,
|
||||
is_rewritten: bool,
|
||||
}
|
||||
|
||||
impl AddFilterRewriter {
|
||||
fn new(filter: Expr) -> Self {
|
||||
Self {
|
||||
extra_filter: filter,
|
||||
is_rewritten: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for AddFilterRewriter {
|
||||
type Node = LogicalPlan;
|
||||
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
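// `f_up` runs bottom-up, so the extra predicate is ANDed into the lowest non-HAVING
// `Filter` encountered, or a new `Filter` is placed above the `TableScan` when none
// exists; `is_rewritten` guarantees the predicate is only added once.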
|
||||
if self.is_rewritten {
|
||||
return Ok(Transformed::no(node));
|
||||
}
|
||||
match node {
|
||||
LogicalPlan::Filter(mut filter) if !filter.having => {
|
||||
filter.predicate = filter.predicate.and(self.extra_filter.clone());
|
||||
self.is_rewritten = true;
|
||||
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
|
||||
}
|
||||
LogicalPlan::TableScan(_) => {
|
||||
// no existing filter below, so add a new `Filter` node on top of the table scan
|
||||
let filter =
|
||||
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
|
||||
self.is_rewritten = true;
|
||||
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
|
||||
}
|
||||
_ => Ok(Transformed::no(node)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
|
||||
/// A dialect that quotes identifiers that are not already all-lowercase
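/// e.g. `UPPERCASE_NUMBERS_WITH_TS` is unparsed as `"UPPERCASE_NUMBERS_WITH_TS"`,
/// while an all-lowercase identifier like `numbers_with_ts` is left unquoted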
|
||||
struct ForceQuoteIdentifiers;
|
||||
impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
|
||||
fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
|
||||
if identifier.to_lowercase() != identifier {
|
||||
Some('"')
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
let unparser = Unparser::new(&ForceQuoteIdentifiers);
|
||||
// first make all columns qualified
|
||||
let sql = unparser
|
||||
.plan_to_sql(plan)
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!("Failed to unparse logical plan {plan:?}"),
|
||||
})?;
|
||||
Ok(sql.to_string())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use datafusion_common::tree_node::TreeNode;
|
||||
use pretty_assertions::assert_eq;
|
||||
use session::context::QueryContext;
|
||||
|
||||
use super::{sql_to_df_plan, *};
|
||||
use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
|
||||
use crate::test_utils::create_test_query_engine;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sql_plan_convert() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
|
||||
let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
|
||||
.await
|
||||
.unwrap();
|
||||
let new_sql = df_plan_to_sql(&new).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
r#"SELECT "UPPERCASE_NUMBERS_WITH_TS"."NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#,
|
||||
new_sql
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_add_filter() {
|
||||
let testcases = vec![
|
||||
(
|
||||
"SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
|
||||
),
|
||||
(
|
||||
"SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
|
||||
"SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
|
||||
),
|
||||
(
|
||||
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
|
||||
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
|
||||
)
|
||||
];
|
||||
use datafusion_expr::{col, lit};
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
|
||||
for (before, after) in testcases {
|
||||
let sql = before;
|
||||
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
|
||||
let plan = plan.rewrite(&mut add_filter).unwrap().data;
|
||||
let new_sql = df_plan_to_sql(&plan).unwrap();
|
||||
assert_eq!(after, new_sql);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_plan_time_window_lower_bound() {
|
||||
use datafusion_expr::{col, lit};
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
|
||||
let testcases = [
|
||||
// same alias is not same column
|
||||
(
|
||||
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
|
||||
Timestamp::new(1740394109, TimeUnit::Second),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
|
||||
),
|
||||
r#"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"#
|
||||
),
|
||||
// complex time window index
|
||||
(
|
||||
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
|
||||
Timestamp::new(1740394109, TimeUnit::Second),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(1740394080, TimeUnit::Second)),
|
||||
Some(Timestamp::new(1740394140, TimeUnit::Second)),
|
||||
),
|
||||
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
|
||||
),
|
||||
// no time index
|
||||
(
|
||||
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
("ts".to_string(), None, None),
|
||||
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
|
||||
),
|
||||
// time index
|
||||
(
|
||||
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
|
||||
Timestamp::new(23, TimeUnit::Nanosecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
|
||||
),
|
||||
// on spot
|
||||
(
|
||||
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
|
||||
Timestamp::new(0, TimeUnit::Nanosecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
|
||||
),
|
||||
// different time unit
|
||||
(
|
||||
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
|
||||
Timestamp::new(23_000_000, TimeUnit::Nanosecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
|
||||
),
|
||||
// time index with other fields
|
||||
(
|
||||
"SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
|
||||
),
|
||||
// time index with other pks
|
||||
(
|
||||
"SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
|
||||
),
|
||||
// subquery
|
||||
(
|
||||
"SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
|
||||
),
|
||||
// cte
|
||||
(
|
||||
"with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
|
||||
),
|
||||
// complex subquery without alias
|
||||
(
|
||||
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
|
||||
),
|
||||
// complex subquery alias
|
||||
(
|
||||
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
|
||||
Timestamp::new(23, TimeUnit::Millisecond),
|
||||
(
|
||||
"ts".to_string(),
|
||||
Some(Timestamp::new(0, TimeUnit::Millisecond)),
|
||||
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
|
||||
),
|
||||
"SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
|
||||
),
|
||||
];
|
||||
|
||||
for (sql, current, expected, expected_unparsed) in testcases {
|
||||
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let real =
|
||||
find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(expected, real);
|
||||
|
||||
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
|
||||
.await
|
||||
.unwrap();
|
||||
let (col_name, lower, upper) = real;
|
||||
let new_sql = if lower.is_some() {
|
||||
let to_df_literal = |value| {
|
||||
let value = Value::from(value);
|
||||
|
||||
value.try_to_scalar_value(&value.data_type()).unwrap()
|
||||
};
|
||||
let lower = to_df_literal(lower.unwrap());
|
||||
let upper = to_df_literal(upper.unwrap());
|
||||
let expr = col(&col_name)
|
||||
.gt_eq(lit(lower))
|
||||
.and(col(&col_name).lt_eq(lit(upper)));
|
||||
let mut add_filter = AddFilterRewriter::new(expr);
|
||||
let plan = plan.rewrite(&mut add_filter).unwrap().data;
|
||||
df_plan_to_sql(&plan).unwrap()
|
||||
} else {
|
||||
sql.to_string()
|
||||
};
|
||||
assert_eq!(expected_unparsed, new_sql);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,815 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use api::v1::flow::FlowResponse;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::ddl::create_flow::FlowType;
|
||||
use common_meta::key::flow::FlowMetadataManagerRef;
|
||||
use common_meta::key::table_info::TableInfoManager;
|
||||
use common_meta::key::TableMetadataManagerRef;
|
||||
use common_telemetry::tracing::warn;
|
||||
use common_telemetry::{debug, info};
|
||||
use common_time::Timestamp;
|
||||
use datafusion::sql::unparser::expr_to_sql;
|
||||
use datafusion_common::tree_node::TreeNode;
|
||||
use datatypes::value::Value;
|
||||
use query::QueryEngineRef;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::storage::RegionId;
|
||||
use table::metadata::TableId;
|
||||
use tokio::sync::oneshot::error::TryRecvError;
|
||||
use tokio::sync::{oneshot, RwLock};
|
||||
use tokio::time::Instant;
|
||||
|
||||
use super::frontend_client::FrontendClient;
|
||||
use super::{df_plan_to_sql, AddFilterRewriter, TimeWindowExpr};
|
||||
use crate::adapter::{CreateFlowArgs, FlowId, TableName};
|
||||
use crate::error::{
|
||||
DatafusionSnafu, DatatypesSnafu, ExternalSnafu, FlowAlreadyExistSnafu, InternalSnafu,
|
||||
TimeSnafu, UnexpectedSnafu,
|
||||
};
|
||||
use crate::metrics::{METRIC_FLOW_RULE_ENGINE_QUERY_TIME, METRIC_FLOW_RULE_ENGINE_SLOW_QUERY};
|
||||
use crate::recording_rules::{find_time_window_expr, sql_to_df_plan};
|
||||
use crate::Error;
|
||||
|
||||
/// TODO(discord9): make those constants configurable
|
||||
/// The default rule engine query timeout is 10 minutes
|
||||
pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
|
||||
|
||||
/// will output a warning log for any query that runs for more than 1 minute, and also every 1 minute while that query is still running
|
||||
pub const SLOW_QUERY_THRESHOLD: Duration = Duration::from_secs(60);
|
||||
|
||||
/// TODO(discord9): determine how to configure refresh rate
|
||||
pub struct RecordingRuleEngine {
|
||||
tasks: RwLock<BTreeMap<FlowId, RecordingRuleTask>>,
|
||||
shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
|
||||
frontend_client: Arc<FrontendClient>,
|
||||
flow_metadata_manager: FlowMetadataManagerRef,
|
||||
table_meta: TableMetadataManagerRef,
|
||||
engine: QueryEngineRef,
|
||||
}
|
||||
|
||||
impl RecordingRuleEngine {
|
||||
pub fn new(
|
||||
frontend_client: Arc<FrontendClient>,
|
||||
engine: QueryEngineRef,
|
||||
flow_metadata_manager: FlowMetadataManagerRef,
|
||||
table_meta: TableMetadataManagerRef,
|
||||
) -> Self {
|
||||
Self {
|
||||
tasks: Default::default(),
|
||||
shutdown_txs: Default::default(),
|
||||
frontend_client,
|
||||
flow_metadata_manager,
|
||||
table_meta,
|
||||
engine,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn handle_inserts(
|
||||
&self,
|
||||
request: api::v1::region::InsertRequests,
|
||||
) -> Result<FlowResponse, Error> {
|
||||
let table_info_mgr = self.table_meta.table_info_manager();
|
||||
let mut group_by_table_name: HashMap<TableName, Vec<api::v1::Rows>> = HashMap::new();
|
||||
for r in request.requests {
|
||||
let tid = RegionId::from(r.region_id).table_id();
|
||||
let name = get_table_name(table_info_mgr, &tid).await?;
|
||||
let entry = group_by_table_name.entry(name).or_default();
|
||||
if let Some(rows) = r.rows {
|
||||
entry.push(rows);
|
||||
}
|
||||
}
|
||||
|
||||
for (_flow_id, task) in self.tasks.read().await.iter() {
|
||||
let src_table_names = &task.source_table_names;
|
||||
|
||||
for src_table_name in src_table_names {
|
||||
if let Some(entry) = group_by_table_name.get(src_table_name) {
|
||||
let Some(expr) = &task.time_window_expr else {
|
||||
continue;
|
||||
};
|
||||
let involved_time_windows = expr.handle_rows(entry.clone()).await?;
|
||||
let mut state = task.state.write().await;
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_lower_bounds(involved_time_windows.into_iter());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_table_name(zelf: &TableInfoManager, table_id: &TableId) -> Result<TableName, Error> {
|
||||
zelf.get(*table_id)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Table id = {:?}, couldn't found table name", table_id),
|
||||
})
|
||||
.map(|name| name.table_name())
|
||||
.map(|name| [name.catalog_name, name.schema_name, name.table_name])
|
||||
}
|
||||
|
||||
const MIN_REFRESH_DURATION: Duration = Duration::new(5, 0);
|
||||
|
||||
impl RecordingRuleEngine {
|
||||
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
|
||||
let CreateFlowArgs {
|
||||
flow_id,
|
||||
sink_table_name,
|
||||
source_table_ids,
|
||||
create_if_not_exists,
|
||||
or_replace,
|
||||
expire_after,
|
||||
comment: _,
|
||||
sql,
|
||||
flow_options,
|
||||
query_ctx,
|
||||
} = args;
|
||||
|
||||
// or replace logic
|
||||
{
|
||||
let is_exist = self.tasks.read().await.contains_key(&flow_id);
|
||||
match (create_if_not_exists, or_replace, is_exist) {
|
||||
// if replace, ignore that old flow exists
|
||||
(_, true, true) => {
|
||||
info!("Replacing flow with id={}", flow_id);
|
||||
}
|
||||
(false, false, true) => FlowAlreadyExistSnafu { id: flow_id }.fail()?,
|
||||
// already exists, and not replace, return None
|
||||
(true, false, true) => {
|
||||
info!("Flow with id={} already exists, do nothing", flow_id);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// continue as normal
|
||||
(_, _, false) => (),
|
||||
}
|
||||
}
|
||||
|
||||
let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
|
||||
|
||||
ensure!(
|
||||
flow_type == Some(&FlowType::RecordingRule.to_string()) || flow_type.is_none(),
|
||||
UnexpectedSnafu {
|
||||
reason: format!("Flow type is not RecordingRule nor None, got {flow_type:?}")
|
||||
}
|
||||
);
|
||||
|
||||
let Some(query_ctx) = query_ctx else {
|
||||
UnexpectedSnafu {
|
||||
reason: "Query context is None".to_string(),
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
let query_ctx = Arc::new(query_ctx);
|
||||
let mut source_table_names = Vec::new();
|
||||
for src_id in source_table_ids {
|
||||
let table_name = self
|
||||
.table_meta
|
||||
.table_info_manager()
|
||||
.get(src_id)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Table id = {:?}, couldn't found table name", src_id),
|
||||
})
|
||||
.map(|name| name.table_name())
|
||||
.map(|name| [name.catalog_name, name.schema_name, name.table_name])?;
|
||||
source_table_names.push(table_name);
|
||||
}
|
||||
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
let plan = sql_to_df_plan(query_ctx.clone(), self.engine.clone(), &sql, true).await?;
|
||||
let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
|
||||
&plan,
|
||||
self.engine.engine_state().catalog_manager().clone(),
|
||||
query_ctx.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let phy_expr = time_window_expr
|
||||
.map(|expr| TimeWindowExpr::from_expr(&expr, &column_name, &df_schema))
|
||||
.transpose()?;
|
||||
|
||||
info!("Flow id={}, found time window expr={:?}", flow_id, phy_expr);
|
||||
|
||||
let task = RecordingRuleTask::new(
|
||||
flow_id,
|
||||
&sql,
|
||||
phy_expr,
|
||||
expire_after,
|
||||
sink_table_name,
|
||||
source_table_names,
|
||||
query_ctx,
|
||||
rx,
|
||||
);
|
||||
|
||||
let task_inner = task.clone();
|
||||
let engine = self.engine.clone();
|
||||
let frontend = self.frontend_client.clone();
|
||||
|
||||
// TODO(discord9): also save the handle & use a timer wheel or similar for better scheduling
|
||||
let _handle = common_runtime::spawn_global(async move {
|
||||
match task_inner.start_executing(engine, frontend).await {
|
||||
Ok(()) => info!("Flow {} shutdown", task_inner.flow_id),
|
||||
Err(err) => common_telemetry::error!(
|
||||
"Flow {} encounter unrecoverable error: {err:?}",
|
||||
task_inner.flow_id
|
||||
),
|
||||
}
|
||||
});
|
||||
|
||||
// TODO(discord9): deal with replace logic
|
||||
let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
|
||||
drop(replaced_old_task_opt);
|
||||
|
||||
self.shutdown_txs.write().await.insert(flow_id, tx);
|
||||
|
||||
Ok(Some(flow_id))
|
||||
}
|
||||
|
||||
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
|
||||
if self.tasks.write().await.remove(&flow_id).is_none() {
|
||||
warn!("Flow {flow_id} not found in tasks")
|
||||
}
|
||||
let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
|
||||
UnexpectedSnafu {
|
||||
reason: format!("Can't found shutdown tx for flow {flow_id}"),
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
if tx.send(()).is_err() {
|
||||
warn!("Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?")
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RecordingRuleTask {
|
||||
pub flow_id: FlowId,
|
||||
query: String,
|
||||
pub time_window_expr: Option<TimeWindowExpr>,
|
||||
/// in seconds
|
||||
pub expire_after: Option<i64>,
|
||||
sink_table_name: [String; 3],
|
||||
source_table_names: HashSet<[String; 3]>,
|
||||
state: Arc<RwLock<RecordingRuleState>>,
|
||||
}
|
||||
|
||||
impl RecordingRuleTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
flow_id: FlowId,
|
||||
query: &str,
|
||||
time_window_expr: Option<TimeWindowExpr>,
|
||||
expire_after: Option<i64>,
|
||||
sink_table_name: [String; 3],
|
||||
source_table_names: Vec<[String; 3]>,
|
||||
query_ctx: QueryContextRef,
|
||||
shutdown_rx: oneshot::Receiver<()>,
|
||||
) -> Self {
|
||||
Self {
|
||||
flow_id,
|
||||
query: query.to_string(),
|
||||
time_window_expr,
|
||||
expire_after,
|
||||
sink_table_name,
|
||||
source_table_names: source_table_names.into_iter().collect(),
|
||||
state: Arc::new(RwLock::new(RecordingRuleState::new(query_ctx, shutdown_rx))),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl RecordingRuleTask {
|
||||
/// This should be called in a new tokio task
|
||||
pub async fn start_executing(
|
||||
&self,
|
||||
engine: QueryEngineRef,
|
||||
frontend_client: Arc<FrontendClient>,
|
||||
) -> Result<(), Error> {
|
||||
// only the first query doesn't need an upper bound
|
||||
let mut is_first = true;
|
||||
|
||||
loop {
|
||||
// FIXME(discord9): test whether also requiring an upper bound works
|
||||
let new_query = self.gen_query_with_time_window(engine.clone()).await?;
|
||||
|
||||
let insert_into = if let Some(new_query) = new_query {
|
||||
format!(
|
||||
"INSERT INTO {}.{}.{} {}",
|
||||
self.sink_table_name[0],
|
||||
self.sink_table_name[1],
|
||||
self.sink_table_name[2],
|
||||
new_query
|
||||
)
|
||||
} else {
|
||||
tokio::time::sleep(MIN_REFRESH_DURATION).await;
|
||||
continue;
|
||||
};
|
||||
|
||||
if is_first {
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
let instant = Instant::now();
|
||||
let flow_id = self.flow_id;
|
||||
let db_client = frontend_client.get_database_client().await?;
|
||||
let peer_addr = db_client.peer.addr;
|
||||
debug!(
|
||||
"Executing flow {flow_id}(expire_after={:?} secs) on {:?} with query {}",
|
||||
self.expire_after, peer_addr, &insert_into
|
||||
);
|
||||
|
||||
let timer = METRIC_FLOW_RULE_ENGINE_QUERY_TIME
|
||||
.with_label_values(&[flow_id.to_string().as_str()])
|
||||
.start_timer();
|
||||
|
||||
let res = db_client.database.sql(&insert_into).await;
|
||||
drop(timer);
|
||||
|
||||
let elapsed = instant.elapsed();
|
||||
if let Ok(res1) = &res {
|
||||
debug!(
|
||||
"Flow {flow_id} executed, result: {res1:?}, elapsed: {:?}",
|
||||
elapsed
|
||||
);
|
||||
} else if let Err(res) = &res {
|
||||
warn!(
|
||||
"Failed to execute Flow {flow_id} on frontend {}, result: {res:?}, elapsed: {:?} with query: {}",
|
||||
peer_addr, elapsed, &insert_into
|
||||
);
|
||||
}
|
||||
|
||||
// record slow query
|
||||
if elapsed >= SLOW_QUERY_THRESHOLD {
|
||||
warn!(
|
||||
"Flow {flow_id} on frontend {} executed for {:?} before complete, query: {}",
|
||||
peer_addr, elapsed, &insert_into
|
||||
);
|
||||
METRIC_FLOW_RULE_ENGINE_SLOW_QUERY
|
||||
.with_label_values(&[flow_id.to_string().as_str(), &insert_into, &peer_addr])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
|
||||
self.state
|
||||
.write()
|
||||
.await
|
||||
.after_query_exec(elapsed, res.is_ok());
|
||||
// drop the result to free client-related resources
|
||||
drop(res);
|
||||
|
||||
let sleep_until = {
|
||||
let mut state = self.state.write().await;
|
||||
match state.shutdown_rx.try_recv() {
|
||||
Ok(()) => break Ok(()),
|
||||
Err(TryRecvError::Closed) => {
|
||||
warn!("Unexpected shutdown flow {flow_id}, shutdown anyway");
|
||||
break Ok(());
|
||||
}
|
||||
Err(TryRecvError::Empty) => (),
|
||||
}
|
||||
state.get_next_start_query_time(None)
|
||||
};
|
||||
tokio::time::sleep_until(sleep_until).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge dirty time windows and use at most the first `MAX_FILTER_NUM` of them in the query
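///
/// Roughly: compute the expire lower bound from `expire_after`, evaluate the time window
/// expr on it to get the window size, generate filter exprs from the dirty time windows,
/// rewrite the original plan with `AddFilterRewriter`, and unparse it back to SQL.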
|
||||
async fn gen_query_with_time_window(
|
||||
&self,
|
||||
engine: QueryEngineRef,
|
||||
) -> Result<Option<String>, Error> {
|
||||
let query_ctx = self.state.read().await.query_ctx.clone();
|
||||
let start = SystemTime::now();
|
||||
let since_the_epoch = start
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.expect("Time went backwards");
|
||||
let low_bound = self
|
||||
.expire_after
|
||||
.map(|e| since_the_epoch.as_secs() - e as u64)
|
||||
.unwrap_or(u64::MIN);
|
||||
|
||||
let low_bound = Timestamp::new_second(low_bound as i64);
|
||||
|
||||
// TODO(discord9): use time window expr to get the precise expire lower bound
|
||||
let expire_time_window_bound = self
|
||||
.time_window_expr
|
||||
.as_ref()
|
||||
.map(|expr| expr.eval(low_bound))
|
||||
.transpose()?;
|
||||
|
||||
let new_sql = {
|
||||
let expr = {
|
||||
match expire_time_window_bound {
|
||||
Some((Some(l), Some(u))) => {
|
||||
let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
|
||||
reason: format!("Can't get window size from {u:?} - {l:?}"),
|
||||
})?;
|
||||
let col_name = self
|
||||
.time_window_expr
|
||||
.as_ref()
|
||||
.map(|expr| expr.column_name.clone())
|
||||
.with_context(|| UnexpectedSnafu {
|
||||
reason: format!(
|
||||
"Flow id={:?}, Failed to get column name from time window expr",
|
||||
self.flow_id
|
||||
),
|
||||
})?;
|
||||
|
||||
self.state
|
||||
.write()
|
||||
.await
|
||||
.dirty_time_windows
|
||||
.gen_filter_exprs(&col_name, Some(l), window_size, self)?
|
||||
}
|
||||
_ => {
|
||||
debug!(
|
||||
"Flow id = {:?}, can't get window size: precise_lower_bound={expire_time_window_bound:?}, using the same query", self.flow_id
|
||||
);
|
||||
// since no time window lower/upper bound is found, just return the original query
|
||||
return Ok(Some(self.query.clone()));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
debug!(
|
||||
"Flow id={:?}, Generated filter expr: {:?}",
|
||||
self.flow_id,
|
||||
expr.as_ref()
|
||||
.map(|expr| expr_to_sql(expr).with_context(|_| DatafusionSnafu {
|
||||
context: format!("Failed to generate filter expr from {expr:?}"),
|
||||
}))
|
||||
.transpose()?
|
||||
.map(|s| s.to_string())
|
||||
);
|
||||
|
||||
let Some(expr) = expr else {
|
||||
// no new data, hence no need to update
|
||||
debug!("Flow id={:?}, no new data, not update", self.flow_id);
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let mut add_filter = AddFilterRewriter::new(expr);
|
||||
// build an unoptimized plan for clearer unparsing
|
||||
let plan =
|
||||
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, false).await?;
|
||||
let plan = plan
|
||||
.clone()
|
||||
.rewrite(&mut add_filter)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Failed to rewrite plan {plan:?}"),
|
||||
})?
|
||||
.data;
|
||||
df_plan_to_sql(&plan)?
|
||||
};
|
||||
|
||||
Ok(Some(new_sql))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RecordingRuleState {
|
||||
query_ctx: QueryContextRef,
|
||||
/// last query complete time
|
||||
last_update_time: Instant,
|
||||
/// duration of the last query
|
||||
last_query_duration: Duration,
|
||||
/// Dirty time windows that need to be updated,
|
||||
/// stored as a non-overlapping mapping of `start -> end`
|
||||
dirty_time_windows: DirtyTimeWindows,
|
||||
exec_state: ExecState,
|
||||
shutdown_rx: oneshot::Receiver<()>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct DirtyTimeWindows {
|
||||
windows: BTreeMap<Timestamp, Option<Timestamp>>,
|
||||
}
|
||||
|
||||
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
|
||||
let value = Value::from(value);
|
||||
let value = value
|
||||
.try_to_scalar_value(&value.data_type())
|
||||
.with_context(|_| DatatypesSnafu {
|
||||
extra: format!("Failed to convert to scalar value: {}", value),
|
||||
})?;
|
||||
Ok(value)
|
||||
}
|
||||
|
||||
impl DirtyTimeWindows {
|
||||
/// Time window merge distance, in multiples of the window size
|
||||
const MERGE_DIST: i32 = 3;
|
||||
|
||||
/// Maximum number of filters allowed in a single query
|
||||
const MAX_FILTER_NUM: usize = 20;
|
||||
|
||||
/// Add lower bounds to the dirty time windows. Upper bounds are ignored.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `lower_bounds` - An iterator of lower bounds to be added.
|
||||
pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
|
||||
for lower_bound in lower_bounds {
|
||||
let entry = self.windows.entry(lower_bound);
|
||||
entry.or_insert(None);
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate filter expressions from the dirty time windows, consuming at most `MAX_FILTER_NUM` of them; the rest stay queued for later queries
|
||||
pub fn gen_filter_exprs(
|
||||
&mut self,
|
||||
col_name: &str,
|
||||
expire_lower_bound: Option<Timestamp>,
|
||||
window_size: chrono::Duration,
|
||||
task_ctx: &RecordingRuleTask,
|
||||
) -> Result<Option<datafusion_expr::Expr>, Error> {
|
||||
debug!(
|
||||
"expire_lower_bound: {:?}, window_size: {:?}",
|
||||
expire_lower_bound.map(|t| t.to_iso8601_string()),
|
||||
window_size
|
||||
);
|
||||
self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
|
||||
|
||||
if self.windows.len() > Self::MAX_FILTER_NUM {
|
||||
let first_time_window = self.windows.first_key_value();
|
||||
let last_time_window = self.windows.last_key_value();
|
||||
warn!(
|
||||
"Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
|
||||
task_ctx.flow_id,
|
||||
self.windows.len(),
|
||||
Self::MAX_FILTER_NUM,
|
||||
task_ctx.time_window_expr,
|
||||
task_ctx.expire_after,
|
||||
first_time_window,
|
||||
last_time_window,
|
||||
task_ctx.query
|
||||
);
|
||||
}
|
||||
|
||||
// get the first `MAX_FILTER_NUM` time windows
|
||||
let nth = self
|
||||
.windows
|
||||
.iter()
|
||||
.nth(Self::MAX_FILTER_NUM)
|
||||
.map(|(key, _)| *key);
|
||||
let first_nth = {
|
||||
if let Some(nth) = nth {
|
||||
let mut after = self.windows.split_off(&nth);
|
||||
std::mem::swap(&mut self.windows, &mut after);
|
||||
|
||||
after
|
||||
} else {
|
||||
std::mem::take(&mut self.windows)
|
||||
}
|
||||
};
|
||||
|
||||
let mut expr_lst = vec![];
|
||||
for (start, end) in first_nth.into_iter() {
|
||||
debug!(
|
||||
"Time window start: {:?}, end: {:?}",
|
||||
start.to_iso8601_string(),
|
||||
end.map(|t| t.to_iso8601_string())
|
||||
);
|
||||
|
||||
use datafusion_expr::{col, lit};
|
||||
let lower = to_df_literal(start)?;
|
||||
let upper = end.map(to_df_literal).transpose()?;
|
||||
let expr = if let Some(upper) = upper {
|
||||
col(col_name)
|
||||
.gt_eq(lit(lower))
|
||||
.and(col(col_name).lt(lit(upper)))
|
||||
} else {
|
||||
col(col_name).gt_eq(lit(lower))
|
||||
};
|
||||
expr_lst.push(expr);
|
||||
}
|
||||
let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
|
||||
Ok(expr)
|
||||
}
|
||||
|
||||
/// Merge time windows that overlap or get too close to each other
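///
/// e.g. with a 5-minute window size and `MERGE_DIST` = 3, windows starting at `00:00`
/// and `00:15` are merged into `[00:00, 00:20)`, because the gap between `00:05`
/// (the previous upper bound) and `00:15` is within `window_size * MERGE_DIST`.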
|
||||
pub fn merge_dirty_time_windows(
|
||||
&mut self,
|
||||
window_size: chrono::Duration,
|
||||
expire_lower_bound: Option<Timestamp>,
|
||||
) -> Result<(), Error> {
|
||||
let mut new_windows = BTreeMap::new();
|
||||
|
||||
let mut prev_tw = None;
|
||||
for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
|
||||
// filter out expired time window
|
||||
if let Some(expire_lower_bound) = expire_lower_bound {
|
||||
if lower_bound <= expire_lower_bound {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let Some(prev_tw) = &mut prev_tw else {
|
||||
prev_tw = Some((lower_bound, upper_bound));
|
||||
continue;
|
||||
};
|
||||
|
||||
let std_window_size = window_size.to_std().map_err(|e| {
|
||||
InternalSnafu {
|
||||
reason: e.to_string(),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
|
||||
// if cur.lower - prev.upper <= window_size * MERGE_DIST, merge
|
||||
let prev_upper = prev_tw
|
||||
.1
|
||||
.unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
|
||||
prev_tw.1 = Some(prev_upper);
|
||||
|
||||
let cur_upper = upper_bound.unwrap_or(
|
||||
lower_bound
|
||||
.add_duration(std_window_size)
|
||||
.context(TimeSnafu)?,
|
||||
);
|
||||
|
||||
if lower_bound
|
||||
.sub(&prev_upper)
|
||||
.map(|dist| dist <= window_size * Self::MERGE_DIST)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
prev_tw.1 = Some(cur_upper);
|
||||
} else {
|
||||
new_windows.insert(prev_tw.0, prev_tw.1);
|
||||
*prev_tw = (lower_bound, Some(cur_upper));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(prev_tw) = prev_tw {
|
||||
new_windows.insert(prev_tw.0, prev_tw.1);
|
||||
}
|
||||
|
||||
self.windows = new_windows;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl RecordingRuleState {
|
||||
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
|
||||
Self {
|
||||
query_ctx,
|
||||
last_update_time: Instant::now(),
|
||||
last_query_duration: Duration::from_secs(0),
|
||||
dirty_time_windows: Default::default(),
|
||||
exec_state: ExecState::Idle,
|
||||
shutdown_rx,
|
||||
}
|
||||
}
|
||||
|
||||
/// called after last query is done
|
||||
/// `is_succ` indicates whether the last query was successful
|
||||
pub fn after_query_exec(&mut self, elapsed: Duration, _is_succ: bool) {
|
||||
self.exec_state = ExecState::Idle;
|
||||
self.last_query_duration = elapsed;
|
||||
self.last_update_time = Instant::now();
|
||||
}
|
||||
|
||||
/// wait for at most `last_query_duration` (further capped by `max_timeout` if given), but at least `MIN_REFRESH_DURATION`, before starting the next query
|
||||
pub fn get_next_start_query_time(&self, max_timeout: Option<Duration>) -> Instant {
|
||||
let next_duration = max_timeout
|
||||
.unwrap_or(self.last_query_duration)
|
||||
.min(self.last_query_duration);
|
||||
let next_duration = next_duration.max(MIN_REFRESH_DURATION);
|
||||
|
||||
self.last_update_time + next_duration
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
enum ExecState {
|
||||
Idle,
|
||||
Executing,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_merge_dirty_time_windows() {
|
||||
let mut dirty = DirtyTimeWindows::default();
|
||||
dirty.add_lower_bounds(
|
||||
vec![
|
||||
Timestamp::new_second(0),
|
||||
Timestamp::new_second((1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
|
||||
]
|
||||
.into_iter(),
|
||||
);
|
||||
dirty
|
||||
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
|
||||
.unwrap();
|
||||
// just enough to merge
|
||||
assert_eq!(
|
||||
dirty.windows,
|
||||
BTreeMap::from([(
|
||||
Timestamp::new_second(0),
|
||||
Some(Timestamp::new_second(
|
||||
(2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
|
||||
))
|
||||
)])
|
||||
);
|
||||
|
||||
// separate time window
|
||||
let mut dirty = DirtyTimeWindows::default();
|
||||
dirty.add_lower_bounds(
|
||||
vec![
|
||||
Timestamp::new_second(0),
|
||||
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
|
||||
]
|
||||
.into_iter(),
|
||||
);
|
||||
dirty
|
||||
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
|
||||
.unwrap();
|
||||
// just beyond the merge distance, so the windows stay separate
|
||||
assert_eq!(
|
||||
BTreeMap::from([
|
||||
(
|
||||
Timestamp::new_second(0),
|
||||
Some(Timestamp::new_second(5 * 60))
|
||||
),
|
||||
(
|
||||
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
|
||||
Some(Timestamp::new_second(
|
||||
(3 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
|
||||
))
|
||||
)
|
||||
]),
|
||||
dirty.windows
|
||||
);
|
||||
|
||||
// overlapping
|
||||
let mut dirty = DirtyTimeWindows::default();
|
||||
dirty.add_lower_bounds(
|
||||
vec![
|
||||
Timestamp::new_second(0),
|
||||
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
|
||||
]
|
||||
.into_iter(),
|
||||
);
|
||||
dirty
|
||||
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
|
||||
.unwrap();
|
||||
// within the merge distance, so the windows are merged
|
||||
assert_eq!(
|
||||
BTreeMap::from([(
|
||||
Timestamp::new_second(0),
|
||||
Some(Timestamp::new_second(
|
||||
(1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
|
||||
))
|
||||
),]),
|
||||
dirty.windows
|
||||
);
|
||||
|
||||
// expired
|
||||
let mut dirty = DirtyTimeWindows::default();
|
||||
dirty.add_lower_bounds(
|
||||
vec![
|
||||
Timestamp::new_second(0),
|
||||
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
|
||||
]
|
||||
.into_iter(),
|
||||
);
|
||||
dirty
|
||||
.merge_dirty_time_windows(
|
||||
chrono::Duration::seconds(5 * 60),
|
||||
Some(Timestamp::new_second(
|
||||
(DirtyTimeWindows::MERGE_DIST as i64) * 6 * 60,
|
||||
)),
|
||||
)
|
||||
.unwrap();
|
||||
// all time windows are expired, so nothing remains
|
||||
assert_eq!(BTreeMap::from([]), dirty.windows);
|
||||
}
|
||||
}
|
||||
@@ -1,163 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Frontend client to run a flow as a recording rule, i.e. a time-window-aware normal query triggered on every tick set by the user
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
|
||||
use common_error::ext::BoxedError;
|
||||
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
|
||||
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::store::RangeRequest;
|
||||
use meta_client::client::MetaClient;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{ExternalSnafu, UnexpectedSnafu};
|
||||
use crate::recording_rules::engine::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
|
||||
use crate::Error;
|
||||
|
||||
fn default_channel_mgr() -> ChannelManager {
|
||||
let cfg = ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT);
|
||||
ChannelManager::with_config(cfg)
|
||||
}
|
||||
|
||||
fn client_from_urls(addrs: Vec<String>) -> Client {
|
||||
Client::with_manager_and_urls(default_channel_mgr(), addrs)
|
||||
}
|
||||
|
||||
/// A simple frontend client able to execute SQL using the gRPC protocol
|
||||
#[derive(Debug)]
|
||||
pub enum FrontendClient {
|
||||
Distributed {
|
||||
meta_client: Arc<MetaClient>,
|
||||
channel_mgr: ChannelManager,
|
||||
},
|
||||
Standalone {
|
||||
/// for the sake of simplicity, still use gRPC even in standalone mode
|
||||
/// note that the client here should be lazy, so it can wait until the frontend is booted before making a connection
|
||||
/// TODO(discord9): not use grpc under standalone mode
|
||||
database_client: DatabaseWithPeer,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DatabaseWithPeer {
|
||||
pub database: Database,
|
||||
pub peer: Peer,
|
||||
}
|
||||
|
||||
impl DatabaseWithPeer {
|
||||
fn new(database: Database, peer: Peer) -> Self {
|
||||
Self { database, peer }
|
||||
}
|
||||
}
|
||||
|
||||
impl FrontendClient {
|
||||
pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
|
||||
Self::Distributed {
|
||||
meta_client,
|
||||
channel_mgr: default_channel_mgr(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_static_grpc_addr(addr: String) -> Self {
|
||||
let peer = Peer {
|
||||
id: 0,
|
||||
addr: addr.clone(),
|
||||
};
|
||||
|
||||
let mgr = default_channel_mgr();
|
||||
let client = Client::with_manager_and_urls(mgr.clone(), vec![addr]);
|
||||
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
|
||||
Self::Standalone {
|
||||
database_client: DatabaseWithPeer::new(database, peer),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FrontendClient {
|
||||
async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
|
||||
let Self::Distributed { meta_client, .. } = self else {
|
||||
return Ok(vec![]);
|
||||
};
|
||||
let cluster_client = meta_client
|
||||
.cluster_client()
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
let prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend);
|
||||
let req = RangeRequest::new().with_prefix(prefix);
|
||||
let resp = cluster_client
|
||||
.range(req)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let mut res = Vec::with_capacity(resp.kvs.len());
|
||||
for kv in resp.kvs {
|
||||
let key = NodeInfoKey::try_from(kv.key)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
let val = NodeInfo::try_from(kv.value)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
res.push((key, val));
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Get the database with max `last_activity_ts`
|
||||
async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
|
||||
if let Self::Standalone { database_client } = self {
|
||||
return Ok(database_client.clone());
|
||||
}
|
||||
match &self {
|
||||
Self::Standalone { database_client } => Ok(database_client.clone()),
|
||||
Self::Distributed {
|
||||
meta_client: _,
|
||||
channel_mgr,
|
||||
} => {
|
||||
let frontends = self.scan_for_frontend().await?;
|
||||
let mut last_activity_ts = i64::MIN;
|
||||
let mut peer = None;
|
||||
for (_key, val) in frontends.iter() {
|
||||
if val.last_activity_ts > last_activity_ts {
|
||||
last_activity_ts = val.last_activity_ts;
|
||||
peer = Some(val.peer.clone());
|
||||
}
|
||||
}
|
||||
let Some(peer) = peer else {
|
||||
UnexpectedSnafu {
|
||||
reason: format!("No frontend available: {:?}", frontends),
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
let client =
|
||||
Client::with_manager_and_urls(channel_mgr.clone(), vec![peer.addr.clone()]);
|
||||
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
|
||||
Ok(DatabaseWithPeer::new(database, peer))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a database client, and possibly update it before returning.
|
||||
pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
|
||||
match self {
|
||||
Self::Standalone { database_client } => Ok(database_client.clone()),
|
||||
Self::Distributed { meta_client: _, .. } => self.get_last_active_frontend().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -57,7 +57,6 @@ use crate::error::{
|
||||
};
|
||||
use crate::heartbeat::HeartbeatTask;
|
||||
use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
|
||||
use crate::recording_rules::{FrontendClient, RecordingRuleEngine};
|
||||
use crate::transform::register_function_to_query_engine;
|
||||
use crate::utils::{SizeReportSender, StateReportHandler};
|
||||
use crate::{Error, FlowWorkerManager, FlownodeOptions};
|
||||
@@ -246,7 +245,6 @@ impl FlownodeInstance {
|
||||
self.server.shutdown().await.context(ShutdownServerSnafu)?;
|
||||
|
||||
if let Some(task) = &self.heartbeat_task {
|
||||
info!("Close heartbeat task for flownode");
|
||||
task.shutdown();
|
||||
}
|
||||
|
||||
@@ -273,8 +271,6 @@ pub struct FlownodeBuilder {
|
||||
heartbeat_task: Option<HeartbeatTask>,
|
||||
/// receive a oneshot sender to send state size report
|
||||
state_report_handler: Option<StateReportHandler>,
|
||||
/// Client to send sql to frontend
|
||||
frontend_client: Arc<FrontendClient>,
|
||||
}
|
||||
|
||||
impl FlownodeBuilder {
|
||||
@@ -285,7 +281,6 @@ impl FlownodeBuilder {
|
||||
table_meta: TableMetadataManagerRef,
|
||||
catalog_manager: CatalogManagerRef,
|
||||
flow_metadata_manager: FlowMetadataManagerRef,
|
||||
frontend_client: Arc<FrontendClient>,
|
||||
) -> Self {
|
||||
Self {
|
||||
opts,
|
||||
@@ -295,7 +290,6 @@ impl FlownodeBuilder {
|
||||
flow_metadata_manager,
|
||||
heartbeat_task: None,
|
||||
state_report_handler: None,
|
||||
frontend_client,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -453,14 +447,7 @@ impl FlownodeBuilder {
|
||||
|
||||
let node_id = self.opts.node_id.map(|id| id as u32);
|
||||
|
||||
let rule_engine = RecordingRuleEngine::new(
|
||||
self.frontend_client.clone(),
|
||||
query_engine.clone(),
|
||||
self.flow_metadata_manager.clone(),
|
||||
table_meta.clone(),
|
||||
);
|
||||
|
||||
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta, rule_engine);
|
||||
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
|
||||
for worker_id in 0..num_workers {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
|
||||
|
||||
@@ -86,8 +86,7 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
|
||||
|
||||
let schema = vec![
|
||||
datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
|
||||
];
|
||||
let mut columns = vec![];
|
||||
let numbers = (1..=10).collect_vec();
|
||||
@@ -115,37 +114,6 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
|
||||
};
|
||||
catalog_list.register_table_sync(req_with_ts).unwrap();
|
||||
|
||||
let schema = vec![
|
||||
datatypes::schema::ColumnSchema::new("NUMBER", CDT::uint32_datatype(), false),
|
||||
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
];
|
||||
let mut columns = vec![];
|
||||
let numbers = (1..=10).collect_vec();
|
||||
let column: VectorRef = Arc::new(<u32 as Scalar>::VectorType::from_vec(numbers));
|
||||
columns.push(column);
|
||||
|
||||
let ts = (1..=10).collect_vec();
|
||||
let mut builder = TimestampMillisecondVectorBuilder::with_capacity(10);
|
||||
ts.into_iter()
|
||||
.map(|v| builder.push(Some(TimestampMillisecond::new(v))))
|
||||
.count();
|
||||
let column: VectorRef = builder.to_vector_cloned();
|
||||
columns.push(column);
|
||||
|
||||
let schema = Arc::new(Schema::new(schema));
|
||||
let recordbatch = common_recordbatch::RecordBatch::new(schema, columns).unwrap();
|
||||
let table = MemTable::table("UPPERCASE_NUMBERS_WITH_TS", recordbatch);
|
||||
|
||||
let req_with_ts = RegisterTableRequest {
|
||||
catalog: DEFAULT_CATALOG_NAME.to_string(),
|
||||
schema: DEFAULT_SCHEMA_NAME.to_string(),
|
||||
table_name: "UPPERCASE_NUMBERS_WITH_TS".to_string(),
|
||||
table_id: 1025,
|
||||
table,
|
||||
};
|
||||
catalog_list.register_table_sync(req_with_ts).unwrap();
|
||||
|
||||
let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
|
||||
|
||||
let engine = factory.query_engine();
|
||||
|
||||
@@ -238,13 +238,6 @@ pub enum Error {
|
||||
source: servers::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to create logical plan for prometheus label values query"))]
|
||||
PrometheusLabelValuesQueryPlan {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
source: query::promql::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to describe schema for given statement"))]
|
||||
DescribeStatement {
|
||||
#[snafu(implicit)]
|
||||
@@ -373,8 +366,6 @@ impl ErrorExt for Error {
|
||||
| Error::PrometheusMetricNamesQueryPlan { source, .. }
|
||||
| Error::ExecutePromql { source, .. } => source.status_code(),
|
||||
|
||||
Error::PrometheusLabelValuesQueryPlan { source, .. } => source.status_code(),
|
||||
|
||||
Error::CollectRecordbatch { .. } => StatusCode::EngineExecuteQuery,
|
||||
|
||||
Error::SqlExecIntercepted { source, .. } => source.status_code(),
|
||||
|
||||
@@ -26,7 +26,6 @@ mod region_query;
|
||||
pub mod standalone;
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
|
||||
@@ -472,21 +471,6 @@ impl PrometheusHandler for Instance {
|
||||
.context(ExecuteQuerySnafu)
|
||||
}
|
||||
|
||||
async fn query_label_values(
|
||||
&self,
|
||||
metric: String,
|
||||
label_name: String,
|
||||
matchers: Vec<Matcher>,
|
||||
start: SystemTime,
|
||||
end: SystemTime,
|
||||
ctx: &QueryContextRef,
|
||||
) -> server_error::Result<Vec<String>> {
|
||||
self.handle_query_label_values(metric, label_name, matchers, start, end, ctx)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExecuteQuerySnafu)
|
||||
}
|
||||
|
||||
fn catalog_manager(&self) -> CatalogManagerRef {
|
||||
self.catalog_manager.clone()
|
||||
}
|
||||
|
||||
@@ -12,26 +12,20 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::time::SystemTime;
|
||||
|
||||
use catalog::information_schema::TABLES;
|
||||
use client::OutputData;
|
||||
use common_catalog::consts::INFORMATION_SCHEMA_NAME;
|
||||
use common_catalog::format_full_table_name;
|
||||
use common_recordbatch::util;
|
||||
use common_telemetry::tracing;
|
||||
use datatypes::prelude::Value;
|
||||
use promql_parser::label::{Matcher, Matchers};
|
||||
use query::promql;
|
||||
use query::promql::planner::PromPlanner;
|
||||
use promql_parser::label::Matcher;
|
||||
use servers::prometheus;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
|
||||
use crate::error::{
|
||||
CatalogSnafu, CollectRecordbatchSnafu, ExecLogicalPlanSnafu,
|
||||
PrometheusLabelValuesQueryPlanSnafu, PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu,
|
||||
Result, TableNotFoundSnafu,
|
||||
PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu, Result, TableNotFoundSnafu,
|
||||
};
|
||||
use crate::instance::Instance;
|
||||
|
||||
@@ -102,77 +96,4 @@ impl Instance {

        Ok(results)
    }

    /// Handles label values query request, returns the values.
    #[tracing::instrument(skip_all)]
    pub(crate) async fn handle_query_label_values(
        &self,
        metric: String,
        label_name: String,
        matchers: Vec<Matcher>,
        start: SystemTime,
        end: SystemTime,
        ctx: &QueryContextRef,
    ) -> Result<Vec<String>> {
        let table_schema = ctx.current_schema();
        let table = self
            .catalog_manager
            .table(ctx.current_catalog(), &table_schema, &metric, Some(ctx))
            .await
            .context(CatalogSnafu)?
            .with_context(|| TableNotFoundSnafu {
                table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
            })?;

        let dataframe = self
            .query_engine
            .read_table(table.clone())
            .with_context(|_| ReadTableSnafu {
                table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
            })?;

        let scan_plan = dataframe.into_logical_plan();
        let filter_conditions =
            PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
                .context(PrometheusLabelValuesQueryPlanSnafu)?;
        let logical_plan = promql::label_values::rewrite_label_values_query(
            table,
            scan_plan,
            filter_conditions,
            label_name,
            start,
            end,
        )
        .context(PrometheusLabelValuesQueryPlanSnafu)?;

        let results = self
            .query_engine
            .execute(logical_plan, ctx.clone())
            .await
            .context(ExecLogicalPlanSnafu)?;

        let batches = match results.data {
            OutputData::Stream(stream) => util::collect(stream)
                .await
                .context(CollectRecordbatchSnafu)?,
            OutputData::RecordBatches(rbs) => rbs.take(),
            _ => unreachable!("should not happen"),
        };

        let mut results = Vec::with_capacity(batches.iter().map(|b| b.num_rows()).sum());
        for batch in batches {
            // Only one column in the results, ensured by `prometheus::label_values_matchers_to_plan`.
            let names = batch.column(0);

            for i in 0..names.len() {
                let Value::String(name) = names.get(i) else {
                    unreachable!();
                };

                results.push(name.into_string());
            }
        }

        Ok(results)
    }
}

@@ -29,7 +29,6 @@ prost.workspace = true
puffin.workspace = true
regex.workspace = true
regex-automata.workspace = true
roaring = "0.10"
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true

@@ -1,868 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::io;
use std::ops::RangeInclusive;

use common_base::BitVec;
/// `BitmapType` enumerates how bitmaps are encoded within the inverted index.
pub use greptime_proto::v1::index::BitmapType;
use roaring::RoaringBitmap;

/// A bitmap representation supporting both BitVec and RoaringBitmap formats.
///
/// This enum provides unified bitmap operations while allowing efficient storage
/// in different formats. The implementation automatically handles type conversions
/// when performing operations between different formats.
///
/// # Examples
///
/// Creating a new Roaring bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::new_roaring();
/// assert!(bitmap.is_empty());
/// ```
///
/// Creating a full BitVec bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::full_bitvec(10);
/// assert_eq!(bitmap.count_ones(), 10);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum Bitmap {
    Roaring(RoaringBitmap),
    BitVec(BitVec),
}

impl Bitmap {
    /// Creates a new empty BitVec-based bitmap.
    pub fn new_bitvec() -> Self {
        Bitmap::BitVec(BitVec::EMPTY)
    }

    /// Creates a new empty RoaringBitmap-based bitmap.
    pub fn new_roaring() -> Self {
        Bitmap::Roaring(RoaringBitmap::new())
    }

    /// Creates a full BitVec-based bitmap with all bits set to 1.
    ///
    /// # Arguments
    /// * `size` - The number of bits to allocate and set
    pub fn full_bitvec(size: usize) -> Self {
        Bitmap::BitVec(BitVec::repeat(true, size))
    }

    /// Creates a full RoaringBitmap-based bitmap with bits 0..size set to 1.
    ///
    /// # Arguments
    /// * `size` - The exclusive upper bound for the bit range
    pub fn full_roaring(size: usize) -> Self {
        let mut roaring = RoaringBitmap::new();
        roaring.insert_range(0..size as u32);
        Bitmap::Roaring(roaring)
    }

    /// Returns the number of bits set to 1 in the bitmap.
    pub fn count_ones(&self) -> usize {
        match self {
            Bitmap::BitVec(bitvec) => bitvec.count_ones(),
            Bitmap::Roaring(roaring) => roaring.len() as _,
        }
    }

    /// Checks if the bitmap contains no set bits.
    pub fn is_empty(&self) -> bool {
        match self {
            Bitmap::BitVec(bitvec) => bitvec.is_empty(),
            Bitmap::Roaring(roaring) => roaring.is_empty(),
        }
    }

    /// Inserts a range of bits into the bitmap.
    ///
    /// # Arguments
    /// * `range` - Inclusive range of bits to set
    pub fn insert_range(&mut self, range: RangeInclusive<usize>) {
        match self {
            Bitmap::BitVec(bitvec) => {
                if *range.end() >= bitvec.len() {
                    bitvec.resize(range.end() + 1, false);
                }
                for i in range {
                    bitvec.set(i, true);
                }
            }
            Bitmap::Roaring(roaring) => {
                let range = *range.start() as u32..=*range.end() as u32;
                roaring.insert_range(range);
            }
        }
    }

    /// Serializes the bitmap into a byte buffer using the specified format.
    ///
    /// # Arguments
    /// * `serialize_type` - Target format for serialization
    /// * `writer` - Output writer to write the serialized data
    pub fn serialize_into(
        &self,
        serialize_type: BitmapType,
        mut writer: impl io::Write,
    ) -> io::Result<()> {
        match (self, serialize_type) {
            (Bitmap::BitVec(bitvec), BitmapType::BitVec) => {
                writer.write_all(bitvec.as_raw_slice())?;
            }
            (Bitmap::Roaring(roaring), BitmapType::Roaring) => {
                roaring.serialize_into(writer)?;
            }
            (Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
                let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
                bitmap.serialize_into(writer)?;
            }
            (Bitmap::Roaring(roaring), BitmapType::BitVec) => {
                let bitvec = Bitmap::roaring_to_bitvec(roaring);
                writer.write_all(bitvec.as_raw_slice())?;
            }
        }

        Ok(())
    }

    /// Computes the size of the serialized bitmap in bytes.
    ///
    /// # Arguments
    /// * `bitmap_type` - Format of data to be serialized
    pub fn serialized_size(&self, bitmap_type: BitmapType) -> usize {
        match (self, bitmap_type) {
            (Bitmap::BitVec(bitvec), BitmapType::BitVec) => bitvec.as_raw_slice().len(),
            (Bitmap::Roaring(roaring), BitmapType::Roaring) => roaring.serialized_size(),
            (Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
                let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
                bitmap.serialized_size()
            }
            (Bitmap::Roaring(roaring), BitmapType::BitVec) => {
                let bitvec = Bitmap::roaring_to_bitvec(roaring);
                bitvec.as_raw_slice().len()
            }
        }
    }

    /// Deserializes a bitmap from a byte buffer.
    ///
    /// # Arguments
    /// * `buf` - Input buffer containing serialized data
    /// * `bitmap_type` - Format of the serialized data
    pub fn deserialize_from(buf: &[u8], bitmap_type: BitmapType) -> std::io::Result<Self> {
        match bitmap_type {
            BitmapType::BitVec => {
                let bitvec = BitVec::from_slice(buf);
                Ok(Bitmap::BitVec(bitvec))
            }
            BitmapType::Roaring => {
                let roaring = RoaringBitmap::deserialize_from(buf)?;
                Ok(Bitmap::Roaring(roaring))
            }
        }
    }

    /// Computes the union with another bitmap (in-place).
    ///
    /// If the other bitmap is a different type, it will be converted to match
    /// the current bitmap's type.
    pub fn union(&mut self, other: Self) {
        if self.is_empty() {
            *self = other;
            return;
        }

        match (self, other) {
            (Bitmap::BitVec(bitvec1), bitmap) => {
                let bitvec2 = bitmap.into_bitvec();
                if bitvec1.len() > bitvec2.len() {
                    *bitvec1 |= bitvec2
                } else {
                    *bitvec1 = bitvec2 | &*bitvec1;
                }
            }
            (Bitmap::Roaring(roaring1), bitmap) => {
                let roaring2 = bitmap.into_roaring();
                *roaring1 |= roaring2;
            }
        }
    }

    /// Computes the intersection with another bitmap (in-place).
    ///
    /// If the other bitmap is a different type, it will be converted to match
    /// the current bitmap's type.
    pub fn intersect(&mut self, other: Self) {
        match (self, other) {
            (Bitmap::BitVec(bitvec1), bitmap) => {
                let mut bitvec2 = bitmap.into_bitvec();
                let len = (bitvec1.len() - bitvec1.trailing_zeros())
                    .min(bitvec2.len() - bitvec2.trailing_zeros());
                bitvec1.truncate(len);
                bitvec2.truncate(len);
                *bitvec1 &= bitvec2;
            }
            (Bitmap::Roaring(roaring1), bitmap) => {
                let roaring2 = bitmap.into_roaring();
                *roaring1 &= roaring2;
            }
        }
    }

    /// Returns an iterator over the indices of set bits.
    pub fn iter_ones(&self) -> Box<dyn Iterator<Item = usize> + '_> {
        match self {
            Bitmap::BitVec(bitvec) => Box::new(bitvec.iter_ones()),
            Bitmap::Roaring(roaring) => Box::new(roaring.iter().map(|x| x as usize)),
        }
    }

    /// Creates a bitmap from bytes in LSB0 (least significant bit first) order.
    ///
    /// # Arguments
    /// * `bytes` - Input bytes in LSB0 order
    /// * `bitmap_type` - Type of bitmap to create
    pub fn from_lsb0_bytes(bytes: &[u8], bitmap_type: BitmapType) -> Self {
        match bitmap_type {
            BitmapType::BitVec => {
                let bitvec = BitVec::from_slice(bytes);
                Bitmap::BitVec(bitvec)
            }
            BitmapType::Roaring => {
                let roaring = RoaringBitmap::from_lsb0_bytes(0, bytes);
                Bitmap::Roaring(roaring)
            }
        }
    }

    /// Computes memory usage of the bitmap in bytes.
    pub fn memory_usage(&self) -> usize {
        match self {
            Bitmap::BitVec(bitvec) => bitvec.capacity(),
            Bitmap::Roaring(roaring) => {
                let stat = roaring.statistics();
                (stat.n_bytes_array_containers
                    + stat.n_bytes_bitset_containers
                    + stat.n_bytes_run_containers) as usize
            }
        }
    }

    fn into_bitvec(self) -> BitVec {
        match self {
            Bitmap::BitVec(bitvec) => bitvec,
            Bitmap::Roaring(roaring) => Self::roaring_to_bitvec(&roaring),
        }
    }

    fn into_roaring(self) -> RoaringBitmap {
        match self {
            Bitmap::Roaring(roaring) => roaring,
            Bitmap::BitVec(bitvec) => Self::bitvec_to_roaring(bitvec),
        }
    }

    fn roaring_to_bitvec(roaring: &RoaringBitmap) -> BitVec {
        let max_value = roaring.max().unwrap_or(0);
        let mut bitvec = BitVec::repeat(false, max_value as usize + 1);
        for i in roaring {
            bitvec.set(i as usize, true);
        }
        bitvec
    }

    fn bitvec_to_roaring(mut bitvec: BitVec) -> RoaringBitmap {
        bitvec.resize(bitvec.capacity(), false);
        RoaringBitmap::from_lsb0_bytes(0, bitvec.as_raw_slice())
    }
}

impl Default for Bitmap {
    fn default() -> Self {
        Bitmap::new_roaring()
    }
}

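For orientation, here is a minimal sketch (not part of the diff) of how the cross-format operations described in the doc comment behave, assuming the `Bitmap` type from the module above is in scope:

```rust
use greptime_proto::v1::index::BitmapType;

fn cross_format_union_demo() {
    // Bits 0..=3 set, stored in the Roaring representation.
    let mut left = Bitmap::from_lsb0_bytes(&[0b0000_1111], BitmapType::Roaring);
    // Bits 0..=2 set, stored as a BitVec.
    let right = Bitmap::full_bitvec(3);
    // `right` is converted to Roaring before OR'ing, so `left` keeps its format.
    left.union(right);
    assert_eq!(left.count_ones(), 4);
}
```
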
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_full_bitmaps() {
|
||||
let bv = Bitmap::full_bitvec(10);
|
||||
assert_eq!(bv.count_ones(), 10);
|
||||
|
||||
let rb = Bitmap::full_roaring(10);
|
||||
assert_eq!(rb.count_ones(), 10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialization_roundtrip() {
|
||||
let original = Bitmap::full_roaring(100);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
// Serialize as Roaring
|
||||
original
|
||||
.serialize_into(BitmapType::Roaring, &mut buf)
|
||||
.unwrap();
|
||||
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::Roaring).unwrap();
|
||||
assert_eq!(original, deserialized);
|
||||
|
||||
// Serialize as BitVec
|
||||
buf.clear();
|
||||
original
|
||||
.serialize_into(BitmapType::BitVec, &mut buf)
|
||||
.unwrap();
|
||||
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::BitVec).unwrap();
|
||||
assert_eq!(original.count_ones(), deserialized.count_ones());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_fulls() {
|
||||
// Test BitVec union
|
||||
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
bv1.union(bv2);
|
||||
assert_eq!(bv1.count_ones(), 5);
|
||||
|
||||
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
bv1.union(bv2);
|
||||
assert_eq!(bv1.count_ones(), 5);
|
||||
|
||||
// Test Roaring union
|
||||
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
|
||||
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
rb1.union(rb2);
|
||||
assert_eq!(rb1.count_ones(), 5);
|
||||
|
||||
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
|
||||
rb1.union(rb2);
|
||||
assert_eq!(rb1.count_ones(), 5);
|
||||
|
||||
// Test cross-type union
|
||||
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
let bv = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
rb.union(bv);
|
||||
assert_eq!(rb.count_ones(), 5);
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
let rb = Bitmap::full_roaring(3); // 0-2: 111
|
||||
bv.union(rb);
|
||||
assert_eq!(bv.count_ones(), 5);
|
||||
|
||||
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
|
||||
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
rb.union(bv);
|
||||
assert_eq!(rb.count_ones(), 5);
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
let rb = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
bv.union(rb);
|
||||
assert_eq!(bv.count_ones(), 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_bitvec() {
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
bv1.union(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
// Test different lengths
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::BitVec);
|
||||
bv1.union(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
bv1.union(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
// Test empty bitmaps
|
||||
let mut bv1 = Bitmap::new_bitvec();
|
||||
let bv2 = Bitmap::new_bitvec();
|
||||
bv1.union(bv2);
|
||||
assert!(bv1.is_empty());
|
||||
|
||||
let mut bv1 = Bitmap::new_bitvec();
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
bv1.union(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::new_bitvec();
|
||||
bv1.union(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
// Test empty and full bitmaps
|
||||
let mut bv1 = Bitmap::new_bitvec();
|
||||
let bv2 = Bitmap::full_bitvec(8);
|
||||
bv1.union(bv2);
|
||||
assert_eq!(bv1, Bitmap::full_bitvec(8));
|
||||
|
||||
let mut bv1 = Bitmap::full_bitvec(8);
|
||||
let bv2 = Bitmap::new_bitvec();
|
||||
bv1.union(bv2);
|
||||
assert_eq!(bv1, Bitmap::full_bitvec(8));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_roaring() {
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
rb1.union(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
// Test different lengths
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::Roaring);
|
||||
rb1.union(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
rb1.union(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
// Test empty bitmaps
|
||||
let mut rb1 = Bitmap::new_roaring();
|
||||
let rb2 = Bitmap::new_roaring();
|
||||
rb1.union(rb2);
|
||||
assert!(rb1.is_empty());
|
||||
|
||||
let mut rb1 = Bitmap::new_roaring();
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
rb1.union(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::new_roaring();
|
||||
rb1.union(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
// Test empty and full bitmaps
|
||||
let mut rb1 = Bitmap::new_roaring();
|
||||
let rb2 = Bitmap::full_roaring(8);
|
||||
rb1.union(rb2);
|
||||
assert_eq!(rb1, Bitmap::full_roaring(8));
|
||||
|
||||
let mut rb1 = Bitmap::full_roaring(8);
|
||||
let rb2 = Bitmap::new_roaring();
|
||||
rb1.union(rb2);
|
||||
assert_eq!(rb1, Bitmap::full_roaring(8));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_union_mixed() {
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
rb.union(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
bv.union(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
let bv = Bitmap::full_bitvec(8);
|
||||
rb.union(bv);
|
||||
assert_eq!(rb, Bitmap::full_roaring(8));
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(8);
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
bv.union(rb);
|
||||
assert_eq!(bv, Bitmap::full_bitvec(8));
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
let bv = Bitmap::full_bitvec(8);
|
||||
rb.union(bv);
|
||||
assert_eq!(rb, Bitmap::full_bitvec(8));
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(8);
|
||||
let rb = Bitmap::new_roaring();
|
||||
bv.union(rb);
|
||||
assert_eq!(bv, Bitmap::full_bitvec(8));
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
let bv = Bitmap::new_bitvec();
|
||||
rb.union(bv);
|
||||
assert!(rb.is_empty());
|
||||
|
||||
let mut bv = Bitmap::new_bitvec();
|
||||
let rb = Bitmap::new_roaring();
|
||||
bv.union(rb);
|
||||
assert!(bv.is_empty());
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
rb.union(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
|
||||
let rb = Bitmap::new_roaring();
|
||||
bv.union(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
let bv = Bitmap::new_bitvec();
|
||||
rb.union(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::new_bitvec();
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
|
||||
bv.union(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_fulls() {
|
||||
// Test BitVec intersect
|
||||
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
bv1.intersect(bv2);
|
||||
assert_eq!(bv1.count_ones(), 3);
|
||||
|
||||
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
bv1.intersect(bv2);
|
||||
assert_eq!(bv1.count_ones(), 3);
|
||||
|
||||
// Test Roaring intersect
|
||||
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
|
||||
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
rb1.intersect(rb2);
|
||||
assert_eq!(rb1.count_ones(), 3);
|
||||
|
||||
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
|
||||
rb1.intersect(rb2);
|
||||
assert_eq!(rb1.count_ones(), 3);
|
||||
|
||||
// Test cross-type intersect
|
||||
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
let bv = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
rb.intersect(bv);
|
||||
assert_eq!(rb.count_ones(), 3);
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
let rb = Bitmap::full_roaring(3); // 0-2: 111
|
||||
bv.intersect(rb);
|
||||
assert_eq!(bv.count_ones(), 3);
|
||||
|
||||
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
|
||||
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
|
||||
rb.intersect(bv);
|
||||
assert_eq!(rb.count_ones(), 3);
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
|
||||
let rb = Bitmap::full_roaring(5); // 0-4: 11111
|
||||
bv.intersect(rb);
|
||||
assert_eq!(bv.count_ones(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_bitvec() {
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
bv1.intersect(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
// Test different lengths
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
|
||||
bv1.intersect(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
bv1.intersect(bv2);
|
||||
assert_eq!(
|
||||
bv1,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
// Test empty bitmaps
|
||||
let mut bv1 = Bitmap::new_bitvec();
|
||||
let bv2 = Bitmap::new_bitvec();
|
||||
bv1.intersect(bv2);
|
||||
assert!(bv1.is_empty());
|
||||
|
||||
let mut bv1 = Bitmap::new_bitvec();
|
||||
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
bv1.intersect(bv2);
|
||||
assert!(bv1.is_empty());
|
||||
|
||||
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
let bv2 = Bitmap::new_bitvec();
|
||||
bv1.intersect(bv2);
|
||||
assert!(bv1.is_empty());
|
||||
|
||||
// Test empty and full bitmaps
|
||||
let mut bv1 = Bitmap::new_bitvec();
|
||||
let bv2 = Bitmap::full_bitvec(8);
|
||||
bv1.intersect(bv2);
|
||||
assert!(bv1.is_empty());
|
||||
|
||||
let mut bv1 = Bitmap::full_bitvec(8);
|
||||
let bv2 = Bitmap::new_bitvec();
|
||||
bv1.intersect(bv2);
|
||||
assert!(bv1.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_roaring() {
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
rb1.intersect(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
// Test different lengths
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
|
||||
rb1.intersect(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
rb1.intersect(rb2);
|
||||
assert_eq!(
|
||||
rb1,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
// Test empty bitmaps
|
||||
let mut rb1 = Bitmap::new_roaring();
|
||||
let rb2 = Bitmap::new_roaring();
|
||||
rb1.intersect(rb2);
|
||||
assert!(rb1.is_empty());
|
||||
|
||||
let mut rb1 = Bitmap::new_roaring();
|
||||
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
rb1.intersect(rb2);
|
||||
assert!(rb1.is_empty());
|
||||
|
||||
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
let rb2 = Bitmap::new_roaring();
|
||||
rb1.intersect(rb2);
|
||||
assert!(rb1.is_empty());
|
||||
|
||||
// Test empty and full bitmaps
|
||||
let mut rb1 = Bitmap::new_roaring();
|
||||
let rb2 = Bitmap::full_roaring(8);
|
||||
rb1.intersect(rb2);
|
||||
assert!(rb1.is_empty());
|
||||
|
||||
let mut rb1 = Bitmap::full_roaring(8);
|
||||
let rb2 = Bitmap::new_roaring();
|
||||
rb1.intersect(rb2);
|
||||
assert!(rb1.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intersect_mixed() {
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
|
||||
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
rb.intersect(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
bv.intersect(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
|
||||
let bv = Bitmap::full_bitvec(8);
|
||||
rb.intersect(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(8);
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
|
||||
bv.intersect(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
|
||||
let bv = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
|
||||
rb.intersect(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
bv.intersect(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
|
||||
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
rb.intersect(bv);
|
||||
assert_eq!(
|
||||
rb,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
|
||||
);
|
||||
|
||||
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
|
||||
bv.intersect(rb);
|
||||
assert_eq!(
|
||||
bv,
|
||||
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
|
||||
);
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
let bv = Bitmap::full_bitvec(8);
|
||||
rb.intersect(bv);
|
||||
assert!(rb.is_empty());
|
||||
|
||||
let mut bv = Bitmap::full_bitvec(8);
|
||||
let rb = Bitmap::new_roaring();
|
||||
bv.intersect(rb);
|
||||
assert!(bv.is_empty());
|
||||
|
||||
let mut bv = Bitmap::new_bitvec();
|
||||
let rb = Bitmap::full_roaring(8);
|
||||
bv.intersect(rb);
|
||||
assert!(bv.is_empty());
|
||||
|
||||
let mut rb = Bitmap::full_roaring(8);
|
||||
let bv = Bitmap::new_bitvec();
|
||||
rb.intersect(bv);
|
||||
assert!(rb.is_empty());
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
rb.intersect(bv);
|
||||
assert!(rb.is_empty());
|
||||
|
||||
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
|
||||
let rb = Bitmap::new_roaring();
|
||||
bv.intersect(rb);
|
||||
assert!(bv.is_empty());
|
||||
|
||||
let mut bv = Bitmap::new_bitvec();
|
||||
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
bv.intersect(rb);
|
||||
assert!(bv.is_empty());
|
||||
|
||||
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
|
||||
let bv = Bitmap::new_bitvec();
|
||||
rb.intersect(bv);
|
||||
assert!(rb.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_insert_range() {
|
||||
let mut bv = Bitmap::new_bitvec();
|
||||
bv.insert_range(0..=5);
|
||||
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
rb.insert_range(0..=5);
|
||||
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
|
||||
|
||||
let mut bv = Bitmap::new_bitvec();
|
||||
bv.insert_range(10..=10);
|
||||
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![10]);
|
||||
|
||||
let mut rb = Bitmap::new_roaring();
|
||||
rb.insert_range(10..=10);
|
||||
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![10]);
|
||||
}
|
||||
}
|
||||
@@ -17,7 +17,6 @@ pub mod sort_create;

use async_trait::async_trait;

use crate::bitmap::BitmapType;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::InvertedIndexWriter;
use crate::BytesRef;
@@ -54,9 +53,5 @@ pub trait InvertedIndexCreator: Send {

    /// Finalizes the index creation process, ensuring all data is properly indexed and stored
    /// in the provided writer
    async fn finish(
        &mut self,
        writer: &mut dyn InvertedIndexWriter,
        bitmap_type: BitmapType,
    ) -> Result<()>;
    async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()>;
}

@@ -17,23 +17,22 @@ mod intermediate_rw;
mod merge_stream;

use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;

use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::ValueStream;
use crate::{Bytes, BytesRef};

/// A stream of sorted values along with their associated bitmap
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;

/// Output of a sorting operation, encapsulating a bitmap for null values and a stream of sorted items
pub struct SortOutput {
    /// Bitmap indicating which segments have null values
    pub segment_null_bitmap: Bitmap,
    pub segment_null_bitmap: BitVec,

    /// Stream of sorted items
    pub sorted_stream: ValueStream,
    pub sorted_stream: SortedStream,

    /// Total number of rows in the sorted data
    pub total_row_count: usize,

@@ -20,11 +20,11 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

use async_trait::async_trait;
use common_base::BitVec;
use common_telemetry::{debug, error};
use futures::stream;
use snafu::ResultExt;

use crate::bitmap::Bitmap;
use crate::external_provider::ExternalTempFileProvider;
use crate::inverted_index::create::sort::intermediate_rw::{
    IntermediateReader, IntermediateWriter,
@@ -45,10 +45,18 @@ pub struct ExternalSorter {
    temp_file_provider: Arc<dyn ExternalTempFileProvider>,

    /// Bitmap indicating which segments have null values
    segment_null_bitmap: Bitmap,
    segment_null_bitmap: BitVec,

    /// In-memory buffer to hold values and their corresponding bitmaps until memory threshold is exceeded
    values_buffer: BTreeMap<Bytes, (Bitmap, usize)>,
    values_buffer: BTreeMap<Bytes, BitVec>,

    /// Count of rows in the last dumped buffer, used to streamline memory usage of `values_buffer`.
    ///
    /// After data is dumped to external files, `last_dump_row_count` is updated to reflect the new starting point
    /// for `BitVec` indexing. This means each `BitVec` in `values_buffer` thereafter encodes positions relative to
    /// this count, not from 0. This mechanism effectively shrinks the memory footprint of each `BitVec`, helping manage
    /// memory use more efficiently by focusing only on newly ingested data post-dump.
    last_dump_row_count: usize,

    /// Count of all rows ingested so far
    total_row_count: usize,
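As a quick sanity check on the relative-indexing scheme described in the field comment above, a small worked example (the numbers are illustrative, not taken from the diff):

```rust
// Suppose each segment covers 1024 rows and 9 full segments have already been
// dumped to intermediate files.
let segment_row_count = 1024usize;
let last_dump_row_count = 9 * segment_row_count;

// Bitmaps buffered after the dump index segments relative to the dump point,
// so merging them back requires prepending this many zero segments:
let leading_zeros = last_dump_row_count / segment_row_count;
assert_eq!(leading_zeros, 9);
```
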
@@ -85,14 +93,14 @@ impl Sorter for ExternalSorter {
            return Ok(());
        }

        let segment_index_range = self.segment_index_range(n);
        let segment_index_range = self.segment_index_range(n, value.is_none());
        self.total_row_count += n;

        if let Some(value) = value {
            let memory_diff = self.push_not_null(value, segment_index_range);
            self.may_dump_buffer(memory_diff).await
        } else {
            self.segment_null_bitmap.insert_range(segment_index_range);
            set_bits(&mut self.segment_null_bitmap, segment_index_range);
            Ok(())
        }
    }
@@ -109,10 +117,15 @@ impl Sorter for ExternalSorter {
        // TODO(zhongzc): k-way merge instead of 2-way merge

        let mut tree_nodes: VecDeque<SortedStream> = VecDeque::with_capacity(readers.len() + 1);
        let leading_zeros = self.last_dump_row_count / self.segment_row_count;
        tree_nodes.push_back(Box::new(stream::iter(
            mem::take(&mut self.values_buffer)
                .into_iter()
                .map(|(value, (bitmap, _))| Ok((value, bitmap))),
                .map(move |(value, mut bitmap)| {
                    bitmap.resize(bitmap.len() + leading_zeros, false);
                    bitmap.shift_right(leading_zeros);
                    Ok((value, bitmap))
                }),
        )));
        for (_, reader) in readers {
            tree_nodes.push_back(IntermediateReader::new(reader).into_stream().await?);
@@ -148,10 +161,11 @@ impl ExternalSorter {
            index_name,
            temp_file_provider,

            segment_null_bitmap: Bitmap::new_bitvec(), // bitvec is more efficient for many null values
            segment_null_bitmap: BitVec::new(),
            values_buffer: BTreeMap::new(),

            total_row_count: 0,
            last_dump_row_count: 0,
            segment_row_count,

            current_memory_usage: 0,
@@ -181,7 +195,7 @@ impl ExternalSorter {
    }

    /// Pushes the non-null values to the values buffer and sets the bits within
    /// the specified range in the given bitmap to true.
    /// the specified range in the given BitVec to true.
    /// Returns the memory usage difference of the buffer after the operation.
    fn push_not_null(
        &mut self,
@@ -189,23 +203,20 @@ impl ExternalSorter {
        segment_index_range: RangeInclusive<usize>,
    ) -> usize {
        match self.values_buffer.get_mut(value) {
            Some((bitmap, mem_usage)) => {
                bitmap.insert_range(segment_index_range);
                let new_usage = bitmap.memory_usage() + value.len();
                let diff = new_usage - *mem_usage;
                *mem_usage = new_usage;
            Some(bitmap) => {
                let old_len = bitmap.as_raw_slice().len();
                set_bits(bitmap, segment_index_range);

                diff
                bitmap.as_raw_slice().len() - old_len
            }
            None => {
                let mut bitmap = Bitmap::new_roaring();
                bitmap.insert_range(segment_index_range);
                let mut bitmap = BitVec::default();
                set_bits(&mut bitmap, segment_index_range);

                let mem_usage = bitmap.memory_usage() + value.len();
                self.values_buffer
                    .insert(value.to_vec(), (bitmap, mem_usage));
                let mem_diff = bitmap.as_raw_slice().len() + value.len();
                self.values_buffer.insert(value.to_vec(), bitmap);

                mem_usage
                mem_diff
            }
        }
    }
@@ -246,8 +257,12 @@ impl ExternalSorter {
            .fetch_sub(memory_usage, Ordering::Relaxed);
        self.current_memory_usage = 0;

        let bitmap_leading_zeros = self.last_dump_row_count / self.segment_row_count;
        self.last_dump_row_count =
            self.total_row_count - self.total_row_count % self.segment_row_count; // align to segment

        let entries = values.len();
        IntermediateWriter::new(writer).write_all(values.into_iter().map(|(k, (b, _))| (k, b))).await.inspect(|_|
        IntermediateWriter::new(writer).write_all(values, bitmap_leading_zeros as _).await.inspect(|_|
            debug!("Dumped {entries} entries ({memory_usage} bytes) to intermediate file {file_id} for index {index_name}")
        ).inspect_err(|e|
            error!(e; "Failed to dump {entries} entries to intermediate file {file_id} for index {index_name}")
@@ -256,8 +271,13 @@ impl ExternalSorter {

    /// Determines the segment index range for the row index range
    /// `[row_begin, row_begin + n - 1]`
    fn segment_index_range(&self, n: usize) -> RangeInclusive<usize> {
        let row_begin = self.total_row_count;
    fn segment_index_range(&self, n: usize, is_null: bool) -> RangeInclusive<usize> {
        let row_begin = if is_null {
            self.total_row_count
        } else {
            self.total_row_count - self.last_dump_row_count
        };

        let start = self.segment_index(row_begin);
        let end = self.segment_index(row_begin + n - 1);
        start..=end
@@ -269,6 +289,16 @@ impl ExternalSorter {
    }
}

/// Sets the bits within the specified range in the given `BitVec` to true
fn set_bits(bitmap: &mut BitVec, index_range: RangeInclusive<usize>) {
    if *index_range.end() >= bitmap.len() {
        bitmap.resize(index_range.end() + 1, false);
    }
    for index in index_range {
        bitmap.set(index, true);
    }
}

#[cfg(test)]
mod tests {
    use std::collections::HashMap;
@@ -300,7 +330,7 @@ mod tests {
        move |index_name, file_id| {
            assert_eq!(index_name, "test");
            let mut files = files.lock().unwrap();
            let (writer, reader) = duplex(1024 * 1024);
            let (writer, reader) = duplex(8 * 1024);
            files.insert(file_id.to_string(), Box::new(reader.compat()));
            Ok(Box::new(writer.compat_write()))
        }

@@ -19,24 +19,29 @@
//! The serialization format is as follows:
//!
//! ```text
//! [magic][item][item]...[item]
//! [4]    [?]
//! [magic][bitmap leading zeros][item][item]...[item]
//! [4]    [4]                   [?]
//!
//! Each [item] is structured as:
//! [value len][value][bitmap len][bitmap]
//! [8]        [?]    [8]         [?]
//! ```
//!
//! Each item represents a value and its associated bitmap, serialized with their lengths for
//! The format starts with a 4-byte magic identifier, followed by a 4-byte
//! bitmap leading zeros count, indicating how many leading zeros are in the
//! fixed-size region of the bitmap. Following that, each item represents
//! a value and its associated bitmap, serialized with their lengths for
//! easier deserialization.

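To make the layout above concrete, here is a hypothetical single-item file (values chosen for illustration), following the header written by `IntermediateWriter` (big-endian leading-zeros count) and the per-item framing in `codec_v1` (little-endian u64 lengths), both shown later in this diff:

```text
magic                 b"im01"               4 bytes
bitmap leading zeros  9u32, big-endian      4 bytes
item:
  value len           5u64, little-endian   8 bytes
  value               b"hello"              5 bytes
  bitmap len          1u64, little-endian   8 bytes
  bitmap              0b10101010            1 byte
```
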
mod codec_v1;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use asynchronous_codec::{FramedRead, FramedWrite};
|
||||
use common_base::BitVec;
|
||||
use futures::{stream, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, StreamExt};
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::bitmap::{Bitmap, BitmapType};
|
||||
use crate::inverted_index::create::sort::SortedStream;
|
||||
use crate::inverted_index::error::{
|
||||
CloseSnafu, FlushSnafu, ReadSnafu, Result, UnknownIntermediateCodecMagicSnafu, WriteSnafu,
|
||||
@@ -57,13 +62,12 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
|
||||
/// Serializes and writes all provided values to the wrapped writer
|
||||
pub async fn write_all(
|
||||
mut self,
|
||||
values: impl IntoIterator<Item = (Bytes, Bitmap)>,
|
||||
values: BTreeMap<Bytes, BitVec>,
|
||||
bitmap_leading_zeros: u32,
|
||||
) -> Result<()> {
|
||||
let (codec_magic, encoder) = (
|
||||
codec_v1::CODEC_V1_MAGIC,
|
||||
codec_v1::IntermediateItemEncoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
},
|
||||
codec_v1::IntermediateItemEncoderV1,
|
||||
);
|
||||
|
||||
self.writer
|
||||
@@ -71,6 +75,11 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
|
||||
.await
|
||||
.context(WriteSnafu)?;
|
||||
|
||||
self.writer
|
||||
.write_all(&bitmap_leading_zeros.to_be_bytes())
|
||||
.await
|
||||
.context(WriteSnafu)?;
|
||||
|
||||
let value_stream = stream::iter(values.into_iter().map(Ok));
|
||||
let frame_write = FramedWrite::new(&mut self.writer, encoder);
|
||||
// `forward()` will flush and close the writer when the stream ends
|
||||
@@ -103,9 +112,17 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
|
||||
.context(ReadSnafu)?;
|
||||
|
||||
let decoder = match &magic {
|
||||
codec_v1::CODEC_V1_MAGIC => codec_v1::IntermediateItemDecoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
},
|
||||
codec_v1::CODEC_V1_MAGIC => {
|
||||
let bitmap_leading_zeros = {
|
||||
let mut buf = [0u8; 4];
|
||||
self.reader.read_exact(&mut buf).await.context(ReadSnafu)?;
|
||||
u32::from_be_bytes(buf)
|
||||
};
|
||||
|
||||
codec_v1::IntermediateItemDecoderV1 {
|
||||
bitmap_leading_zeros,
|
||||
}
|
||||
}
|
||||
_ => return UnknownIntermediateCodecMagicSnafu { magic }.fail(),
|
||||
};
|
||||
|
||||
@@ -115,7 +132,6 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
use std::io::{Seek, SeekFrom};
|
||||
|
||||
use futures::io::{AllowStdIo, Cursor};
|
||||
@@ -124,10 +140,6 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::inverted_index::error::Error;
|
||||
|
||||
fn bitmap(bytes: &[u8]) -> Bitmap {
|
||||
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_intermediate_read_write_basic() {
|
||||
let file_r = tempfile().unwrap();
|
||||
@@ -136,12 +148,12 @@ mod tests {
|
||||
let buf_w = AllowStdIo::new(file_w);
|
||||
|
||||
let values = BTreeMap::from_iter([
|
||||
(Bytes::from("a"), bitmap(&[0b10101010])),
|
||||
(Bytes::from("b"), bitmap(&[0b01010101])),
|
||||
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
|
||||
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
|
||||
]);
|
||||
|
||||
let writer = IntermediateWriter::new(buf_w);
|
||||
writer.write_all(values.clone()).await.unwrap();
|
||||
writer.write_all(values.clone(), 0).await.unwrap();
|
||||
// reset the handle
|
||||
buf_r.seek(SeekFrom::Start(0)).unwrap();
|
||||
|
||||
@@ -149,9 +161,48 @@ mod tests {
|
||||
let mut stream = reader.into_stream().await.unwrap();
|
||||
|
||||
let a = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(a, (Bytes::from("a"), bitmap(&[0b10101010])));
|
||||
assert_eq!(a, (Bytes::from("a"), BitVec::from_slice(&[0b10101010])));
|
||||
let b = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(b, (Bytes::from("b"), bitmap(&[0b01010101])));
|
||||
assert_eq!(b, (Bytes::from("b"), BitVec::from_slice(&[0b01010101])));
|
||||
assert!(stream.next().await.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_intermediate_read_write_with_prefix_zeros() {
|
||||
let file_r = tempfile().unwrap();
|
||||
let file_w = file_r.try_clone().unwrap();
|
||||
let mut buf_r = AllowStdIo::new(file_r);
|
||||
let buf_w = AllowStdIo::new(file_w);
|
||||
|
||||
let values = BTreeMap::from_iter([
|
||||
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
|
||||
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
|
||||
]);
|
||||
|
||||
let writer = IntermediateWriter::new(buf_w);
|
||||
writer.write_all(values.clone(), 8).await.unwrap();
|
||||
// reset the handle
|
||||
buf_r.seek(SeekFrom::Start(0)).unwrap();
|
||||
|
||||
let reader = IntermediateReader::new(buf_r);
|
||||
let mut stream = reader.into_stream().await.unwrap();
|
||||
|
||||
let a = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(
|
||||
a,
|
||||
(
|
||||
Bytes::from("a"),
|
||||
BitVec::from_slice(&[0b00000000, 0b10101010])
|
||||
)
|
||||
);
|
||||
let b = stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(
|
||||
b,
|
||||
(
|
||||
Bytes::from("b"),
|
||||
BitVec::from_slice(&[0b00000000, 0b01010101])
|
||||
)
|
||||
);
|
||||
assert!(stream.next().await.is_none());
|
||||
}
|
||||
|
||||
@@ -162,7 +213,7 @@ mod tests {
|
||||
let values = BTreeMap::new();
|
||||
|
||||
let writer = IntermediateWriter::new(&mut buf);
|
||||
writer.write_all(values.clone()).await.unwrap();
|
||||
writer.write_all(values.clone(), 0).await.unwrap();
|
||||
|
||||
let reader = IntermediateReader::new(Cursor::new(buf));
|
||||
let mut stream = reader.into_stream().await.unwrap();
|
||||
|
||||
@@ -16,10 +16,9 @@ use std::io;
|
||||
|
||||
use asynchronous_codec::{BytesMut, Decoder, Encoder};
|
||||
use bytes::{Buf, BufMut};
|
||||
use greptime_proto::v1::index::BitmapType;
|
||||
use common_base::BitVec;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::{CommonIoSnafu, Error, Result};
|
||||
use crate::Bytes;
|
||||
|
||||
@@ -29,42 +28,37 @@ const U64_LENGTH: usize = std::mem::size_of::<u64>();
|
||||
pub const CODEC_V1_MAGIC: &[u8; 4] = b"im01";
|
||||
|
||||
/// Serializes items of external sorting intermediate files.
|
||||
pub struct IntermediateItemEncoderV1 {
|
||||
pub bitmap_type: BitmapType,
|
||||
}
|
||||
pub struct IntermediateItemEncoderV1;
|
||||
|
||||
/// [`FramedWrite`] requires the [`Encoder`] trait to be implemented.
|
||||
impl Encoder for IntermediateItemEncoderV1 {
|
||||
type Item<'a> = (Bytes, Bitmap);
|
||||
type Item<'a> = (Bytes, BitVec);
|
||||
type Error = Error;
|
||||
|
||||
fn encode(&mut self, item: (Bytes, Bitmap), dst: &mut BytesMut) -> Result<()> {
|
||||
fn encode(&mut self, item: (Bytes, BitVec), dst: &mut BytesMut) -> Result<()> {
|
||||
let value_bytes = item.0;
|
||||
let bitmap_size = item.1.serialized_size(self.bitmap_type);
|
||||
let bitmap_bytes = item.1.into_vec();
|
||||
|
||||
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_size);
|
||||
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_bytes.len());
|
||||
dst.put_u64_le(value_bytes.len() as u64);
|
||||
dst.extend_from_slice(&value_bytes);
|
||||
dst.put_u64_le(bitmap_size as u64);
|
||||
item.1
|
||||
.serialize_into(self.bitmap_type, &mut dst.writer())
|
||||
.context(CommonIoSnafu)?;
|
||||
|
||||
dst.put_u64_le(bitmap_bytes.len() as u64);
|
||||
dst.extend_from_slice(&bitmap_bytes);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserializes items of external sorting intermediate files.
|
||||
pub struct IntermediateItemDecoderV1 {
|
||||
pub bitmap_type: BitmapType,
|
||||
pub(crate) bitmap_leading_zeros: u32,
|
||||
}
|
||||
|
||||
/// [`FramedRead`] requires the [`Decoder`] trait to be implemented.
|
||||
impl Decoder for IntermediateItemDecoderV1 {
|
||||
type Item = (Bytes, Bitmap);
|
||||
type Item = (Bytes, BitVec);
|
||||
type Error = Error;
|
||||
|
||||
/// Decodes the `src` into `(Bytes, RoaringBitmap)`. Returns `None` if
|
||||
/// Decodes the `src` into `(Bytes, BitVec)`. Returns `None` if
|
||||
/// the `src` does not contain enough data for a complete item.
|
||||
///
|
||||
/// Only after successful decoding, the `src` is advanced. Otherwise,
|
||||
@@ -98,8 +92,8 @@ impl Decoder for IntermediateItemDecoderV1 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let bitmap = Bitmap::deserialize_from(&buf[..bitmap_len], self.bitmap_type)
|
||||
.context(CommonIoSnafu)?;
|
||||
let mut bitmap = BitVec::repeat(false, self.bitmap_leading_zeros as _);
|
||||
bitmap.extend_from_raw_slice(&buf[..bitmap_len]);
|
||||
|
||||
let item = (value_bytes.to_vec(), bitmap);
|
||||
|
||||
@@ -119,29 +113,25 @@ impl From<io::Error> for Error {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use common_base::bit_vec::prelude::{bitvec, Lsb0};
|
||||
|
||||
fn bitmap(bytes: &[u8]) -> Bitmap {
|
||||
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
|
||||
}
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_intermediate_codec_basic() {
|
||||
let mut encoder = IntermediateItemEncoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
};
|
||||
let mut encoder = IntermediateItemEncoderV1;
|
||||
let mut buf = BytesMut::new();
|
||||
|
||||
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
|
||||
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
|
||||
encoder.encode(item.clone(), &mut buf).unwrap();
|
||||
|
||||
let mut decoder = IntermediateItemDecoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
bitmap_leading_zeros: 0,
|
||||
};
|
||||
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
|
||||
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
|
||||
|
||||
let item1 = (b"world".to_vec(), bitmap(&[0b01010101]));
|
||||
let item1 = (b"world".to_vec(), BitVec::from_slice(&[0b01010101]));
|
||||
encoder.encode(item.clone(), &mut buf).unwrap();
|
||||
encoder.encode(item1.clone(), &mut buf).unwrap();
|
||||
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
|
||||
@@ -152,16 +142,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_intermediate_codec_empty_item() {
|
||||
let mut encoder = IntermediateItemEncoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
};
|
||||
let mut encoder = IntermediateItemEncoderV1;
|
||||
let mut buf = BytesMut::new();
|
||||
|
||||
let item = (b"".to_vec(), bitmap(&[]));
|
||||
let item = (b"".to_vec(), BitVec::from_slice(&[]));
|
||||
encoder.encode(item.clone(), &mut buf).unwrap();
|
||||
|
||||
let mut decoder = IntermediateItemDecoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
bitmap_leading_zeros: 0,
|
||||
};
|
||||
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
|
||||
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
|
||||
@@ -170,19 +158,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_intermediate_codec_partial() {
|
||||
let mut encoder = IntermediateItemEncoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
};
|
||||
let mut encoder = IntermediateItemEncoderV1;
|
||||
let mut buf = BytesMut::new();
|
||||
|
||||
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
|
||||
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
|
||||
encoder.encode(item.clone(), &mut buf).unwrap();
|
||||
|
||||
let partial_length = U64_LENGTH + 3;
|
||||
let mut partial_bytes = buf.split_to(partial_length);
|
||||
|
||||
let mut decoder = IntermediateItemDecoderV1 {
|
||||
bitmap_type: BitmapType::Roaring,
|
||||
bitmap_leading_zeros: 0,
|
||||
};
|
||||
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None); // not enough data
|
||||
partial_bytes.extend_from_slice(&buf[..]);
|
||||
@@ -190,4 +176,25 @@ mod tests {
|
||||
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None);
|
||||
assert!(partial_bytes.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intermediate_codec_prefix_zeros() {
|
||||
let mut encoder = IntermediateItemEncoderV1;
|
||||
let mut buf = BytesMut::new();
|
||||
|
||||
let item = (b"hello".to_vec(), bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]);
|
||||
encoder.encode(item.clone(), &mut buf).unwrap();
|
||||
|
||||
let mut decoder = IntermediateItemDecoderV1 {
|
||||
bitmap_leading_zeros: 3,
|
||||
};
|
||||
let decoded_item = decoder.decode(&mut buf).unwrap().unwrap();
|
||||
assert_eq!(decoded_item.0, b"hello");
|
||||
assert_eq!(
|
||||
decoded_item.1,
|
||||
bitvec![u8, Lsb0; 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0]
|
||||
);
|
||||
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
|
||||
assert!(buf.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,10 +16,10 @@ use std::cmp::Ordering;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
use common_base::BitVec;
|
||||
use futures::{ready, Stream, StreamExt};
|
||||
use pin_project::pin_project;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::create::sort::SortedStream;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::Bytes;
|
||||
@@ -28,10 +28,10 @@ use crate::Bytes;
|
||||
#[pin_project]
|
||||
pub struct MergeSortedStream {
|
||||
stream1: Option<SortedStream>,
|
||||
peek1: Option<(Bytes, Bitmap)>,
|
||||
peek1: Option<(Bytes, BitVec)>,
|
||||
|
||||
stream2: Option<SortedStream>,
|
||||
peek2: Option<(Bytes, Bitmap)>,
|
||||
peek2: Option<(Bytes, BitVec)>,
|
||||
}
|
||||
|
||||
impl MergeSortedStream {
|
||||
@@ -49,7 +49,7 @@ impl MergeSortedStream {
|
||||
}
|
||||
|
||||
impl Stream for MergeSortedStream {
|
||||
type Item = Result<(Bytes, Bitmap)>;
|
||||
type Item = Result<(Bytes, BitVec)>;
|
||||
|
||||
/// Polls both streams and returns the next item from the stream that has the smaller next item.
|
||||
/// If both streams have the same next item, the bitmaps are unioned together.
|
||||
@@ -89,77 +89,77 @@ impl Stream for MergeSortedStream {
|
||||
}
|
||||
|
||||
/// Merges two bitmaps by bit-wise OR'ing them together, preserving all bits from both
|
||||
fn merge_bitmaps(mut bitmap1: Bitmap, bitmap2: Bitmap) -> Bitmap {
|
||||
bitmap1.union(bitmap2);
|
||||
bitmap1
|
||||
fn merge_bitmaps(bitmap1: BitVec, bitmap2: BitVec) -> BitVec {
|
||||
// make sure longer bitmap is on the left to avoid truncation
|
||||
#[allow(clippy::if_same_then_else)]
|
||||
if bitmap1.len() > bitmap2.len() {
|
||||
bitmap1 | bitmap2
|
||||
} else {
|
||||
bitmap2 | bitmap1
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::stream;
|
||||
use greptime_proto::v1::index::BitmapType;
|
||||
|
||||
use super::*;
|
||||
use crate::inverted_index::error::Error;
|
||||
|
||||
fn bitmap(bytes: &[u8]) -> Bitmap {
|
||||
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
|
||||
}
|
||||
|
||||
fn sorted_stream_from_vec(vec: Vec<(Bytes, Bitmap)>) -> SortedStream {
|
||||
fn sorted_stream_from_vec(vec: Vec<(Bytes, BitVec)>) -> SortedStream {
|
||||
Box::new(stream::iter(vec.into_iter().map(Ok::<_, Error>)))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_merge_sorted_stream_non_overlapping() {
|
||||
let stream1 = sorted_stream_from_vec(vec![
|
||||
(Bytes::from("apple"), bitmap(&[0b10101010])),
|
||||
(Bytes::from("orange"), bitmap(&[0b01010101])),
|
||||
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
|
||||
(Bytes::from("orange"), BitVec::from_slice(&[0b01010101])),
|
||||
]);
|
||||
let stream2 = sorted_stream_from_vec(vec![
|
||||
(Bytes::from("banana"), bitmap(&[0b10101010])),
|
||||
(Bytes::from("peach"), bitmap(&[0b01010101])),
|
||||
(Bytes::from("banana"), BitVec::from_slice(&[0b10101010])),
|
||||
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
|
||||
]);
|
||||
|
||||
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
|
||||
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("apple"));
|
||||
assert_eq!(item.1, bitmap(&[0b10101010]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("banana"));
|
||||
assert_eq!(item.1, bitmap(&[0b10101010]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("orange"));
|
||||
assert_eq!(item.1, bitmap(&[0b01010101]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("peach"));
|
||||
assert_eq!(item.1, bitmap(&[0b01010101]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
|
||||
assert!(merged_stream.next().await.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_merge_sorted_stream_overlapping() {
|
||||
let stream1 = sorted_stream_from_vec(vec![
|
||||
(Bytes::from("apple"), bitmap(&[0b10101010])),
|
||||
(Bytes::from("orange"), bitmap(&[0b10101010])),
|
||||
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
|
||||
(Bytes::from("orange"), BitVec::from_slice(&[0b10101010])),
|
||||
]);
|
||||
let stream2 = sorted_stream_from_vec(vec![
|
||||
(Bytes::from("apple"), bitmap(&[0b01010101])),
|
||||
(Bytes::from("peach"), bitmap(&[0b01010101])),
|
||||
(Bytes::from("apple"), BitVec::from_slice(&[0b01010101])),
|
||||
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
|
||||
]);
|
||||
|
||||
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
|
||||
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("apple"));
|
||||
assert_eq!(item.1, bitmap(&[0b11111111]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b11111111]));
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("orange"));
|
||||
assert_eq!(item.1, bitmap(&[0b10101010]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
|
||||
let item = merged_stream.next().await.unwrap().unwrap();
|
||||
assert_eq!(item.0, Bytes::from("peach"));
|
||||
assert_eq!(item.1, bitmap(&[0b01010101]));
|
||||
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
|
||||
assert!(merged_stream.next().await.is_none());
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ use std::num::NonZeroUsize;
|
||||
use async_trait::async_trait;
|
||||
use snafu::ensure;
|
||||
|
||||
use crate::bitmap::BitmapType;
|
||||
use crate::inverted_index::create::sort::{SortOutput, Sorter};
|
||||
use crate::inverted_index::create::InvertedIndexCreator;
|
||||
use crate::inverted_index::error::{InconsistentRowCountSnafu, Result};
|
||||
@@ -69,11 +68,7 @@ impl InvertedIndexCreator for SortIndexCreator {
|
||||
}
|
||||
|
||||
/// Finalizes the sorting for all indexes and writes them using the inverted index writer
|
||||
async fn finish(
|
||||
&mut self,
|
||||
writer: &mut dyn InvertedIndexWriter,
|
||||
bitmap_type: BitmapType,
|
||||
) -> Result<()> {
|
||||
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()> {
|
||||
let mut output_row_count = None;
|
||||
for (index_name, mut sorter) in self.sorters.drain() {
|
||||
let SortOutput {
|
||||
@@ -93,7 +88,7 @@ impl InvertedIndexCreator for SortIndexCreator {
|
||||
);
|
||||
|
||||
writer
|
||||
.add_index(index_name, segment_null_bitmap, sorted_stream, bitmap_type)
|
||||
.add_index(index_name, segment_null_bitmap, sorted_stream)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -122,9 +117,9 @@ mod tests {
|
||||
use futures::{stream, StreamExt};
|
||||
|
||||
use super::*;
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::create::sort::SortedStream;
|
||||
use crate::inverted_index::error::Error;
|
||||
use crate::inverted_index::format::writer::{MockInvertedIndexWriter, ValueStream};
|
||||
use crate::inverted_index::format::writer::MockInvertedIndexWriter;
|
||||
use crate::Bytes;
|
||||
|
||||
#[tokio::test]
|
||||
@@ -148,10 +143,11 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut mock_writer = MockInvertedIndexWriter::new();
|
||||
mock_writer.expect_add_index().times(3).returning(
|
||||
|name, null_bitmap, stream, bitmap_type| {
|
||||
mock_writer
|
||||
.expect_add_index()
|
||||
.times(3)
|
||||
.returning(|name, null_bitmap, stream| {
|
||||
assert!(null_bitmap.is_empty());
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
match name.as_str() {
|
||||
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
|
||||
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
|
||||
@@ -159,8 +155,7 @@ mod tests {
|
||||
_ => panic!("unexpected index name: {}", name),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
});
|
||||
mock_writer
|
||||
.expect_finish()
|
||||
.times(1)
|
||||
@@ -170,10 +165,7 @@ mod tests {
|
||||
Ok(())
|
||||
});
|
||||
|
||||
creator
|
||||
.finish(&mut mock_writer, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
creator.finish(&mut mock_writer).await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -199,9 +191,8 @@ mod tests {
|
||||
let mut mock_writer = MockInvertedIndexWriter::new();
|
||||
mock_writer
|
||||
.expect_add_index()
|
||||
.returning(|name, null_bitmap, stream, bitmap_type| {
|
||||
.returning(|name, null_bitmap, stream| {
|
||||
assert!(null_bitmap.is_empty());
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
match name.as_str() {
|
||||
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
|
||||
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
|
||||
@@ -212,7 +203,7 @@ mod tests {
|
||||
});
|
||||
mock_writer.expect_finish().never();
|
||||
|
||||
let res = creator.finish(&mut mock_writer, BitmapType::Roaring).await;
|
||||
let res = creator.finish(&mut mock_writer).await;
|
||||
assert!(matches!(res, Err(Error::InconsistentRowCount { .. })));
|
||||
}
|
||||
|
||||
@@ -228,9 +219,8 @@ mod tests {
|
||||
let mut mock_writer = MockInvertedIndexWriter::new();
|
||||
mock_writer
|
||||
.expect_add_index()
|
||||
.returning(|name, null_bitmap, stream, bitmap_type| {
|
||||
.returning(|name, null_bitmap, stream| {
|
||||
assert!(null_bitmap.is_empty());
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
assert!(matches!(name.as_str(), "a" | "b" | "c"));
|
||||
assert!(stream_to_values(stream).is_empty());
|
||||
Ok(())
|
||||
@@ -244,10 +234,7 @@ mod tests {
|
||||
Ok(())
|
||||
});
|
||||
|
||||
creator
|
||||
.finish(&mut mock_writer, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
creator.finish(&mut mock_writer).await.unwrap();
|
||||
}
|
||||
|
||||
fn set_bit(bit_vec: &mut BitVec, index: usize) {
|
||||
@@ -296,21 +283,20 @@ mod tests {
|
||||
|
||||
async fn output(&mut self) -> Result<SortOutput> {
|
||||
let segment_null_bitmap = self.values.remove(&None).unwrap_or_default();
|
||||
let segment_null_bitmap = Bitmap::BitVec(segment_null_bitmap);
|
||||
|
||||
Ok(SortOutput {
|
||||
segment_null_bitmap,
|
||||
sorted_stream: Box::new(stream::iter(
|
||||
std::mem::take(&mut self.values)
|
||||
.into_iter()
|
||||
.map(|(v, b)| Ok((v.unwrap(), Bitmap::BitVec(b)))),
|
||||
.map(|(v, b)| Ok((v.unwrap(), b))),
|
||||
)),
|
||||
total_row_count: self.total_row_count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn stream_to_values(stream: ValueStream) -> Vec<Bytes> {
|
||||
fn stream_to_values(stream: SortedStream) -> Vec<Bytes> {
|
||||
futures::executor::block_on(async {
|
||||
stream.map(|r| r.unwrap().0).collect::<Vec<Bytes>>().await
|
||||
})
|
||||
|
||||
@@ -110,14 +110,6 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to decode bitmap"))]
|
||||
DecodeBitmap {
|
||||
#[snafu(source)]
|
||||
error: IoError,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to decode protobuf"))]
|
||||
DecodeProto {
|
||||
#[snafu(source)]
|
||||
@@ -248,7 +240,6 @@ impl ErrorExt for Error {
|
||||
| CommonIo { .. }
|
||||
| UnknownIntermediateCodecMagic { .. }
|
||||
| FstCompile { .. }
|
||||
| DecodeBitmap { .. }
|
||||
| InvalidFooterPayloadSize { .. }
|
||||
| BlobSizeTooSmall { .. } => StatusCode::Unexpected,
|
||||
|
||||
|
||||
@@ -18,11 +18,11 @@ use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use common_base::BitVec;
|
||||
use greptime_proto::v1::index::InvertedIndexMetas;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::bitmap::{Bitmap, BitmapType};
|
||||
use crate::inverted_index::error::{DecodeBitmapSnafu, DecodeFstSnafu, Result};
|
||||
use crate::inverted_index::error::{DecodeFstSnafu, Result};
|
||||
pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
|
||||
use crate::inverted_index::FstMap;
|
||||
|
||||
@@ -67,25 +67,17 @@ pub trait InvertedIndexReader: Send + Sync {
}

/// Retrieves the bitmap from the given offset and size.
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
self.range_read(offset, size).await.and_then(|bytes| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
async fn bitmap(&self, offset: u64, size: u32) -> Result<BitVec> {
self.range_read(offset, size).await.map(BitVec::from_vec)
}

/// Retrieves the multiple bitmaps from the given ranges.
async fn bitmap_deque(
&mut self,
ranges: &[(Range<u64>, BitmapType)],
) -> Result<VecDeque<Bitmap>> {
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
let bytes = self.read_vec(&ranges).await?;
bytes
async fn bitmap_deque(&mut self, ranges: &[Range<u64>]) -> Result<VecDeque<BitVec>> {
Ok(self
.read_vec(ranges)
.await?
.into_iter()
.zip(types)
.map(|(bytes, bitmap_type)| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
.collect::<Result<VecDeque<_>>>()
.map(|bytes| BitVec::from_slice(bytes.as_ref()))
.collect::<VecDeque<_>>())
}
}

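Illustrative note, not part of the diff: with this change the reader no longer decodes a typed bitmap; the stored bytes are used directly as the backing store of a `BitVec`. A sketch of that round trip, assuming `common_base::BitVec` is `bitvec`'s `BitVec<u8, Lsb0>`; bit lengths survive only up to the byte boundary, which is why the tests below compare bit patterns rather than exact lengths.

```rust
// Sketch only: raw-byte round trip of a BitVec, as the new bitmap() relies on.
use bitvec::prelude::*;

fn main() {
    let original = bitvec![u8, Lsb0; 1, 0, 1];          // 3 bits
    let stored: Vec<u8> = original.clone().into_vec();  // what gets written to the blob
    let read_back = BitVec::<u8, Lsb0>::from_vec(stored);

    assert_eq!(read_back.len(), 8);           // padded up to a whole byte
    assert_eq!(read_back[..3], original[..]); // the original bits are intact
}
```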
@@ -78,14 +78,14 @@ impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_base::bit_vec::prelude::*;
|
||||
use fst::MapBuilder;
|
||||
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta, InvertedIndexMetas};
|
||||
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
|
||||
use prost::Message;
|
||||
|
||||
use super::*;
|
||||
use crate::bitmap::Bitmap;
|
||||
|
||||
fn mock_fst() -> Vec<u8> {
|
||||
fn create_fake_fst() -> Vec<u8> {
|
||||
let mut fst_buf = Vec::new();
|
||||
let mut build = MapBuilder::new(&mut fst_buf).unwrap();
|
||||
build.insert("key1".as_bytes(), 1).unwrap();
|
||||
@@ -94,27 +94,19 @@ mod tests {
|
||||
fst_buf
|
||||
}

fn mock_bitmap() -> Bitmap {
Bitmap::from_lsb0_bytes(&[0b10101010, 0b10000000], BitmapType::Roaring)
}

fn mock_bitmap_bytes() -> Vec<u8> {
let mut buf = Vec::new();
mock_bitmap()
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
buf
fn create_fake_bitmap() -> Vec<u8> {
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0, 1, 0].into_vec()
}

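A note on the constant above, not part of the diff and assuming `bitvec`'s `Lsb0` ordering (bit index 0 is the least significant bit of the first byte): the ten bits in `create_fake_bitmap` pack into two bytes, which is why the metadata assertions below expect bitmap sizes of 2.

```rust
// Sketch only: how the Lsb0 bit order maps the test bits onto bytes.
use bitvec::prelude::*;

fn main() {
    let bits = bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0, 1, 0];
    // The first eight bits fill byte 0 starting at its least significant bit,
    // the remaining two bits land in the low bits of byte 1.
    assert_eq!(bits.into_vec(), vec![0b0101_0101, 0b0000_0001]);
}
```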
fn create_inverted_index_blob() -> Vec<u8> {
|
||||
let bitmap_size = mock_bitmap_bytes().len();
|
||||
let fst_size = mock_fst().len();
|
||||
let bitmap_size = create_fake_bitmap().len();
|
||||
let fst_size = create_fake_fst().len();
|
||||
|
||||
// first index
|
||||
let mut inverted_index = Vec::new();
|
||||
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // value bitmap
|
||||
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // null bitmap
|
||||
inverted_index.extend_from_slice(&mock_fst()); // fst
|
||||
inverted_index.extend_from_slice(&create_fake_bitmap()); // value bitmap
|
||||
inverted_index.extend_from_slice(&create_fake_bitmap()); // null bitmap
|
||||
inverted_index.extend_from_slice(&create_fake_fst()); // fst
|
||||
|
||||
let meta = InvertedIndexMeta {
|
||||
name: "tag0".to_string(),
|
||||
@@ -124,7 +116,6 @@ mod tests {
|
||||
null_bitmap_size: bitmap_size as _,
|
||||
relative_fst_offset: (bitmap_size * 2) as _,
|
||||
fst_size: fst_size as _,
|
||||
bitmap_type: BitmapType::Roaring as _,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
@@ -137,7 +128,6 @@ mod tests {
|
||||
null_bitmap_size: bitmap_size as _,
|
||||
relative_fst_offset: (bitmap_size * 2) as _,
|
||||
fst_size: fst_size as _,
|
||||
bitmap_type: BitmapType::Roaring as _,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
@@ -178,19 +168,19 @@ mod tests {
|
||||
let meta0 = metas.metas.get("tag0").unwrap();
|
||||
assert_eq!(meta0.name, "tag0");
|
||||
assert_eq!(meta0.base_offset, 0);
|
||||
assert_eq!(meta0.inverted_index_size, 102);
|
||||
assert_eq!(meta0.relative_null_bitmap_offset, 26);
|
||||
assert_eq!(meta0.null_bitmap_size, 26);
|
||||
assert_eq!(meta0.relative_fst_offset, 52);
|
||||
assert_eq!(meta0.inverted_index_size, 54);
|
||||
assert_eq!(meta0.relative_null_bitmap_offset, 2);
|
||||
assert_eq!(meta0.null_bitmap_size, 2);
|
||||
assert_eq!(meta0.relative_fst_offset, 4);
|
||||
assert_eq!(meta0.fst_size, 50);
|
||||
|
||||
let meta1 = metas.metas.get("tag1").unwrap();
|
||||
assert_eq!(meta1.name, "tag1");
|
||||
assert_eq!(meta1.base_offset, 102);
|
||||
assert_eq!(meta1.inverted_index_size, 102);
|
||||
assert_eq!(meta1.relative_null_bitmap_offset, 26);
|
||||
assert_eq!(meta1.null_bitmap_size, 26);
|
||||
assert_eq!(meta1.relative_fst_offset, 52);
|
||||
assert_eq!(meta1.base_offset, 54);
|
||||
assert_eq!(meta1.inverted_index_size, 54);
|
||||
assert_eq!(meta1.relative_null_bitmap_offset, 2);
|
||||
assert_eq!(meta1.null_bitmap_size, 2);
|
||||
assert_eq!(meta1.relative_fst_offset, 4);
|
||||
assert_eq!(meta1.fst_size, 50);
|
||||
}
|
||||
|
||||
@@ -234,29 +224,17 @@ mod tests {
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag0").unwrap();
|
||||
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
|
||||
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
|
||||
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
|
||||
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
|
||||
|
||||
let metas = blob_reader.metadata().await.unwrap();
|
||||
let meta = metas.metas.get("tag1").unwrap();
|
||||
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader
|
||||
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(bitmap, mock_bitmap());
|
||||
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
|
||||
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
|
||||
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
|
||||
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,14 +18,14 @@ mod single;
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::BitVec;
|
||||
use futures::Stream;
|
||||
|
||||
use crate::bitmap::{Bitmap, BitmapType};
|
||||
use crate::inverted_index::error::Result;
|
||||
pub use crate::inverted_index::format::writer::blob::InvertedIndexBlobWriter;
|
||||
use crate::Bytes;
|
||||
|
||||
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
|
||||
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
|
||||
|
||||
/// Trait for writing inverted index data to underlying storage.
|
||||
#[mockall::automock]
|
||||
@@ -37,13 +37,11 @@ pub trait InvertedIndexWriter: Send {
|
||||
/// * `null_bitmap` marks positions of null entries.
|
||||
/// * `values` is a stream of values and their locations, yielded lexicographically.
|
||||
/// Errors occur if the values are out of order.
|
||||
/// * `bitmap_type` is the type of bitmap to encode.
|
||||
async fn add_index(
|
||||
&mut self,
|
||||
name: String,
|
||||
null_bitmap: Bitmap,
|
||||
null_bitmap: BitVec,
|
||||
values: ValueStream,
|
||||
bitmap_type: BitmapType,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Finalizes the index writing process, ensuring all data is written.
|
||||
|
||||
@@ -15,12 +15,12 @@
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::BitVec;
|
||||
use futures::{AsyncWrite, AsyncWriteExt};
|
||||
use greptime_proto::v1::index::InvertedIndexMetas;
|
||||
use prost::Message;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::bitmap::{Bitmap, BitmapType};
|
||||
use crate::inverted_index::error::{CloseSnafu, FlushSnafu, Result, WriteSnafu};
|
||||
use crate::inverted_index::format::writer::single::SingleIndexWriter;
|
||||
use crate::inverted_index::format::writer::{InvertedIndexWriter, ValueStream};
|
||||
@@ -43,9 +43,8 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
|
||||
async fn add_index(
|
||||
&mut self,
|
||||
name: String,
|
||||
null_bitmap: Bitmap,
|
||||
null_bitmap: BitVec,
|
||||
values: ValueStream,
|
||||
bitmap_type: BitmapType,
|
||||
) -> Result<()> {
|
||||
let single_writer = SingleIndexWriter::new(
|
||||
name.clone(),
|
||||
@@ -53,7 +52,6 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
|
||||
null_bitmap,
|
||||
values,
|
||||
&mut self.blob_writer,
|
||||
bitmap_type,
|
||||
);
|
||||
let metadata = single_writer.write().await?;
|
||||
|
||||
@@ -102,7 +100,6 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexBlobWriter<W> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::stream;
|
||||
use greptime_proto::v1::index::BitmapType;
|
||||
|
||||
use super::*;
|
||||
use crate::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
|
||||
@@ -135,44 +132,24 @@ mod tests {
|
||||
writer
|
||||
.add_index(
|
||||
"tag0".to_string(),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((
|
||||
Bytes::from("a"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("b"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("c"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
BitmapType::Roaring,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
.add_index(
|
||||
"tag1".to_string(),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((
|
||||
Bytes::from("x"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("y"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("z"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
BitmapType::Roaring,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -204,31 +181,22 @@ mod tests {
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// tag1
|
||||
let tag1 = metadata.metas.get("tag1").unwrap();
|
||||
@@ -247,30 +215,21 @@ mod tests {
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,12 +12,12 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_base::BitVec;
|
||||
use fst::MapBuilder;
|
||||
use futures::{AsyncWrite, AsyncWriteExt, Stream, StreamExt};
|
||||
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexStats};
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::bitmap::{Bitmap, BitmapType};
|
||||
use crate::inverted_index::error::{FstCompileSnafu, FstInsertSnafu, Result, WriteSnafu};
|
||||
use crate::Bytes;
|
||||
|
||||
@@ -27,7 +27,7 @@ pub struct SingleIndexWriter<W, S> {
|
||||
blob_writer: W,
|
||||
|
||||
/// The null bitmap to be written
|
||||
null_bitmap: Bitmap,
|
||||
null_bitmap: BitVec,
|
||||
|
||||
/// The stream of values to be written, yielded lexicographically
|
||||
values: S,
|
||||
@@ -37,40 +37,30 @@ pub struct SingleIndexWriter<W, S> {
|
||||
|
||||
/// Metadata about the index
|
||||
meta: InvertedIndexMeta,
|
||||
|
||||
/// The type of bitmap to use
|
||||
bitmap_type: BitmapType,
|
||||
|
||||
/// Buffer for writing the blob
|
||||
buf: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<W, S> SingleIndexWriter<W, S>
|
||||
where
|
||||
W: AsyncWrite + Send + Unpin,
|
||||
S: Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin,
|
||||
S: Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin,
|
||||
{
|
||||
/// Constructs a new `SingleIndexWriter`
|
||||
pub fn new(
|
||||
name: String,
|
||||
base_offset: u64,
|
||||
null_bitmap: Bitmap,
|
||||
null_bitmap: BitVec,
|
||||
values: S,
|
||||
blob_writer: W,
|
||||
bitmap_type: BitmapType,
|
||||
) -> SingleIndexWriter<W, S> {
|
||||
SingleIndexWriter {
|
||||
blob_writer,
|
||||
null_bitmap,
|
||||
values,
|
||||
fst: MapBuilder::memory(),
|
||||
bitmap_type,
|
||||
buf: Vec::new(),
|
||||
meta: InvertedIndexMeta {
|
||||
name,
|
||||
base_offset,
|
||||
stats: Some(InvertedIndexStats::default()),
|
||||
bitmap_type: bitmap_type.into(),
|
||||
..Default::default()
|
||||
},
|
||||
}
|
||||
@@ -90,17 +80,14 @@ where
|
||||
|
||||
/// Writes the null bitmap to the blob and updates the metadata accordingly
|
||||
async fn write_null_bitmap(&mut self) -> Result<()> {
|
||||
self.buf.clear();
|
||||
self.null_bitmap
|
||||
.serialize_into(self.bitmap_type, &mut self.buf)
|
||||
.expect("Write to vec should not fail");
|
||||
let null_bitmap_bytes = self.null_bitmap.as_raw_slice();
|
||||
self.blob_writer
|
||||
.write_all(&self.buf)
|
||||
.write_all(null_bitmap_bytes)
|
||||
.await
|
||||
.context(WriteSnafu)?;
|
||||
|
||||
self.meta.relative_null_bitmap_offset = self.meta.inverted_index_size as _;
|
||||
self.meta.null_bitmap_size = self.buf.len() as _;
|
||||
self.meta.null_bitmap_size = null_bitmap_bytes.len() as _;
|
||||
self.meta.inverted_index_size += self.meta.null_bitmap_size as u64;
|
||||
|
||||
// update stats
|
||||
@@ -113,18 +100,15 @@ where
|
||||
}
|
||||
|
||||
/// Appends a value and its bitmap to the blob, updates the FST, and the metadata
|
||||
async fn append_value(&mut self, value: Bytes, bitmap: Bitmap) -> Result<()> {
|
||||
self.buf.clear();
|
||||
bitmap
|
||||
.serialize_into(self.bitmap_type, &mut self.buf)
|
||||
.expect("Write to vec should not fail");
|
||||
async fn append_value(&mut self, value: Bytes, bitmap: BitVec) -> Result<()> {
|
||||
let bitmap_bytes = bitmap.into_vec();
|
||||
self.blob_writer
|
||||
.write_all(&self.buf)
|
||||
.write_all(&bitmap_bytes)
|
||||
.await
|
||||
.context(WriteSnafu)?;
|
||||
|
||||
let offset = self.meta.inverted_index_size as u32;
|
||||
let size = self.buf.len() as u32;
|
||||
let size = bitmap_bytes.len() as u32;
|
||||
self.meta.inverted_index_size += size as u64;
|
||||
|
||||
let packed = bytemuck::cast::<[u32; 2], u64>([offset, size]);
|
||||
@@ -173,10 +157,9 @@ mod tests {
|
||||
let writer = SingleIndexWriter::new(
|
||||
"test".to_string(),
|
||||
0,
|
||||
Bitmap::new_roaring(),
|
||||
BitVec::new(),
|
||||
stream::empty(),
|
||||
&mut blob,
|
||||
BitmapType::Roaring,
|
||||
);
|
||||
|
||||
let meta = writer.write().await.unwrap();
|
||||
@@ -191,23 +174,13 @@ mod tests {
|
||||
let writer = SingleIndexWriter::new(
|
||||
"test".to_string(),
|
||||
0,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
stream::iter(vec![
|
||||
Ok((
|
||||
Bytes::from("a"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("b"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("c"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
|
||||
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
]),
|
||||
&mut blob,
|
||||
BitmapType::Roaring,
|
||||
);
|
||||
let meta = writer.write().await.unwrap();
|
||||
|
||||
@@ -226,23 +199,13 @@ mod tests {
|
||||
let writer = SingleIndexWriter::new(
|
||||
"test".to_string(),
|
||||
0,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
stream::iter(vec![
|
||||
Ok((
|
||||
Bytes::from("b"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("a"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("c"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
|
||||
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
]),
|
||||
&mut blob,
|
||||
BitmapType::Roaring,
|
||||
);
|
||||
let res = writer.write().await;
|
||||
assert!(matches!(res, Err(Error::FstInsert { .. })));
|
||||
|
||||
@@ -12,9 +12,9 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
|
||||
use common_base::BitVec;
|
||||
use greptime_proto::v1::index::InvertedIndexMeta;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
|
||||
@@ -36,7 +36,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
pub async fn map_values_vec(
|
||||
&mut self,
|
||||
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
|
||||
) -> Result<Vec<Bitmap>> {
|
||||
) -> Result<Vec<BitVec>> {
|
||||
let groups = value_and_meta_vec
|
||||
.iter()
|
||||
.map(|(values, _)| values.len())
|
||||
@@ -50,17 +50,15 @@ impl<'a> ParallelFstValuesMapper<'a> {
// bitmap offset and the lower 32 bits represent its size. This mapper uses these
// combined offset-size pairs to fetch and union multiple bitmaps into a single `BitVec`.
let [relative_offset, size] = bytemuck::cast::<u64, [u32; 2]>(*value);
let range = meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64;
fetch_ranges.push((
range,
BitmapType::try_from(meta.bitmap_type).unwrap_or(BitmapType::BitVec),
));
fetch_ranges.push(
meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64,
);
}
}

if fetch_ranges.is_empty() {
return Ok(vec![Bitmap::new_bitvec()]);
return Ok(vec![BitVec::new()]);
}

common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
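Illustrative sketch, not part of the diff: the FST value consulted above is the same packed `u64` that `SingleIndexWriter::append_value` produces, with the bitmap offset in one half and its size in the other, so the two `bytemuck` casts are exact inverses.

```rust
// Sketch only: round-trips the offset/size packing used by the writer and this mapper.
fn main() {
    let offset: u32 = 4096; // hypothetical relative bitmap offset
    let size: u32 = 26;     // hypothetical bitmap size in bytes

    // Writer side: pack [offset, size] into the u64 stored as the FST value.
    let packed = bytemuck::cast::<[u32; 2], u64>([offset, size]);

    // Reader side: split the FST value back into offset and size.
    let [relative_offset, bitmap_size] = bytemuck::cast::<u64, [u32; 2]>(packed);
    assert_eq!((relative_offset, bitmap_size), (offset, size));
}
```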
@@ -68,10 +66,14 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
let mut output = Vec::with_capacity(groups.len());
|
||||
|
||||
for counter in groups {
|
||||
let mut bitmap = Bitmap::new_roaring();
|
||||
let mut bitmap = BitVec::new();
|
||||
for _ in 0..counter {
|
||||
let bm = bitmaps.pop_front().unwrap();
|
||||
bitmap.union(bm);
|
||||
if bm.len() > bitmap.len() {
|
||||
bitmap = bm | bitmap
|
||||
} else {
|
||||
bitmap |= bm
|
||||
}
|
||||
}
|
||||
|
||||
output.push(bitmap);
|
||||
@@ -85,6 +87,8 @@ impl<'a> ParallelFstValuesMapper<'a> {
|
||||
mod tests {
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use common_base::bit_vec::prelude::*;
|
||||
|
||||
use super::*;
|
||||
use crate::inverted_index::format::reader::MockInvertedIndexReader;
|
||||
|
||||
@@ -97,26 +101,19 @@ mod tests {
|
||||
let mut mock_reader = MockInvertedIndexReader::new();
|
||||
mock_reader.expect_bitmap_deque().returning(|ranges| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
for range in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
|
||||
}
|
||||
match (offset, size) {
|
||||
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]),
|
||||
(2, 1) => output.push_back(bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
Ok(output)
|
||||
});
|
||||
|
||||
let meta = InvertedIndexMeta {
|
||||
bitmap_type: BitmapType::Roaring.into(),
|
||||
..Default::default()
|
||||
};
|
||||
let meta = InvertedIndexMeta::default();
|
||||
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
|
||||
|
||||
let result = values_mapper
|
||||
@@ -129,50 +126,33 @@ mod tests {
|
||||
.map_values_vec(&[(vec![value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
result[0],
|
||||
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
result[0],
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
result[0],
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
result[0],
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
result[0],
|
||||
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(
|
||||
result[1],
|
||||
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
|
||||
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
|
||||
|
||||
let result = values_mapper
|
||||
.map_values_vec(&[
|
||||
(vec![value(2, 1), value(1, 1)], &meta),
|
||||
@@ -180,13 +160,7 @@ mod tests {
|
||||
])
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
result[0],
|
||||
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(
|
||||
result[1],
|
||||
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
|
||||
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,17 +15,17 @@
|
||||
mod predicates_apply;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::BitVec;
|
||||
pub use predicates_apply::PredicatesIndexApplier;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Result;
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
|
||||
/// The output of an apply operation.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct ApplyOutput {
|
||||
/// Bitmap of indices that match the predicates.
|
||||
pub matched_segment_ids: Bitmap,
|
||||
pub matched_segment_ids: BitVec,
|
||||
|
||||
/// The total number of rows in the index.
|
||||
pub total_row_count: usize,
|
||||
|
||||
@@ -15,9 +15,9 @@
|
||||
use std::mem::size_of;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::BitVec;
|
||||
use greptime_proto::v1::index::InvertedIndexMetas;
|
||||
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
|
||||
use crate::inverted_index::format::reader::InvertedIndexReader;
|
||||
use crate::inverted_index::search::fst_apply::{
|
||||
@@ -50,11 +50,12 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
) -> Result<ApplyOutput> {
|
||||
let metadata = reader.metadata().await?;
|
||||
let mut output = ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
matched_segment_ids: BitVec::EMPTY,
|
||||
total_row_count: metadata.total_row_count as _,
|
||||
segment_row_count: metadata.segment_row_count as _,
|
||||
};
|
||||
|
||||
let mut bitmap = Self::bitmap_full_range(&metadata);
|
||||
// TODO(zhongzc): optimize the order of applying to make it quicker to return empty.
|
||||
let mut appliers = Vec::with_capacity(self.fst_appliers.len());
|
||||
let mut fst_ranges = Vec::with_capacity(self.fst_appliers.len());
|
||||
@@ -80,7 +81,7 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
}
|
||||
|
||||
if fst_ranges.is_empty() {
|
||||
output.matched_segment_ids = Self::bitmap_full_range(&metadata);
|
||||
output.matched_segment_ids = bitmap;
|
||||
return Ok(output);
|
||||
}
|
||||
|
||||
@@ -92,15 +93,14 @@ impl IndexApplier for PredicatesIndexApplier {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut mapper = ParallelFstValuesMapper::new(reader);
|
||||
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
|
||||
let bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
|
||||
|
||||
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
|
||||
for bm in bm_vec {
|
||||
if bm.count_ones() == 0 {
|
||||
if bitmap.count_ones() == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
bitmap.intersect(bm);
|
||||
bitmap &= bm;
|
||||
}
|
||||
|
||||
output.matched_segment_ids = bitmap;
|
||||
@@ -146,12 +146,12 @@ impl PredicatesIndexApplier {
|
||||
Ok(PredicatesIndexApplier { fst_appliers })
|
||||
}
|
||||
|
||||
/// Creates a `Bitmap` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> Bitmap {
/// Creates a `BitVec` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> BitVec {
let total_count = metadata.total_row_count;
let segment_count = metadata.segment_row_count;
let len = total_count.div_ceil(segment_count);
Bitmap::full_bitvec(len as _)
BitVec::repeat(true, len as _)
}
}

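A quick worked example of the full-range bitmap, not part of the diff: with, say, 100 total rows and a segment size of 16, `div_ceil` gives 7 segments, so the initial candidate set is seven `true` bits.

```rust
// Sketch only; assumes common_base::BitVec is bitvec's BitVec<u8, Lsb0>.
use bitvec::prelude::*;

fn main() {
    let total_row_count: u64 = 100; // hypothetical values
    let segment_row_count: u64 = 16;

    let len = total_row_count.div_ceil(segment_row_count); // ceil(100 / 16) = 7
    let full_range = BitVec::<u8, Lsb0>::repeat(true, len as usize);

    assert_eq!(full_range.len(), 7);
    assert!(full_range.all()); // every segment starts out as a match candidate
}
```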
@@ -167,10 +167,10 @@ mod tests {
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::Arc;
|
||||
|
||||
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
|
||||
use common_base::bit_vec::prelude::*;
|
||||
use greptime_proto::v1::index::InvertedIndexMeta;
|
||||
|
||||
use super::*;
|
||||
use crate::bitmap::Bitmap;
|
||||
use crate::inverted_index::error::Error;
|
||||
use crate::inverted_index::format::reader::MockInvertedIndexReader;
|
||||
use crate::inverted_index::search::fst_apply::MockFstApplier;
|
||||
@@ -190,7 +190,6 @@ mod tests {
|
||||
let meta = InvertedIndexMeta {
|
||||
name: s(tag),
|
||||
relative_fst_offset: idx,
|
||||
bitmap_type: BitmapType::Roaring.into(),
|
||||
..Default::default()
|
||||
};
|
||||
metas.metas.insert(s(tag), meta);
|
||||
@@ -230,16 +229,10 @@ mod tests {
|
||||
.unwrap()])
|
||||
});
|
||||
|
||||
mock_reader.expect_bitmap_deque().returning(|arg| {
|
||||
assert_eq!(arg.len(), 1);
|
||||
let range = &arg[0].0;
|
||||
let bitmap_type = arg[0].1;
|
||||
assert_eq!(*range, 2..3);
|
||||
assert_eq!(bitmap_type, BitmapType::Roaring);
|
||||
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
|
||||
&[0b10101010],
|
||||
bitmap_type,
|
||||
)]))
|
||||
mock_reader.expect_bitmap_deque().returning(|range| {
|
||||
assert_eq!(range.len(), 1);
|
||||
assert_eq!(range[0], 2..3);
|
||||
Ok(VecDeque::from([bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]]))
|
||||
});
|
||||
let output = applier
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
@@ -247,7 +240,7 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
output.matched_segment_ids,
|
||||
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
|
||||
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]
|
||||
);
|
||||
|
||||
// An index reader with a single tag "tag-0" but without value "tag-0_value-0"
|
||||
@@ -299,16 +292,12 @@ mod tests {
|
||||
});
|
||||
mock_reader.expect_bitmap_deque().returning(|ranges| {
|
||||
let mut output = VecDeque::new();
|
||||
for (range, bitmap_type) in ranges {
|
||||
for range in ranges {
|
||||
let offset = range.start;
|
||||
let size = range.end - range.start;
|
||||
match (offset, size, bitmap_type) {
|
||||
(1, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
|
||||
}
|
||||
(2, 1, BitmapType::Roaring) => {
|
||||
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
|
||||
}
|
||||
match (offset, size) {
|
||||
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]),
|
||||
(2, 1) => output.push_back(bitvec![u8, Lsb0; 1, 1, 0, 1, 1, 0, 1, 1]),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
@@ -322,7 +311,7 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
output.matched_segment_ids,
|
||||
Bitmap::from_lsb0_bytes(&[0b10001010], BitmapType::Roaring)
|
||||
bitvec![u8, Lsb0; 1, 0, 0, 0, 1, 0, 1, 0]
|
||||
);
|
||||
}
|
||||
|
||||
@@ -341,7 +330,10 @@ mod tests {
|
||||
.apply(SearchContext::default(), &mut mock_reader)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
|
||||
assert_eq!(
|
||||
output.matched_segment_ids,
|
||||
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
); // full range to scan
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -413,7 +405,10 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8));
|
||||
assert_eq!(
|
||||
output.matched_segment_ids,
|
||||
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#![feature(iter_partition_in_place)]
|
||||
#![feature(assert_matches)]
|
||||
|
||||
pub mod bitmap;
|
||||
pub mod bloom_filter;
|
||||
pub mod error;
|
||||
pub mod external_provider;
|
||||
|
||||
23
src/ingester/Cargo.toml
Normal file
@@ -0,0 +1,23 @@
[package]
name = "ingester"
version.workspace = true
edition.workspace = true
license.workspace = true

[dependencies]
clap.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datanode.workspace = true
meta-client.workspace = true
mito2.workspace = true
object-store.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
sst-convert.workspace = true
tokio.workspace = true
toml.workspace = true

[lints]
workspace = true
294
src/ingester/src/main.rs
Normal file
@@ -0,0 +1,294 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use clap::Parser;
|
||||
use common_telemetry::info;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use datanode::config::StorageConfig;
|
||||
use meta_client::MetaClientOptions;
|
||||
use mito2::config::MitoConfig;
|
||||
use mito2::sst::file::IndexType;
|
||||
use mito2::sst::parquet::SstInfo;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sst_convert::converter::{InputFile, InputFileType, SstConverterBuilder};
|
||||
use tokio::sync::oneshot;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about = "Greptime Ingester", long_about = None)]
|
||||
struct Args {
|
||||
/// Input directory
|
||||
#[arg(short, long)]
|
||||
input_dir: String,
|
||||
/// Directory of input parquet files, relative to input_dir
|
||||
#[arg(short, long)]
|
||||
parquet_dir: Option<String>,
|
||||
/// Directory of input json files, relative to input_dir
|
||||
#[arg(short, long)]
|
||||
remote_write_dir: Option<String>,
|
||||
/// Config file
|
||||
#[arg(short, long)]
|
||||
cfg: String,
|
||||
/// DB HTTP address
|
||||
#[arg(short, long)]
|
||||
db_http_addr: String,
|
||||
|
||||
/// Output path for the converted SST files.
|
||||
/// If it is not None, the converted SST files will be written to the specified path
|
||||
/// in the `input_store`.
|
||||
/// This is for debugging purposes.
|
||||
#[arg(short, long)]
|
||||
sst_output_path: Option<String>,
|
||||
}
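A hypothetical invocation derived only from the `Args` definition above (every path and the address are placeholders, and the long flag names assume clap's default kebab-case derivation): `ingester --input-dir /data/import --parquet-dir parquet --remote-write-dir remote_write --cfg ingester.toml --db-http-addr http://127.0.0.1:4000/<ingest-endpoint> --sst-output-path debug_output`. Input files are resolved from `parquet_dir` and `remote_write_dir` relative to `input_dir`, as the field docs state.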
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
struct IngesterConfig {
|
||||
meta_client: MetaClientOptions,
|
||||
storage: StorageConfig,
|
||||
mito: MitoConfig,
|
||||
}
|
||||
|
||||
pub const APP_NAME: &str = "greptime-ingester";
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let _guard = common_telemetry::init_global_logging(
|
||||
APP_NAME,
|
||||
&Default::default(),
|
||||
&Default::default(),
|
||||
None,
|
||||
);
|
||||
|
||||
let args = Args::parse();
|
||||
|
||||
let cfg_file = std::fs::read_to_string(&args.cfg).expect("Failed to read config file");
|
||||
let cfg: IngesterConfig = toml::from_str(&cfg_file).expect("Failed to parse config");
|
||||
|
||||
let sst_builder = {
|
||||
let mut builder = SstConverterBuilder::new_fs(args.input_dir)
|
||||
.with_meta_options(cfg.meta_client)
|
||||
.with_storage_config(cfg.storage)
|
||||
.with_config(cfg.mito);
|
||||
|
||||
if let Some(output_path) = args.sst_output_path {
|
||||
builder = builder.with_output_path(output_path);
|
||||
}
|
||||
|
||||
builder
|
||||
};
|
||||
|
||||
let sst_converter = sst_builder
|
||||
.clone()
|
||||
.build()
|
||||
.await
|
||||
.expect("Failed to build sst converter");
|
||||
|
||||
let input_store = sst_converter.input_store.clone();
|
||||
|
||||
if let Some(parquet_dir) = args.parquet_dir {
|
||||
// using opendal to read parquet files in given input object store
|
||||
let all_parquets = input_store
|
||||
.list(&parquet_dir)
|
||||
.await
|
||||
.expect("Failed to list parquet files");
|
||||
info!("Listed all files in parquet directory: {:?}", all_parquets);
|
||||
let all_parquets = all_parquets
|
||||
.iter()
|
||||
.filter(|parquet| parquet.name().ends_with(".parquet") && parquet.metadata().is_file())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let input_files = all_parquets
|
||||
.iter()
|
||||
.map(|parquet| {
|
||||
let full_table_name = parquet.name().split("-").next().unwrap();
|
||||
let (catalog_name, schema_name, table_name) = extract_name(full_table_name);
|
||||
|
||||
info!(
|
||||
"catalog: {}, schema: {}, table: {}",
|
||||
catalog_name, schema_name, table_name
|
||||
);
|
||||
|
||||
InputFile {
|
||||
catalog: catalog_name.to_string(),
|
||||
schema: schema_name.to_string(),
|
||||
table: table_name.to_string(),
|
||||
path: parquet.path().to_string(),
|
||||
file_type: InputFileType::Parquet,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
convert_and_send(&input_files, sst_builder.clone(), &args.db_http_addr).await;
|
||||
}
|
||||
|
||||
if let Some(remote_write_dir) = args.remote_write_dir {
|
||||
// using opendal to read parquet files in given input object store
|
||||
let all_parquets = input_store
|
||||
.list(&remote_write_dir)
|
||||
.await
|
||||
.expect("Failed to list parquet files");
|
||||
|
||||
let all_parquets = all_parquets
|
||||
.iter()
|
||||
.filter(|parquet| parquet.name().ends_with(".parquet") && parquet.metadata().is_file())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let input_files = all_parquets
|
||||
.iter()
|
||||
.map(|parquet| {
|
||||
let full_table_name = parquet.name().split("-").next().unwrap();
|
||||
let (catalog_name, schema_name, table_name) = extract_name(full_table_name);
|
||||
|
||||
info!(
|
||||
"catalog: {}, schema: {}, table: {}",
|
||||
catalog_name, schema_name, table_name
|
||||
);
|
||||
InputFile {
|
||||
catalog: catalog_name.to_string(),
|
||||
schema: schema_name.to_string(),
|
||||
table: table_name.to_string(),
|
||||
path: parquet.path().to_string(),
|
||||
file_type: InputFileType::RemoteWrite,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
convert_and_send(&input_files, sst_builder.clone(), &args.db_http_addr).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn convert_and_send(
|
||||
input_files: &[InputFile],
|
||||
sst_builder: SstConverterBuilder,
|
||||
db_http_addr: &str,
|
||||
) {
|
||||
let table_names = input_files
|
||||
.iter()
|
||||
.map(|f| (f.schema.clone(), f.table.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
let mut rxs = Vec::new();
|
||||
|
||||
// Spawn a task for each input file
|
||||
info!("Spawning tasks for {} input files", input_files.len());
|
||||
for input_file in input_files.iter() {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
let sst_builder = sst_builder.clone();
|
||||
let input_file = (*input_file).clone();
|
||||
tokio::task::spawn(async move {
|
||||
let mut sst_converter = sst_builder
|
||||
.build()
|
||||
.await
|
||||
.expect("Failed to build sst converter");
|
||||
let sst_info = sst_converter
|
||||
.convert_one(&input_file)
|
||||
.await
|
||||
.expect("Failed to convert parquet files");
|
||||
tx.send(sst_info).unwrap();
|
||||
});
|
||||
rxs.push(rx);
|
||||
}
|
||||
|
||||
let mut sst_infos = Vec::new();
|
||||
for rx in rxs {
|
||||
sst_infos.push(rx.await.unwrap());
|
||||
}
|
||||
|
||||
info!("Converted {} input files", sst_infos.len());
|
||||
|
||||
let ingest_reqs = table_names
|
||||
.iter()
|
||||
.zip(sst_infos.iter())
|
||||
.flat_map(|(schema_name, sst_info)| {
|
||||
sst_info
|
||||
.ssts
|
||||
.iter()
|
||||
.map(|sst| to_ingest_sst_req(&schema_name.0, &schema_name.1, sst))
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// send ingest requests to DB
|
||||
send_ingest_requests(db_http_addr, ingest_reqs)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn extract_name(full_table_name: &str) -> (String, String, String) {
let mut names = full_table_name.split('.').rev();
let table_name = names.next().unwrap();
let schema_name = names.next().unwrap_or("public");
let catalog_name = names.next().unwrap_or("greptime");
(
catalog_name.to_string(),
schema_name.to_string(),
table_name.to_string(),
)
}

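A small behavioral sketch of `extract_name`, not part of the diff: the rightmost dot-separated component is the table, and missing schema or catalog components fall back to `public` and `greptime`. The table name used here is purely illustrative.

```rust
// Sketch only: mirrors the fallback logic of extract_name above.
fn extract_name(full_table_name: &str) -> (String, String, String) {
    let mut names = full_table_name.split('.').rev();
    let table_name = names.next().unwrap();
    let schema_name = names.next().unwrap_or("public");
    let catalog_name = names.next().unwrap_or("greptime");
    (catalog_name.to_string(), schema_name.to_string(), table_name.to_string())
}

fn main() {
    let (catalog, schema, table) = extract_name("greptime.public.http_logs");
    assert_eq!((catalog.as_str(), schema.as_str(), table.as_str()),
               ("greptime", "public", "http_logs"));

    let (catalog, schema, table) = extract_name("http_logs");
    assert_eq!((catalog.as_str(), schema.as_str(), table.as_str()),
               ("greptime", "public", "http_logs")); // defaults applied
}
```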
async fn send_ingest_requests(
|
||||
addr: &str,
|
||||
reqs: Vec<ClientIngestSstRequest>,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let client = reqwest::Client::new();
|
||||
for req in reqs {
|
||||
info!("ingesting sst: {req:?}");
|
||||
let req = client.post(addr).json(&req);
|
||||
let resp = req.send().await?;
|
||||
info!("ingest response: {resp:?}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub(crate) struct ClientIngestSstRequest {
|
||||
schema: Option<String>,
|
||||
table: String,
|
||||
pub(crate) file_id: String,
|
||||
pub(crate) min_ts: i64,
|
||||
pub(crate) max_ts: i64,
|
||||
pub(crate) file_size: u64,
|
||||
pub(crate) rows: u32,
|
||||
pub(crate) row_groups: u32,
|
||||
/// Available indexes of the file.
|
||||
pub available_indexes: Vec<IndexType>,
|
||||
/// Size of the index file.
|
||||
pub index_file_size: u64,
|
||||
pub time_unit: u32,
|
||||
}
|
||||
|
||||
fn to_ingest_sst_req(
|
||||
schema_name: &str,
|
||||
table_name: &str,
|
||||
sst_info: &SstInfo,
|
||||
) -> ClientIngestSstRequest {
|
||||
let index_file_size = sst_info.index_metadata.file_size;
|
||||
let available_indexes = sst_info.index_metadata.build_available_indexes();
|
||||
ClientIngestSstRequest {
|
||||
schema: Some(schema_name.to_string()),
|
||||
table: table_name.to_string(),
|
||||
file_id: sst_info.file_id.to_string(),
|
||||
min_ts: sst_info.time_range.0.value(),
|
||||
max_ts: sst_info.time_range.1.value(),
|
||||
file_size: sst_info.file_size,
|
||||
rows: sst_info.num_rows as _,
|
||||
row_groups: sst_info.num_row_groups as _,
|
||||
available_indexes: available_indexes.to_vec(),
|
||||
index_file_size,
|
||||
time_unit: match sst_info.time_range.0.unit() {
|
||||
TimeUnit::Second => 0,
|
||||
TimeUnit::Millisecond => 3,
|
||||
TimeUnit::Microsecond => 6,
|
||||
TimeUnit::Nanosecond => 9,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -40,17 +40,15 @@ pub enum Error {
|
||||
actual: String,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to start log store task: {}", name))]
|
||||
StartWalTask {
|
||||
name: String,
|
||||
#[snafu(display("Failed to start log store gc task"))]
|
||||
StartGcTask {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
source: RuntimeError,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to stop log store task: {}", name))]
|
||||
StopWalTask {
|
||||
name: String,
|
||||
#[snafu(display("Failed to stop log store gc task"))]
|
||||
StopGcTask {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
source: RuntimeError,
|
||||
|
||||
@@ -35,7 +35,7 @@ use common_runtime::RepeatedTask;
|
||||
use raft_engine::{Config, Engine, LogBatch, ReadableSize, RecoveryMode};
|
||||
use snafu::{IntoError, ResultExt};
|
||||
|
||||
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartWalTaskSnafu};
|
||||
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartGcTaskSnafu};
|
||||
use crate::raft_engine::log_store::PurgeExpiredFilesFunction;
|
||||
|
||||
pub(crate) const SYSTEM_NAMESPACE: u64 = 0;
|
||||
@@ -93,8 +93,7 @@ impl RaftEngineBackend {
|
||||
);
|
||||
gc_task
|
||||
.start(common_runtime::global_runtime())
|
||||
.context(StartWalTaskSnafu { name: "gc_task" })?;
|
||||
|
||||
.context(StartGcTaskSnafu)?;
|
||||
Ok(Self {
|
||||
engine: RwLock::new(engine),
|
||||
_gc_task: gc_task,
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::sync::atomic::{AtomicI64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -31,7 +32,7 @@ use store_api::storage::RegionId;
|
||||
use crate::error::{
|
||||
AddEntryLogBatchSnafu, DiscontinuousLogIndexSnafu, Error, FetchEntrySnafu,
|
||||
IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, OverrideCompactedEntrySnafu,
|
||||
RaftEngineSnafu, Result, StartWalTaskSnafu, StopWalTaskSnafu,
|
||||
RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu,
|
||||
};
|
||||
use crate::metrics;
|
||||
use crate::raft_engine::backend::SYSTEM_NAMESPACE;
|
||||
@@ -45,7 +46,7 @@ pub struct RaftEngineLogStore {
|
||||
read_batch_size: usize,
|
||||
engine: Arc<Engine>,
|
||||
gc_task: RepeatedTask<Error>,
|
||||
sync_task: RepeatedTask<Error>,
|
||||
last_sync_time: AtomicI64,
|
||||
}
|
||||
|
||||
pub struct PurgeExpiredFilesFunction {
|
||||
@@ -82,31 +83,6 @@ impl TaskFunction<Error> for PurgeExpiredFilesFunction {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SyncWalTaskFunction {
|
||||
engine: Arc<Engine>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl TaskFunction<Error> for SyncWalTaskFunction {
|
||||
async fn call(&mut self) -> std::result::Result<(), Error> {
|
||||
let engine = self.engine.clone();
|
||||
if let Err(e) = tokio::task::spawn_blocking(move || engine.sync()).await {
|
||||
error!(e; "Failed to sync raft engine log files");
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SyncWalTaskFunction"
|
||||
}
|
||||
}
|
||||
|
||||
impl SyncWalTaskFunction {
|
||||
pub fn new(engine: Arc<Engine>) -> Self {
|
||||
Self { engine }
|
||||
}
|
||||
}
|
||||
|
||||
impl RaftEngineLogStore {
|
||||
pub async fn try_new(dir: String, config: &RaftEngineConfig) -> Result<Self> {
|
||||
let raft_engine_config = Config {
|
||||
@@ -128,18 +104,13 @@ impl RaftEngineLogStore {
|
||||
}),
|
||||
);
|
||||
|
||||
let sync_task = RepeatedTask::new(
|
||||
config.sync_period.unwrap_or(Duration::from_secs(5)),
|
||||
Box::new(SyncWalTaskFunction::new(engine.clone())),
|
||||
);
|
||||
|
||||
let log_store = Self {
|
||||
sync_write: config.sync_write,
|
||||
sync_period: config.sync_period,
|
||||
read_batch_size: config.read_batch_size,
|
||||
engine,
|
||||
gc_task,
|
||||
sync_task,
|
||||
last_sync_time: AtomicI64::new(0),
|
||||
};
|
||||
log_store.start()?;
|
||||
Ok(log_store)
|
||||
@@ -152,10 +123,7 @@ impl RaftEngineLogStore {
|
||||
fn start(&self) -> Result<()> {
|
||||
self.gc_task
|
||||
.start(common_runtime::global_runtime())
|
||||
.context(StartWalTaskSnafu { name: "gc_task" })?;
|
||||
self.sync_task
|
||||
.start(common_runtime::global_runtime())
|
||||
.context(StartWalTaskSnafu { name: "sync_task" })
|
||||
.context(StartGcTaskSnafu)
|
||||
}
|
||||
|
||||
fn span(&self, provider: &RaftEngineProvider) -> (Option<u64>, Option<u64>) {
|
||||
@@ -252,14 +220,7 @@ impl LogStore for RaftEngineLogStore {
|
||||
type Error = Error;
|
||||
|
||||
async fn stop(&self) -> Result<()> {
|
||||
self.gc_task
|
||||
.stop()
|
||||
.await
|
||||
.context(StopWalTaskSnafu { name: "gc_task" })?;
|
||||
self.sync_task
|
||||
.stop()
|
||||
.await
|
||||
.context(StopWalTaskSnafu { name: "sync_task" })
|
||||
self.gc_task.stop().await.context(StopGcTaskSnafu)
|
||||
}
|
||||
|
||||
/// Appends a batch of entries to logstore. `RaftEngineLogStore` assures the atomicity of
|
||||
@@ -279,9 +240,20 @@ impl LogStore for RaftEngineLogStore {
}

let (mut batch, last_entry_ids) = self.entries_to_batch(entries)?;

let mut sync = self.sync_write;

if let Some(sync_period) = &self.sync_period {
let now = common_time::util::current_time_millis();
if now - self.last_sync_time.load(Ordering::Relaxed) >= sync_period.as_millis() as i64 {
self.last_sync_time.store(now, Ordering::Relaxed);
sync = true;
}
}

let _ = self
.engine
.write(&mut batch, self.sync_write)
.write(&mut batch, sync)
.context(RaftEngineSnafu)?;

Ok(AppendBatchResponse { last_entry_ids })

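The hunk above only forces an fsync when the configured sync period has elapsed since the last sync, instead of passing `sync_write` unconditionally for every batch. A minimal standalone sketch of that decision, using std time in place of `common_time::util::current_time_millis()` (the atomic holds the last sync time in milliseconds, as in the log store):

use std::sync::atomic::{AtomicI64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Returns true when a write should also fsync, mirroring the elapsed-period
/// check in the hunk above. `last_sync_ms` stores the last sync time in ms.
fn should_sync(sync_write: bool, sync_period: Option<Duration>, last_sync_ms: &AtomicI64) -> bool {
    let mut sync = sync_write;
    if let Some(period) = sync_period {
        // std time stands in for common_time::util::current_time_millis().
        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as i64;
        if now - last_sync_ms.load(Ordering::Relaxed) >= period.as_millis() as i64 {
            last_sync_ms.store(now, Ordering::Relaxed);
            sync = true;
        }
    }
    sync
}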
@@ -111,7 +111,6 @@ impl MetaClientBuilder {
|
||||
.enable_store()
|
||||
.enable_heartbeat()
|
||||
.enable_procedure()
|
||||
.enable_access_cluster_info()
|
||||
}
|
||||
|
||||
pub fn enable_heartbeat(self) -> Self {
|
||||
|
||||
@@ -7,7 +7,6 @@ license.workspace = true
|
||||
[features]
|
||||
mock = []
|
||||
pg_kvbackend = ["dep:tokio-postgres", "common-meta/pg_kvbackend"]
|
||||
mysql_kvbackend = [] # placeholder features so CI can compile
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
@@ -335,10 +335,6 @@ impl MetricEngine {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mito(&self) -> MitoEngine {
|
||||
self.inner.mito.clone()
|
||||
}
|
||||
|
||||
pub async fn logical_regions(&self, physical_region_id: RegionId) -> Result<Vec<RegionId>> {
|
||||
self.inner
|
||||
.metadata_region
|
||||
|
||||
@@ -59,7 +59,7 @@ pub mod engine;
|
||||
pub mod error;
|
||||
mod metadata_region;
|
||||
mod metrics;
|
||||
mod row_modifier;
|
||||
pub mod row_modifier;
|
||||
#[cfg(test)]
|
||||
mod test_util;
|
||||
mod utils;
|
||||
|
||||
@@ -338,7 +338,6 @@ impl MetadataRegion {
|
||||
limit: None,
|
||||
series_row_selector: None,
|
||||
sequence: None,
|
||||
distribution: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -528,7 +527,6 @@ impl MetadataRegion {
|
||||
limit: None,
|
||||
series_row_selector: None,
|
||||
sequence: None,
|
||||
distribution: None,
|
||||
};
|
||||
let record_batch_stream = self
|
||||
.mito
|
||||
|
||||
@@ -40,7 +40,7 @@ const TSID_HASH_SEED: u32 = 846793005;
|
||||
///
|
||||
/// - For [`PrimaryKeyEncoding::Dense`] encoding,
|
||||
/// it adds two columns(`__table_id`, `__tsid`) to the row.
|
||||
pub struct RowModifier {
|
||||
pub(crate) struct RowModifier {
|
||||
codec: SparsePrimaryKeyCodec,
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ impl RowModifier {
|
||||
}
|
||||
|
||||
/// Modify rows with the given primary key encoding.
|
||||
pub fn modify_rows(
|
||||
pub(crate) fn modify_rows(
|
||||
&self,
|
||||
iter: RowsIter,
|
||||
table_id: TableId,
|
||||
@@ -145,16 +145,14 @@ impl RowModifier {
|
||||
|
||||
/// Fills internal columns of a row with table name and a hash of tag values.
|
||||
fn fill_internal_columns(&self, table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
|
||||
let mut hasher = mur3::Hasher128::with_seed(TSID_HASH_SEED);
|
||||
let mut hasher = TsidGenerator::default();
|
||||
for (name, value) in iter.primary_keys_with_name() {
|
||||
// The type is checked before. So only null is ignored.
|
||||
if let Some(ValueData::StringValue(string)) = &value.value_data {
|
||||
name.hash(&mut hasher);
|
||||
string.hash(&mut hasher);
|
||||
hasher.write_label(name, string);
|
||||
}
|
||||
}
|
||||
// TSID is 64 bits, simply truncate the 128 bits hash
|
||||
let (hash, _) = hasher.finish128();
|
||||
let hash = hasher.finish();
|
||||
|
||||
(
|
||||
ValueData::U32Value(table_id).into(),
|
||||
@@ -163,6 +161,34 @@ impl RowModifier {
}
}

/// Tsid generator.
pub struct TsidGenerator {
hasher: mur3::Hasher128,
}

impl Default for TsidGenerator {
fn default() -> Self {
Self {
hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED),
}
}
}

impl TsidGenerator {
/// Writes a label pair to the generator.
pub fn write_label(&mut self, name: &str, value: &str) {
name.hash(&mut self.hasher);
value.hash(&mut self.hasher);
}

/// Generates a new TSID.
pub fn finish(&mut self) -> u64 {
// TSID is 64 bits, simply truncate the 128 bits hash
let (hash, _) = self.hasher.finish128();
hash
}
}

/// Index of a value.
#[derive(Debug, Clone, Copy)]
struct ValueIndex {

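For context, the `TsidGenerator` added above is a thin wrapper around a seeded murmur3 hasher: label names and values are hashed in order and the 128-bit digest is truncated to 64 bits. A rough standalone equivalent, assuming the `mur3` crate that the hunk itself uses and reusing its `TSID_HASH_SEED` constant:

use std::hash::Hash;

const TSID_HASH_SEED: u32 = 846793005;

/// Hashes sorted (label, value) pairs into a 64-bit TSID,
/// truncating the 128-bit murmur3 digest as the generator above does.
fn tsid_of(labels: &[(&str, &str)]) -> u64 {
    let mut hasher = mur3::Hasher128::with_seed(TSID_HASH_SEED);
    for (name, value) in labels {
        name.hash(&mut hasher);
        value.hash(&mut hasher);
    }
    let (hash, _) = hasher.finish128();
    hash
}

fn main() {
    // Illustrative labels only; real callers feed the primary key columns.
    let tsid = tsid_of(&[("host", "web-01"), ("region", "us-east")]);
    println!("tsid = {tsid}");
}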
@@ -121,7 +121,7 @@ impl AccessLayer {
|
||||
/// Writes a SST with specific `file_id` and `metadata` to the layer.
|
||||
///
|
||||
/// Returns the info of the SST. If no data written, returns None.
|
||||
pub(crate) async fn write_sst(
|
||||
pub async fn write_sst(
|
||||
&self,
|
||||
request: SstWriteRequest,
|
||||
write_opts: &WriteOptions,
|
||||
@@ -191,26 +191,26 @@ impl AccessLayer {
|
||||
|
||||
/// `OperationType` represents the origin of the `SstWriteRequest`.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub(crate) enum OperationType {
|
||||
pub enum OperationType {
|
||||
Flush,
|
||||
Compact,
|
||||
}
|
||||
|
||||
/// Contents to build a SST.
|
||||
pub(crate) struct SstWriteRequest {
|
||||
pub(crate) op_type: OperationType,
|
||||
pub(crate) metadata: RegionMetadataRef,
|
||||
pub(crate) source: Source,
|
||||
pub(crate) cache_manager: CacheManagerRef,
|
||||
pub struct SstWriteRequest {
|
||||
pub op_type: OperationType,
|
||||
pub metadata: RegionMetadataRef,
|
||||
pub source: Source,
|
||||
pub cache_manager: CacheManagerRef,
|
||||
#[allow(dead_code)]
|
||||
pub(crate) storage: Option<String>,
|
||||
pub(crate) max_sequence: Option<SequenceNumber>,
|
||||
pub storage: Option<String>,
|
||||
pub max_sequence: Option<SequenceNumber>,
|
||||
|
||||
/// Configs for index
|
||||
pub(crate) index_options: IndexOptions,
|
||||
pub(crate) inverted_index_config: InvertedIndexConfig,
|
||||
pub(crate) fulltext_index_config: FulltextIndexConfig,
|
||||
pub(crate) bloom_filter_index_config: BloomFilterConfig,
|
||||
pub index_options: IndexOptions,
|
||||
pub inverted_index_config: InvertedIndexConfig,
|
||||
pub fulltext_index_config: FulltextIndexConfig,
|
||||
pub bloom_filter_index_config: BloomFilterConfig,
|
||||
}
|
||||
|
||||
pub(crate) async fn new_fs_cache_store(root: &str) -> Result<ObjectStore> {
|
||||
|
||||
80
src/mito2/src/cache/index/inverted_index.rs
vendored
@@ -127,8 +127,8 @@ impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobRead
|
||||
mod test {
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use common_base::BitVec;
|
||||
use futures::stream;
|
||||
use index::bitmap::{Bitmap, BitmapType};
|
||||
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
|
||||
use index::inverted_index::format::writer::{InvertedIndexBlobWriter, InvertedIndexWriter};
|
||||
use index::Bytes;
|
||||
@@ -191,44 +191,24 @@ mod test {
|
||||
writer
|
||||
.add_index(
|
||||
"tag0".to_string(),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((
|
||||
Bytes::from("a"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("b"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("c"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
index::bitmap::BitmapType::Roaring,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer
|
||||
.add_index(
|
||||
"tag1".to_string(),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
|
||||
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
|
||||
Box::new(stream::iter(vec![
|
||||
Ok((
|
||||
Bytes::from("x"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("y"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((
|
||||
Bytes::from("z"),
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
|
||||
)),
|
||||
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
|
||||
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
|
||||
])),
|
||||
index::bitmap::BitmapType::Roaring,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -287,31 +267,22 @@ mod test {
|
||||
assert_eq!(fst0.len(), 3);
|
||||
let [offset, size] = unpack(fst0.get(b"a").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst0.get(b"b").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst0.get(b"c").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag0.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// tag1
|
||||
let tag1 = metadata.metas.get("tag1").unwrap();
|
||||
@@ -330,31 +301,22 @@ mod test {
|
||||
assert_eq!(fst1.len(), 3);
|
||||
let [offset, size] = unpack(fst1.get(b"x").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
let [offset, size] = unpack(fst1.get(b"y").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
|
||||
let [offset, size] = unpack(fst1.get(b"z").unwrap());
|
||||
let bitmap = cached_reader
|
||||
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
|
||||
.bitmap(tag1.base_offset + offset as u64, size)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
bitmap,
|
||||
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
|
||||
);
|
||||
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
|
||||
|
||||
// fuzz test
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
@@ -46,6 +46,7 @@ const INDEX_CREATE_MEM_THRESHOLD_FACTOR: u64 = 16;
|
||||
pub(crate) const FETCH_OPTION_TIMEOUT: Duration = Duration::from_secs(3);
|
||||
|
||||
/// Configuration for [MitoEngine](crate::engine::MitoEngine).
|
||||
/// Before using the config, make sure to call `MitoConfig::validate()` to check if the config is valid.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
#[serde(default)]
|
||||
pub struct MitoConfig {
|
||||
|
||||
@@ -80,7 +80,6 @@ async fn test_scan_projection() {
|
||||
limit: None,
|
||||
series_row_selector: None,
|
||||
sequence: None,
|
||||
distribution: None,
|
||||
};
|
||||
let stream = engine.scan_to_stream(region_id, request).await.unwrap();
|
||||
let batches = RecordBatches::try_collect(stream).await.unwrap();
|
||||
|
||||
@@ -42,6 +42,13 @@ use crate::worker::WorkerId;
|
||||
#[snafu(visibility(pub))]
|
||||
#[stack_trace_debug]
|
||||
pub enum Error {
|
||||
#[snafu(display("External error, context: {}", context))]
|
||||
External {
|
||||
source: BoxedError,
|
||||
context: String,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
#[snafu(display("Failed to encode sparse primary key, reason: {}", reason))]
|
||||
EncodeSparsePrimaryKey {
|
||||
reason: String,
|
||||
@@ -1085,7 +1092,7 @@ impl ErrorExt for Error {
|
||||
| PuffinPurgeStager { source, .. } => source.status_code(),
|
||||
CleanDir { .. } => StatusCode::Unexpected,
|
||||
InvalidConfig { .. } => StatusCode::InvalidArguments,
|
||||
StaleLogEntry { .. } => StatusCode::Unexpected,
|
||||
StaleLogEntry { .. } | External { .. } => StatusCode::Unexpected,
|
||||
|
||||
FilterRecordBatch { source, .. } => source.status_code(),
|
||||
|
||||
|
||||
@@ -23,8 +23,8 @@
|
||||
#[cfg_attr(feature = "test", allow(unused))]
|
||||
pub mod test_util;
|
||||
|
||||
mod access_layer;
|
||||
mod cache;
|
||||
pub mod access_layer;
|
||||
pub mod cache;
|
||||
pub mod compaction;
|
||||
pub mod config;
|
||||
pub mod engine;
|
||||
|
||||
@@ -21,7 +21,6 @@ use common_time::Timestamp;
|
||||
use parquet::arrow::arrow_reader::RowSelection;
|
||||
use smallvec::{smallvec, SmallVec};
|
||||
use store_api::region_engine::PartitionRange;
|
||||
use store_api::storage::TimeSeriesDistribution;
|
||||
|
||||
use crate::cache::CacheStrategy;
|
||||
use crate::error::Result;
|
||||
@@ -99,8 +98,8 @@ impl RangeMeta {
|
||||
Self::push_seq_file_ranges(input.memtables.len(), &input.files, &mut ranges);
|
||||
|
||||
let ranges = group_ranges_for_seq_scan(ranges);
|
||||
if compaction || input.distribution == Some(TimeSeriesDistribution::PerSeries) {
|
||||
// We don't split ranges in compaction or TimeSeriesDistribution::PerSeries.
|
||||
if compaction {
|
||||
// We don't split ranges in compaction.
|
||||
return ranges;
|
||||
}
|
||||
maybe_split_ranges_for_seq_scan(ranges)
|
||||
|
||||
@@ -31,7 +31,7 @@ use datafusion_expr::Expr;
|
||||
use smallvec::SmallVec;
|
||||
use store_api::metadata::RegionMetadata;
|
||||
use store_api::region_engine::{PartitionRange, RegionScannerRef};
|
||||
use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
|
||||
use store_api::storage::{ScanRequest, TimeSeriesRowSelector};
|
||||
use table::predicate::{build_time_range_predicate, Predicate};
|
||||
use tokio::sync::{mpsc, Semaphore};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
@@ -287,16 +287,9 @@ impl ScanRegion {
|
||||
|
||||
/// Returns true if the region can use unordered scan for current request.
|
||||
fn use_unordered_scan(&self) -> bool {
|
||||
// We use unordered scan when:
|
||||
// 1. The region is in append mode.
|
||||
// 2. There is no series row selector.
|
||||
// 3. The required distribution is None or TimeSeriesDistribution::TimeWindowed.
|
||||
//
|
||||
// If table is append only and there is no series row selector, we use unordered scan in query.
|
||||
// We still use seq scan in compaction.
|
||||
self.version.options.append_mode
|
||||
&& self.request.series_row_selector.is_none()
|
||||
&& (self.request.distribution.is_none()
|
||||
|| self.request.distribution == Some(TimeSeriesDistribution::TimeWindowed))
|
||||
self.version.options.append_mode && self.request.series_row_selector.is_none()
|
||||
}
|
||||
|
||||
/// Creates a scan input.
|
||||
@@ -384,8 +377,7 @@ impl ScanRegion {
|
||||
.with_append_mode(self.version.options.append_mode)
|
||||
.with_filter_deleted(filter_deleted)
|
||||
.with_merge_mode(self.version.options.merge_mode())
|
||||
.with_series_row_selector(self.request.series_row_selector)
|
||||
.with_distribution(self.request.distribution);
|
||||
.with_series_row_selector(self.request.series_row_selector);
|
||||
Ok(input)
|
||||
}
|
||||
|
||||
@@ -565,8 +557,6 @@ pub(crate) struct ScanInput {
|
||||
pub(crate) merge_mode: MergeMode,
|
||||
/// Hint to select rows from time series.
|
||||
pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
|
||||
/// Hint for the required distribution of the scanner.
|
||||
pub(crate) distribution: Option<TimeSeriesDistribution>,
|
||||
}
|
||||
|
||||
impl ScanInput {
|
||||
@@ -591,7 +581,6 @@ impl ScanInput {
|
||||
filter_deleted: true,
|
||||
merge_mode: MergeMode::default(),
|
||||
series_row_selector: None,
|
||||
distribution: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -704,16 +693,6 @@ impl ScanInput {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the distribution hint.
|
||||
#[must_use]
|
||||
pub(crate) fn with_distribution(
|
||||
mut self,
|
||||
distribution: Option<TimeSeriesDistribution>,
|
||||
) -> Self {
|
||||
self.distribution = distribution;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the time series row selector.
|
||||
#[must_use]
|
||||
pub(crate) fn with_series_row_selector(
|
||||
|
||||
@@ -29,7 +29,7 @@ use datatypes::schema::SchemaRef;
|
||||
use snafu::ResultExt;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::region_engine::{PartitionRange, PrepareRequest, RegionScanner, ScannerProperties};
|
||||
use store_api::storage::{TimeSeriesDistribution, TimeSeriesRowSelector};
|
||||
use store_api::storage::TimeSeriesRowSelector;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use crate::error::{PartitionOutOfRangeSnafu, Result};
|
||||
@@ -206,16 +206,32 @@ impl SeqScan {
|
||||
));
|
||||
}
|
||||
|
||||
if self.stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
|
||||
return self.scan_partition_by_series(partition);
|
||||
}
|
||||
|
||||
let stream_ctx = self.stream_ctx.clone();
|
||||
let semaphore = self.new_semaphore();
|
||||
let semaphore = if self.properties.target_partitions() > self.properties.num_partitions() {
|
||||
// We can use additional tasks to read the data if we have more target partitions than actual partitions.
|
||||
// This semaphore is partition level.
|
||||
// We don't use a global semaphore to avoid a partition waiting for others. The final concurrency
|
||||
// of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of
|
||||
// files in a part range.
|
||||
Some(Arc::new(Semaphore::new(
|
||||
self.properties.target_partitions() - self.properties.num_partitions() + 1,
|
||||
)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let partition_ranges = self.properties.partitions[partition].clone();
|
||||
let compaction = self.compaction;
|
||||
let distinguish_range = self.properties.distinguish_partition_range;
|
||||
let part_metrics = self.new_partition_metrics(partition);
|
||||
let part_metrics = PartitionMetrics::new(
|
||||
self.stream_ctx.input.mapper.metadata().region_id,
|
||||
partition,
|
||||
get_scanner_type(self.compaction),
|
||||
stream_ctx.query_start,
|
||||
ScannerMetrics {
|
||||
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
|
||||
let stream = try_stream! {
|
||||
part_metrics.on_first_poll();
|
||||
@@ -305,124 +321,6 @@ impl SeqScan {
|
||||
|
||||
Ok(stream)
|
||||
}
|
||||
|
||||
/// Scans all ranges in the given partition and merges rows by time series.
/// Otherwise the returned stream might not contain any data.
fn scan_partition_by_series(
|
||||
&self,
|
||||
partition: usize,
|
||||
) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
let stream_ctx = self.stream_ctx.clone();
|
||||
let semaphore = self.new_semaphore();
|
||||
let partition_ranges = self.properties.partitions[partition].clone();
|
||||
let distinguish_range = self.properties.distinguish_partition_range;
|
||||
let part_metrics = self.new_partition_metrics(partition);
|
||||
debug_assert!(!self.compaction);
|
||||
|
||||
let stream = try_stream! {
|
||||
part_metrics.on_first_poll();
|
||||
|
||||
let range_builder_list = Arc::new(RangeBuilderList::new(
|
||||
stream_ctx.input.num_memtables(),
|
||||
stream_ctx.input.num_files(),
|
||||
));
|
||||
// Scans all parts.
|
||||
let mut sources = Vec::with_capacity(partition_ranges.len());
|
||||
for part_range in partition_ranges {
|
||||
build_sources(
|
||||
&stream_ctx,
|
||||
&part_range,
|
||||
false,
|
||||
&part_metrics,
|
||||
range_builder_list.clone(),
|
||||
&mut sources,
|
||||
);
|
||||
}
|
||||
|
||||
// Builds a reader that merge sources from all parts.
|
||||
let mut reader =
|
||||
Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone())
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let cache = &stream_ctx.input.cache_strategy;
|
||||
let mut metrics = ScannerMetrics::default();
|
||||
let mut fetch_start = Instant::now();
|
||||
|
||||
while let Some(batch) = reader
|
||||
.next_batch()
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?
|
||||
{
|
||||
metrics.scan_cost += fetch_start.elapsed();
|
||||
metrics.num_batches += 1;
|
||||
metrics.num_rows += batch.num_rows();
|
||||
|
||||
debug_assert!(!batch.is_empty());
|
||||
if batch.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let convert_start = Instant::now();
|
||||
let record_batch = stream_ctx.input.mapper.convert(&batch, cache)?;
|
||||
metrics.convert_cost += convert_start.elapsed();
|
||||
let yield_start = Instant::now();
|
||||
yield record_batch;
|
||||
metrics.yield_cost += yield_start.elapsed();
|
||||
|
||||
fetch_start = Instant::now();
|
||||
}
|
||||
|
||||
// Yields an empty part to indicate this range is terminated.
|
||||
// The query engine can use this to optimize some queries.
|
||||
if distinguish_range {
|
||||
let yield_start = Instant::now();
|
||||
yield stream_ctx.input.mapper.empty_record_batch();
|
||||
metrics.yield_cost += yield_start.elapsed();
|
||||
}
|
||||
|
||||
metrics.scan_cost += fetch_start.elapsed();
|
||||
part_metrics.merge_metrics(&metrics);
|
||||
|
||||
part_metrics.on_finish();
|
||||
};
|
||||
|
||||
let stream = Box::pin(RecordBatchStreamWrapper::new(
|
||||
self.stream_ctx.input.mapper.output_schema(),
|
||||
Box::pin(stream),
|
||||
));
|
||||
|
||||
Ok(stream)
|
||||
}
|
||||
|
||||
fn new_semaphore(&self) -> Option<Arc<Semaphore>> {
|
||||
if self.properties.target_partitions() > self.properties.num_partitions() {
|
||||
// We can use additional tasks to read the data if we have more target partitions than actual partitions.
|
||||
// This semaphore is partition level.
|
||||
// We don't use a global semaphore to avoid a partition waiting for others. The final concurrency
|
||||
// of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of
|
||||
// files in a part range.
|
||||
Some(Arc::new(Semaphore::new(
|
||||
self.properties.target_partitions() - self.properties.num_partitions() + 1,
|
||||
)))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn new_partition_metrics(&self, partition: usize) -> PartitionMetrics {
|
||||
PartitionMetrics::new(
|
||||
self.stream_ctx.input.mapper.metadata().region_id,
|
||||
partition,
|
||||
get_scanner_type(self.compaction),
|
||||
self.stream_ctx.query_start,
|
||||
ScannerMetrics {
|
||||
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
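The `new_semaphore` helper shown above hands out a semaphore only when the target partition count exceeds the actual partition count; the permit count is the surplus plus one, so each partition can always make progress on its own. A small sketch of that sizing rule, assuming the tokio `Semaphore` already used by the scanner:

use std::sync::Arc;
use tokio::sync::Semaphore;

/// Mirrors the sizing in `new_semaphore`: no semaphore when the partitions
/// already saturate the target, otherwise the surplus permits plus one.
fn scan_semaphore(target_partitions: usize, num_partitions: usize) -> Option<Arc<Semaphore>> {
    (target_partitions > num_partitions)
        .then(|| Arc::new(Semaphore::new(target_partitions - num_partitions + 1)))
}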
impl RegionScanner for SeqScan {
|
||||
@@ -472,7 +370,7 @@ impl fmt::Debug for SeqScan {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds sources for the partition range and push them to the `sources` vector.
|
||||
/// Builds sources for the partition range.
|
||||
fn build_sources(
|
||||
stream_ctx: &Arc<StreamContext>,
|
||||
part_range: &PartitionRange,
|
||||
@@ -484,8 +382,8 @@ fn build_sources(
|
||||
// Gets range meta.
|
||||
let range_meta = &stream_ctx.ranges[part_range.identifier];
|
||||
#[cfg(debug_assertions)]
|
||||
if compaction || stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
|
||||
// Compaction or per series distribution expects input sources are not been split.
|
||||
if compaction {
|
||||
// Compaction expects input sources are not been split.
|
||||
debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len());
|
||||
for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() {
|
||||
// It should scan all row groups.
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
//! Mito region.
|
||||
|
||||
pub(crate) mod opener;
|
||||
pub mod opener;
|
||||
pub mod options;
|
||||
pub(crate) mod version;
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
//! Region opener.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicI64;
|
||||
use std::sync::atomic::{AtomicI64, AtomicU64};
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_telemetry::{debug, error, info, warn};
|
||||
@@ -27,7 +27,9 @@ use object_store::util::{join_dir, normalize_dir};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::logstore::provider::Provider;
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
|
||||
use store_api::metadata::{
|
||||
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
|
||||
};
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{ColumnId, RegionId};
|
||||
|
||||
@@ -38,6 +40,7 @@ use crate::error::{
|
||||
EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
|
||||
Result, StaleLogEntrySnafu,
|
||||
};
|
||||
use crate::manifest::action::RegionManifest;
|
||||
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
|
||||
use crate::manifest::storage::manifest_compress_type;
|
||||
use crate::memtable::time_partition::TimePartitions;
|
||||
@@ -203,11 +206,16 @@ impl RegionOpener {
|
||||
}
|
||||
// Safety: must be set before calling this method.
|
||||
let options = self.options.take().unwrap();
|
||||
let object_store = self.object_store(&options.storage)?.clone();
|
||||
let object_store = get_object_store(&options.storage, &self.object_store_manager)?.clone();
|
||||
let provider = self.provider(&options.wal_options);
|
||||
let metadata = Arc::new(metadata);
|
||||
// Create a manifest manager for this region and writes regions to the manifest file.
|
||||
let region_manifest_options = self.manifest_options(config, &options)?;
|
||||
let region_manifest_options = Self::manifest_options(
|
||||
config,
|
||||
&options,
|
||||
&self.region_dir,
|
||||
&self.object_store_manager,
|
||||
)?;
|
||||
let manifest_manager = RegionManifestManager::new(
|
||||
metadata.clone(),
|
||||
region_manifest_options,
|
||||
@@ -312,7 +320,12 @@ impl RegionOpener {
|
||||
) -> Result<Option<MitoRegion>> {
|
||||
let region_options = self.options.as_ref().unwrap().clone();
|
||||
|
||||
let region_manifest_options = self.manifest_options(config, ®ion_options)?;
|
||||
let region_manifest_options = Self::manifest_options(
|
||||
config,
|
||||
®ion_options,
|
||||
&self.region_dir,
|
||||
&self.object_store_manager,
|
||||
)?;
|
||||
let Some(manifest_manager) = RegionManifestManager::open(
|
||||
region_manifest_options,
|
||||
self.stats.total_manifest_size.clone(),
|
||||
@@ -332,7 +345,7 @@ impl RegionOpener {
|
||||
.take()
|
||||
.unwrap_or_else(|| wal.wal_entry_reader(&provider, region_id, None));
|
||||
let on_region_opened = wal.on_region_opened();
|
||||
let object_store = self.object_store(®ion_options.storage)?.clone();
|
||||
let object_store = get_object_store(®ion_options.storage, &self.object_store_manager)?;
|
||||
|
||||
debug!("Open region {} with options: {:?}", region_id, self.options);
|
||||
|
||||
@@ -422,13 +435,14 @@ impl RegionOpener {
|
||||
|
||||
/// Returns a new manifest options.
|
||||
fn manifest_options(
|
||||
&self,
|
||||
config: &MitoConfig,
|
||||
options: &RegionOptions,
|
||||
region_dir: &str,
|
||||
object_store_manager: &ObjectStoreManagerRef,
|
||||
) -> Result<RegionManifestOptions> {
|
||||
let object_store = self.object_store(&options.storage)?.clone();
|
||||
let object_store = get_object_store(&options.storage, object_store_manager)?;
|
||||
Ok(RegionManifestOptions {
|
||||
manifest_dir: new_manifest_dir(&self.region_dir),
|
||||
manifest_dir: new_manifest_dir(region_dir),
|
||||
object_store,
|
||||
// We don't allow users to set the compression algorithm as we use it as a file suffix.
|
||||
// Currently, the manifest storage doesn't have good support for changing compression algorithms.
|
||||
@@ -436,20 +450,72 @@ impl RegionOpener {
|
||||
checkpoint_distance: config.manifest_checkpoint_distance,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an object store corresponding to `name`. If `name` is `None`, this method returns the default object store.
|
||||
fn object_store(&self, name: &Option<String>) -> Result<&object_store::ObjectStore> {
|
||||
if let Some(name) = name {
|
||||
Ok(self
|
||||
.object_store_manager
|
||||
.find(name)
|
||||
.context(ObjectStoreNotFoundSnafu {
|
||||
object_store: name.to_string(),
|
||||
})?)
|
||||
} else {
|
||||
Ok(self.object_store_manager.default_object_store())
|
||||
/// Returns an object store corresponding to `name`. If `name` is `None`, this method returns the default object store.
|
||||
pub fn get_object_store(
|
||||
name: &Option<String>,
|
||||
object_store_manager: &ObjectStoreManagerRef,
|
||||
) -> Result<object_store::ObjectStore> {
|
||||
if let Some(name) = name {
|
||||
Ok(object_store_manager
|
||||
.find(name)
|
||||
.context(ObjectStoreNotFoundSnafu {
|
||||
object_store: name.to_string(),
|
||||
})?
|
||||
.clone())
|
||||
} else {
|
||||
Ok(object_store_manager.default_object_store().clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// A loader for loading metadata from a region dir.
|
||||
pub struct RegionMetadataLoader {
|
||||
config: Arc<MitoConfig>,
|
||||
object_store_manager: ObjectStoreManagerRef,
|
||||
}
|
||||
|
||||
impl RegionMetadataLoader {
|
||||
/// Creates a new `RegionOpenerBuilder`.
|
||||
pub fn new(config: Arc<MitoConfig>, object_store_manager: ObjectStoreManagerRef) -> Self {
|
||||
Self {
|
||||
config,
|
||||
object_store_manager,
|
||||
}
|
||||
}
|
||||
|
||||
/// Loads the metadata of the region from the region dir.
|
||||
pub async fn load(
|
||||
&self,
|
||||
region_dir: &str,
|
||||
region_options: &RegionOptions,
|
||||
) -> Result<Option<RegionMetadataRef>> {
|
||||
let manifest = self.load_manifest(region_dir, region_options).await?;
|
||||
Ok(manifest.map(|m| m.metadata.clone()))
|
||||
}
|
||||
|
||||
/// Loads the manifest of the region from the region dir.
|
||||
pub async fn load_manifest(
|
||||
&self,
|
||||
region_dir: &str,
|
||||
region_options: &RegionOptions,
|
||||
) -> Result<Option<Arc<RegionManifest>>> {
|
||||
let region_manifest_options = RegionOpener::manifest_options(
|
||||
&self.config,
|
||||
region_options,
|
||||
region_dir,
|
||||
&self.object_store_manager,
|
||||
)?;
|
||||
let Some(manifest_manager) =
|
||||
RegionManifestManager::open(region_manifest_options, Arc::new(AtomicU64::new(0)))
|
||||
.await?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let manifest = manifest_manager.manifest();
|
||||
Ok(Some(manifest))
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks whether the recovered region has the same schema as region to create.
|
||||
|
||||
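The `get_object_store` helper and `RegionMetadataLoader` above reduce store selection to a named lookup with a default fallback, now free of any borrow on the opener. A generic illustration of that pattern with hypothetical placeholder types (the real helper returns an `ObjectStoreNotFound` error rather than `None` for unknown names):

use std::collections::HashMap;

/// Hypothetical stand-in for the object store manager: named stores plus a default.
struct StoreManager<T> {
    default: T,
    named: HashMap<String, T>,
}

impl<T: Clone> StoreManager<T> {
    /// Returns the store for `name`, or the default store when `name` is None.
    fn get(&self, name: &Option<String>) -> Option<T> {
        match name {
            Some(name) => self.named.get(name).cloned(),
            None => Some(self.default.clone()),
        }
    }
}

fn main() {
    let mgr = StoreManager {
        default: "default-store".to_string(),
        named: HashMap::from([("s3".to_string(), "s3-store".to_string())]),
    };
    assert_eq!(mgr.get(&None), Some("default-store".to_string()));
    assert_eq!(mgr.get(&Some("s3".to_string())), Some("s3-store".to_string()));
}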
@@ -33,6 +33,8 @@ use crate::row_converter::dense::SortField;
|
||||
use crate::row_converter::{CompositeValues, PrimaryKeyCodec, PrimaryKeyFilter};
|
||||
|
||||
/// A codec for sparse key of metrics.
|
||||
/// It requires the input primary key columns are sorted by the column name in lexicographical order.
|
||||
/// It encodes the column id of the physical region.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SparsePrimaryKeyCodec {
|
||||
inner: Arc<SparsePrimaryKeyCodecInner>,
|
||||
|
||||
@@ -16,9 +16,9 @@ pub(crate) mod bloom_filter;
|
||||
mod codec;
|
||||
pub(crate) mod fulltext_index;
|
||||
mod indexer;
|
||||
pub(crate) mod intermediate;
|
||||
pub mod intermediate;
|
||||
pub(crate) mod inverted_index;
|
||||
pub(crate) mod puffin_manager;
|
||||
pub mod puffin_manager;
|
||||
mod statistics;
|
||||
pub(crate) mod store;
|
||||
|
||||
|
||||
@@ -49,6 +49,11 @@ impl IntermediateManager {
|
||||
/// Create a new `IntermediateManager` with the given root path.
|
||||
/// It will clean up all garbage intermediate files from previous runs.
|
||||
pub async fn init_fs(aux_path: impl AsRef<str>) -> Result<Self> {
|
||||
common_telemetry::info!(
|
||||
"Initializing intermediate manager, aux_path: {}",
|
||||
aux_path.as_ref()
|
||||
);
|
||||
|
||||
let store = new_fs_cache_store(&normalize_dir(aux_path.as_ref())).await?;
|
||||
let store = InstrumentedStore::new(store);
|
||||
|
||||
|
||||
@@ -228,8 +228,8 @@ impl Drop for InvertedIndexApplier {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_base::BitVec;
|
||||
use futures::io::Cursor;
|
||||
use index::bitmap::Bitmap;
|
||||
use index::inverted_index::search::index_apply::MockIndexApplier;
|
||||
use object_store::services::Memory;
|
||||
use puffin::puffin_manager::PuffinWriter;
|
||||
@@ -259,7 +259,7 @@ mod tests {
|
||||
mock_index_applier.expect_memory_usage().returning(|| 100);
|
||||
mock_index_applier.expect_apply().returning(|_, _| {
|
||||
Ok(ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
matched_segment_ids: BitVec::EMPTY,
|
||||
total_row_count: 100,
|
||||
segment_row_count: 10,
|
||||
})
|
||||
@@ -276,7 +276,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
output,
|
||||
ApplyOutput {
|
||||
matched_segment_ids: Bitmap::new_bitvec(),
|
||||
matched_segment_ids: BitVec::EMPTY,
|
||||
total_row_count: 100,
|
||||
segment_row_count: 10,
|
||||
}
|
||||
|
||||
@@ -277,9 +277,7 @@ impl InvertedIndexer {
|
||||
let mut index_writer = InvertedIndexBlobWriter::new(tx.compat_write());
|
||||
|
||||
let (index_finish, puffin_add_blob) = futures::join!(
|
||||
// TODO(zhongzc): config bitmap type
|
||||
self.index_creator
|
||||
.finish(&mut index_writer, index::bitmap::BitmapType::Roaring),
|
||||
self.index_creator.finish(&mut index_writer),
|
||||
puffin_writer.put_blob(INDEX_BLOB_TYPE, rx.compat(), PutOptions::default())
|
||||
);
|
||||
|
||||
|
||||
@@ -61,6 +61,7 @@ impl Default for WriteOptions {
|
||||
}
|
||||
|
||||
/// Parquet SST info returned by the writer.
|
||||
#[derive(Debug)]
|
||||
pub struct SstInfo {
|
||||
/// SST file id.
|
||||
pub file_id: FileId,
|
||||
|
||||
@@ -583,8 +583,6 @@ type RequestBuffer = Vec<WorkerRequest>;
#[derive(Default)]
pub(crate) struct StalledRequests {
/// Stalled requests.
/// Remember to use `StalledRequests::stalled_count()` to get the total number of stalled requests
/// instead of `StalledRequests::requests.len()`.
///
/// Key: RegionId
/// Value: (estimated size, stalled requests)
@@ -619,11 +617,6 @@ impl StalledRequests {
vec![]
}
}

/// Returns the total number of all stalled requests.
pub(crate) fn stalled_count(&self) -> usize {
self.requests.values().map(|reqs| reqs.1.len()).sum()
}
}

/// Background worker loop to handle requests.

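The counter changes above matter because `StalledRequests::requests` is keyed by region while each value carries a vector of stalled requests, so `requests.len()` counts regions rather than requests. A toy illustration with placeholder types:

use std::collections::HashMap;

type RegionId = u64;
// Placeholder for the real WorkerRequest type.
type Request = String;

struct StalledRequests {
    /// Key: region id; value: (estimated size, stalled requests).
    requests: HashMap<RegionId, (usize, Vec<Request>)>,
}

impl StalledRequests {
    /// Total number of stalled requests across all regions,
    /// which is what the stalled gauge should be adjusted by.
    fn stalled_count(&self) -> usize {
        self.requests.values().map(|(_, reqs)| reqs.len()).sum()
    }
}

fn main() {
    let mut s = StalledRequests { requests: HashMap::new() };
    s.requests.insert(1, (0, vec!["a".into(), "b".into()]));
    s.requests.insert(2, (0, vec!["c".into()]));
    assert_eq!(s.requests.len(), 2);  // regions
    assert_eq!(s.stalled_count(), 3); // requests
}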
@@ -329,15 +329,6 @@ async fn edit_region(
|
||||
|
||||
let index_key = IndexKey::new(region_id, file_meta.file_id, FileType::Parquet);
|
||||
let remote_path = location::sst_file_path(layer.region_dir(), file_meta.file_id);
|
||||
|
||||
let is_index_exist = file_meta.exists_index();
|
||||
let index_file_size = file_meta.index_file_size();
|
||||
|
||||
let index_file_index_key =
|
||||
IndexKey::new(region_id, file_meta.file_id, FileType::Puffin);
|
||||
let index_remote_path =
|
||||
location::index_file_path(layer.region_dir(), file_meta.file_id);
|
||||
|
||||
let file_size = file_meta.file_size;
|
||||
common_runtime::spawn_global(async move {
|
||||
if write_cache
|
||||
@@ -354,22 +345,6 @@ async fn edit_region(
|
||||
|
||||
listener.on_file_cache_filled(index_key.file_id);
|
||||
}
|
||||
if is_index_exist {
|
||||
// also download puffin file
|
||||
if let Err(err) = write_cache
|
||||
.download(
|
||||
index_file_index_key,
|
||||
&index_remote_path,
|
||||
layer.object_store(),
|
||||
index_file_size,
|
||||
)
|
||||
.await
|
||||
{
|
||||
common_telemetry::error!(
|
||||
err; "Failed to download puffin file, region_id: {}, index_file_index_key: {:?}, index_remote_path: {}", region_id, index_file_index_key, index_remote_path
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,7 +147,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
pub(crate) async fn handle_stalled_requests(&mut self) {
|
||||
// Handle stalled requests.
|
||||
let stalled = std::mem::take(&mut self.stalled_requests);
|
||||
self.stalled_count.sub(stalled.stalled_count() as i64);
|
||||
self.stalled_count.sub(stalled.requests.len() as i64);
|
||||
// We already stalled these requests, don't stall them again.
|
||||
for (_, (_, mut requests)) in stalled.requests {
|
||||
self.handle_write_requests(&mut requests, false).await;
|
||||
@@ -157,7 +157,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
/// Rejects all stalled requests.
|
||||
pub(crate) fn reject_stalled_requests(&mut self) {
|
||||
let stalled = std::mem::take(&mut self.stalled_requests);
|
||||
self.stalled_count.sub(stalled.stalled_count() as i64);
|
||||
self.stalled_count.sub(stalled.requests.len() as i64);
|
||||
for (_, (_, mut requests)) in stalled.requests {
|
||||
reject_write_requests(&mut requests);
|
||||
}
|
||||
|
||||
@@ -74,7 +74,6 @@ pub struct Inserter {
|
||||
catalog_manager: CatalogManagerRef,
|
||||
partition_manager: PartitionRuleManagerRef,
|
||||
node_manager: NodeManagerRef,
|
||||
#[allow(unused)]
|
||||
table_flownode_set_cache: TableFlownodeSetCacheRef,
|
||||
}
|
||||
|
||||
@@ -363,8 +362,6 @@ impl Inserter {
|
||||
instant_requests,
|
||||
} = requests;
|
||||
|
||||
// TODO(discord9): mirror some
|
||||
|
||||
// Mirror requests for source table to flownode asynchronously
|
||||
let flow_mirror_task = FlowMirrorTask::new(
|
||||
&self.table_flownode_set_cache,
|
||||
@@ -898,14 +895,12 @@ struct CreateAlterTableResult {
|
||||
table_infos: HashMap<TableId, Arc<TableInfo>>,
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
struct FlowMirrorTask {
|
||||
requests: HashMap<Peer, RegionInsertRequests>,
|
||||
num_rows: usize,
|
||||
}
|
||||
|
||||
impl FlowMirrorTask {
|
||||
#[allow(unused)]
|
||||
async fn new(
|
||||
cache: &TableFlownodeSetCacheRef,
|
||||
requests: impl Iterator<Item = &RegionInsertRequest>,
|
||||
@@ -979,7 +974,6 @@ impl FlowMirrorTask {
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
fn detach(self, node_manager: NodeManagerRef) -> Result<()> {
|
||||
crate::metrics::DIST_MIRROR_PENDING_ROW_COUNT.add(self.num_rows as i64);
|
||||
for (peer, inserts) in self.requests {
|
||||
|
||||
@@ -41,7 +41,7 @@ futures.workspace = true
|
||||
greptime-proto.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonb.workspace = true
|
||||
jsonpath-rust = "0.7.5"
|
||||
jsonpath-rust = "0.7.3"
|
||||
lazy_static.workspace = true
|
||||
moka = { workspace = true, features = ["sync"] }
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -16,13 +16,10 @@ pub mod array;
|
||||
pub mod map;
|
||||
pub mod time;
|
||||
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
pub use array::Array;
|
||||
use jsonb::{Number as JsonbNumber, Object as JsonbObject, Value as JsonbValue};
|
||||
use jsonpath_rust::parser::{parse_json_path, JsonPathIndex};
|
||||
use jsonpath_rust::path::{JsonLike, Path};
|
||||
use jsonpath_rust::{jsp_idx, jsp_obj, JsonPath, JsonPathParserError, JsonPathStr};
|
||||
use jsonpath_rust::{jsp_idx, jsp_obj};
|
||||
pub use map::Map;
|
||||
use regex::Regex;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
@@ -289,52 +286,6 @@ impl Value {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L779
|
||||
pub fn pointer(&self, pointer: &str) -> Option<&Value> {
|
||||
if pointer.is_empty() {
|
||||
return Some(self);
|
||||
}
|
||||
if !pointer.starts_with('/') {
|
||||
return None;
|
||||
}
|
||||
pointer
|
||||
.split('/')
|
||||
.skip(1)
|
||||
.map(|x| x.replace("~1", "/").replace("~0", "~"))
|
||||
.try_fold(self, |target, token| match target {
|
||||
Value::Map(map) => map.get(&token),
|
||||
Value::Array(list) => parse_index(&token).and_then(|x| list.get(x)),
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
|
||||
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L834
|
||||
pub fn pointer_mut(&mut self, pointer: &str) -> Option<&mut Value> {
|
||||
if pointer.is_empty() {
|
||||
return Some(self);
|
||||
}
|
||||
if !pointer.starts_with('/') {
|
||||
return None;
|
||||
}
|
||||
pointer
|
||||
.split('/')
|
||||
.skip(1)
|
||||
.map(|x| x.replace("~1", "/").replace("~0", "~"))
|
||||
.try_fold(self, |target, token| match target {
|
||||
Value::Map(map) => map.get_mut(&token),
|
||||
Value::Array(list) => parse_index(&token).and_then(move |x| list.get_mut(x)),
|
||||
_ => None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L259
|
||||
fn parse_index(s: &str) -> Option<usize> {
|
||||
if s.starts_with('+') || (s.starts_with('0') && s.len() != 1) {
|
||||
return None;
|
||||
}
|
||||
s.parse().ok()
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Value {
|
||||
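The `pointer`/`pointer_mut` helpers and `parse_index` above follow the same RFC 6901 rules as `serde_json::Value::pointer`, which the comments cite: tokens are split on `/`, `~1` unescapes to `/` and `~0` to `~`, and array indices with a leading `+` or zero padding are rejected. The behaviour can be sanity-checked against serde_json directly:

use serde_json::json;

fn main() {
    let v = json!({"a/b": [10, 20, 30], "m": {"x": 1}});
    // "~1" escapes '/', so this addresses the "a/b" key.
    assert_eq!(v.pointer("/a~1b/1"), Some(&json!(20)));
    assert_eq!(v.pointer("/m/x"), Some(&json!(1)));
    // Leading '+' and zero-padded indices are rejected, as in parse_index above.
    assert_eq!(v.pointer("/a~1b/+1"), None);
    assert_eq!(v.pointer("/a~1b/01"), None);
    // An empty pointer returns the whole document.
    assert_eq!(v.pointer(""), Some(&v));
}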
@@ -863,46 +814,4 @@ impl JsonLike for Value {
|
||||
fn null() -> Self {
|
||||
Value::Null
|
||||
}
|
||||
|
||||
// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L423
|
||||
fn reference<T>(
|
||||
&self,
|
||||
path: T,
|
||||
) -> std::result::Result<std::option::Option<&Value>, JsonPathParserError>
|
||||
where
|
||||
T: Into<JsonPathStr>,
|
||||
{
|
||||
Ok(self.pointer(&path_to_json_path(path.into())?))
|
||||
}
|
||||
|
||||
// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L430
|
||||
fn reference_mut<T>(
|
||||
&mut self,
|
||||
path: T,
|
||||
) -> std::result::Result<std::option::Option<&mut Value>, JsonPathParserError>
|
||||
where
|
||||
T: Into<JsonPathStr>,
|
||||
{
|
||||
Ok(self.pointer_mut(&path_to_json_path(path.into())?))
|
||||
}
|
||||
}
|
||||
|
||||
// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L438
|
||||
fn path_to_json_path(path: JsonPathStr) -> StdResult<String, JsonPathParserError> {
|
||||
convert_part(&parse_json_path(path.as_str())?)
|
||||
}
|
||||
|
||||
// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L442
|
||||
fn convert_part(path: &JsonPath) -> StdResult<String, JsonPathParserError> {
|
||||
match path {
|
||||
JsonPath::Chain(elems) => elems
|
||||
.iter()
|
||||
.map(convert_part)
|
||||
.collect::<StdResult<String, JsonPathParserError>>(),
|
||||
|
||||
JsonPath::Index(JsonPathIndex::Single(v)) => Ok(format!("/{}", v)),
|
||||
JsonPath::Field(e) => Ok(format!("/{}", e)),
|
||||
JsonPath::Root => Ok("".to_string()),
|
||||
e => Err(JsonPathParserError::InvalidJsonPath(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ common-macro.workspace = true
|
||||
common-recordbatch.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
datafusion.workspace = true
|
||||
datafusion-common.workspace = true
|
||||
datafusion-expr.workspace = true
|
||||
datatypes.workspace = true
|
||||
futures.workspace = true
|
||||
|
||||
@@ -20,7 +20,6 @@ mod holt_winters;
|
||||
mod idelta;
|
||||
mod predict_linear;
|
||||
mod quantile;
|
||||
mod quantile_aggr;
|
||||
mod resets;
|
||||
mod round;
|
||||
#[cfg(test)]
|
||||
@@ -40,7 +39,6 @@ pub use holt_winters::HoltWinters;
|
||||
pub use idelta::IDelta;
|
||||
pub use predict_linear::PredictLinear;
|
||||
pub use quantile::QuantileOverTime;
|
||||
pub use quantile_aggr::quantile_udaf;
|
||||
pub use resets::Resets;
|
||||
pub use round::Round;
|
||||
|
||||
|
||||
@@ -125,7 +125,7 @@ impl QuantileOverTime {
}

/// Refer to <https://github.com/prometheus/prometheus/blob/6e2905a4d4ff9b47b1f6d201333f5bd53633f921/promql/quantile.go#L357-L386>
pub(crate) fn quantile_impl(values: &[f64], quantile: f64) -> Option<f64> {
fn quantile_impl(values: &[f64], quantile: f64) -> Option<f64> {
if quantile.is_nan() || values.is_empty() {
return Some(f64::NAN);
}

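`quantile_impl` follows the linked Prometheus routine: sort the samples, compute the rank `q * (n - 1)`, and linearly interpolate between the two neighbouring values. A simplified standalone version of that interpolation (the real function returns NaN for empty input or NaN q and handles out-of-range q differently):

/// q-quantile by linear interpolation over sorted samples, the same
/// scheme as the Prometheus code referenced above, with edge cases simplified.
fn quantile(values: &[f64], q: f64) -> Option<f64> {
    if values.is_empty() || !(0.0..=1.0).contains(&q) {
        return None;
    }
    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.total_cmp(b));
    let rank = q * (sorted.len() - 1) as f64;
    let (lower, upper) = (rank.floor() as usize, rank.ceil() as usize);
    let weight = rank - lower as f64;
    Some(sorted[lower] * (1.0 - weight) + sorted[upper] * weight)
}

fn main() {
    // Matches the expectations in the accumulator tests below.
    let v = [1.0, 2.0, 3.0, 4.0, 5.0];
    assert_eq!(quantile(&v, 0.5), Some(3.0));
    assert_eq!(quantile(&v, 0.25), Some(2.0));
    assert_eq!(quantile(&v, 0.75), Some(4.0));
}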
@@ -1,297 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow::array::{ArrayRef, AsArray};
|
||||
use datafusion::common::cast::{as_list_array, as_primitive_array, as_struct_array};
|
||||
use datafusion::error::Result as DfResult;
|
||||
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF, Volatility};
|
||||
use datafusion::prelude::create_udaf;
|
||||
use datafusion_common::ScalarValue;
|
||||
use datatypes::arrow::array::{ListArray, StructArray};
|
||||
use datatypes::arrow::datatypes::{DataType, Field, Float64Type};
|
||||
|
||||
use crate::functions::quantile::quantile_impl;
|
||||
|
||||
const QUANTILE_NAME: &str = "quantile";
|
||||
|
||||
const VALUES_FIELD_NAME: &str = "values";
|
||||
const DEFAULT_LIST_FIELD_NAME: &str = "item";
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct QuantileAccumulator {
|
||||
q: f64,
|
||||
values: Vec<Option<f64>>,
|
||||
}
|
||||
|
||||
/// Create a quantile `AggregateUDF` for PromQL quantile operator,
|
||||
/// which calculates φ-quantile (0 ≤ φ ≤ 1) over dimensions
|
||||
pub fn quantile_udaf(q: f64) -> Arc<AggregateUDF> {
|
||||
Arc::new(create_udaf(
|
||||
QUANTILE_NAME,
|
||||
// Input type: (values)
|
||||
vec![DataType::Float64],
|
||||
// Output type: the φ-quantile
|
||||
Arc::new(DataType::Float64),
|
||||
Volatility::Immutable,
|
||||
// Create the accumulator
|
||||
Arc::new(move |_| Ok(Box::new(QuantileAccumulator::new(q)))),
|
||||
// Intermediate state types
|
||||
Arc::new(vec![DataType::Struct(
|
||||
vec![Field::new(
|
||||
VALUES_FIELD_NAME,
|
||||
DataType::List(Arc::new(Field::new(
|
||||
DEFAULT_LIST_FIELD_NAME,
|
||||
DataType::Float64,
|
||||
true,
|
||||
))),
|
||||
false,
|
||||
)]
|
||||
.into(),
|
||||
)]),
|
||||
))
|
||||
}
|
||||
|
||||
impl QuantileAccumulator {
|
||||
pub fn new(q: f64) -> Self {
|
||||
Self {
|
||||
q,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DfAccumulator for QuantileAccumulator {
|
||||
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
|
||||
let f64_array = values[0].as_primitive::<Float64Type>();
|
||||
|
||||
self.values.extend(f64_array);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn evaluate(&mut self) -> DfResult<ScalarValue> {
|
||||
let values: Vec<_> = self.values.iter().map(|v| v.unwrap_or(0.0)).collect();
|
||||
|
||||
let result = quantile_impl(&values, self.q);
|
||||
|
||||
ScalarValue::new_primitive::<Float64Type>(result, &DataType::Float64)
|
||||
}
|
||||
|
||||
fn size(&self) -> usize {
|
||||
std::mem::size_of::<Self>() + self.values.capacity() * std::mem::size_of::<Option<f64>>()
|
||||
}
|
||||
|
||||
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
|
||||
let values_array = Arc::new(ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
|
||||
Some(self.values.clone()),
|
||||
]));
|
||||
|
||||
let state_struct = StructArray::new(
|
||||
vec![Field::new(
|
||||
VALUES_FIELD_NAME,
|
||||
DataType::List(Arc::new(Field::new(
|
||||
DEFAULT_LIST_FIELD_NAME,
|
||||
DataType::Float64,
|
||||
true,
|
||||
))),
|
||||
false,
|
||||
)]
|
||||
.into(),
|
||||
vec![values_array],
|
||||
None,
|
||||
);
|
||||
|
||||
Ok(vec![ScalarValue::Struct(Arc::new(state_struct))])
|
||||
}
|
||||
|
||||
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
|
||||
if states.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for state in states {
|
||||
let state = as_struct_array(state)?;
|
||||
|
||||
for list in as_list_array(state.column(0))?.iter().flatten() {
|
||||
let f64_array = as_primitive_array::<Float64Type>(&list)?.clone();
|
||||
self.values.extend(&f64_array);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::arrow::array::{ArrayRef, Float64Array};
|
||||
use datafusion_common::ScalarValue;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn create_f64_array(values: Vec<Option<f64>>) -> ArrayRef {
|
||||
Arc::new(Float64Array::from(values)) as ArrayRef
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_empty() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
|
||||
let result = accumulator.evaluate().unwrap();
|
||||
|
||||
match result {
|
||||
ScalarValue::Float64(_) => (),
|
||||
_ => panic!("Expected Float64 scalar value"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_single_value() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
let input = create_f64_array(vec![Some(10.0)]);
|
||||
|
||||
accumulator.update_batch(&[input]).unwrap();
|
||||
let result = accumulator.evaluate().unwrap();
|
||||
|
||||
assert_eq!(result, ScalarValue::Float64(Some(10.0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_multiple_values() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]);
|
||||
|
||||
accumulator.update_batch(&[input]).unwrap();
|
||||
let result = accumulator.evaluate().unwrap();
|
||||
|
||||
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_with_nulls() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
let input = create_f64_array(vec![Some(1.0), None, Some(3.0), Some(4.0), Some(5.0)]);
|
||||
|
||||
accumulator.update_batch(&[input]).unwrap();
|
||||
|
||||
let result = accumulator.evaluate().unwrap();
|
||||
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_multiple_batches() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
let input1 = create_f64_array(vec![Some(1.0), Some(2.0)]);
|
||||
let input2 = create_f64_array(vec![Some(3.0), Some(4.0), Some(5.0)]);
|
||||
|
||||
accumulator.update_batch(&[input1]).unwrap();
|
||||
accumulator.update_batch(&[input2]).unwrap();
|
||||
|
||||
let result = accumulator.evaluate().unwrap();
|
||||
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_different_quantiles() {
|
||||
let mut min_accumulator = QuantileAccumulator::new(0.0);
|
||||
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]);
|
||||
min_accumulator.update_batch(&[input.clone()]).unwrap();
|
||||
assert_eq!(
|
||||
min_accumulator.evaluate().unwrap(),
|
||||
ScalarValue::Float64(Some(1.0))
|
||||
);
|
||||
|
||||
let mut q1_accumulator = QuantileAccumulator::new(0.25);
|
||||
q1_accumulator.update_batch(&[input.clone()]).unwrap();
|
||||
assert_eq!(
|
||||
q1_accumulator.evaluate().unwrap(),
|
||||
ScalarValue::Float64(Some(2.0))
|
||||
);
|
||||
|
||||
let mut q3_accumulator = QuantileAccumulator::new(0.75);
|
||||
q3_accumulator.update_batch(&[input.clone()]).unwrap();
|
||||
assert_eq!(
|
||||
q3_accumulator.evaluate().unwrap(),
|
||||
ScalarValue::Float64(Some(4.0))
|
||||
);
|
||||
|
||||
let mut max_accumulator = QuantileAccumulator::new(1.0);
|
||||
max_accumulator.update_batch(&[input]).unwrap();
|
||||
assert_eq!(
|
||||
max_accumulator.evaluate().unwrap(),
|
||||
ScalarValue::Float64(Some(5.0))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_size() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0)]);
|
||||
|
||||
let initial_size = accumulator.size();
|
||||
accumulator.update_batch(&[input]).unwrap();
|
||||
let after_update_size = accumulator.size();
|
||||
|
||||
assert!(after_update_size >= initial_size);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_state_and_merge() -> DfResult<()> {
|
||||
let mut acc1 = QuantileAccumulator::new(0.5);
|
||||
let input1 = create_f64_array(vec![Some(1.0), Some(2.0)]);
|
||||
acc1.update_batch(&[input1])?;
|
||||
|
||||
let state1 = acc1.state()?;
|
||||
|
||||
let mut acc2 = QuantileAccumulator::new(0.5);
|
||||
let input2 = create_f64_array(vec![Some(3.0), Some(4.0), Some(5.0)]);
|
||||
acc2.update_batch(&[input2])?;
|
||||
|
||||
let mut struct_builders = vec![];
|
||||
for scalar in &state1 {
|
||||
if let ScalarValue::Struct(struct_array) = scalar {
|
||||
struct_builders.push(struct_array.clone() as ArrayRef);
|
||||
}
|
||||
}
|
||||
|
||||
acc2.merge_batch(&struct_builders)?;
|
||||
|
||||
let result = acc2.evaluate()?;
|
||||
|
||||
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_accumulator_with_extreme_values() {
|
||||
let mut accumulator = QuantileAccumulator::new(0.5);
|
||||
let input = create_f64_array(vec![Some(f64::MAX), Some(f64::MIN), Some(0.0)]);
|
||||
|
||||
accumulator.update_batch(&[input]).unwrap();
|
||||
let _result = accumulator.evaluate().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_quantile_udaf_creation() {
|
||||
let q = 0.5;
|
||||
let udaf = quantile_udaf(q);
|
||||
|
||||
assert_eq!(udaf.name(), QUANTILE_NAME);
|
||||
assert_eq!(udaf.return_type(&[]).unwrap(), DataType::Float64);
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,6 @@
// limitations under the License.

use datafusion::dataframe::DataFrame as DfDataFrame;
use datafusion_expr::LogicalPlan;

/// DataFrame represents a logical set of rows with the same named columns.
/// Similar to a Pandas DataFrame or Spark DataFrame
@@ -21,11 +20,3 @@ use datafusion_expr::LogicalPlan;
pub enum DataFrame {
DataFusion(DfDataFrame),
}

impl DataFrame {
pub fn into_logical_plan(self) -> LogicalPlan {
match self {
Self::DataFusion(dataframe) => dataframe.into_parts().1,
}
}
}

@@ -31,7 +31,7 @@ use datatypes::arrow::datatypes::SchemaRef;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::RegionEngineRef;
use store_api::storage::{RegionId, ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::{RegionId, ScanRequest, TimeSeriesRowSelector};
use table::table::scan::RegionScanExec;

use crate::error::{GetRegionMetadataSnafu, Result};
@@ -175,10 +175,10 @@ impl TableProvider for DummyTableProvider {

let scanner = self
.engine
.handle_query(self.region_id, request.clone())
.handle_query(self.region_id, request)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
Ok(Arc::new(RegionScanExec::new(scanner, request)?))
Ok(Arc::new(RegionScanExec::new(scanner)))
}

fn supports_filters_pushdown(
@@ -233,11 +233,6 @@ impl DummyTableProvider {
self.scan_request.lock().unwrap().output_ordering = Some(order_opts.to_vec());
}

/// Sets the distribution hint of the query to the provider.
pub fn with_distribution(&self, distribution: TimeSeriesDistribution) {
self.scan_request.lock().unwrap().distribution = Some(distribution);
}

/// Sets the time series selector hint of the query to the provider.
pub fn with_time_series_selector_hint(&self, selector: TimeSeriesRowSelector) {
self.scan_request.lock().unwrap().series_row_selector = Some(selector);

@@ -23,7 +23,6 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{DataFusionError, Result};
use store_api::region_engine::PartitionRange;
use store_api::storage::TimeSeriesDistribution;
use table::table::scan::RegionScanExec;

#[derive(Debug)]
@@ -66,14 +65,6 @@ impl ParallelizeScan {
return Ok(Transformed::no(plan));
}

// don't parallelize if we want per series distribution
if matches!(
region_scan_exec.distribution(),
Some(TimeSeriesDistribution::PerSeries)
) {
return Ok(Transformed::no(plan));
}

let ranges = region_scan_exec.get_partition_ranges();
let total_range_num = ranges.len();
let expected_partition_num = config.execution.target_partitions;

@@ -23,7 +23,7 @@ use datafusion_common::{Column, Result};
use datafusion_expr::expr::Sort;
use datafusion_expr::{utils, Expr, LogicalPlan};
use datafusion_optimizer::{OptimizerConfig, OptimizerRule};
use store_api::storage::{TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::TimeSeriesRowSelector;

use crate::dummy_catalog::DummyTableProvider;

@@ -121,36 +121,6 @@ impl ScanHintRule {
});
}
adapter.with_ordering_hint(&opts);

let mut sort_expr_cursor = order_expr.iter().filter_map(|s| s.expr.try_as_col());
let region_metadata = adapter.region_metadata();
// ignore table without pk
if region_metadata.primary_key.is_empty() {
return;
}
let mut pk_column_iter = region_metadata.primary_key_columns();
let mut curr_sort_expr = sort_expr_cursor.next();
let mut curr_pk_col = pk_column_iter.next();

while let (Some(sort_expr), Some(pk_col)) = (curr_sort_expr, curr_pk_col) {
if sort_expr.name == pk_col.column_schema.name {
curr_sort_expr = sort_expr_cursor.next();
curr_pk_col = pk_column_iter.next();
} else {
return;
}
}

let next_remaining = sort_expr_cursor.next();
match (curr_sort_expr, next_remaining) {
(Some(expr), None)
if expr.name == region_metadata.time_index_column().column_schema.name =>
{
adapter.with_distribution(TimeSeriesDistribution::PerSeries);
}
(None, _) => adapter.with_distribution(TimeSeriesDistribution::PerSeries),
(Some(_), _) => {}
}
}

fn set_time_series_row_selector_hint(

@@ -188,7 +188,7 @@ impl QueryLanguageParser {
Ok(QueryStatement::Promql(eval_stmt))
}

pub fn parse_promql_timestamp(timestamp: &str) -> Result<SystemTime> {
fn parse_promql_timestamp(timestamp: &str) -> Result<SystemTime> {
// try rfc3339 format
let rfc3339_result = DateTime::parse_from_rfc3339(timestamp)
.context(ParseTimestampSnafu { raw: timestamp })

@@ -12,6 +12,5 @@
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod error;
pub mod label_values;
pub(crate) mod error;
pub mod planner;

@@ -1,107 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::time::{SystemTime, UNIX_EPOCH};

use datafusion_common::{Column, ScalarValue};
use datafusion_expr::expr::Alias;
use datafusion_expr::utils::conjunction;
use datafusion_expr::{col, Cast, Expr, LogicalPlan, LogicalPlanBuilder};
use datafusion_sql::TableReference;
use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
use datatypes::prelude::ConcreteDataType;
use snafu::{OptionExt, ResultExt};
use table::TableRef;

use crate::promql::error::{DataFusionPlanningSnafu, Result, TimeIndexNotFoundSnafu};

fn build_time_filter(time_index_expr: Expr, start: i64, end: i64) -> Expr {
time_index_expr
.clone()
.gt_eq(Expr::Literal(ScalarValue::TimestampMillisecond(
Some(start),
None,
)))
.and(
time_index_expr.lt_eq(Expr::Literal(ScalarValue::TimestampMillisecond(
Some(end),
None,
))),
)
}

/// Rewrite label values query to DataFusion logical plan.
pub fn rewrite_label_values_query(
table: TableRef,
mut scan_plan: LogicalPlan,
mut conditions: Vec<Expr>,
label_name: String,
start: SystemTime,
end: SystemTime,
) -> Result<LogicalPlan> {
let table_ref = TableReference::partial(
table.table_info().schema_name.as_str(),
table.table_info().name.as_str(),
);
let schema = table.schema();
let ts_column = schema
.timestamp_column()
.with_context(|| TimeIndexNotFoundSnafu {
table: table.table_info().full_table_name(),
})?;

let is_time_index_ms =
ts_column.data_type == ConcreteDataType::timestamp_millisecond_datatype();
let time_index_expr = col(Column::from_name(ts_column.name.clone()));

if !is_time_index_ms {
// cast to ms if time_index not in Millisecond precision
let expr = vec![
col(Column::from_name(label_name.clone())),
Expr::Alias(Alias {
expr: Box::new(Expr::Cast(Cast {
expr: Box::new(time_index_expr.clone()),
data_type: ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None),
})),
relation: Some(table_ref),
name: ts_column.name.clone(),
}),
];
scan_plan = LogicalPlanBuilder::from(scan_plan)
.project(expr)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
};

let start = start.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;
let end = end.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;

conditions.push(build_time_filter(time_index_expr, start, end));
// Safety: `conditions` is not empty.
let filter = conjunction(conditions).unwrap();

// Builds time filter
let logical_plan = LogicalPlanBuilder::from(scan_plan)
.filter(filter)
.context(DataFusionPlanningSnafu)?
.project(vec![col(Column::from_name(label_name))])
.context(DataFusionPlanningSnafu)?
.distinct()
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;

Ok(logical_plan)
}
@@ -51,8 +51,8 @@ use promql::extension_plan::{
RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn,
};
use promql::functions::{
quantile_udaf, AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters,
IDelta, Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters, IDelta,
Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
QuantileOverTime, Rate, Resets, Round, StddevOverTime, StdvarOverTime, SumOverTime,
};
use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME};
@@ -266,10 +266,7 @@ impl PromPlanner {
aggr_expr: &AggregateExpr,
) -> Result<LogicalPlan> {
let AggregateExpr {
op,
expr,
modifier,
param,
op, expr, modifier, ..
} = aggr_expr;

let input = self.prom_expr_to_plan(expr, session_state).await?;
@@ -280,40 +277,19 @@ impl PromPlanner {
_ => {
// calculate columns to group by
// Need to append time index column into group by columns
let mut group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
// convert op and value columns to aggregate exprs
let (aggr_exprs, prev_field_exprs) =
self.create_aggregate_exprs(*op, param, &input)?;
let aggr_exprs = self.create_aggregate_exprs(*op, &input)?;

// create plan
let builder = LogicalPlanBuilder::from(input);
let builder = if op.id() == token::T_COUNT_VALUES {
let label = Self::get_param_value_as_str(*op, param)?;
// `count_values` must be grouped by fields,
// and project the fields to the new label.
group_exprs.extend(prev_field_exprs.clone());
let project_fields = self
.create_field_column_exprs()?
.into_iter()
.chain(self.create_tag_column_exprs()?)
.chain(Some(self.create_time_index_column_expr()?))
.chain(prev_field_exprs.into_iter().map(|expr| expr.alias(label)));

builder
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
.project(project_fields)
.context(DataFusionPlanningSnafu)?
} else {
builder
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
};

let sort_expr = group_exprs.into_iter().map(|expr| expr.sort(true, false));

builder
.sort(sort_expr)
let group_sort_expr = group_exprs
.clone()
.into_iter()
.map(|expr| expr.sort(true, false));
LogicalPlanBuilder::from(input)
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
.sort(group_sort_expr)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)
@@ -336,7 +312,18 @@ impl PromPlanner {

let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, false)?;

let val = Self::get_param_value_as_f64(*op, param)?;
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: (*op).to_string(),
})?;

let PromExpr::NumberLiteral(NumberLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: (*op).to_string(),
}
.fail();
};

// convert op and value columns to window exprs.
let window_exprs = self.create_window_exprs(*op, group_exprs.clone(), &input)?;
@@ -354,7 +341,7 @@ impl PromPlanner {
let predicate = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(col(rank)),
op: Operator::LtEq,
right: Box::new(lit(val)),
right: Box::new(lit(*val)),
});

match expr {
@@ -939,7 +926,7 @@ impl PromPlanner {
Some(Offset::Neg(duration)) => -(duration.as_millis() as Millisecond),
None => 0,
};
let mut scan_filters = Self::matchers_to_expr(label_matchers.clone(), table_schema)?;
let mut scan_filters = self.matchers_to_expr(label_matchers.clone(), table_schema)?;
if let Some(time_index_filter) = self.build_time_index_filter(offset_duration)? {
scan_filters.push(time_index_filter);
}
@@ -1135,7 +1122,8 @@ impl PromPlanner {
}

// TODO(ruihang): ignore `MetricNameLabel` (`__name__`) matcher
pub fn matchers_to_expr(
fn matchers_to_expr(
&self,
label_matchers: Matchers,
table_schema: &DFSchemaRef,
) -> Result<Vec<DfExpr>> {
@@ -1943,44 +1931,32 @@ impl PromPlanner {
})
}

/// Creates a set of DataFusion `DfExpr::AggregateFunction` expressions for each value column using the specified aggregate function.
/// Create [DfExpr::AggregateFunction] expr for each value column with given aggregate function.
///
/// # Side Effects
///
/// This method modifies the value columns in the context by replacing them with the new columns
/// created by the aggregate function application.
///
/// # Returns
///
/// Returns a tuple of `(aggregate_expressions, previous_field_expressions)` where:
/// - `aggregate_expressions`: Expressions that apply the aggregate function to the original fields
/// - `previous_field_expressions`: Original field expressions before aggregation. This is non-empty
/// only when the operation is `count_values`, as this operation requires preserving the original
/// values for grouping.
/// # Side effect
///
/// This method will update value columns in context to the new value columns created by
/// aggregate function.
fn create_aggregate_exprs(
&mut self,
op: TokenType,
param: &Option<Box<PromExpr>>,
input_plan: &LogicalPlan,
) -> Result<(Vec<DfExpr>, Vec<DfExpr>)> {
) -> Result<Vec<DfExpr>> {
let aggr = match op.id() {
token::T_SUM => sum_udaf(),
token::T_QUANTILE => {
let q = Self::get_param_value_as_f64(op, param)?;
quantile_udaf(q)
}
token::T_AVG => avg_udaf(),
token::T_COUNT_VALUES | token::T_COUNT => count_udaf(),
token::T_COUNT => count_udaf(),
token::T_MIN => min_udaf(),
token::T_MAX => max_udaf(),
token::T_GROUP => grouping_udaf(),
token::T_STDDEV => stddev_pop_udaf(),
token::T_STDVAR => var_pop_udaf(),
token::T_TOPK | token::T_BOTTOMK => UnsupportedExprSnafu {
name: format!("{op:?}"),
token::T_TOPK | token::T_BOTTOMK | token::T_COUNT_VALUES | token::T_QUANTILE => {
UnsupportedExprSnafu {
name: format!("{op:?}"),
}
.fail()?
}
.fail()?,
_ => UnexpectedTokenSnafu { token: op }.fail()?,
};

@@ -1990,41 +1966,19 @@ impl PromPlanner {
.field_columns
.iter()
.map(|col| {
Ok(DfExpr::AggregateFunction(AggregateFunction {
DfExpr::AggregateFunction(AggregateFunction {
func: aggr.clone(),
args: vec![DfExpr::Column(Column::from_name(col))],
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
}))
})
})
.collect::<Result<Vec<_>>>()?;
.collect();

// if the aggregator is `count_values`, it must be grouped by current fields.
let prev_field_exprs = if op.id() == token::T_COUNT_VALUES {
let prev_field_exprs: Vec<_> = self
.ctx
.field_columns
.iter()
.map(|col| DfExpr::Column(Column::from_name(col)))
.collect();

ensure!(
self.ctx.field_columns.len() == 1,
UnsupportedExprSnafu {
name: "count_values on multi-value input"
}
);

prev_field_exprs
} else {
vec![]
};

// update value column name according to the aggregators,
// update value column name according to the aggregators
let mut new_field_columns = Vec::with_capacity(self.ctx.field_columns.len());

let normalized_exprs =
normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
for expr in normalized_exprs {
@@ -2032,39 +1986,7 @@ impl PromPlanner {
}
self.ctx.field_columns = new_field_columns;

Ok((exprs, prev_field_exprs))
}

fn get_param_value_as_str(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<&str> {
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
})?;
let PromExpr::StringLiteral(StringLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
}
.fail();
};

Ok(val)
}

fn get_param_value_as_f64(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<f64> {
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
})?;
let PromExpr::NumberLiteral(NumberLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
}
.fail();
};

Ok(*val)
Ok(exprs)
}

/// Create [DfExpr::WindowFunction] expr for each value column with given window function.
@@ -3420,6 +3342,30 @@ mod test {
do_aggregate_expr_plan("stdvar", "var_pop").await;
}

#[tokio::test]
#[should_panic]
async fn aggregate_top_k() {
do_aggregate_expr_plan("topk", "").await;
}

#[tokio::test]
#[should_panic]
async fn aggregate_bottom_k() {
do_aggregate_expr_plan("bottomk", "").await;
}

#[tokio::test]
#[should_panic]
async fn aggregate_count_values() {
do_aggregate_expr_plan("count_values", "").await;
}

#[tokio::test]
#[should_panic]
async fn aggregate_quantile() {
do_aggregate_expr_plan("quantile", "").await;
}

// TODO(ruihang): add range fn tests once exprs are ready.

// {
@@ -4302,98 +4248,4 @@ mod test {

assert_eq!(plan.display_indent_schema().to_string(), expected);
}

#[tokio::test]
async fn test_count_values_expr() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}) by (ip)"#;

let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
let table_provider = build_test_table_provider_with_fields(
&[
(
DEFAULT_SCHEMA_NAME.to_string(),
"prometheus_tsdb_head_series".to_string(),
),
(
DEFAULT_SCHEMA_NAME.to_string(),
"http_server_requests_seconds_count".to_string(),
),
],
&["ip"],
)
.await;

let plan = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state())
.await
.unwrap();
let expected = r#"Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N]
Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N, greptime_value:Float64;N]
Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N, greptime_value:Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [false] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Sort: prometheus_tsdb_head_series.ip DESC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp DESC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Filter: prometheus_tsdb_head_series.ip ~ Utf8("(10\.0\.160\.237:8080|10\.0\.160\.237:9090)") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-1000, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100001000, None) [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]"#;

assert_eq!(plan.display_indent_schema().to_string(), expected);
}

#[tokio::test]
async fn test_quantile_expr() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
let case = r#"quantile(0.3, sum(prometheus_tsdb_head_series{ip=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}) by (ip))"#;

let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
let table_provider = build_test_table_provider_with_fields(
&[
(
DEFAULT_SCHEMA_NAME.to_string(),
"prometheus_tsdb_head_series".to_string(),
),
(
DEFAULT_SCHEMA_NAME.to_string(),
"http_server_requests_seconds_count".to_string(),
),
],
&["ip"],
)
.await;

let plan = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state())
.await
.unwrap();
let expected = r#"Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [greptime_timestamp:Timestamp(Millisecond, None), quantile(sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[quantile(sum(prometheus_tsdb_head_series.greptime_value))]] [greptime_timestamp:Timestamp(Millisecond, None), quantile(sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]
Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [false] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Sort: prometheus_tsdb_head_series.ip DESC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp DESC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Filter: prometheus_tsdb_head_series.ip ~ Utf8("(10\.0\.160\.237:8080|10\.0\.160\.237:9090)") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-1000, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100001000, None) [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]"#;

assert_eq!(plan.display_indent_schema().to_string(), expected);
}
}

@@ -410,15 +410,6 @@ pub enum Error {
source: query::error::Error,
},

#[snafu(display("Failed to parse timestamp: {}", timestamp))]
ParseTimestamp {
timestamp: String,
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: query::error::Error,
},

#[snafu(display("{}", reason))]
UnexpectedResult {
reason: String,
@@ -694,8 +685,7 @@ impl ErrorExt for Error {
| PrepareStatementNotFound { .. }
| FailedToParseQuery { .. }
| InvalidElasticsearchInput { .. }
| InvalidJaegerQuery { .. }
| ParseTimestamp { .. } => StatusCode::InvalidArguments,
| InvalidJaegerQuery { .. } => StatusCode::InvalidArguments,

Catalog { source, .. } => source.status_code(),
RowWriter { source, .. } => source.status_code(),

@@ -29,7 +29,7 @@ use common_time::util::{current_time_rfc3339, yesterday_rfc3339};
use common_version::OwnedBuildInfo;
use datatypes::prelude::ConcreteDataType;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::Float64Vector;
use datatypes::vectors::{Float64Vector, StringVector};
use futures::future::join_all;
use futures::StreamExt;
use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME};
@@ -38,7 +38,7 @@ use promql_parser::parser::{
AggregateExpr, BinaryExpr, Call, Expr as PromqlExpr, MatrixSelector, ParenExpr, SubqueryExpr,
UnaryExpr, VectorSelector,
};
use query::parser::{PromQuery, QueryLanguageParser, DEFAULT_LOOKBACK_STRING};
use query::parser::{PromQuery, DEFAULT_LOOKBACK_STRING};
use query::promql::planner::normalize_matcher;
use serde::de::{self, MapAccess, Visitor};
use serde::{Deserialize, Serialize};
@@ -51,8 +51,8 @@ use store_api::metric_engine_consts::{

pub use super::result::prometheus_resp::PrometheusJsonResponse;
use crate::error::{
CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, ParseTimestampSnafu, Result,
TableNotFoundSnafu, UnexpectedResultSnafu,
CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, Result, TableNotFoundSnafu,
UnexpectedResultSnafu,
};
use crate::http::header::collect_plan_metrics;
use crate::prom_store::{FIELD_NAME_LABEL, METRIC_NAME_LABEL};
@@ -994,58 +994,44 @@ pub async fn label_values_query(

let start = params.start.unwrap_or_else(yesterday_rfc3339);
let end = params.end.unwrap_or_else(current_time_rfc3339);
let lookback = params
.lookback
.unwrap_or_else(|| DEFAULT_LOOKBACK_STRING.to_string());

let mut label_values = HashSet::new();

let start = try_call_return_response!(QueryLanguageParser::parse_promql_timestamp(&start)
.context(ParseTimestampSnafu { timestamp: &start }));
let end = try_call_return_response!(QueryLanguageParser::parse_promql_timestamp(&end)
.context(ParseTimestampSnafu { timestamp: &end }));

let mut merge_map = HashMap::new();
for query in queries {
let promql_expr = try_call_return_response!(promql_parser::parser::parse(&query));
let PromqlExpr::VectorSelector(VectorSelector { name, matchers, .. }) = promql_expr else {
return PrometheusJsonResponse::error(
StatusCode::InvalidArguments,
"expected vector selector",
);
let prom_query = PromQuery {
query,
start: start.clone(),
end: end.clone(),
step: DEFAULT_LOOKBACK_STRING.to_string(),
lookback: lookback.clone(),
};
let Some(name) = name else {
return PrometheusJsonResponse::error(
StatusCode::InvalidArguments,
"expected metric name",
);
};
// Only use and filter matchers.
let matchers = matchers.matchers;
let result = handler
.query_label_values(
name,
label_name.to_string(),
matchers,
start,
end,
&query_ctx,
)
.await;

match result {
Ok(result) => {
label_values.extend(result.into_iter());
}
Err(err) => {
// Prometheus won't report error if querying nonexist label and metric
if err.status_code() != StatusCode::TableNotFound
&& err.status_code() != StatusCode::TableColumnNotFound
{
return PrometheusJsonResponse::error(err.status_code(), err.output_msg());
}
let result = handler.do_query(&prom_query, query_ctx.clone()).await;
if let Err(err) =
retrieve_label_values(result, &label_name, &mut label_values, &mut merge_map).await
{
// Prometheus won't report error if querying nonexist label and metric
if err.status_code() != StatusCode::TableNotFound
&& err.status_code() != StatusCode::TableColumnNotFound
{
return PrometheusJsonResponse::error(err.status_code(), err.output_msg());
}
}
}

let merge_map = merge_map
.into_iter()
.map(|(k, v)| (k, Value::from(v)))
.collect();

let mut label_values: Vec<_> = label_values.into_iter().collect();
label_values.sort_unstable();
PrometheusJsonResponse::success(PrometheusResponse::LabelValues(label_values))
let mut resp = PrometheusJsonResponse::success(PrometheusResponse::LabelValues(label_values));
resp.resp_metrics = merge_map;
resp
}

async fn retrieve_field_names(
@@ -1090,6 +1076,71 @@ async fn retrieve_field_names(
Ok(field_columns)
}

async fn retrieve_label_values(
result: Result<Output>,
label_name: &str,
labels_values: &mut HashSet<String>,
metrics: &mut HashMap<String, u64>,
) -> Result<()> {
let result = result?;
match result.data {
OutputData::RecordBatches(batches) => {
retrieve_label_values_from_record_batch(batches, label_name, labels_values).await
}
OutputData::Stream(stream) => {
let batches = RecordBatches::try_collect(stream)
.await
.context(CollectRecordbatchSnafu)?;
retrieve_label_values_from_record_batch(batches, label_name, labels_values).await
}
OutputData::AffectedRows(_) => UnexpectedResultSnafu {
reason: "expected data result, but got affected rows".to_string(),
}
.fail(),
}?;

if let Some(ref plan) = result.meta.plan {
collect_plan_metrics(plan, &mut [metrics]);
}

Ok(())
}

async fn retrieve_label_values_from_record_batch(
batches: RecordBatches,
label_name: &str,
labels_values: &mut HashSet<String>,
) -> Result<()> {
let Some(label_col_idx) = batches.schema().column_index_by_name(label_name) else {
return Ok(());
};

// check whether label_name belongs to tag column
match batches
.schema()
.column_schema_by_name(label_name)
.unwrap()
.data_type
{
ConcreteDataType::String(_) => {}
_ => return Ok(()),
}
for batch in batches.iter() {
let label_column = batch
.column(label_col_idx)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
for row_index in 0..batch.num_rows() {
if let Some(label_value) = label_column.get_data(row_index) {
let _ = labels_values.insert(label_value.to_string());
}
}
}

Ok(())
}

/// Try to parse and extract the name of referenced metric from the promql query.
///
/// Returns the metric name if a single metric is referenced, otherwise None.

Some files were not shown because too many files have changed in this diff.