Compare commits

..

39 Commits

Author SHA1 Message Date
evenyag   d5760a7348  chore: remove unused codes  2025-03-17 15:20:42 +08:00
discord9  bc9614e22c  feat: file parallel  2025-03-10 21:00:40 +08:00
discord9  7dd9e98ff6  docs: chore  2025-03-10 16:12:28 +08:00
evenyag   fb6b7f7801  fix: use label value to add map  2025-03-10 15:17:59 +08:00
evenyag   87d7c316df  fix: use label value as table name  2025-03-10 14:42:19 +08:00
evenyag   c80a73bc20  feat: use pb in parquet  2025-03-10 14:40:29 +08:00
discord9  dd9d13e7df  fix: cli arg  2025-03-10 14:18:35 +08:00
evenyag   79d249f5fa  feat: fix panic in TimeSeriesParquetReader  2025-03-10 14:13:37 +08:00
evenyag   63bc544514  refactor: use constant  2025-03-10 14:02:27 +08:00
evenyag   30c29539a3  feat: special handle metric engine path  2025-03-10 13:58:46 +08:00
evenyag   359da62d9e  feat: use parquet  2025-03-10 13:36:49 +08:00
evenyag   c9f4b36360  fix: use flushed_sequence as we can't set sequence in ingester  2025-03-10 13:36:49 +08:00
discord9  85c346b16a  chore: progress bar  2025-03-10 11:53:33 +08:00
discord9  738c23beb0  feat: time unit  2025-03-10 11:25:23 +08:00
evenyag   8aadd1e59a  feat: parquet remote write reader  2025-03-09 23:42:08 +08:00
discord9  cbd58291da  chore: more logs  2025-03-09 23:29:58 +08:00
evenyag   e522e8959b  chore: add more logs  2025-03-09 21:19:55 +08:00
evenyag   7183a93e5a  feat: sanitize mito config  2025-03-09 21:05:21 +08:00
evenyag   8c538622e2  feat: add logs  2025-03-09 20:52:02 +08:00
evenyag   142dacb2c8  chore: update fs object build  2025-03-09 20:52:02 +08:00
discord9  371afc458f  chore: init logging  2025-03-09 20:44:53 +08:00
discord9  0751cd74c0  feat: all in one cfg  2025-03-09 20:36:10 +08:00
discord9  ec34e8739a  fix: is file  2025-03-09 19:55:12 +08:00
evenyag   b650743785  feat: implement converter convert  2025-03-09 19:53:36 +08:00
discord9  80a8b2e1bd  feat: debug output file option  2025-03-09 17:23:14 +08:00
discord9  ec8a15cadd  feat: ingester(WIP)  2025-03-09 16:57:26 +08:00
evenyag   f929d751a5  feat: update api  2025-03-09 16:39:35 +08:00
evenyag   fad3621a7a  feat: define converter api  2025-03-09 16:05:52 +08:00
evenyag   87723effc7  feat: declare converter  2025-03-09 15:33:49 +08:00
evenyag   62a333ad09  feat: import datanode  2025-03-09 15:32:02 +08:00
evenyag   6ad186a13e  feat: series to batch  2025-03-09 15:09:13 +08:00
discord9  77dee84a75  fix: parquet also sort by pk  2025-03-09 14:47:34 +08:00
evenyag   a57e263e5a  feat: sort time series  2025-03-08 22:20:13 +08:00
discord9  8796ddaf31  chore: remove unwrap  2025-03-08 20:32:11 +08:00
discord9  7fa3fbdfef  feat: parquet reader  2025-03-08 20:27:44 +08:00
jeremyhi  457d2a620c  feat: add get table api  2025-03-08 19:53:15 +08:00
evenyag   9f14edbb28  feat: implement sst writer  2025-03-08 17:22:03 +08:00
evenyag   cb3fad0c2d  chore: add deps  2025-03-08 16:17:49 +08:00
evenyag   2d1e7c2441  feat: init the converter crate  2025-03-08 14:15:35 +08:00
125 changed files with 6380 additions and 8960 deletions

View File

@@ -1,52 +0,0 @@
name: Check Grafana Panels
on:
pull_request:
branches:
- main
paths:
- 'grafana/**' # Trigger only when files under the grafana/ directory change
jobs:
check-panels:
runs-on: ubuntu-latest
steps:
# Check out the repository
- name: Checkout repository
uses: actions/checkout@v4
# Install jq (required for the script)
- name: Install jq
run: sudo apt-get install -y jq
# Make the check.sh script executable
- name: Make check.sh executable
run: chmod +x grafana/check.sh
# Run the check.sh script
- name: Run check.sh
run: ./grafana/check.sh
# Only run summary.sh for pull_request events (not for merge queues or final pushes)
- name: Check if this is a pull request
id: check-pr
run: |
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
echo "is_pull_request=true" >> $GITHUB_OUTPUT
else
echo "is_pull_request=false" >> $GITHUB_OUTPUT
fi
# Make the summary.sh script executable
- name: Make summary.sh executable
if: steps.check-pr.outputs.is_pull_request == 'true'
run: chmod +x grafana/summary.sh
# Run the summary.sh script and add its output to the GitHub Job Summary
- name: Run summary.sh and add to Job Summary
if: steps.check-pr.outputs.is_pull_request == 'true'
run: |
SUMMARY=$(./grafana/summary.sh)
echo "### Summary of Grafana Panels" >> $GITHUB_STEP_SUMMARY
echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY

Cargo.lock (generated)
View File

@@ -1594,7 +1594,7 @@ dependencies = [
"bitflags 1.3.2",
"strsim 0.8.0",
"textwrap 0.11.0",
"unicode-width",
"unicode-width 0.1.14",
"vec_map",
]
@@ -1876,7 +1876,7 @@ checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
dependencies = [
"strum 0.26.3",
"strum_macros 0.26.4",
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -2469,6 +2469,7 @@ dependencies = [
"encode_unicode",
"lazy_static",
"libc",
"unicode-width 0.1.14",
"windows-sys 0.52.0",
]
@@ -4167,7 +4168,6 @@ dependencies = [
"bytes",
"cache",
"catalog",
"chrono",
"client",
"common-base",
"common-catalog",
@@ -4646,7 +4646,7 @@ version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -5567,7 +5567,6 @@ dependencies = [
"rand",
"regex",
"regex-automata 0.4.8",
"roaring",
"serde",
"serde_json",
"snafu 0.8.5",
@@ -5601,6 +5600,19 @@ dependencies = [
"serde",
]
[[package]]
name = "indicatif"
version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
dependencies = [
"console",
"number_prefix",
"portable-atomic",
"unicode-width 0.2.0",
"web-time 1.1.0",
]
[[package]]
name = "inferno"
version = "0.11.21"
@@ -5630,6 +5642,25 @@ dependencies = [
"snafu 0.7.5",
]
[[package]]
name = "ingester"
version = "0.13.0"
dependencies = [
"clap 4.5.19",
"common-telemetry",
"common-time",
"datanode",
"meta-client",
"mito2",
"object-store",
"reqwest",
"serde",
"serde_json",
"sst-convert",
"tokio",
"toml 0.8.19",
]
[[package]]
name = "inotify"
version = "0.9.6"
@@ -5899,15 +5930,15 @@ dependencies = [
[[package]]
name = "jsonpath-rust"
version = "0.7.5"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c00ae348f9f8fd2d09f82a98ca381c60df9e0820d8d79fce43e649b4dc3128b"
checksum = "69a61b87f6a55cc6c28fed5739dd36b9642321ce63e4a5e4a4715d69106f4a10"
dependencies = [
"pest",
"pest_derive",
"regex",
"serde_json",
"thiserror 2.0.12",
"thiserror 1.0.64",
]
[[package]]
@@ -7519,6 +7550,12 @@ dependencies = [
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "objc"
version = "0.2.7"
@@ -7975,7 +8012,7 @@ version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ad9b889f1b12e0b9ee24db044b5129150d5eada288edc800f789928dc8c0e3"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -8071,6 +8108,19 @@ dependencies = [
"zstd-sys",
]
[[package]]
name = "parquet_opendal"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4140ae96f37c170f8d684a544711fabdac1d94adcbd97e8b033329bd37f40446"
dependencies = [
"async-trait",
"bytes",
"futures",
"opendal",
"parquet",
]
[[package]]
name = "parse-zoneinfo"
version = "0.3.1"
@@ -8272,7 +8322,7 @@ dependencies = [
"rand",
"ring",
"rust_decimal",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tokio",
"tokio-rustls 0.26.0",
"tokio-util",
@@ -8384,7 +8434,7 @@ dependencies = [
"greptime-proto",
"itertools 0.10.5",
"jsonb",
"jsonpath-rust 0.7.5",
"jsonpath-rust 0.7.3",
"lazy_static",
"moka",
"once_cell",
@@ -8762,7 +8812,6 @@ dependencies = [
"common-recordbatch",
"common-telemetry",
"datafusion",
"datafusion-common",
"datafusion-expr",
"datatypes",
"futures",
@@ -8776,9 +8825,8 @@ dependencies = [
[[package]]
name = "promql-parser"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c6b1429bdd199d53bd58b745075c1652efedbe2746e5d4f0d56d3184dda48ec"
version = "0.4.3"
source = "git+https://github.com/GreptimeTeam/promql-parser.git?rev=27abb8e16003a50c720f00d6c85f41f5fa2a2a8e#27abb8e16003a50c720f00d6c85f41f5fa2a2a8e"
dependencies = [
"cfgrammar",
"chrono",
@@ -9636,16 +9684,6 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "roaring"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41589aba99537475bf697f2118357cad1c31590c5a1b9f6d9fc4ad6d07503661"
dependencies = [
"bytemuck",
"byteorder",
]
[[package]]
name = "robust"
version = "1.1.0"
@@ -10070,7 +10108,7 @@ dependencies = [
"radix_trie",
"scopeguard",
"unicode-segmentation",
"unicode-width",
"unicode-width 0.1.14",
"utf8parse",
"winapi",
]
@@ -11065,7 +11103,7 @@ dependencies = [
"serde_json",
"sha2",
"smallvec",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tokio",
"tokio-stream",
"tracing",
@@ -11150,7 +11188,7 @@ dependencies = [
"smallvec",
"sqlx-core",
"stringprep",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tracing",
"whoami",
]
@@ -11188,7 +11226,7 @@ dependencies = [
"smallvec",
"sqlx-core",
"stringprep",
"thiserror 2.0.12",
"thiserror 2.0.6",
"tracing",
"whoami",
]
@@ -11217,6 +11255,36 @@ dependencies = [
"url",
]
[[package]]
name = "sst-convert"
version = "0.13.0"
dependencies = [
"api",
"arrow-array",
"async-trait",
"catalog",
"common-error",
"common-macro",
"common-meta",
"common-recordbatch",
"common-telemetry",
"datanode",
"datatypes",
"futures",
"futures-util",
"indicatif",
"meta-client",
"metric-engine",
"mito2",
"object-store",
"parquet",
"parquet_opendal",
"prost 0.13.3",
"snafu 0.8.5",
"store-api",
"table",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
@@ -11949,7 +12017,7 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
"unicode-width 0.1.14",
]
[[package]]
@@ -11969,11 +12037,11 @@ dependencies = [
[[package]]
name = "thiserror"
version = "2.0.12"
version = "2.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47"
dependencies = [
"thiserror-impl 2.0.12",
"thiserror-impl 2.0.6",
]
[[package]]
@@ -11989,9 +12057,9 @@ dependencies = [
[[package]]
name = "thiserror-impl"
version = "2.0.12"
version = "2.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312"
dependencies = [
"proc-macro2",
"quote",
@@ -13052,6 +13120,12 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
[[package]]
name = "unicode-width"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]]
name = "unicode-xid"
version = "0.2.6"

View File

@@ -41,6 +41,7 @@ members = [
"src/flow",
"src/frontend",
"src/index",
"src/ingester",
"src/log-query",
"src/log-store",
"src/meta-client",
@@ -58,6 +59,7 @@ members = [
"src/servers",
"src/session",
"src/sql",
"src/sst-convert",
"src/store-api",
"src/table",
"tests-fuzz",
@@ -160,7 +162,9 @@ parquet = { version = "53.0.0", default-features = false, features = ["arrow", "
paste = "1.0"
pin-project = "1.0"
prometheus = { version = "0.13.3", features = ["process"] }
promql-parser = { version = "0.5", features = ["ser"] }
promql-parser = { git = "https://github.com/GreptimeTeam/promql-parser.git", features = [
"ser",
], rev = "27abb8e16003a50c720f00d6c85f41f5fa2a2a8e" }
prost = "0.13"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"
@@ -269,6 +273,7 @@ query = { path = "src/query" }
servers = { path = "src/servers" }
session = { path = "src/session" }
sql = { path = "src/sql" }
sst-convert = { path = "src/sst-convert" }
store-api = { path = "src/store-api" }
substrait = { path = "src/common/substrait" }
table = { path = "src/table" }

chore.md (new file)
View File

@@ -0,0 +1,76 @@
# log
## first create table
```bash
mysql --host=127.0.0.1 --port=19195 --database=public;
```
```sql
CREATE DATABASE IF NOT EXISTS `cluster1`;
USE `cluster1`;
CREATE TABLE IF NOT EXISTS `app1` (
`greptime_timestamp` TimestampNanosecond NOT NULL TIME INDEX,
`app` STRING NULL INVERTED INDEX,
`cluster` STRING NULL INVERTED INDEX,
`message` STRING NULL,
`region` STRING NULL,
`cloud-provider` STRING NULL,
`environment` STRING NULL,
`product` STRING NULL,
`sub-product` STRING NULL,
`service` STRING NULL
) WITH (
append_mode = 'true',
'compaction.type' = 'twcs',
'compaction.twcs.max_output_file_size' = '500MB',
'compaction.twcs.max_active_window_files' = '16',
'compaction.twcs.max_active_window_runs' = '4',
'compaction.twcs.max_inactive_window_files' = '4',
'compaction.twcs.max_inactive_window_runs' = '2'
);
select count(*) from app1;
SELECT * FROM app1 ORDER BY greptime_timestamp DESC LIMIT 10\G
```
## then ingest
```bash
RUST_LOG="debug" cargo run --bin=ingester -- --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --parquet-dir="parquet_store/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
```
# metrics!!!!!!!
```bash
mysql --host=127.0.0.1 --port=19195 --database=public < public.greptime_physical_table-create-tables.sql
```
## then ingest
```bash
RUST_LOG="debug"
cargo run --bin=ingester -- --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --remote-write-dir="metrics_parquet/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
# perf it
cargo build --release --bin=ingester
samply record target/release/ingester --input-dir="/home/discord9/greptimedb/parquet_store_bk/" --remote-write-dir="metrics_parquet/" --cfg="ingester.toml" --db-http-addr="http://127.0.0.1:4000/v1/sst/ingest_json"
```
## check data
```sql
select count(*) from greptime_physical_table;
+----------+
| count(*) |
+----------+
| 36200 |
+----------+
1 row in set (0.06 sec)
select count(*) from storage_operation_errors_total;
+----------+
| count(*) |
+----------+
| 10 |
+----------+
1 row in set (0.03 sec)
```
# with oss
the steps are the same; the only difference is the storage config in `ingester.toml`
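For reference, a rough sketch of what the `[storage]` section might look like when pointing at OSS; the option names follow the usual GreptimeDB object storage settings, and the bucket, credentials, and endpoint below are placeholders:
```toml
## The data storage options (OSS variant, values are placeholders).
[storage]
## Local working directory for caches and temporary files.
data_home = "/tmp/greptimedb-cluster/datanode0"
type = "Oss"
## Target bucket and root path inside it.
bucket = "my-greptimedb-bucket"
root = "data"
## Credentials and region endpoint for the bucket.
access_key_id = "<access key id>"
access_key_secret = "<access key secret>"
endpoint = "https://oss-cn-hangzhou.aliyuncs.com"
```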

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
# Use jq to check for panels with empty or missing descriptions
invalid_panels=$(cat $BASEDIR/greptimedb-cluster.json | jq -r '
.panels[]
| select((.type == "stats" or .type == "timeseries") and (.description == "" or .description == null))
')
# Check if any invalid panels were found
if [[ -n "$invalid_panels" ]]; then
echo "Error: The following panels have empty or missing descriptions:"
echo "$invalid_panels"
exit 1
else
echo "All panels with type 'stats' or 'timeseries' have valid descriptions."
exit 0
fi

File diff suppressed because it is too large

View File

@@ -1,11 +0,0 @@
#!/usr/bin/env bash
BASEDIR=$(dirname "$0")
echo '| Title | Description | Expressions |
|---|---|---|'
cat $BASEDIR/greptimedb-cluster.json | jq -r '
.panels |
map(select(.type == "stat" or .type == "timeseries")) |
.[] | "| \(.title) | \(.description | gsub("\n"; "<br>")) | \(.targets | map(.expr // .rawSql | "`\(.|gsub("\n"; "<br>"))`") | join("<br>")) |"
'

ingester.toml (new file)
View File

@@ -0,0 +1,35 @@
## The metasrv client options.
[meta_client]
## The addresses of the metasrv.
metasrv_addrs = ["127.0.0.1:3002", "127.0.0.1:3003"]
## Operation timeout.
timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout.
ddl_timeout = "10s"
## Connect server timeout.
connect_timeout = "1s"
## `TCP_NODELAY` option for accepted connections.
tcp_nodelay = true
## The configuration about the cache of the metadata.
metadata_cache_max_capacity = 100000
## TTL of the metadata cache.
metadata_cache_ttl = "10m"
## TTI of the metadata cache.
metadata_cache_tti = "5m"
## The data storage options.
[storage]
## The working home directory.
data_home = "/tmp/greptimedb-cluster/datanode0"
type = "File"
[mito]

View File

@@ -16,6 +16,7 @@
mod client;
pub mod client_manager;
#[cfg(feature = "testing")]
mod database;
pub mod error;
pub mod flow;
@@ -33,6 +34,7 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;
pub use self::client::Client;
#[cfg(feature = "testing")]
pub use self::database::Database;
pub use self::error::{Error, Result};
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};

View File

@@ -32,7 +32,7 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendClient, FrontendInvoker};
use flow::{FlownodeBuilder, FlownodeInstance, FrontendInvoker};
use meta_client::{MetaClientOptions, MetaClientType};
use servers::Mode;
use snafu::{OptionExt, ResultExt};
@@ -311,8 +311,6 @@ impl StartCommand {
Arc::new(executor),
);
let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
let flownode_builder = FlownodeBuilder::new(
opts,
@@ -320,7 +318,6 @@ impl StartCommand {
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);

View File

@@ -54,10 +54,7 @@ use datanode::config::{DatanodeOptions, ProcedureConfig, RegionEngineConfig, Sto
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendClient,
FrontendInvoker,
};
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
use frontend::frontend::FrontendOptions;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
@@ -536,16 +533,12 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
let fe_server_addr = fe_opts.grpc.bind_addr.clone();
let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
Arc::new(frontend_client),
);
let flownode = Arc::new(
flow_builder

View File

@@ -445,20 +445,10 @@ impl Pool {
async fn recycle_channel_in_loop(pool: Arc<Pool>, interval_secs: u64) {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
// use weak ref here to prevent pool being leaked
let pool_weak = {
let weak = Arc::downgrade(&pool);
drop(pool);
weak
};
loop {
let _ = interval.tick().await;
if let Some(pool) = pool_weak.upgrade() {
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
} else {
// no one is using this pool, so we can also let go
break;
}
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
}
}

View File

@@ -337,7 +337,6 @@ pub enum FlowType {
impl FlowType {
pub const RECORDING_RULE: &str = "recording_rule";
pub const STREAMING: &str = "streaming";
pub const FLOW_TYPE_KEY: &str = "flow_type";
}
impl Default for FlowType {
@@ -392,8 +391,7 @@ impl From<&CreateFlowData> for CreateRequest {
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
req.flow_options
.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
req.flow_options.insert("flow_type".to_string(), flow_type);
req
}
}
@@ -425,7 +423,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
.collect::<Vec<_>>();
let flow_type = value.flow_type.unwrap_or_default().to_string();
options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
options.insert("flow_type".to_string(), flow_type);
let flow_info = FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),

View File

@@ -25,6 +25,6 @@ pub mod heartbeat;
pub mod metrics;
pub mod region_server;
pub mod service;
mod store;
pub mod store;
#[cfg(any(test, feature = "testing"))]
pub mod tests;

View File

@@ -15,7 +15,7 @@
//! object storage utilities
mod azblob;
mod fs;
pub mod fs;
mod gcs;
mod oss;
mod s3;

View File

@@ -24,7 +24,8 @@ use crate::config::FileConfig;
use crate::error::{self, Result};
use crate::store;
pub(crate) async fn new_fs_object_store(
/// A helper function to create a file system object store.
pub async fn new_fs_object_store(
data_home: &str,
_file_config: &FileConfig,
) -> Result<ObjectStore> {

View File

@@ -16,7 +16,6 @@ async-trait.workspace = true
bytes.workspace = true
cache.workspace = true
catalog.workspace = true
chrono.workspace = true
client.workspace = true
common-base.workspace = true
common-config.workspace = true

View File

@@ -49,13 +49,12 @@ pub(crate) use crate::adapter::node_context::FlownodeContext;
use crate::adapter::refill::RefillTask;
use crate::adapter::table_source::ManagedTableSource;
use crate::adapter::util::relation_desc_to_column_schemas_with_fallback;
pub(crate) use crate::adapter::worker::{create_worker, WorkerHandle};
pub(crate) use crate::adapter::worker::{create_worker, Worker, WorkerHandle};
use crate::compute::ErrCollector;
use crate::df_optimizer::sql_to_flow_plan;
use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, UnexpectedSnafu};
use crate::expr::Batch;
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::recording_rules::RecordingRuleEngine;
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
mod flownode_impl;
@@ -64,7 +63,7 @@ pub(crate) mod refill;
mod stat;
#[cfg(test)]
mod tests;
pub(crate) mod util;
mod util;
mod worker;
pub(crate) mod node_context;
@@ -170,8 +169,6 @@ pub struct FlowWorkerManager {
flush_lock: RwLock<()>,
/// receive a oneshot sender to send state size report
state_report_handler: RwLock<Option<StateReportHandler>>,
/// engine for recording rule
rule_engine: RecordingRuleEngine,
}
/// Building FlownodeManager
@@ -186,7 +183,6 @@ impl FlowWorkerManager {
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
rule_engine: RecordingRuleEngine,
) -> Self {
let srv_map = ManagedTableSource::new(
table_meta.table_info_manager().clone(),
@@ -209,7 +205,6 @@ impl FlowWorkerManager {
node_id,
flush_lock: RwLock::new(()),
state_report_handler: RwLock::new(None),
rule_engine,
}
}
@@ -218,6 +213,25 @@ impl FlowWorkerManager {
self
}
/// Create a flownode manager together with `num_workers` workers
pub fn new_with_workers<'s>(
node_id: Option<u32>,
query_engine: Arc<dyn QueryEngine>,
table_meta: TableMetadataManagerRef,
num_workers: usize,
) -> (Self, Vec<Worker<'s>>) {
let mut zelf = Self::new(node_id, query_engine, table_meta);
let workers: Vec<_> = (0..num_workers)
.map(|_| {
let (handle, worker) = create_worker();
zelf.add_worker_handle(handle);
worker
})
.collect();
(zelf, workers)
}
/// add a worker handle to the manager, meaning the corresponding worker is under its management
pub fn add_worker_handle(&mut self, handle: WorkerHandle) {
self.worker_handles.push(handle);
@@ -735,11 +749,7 @@ pub struct CreateFlowArgs {
/// Create&Remove flow
impl FlowWorkerManager {
/// remove a flow by it's id
#[allow(unreachable_code)]
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.remove_flow(flow_id).await;
for handle in self.worker_handles.iter() {
if handle.contains_flow(flow_id).await? {
handle.remove_flow(flow_id).await?;
@@ -755,10 +765,8 @@ impl FlowWorkerManager {
/// steps to create task:
/// 1. parse query into typed plan(and optional parse expire_after expr)
/// 2. render source/sink with output table id and used input table id
#[allow(clippy::too_many_arguments, unreachable_code)]
#[allow(clippy::too_many_arguments)]
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
// TODO(discord9): reroute some back to streaming engine later
return self.rule_engine.create_flow(args).await;
let CreateFlowArgs {
flow_id,
sink_table_name,

View File

@@ -153,13 +153,7 @@ impl Flownode for FlowWorkerManager {
}
}
#[allow(unreachable_code, unused)]
async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
return self
.rule_engine
.handle_inserts(request)
.await
.map_err(to_meta_err(snafu::location!()));
// using try_read to ensure two things:
// 1. flush wouldn't happen until inserts before it is inserted
// 2. inserts happening concurrently with flush wouldn't be block by flush
@@ -212,15 +206,15 @@ impl Flownode for FlowWorkerManager {
.collect_vec();
let table_col_names = table_schema.relation_desc.names;
let table_col_names = table_col_names
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
.iter().enumerate()
.map(|(idx,name)| match name {
Some(name) => Ok(name.clone()),
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
.fail().map_err(BoxedError::new).context(ExternalSnafu),
})
.collect::<Result<Vec<_>>>()?;
let name_to_col = HashMap::<_, _>::from_iter(
insert_schema
.iter()

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Some utility functions
use std::sync::Arc;
use api::helper::ColumnDataTypeWrapper;

View File

@@ -16,7 +16,6 @@
use std::any::Any;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_error::{define_into_tonic_status, from_err_code_msg_to_header};
use common_macro::stack_trace_debug;
@@ -54,13 +53,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Time error"))]
Time {
source: common_time::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("External error"))]
External {
source: BoxedError,
@@ -164,15 +156,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Arrow error: {raw:?} in context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Datafusion error: {raw:?} in context: {context}"))]
Datafusion {
#[snafu(source)]
@@ -247,7 +230,6 @@ impl ErrorExt for Error {
match self {
Self::Eval { .. }
| Self::JoinTask { .. }
| Self::Arrow { .. }
| Self::Datafusion { .. }
| Self::InsertIntoFlow { .. } => StatusCode::Internal,
Self::FlowAlreadyExist { .. } => StatusCode::TableAlreadyExists,
@@ -256,9 +238,7 @@ impl ErrorExt for Error {
| Self::FlowNotFound { .. }
| Self::ListFlows { .. } => StatusCode::TableNotFound,
Self::Plan { .. } | Self::Datatypes { .. } => StatusCode::PlanQuery,
Self::InvalidQuery { .. } | Self::CreateFlow { .. } | Self::Time { .. } => {
StatusCode::EngineExecuteQuery
}
Self::InvalidQuery { .. } | Self::CreateFlow { .. } => StatusCode::EngineExecuteQuery,
Self::Unexpected { .. } => StatusCode::Unexpected,
Self::NotImplemented { .. } | Self::UnsupportedTemporalFilter { .. } => {
StatusCode::Unsupported

View File

@@ -238,7 +238,6 @@ mod test {
for (sql, current, expected) in &testcases {
let plan = sql_to_substrait(engine.clone(), sql).await;
let mut ctx = create_test_ctx();
let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
.await

View File

@@ -130,6 +130,13 @@ impl HeartbeatTask {
pub fn shutdown(&self) {
info!("Close heartbeat task for flownode");
if self
.running
.compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire)
.is_err()
{
warn!("Call close heartbeat task multiple times");
}
}
fn new_heartbeat_request(

View File

@@ -33,7 +33,6 @@ mod expr;
pub mod heartbeat;
mod metrics;
mod plan;
mod recording_rules;
mod repr;
mod server;
mod transform;
@@ -44,5 +43,4 @@ mod test_utils;
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
pub use error::{Error, Result};
pub use recording_rules::FrontendClient;
pub use server::{FlownodeBuilder, FlownodeInstance, FlownodeServer, FrontendInvoker};

View File

@@ -28,32 +28,6 @@ lazy_static! {
&["table_id"]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_QUERY_TIME: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_query_time",
"flow rule engine query time",
&["flow_id"],
vec![
0.0,
1.,
3.,
5.,
10.,
20.,
30.,
60.,
2. * 60.,
5. * 60.,
10. * 60.
]
)
.unwrap();
pub static ref METRIC_FLOW_RULE_ENGINE_SLOW_QUERY: HistogramVec = register_histogram_vec!(
"greptime_flow_rule_engine_slow_query",
"flow rule engine slow query",
&["flow_id", "sql", "peer"],
vec![60., 2. * 60., 3. * 60., 5. * 60., 10. * 60.]
)
.unwrap();
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(

View File

@@ -1,940 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Run a flow as a recording rule: a time-window-aware normal query triggered on every tick set by the user
mod engine;
mod frontend_client;
use std::collections::BTreeSet;
use std::sync::Arc;
use api::helper::pb_value_to_value_ref;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::warn;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::Expr;
use datafusion::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
use datafusion::prelude::SessionContext;
use datafusion::sql::unparser::Unparser;
use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter};
use datafusion_common::{DFSchema, TableReference};
use datafusion_expr::{ColumnarValue, LogicalPlan};
use datafusion_physical_expr::PhysicalExprRef;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::scalars::ScalarVector;
use datatypes::schema::TIME_INDEX_KEY;
use datatypes::value::Value;
use datatypes::vectors::{
TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
TimestampSecondVector, Vector,
};
pub use engine::RecordingRuleEngine;
pub use frontend_client::FrontendClient;
use itertools::Itertools;
use query::parser::QueryLanguageParser;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use crate::adapter::util::from_proto_to_data_type;
use crate::df_optimizer::apply_df_optimizer;
use crate::error::{ArrowSnafu, DatafusionSnafu, DatatypesSnafu, ExternalSnafu, UnexpectedSnafu};
use crate::expr::error::DataTypeSnafu;
use crate::Error;
#[derive(Debug, Clone)]
pub struct TimeWindowExpr {
phy_expr: PhysicalExprRef,
column_name: String,
logical_expr: Expr,
df_schema: DFSchema,
}
impl TimeWindowExpr {
pub fn from_expr(expr: &Expr, column_name: &str, df_schema: &DFSchema) -> Result<Self, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
Ok(Self {
phy_expr,
column_name: column_name.to_string(),
logical_expr: expr.clone(),
df_schema: df_schema.clone(),
})
}
pub fn eval(
&self,
current: Timestamp,
) -> Result<(Option<Timestamp>, Option<Timestamp>), Error> {
let lower_bound =
find_expr_time_window_lower_bound(&self.logical_expr, &self.df_schema, current)?;
let upper_bound =
find_expr_time_window_upper_bound(&self.logical_expr, &self.df_schema, current)?;
Ok((lower_bound, upper_bound))
}
/// Find timestamps from rows using time window expr
pub async fn handle_rows(
&self,
rows_list: Vec<api::v1::Rows>,
) -> Result<BTreeSet<Timestamp>, Error> {
let mut time_windows = BTreeSet::new();
for rows in rows_list {
// pick the time index column and use it to eval on `self.expr`
let ts_col_index = rows
.schema
.iter()
.map(|col| col.column_name.clone())
.position(|name| name == self.column_name);
let Some(ts_col_index) = ts_col_index else {
warn!("can't found time index column in schema: {:?}", rows.schema);
continue;
};
let col_schema = &rows.schema[ts_col_index];
let cdt = from_proto_to_data_type(col_schema)?;
let column_values = rows
.rows
.iter()
.map(|row| &row.values[ts_col_index])
.collect_vec();
let mut vector = cdt.create_mutable_vector(column_values.len());
for value in column_values {
let value = pb_value_to_value_ref(value, &None);
vector.try_push_value_ref(value).context(DataTypeSnafu {
msg: "Failed to convert rows to columns",
})?;
}
let vector = vector.to_vector();
let df_schema = create_df_schema_for_ts_column(&self.column_name, cdt)?;
let rb =
DfRecordBatch::try_new(df_schema.inner().clone(), vec![vector.to_arrow_array()])
.with_context(|_e| ArrowSnafu {
context: format!(
"Failed to create record batch from {df_schema:?} and {vector:?}"
),
})?;
let eval_res = self
.phy_expr
.evaluate(&rb)
.with_context(|_| DatafusionSnafu {
context: format!(
"Failed to evaluate physical expression {:?} on {rb:?}",
self.phy_expr
),
})?;
let res = columnar_to_ts_vector(&eval_res)?;
for ts in res.into_iter().flatten() {
time_windows.insert(ts);
}
}
Ok(time_windows)
}
}
fn create_df_schema_for_ts_column(name: &str, cdt: ConcreteDataType) -> Result<DFSchema, Error> {
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
name,
cdt.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare("TimeIndexOnlyTable"))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
Ok(df_schema)
}
/// Convert `ColumnarValue` to `Vec<Option<Timestamp>>`
fn columnar_to_ts_vector(columnar: &ColumnarValue) -> Result<Vec<Option<Timestamp>>, Error> {
let val = match columnar {
datafusion_expr::ColumnarValue::Array(array) => {
let ty = array.data_type();
let ty = ConcreteDataType::from_arrow_type(ty);
let time_unit = if let ConcreteDataType::Timestamp(ty) = ty {
ty.unit()
} else {
return UnexpectedSnafu {
reason: format!("Non-timestamp type: {ty:?}"),
}
.fail();
};
match time_unit {
TimeUnit::Second => TimestampSecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec(),
TimeUnit::Millisecond => {
TimestampMillisecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::try_from_arrow_array(array.clone())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to create vector from arrow array {array:?}"),
})?
.iter_data()
.map(|d| d.map(|d| d.0))
.collect_vec()
}
}
}
datafusion_expr::ColumnarValue::Scalar(scalar) => {
let value = Value::try_from(scalar.clone()).with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert scalar {scalar:?} to value"),
})?;
let ts = value.as_timestamp().context(UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", value),
})?;
vec![Some(ts)]
}
};
Ok(val)
}
/// Convert sql to datafusion logical plan
pub async fn sql_to_df_plan(
query_ctx: QueryContextRef,
engine: QueryEngineRef,
sql: &str,
optimize: bool,
) -> Result<LogicalPlan, Error> {
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = engine
.planner()
.plan(&stmt, query_ctx)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let plan = if optimize {
apply_df_optimizer(plan).await?
} else {
plan
};
Ok(plan)
}
/// Return (the column name of time index column, the time window expr, the expected time unit of time index column, the expr's schema for evaluating the time window)
async fn find_time_window_expr(
plan: &LogicalPlan,
catalog_man: CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<(String, Option<datafusion_expr::Expr>, TimeUnit, DFSchema), Error> {
// TODO(discord9): find the expr that do time window
let mut table_name = None;
// first find the table source in the logical plan
plan.apply(|plan| {
let LogicalPlan::TableScan(table_scan) = plan else {
return Ok(TreeNodeRecursion::Continue);
};
table_name = Some(table_scan.table_name.clone());
Ok(TreeNodeRecursion::Stop)
})
.with_context(|_| DatafusionSnafu {
context: format!("Can't find table source in plan {plan:?}"),
})?;
let Some(table_name) = table_name else {
UnexpectedSnafu {
reason: format!("Can't find table source in plan {plan:?}"),
}
.fail()?
};
let current_schema = query_ctx.current_schema();
let catalog_name = table_name.catalog().unwrap_or(query_ctx.current_catalog());
let schema_name = table_name.schema().unwrap_or(&current_schema);
let table_name = table_name.table();
let Some(table_ref) = catalog_man
.table(catalog_name, schema_name, table_name, Some(&query_ctx))
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
else {
UnexpectedSnafu {
reason: format!(
"Can't find table {table_name:?} in catalog {catalog_name:?}/{schema_name:?}"
),
}
.fail()?
};
let schema = &table_ref.table_info().meta.schema;
let ts_index = schema.timestamp_column().context(UnexpectedSnafu {
reason: format!("Can't find timestamp column in table {table_name:?}"),
})?;
let ts_col_name = ts_index.name.clone();
let expected_time_unit = ts_index.data_type.as_timestamp().with_context(|| UnexpectedSnafu {
reason: format!(
"Expected timestamp column {ts_col_name:?} in table {table_name:?} to be timestamp, but got {ts_index:?}"
),
})?.unit();
let arrow_schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new(
ts_col_name.clone(),
ts_index.data_type.as_arrow_type(),
false,
)]));
let df_schema = DFSchema::from_field_specific_qualified_schema(
vec![Some(TableReference::bare(table_name))],
&arrow_schema,
)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to create DFSchema from arrow schema {arrow_schema:?}"),
})?;
// find the time window expr which refers to the time index column
let mut aggr_expr = None;
let mut time_window_expr: Option<Expr> = None;
let find_inner_aggr_expr = |plan: &LogicalPlan| {
if let LogicalPlan::Aggregate(aggregate) = plan {
aggr_expr = Some(aggregate.clone());
};
Ok(TreeNodeRecursion::Continue)
};
plan.apply(find_inner_aggr_expr)
.with_context(|_| DatafusionSnafu {
context: format!("Can't find aggr expr in plan {plan:?}"),
})?;
if let Some(aggregate) = aggr_expr {
for group_expr in &aggregate.group_expr {
let refs = group_expr.column_refs();
if refs.len() != 1 {
continue;
}
let ref_col = refs.iter().next().unwrap();
let index = aggregate.input.schema().maybe_index_of_column(ref_col);
let Some(index) = index else {
continue;
};
let field = aggregate.input.schema().field(index);
let is_time_index = field.metadata().get(TIME_INDEX_KEY) == Some(&"true".to_string());
if is_time_index {
let rewrite_column = group_expr.clone();
let rewritten = rewrite_column
.rewrite(&mut RewriteColumn {
table_name: table_name.to_string(),
})
.with_context(|_| DatafusionSnafu {
context: format!("Rewrite expr failed, expr={:?}", group_expr),
})?
.data;
struct RewriteColumn {
table_name: String,
}
impl TreeNodeRewriter for RewriteColumn {
type Node = Expr;
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
let Expr::Column(mut column) = node else {
return Ok(Transformed::no(node));
};
column.relation = Some(TableReference::bare(self.table_name.clone()));
Ok(Transformed::yes(Expr::Column(column)))
}
}
time_window_expr = Some(rewritten);
break;
}
}
Ok((ts_col_name, time_window_expr, expected_time_unit, df_schema))
} else {
// can't find time window expr, return None
Ok((ts_col_name, None, expected_time_unit, df_schema))
}
}
/// Find nearest lower bound for time `current` in given `plan` for the time window expr.
/// i.e. for time window expr being `date_bin(INTERVAL '5 minutes', ts) as time_window` and `current="2021-07-01 00:01:01.000"`,
/// return `Some("2021-07-01 00:00:00.000")`
/// if `plan` doesn't contain a `TIME INDEX` column, return `None`
///
/// Time window expr is a expr that:
/// 1. ref only to a time index column
/// 2. is monotonic increasing
/// 3. show up in GROUP BY clause
///
/// note this plan should only contain one TableScan
pub async fn find_plan_time_window_bound(
plan: &LogicalPlan,
current: Timestamp,
query_ctx: QueryContextRef,
engine: QueryEngineRef,
) -> Result<(String, Option<Timestamp>, Option<Timestamp>), Error> {
// TODO(discord9): find the expr that do time window
let catalog_man = engine.engine_state().catalog_manager();
let (ts_col_name, time_window_expr, expected_time_unit, df_schema) =
find_time_window_expr(plan, catalog_man.clone(), query_ctx).await?;
// cast current to ts_index's type
let new_current = current
.convert_to(expected_time_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to cast current timestamp {current:?} to {expected_time_unit}"),
})?;
// if no time_window_expr is found, return None
if let Some(time_window_expr) = time_window_expr {
let lower_bound =
find_expr_time_window_lower_bound(&time_window_expr, &df_schema, new_current)?;
let upper_bound =
find_expr_time_window_upper_bound(&time_window_expr, &df_schema, new_current)?;
Ok((ts_col_name, lower_bound, upper_bound))
} else {
Ok((ts_col_name, None, None))
}
}
/// Find the lower bound of time window in given `expr` and `current` timestamp.
///
/// i.e. for `current="2021-07-01 00:01:01.000"` and `expr=date_bin(INTERVAL '5 minutes', ts) as time_window` and `ts_col=ts`,
/// return `Some("2021-07-01 00:00:00.000")` since it's the lower bound
/// of current time window given the current timestamp
///
/// if this returns None, the time window has no lower bound
fn find_expr_time_window_lower_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
let input_time_unit = cur_time_window.unit();
Ok(cur_time_window.convert_to(input_time_unit))
}
/// Find the upper bound for time window expression
fn find_expr_time_window_upper_bound(
expr: &Expr,
df_schema: &DFSchema,
current: Timestamp,
) -> Result<Option<Timestamp>, Error> {
use std::cmp::Ordering;
let phy_planner = DefaultPhysicalPlanner::default();
let phy_expr: PhysicalExprRef = phy_planner
.create_physical_expr(expr, df_schema, &SessionContext::new().state())
.with_context(|_e| DatafusionSnafu {
context: format!(
"Failed to create physical expression from {expr:?} using {df_schema:?}"
),
})?;
let cur_time_window = eval_ts_to_ts(&phy_expr, df_schema, current)?;
// search to find the lower bound
let mut offset: i64 = 1;
let mut lower_bound = Some(current);
let upper_bound;
// first do an exponential probe to find a range for binary search
loop {
let Some(next_val) = current.value().checked_add(offset) else {
// no upper bound if overflow
return Ok(None);
};
let next_time_probe = common_time::Timestamp::new(next_val, current.unit());
let next_time_window = eval_ts_to_ts(&phy_expr, df_schema, next_time_probe)?;
match next_time_window.cmp(&cur_time_window) {
Ordering::Less => {UnexpectedSnafu {
reason: format!(
"Unsupported time window expression, expect monotonic increasing for time window expression {expr:?}"
),
}
.fail()?
}
Ordering::Equal => {
lower_bound = Some(next_time_probe);
}
Ordering::Greater => {
upper_bound = Some(next_time_probe);
break
}
}
let Some(new_offset) = offset.checked_mul(2) else {
// no upper bound if overflow
return Ok(None);
};
offset = new_offset;
}
// binary search for the exact upper bound
ensure!(lower_bound.map(|v|v.unit())==upper_bound.map(|v|v.unit()), UnexpectedSnafu{
reason: format!(" unit mismatch for time window expression {expr:?}, found {lower_bound:?} and {upper_bound:?}"),
});
let output_unit = upper_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.unit();
let mut low = lower_bound
.context(UnexpectedSnafu {
reason: "should have lower bound",
})?
.value();
let mut high = upper_bound
.context(UnexpectedSnafu {
reason: "should have upper bound",
})?
.value();
while low < high {
let mid = (low + high) / 2;
let mid_probe = common_time::Timestamp::new(mid, output_unit);
let mid_time_window = eval_ts_to_ts(&phy_expr, df_schema, mid_probe)?;
match mid_time_window.cmp(&cur_time_window) {
Ordering::Less => UnexpectedSnafu {
reason: format!("Binary search failed for time window expression {expr:?}"),
}
.fail()?,
Ordering::Equal => low = mid + 1,
Ordering::Greater => high = mid,
}
}
let final_upper_bound_for_time_window = common_time::Timestamp::new(high, output_unit);
Ok(Some(final_upper_bound_for_time_window))
}
fn eval_ts_to_ts(
phy: &PhysicalExprRef,
df_schema: &DFSchema,
input_value: Timestamp,
) -> Result<Timestamp, Error> {
let schema_ty = df_schema.field(0).data_type();
let schema_cdt = ConcreteDataType::from_arrow_type(schema_ty);
let schema_unit = if let ConcreteDataType::Timestamp(ts) = schema_cdt {
ts.unit()
} else {
return UnexpectedSnafu {
reason: format!("Expect Timestamp, found {:?}", schema_cdt),
}
.fail();
};
let input_value = input_value
.convert_to(schema_unit)
.with_context(|| UnexpectedSnafu {
reason: format!("Failed to convert timestamp {input_value:?} to {schema_unit}"),
})?;
let ts_vector = match schema_unit {
TimeUnit::Second => {
TimestampSecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Millisecond => {
TimestampMillisecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Microsecond => {
TimestampMicrosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
TimeUnit::Nanosecond => {
TimestampNanosecondVector::from_vec(vec![input_value.value()]).to_arrow_array()
}
};
let rb = DfRecordBatch::try_new(df_schema.inner().clone(), vec![ts_vector.clone()])
.with_context(|_| ArrowSnafu {
context: format!("Failed to create record batch from {df_schema:?} and {ts_vector:?}"),
})?;
let eval_res = phy.evaluate(&rb).with_context(|_| DatafusionSnafu {
context: format!("Failed to evaluate physical expression {phy:?} on {rb:?}"),
})?;
if let Some(Some(ts)) = columnar_to_ts_vector(&eval_res)?.first() {
Ok(*ts)
} else {
UnexpectedSnafu {
reason: format!(
"Expected timestamp in expression {phy:?} but got {:?}",
eval_res
),
}
.fail()?
}
}
// TODO(discord9): a method to find out the precise time window
/// Find out the `Filter` Node corresponding to outermost `WHERE` and add a new filter expr to it
#[derive(Debug)]
pub struct AddFilterRewriter {
extra_filter: Expr,
is_rewritten: bool,
}
impl AddFilterRewriter {
fn new(filter: Expr) -> Self {
Self {
extra_filter: filter,
is_rewritten: false,
}
}
}
impl TreeNodeRewriter for AddFilterRewriter {
type Node = LogicalPlan;
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
if self.is_rewritten {
return Ok(Transformed::no(node));
}
match node {
LogicalPlan::Filter(mut filter) if !filter.having => {
filter.predicate = filter.predicate.and(self.extra_filter.clone());
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
LogicalPlan::TableScan(_) => {
// add a new filter
let filter =
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
self.is_rewritten = true;
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
}
_ => Ok(Transformed::no(node)),
}
}
}
fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
/// A dialect that forces all identifiers to be quoted
struct ForceQuoteIdentifiers;
impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
if identifier.to_lowercase() != identifier {
Some('"')
} else {
None
}
}
}
let unparser = Unparser::new(&ForceQuoteIdentifiers);
// first make all column qualified
let sql = unparser
.plan_to_sql(plan)
.with_context(|_e| DatafusionSnafu {
context: format!("Failed to unparse logical plan {plan:?}"),
})?;
Ok(sql.to_string())
}
#[cfg(test)]
mod test {
use datafusion_common::tree_node::TreeNode;
use pretty_assertions::assert_eq;
use session::context::QueryContext;
use super::{sql_to_df_plan, *};
use crate::recording_rules::{df_plan_to_sql, AddFilterRewriter};
use crate::test_utils::create_test_query_engine;
#[tokio::test]
async fn test_sql_plan_convert() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
.await
.unwrap();
let new_sql = df_plan_to_sql(&new).unwrap();
assert_eq!(
r#"SELECT "UPPERCASE_NUMBERS_WITH_TS"."NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#,
new_sql
);
}
#[tokio::test]
async fn test_add_filter() {
let testcases = vec![
(
"SELECT number FROM numbers_with_ts GROUP BY number","SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
),
(
"SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
"SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
),
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
)
];
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
for (before, after) in testcases {
let sql = before;
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
let plan = plan.rewrite(&mut add_filter).unwrap().data;
let new_sql = df_plan_to_sql(&plan).unwrap();
assert_eq!(after, new_sql);
}
}
#[tokio::test]
async fn test_plan_time_window_lower_bound() {
use datafusion_expr::{col, lit};
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let testcases = [
// same alias is not same column
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394109000, TimeUnit::Millisecond)),
Some(Timestamp::new(1740394109001, TimeUnit::Millisecond)),
),
r#"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:29' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:48:29.001' AS TIMESTAMP))) GROUP BY numbers_with_ts.ts"#
),
// complex time window index
(
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(1740394109, TimeUnit::Second),
(
"ts".to_string(),
Some(Timestamp::new(1740394080, TimeUnit::Second)),
Some(Timestamp::new(1740394140, TimeUnit::Second)),
),
"SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('2025-02-24 10:48:00' AS TIMESTAMP)) AND (ts <= CAST('2025-02-24 10:49:00' AS TIMESTAMP))) GROUP BY arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)')"
),
// no time index
(
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;",
Timestamp::new(23, TimeUnit::Millisecond),
("ts".to_string(), None, None),
"SELECT date_bin('5 minutes', ts) FROM numbers_with_ts;"
),
// time index
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// on spot
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(0, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// different time unit
(
"SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23_000_000, TimeUnit::Nanosecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other fields
(
"SELECT sum(number) as sum_up, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number) AS sum_up, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
),
// time index with other pks
(
"SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number"
),
// subquery
(
"SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
),
// cte
(
"with cte as (select number, date_bin('5 minutes', ts) as time_window from numbers_with_ts GROUP BY time_window, number) select number, time_window from cte;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT cte.number, cte.time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP))) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number) AS cte"
),
// complex subquery without alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
),
// complex subquery alias
(
"SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
Timestamp::new(23, TimeUnit::Millisecond),
(
"ts".to_string(),
Some(Timestamp::new(0, TimeUnit::Millisecond)),
Some(Timestamp::new(300000, TimeUnit::Millisecond)),
),
"SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE ((ts >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (ts <= CAST('1970-01-01 00:05:00' AS TIMESTAMP)))) AS cte GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
),
];
for (sql, current, expected, expected_unparsed) in testcases {
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
.await
.unwrap();
let real =
find_plan_time_window_bound(&plan, current, ctx.clone(), query_engine.clone())
.await
.unwrap();
assert_eq!(expected, real);
let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
.await
.unwrap();
let (col_name, lower, upper) = real;
let new_sql = if lower.is_some() {
let to_df_literal = |value| {
let value = Value::from(value);
value.try_to_scalar_value(&value.data_type()).unwrap()
};
let lower = to_df_literal(lower.unwrap());
let upper = to_df_literal(upper.unwrap());
let expr = col(&col_name)
.gt_eq(lit(lower))
.and(col(&col_name).lt_eq(lit(upper)));
let mut add_filter = AddFilterRewriter::new(expr);
let plan = plan.rewrite(&mut add_filter).unwrap().data;
df_plan_to_sql(&plan).unwrap()
} else {
sql.to_string()
};
assert_eq!(expected_unparsed, new_sql);
}
}
}
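The expected bounds in the testcases above come from `date_bin('5 minutes', ts)` flooring the timestamp to the start of its 5-minute bucket; a minimal std-only sketch of that arithmetic (the helper below is illustrative, not part of the crate):

fn window_bounds_ms(ts_ms: i64, window_ms: i64) -> (i64, i64) {
    // Floor to the bucket start; the matching upper bound is one window later.
    let lower = ts_ms.div_euclid(window_ms) * window_ms;
    (lower, lower + window_ms)
}

fn main() {
    // 23 ms (or 23 ns / 23_000_000 ns, once normalized) falls into the first
    // 5-minute bucket, so the bound is (0, 300000) ms, matching the expected tuples above.
    assert_eq!(window_bounds_ms(23, 5 * 60 * 1000), (0, 300_000));
}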

View File

@@ -1,815 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use api::v1::flow::FlowResponse;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::key::flow::FlowMetadataManagerRef;
use common_meta::key::table_info::TableInfoManager;
use common_meta::key::TableMetadataManagerRef;
use common_telemetry::tracing::warn;
use common_telemetry::{debug, info};
use common_time::Timestamp;
use datafusion::sql::unparser::expr_to_sql;
use datafusion_common::tree_node::TreeNode;
use datatypes::value::Value;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::metadata::TableId;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::sync::{oneshot, RwLock};
use tokio::time::Instant;
use super::frontend_client::FrontendClient;
use super::{df_plan_to_sql, AddFilterRewriter, TimeWindowExpr};
use crate::adapter::{CreateFlowArgs, FlowId, TableName};
use crate::error::{
DatafusionSnafu, DatatypesSnafu, ExternalSnafu, FlowAlreadyExistSnafu, InternalSnafu,
TimeSnafu, UnexpectedSnafu,
};
use crate::metrics::{METRIC_FLOW_RULE_ENGINE_QUERY_TIME, METRIC_FLOW_RULE_ENGINE_SLOW_QUERY};
use crate::recording_rules::{find_time_window_expr, sql_to_df_plan};
use crate::Error;
/// TODO(discord9): make those constants configurable
/// The default rule engine query timeout is 10 minutes
pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
/// will output a warn log for any query that runs for more than 1 minute, and also every minute while that query is still running
pub const SLOW_QUERY_THRESHOLD: Duration = Duration::from_secs(60);
/// TODO(discord9): determine how to configure refresh rate
pub struct RecordingRuleEngine {
tasks: RwLock<BTreeMap<FlowId, RecordingRuleTask>>,
shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
frontend_client: Arc<FrontendClient>,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
engine: QueryEngineRef,
}
impl RecordingRuleEngine {
pub fn new(
frontend_client: Arc<FrontendClient>,
engine: QueryEngineRef,
flow_metadata_manager: FlowMetadataManagerRef,
table_meta: TableMetadataManagerRef,
) -> Self {
Self {
tasks: Default::default(),
shutdown_txs: Default::default(),
frontend_client,
flow_metadata_manager,
table_meta,
engine,
}
}
pub async fn handle_inserts(
&self,
request: api::v1::region::InsertRequests,
) -> Result<FlowResponse, Error> {
let table_info_mgr = self.table_meta.table_info_manager();
let mut group_by_table_name: HashMap<TableName, Vec<api::v1::Rows>> = HashMap::new();
for r in request.requests {
let tid = RegionId::from(r.region_id).table_id();
let name = get_table_name(table_info_mgr, &tid).await?;
let entry = group_by_table_name.entry(name).or_default();
if let Some(rows) = r.rows {
entry.push(rows);
}
}
for (_flow_id, task) in self.tasks.read().await.iter() {
let src_table_names = &task.source_table_names;
for src_table_name in src_table_names {
if let Some(entry) = group_by_table_name.get(src_table_name) {
let Some(expr) = &task.time_window_expr else {
continue;
};
let involved_time_windows = expr.handle_rows(entry.clone()).await?;
let mut state = task.state.write().await;
state
.dirty_time_windows
.add_lower_bounds(involved_time_windows.into_iter());
}
}
}
Ok(Default::default())
}
}
async fn get_table_name(zelf: &TableInfoManager, table_id: &TableId) -> Result<TableName, Error> {
zelf.get(*table_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", table_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])
}
const MIN_REFRESH_DURATION: Duration = Duration::new(5, 0);
impl RecordingRuleEngine {
pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
sink_table_name,
source_table_ids,
create_if_not_exists,
or_replace,
expire_after,
comment: _,
sql,
flow_options,
query_ctx,
} = args;
// or replace logic
{
let is_exist = self.tasks.read().await.contains_key(&flow_id);
match (create_if_not_exists, or_replace, is_exist) {
// if replace, ignore that old flow exists
(_, true, true) => {
info!("Replacing flow with id={}", flow_id);
}
(false, false, true) => FlowAlreadyExistSnafu { id: flow_id }.fail()?,
// already exists, and not replace, return None
(true, false, true) => {
info!("Flow with id={} already exists, do nothing", flow_id);
return Ok(None);
}
// continue as normal
(_, _, false) => (),
}
}
let flow_type = flow_options.get(FlowType::FLOW_TYPE_KEY);
ensure!(
flow_type == Some(&FlowType::RecordingRule.to_string()) || flow_type.is_none(),
UnexpectedSnafu {
reason: format!("Flow type is not RecordingRule nor None, got {flow_type:?}")
}
);
let Some(query_ctx) = query_ctx else {
UnexpectedSnafu {
reason: "Query context is None".to_string(),
}
.fail()?
};
let query_ctx = Arc::new(query_ctx);
let mut source_table_names = Vec::new();
for src_id in source_table_ids {
let table_name = self
.table_meta
.table_info_manager()
.get(src_id)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
.with_context(|| UnexpectedSnafu {
reason: format!("Table id = {:?}, couldn't found table name", src_id),
})
.map(|name| name.table_name())
.map(|name| [name.catalog_name, name.schema_name, name.table_name])?;
source_table_names.push(table_name);
}
let (tx, rx) = oneshot::channel();
let plan = sql_to_df_plan(query_ctx.clone(), self.engine.clone(), &sql, true).await?;
let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
&plan,
self.engine.engine_state().catalog_manager().clone(),
query_ctx.clone(),
)
.await?;
let phy_expr = time_window_expr
.map(|expr| TimeWindowExpr::from_expr(&expr, &column_name, &df_schema))
.transpose()?;
info!("Flow id={}, found time window expr={:?}", flow_id, phy_expr);
let task = RecordingRuleTask::new(
flow_id,
&sql,
phy_expr,
expire_after,
sink_table_name,
source_table_names,
query_ctx,
rx,
);
let task_inner = task.clone();
let engine = self.engine.clone();
let frontend = self.frontend_client.clone();
// TODO(discord9): also save the handle & use a time wheel or similar for better scheduling
let _handle = common_runtime::spawn_global(async move {
match task_inner.start_executing(engine, frontend).await {
Ok(()) => info!("Flow {} shutdown", task_inner.flow_id),
Err(err) => common_telemetry::error!(
"Flow {} encounter unrecoverable error: {err:?}",
task_inner.flow_id
),
}
});
// TODO(discord9): deal with replace logic
let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
drop(replaced_old_task_opt);
self.shutdown_txs.write().await.insert(flow_id, tx);
Ok(Some(flow_id))
}
pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
if self.tasks.write().await.remove(&flow_id).is_none() {
warn!("Flow {flow_id} not found in tasks")
}
let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
UnexpectedSnafu {
reason: format!("Can't found shutdown tx for flow {flow_id}"),
}
.fail()?
};
if tx.send(()).is_err() {
warn!("Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?")
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct RecordingRuleTask {
pub flow_id: FlowId,
query: String,
pub time_window_expr: Option<TimeWindowExpr>,
/// in seconds
pub expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: HashSet<[String; 3]>,
state: Arc<RwLock<RecordingRuleState>>,
}
impl RecordingRuleTask {
#[allow(clippy::too_many_arguments)]
pub fn new(
flow_id: FlowId,
query: &str,
time_window_expr: Option<TimeWindowExpr>,
expire_after: Option<i64>,
sink_table_name: [String; 3],
source_table_names: Vec<[String; 3]>,
query_ctx: QueryContextRef,
shutdown_rx: oneshot::Receiver<()>,
) -> Self {
Self {
flow_id,
query: query.to_string(),
time_window_expr,
expire_after,
sink_table_name,
source_table_names: source_table_names.into_iter().collect(),
state: Arc::new(RwLock::new(RecordingRuleState::new(query_ctx, shutdown_rx))),
}
}
}
impl RecordingRuleTask {
/// This should be called in a new tokio task
pub async fn start_executing(
&self,
engine: QueryEngineRef,
frontend_client: Arc<FrontendClient>,
) -> Result<(), Error> {
// only the first query doesn't need an upper bound
let mut is_first = true;
loop {
// FIXME(discord9): test if need upper bound also works
let new_query = self.gen_query_with_time_window(engine.clone()).await?;
let insert_into = if let Some(new_query) = new_query {
format!(
"INSERT INTO {}.{}.{} {}",
self.sink_table_name[0],
self.sink_table_name[1],
self.sink_table_name[2],
new_query
)
} else {
tokio::time::sleep(MIN_REFRESH_DURATION).await;
continue;
};
if is_first {
is_first = false;
}
let instant = Instant::now();
let flow_id = self.flow_id;
let db_client = frontend_client.get_database_client().await?;
let peer_addr = db_client.peer.addr;
debug!(
"Executing flow {flow_id}(expire_after={:?} secs) on {:?} with query {}",
self.expire_after, peer_addr, &insert_into
);
let timer = METRIC_FLOW_RULE_ENGINE_QUERY_TIME
.with_label_values(&[flow_id.to_string().as_str()])
.start_timer();
let res = db_client.database.sql(&insert_into).await;
drop(timer);
let elapsed = instant.elapsed();
if let Ok(res1) = &res {
debug!(
"Flow {flow_id} executed, result: {res1:?}, elapsed: {:?}",
elapsed
);
} else if let Err(res) = &res {
warn!(
"Failed to execute Flow {flow_id} on frontend {}, result: {res:?}, elapsed: {:?} with query: {}",
peer_addr, elapsed, &insert_into
);
}
// record slow query
if elapsed >= SLOW_QUERY_THRESHOLD {
warn!(
"Flow {flow_id} on frontend {} executed for {:?} before complete, query: {}",
peer_addr, elapsed, &insert_into
);
METRIC_FLOW_RULE_ENGINE_SLOW_QUERY
.with_label_values(&[flow_id.to_string().as_str(), &insert_into, &peer_addr])
.observe(elapsed.as_secs_f64());
}
self.state
.write()
.await
.after_query_exec(elapsed, res.is_ok());
// drop the result to free client-related resources
drop(res);
let sleep_until = {
let mut state = self.state.write().await;
match state.shutdown_rx.try_recv() {
Ok(()) => break Ok(()),
Err(TryRecvError::Closed) => {
warn!("Unexpected shutdown flow {flow_id}, shutdown anyway");
break Ok(());
}
Err(TryRecvError::Empty) => (),
}
state.get_next_start_query_time(None)
};
tokio::time::sleep_until(sleep_until).await;
}
}
/// Merge dirty time windows and use at most the first `DirtyTimeWindows::MAX_FILTER_NUM` of them in the generated query
async fn gen_query_with_time_window(
&self,
engine: QueryEngineRef,
) -> Result<Option<String>, Error> {
let query_ctx = self.state.read().await.query_ctx.clone();
let start = SystemTime::now();
let since_the_epoch = start
.duration_since(UNIX_EPOCH)
.expect("Time went backwards");
let low_bound = self
.expire_after
.map(|e| since_the_epoch.as_secs() - e as u64)
.unwrap_or(u64::MIN);
let low_bound = Timestamp::new_second(low_bound as i64);
// TODO(discord9): use time window expr to get the precise expire lower bound
let expire_time_window_bound = self
.time_window_expr
.as_ref()
.map(|expr| expr.eval(low_bound))
.transpose()?;
let new_sql = {
let expr = {
match expire_time_window_bound {
Some((Some(l), Some(u))) => {
let window_size = u.sub(&l).with_context(|| UnexpectedSnafu {
reason: format!("Can't get window size from {u:?} - {l:?}"),
})?;
let col_name = self
.time_window_expr
.as_ref()
.map(|expr| expr.column_name.clone())
.with_context(|| UnexpectedSnafu {
reason: format!(
"Flow id={:?}, Failed to get column name from time window expr",
self.flow_id
),
})?;
self.state
.write()
.await
.dirty_time_windows
.gen_filter_exprs(&col_name, Some(l), window_size, self)?
}
_ => {
debug!(
"Flow id = {:?}, can't get window size: precise_lower_bound={expire_time_window_bound:?}, using the same query", self.flow_id
);
// since no time window lower/upper bound is found, just return the original query
return Ok(Some(self.query.clone()));
}
}
};
debug!(
"Flow id={:?}, Generated filter expr: {:?}",
self.flow_id,
expr.as_ref()
.map(|expr| expr_to_sql(expr).with_context(|_| DatafusionSnafu {
context: format!("Failed to generate filter expr from {expr:?}"),
}))
.transpose()?
.map(|s| s.to_string())
);
let Some(expr) = expr else {
// no new data, hence no need to update
debug!("Flow id={:?}, no new data, not update", self.flow_id);
return Ok(None);
};
let mut add_filter = AddFilterRewriter::new(expr);
// make an unoptimized plan for clearer unparsing
let plan =
sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.query, false).await?;
let plan = plan
.clone()
.rewrite(&mut add_filter)
.with_context(|_| DatafusionSnafu {
context: format!("Failed to rewrite plan {plan:?}"),
})?
.data;
df_plan_to_sql(&plan)?
};
Ok(Some(new_sql))
}
}
#[derive(Debug)]
pub struct RecordingRuleState {
query_ctx: QueryContextRef,
/// last query complete time
last_update_time: Instant,
/// last time query duration
last_query_duration: Duration,
/// Dirty time windows that need to be updated,
/// a non-overlapping mapping of `start -> end`
dirty_time_windows: DirtyTimeWindows,
exec_state: ExecState,
shutdown_rx: oneshot::Receiver<()>,
}
#[derive(Debug, Clone, Default)]
pub struct DirtyTimeWindows {
windows: BTreeMap<Timestamp, Option<Timestamp>>,
}
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
let value = Value::from(value);
let value = value
.try_to_scalar_value(&value.data_type())
.with_context(|_| DatatypesSnafu {
extra: format!("Failed to convert to scalar value: {}", value),
})?;
Ok(value)
}
impl DirtyTimeWindows {
/// Time window merge distance
const MERGE_DIST: i32 = 3;
/// Maximum number of filters allowed in a single query
const MAX_FILTER_NUM: usize = 20;
/// Add lower bounds to the dirty time windows. Upper bounds are ignored.
///
/// # Arguments
///
/// * `lower_bounds` - An iterator of lower bounds to be added.
pub fn add_lower_bounds(&mut self, lower_bounds: impl Iterator<Item = Timestamp>) {
for lower_bound in lower_bounds {
let entry = self.windows.entry(lower_bound);
entry.or_insert(None);
}
}
/// Generate filter expressions, consuming at most `MAX_FILTER_NUM` of the dirty time windows
pub fn gen_filter_exprs(
&mut self,
col_name: &str,
expire_lower_bound: Option<Timestamp>,
window_size: chrono::Duration,
task_ctx: &RecordingRuleTask,
) -> Result<Option<datafusion_expr::Expr>, Error> {
debug!(
"expire_lower_bound: {:?}, window_size: {:?}",
expire_lower_bound.map(|t| t.to_iso8601_string()),
window_size
);
self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
if self.windows.len() > Self::MAX_FILTER_NUM {
let first_time_window = self.windows.first_key_value();
let last_time_window = self.windows.last_key_value();
warn!(
"Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
task_ctx.flow_id,
self.windows.len(),
Self::MAX_FILTER_NUM,
task_ctx.time_window_expr,
task_ctx.expire_after,
first_time_window,
last_time_window,
task_ctx.query
);
}
// get the first `MAX_FILTER_NUM` time windows
let nth = self
.windows
.iter()
.nth(Self::MAX_FILTER_NUM)
.map(|(key, _)| *key);
let first_nth = {
if let Some(nth) = nth {
let mut after = self.windows.split_off(&nth);
std::mem::swap(&mut self.windows, &mut after);
after
} else {
std::mem::take(&mut self.windows)
}
};
let mut expr_lst = vec![];
for (start, end) in first_nth.into_iter() {
debug!(
"Time window start: {:?}, end: {:?}",
start.to_iso8601_string(),
end.map(|t| t.to_iso8601_string())
);
use datafusion_expr::{col, lit};
let lower = to_df_literal(start)?;
let upper = end.map(to_df_literal).transpose()?;
let expr = if let Some(upper) = upper {
col(col_name)
.gt_eq(lit(lower))
.and(col(col_name).lt(lit(upper)))
} else {
col(col_name).gt_eq(lit(lower))
};
expr_lst.push(expr);
}
let expr = expr_lst.into_iter().reduce(|a, b| a.or(b));
Ok(expr)
}
/// Merge time windows that overlap or are too close
pub fn merge_dirty_time_windows(
&mut self,
window_size: chrono::Duration,
expire_lower_bound: Option<Timestamp>,
) -> Result<(), Error> {
let mut new_windows = BTreeMap::new();
let mut prev_tw = None;
for (lower_bound, upper_bound) in std::mem::take(&mut self.windows) {
// filter out expired time window
if let Some(expire_lower_bound) = expire_lower_bound {
if lower_bound <= expire_lower_bound {
continue;
}
}
let Some(prev_tw) = &mut prev_tw else {
prev_tw = Some((lower_bound, upper_bound));
continue;
};
let std_window_size = window_size.to_std().map_err(|e| {
InternalSnafu {
reason: e.to_string(),
}
.build()
})?;
// if cur.lower - prev.upper <= window_size * MERGE_DIST, merge
let prev_upper = prev_tw
.1
.unwrap_or(prev_tw.0.add_duration(std_window_size).context(TimeSnafu)?);
prev_tw.1 = Some(prev_upper);
let cur_upper = upper_bound.unwrap_or(
lower_bound
.add_duration(std_window_size)
.context(TimeSnafu)?,
);
if lower_bound
.sub(&prev_upper)
.map(|dist| dist <= window_size * Self::MERGE_DIST)
.unwrap_or(false)
{
prev_tw.1 = Some(cur_upper);
} else {
new_windows.insert(prev_tw.0, prev_tw.1);
*prev_tw = (lower_bound, Some(cur_upper));
}
}
if let Some(prev_tw) = prev_tw {
new_windows.insert(prev_tw.0, prev_tw.1);
}
self.windows = new_windows;
Ok(())
}
}
impl RecordingRuleState {
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
Self {
query_ctx,
last_update_time: Instant::now(),
last_query_duration: Duration::from_secs(0),
dirty_time_windows: Default::default(),
exec_state: ExecState::Idle,
shutdown_rx,
}
}
/// Called after the last query is done.
/// `is_succ` indicates whether the last query was successful
pub fn after_query_exec(&mut self, elapsed: Duration, _is_succ: bool) {
self.exec_state = ExecState::Idle;
self.last_query_duration = elapsed;
self.last_update_time = Instant::now();
}
/// Wait for `last_query_duration` (capped by `max_timeout`), but at least `MIN_REFRESH_DURATION`, before starting the next query
pub fn get_next_start_query_time(&self, max_timeout: Option<Duration>) -> Instant {
let next_duration = max_timeout
.unwrap_or(self.last_query_duration)
.min(self.last_query_duration);
let next_duration = next_duration.max(MIN_REFRESH_DURATION);
self.last_update_time + next_duration
}
}
#[derive(Debug, Clone)]
enum ExecState {
Idle,
Executing,
}
#[cfg(test)]
mod test {
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_merge_dirty_time_windows() {
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// just enough to merge
assert_eq!(
dirty.windows,
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)])
);
// separate time window
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// too far apart to merge, so the windows stay separate
assert_eq!(
BTreeMap::from([
(
Timestamp::new_second(0),
Some(Timestamp::new_second(5 * 60))
),
(
Timestamp::new_second((2 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
Some(Timestamp::new_second(
(3 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
)
]),
dirty.windows
);
// overlapping
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(chrono::Duration::seconds(5 * 60), None)
.unwrap();
// overlapping windows are merged into one
assert_eq!(
BTreeMap::from([(
Timestamp::new_second(0),
Some(Timestamp::new_second(
(1 + DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60
))
),]),
dirty.windows
);
// expired
let mut dirty = DirtyTimeWindows::default();
dirty.add_lower_bounds(
vec![
Timestamp::new_second(0),
Timestamp::new_second((DirtyTimeWindows::MERGE_DIST as i64) * 5 * 60),
]
.into_iter(),
);
dirty
.merge_dirty_time_windows(
chrono::Duration::seconds(5 * 60),
Some(Timestamp::new_second(
(DirtyTimeWindows::MERGE_DIST as i64) * 6 * 60,
)),
)
.unwrap();
// all windows are at or before the expire lower bound, so they are dropped
assert_eq!(BTreeMap::from([]), dirty.windows);
}
}
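A minimal std-only sketch of the merge rule in `merge_dirty_time_windows` above, over plain `i64` seconds instead of `Timestamp` (the function name is illustrative): a window with no upper bound defaults to `lower + window_size`, expired windows are dropped, and a window is merged into the previous one when the gap to the previous upper bound is within `MERGE_DIST` windows.

use std::collections::BTreeMap;

const MERGE_DIST: i64 = 3;

fn merge_windows(
    windows: BTreeMap<i64, Option<i64>>,
    window_size: i64,
    expire_lower_bound: Option<i64>,
) -> BTreeMap<i64, Option<i64>> {
    let mut merged = BTreeMap::new();
    let mut prev: Option<(i64, i64)> = None;
    for (lower, upper) in windows {
        // Expired windows are dropped entirely.
        if matches!(expire_lower_bound, Some(e) if lower <= e) {
            continue;
        }
        let upper = upper.unwrap_or(lower + window_size);
        match prev {
            None => prev = Some((lower, upper)),
            Some((p_lower, p_upper)) => {
                if lower - p_upper <= window_size * MERGE_DIST {
                    // Close enough: extend the previous window to cover this one.
                    prev = Some((p_lower, upper));
                } else {
                    merged.insert(p_lower, Some(p_upper));
                    prev = Some((lower, upper));
                }
            }
        }
    }
    if let Some((l, u)) = prev {
        merged.insert(l, Some(u));
    }
    merged
}

With `window_size = 300` and lower bounds `{0, 1200}`, the gap from the first upper bound (300) to 1200 is exactly `3 * 300`, so the two windows merge into one covering `0..1500`, matching the first case in `test_merge_dirty_time_windows` above.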

View File

@@ -1,163 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Frontend client used to run a flow as a recording rule, i.e. a time-window-aware normal query triggered on every tick configured by the user
use std::sync::Arc;
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
use common_meta::peer::Peer;
use common_meta::rpc::store::RangeRequest;
use meta_client::client::MetaClient;
use snafu::ResultExt;
use crate::error::{ExternalSnafu, UnexpectedSnafu};
use crate::recording_rules::engine::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
use crate::Error;
fn default_channel_mgr() -> ChannelManager {
let cfg = ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT);
ChannelManager::with_config(cfg)
}
fn client_from_urls(addrs: Vec<String>) -> Client {
Client::with_manager_and_urls(default_channel_mgr(), addrs)
}
/// A simple frontend client able to execute sql using grpc protocol
#[derive(Debug)]
pub enum FrontendClient {
Distributed {
meta_client: Arc<MetaClient>,
channel_mgr: ChannelManager,
},
Standalone {
/// for the sake of simplicity, grpc is still used even in standalone mode
/// note the client here should be lazy, so that it can wait until the frontend is booted before making a connection
/// TODO(discord9): not use grpc under standalone mode
database_client: DatabaseWithPeer,
},
}
#[derive(Debug, Clone)]
pub struct DatabaseWithPeer {
pub database: Database,
pub peer: Peer,
}
impl DatabaseWithPeer {
fn new(database: Database, peer: Peer) -> Self {
Self { database, peer }
}
}
impl FrontendClient {
pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
Self::Distributed {
meta_client,
channel_mgr: default_channel_mgr(),
}
}
pub fn from_static_grpc_addr(addr: String) -> Self {
let peer = Peer {
id: 0,
addr: addr.clone(),
};
let mgr = default_channel_mgr();
let client = Client::with_manager_and_urls(mgr.clone(), vec![addr]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Self::Standalone {
database_client: DatabaseWithPeer::new(database, peer),
}
}
}
impl FrontendClient {
async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
let Self::Distributed { meta_client, .. } = self else {
return Ok(vec![]);
};
let cluster_client = meta_client
.cluster_client()
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend);
let req = RangeRequest::new().with_prefix(prefix);
let resp = cluster_client
.range(req)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let mut res = Vec::with_capacity(resp.kvs.len());
for kv in resp.kvs {
let key = NodeInfoKey::try_from(kv.key)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let val = NodeInfo::try_from(kv.value)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
res.push((key, val));
}
Ok(res)
}
/// Get the database with max `last_activity_ts`
async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
if let Self::Standalone { database_client } = self {
return Ok(database_client.clone());
}
match &self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed {
meta_client: _,
channel_mgr,
} => {
let frontends = self.scan_for_frontend().await?;
let mut last_activity_ts = i64::MIN;
let mut peer = None;
for (_key, val) in frontends.iter() {
if val.last_activity_ts > last_activity_ts {
last_activity_ts = val.last_activity_ts;
peer = Some(val.peer.clone());
}
}
let Some(peer) = peer else {
UnexpectedSnafu {
reason: format!("No frontend available: {:?}", frontends),
}
.fail()?
};
let client =
Client::with_manager_and_urls(channel_mgr.clone(), vec![peer.addr.clone()]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
Ok(DatabaseWithPeer::new(database, peer))
}
}
}
/// Get a database client, and possibly update it before returning.
pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
match self {
Self::Standalone { database_client } => Ok(database_client.clone()),
Self::Distributed { meta_client: _, .. } => self.get_last_active_frontend().await,
}
}
}
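A minimal sketch of the selection logic in `get_last_active_frontend` above: among the frontends discovered from the metasrv, the peer with the largest `last_activity_ts` is picked for the grpc connection (the struct below is a simplified stand-in for `NodeInfo`, and the addresses are illustrative).

struct FrontendNode {
    addr: String,
    last_activity_ts: i64,
}

fn pick_last_active(frontends: &[FrontendNode]) -> Option<&FrontendNode> {
    // The most recently active frontend wins; `None` if no frontend is available.
    frontends.iter().max_by_key(|n| n.last_activity_ts)
}

fn main() {
    let nodes = vec![
        FrontendNode { addr: "10.0.0.1:4001".to_string(), last_activity_ts: 100 },
        FrontendNode { addr: "10.0.0.2:4001".to_string(), last_activity_ts: 250 },
    ];
    assert_eq!(pick_last_active(&nodes).unwrap().addr, "10.0.0.2:4001");
}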

View File

@@ -57,7 +57,6 @@ use crate::error::{
};
use crate::heartbeat::HeartbeatTask;
use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
use crate::recording_rules::{FrontendClient, RecordingRuleEngine};
use crate::transform::register_function_to_query_engine;
use crate::utils::{SizeReportSender, StateReportHandler};
use crate::{Error, FlowWorkerManager, FlownodeOptions};
@@ -246,7 +245,6 @@ impl FlownodeInstance {
self.server.shutdown().await.context(ShutdownServerSnafu)?;
if let Some(task) = &self.heartbeat_task {
info!("Close heartbeat task for flownode");
task.shutdown();
}
@@ -273,8 +271,6 @@ pub struct FlownodeBuilder {
heartbeat_task: Option<HeartbeatTask>,
/// receive a oneshot sender to send state size report
state_report_handler: Option<StateReportHandler>,
/// Client to send sql to frontend
frontend_client: Arc<FrontendClient>,
}
impl FlownodeBuilder {
@@ -285,7 +281,6 @@ impl FlownodeBuilder {
table_meta: TableMetadataManagerRef,
catalog_manager: CatalogManagerRef,
flow_metadata_manager: FlowMetadataManagerRef,
frontend_client: Arc<FrontendClient>,
) -> Self {
Self {
opts,
@@ -295,7 +290,6 @@ impl FlownodeBuilder {
flow_metadata_manager,
heartbeat_task: None,
state_report_handler: None,
frontend_client,
}
}
@@ -453,14 +447,7 @@ impl FlownodeBuilder {
let node_id = self.opts.node_id.map(|id| id as u32);
let rule_engine = RecordingRuleEngine::new(
self.frontend_client.clone(),
query_engine.clone(),
self.flow_metadata_manager.clone(),
table_meta.clone(),
);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta, rule_engine);
let mut man = FlowWorkerManager::new(node_id, query_engine, table_meta);
for worker_id in 0..num_workers {
let (tx, rx) = oneshot::channel();

View File

@@ -86,8 +86,7 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
let schema = vec![
datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();
@@ -115,37 +114,6 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
};
catalog_list.register_table_sync(req_with_ts).unwrap();
let schema = vec![
datatypes::schema::ColumnSchema::new("NUMBER", CDT::uint32_datatype(), false),
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
.with_time_index(true),
];
let mut columns = vec![];
let numbers = (1..=10).collect_vec();
let column: VectorRef = Arc::new(<u32 as Scalar>::VectorType::from_vec(numbers));
columns.push(column);
let ts = (1..=10).collect_vec();
let mut builder = TimestampMillisecondVectorBuilder::with_capacity(10);
ts.into_iter()
.map(|v| builder.push(Some(TimestampMillisecond::new(v))))
.count();
let column: VectorRef = builder.to_vector_cloned();
columns.push(column);
let schema = Arc::new(Schema::new(schema));
let recordbatch = common_recordbatch::RecordBatch::new(schema, columns).unwrap();
let table = MemTable::table("UPPERCASE_NUMBERS_WITH_TS", recordbatch);
let req_with_ts = RegisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "UPPERCASE_NUMBERS_WITH_TS".to_string(),
table_id: 1025,
table,
};
catalog_list.register_table_sync(req_with_ts).unwrap();
let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
let engine = factory.query_engine();

View File

@@ -238,13 +238,6 @@ pub enum Error {
source: servers::error::Error,
},
#[snafu(display("Failed to create logical plan for prometheus label values query"))]
PrometheusLabelValuesQueryPlan {
#[snafu(implicit)]
location: Location,
source: query::promql::error::Error,
},
#[snafu(display("Failed to describe schema for given statement"))]
DescribeStatement {
#[snafu(implicit)]
@@ -373,8 +366,6 @@ impl ErrorExt for Error {
| Error::PrometheusMetricNamesQueryPlan { source, .. }
| Error::ExecutePromql { source, .. } => source.status_code(),
Error::PrometheusLabelValuesQueryPlan { source, .. } => source.status_code(),
Error::CollectRecordbatch { .. } => StatusCode::EngineExecuteQuery,
Error::SqlExecIntercepted { source, .. } => source.status_code(),

View File

@@ -26,7 +26,6 @@ mod region_query;
pub mod standalone;
use std::sync::Arc;
use std::time::SystemTime;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
@@ -472,21 +471,6 @@ impl PrometheusHandler for Instance {
.context(ExecuteQuerySnafu)
}
async fn query_label_values(
&self,
metric: String,
label_name: String,
matchers: Vec<Matcher>,
start: SystemTime,
end: SystemTime,
ctx: &QueryContextRef,
) -> server_error::Result<Vec<String>> {
self.handle_query_label_values(metric, label_name, matchers, start, end, ctx)
.await
.map_err(BoxedError::new)
.context(ExecuteQuerySnafu)
}
fn catalog_manager(&self) -> CatalogManagerRef {
self.catalog_manager.clone()
}

View File

@@ -12,26 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::SystemTime;
use catalog::information_schema::TABLES;
use client::OutputData;
use common_catalog::consts::INFORMATION_SCHEMA_NAME;
use common_catalog::format_full_table_name;
use common_recordbatch::util;
use common_telemetry::tracing;
use datatypes::prelude::Value;
use promql_parser::label::{Matcher, Matchers};
use query::promql;
use query::promql::planner::PromPlanner;
use promql_parser::label::Matcher;
use servers::prometheus;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
use crate::error::{
CatalogSnafu, CollectRecordbatchSnafu, ExecLogicalPlanSnafu,
PrometheusLabelValuesQueryPlanSnafu, PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu,
Result, TableNotFoundSnafu,
PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu, Result, TableNotFoundSnafu,
};
use crate::instance::Instance;
@@ -102,77 +96,4 @@ impl Instance {
Ok(results)
}
/// Handles label values query request, returns the values.
#[tracing::instrument(skip_all)]
pub(crate) async fn handle_query_label_values(
&self,
metric: String,
label_name: String,
matchers: Vec<Matcher>,
start: SystemTime,
end: SystemTime,
ctx: &QueryContextRef,
) -> Result<Vec<String>> {
let table_schema = ctx.current_schema();
let table = self
.catalog_manager
.table(ctx.current_catalog(), &table_schema, &metric, Some(ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let dataframe = self
.query_engine
.read_table(table.clone())
.with_context(|_| ReadTableSnafu {
table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric),
})?;
let scan_plan = dataframe.into_logical_plan();
let filter_conditions =
PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema())
.context(PrometheusLabelValuesQueryPlanSnafu)?;
let logical_plan = promql::label_values::rewrite_label_values_query(
table,
scan_plan,
filter_conditions,
label_name,
start,
end,
)
.context(PrometheusLabelValuesQueryPlanSnafu)?;
let results = self
.query_engine
.execute(logical_plan, ctx.clone())
.await
.context(ExecLogicalPlanSnafu)?;
let batches = match results.data {
OutputData::Stream(stream) => util::collect(stream)
.await
.context(CollectRecordbatchSnafu)?,
OutputData::RecordBatches(rbs) => rbs.take(),
_ => unreachable!("should not happen"),
};
let mut results = Vec::with_capacity(batches.iter().map(|b| b.num_rows()).sum());
for batch in batches {
// Only one column in the results, ensured by `prometheus::label_values_matchers_to_plan`.
let names = batch.column(0);
for i in 0..names.len() {
let Value::String(name) = names.get(i) else {
unreachable!();
};
results.push(name.into_string());
}
}
Ok(results)
}
}

View File

@@ -29,7 +29,6 @@ prost.workspace = true
puffin.workspace = true
regex.workspace = true
regex-automata.workspace = true
roaring = "0.10"
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true

View File

@@ -1,868 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::io;
use std::ops::RangeInclusive;
use common_base::BitVec;
/// `BitmapType` enumerates how bitmaps are encoded within the inverted index.
pub use greptime_proto::v1::index::BitmapType;
use roaring::RoaringBitmap;
/// A bitmap representation supporting both BitVec and RoaringBitmap formats.
///
/// This enum provides unified bitmap operations while allowing efficient storage
/// in different formats. The implementation automatically handles type conversions
/// when performing operations between different formats.
///
/// # Examples
///
/// Creating a new Roaring bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::new_roaring();
/// assert!(bitmap.is_empty());
/// ```
///
/// Creating a full BitVec bitmap:
/// ```
/// use bitmap::Bitmap;
/// let bitmap = Bitmap::full_bitvec(10);
/// assert_eq!(bitmap.count_ones(), 10);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum Bitmap {
Roaring(RoaringBitmap),
BitVec(BitVec),
}
impl Bitmap {
/// Creates a new empty BitVec-based bitmap.
pub fn new_bitvec() -> Self {
Bitmap::BitVec(BitVec::EMPTY)
}
/// Creates a new empty RoaringBitmap-based bitmap.
pub fn new_roaring() -> Self {
Bitmap::Roaring(RoaringBitmap::new())
}
/// Creates a full BitVec-based bitmap with all bits set to 1.
///
/// # Arguments
/// * `size` - The number of bits to allocate and set
pub fn full_bitvec(size: usize) -> Self {
Bitmap::BitVec(BitVec::repeat(true, size))
}
/// Creates a full RoaringBitmap-based bitmap with bits 0..size set to 1.
///
/// # Arguments
/// * `size` - The exclusive upper bound for the bit range
pub fn full_roaring(size: usize) -> Self {
let mut roaring = RoaringBitmap::new();
roaring.insert_range(0..size as u32);
Bitmap::Roaring(roaring)
}
/// Returns the number of bits set to 1 in the bitmap.
pub fn count_ones(&self) -> usize {
match self {
Bitmap::BitVec(bitvec) => bitvec.count_ones(),
Bitmap::Roaring(roaring) => roaring.len() as _,
}
}
/// Checks if the bitmap contains no set bits.
pub fn is_empty(&self) -> bool {
match self {
Bitmap::BitVec(bitvec) => bitvec.is_empty(),
Bitmap::Roaring(roaring) => roaring.is_empty(),
}
}
/// Inserts a range of bits into the bitmap.
///
/// # Arguments
/// * `range` - Inclusive range of bits to set
pub fn insert_range(&mut self, range: RangeInclusive<usize>) {
match self {
Bitmap::BitVec(bitvec) => {
if *range.end() >= bitvec.len() {
bitvec.resize(range.end() + 1, false);
}
for i in range {
bitvec.set(i, true);
}
}
Bitmap::Roaring(roaring) => {
let range = *range.start() as u32..=*range.end() as u32;
roaring.insert_range(range);
}
}
}
/// Serializes the bitmap into a byte buffer using the specified format.
///
/// # Arguments
/// * `serialize_type` - Target format for serialization
/// * `writer` - Output writer to write the serialized data
pub fn serialize_into(
&self,
serialize_type: BitmapType,
mut writer: impl io::Write,
) -> io::Result<()> {
match (self, serialize_type) {
(Bitmap::BitVec(bitvec), BitmapType::BitVec) => {
writer.write_all(bitvec.as_raw_slice())?;
}
(Bitmap::Roaring(roaring), BitmapType::Roaring) => {
roaring.serialize_into(writer)?;
}
(Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
bitmap.serialize_into(writer)?;
}
(Bitmap::Roaring(roaring), BitmapType::BitVec) => {
let bitvec = Bitmap::roaring_to_bitvec(roaring);
writer.write_all(bitvec.as_raw_slice())?;
}
}
Ok(())
}
/// Computes the size of the serialized bitmap in bytes.
///
/// # Arguments
/// * `bitmap_type` - Format of data to be serialized
pub fn serialized_size(&self, bitmap_type: BitmapType) -> usize {
match (self, bitmap_type) {
(Bitmap::BitVec(bitvec), BitmapType::BitVec) => bitvec.as_raw_slice().len(),
(Bitmap::Roaring(roaring), BitmapType::Roaring) => roaring.serialized_size(),
(Bitmap::BitVec(bitvec), BitmapType::Roaring) => {
let bitmap = Bitmap::bitvec_to_roaring(bitvec.clone());
bitmap.serialized_size()
}
(Bitmap::Roaring(roaring), BitmapType::BitVec) => {
let bitvec = Bitmap::roaring_to_bitvec(roaring);
bitvec.as_raw_slice().len()
}
}
}
/// Deserializes a bitmap from a byte buffer.
///
/// # Arguments
/// * `buf` - Input buffer containing serialized data
/// * `bitmap_type` - Format of the serialized data
pub fn deserialize_from(buf: &[u8], bitmap_type: BitmapType) -> std::io::Result<Self> {
match bitmap_type {
BitmapType::BitVec => {
let bitvec = BitVec::from_slice(buf);
Ok(Bitmap::BitVec(bitvec))
}
BitmapType::Roaring => {
let roaring = RoaringBitmap::deserialize_from(buf)?;
Ok(Bitmap::Roaring(roaring))
}
}
}
/// Computes the union with another bitmap (in-place).
///
/// If the other bitmap is a different type, it will be converted to match
/// the current bitmap's type.
pub fn union(&mut self, other: Self) {
if self.is_empty() {
*self = other;
return;
}
match (self, other) {
(Bitmap::BitVec(bitvec1), bitmap) => {
let bitvec2 = bitmap.into_bitvec();
if bitvec1.len() > bitvec2.len() {
*bitvec1 |= bitvec2
} else {
*bitvec1 = bitvec2 | &*bitvec1;
}
}
(Bitmap::Roaring(roaring1), bitmap) => {
let roaring2 = bitmap.into_roaring();
*roaring1 |= roaring2;
}
}
}
/// Computes the intersection with another bitmap (in-place).
///
/// If the other bitmap is a different type, it will be converted to match
/// the current bitmap's type.
pub fn intersect(&mut self, other: Self) {
match (self, other) {
(Bitmap::BitVec(bitvec1), bitmap) => {
let mut bitvec2 = bitmap.into_bitvec();
let len = (bitvec1.len() - bitvec1.trailing_zeros())
.min(bitvec2.len() - bitvec2.trailing_zeros());
bitvec1.truncate(len);
bitvec2.truncate(len);
*bitvec1 &= bitvec2;
}
(Bitmap::Roaring(roaring1), bitmap) => {
let roaring2 = bitmap.into_roaring();
*roaring1 &= roaring2;
}
}
}
/// Returns an iterator over the indices of set bits.
pub fn iter_ones(&self) -> Box<dyn Iterator<Item = usize> + '_> {
match self {
Bitmap::BitVec(bitvec) => Box::new(bitvec.iter_ones()),
Bitmap::Roaring(roaring) => Box::new(roaring.iter().map(|x| x as usize)),
}
}
/// Creates a bitmap from bytes in LSB0 (least significant bit first) order.
///
/// # Arguments
/// * `bytes` - Input bytes in LSB0 order
/// * `bitmap_type` - Type of bitmap to create
pub fn from_lsb0_bytes(bytes: &[u8], bitmap_type: BitmapType) -> Self {
match bitmap_type {
BitmapType::BitVec => {
let bitvec = BitVec::from_slice(bytes);
Bitmap::BitVec(bitvec)
}
BitmapType::Roaring => {
let roaring = RoaringBitmap::from_lsb0_bytes(0, bytes);
Bitmap::Roaring(roaring)
}
}
}
/// Computes memory usage of the bitmap in bytes.
pub fn memory_usage(&self) -> usize {
match self {
Bitmap::BitVec(bitvec) => bitvec.capacity(),
Bitmap::Roaring(roaring) => {
let stat = roaring.statistics();
(stat.n_bytes_array_containers
+ stat.n_bytes_bitset_containers
+ stat.n_bytes_run_containers) as usize
}
}
}
fn into_bitvec(self) -> BitVec {
match self {
Bitmap::BitVec(bitvec) => bitvec,
Bitmap::Roaring(roaring) => Self::roaring_to_bitvec(&roaring),
}
}
fn into_roaring(self) -> RoaringBitmap {
match self {
Bitmap::Roaring(roaring) => roaring,
Bitmap::BitVec(bitvec) => Self::bitvec_to_roaring(bitvec),
}
}
fn roaring_to_bitvec(roaring: &RoaringBitmap) -> BitVec {
let max_value = roaring.max().unwrap_or(0);
let mut bitvec = BitVec::repeat(false, max_value as usize + 1);
for i in roaring {
bitvec.set(i as usize, true);
}
bitvec
}
fn bitvec_to_roaring(mut bitvec: BitVec) -> RoaringBitmap {
bitvec.resize(bitvec.capacity(), false);
RoaringBitmap::from_lsb0_bytes(0, bitvec.as_raw_slice())
}
}
impl Default for Bitmap {
fn default() -> Self {
Bitmap::new_roaring()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_full_bitmaps() {
let bv = Bitmap::full_bitvec(10);
assert_eq!(bv.count_ones(), 10);
let rb = Bitmap::full_roaring(10);
assert_eq!(rb.count_ones(), 10);
}
#[test]
fn test_serialization_roundtrip() {
let original = Bitmap::full_roaring(100);
let mut buf = Vec::new();
// Serialize as Roaring
original
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::Roaring).unwrap();
assert_eq!(original, deserialized);
// Serialize as BitVec
buf.clear();
original
.serialize_into(BitmapType::BitVec, &mut buf)
.unwrap();
let deserialized = Bitmap::deserialize_from(&buf, BitmapType::BitVec).unwrap();
assert_eq!(original.count_ones(), deserialized.count_ones());
}
#[test]
fn test_union_fulls() {
// Test BitVec union
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
bv1.union(bv2);
assert_eq!(bv1.count_ones(), 5);
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
bv1.union(bv2);
assert_eq!(bv1.count_ones(), 5);
// Test Roaring union
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
rb1.union(rb2);
assert_eq!(rb1.count_ones(), 5);
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
rb1.union(rb2);
assert_eq!(rb1.count_ones(), 5);
// Test cross-type union
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
let bv = Bitmap::full_bitvec(3); // 0-2: 111
rb.union(bv);
assert_eq!(rb.count_ones(), 5);
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
let rb = Bitmap::full_roaring(3); // 0-2: 111
bv.union(rb);
assert_eq!(bv.count_ones(), 5);
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
rb.union(bv);
assert_eq!(rb.count_ones(), 5);
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
let rb = Bitmap::full_roaring(5); // 0-4: 11111
bv.union(rb);
assert_eq!(bv.count_ones(), 5);
}
#[test]
fn test_union_bitvec() {
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
);
// Test different lengths
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::BitVec)
);
// Test empty bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
// Test empty and full bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::full_bitvec(8);
bv1.union(bv2);
assert_eq!(bv1, Bitmap::full_bitvec(8));
let mut bv1 = Bitmap::full_bitvec(8);
let bv2 = Bitmap::new_bitvec();
bv1.union(bv2);
assert_eq!(bv1, Bitmap::full_bitvec(8));
}
#[test]
fn test_union_roaring() {
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
// Test different lengths
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101, 0b00000001], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b11111111, 0b00000001], BitmapType::Roaring)
);
// Test empty bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
// Test empty and full bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::full_roaring(8);
rb1.union(rb2);
assert_eq!(rb1, Bitmap::full_roaring(8));
let mut rb1 = Bitmap::full_roaring(8);
let rb2 = Bitmap::new_roaring();
rb1.union(rb2);
assert_eq!(rb1, Bitmap::full_roaring(8));
}
#[test]
fn test_union_mixed() {
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::full_bitvec(8);
rb.union(bv);
assert_eq!(rb, Bitmap::full_roaring(8));
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.union(rb);
assert_eq!(bv, Bitmap::full_bitvec(8));
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::full_bitvec(8);
rb.union(bv);
assert_eq!(rb, Bitmap::full_bitvec(8));
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::new_roaring();
bv.union(rb);
assert_eq!(bv, Bitmap::full_bitvec(8));
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::new_bitvec();
rb.union(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::new_roaring();
bv.union(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec);
let rb = Bitmap::new_roaring();
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
let bv = Bitmap::new_bitvec();
rb.union(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring);
bv.union(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
}
#[test]
fn test_intersect_fulls() {
// Test BitVec intersect
let mut bv1 = Bitmap::full_bitvec(3); // 0-2: 111
let bv2 = Bitmap::full_bitvec(5); // 0-4: 11111
bv1.intersect(bv2);
assert_eq!(bv1.count_ones(), 3);
let mut bv1 = Bitmap::full_bitvec(5); // 0-4: 11111
let bv2 = Bitmap::full_bitvec(3); // 0-2: 111
bv1.intersect(bv2);
assert_eq!(bv1.count_ones(), 3);
// Test Roaring intersect
let mut rb1 = Bitmap::full_roaring(3); // 0-2: 111
let rb2 = Bitmap::full_roaring(5); // 0-4: 11111
rb1.intersect(rb2);
assert_eq!(rb1.count_ones(), 3);
let mut rb1 = Bitmap::full_roaring(5); // 0-4: 11111
let rb2 = Bitmap::full_roaring(3); // 0-2: 111
rb1.intersect(rb2);
assert_eq!(rb1.count_ones(), 3);
// Test cross-type intersect
let mut rb = Bitmap::full_roaring(5); // 0-4: 11111
let bv = Bitmap::full_bitvec(3); // 0-2: 111
rb.intersect(bv);
assert_eq!(rb.count_ones(), 3);
let mut bv = Bitmap::full_bitvec(5); // 0-4: 11111
let rb = Bitmap::full_roaring(3); // 0-2: 111
bv.intersect(rb);
assert_eq!(bv.count_ones(), 3);
let mut rb = Bitmap::full_roaring(3); // 0-2: 111
let bv = Bitmap::full_bitvec(5); // 0-4: 11111
rb.intersect(bv);
assert_eq!(rb.count_ones(), 3);
let mut bv = Bitmap::full_bitvec(3); // 0-2: 111
let rb = Bitmap::full_roaring(5); // 0-4: 11111
bv.intersect(rb);
assert_eq!(bv.count_ones(), 3);
}
#[test]
fn test_intersect_bitvec() {
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
// Test different lengths
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert_eq!(
bv1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
// Test empty bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
// Test empty and full bitmaps
let mut bv1 = Bitmap::new_bitvec();
let bv2 = Bitmap::full_bitvec(8);
bv1.intersect(bv2);
assert!(bv1.is_empty());
let mut bv1 = Bitmap::full_bitvec(8);
let bv2 = Bitmap::new_bitvec();
bv1.intersect(bv2);
assert!(bv1.is_empty());
}
#[test]
fn test_intersect_roaring() {
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
// Test different lengths
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert_eq!(
rb1,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
// Test empty bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
// Test empty and full bitmaps
let mut rb1 = Bitmap::new_roaring();
let rb2 = Bitmap::full_roaring(8);
rb1.intersect(rb2);
assert!(rb1.is_empty());
let mut rb1 = Bitmap::full_roaring(8);
let rb2 = Bitmap::new_roaring();
rb1.intersect(rb2);
assert!(rb1.is_empty());
}
#[test]
fn test_intersect_mixed() {
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::full_bitvec(8);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring)
);
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::from_lsb0_bytes(&[0b11110000, 0b00000001], BitmapType::Roaring);
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert_eq!(
rb,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::Roaring)
);
let mut bv = Bitmap::from_lsb0_bytes(&[0b11110000], BitmapType::BitVec);
let rb = Bitmap::from_lsb0_bytes(&[0b10101010, 0b00000001], BitmapType::Roaring);
bv.intersect(rb);
assert_eq!(
bv,
Bitmap::from_lsb0_bytes(&[0b10100000], BitmapType::BitVec)
);
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::full_bitvec(8);
rb.intersect(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::full_bitvec(8);
let rb = Bitmap::new_roaring();
bv.intersect(rb);
assert!(bv.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::full_roaring(8);
bv.intersect(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::full_roaring(8);
let bv = Bitmap::new_bitvec();
rb.intersect(bv);
assert!(rb.is_empty());
let mut rb = Bitmap::new_roaring();
let bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
rb.intersect(bv);
assert!(rb.is_empty());
let mut bv = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::BitVec);
let rb = Bitmap::new_roaring();
bv.intersect(rb);
assert!(bv.is_empty());
let mut bv = Bitmap::new_bitvec();
let rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
bv.intersect(rb);
assert!(bv.is_empty());
let mut rb = Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring);
let bv = Bitmap::new_bitvec();
rb.intersect(bv);
assert!(rb.is_empty());
}
#[test]
fn test_insert_range() {
let mut bv = Bitmap::new_bitvec();
bv.insert_range(0..=5);
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
let mut rb = Bitmap::new_roaring();
rb.insert_range(0..=5);
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 5]);
let mut bv = Bitmap::new_bitvec();
bv.insert_range(10..=10);
assert_eq!(bv.iter_ones().collect::<Vec<_>>(), vec![10]);
let mut rb = Bitmap::new_roaring();
rb.insert_range(10..=10);
assert_eq!(rb.iter_ones().collect::<Vec<_>>(), vec![10]);
}
}

View File

@@ -17,7 +17,6 @@ pub mod sort_create;
use async_trait::async_trait;
use crate::bitmap::BitmapType;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::InvertedIndexWriter;
use crate::BytesRef;
@@ -54,9 +53,5 @@ pub trait InvertedIndexCreator: Send {
/// Finalizes the index creation process, ensuring all data is properly indexed and stored
/// in the provided writer
async fn finish(
&mut self,
writer: &mut dyn InvertedIndexWriter,
bitmap_type: BitmapType,
) -> Result<()>;
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()>;
}

View File

@@ -17,23 +17,22 @@ mod intermediate_rw;
mod merge_stream;
use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::writer::ValueStream;
use crate::{Bytes, BytesRef};
/// A stream of sorted values along with their associated bitmap
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
/// Output of a sorting operation, encapsulating a bitmap for null values and a stream of sorted items
pub struct SortOutput {
/// Bitmap indicating which segments have null values
pub segment_null_bitmap: Bitmap,
pub segment_null_bitmap: BitVec,
/// Stream of sorted items
pub sorted_stream: ValueStream,
pub sorted_stream: SortedStream,
/// Total number of rows in the sorted data
pub total_row_count: usize,

View File

@@ -20,11 +20,11 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use common_base::BitVec;
use common_telemetry::{debug, error};
use futures::stream;
use snafu::ResultExt;
use crate::bitmap::Bitmap;
use crate::external_provider::ExternalTempFileProvider;
use crate::inverted_index::create::sort::intermediate_rw::{
IntermediateReader, IntermediateWriter,
@@ -45,10 +45,18 @@ pub struct ExternalSorter {
temp_file_provider: Arc<dyn ExternalTempFileProvider>,
/// Bitmap indicating which segments have null values
segment_null_bitmap: Bitmap,
segment_null_bitmap: BitVec,
/// In-memory buffer to hold values and their corresponding bitmaps until memory threshold is exceeded
values_buffer: BTreeMap<Bytes, (Bitmap, usize)>,
values_buffer: BTreeMap<Bytes, BitVec>,
/// Count of rows in the last dumped buffer, used to streamline memory usage of `values_buffer`.
///
/// After data is dumped to external files, `last_dump_row_count` is updated to reflect the new starting point
/// for `BitVec` indexing. This means each `BitVec` in `values_buffer` thereafter encodes positions relative to
/// this count, not from 0. This mechanism effectively shrinks the memory footprint of each `BitVec`, helping manage
/// memory use more efficiently by focusing only on newly ingested data post-dump.
last_dump_row_count: usize,
/// Count of all rows ingested so far
total_row_count: usize,
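// A rough illustration of the relative indexing described above (illustration only,
// not part of this patch; the helper below is hypothetical). With segment_row_count
// = 1024 and last_dump_row_count = 4096, a value first seen at global row 5000 is
// recorded at segment index (5000 - 4096) / 1024 = 0 instead of 5000 / 1024 = 4; the
// omitted leading segments are added back later using the leading-zeros count
// (last_dump_row_count / segment_row_count), both when merging the in-memory buffer
// and when reading intermediate files.
fn relative_segment_index(global_row: usize, last_dump_row_count: usize, segment_row_count: usize) -> usize {
    (global_row - last_dump_row_count) / segment_row_count
}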
@@ -85,14 +93,14 @@ impl Sorter for ExternalSorter {
return Ok(());
}
let segment_index_range = self.segment_index_range(n);
let segment_index_range = self.segment_index_range(n, value.is_none());
self.total_row_count += n;
if let Some(value) = value {
let memory_diff = self.push_not_null(value, segment_index_range);
self.may_dump_buffer(memory_diff).await
} else {
self.segment_null_bitmap.insert_range(segment_index_range);
set_bits(&mut self.segment_null_bitmap, segment_index_range);
Ok(())
}
}
@@ -109,10 +117,15 @@ impl Sorter for ExternalSorter {
// TODO(zhongzc): k-way merge instead of 2-way merge
let mut tree_nodes: VecDeque<SortedStream> = VecDeque::with_capacity(readers.len() + 1);
let leading_zeros = self.last_dump_row_count / self.segment_row_count;
tree_nodes.push_back(Box::new(stream::iter(
mem::take(&mut self.values_buffer)
.into_iter()
.map(|(value, (bitmap, _))| Ok((value, bitmap))),
.map(move |(value, mut bitmap)| {
bitmap.resize(bitmap.len() + leading_zeros, false);
bitmap.shift_right(leading_zeros);
Ok((value, bitmap))
}),
)));
for (_, reader) in readers {
tree_nodes.push_back(IntermediateReader::new(reader).into_stream().await?);
@@ -148,10 +161,11 @@ impl ExternalSorter {
index_name,
temp_file_provider,
segment_null_bitmap: Bitmap::new_bitvec(), // bitvec is more efficient for many null values
segment_null_bitmap: BitVec::new(),
values_buffer: BTreeMap::new(),
total_row_count: 0,
last_dump_row_count: 0,
segment_row_count,
current_memory_usage: 0,
@@ -181,7 +195,7 @@ impl ExternalSorter {
}
/// Pushes the non-null values to the values buffer and sets the bits within
/// the specified range in the given bitmap to true.
/// the specified range in the given BitVec to true.
/// Returns the memory usage difference of the buffer after the operation.
fn push_not_null(
&mut self,
@@ -189,23 +203,20 @@ impl ExternalSorter {
segment_index_range: RangeInclusive<usize>,
) -> usize {
match self.values_buffer.get_mut(value) {
Some((bitmap, mem_usage)) => {
bitmap.insert_range(segment_index_range);
let new_usage = bitmap.memory_usage() + value.len();
let diff = new_usage - *mem_usage;
*mem_usage = new_usage;
Some(bitmap) => {
let old_len = bitmap.as_raw_slice().len();
set_bits(bitmap, segment_index_range);
diff
bitmap.as_raw_slice().len() - old_len
}
None => {
let mut bitmap = Bitmap::new_roaring();
bitmap.insert_range(segment_index_range);
let mut bitmap = BitVec::default();
set_bits(&mut bitmap, segment_index_range);
let mem_usage = bitmap.memory_usage() + value.len();
self.values_buffer
.insert(value.to_vec(), (bitmap, mem_usage));
let mem_diff = bitmap.as_raw_slice().len() + value.len();
self.values_buffer.insert(value.to_vec(), bitmap);
mem_usage
mem_diff
}
}
}
@@ -246,8 +257,12 @@ impl ExternalSorter {
.fetch_sub(memory_usage, Ordering::Relaxed);
self.current_memory_usage = 0;
let bitmap_leading_zeros = self.last_dump_row_count / self.segment_row_count;
self.last_dump_row_count =
self.total_row_count - self.total_row_count % self.segment_row_count; // align to segment
let entries = values.len();
IntermediateWriter::new(writer).write_all(values.into_iter().map(|(k, (b, _))| (k, b))).await.inspect(|_|
IntermediateWriter::new(writer).write_all(values, bitmap_leading_zeros as _).await.inspect(|_|
debug!("Dumped {entries} entries ({memory_usage} bytes) to intermediate file {file_id} for index {index_name}")
).inspect_err(|e|
error!(e; "Failed to dump {entries} entries to intermediate file {file_id} for index {index_name}")
@@ -256,8 +271,13 @@ impl ExternalSorter {
/// Determines the segment index range for the row index range
/// `[row_begin, row_begin + n - 1]`
fn segment_index_range(&self, n: usize) -> RangeInclusive<usize> {
let row_begin = self.total_row_count;
fn segment_index_range(&self, n: usize, is_null: bool) -> RangeInclusive<usize> {
let row_begin = if is_null {
self.total_row_count
} else {
self.total_row_count - self.last_dump_row_count
};
let start = self.segment_index(row_begin);
let end = self.segment_index(row_begin + n - 1);
start..=end
@@ -269,6 +289,16 @@ impl ExternalSorter {
}
}
/// Sets the bits within the specified range in the given `BitVec` to true
fn set_bits(bitmap: &mut BitVec, index_range: RangeInclusive<usize>) {
if *index_range.end() >= bitmap.len() {
bitmap.resize(index_range.end() + 1, false);
}
for index in index_range {
bitmap.set(index, true);
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
@@ -300,7 +330,7 @@ mod tests {
move |index_name, file_id| {
assert_eq!(index_name, "test");
let mut files = files.lock().unwrap();
let (writer, reader) = duplex(1024 * 1024);
let (writer, reader) = duplex(8 * 1024);
files.insert(file_id.to_string(), Box::new(reader.compat()));
Ok(Box::new(writer.compat_write()))
}

View File

@@ -19,24 +19,29 @@
//! The serialization format is as follows:
//!
//! ```text
//! [magic][item][item]...[item]
//! [4] [?]
//! [magic][bitmap leading zeros][item][item]...[item]
//! [4] [4] [?]
//!
//! Each [item] is structured as:
//! [value len][value][bitmap len][bitmap]
//! [8] [?] [8] [?]
//! ```
//!
//! Each item represents a value and its associated bitmap, serialized with their lengths for
//! The format starts with a 4-byte magic identifier, followed by a 4-byte
//! count of leading zero bits that are omitted from every serialized bitmap
//! in the file and prepended back by the reader when decoding. Following that, each item represents
//! a value and its associated bitmap, serialized with their lengths for
//! easier deserialization.
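// Sketch of the layout described above (illustration only, not part of this patch;
// the real encoder and decoder live in `codec_v1`). Item lengths are written as
// little-endian u64 and the leading-zeros count as a big-endian u32, matching the
// writer in this module.
fn sketch_encode(leading_zeros: u32, items: &[(Vec<u8>, Vec<u8>)]) -> Vec<u8> {
    let mut out = Vec::new();
    out.extend_from_slice(b"im01");                      // [magic]
    out.extend_from_slice(&leading_zeros.to_be_bytes()); // [bitmap leading zeros]
    for (value, bitmap_bytes) in items {
        out.extend_from_slice(&(value.len() as u64).to_le_bytes());        // [value len]
        out.extend_from_slice(value);                                      // [value]
        out.extend_from_slice(&(bitmap_bytes.len() as u64).to_le_bytes()); // [bitmap len]
        out.extend_from_slice(bitmap_bytes);                               // [bitmap]
    }
    out
}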
mod codec_v1;
use std::collections::BTreeMap;
use asynchronous_codec::{FramedRead, FramedWrite};
use common_base::BitVec;
use futures::{stream, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, StreamExt};
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::{
CloseSnafu, FlushSnafu, ReadSnafu, Result, UnknownIntermediateCodecMagicSnafu, WriteSnafu,
@@ -57,13 +62,12 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
/// Serializes and writes all provided values to the wrapped writer
pub async fn write_all(
mut self,
values: impl IntoIterator<Item = (Bytes, Bitmap)>,
values: BTreeMap<Bytes, BitVec>,
bitmap_leading_zeros: u32,
) -> Result<()> {
let (codec_magic, encoder) = (
codec_v1::CODEC_V1_MAGIC,
codec_v1::IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
},
codec_v1::IntermediateItemEncoderV1,
);
self.writer
@@ -71,6 +75,11 @@ impl<W: AsyncWrite + Unpin> IntermediateWriter<W> {
.await
.context(WriteSnafu)?;
self.writer
.write_all(&bitmap_leading_zeros.to_be_bytes())
.await
.context(WriteSnafu)?;
let value_stream = stream::iter(values.into_iter().map(Ok));
let frame_write = FramedWrite::new(&mut self.writer, encoder);
// `forward()` will flush and close the writer when the stream ends
@@ -103,9 +112,17 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
.context(ReadSnafu)?;
let decoder = match &magic {
codec_v1::CODEC_V1_MAGIC => codec_v1::IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
},
codec_v1::CODEC_V1_MAGIC => {
let bitmap_leading_zeros = {
let mut buf = [0u8; 4];
self.reader.read_exact(&mut buf).await.context(ReadSnafu)?;
u32::from_be_bytes(buf)
};
codec_v1::IntermediateItemDecoderV1 {
bitmap_leading_zeros,
}
}
_ => return UnknownIntermediateCodecMagicSnafu { magic }.fail(),
};
@@ -115,7 +132,6 @@ impl<R: AsyncRead + Unpin + Send + 'static> IntermediateReader<R> {
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::io::{Seek, SeekFrom};
use futures::io::{AllowStdIo, Cursor};
@@ -124,10 +140,6 @@ mod tests {
use super::*;
use crate::inverted_index::error::Error;
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
#[tokio::test]
async fn test_intermediate_read_write_basic() {
let file_r = tempfile().unwrap();
@@ -136,12 +148,12 @@ mod tests {
let buf_w = AllowStdIo::new(file_w);
let values = BTreeMap::from_iter([
(Bytes::from("a"), bitmap(&[0b10101010])),
(Bytes::from("b"), bitmap(&[0b01010101])),
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
]);
let writer = IntermediateWriter::new(buf_w);
writer.write_all(values.clone()).await.unwrap();
writer.write_all(values.clone(), 0).await.unwrap();
// reset the handle
buf_r.seek(SeekFrom::Start(0)).unwrap();
@@ -149,9 +161,48 @@ mod tests {
let mut stream = reader.into_stream().await.unwrap();
let a = stream.next().await.unwrap().unwrap();
assert_eq!(a, (Bytes::from("a"), bitmap(&[0b10101010])));
assert_eq!(a, (Bytes::from("a"), BitVec::from_slice(&[0b10101010])));
let b = stream.next().await.unwrap().unwrap();
assert_eq!(b, (Bytes::from("b"), bitmap(&[0b01010101])));
assert_eq!(b, (Bytes::from("b"), BitVec::from_slice(&[0b01010101])));
assert!(stream.next().await.is_none());
}
#[tokio::test]
async fn test_intermediate_read_write_with_prefix_zeros() {
let file_r = tempfile().unwrap();
let file_w = file_r.try_clone().unwrap();
let mut buf_r = AllowStdIo::new(file_r);
let buf_w = AllowStdIo::new(file_w);
let values = BTreeMap::from_iter([
(Bytes::from("a"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("b"), BitVec::from_slice(&[0b01010101])),
]);
let writer = IntermediateWriter::new(buf_w);
writer.write_all(values.clone(), 8).await.unwrap();
// reset the handle
buf_r.seek(SeekFrom::Start(0)).unwrap();
let reader = IntermediateReader::new(buf_r);
let mut stream = reader.into_stream().await.unwrap();
let a = stream.next().await.unwrap().unwrap();
assert_eq!(
a,
(
Bytes::from("a"),
BitVec::from_slice(&[0b00000000, 0b10101010])
)
);
let b = stream.next().await.unwrap().unwrap();
assert_eq!(
b,
(
Bytes::from("b"),
BitVec::from_slice(&[0b00000000, 0b01010101])
)
);
assert!(stream.next().await.is_none());
}
@@ -162,7 +213,7 @@ mod tests {
let values = BTreeMap::new();
let writer = IntermediateWriter::new(&mut buf);
writer.write_all(values.clone()).await.unwrap();
writer.write_all(values.clone(), 0).await.unwrap();
let reader = IntermediateReader::new(Cursor::new(buf));
let mut stream = reader.into_stream().await.unwrap();

View File

@@ -16,10 +16,9 @@ use std::io;
use asynchronous_codec::{BytesMut, Decoder, Encoder};
use bytes::{Buf, BufMut};
use greptime_proto::v1::index::BitmapType;
use common_base::BitVec;
use snafu::ResultExt;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{CommonIoSnafu, Error, Result};
use crate::Bytes;
@@ -29,42 +28,37 @@ const U64_LENGTH: usize = std::mem::size_of::<u64>();
pub const CODEC_V1_MAGIC: &[u8; 4] = b"im01";
/// Serializes items of external sorting intermediate files.
pub struct IntermediateItemEncoderV1 {
pub bitmap_type: BitmapType,
}
pub struct IntermediateItemEncoderV1;
/// [`FramedWrite`] requires the [`Encoder`] trait to be implemented.
impl Encoder for IntermediateItemEncoderV1 {
type Item<'a> = (Bytes, Bitmap);
type Item<'a> = (Bytes, BitVec);
type Error = Error;
fn encode(&mut self, item: (Bytes, Bitmap), dst: &mut BytesMut) -> Result<()> {
fn encode(&mut self, item: (Bytes, BitVec), dst: &mut BytesMut) -> Result<()> {
let value_bytes = item.0;
let bitmap_size = item.1.serialized_size(self.bitmap_type);
let bitmap_bytes = item.1.into_vec();
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_size);
dst.reserve(U64_LENGTH * 2 + value_bytes.len() + bitmap_bytes.len());
dst.put_u64_le(value_bytes.len() as u64);
dst.extend_from_slice(&value_bytes);
dst.put_u64_le(bitmap_size as u64);
item.1
.serialize_into(self.bitmap_type, &mut dst.writer())
.context(CommonIoSnafu)?;
dst.put_u64_le(bitmap_bytes.len() as u64);
dst.extend_from_slice(&bitmap_bytes);
Ok(())
}
}
/// Deserializes items of external sorting intermediate files.
pub struct IntermediateItemDecoderV1 {
pub bitmap_type: BitmapType,
pub(crate) bitmap_leading_zeros: u32,
}
/// [`FramedRead`] requires the [`Decoder`] trait to be implemented.
impl Decoder for IntermediateItemDecoderV1 {
type Item = (Bytes, Bitmap);
type Item = (Bytes, BitVec);
type Error = Error;
/// Decodes the `src` into `(Bytes, RoaringBitmap)`. Returns `None` if
/// Decodes the `src` into `(Bytes, BitVec)`. Returns `None` if
/// the `src` does not contain enough data for a complete item.
///
/// Only after successful decoding, the `src` is advanced. Otherwise,
@@ -98,8 +92,8 @@ impl Decoder for IntermediateItemDecoderV1 {
return Ok(None);
}
let bitmap = Bitmap::deserialize_from(&buf[..bitmap_len], self.bitmap_type)
.context(CommonIoSnafu)?;
let mut bitmap = BitVec::repeat(false, self.bitmap_leading_zeros as _);
bitmap.extend_from_raw_slice(&buf[..bitmap_len]);
let item = (value_bytes.to_vec(), bitmap);
@@ -119,29 +113,25 @@ impl From<io::Error> for Error {
#[cfg(test)]
mod tests {
use super::*;
use common_base::bit_vec::prelude::{bitvec, Lsb0};
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
use super::*;
#[test]
fn test_intermediate_codec_basic() {
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
bitmap_leading_zeros: 0,
};
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
let item1 = (b"world".to_vec(), bitmap(&[0b01010101]));
let item1 = (b"world".to_vec(), BitVec::from_slice(&[0b01010101]));
encoder.encode(item.clone(), &mut buf).unwrap();
encoder.encode(item1.clone(), &mut buf).unwrap();
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
@@ -152,16 +142,14 @@ mod tests {
#[test]
fn test_intermediate_codec_empty_item() {
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"".to_vec(), bitmap(&[]));
let item = (b"".to_vec(), BitVec::from_slice(&[]));
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
bitmap_leading_zeros: 0,
};
assert_eq!(decoder.decode(&mut buf).unwrap().unwrap(), item);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
@@ -170,19 +158,17 @@ mod tests {
#[test]
fn test_intermediate_codec_partial() {
let mut encoder = IntermediateItemEncoderV1 {
bitmap_type: BitmapType::Roaring,
};
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitmap(&[0b10101010]));
let item = (b"hello".to_vec(), BitVec::from_slice(&[0b10101010]));
encoder.encode(item.clone(), &mut buf).unwrap();
let partial_length = U64_LENGTH + 3;
let mut partial_bytes = buf.split_to(partial_length);
let mut decoder = IntermediateItemDecoderV1 {
bitmap_type: BitmapType::Roaring,
bitmap_leading_zeros: 0,
};
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None); // not enough data
partial_bytes.extend_from_slice(&buf[..]);
@@ -190,4 +176,25 @@ mod tests {
assert_eq!(decoder.decode(&mut partial_bytes).unwrap(), None);
assert!(partial_bytes.is_empty());
}
#[test]
fn test_intermediate_codec_prefix_zeros() {
let mut encoder = IntermediateItemEncoderV1;
let mut buf = BytesMut::new();
let item = (b"hello".to_vec(), bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]);
encoder.encode(item.clone(), &mut buf).unwrap();
let mut decoder = IntermediateItemDecoderV1 {
bitmap_leading_zeros: 3,
};
let decoded_item = decoder.decode(&mut buf).unwrap().unwrap();
assert_eq!(decoded_item.0, b"hello");
assert_eq!(
decoded_item.1,
bitvec![u8, Lsb0; 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0]
);
assert_eq!(decoder.decode(&mut buf).unwrap(), None);
assert!(buf.is_empty());
}
}

View File

@@ -16,10 +16,10 @@ use std::cmp::Ordering;
use std::pin::Pin;
use std::task::{Context, Poll};
use common_base::BitVec;
use futures::{ready, Stream, StreamExt};
use pin_project::pin_project;
use crate::bitmap::Bitmap;
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::Result;
use crate::Bytes;
@@ -28,10 +28,10 @@ use crate::Bytes;
#[pin_project]
pub struct MergeSortedStream {
stream1: Option<SortedStream>,
peek1: Option<(Bytes, Bitmap)>,
peek1: Option<(Bytes, BitVec)>,
stream2: Option<SortedStream>,
peek2: Option<(Bytes, Bitmap)>,
peek2: Option<(Bytes, BitVec)>,
}
impl MergeSortedStream {
@@ -49,7 +49,7 @@ impl MergeSortedStream {
}
impl Stream for MergeSortedStream {
type Item = Result<(Bytes, Bitmap)>;
type Item = Result<(Bytes, BitVec)>;
/// Polls both streams and returns the next item from the stream that has the smaller next item.
/// If both streams have the same next item, the bitmaps are unioned together.
@@ -89,77 +89,77 @@ impl Stream for MergeSortedStream {
}
/// Merges two bitmaps by bit-wise OR'ing them together, preserving all bits from both
fn merge_bitmaps(mut bitmap1: Bitmap, bitmap2: Bitmap) -> Bitmap {
bitmap1.union(bitmap2);
bitmap1
fn merge_bitmaps(bitmap1: BitVec, bitmap2: BitVec) -> BitVec {
// make sure longer bitmap is on the left to avoid truncation
#[allow(clippy::if_same_then_else)]
if bitmap1.len() > bitmap2.len() {
bitmap1 | bitmap2
} else {
bitmap2 | bitmap1
}
}
#[cfg(test)]
mod tests {
use futures::stream;
use greptime_proto::v1::index::BitmapType;
use super::*;
use crate::inverted_index::error::Error;
fn bitmap(bytes: &[u8]) -> Bitmap {
Bitmap::from_lsb0_bytes(bytes, BitmapType::Roaring)
}
fn sorted_stream_from_vec(vec: Vec<(Bytes, Bitmap)>) -> SortedStream {
fn sorted_stream_from_vec(vec: Vec<(Bytes, BitVec)>) -> SortedStream {
Box::new(stream::iter(vec.into_iter().map(Ok::<_, Error>)))
}
#[tokio::test]
async fn test_merge_sorted_stream_non_overlapping() {
let stream1 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), bitmap(&[0b10101010])),
(Bytes::from("orange"), bitmap(&[0b01010101])),
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("orange"), BitVec::from_slice(&[0b01010101])),
]);
let stream2 = sorted_stream_from_vec(vec![
(Bytes::from("banana"), bitmap(&[0b10101010])),
(Bytes::from("peach"), bitmap(&[0b01010101])),
(Bytes::from("banana"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
]);
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("apple"));
assert_eq!(item.1, bitmap(&[0b10101010]));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("banana"));
assert_eq!(item.1, bitmap(&[0b10101010]));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("orange"));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("peach"));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert!(merged_stream.next().await.is_none());
}
#[tokio::test]
async fn test_merge_sorted_stream_overlapping() {
let stream1 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), bitmap(&[0b10101010])),
(Bytes::from("orange"), bitmap(&[0b10101010])),
(Bytes::from("apple"), BitVec::from_slice(&[0b10101010])),
(Bytes::from("orange"), BitVec::from_slice(&[0b10101010])),
]);
let stream2 = sorted_stream_from_vec(vec![
(Bytes::from("apple"), bitmap(&[0b01010101])),
(Bytes::from("peach"), bitmap(&[0b01010101])),
(Bytes::from("apple"), BitVec::from_slice(&[0b01010101])),
(Bytes::from("peach"), BitVec::from_slice(&[0b01010101])),
]);
let mut merged_stream = MergeSortedStream::merge(stream1, stream2);
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("apple"));
assert_eq!(item.1, bitmap(&[0b11111111]));
assert_eq!(item.1, BitVec::from_slice(&[0b11111111]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("orange"));
assert_eq!(item.1, bitmap(&[0b10101010]));
assert_eq!(item.1, BitVec::from_slice(&[0b10101010]));
let item = merged_stream.next().await.unwrap().unwrap();
assert_eq!(item.0, Bytes::from("peach"));
assert_eq!(item.1, bitmap(&[0b01010101]));
assert_eq!(item.1, BitVec::from_slice(&[0b01010101]));
assert!(merged_stream.next().await.is_none());
}

View File

@@ -18,7 +18,6 @@ use std::num::NonZeroUsize;
use async_trait::async_trait;
use snafu::ensure;
use crate::bitmap::BitmapType;
use crate::inverted_index::create::sort::{SortOutput, Sorter};
use crate::inverted_index::create::InvertedIndexCreator;
use crate::inverted_index::error::{InconsistentRowCountSnafu, Result};
@@ -69,11 +68,7 @@ impl InvertedIndexCreator for SortIndexCreator {
}
/// Finalizes the sorting for all indexes and writes them using the inverted index writer
async fn finish(
&mut self,
writer: &mut dyn InvertedIndexWriter,
bitmap_type: BitmapType,
) -> Result<()> {
async fn finish(&mut self, writer: &mut dyn InvertedIndexWriter) -> Result<()> {
let mut output_row_count = None;
for (index_name, mut sorter) in self.sorters.drain() {
let SortOutput {
@@ -93,7 +88,7 @@ impl InvertedIndexCreator for SortIndexCreator {
);
writer
.add_index(index_name, segment_null_bitmap, sorted_stream, bitmap_type)
.add_index(index_name, segment_null_bitmap, sorted_stream)
.await?;
}
@@ -122,9 +117,9 @@ mod tests {
use futures::{stream, StreamExt};
use super::*;
use crate::bitmap::Bitmap;
use crate::inverted_index::create::sort::SortedStream;
use crate::inverted_index::error::Error;
use crate::inverted_index::format::writer::{MockInvertedIndexWriter, ValueStream};
use crate::inverted_index::format::writer::MockInvertedIndexWriter;
use crate::Bytes;
#[tokio::test]
@@ -148,10 +143,11 @@ mod tests {
}
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer.expect_add_index().times(3).returning(
|name, null_bitmap, stream, bitmap_type| {
mock_writer
.expect_add_index()
.times(3)
.returning(|name, null_bitmap, stream| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
match name.as_str() {
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
@@ -159,8 +155,7 @@ mod tests {
_ => panic!("unexpected index name: {}", name),
}
Ok(())
},
);
});
mock_writer
.expect_finish()
.times(1)
@@ -170,10 +165,7 @@ mod tests {
Ok(())
});
creator
.finish(&mut mock_writer, BitmapType::Roaring)
.await
.unwrap();
creator.finish(&mut mock_writer).await.unwrap();
}
#[tokio::test]
@@ -199,9 +191,8 @@ mod tests {
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.returning(|name, null_bitmap, stream, bitmap_type| {
.returning(|name, null_bitmap, stream| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
match name.as_str() {
"a" => assert_eq!(stream_to_values(stream), vec![b"1", b"2", b"3"]),
"b" => assert_eq!(stream_to_values(stream), vec![b"4", b"5", b"6"]),
@@ -212,7 +203,7 @@ mod tests {
});
mock_writer.expect_finish().never();
let res = creator.finish(&mut mock_writer, BitmapType::Roaring).await;
let res = creator.finish(&mut mock_writer).await;
assert!(matches!(res, Err(Error::InconsistentRowCount { .. })));
}
@@ -228,9 +219,8 @@ mod tests {
let mut mock_writer = MockInvertedIndexWriter::new();
mock_writer
.expect_add_index()
.returning(|name, null_bitmap, stream, bitmap_type| {
.returning(|name, null_bitmap, stream| {
assert!(null_bitmap.is_empty());
assert_eq!(bitmap_type, BitmapType::Roaring);
assert!(matches!(name.as_str(), "a" | "b" | "c"));
assert!(stream_to_values(stream).is_empty());
Ok(())
@@ -244,10 +234,7 @@ mod tests {
Ok(())
});
creator
.finish(&mut mock_writer, BitmapType::Roaring)
.await
.unwrap();
creator.finish(&mut mock_writer).await.unwrap();
}
fn set_bit(bit_vec: &mut BitVec, index: usize) {
@@ -296,21 +283,20 @@ mod tests {
async fn output(&mut self) -> Result<SortOutput> {
let segment_null_bitmap = self.values.remove(&None).unwrap_or_default();
let segment_null_bitmap = Bitmap::BitVec(segment_null_bitmap);
Ok(SortOutput {
segment_null_bitmap,
sorted_stream: Box::new(stream::iter(
std::mem::take(&mut self.values)
.into_iter()
.map(|(v, b)| Ok((v.unwrap(), Bitmap::BitVec(b)))),
.map(|(v, b)| Ok((v.unwrap(), b))),
)),
total_row_count: self.total_row_count,
})
}
}
fn stream_to_values(stream: ValueStream) -> Vec<Bytes> {
fn stream_to_values(stream: SortedStream) -> Vec<Bytes> {
futures::executor::block_on(async {
stream.map(|r| r.unwrap().0).collect::<Vec<Bytes>>().await
})

View File

@@ -110,14 +110,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to decode bitmap"))]
DecodeBitmap {
#[snafu(source)]
error: IoError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to decode protobuf"))]
DecodeProto {
#[snafu(source)]
@@ -248,7 +240,6 @@ impl ErrorExt for Error {
| CommonIo { .. }
| UnknownIntermediateCodecMagic { .. }
| FstCompile { .. }
| DecodeBitmap { .. }
| InvalidFooterPayloadSize { .. }
| BlobSizeTooSmall { .. } => StatusCode::Unexpected,

View File

@@ -18,11 +18,11 @@ use std::sync::Arc;
use async_trait::async_trait;
use bytes::Bytes;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{DecodeBitmapSnafu, DecodeFstSnafu, Result};
use crate::inverted_index::error::{DecodeFstSnafu, Result};
pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
use crate::inverted_index::FstMap;
@@ -67,25 +67,17 @@ pub trait InvertedIndexReader: Send + Sync {
}
/// Retrieves the bitmap from the given offset and size.
async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result<Bitmap> {
self.range_read(offset, size).await.and_then(|bytes| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
async fn bitmap(&self, offset: u64, size: u32) -> Result<BitVec> {
self.range_read(offset, size).await.map(BitVec::from_vec)
}
/// Retrieves the multiple bitmaps from the given ranges.
async fn bitmap_deque(
&mut self,
ranges: &[(Range<u64>, BitmapType)],
) -> Result<VecDeque<Bitmap>> {
let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip();
let bytes = self.read_vec(&ranges).await?;
bytes
async fn bitmap_deque(&mut self, ranges: &[Range<u64>]) -> Result<VecDeque<BitVec>> {
Ok(self
.read_vec(ranges)
.await?
.into_iter()
.zip(types)
.map(|(bytes, bitmap_type)| {
Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu)
})
.collect::<Result<VecDeque<_>>>()
.map(|bytes| BitVec::from_slice(bytes.as_ref()))
.collect::<VecDeque<_>>())
}
}

View File

@@ -78,14 +78,14 @@ impl<R: RangeReader + Sync> InvertedIndexReader for InvertedIndexBlobReader<R> {
#[cfg(test)]
mod tests {
use common_base::bit_vec::prelude::*;
use fst::MapBuilder;
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta, InvertedIndexMetas};
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};
use prost::Message;
use super::*;
use crate::bitmap::Bitmap;
fn mock_fst() -> Vec<u8> {
fn create_fake_fst() -> Vec<u8> {
let mut fst_buf = Vec::new();
let mut build = MapBuilder::new(&mut fst_buf).unwrap();
build.insert("key1".as_bytes(), 1).unwrap();
@@ -94,27 +94,19 @@ mod tests {
fst_buf
}
fn mock_bitmap() -> Bitmap {
Bitmap::from_lsb0_bytes(&[0b10101010, 0b10000000], BitmapType::Roaring)
}
fn mock_bitmap_bytes() -> Vec<u8> {
let mut buf = Vec::new();
mock_bitmap()
.serialize_into(BitmapType::Roaring, &mut buf)
.unwrap();
buf
fn create_fake_bitmap() -> Vec<u8> {
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0, 1, 0].into_vec()
}
fn create_inverted_index_blob() -> Vec<u8> {
let bitmap_size = mock_bitmap_bytes().len();
let fst_size = mock_fst().len();
let bitmap_size = create_fake_bitmap().len();
let fst_size = create_fake_fst().len();
// first index
let mut inverted_index = Vec::new();
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // value bitmap
inverted_index.extend_from_slice(&mock_bitmap_bytes()); // null bitmap
inverted_index.extend_from_slice(&mock_fst()); // fst
inverted_index.extend_from_slice(&create_fake_bitmap()); // value bitmap
inverted_index.extend_from_slice(&create_fake_bitmap()); // null bitmap
inverted_index.extend_from_slice(&create_fake_fst()); // fst
let meta = InvertedIndexMeta {
name: "tag0".to_string(),
@@ -124,7 +116,6 @@ mod tests {
null_bitmap_size: bitmap_size as _,
relative_fst_offset: (bitmap_size * 2) as _,
fst_size: fst_size as _,
bitmap_type: BitmapType::Roaring as _,
..Default::default()
};
@@ -137,7 +128,6 @@ mod tests {
null_bitmap_size: bitmap_size as _,
relative_fst_offset: (bitmap_size * 2) as _,
fst_size: fst_size as _,
bitmap_type: BitmapType::Roaring as _,
..Default::default()
};
@@ -178,19 +168,19 @@ mod tests {
let meta0 = metas.metas.get("tag0").unwrap();
assert_eq!(meta0.name, "tag0");
assert_eq!(meta0.base_offset, 0);
assert_eq!(meta0.inverted_index_size, 102);
assert_eq!(meta0.relative_null_bitmap_offset, 26);
assert_eq!(meta0.null_bitmap_size, 26);
assert_eq!(meta0.relative_fst_offset, 52);
assert_eq!(meta0.inverted_index_size, 54);
assert_eq!(meta0.relative_null_bitmap_offset, 2);
assert_eq!(meta0.null_bitmap_size, 2);
assert_eq!(meta0.relative_fst_offset, 4);
assert_eq!(meta0.fst_size, 50);
let meta1 = metas.metas.get("tag1").unwrap();
assert_eq!(meta1.name, "tag1");
assert_eq!(meta1.base_offset, 102);
assert_eq!(meta1.inverted_index_size, 102);
assert_eq!(meta1.relative_null_bitmap_offset, 26);
assert_eq!(meta1.null_bitmap_size, 26);
assert_eq!(meta1.relative_fst_offset, 52);
assert_eq!(meta1.base_offset, 54);
assert_eq!(meta1.inverted_index_size, 54);
assert_eq!(meta1.relative_null_bitmap_offset, 2);
assert_eq!(meta1.null_bitmap_size, 2);
assert_eq!(meta1.relative_fst_offset, 4);
assert_eq!(meta1.fst_size, 50);
}
@@ -234,29 +224,17 @@ mod tests {
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag0").unwrap();
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let metas = blob_reader.metadata().await.unwrap();
let meta = metas.metas.get("tag1").unwrap();
let bitmap = blob_reader
.bitmap(meta.base_offset, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader
.bitmap(meta.base_offset + 26, 26, BitmapType::Roaring)
.await
.unwrap();
assert_eq!(bitmap, mock_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
let bitmap = blob_reader.bitmap(meta.base_offset + 2, 2).await.unwrap();
assert_eq!(bitmap.into_vec(), create_fake_bitmap());
}
}

View File

@@ -18,14 +18,14 @@ mod single;
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::Result;
pub use crate::inverted_index::format::writer::blob::InvertedIndexBlobWriter;
use crate::Bytes;
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin>;
pub type ValueStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;
/// Trait for writing inverted index data to underlying storage.
#[mockall::automock]
@@ -37,13 +37,11 @@ pub trait InvertedIndexWriter: Send {
/// * `null_bitmap` marks positions of null entries.
/// * `values` is a stream of values and their locations, yielded lexicographically.
/// Errors occur if the values are out of order.
/// * `bitmap_type` is the type of bitmap to encode.
async fn add_index(
&mut self,
name: String,
null_bitmap: Bitmap,
null_bitmap: BitVec,
values: ValueStream,
bitmap_type: BitmapType,
) -> Result<()>;
/// Finalizes the index writing process, ensuring all data is written.

View File

@@ -15,12 +15,12 @@
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::{AsyncWrite, AsyncWriteExt};
use greptime_proto::v1::index::InvertedIndexMetas;
use prost::Message;
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{CloseSnafu, FlushSnafu, Result, WriteSnafu};
use crate::inverted_index::format::writer::single::SingleIndexWriter;
use crate::inverted_index::format::writer::{InvertedIndexWriter, ValueStream};
@@ -43,9 +43,8 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
async fn add_index(
&mut self,
name: String,
null_bitmap: Bitmap,
null_bitmap: BitVec,
values: ValueStream,
bitmap_type: BitmapType,
) -> Result<()> {
let single_writer = SingleIndexWriter::new(
name.clone(),
@@ -53,7 +52,6 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWrit
null_bitmap,
values,
&mut self.blob_writer,
bitmap_type,
);
let metadata = single_writer.write().await?;
@@ -102,7 +100,6 @@ impl<W: AsyncWrite + Send + Unpin> InvertedIndexBlobWriter<W> {
#[cfg(test)]
mod tests {
use futures::stream;
use greptime_proto::v1::index::BitmapType;
use super::*;
use crate::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
@@ -135,44 +132,24 @@ mod tests {
writer
.add_index(
"tag0".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
])),
BitmapType::Roaring,
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("x"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("y"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("z"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
])),
BitmapType::Roaring,
)
.await
.unwrap();
@@ -204,31 +181,22 @@ mod tests {
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
@@ -247,30 +215,21 @@ mod tests {
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
}
}

View File

@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_base::BitVec;
use fst::MapBuilder;
use futures::{AsyncWrite, AsyncWriteExt, Stream, StreamExt};
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexStats};
use snafu::ResultExt;
use crate::bitmap::{Bitmap, BitmapType};
use crate::inverted_index::error::{FstCompileSnafu, FstInsertSnafu, Result, WriteSnafu};
use crate::Bytes;
@@ -27,7 +27,7 @@ pub struct SingleIndexWriter<W, S> {
blob_writer: W,
/// The null bitmap to be written
null_bitmap: Bitmap,
null_bitmap: BitVec,
/// The stream of values to be written, yielded lexicographically
values: S,
@@ -37,40 +37,30 @@ pub struct SingleIndexWriter<W, S> {
/// Metadata about the index
meta: InvertedIndexMeta,
/// The type of bitmap to use
bitmap_type: BitmapType,
/// Buffer for writing the blob
buf: Vec<u8>,
}
impl<W, S> SingleIndexWriter<W, S>
where
W: AsyncWrite + Send + Unpin,
S: Stream<Item = Result<(Bytes, Bitmap)>> + Send + Unpin,
S: Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin,
{
/// Constructs a new `SingleIndexWriter`
pub fn new(
name: String,
base_offset: u64,
null_bitmap: Bitmap,
null_bitmap: BitVec,
values: S,
blob_writer: W,
bitmap_type: BitmapType,
) -> SingleIndexWriter<W, S> {
SingleIndexWriter {
blob_writer,
null_bitmap,
values,
fst: MapBuilder::memory(),
bitmap_type,
buf: Vec::new(),
meta: InvertedIndexMeta {
name,
base_offset,
stats: Some(InvertedIndexStats::default()),
bitmap_type: bitmap_type.into(),
..Default::default()
},
}
@@ -90,17 +80,14 @@ where
/// Writes the null bitmap to the blob and updates the metadata accordingly
async fn write_null_bitmap(&mut self) -> Result<()> {
self.buf.clear();
self.null_bitmap
.serialize_into(self.bitmap_type, &mut self.buf)
.expect("Write to vec should not fail");
let null_bitmap_bytes = self.null_bitmap.as_raw_slice();
self.blob_writer
.write_all(&self.buf)
.write_all(null_bitmap_bytes)
.await
.context(WriteSnafu)?;
self.meta.relative_null_bitmap_offset = self.meta.inverted_index_size as _;
self.meta.null_bitmap_size = self.buf.len() as _;
self.meta.null_bitmap_size = null_bitmap_bytes.len() as _;
self.meta.inverted_index_size += self.meta.null_bitmap_size as u64;
// update stats
@@ -113,18 +100,15 @@ where
}
/// Appends a value and its bitmap to the blob, updates the FST, and the metadata
async fn append_value(&mut self, value: Bytes, bitmap: Bitmap) -> Result<()> {
self.buf.clear();
bitmap
.serialize_into(self.bitmap_type, &mut self.buf)
.expect("Write to vec should not fail");
async fn append_value(&mut self, value: Bytes, bitmap: BitVec) -> Result<()> {
let bitmap_bytes = bitmap.into_vec();
self.blob_writer
.write_all(&self.buf)
.write_all(&bitmap_bytes)
.await
.context(WriteSnafu)?;
let offset = self.meta.inverted_index_size as u32;
let size = self.buf.len() as u32;
let size = bitmap_bytes.len() as u32;
self.meta.inverted_index_size += size as u64;
let packed = bytemuck::cast::<[u32; 2], u64>([offset, size]);
@@ -173,10 +157,9 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
Bitmap::new_roaring(),
BitVec::new(),
stream::empty(),
&mut blob,
BitmapType::Roaring,
);
let meta = writer.write().await.unwrap();
@@ -191,23 +174,13 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
stream::iter(vec![
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
]),
&mut blob,
BitmapType::Roaring,
);
let meta = writer.write().await.unwrap();
@@ -226,23 +199,13 @@ mod tests {
let writer = SingleIndexWriter::new(
"test".to_string(),
0,
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
stream::iter(vec![
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0000_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0000_0000]))),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
]),
&mut blob,
BitmapType::Roaring,
);
let res = writer.write().await;
assert!(matches!(res, Err(Error::FstInsert { .. })));

View File

@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMeta;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
@@ -36,7 +36,7 @@ impl<'a> ParallelFstValuesMapper<'a> {
pub async fn map_values_vec(
&mut self,
value_and_meta_vec: &[(Vec<u64>, &'a InvertedIndexMeta)],
) -> Result<Vec<Bitmap>> {
) -> Result<Vec<BitVec>> {
let groups = value_and_meta_vec
.iter()
.map(|(values, _)| values.len())
@@ -50,17 +50,15 @@ impl<'a> ParallelFstValuesMapper<'a> {
// bitmap offset and the lower 32 bits represent its size. This mapper uses these
// combined offset-size pairs to fetch and union multiple bitmaps into a single `BitVec`.
let [relative_offset, size] = bytemuck::cast::<u64, [u32; 2]>(*value);
let range = meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64;
fetch_ranges.push((
range,
BitmapType::try_from(meta.bitmap_type).unwrap_or(BitmapType::BitVec),
));
fetch_ranges.push(
meta.base_offset + relative_offset as u64
..meta.base_offset + relative_offset as u64 + size as u64,
);
}
}
if fetch_ranges.is_empty() {
return Ok(vec![Bitmap::new_bitvec()]);
return Ok(vec![BitVec::new()]);
}
common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges);
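// Illustration of the packed values mentioned above (not part of this patch): the
// writer stores each bitmap's relative offset and size as a [u32; 2] reinterpreted
// as a single u64 via bytemuck, and this mapper reverses that same cast losslessly.
fn pack_offset_size(relative_offset: u32, size: u32) -> u64 {
    bytemuck::cast::<[u32; 2], u64>([relative_offset, size])
}
fn unpack_offset_size(packed: u64) -> [u32; 2] {
    bytemuck::cast::<u64, [u32; 2]>(packed)
}
// e.g. unpack_offset_size(pack_offset_size(2, 1)) == [2, 1], matching the
// `value(2, 1)` helper used in the tests below.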
@@ -68,10 +66,14 @@ impl<'a> ParallelFstValuesMapper<'a> {
let mut output = Vec::with_capacity(groups.len());
for counter in groups {
let mut bitmap = Bitmap::new_roaring();
let mut bitmap = BitVec::new();
for _ in 0..counter {
let bm = bitmaps.pop_front().unwrap();
bitmap.union(bm);
if bm.len() > bitmap.len() {
bitmap = bm | bitmap
} else {
bitmap |= bm
}
}
output.push(bitmap);
@@ -85,6 +87,8 @@ impl<'a> ParallelFstValuesMapper<'a> {
mod tests {
use std::collections::VecDeque;
use common_base::bit_vec::prelude::*;
use super::*;
use crate::inverted_index::format::reader::MockInvertedIndexReader;
@@ -97,26 +101,19 @@ mod tests {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
for range in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type))
}
match (offset, size) {
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]),
(2, 1) => output.push_back(bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]),
_ => unreachable!(),
}
}
Ok(output)
});
let meta = InvertedIndexMeta {
bitmap_type: BitmapType::Roaring.into(),
..Default::default()
};
let meta = InvertedIndexMeta::default();
let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader);
let result = values_mapper
@@ -129,50 +126,33 @@ mod tests {
.map_values_vec(&[(vec![value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
let result = values_mapper
.map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b01010101], BitmapType::Roaring)
);
assert_eq!(
result[1],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 0, 1, 0, 1, 0, 1, 0, 1]);
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
let result = values_mapper
.map_values_vec(&[
(vec![value(2, 1), value(1, 1)], &meta),
@@ -180,13 +160,7 @@ mod tests {
])
.await
.unwrap();
assert_eq!(
result[0],
Bitmap::from_lsb0_bytes(&[0b11111111], BitmapType::Roaring)
);
assert_eq!(
result[1],
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
);
assert_eq!(result[0], bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]);
assert_eq!(result[1], bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1]);
}
}

View File

@@ -15,17 +15,17 @@
mod predicates_apply;
use async_trait::async_trait;
use common_base::BitVec;
pub use predicates_apply::PredicatesIndexApplier;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Result;
use crate::inverted_index::format::reader::InvertedIndexReader;
/// The output of an apply operation.
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ApplyOutput {
/// Bitmap of indices that match the predicates.
pub matched_segment_ids: Bitmap,
pub matched_segment_ids: BitVec,
/// The total number of rows in the index.
pub total_row_count: usize,

View File

@@ -15,9 +15,9 @@
use std::mem::size_of;
use async_trait::async_trait;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::{IndexNotFoundSnafu, Result};
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::search::fst_apply::{
@@ -50,11 +50,12 @@ impl IndexApplier for PredicatesIndexApplier {
) -> Result<ApplyOutput> {
let metadata = reader.metadata().await?;
let mut output = ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
matched_segment_ids: BitVec::EMPTY,
total_row_count: metadata.total_row_count as _,
segment_row_count: metadata.segment_row_count as _,
};
let mut bitmap = Self::bitmap_full_range(&metadata);
// TODO(zhongzc): optimize the order of applying to make it quicker to return empty.
let mut appliers = Vec::with_capacity(self.fst_appliers.len());
let mut fst_ranges = Vec::with_capacity(self.fst_appliers.len());
@@ -80,7 +81,7 @@ impl IndexApplier for PredicatesIndexApplier {
}
if fst_ranges.is_empty() {
output.matched_segment_ids = Self::bitmap_full_range(&metadata);
output.matched_segment_ids = bitmap;
return Ok(output);
}
@@ -92,15 +93,14 @@ impl IndexApplier for PredicatesIndexApplier {
.collect::<Vec<_>>();
let mut mapper = ParallelFstValuesMapper::new(reader);
let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?;
let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty
for bm in bm_vec {
if bm.count_ones() == 0 {
if bitmap.count_ones() == 0 {
break;
}
bitmap.intersect(bm);
bitmap &= bm;
}
output.matched_segment_ids = bitmap;
@@ -146,12 +146,12 @@ impl PredicatesIndexApplier {
Ok(PredicatesIndexApplier { fst_appliers })
}
/// Creates a `Bitmap` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> Bitmap {
/// Creates a `BitVec` representing the full range of data in the index for initial scanning.
fn bitmap_full_range(metadata: &InvertedIndexMetas) -> BitVec {
let total_count = metadata.total_row_count;
let segment_count = metadata.segment_row_count;
let len = total_count.div_ceil(segment_count);
Bitmap::full_bitvec(len as _)
BitVec::repeat(true, len as _)
}
}
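// Worked example (illustrative): with total_row_count = 100 and segment_row_count = 16,
// the full-range bitmap spans 100.div_ceil(16) = 7 segments, all initialized to true.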
@@ -167,10 +167,10 @@ mod tests {
use std::collections::VecDeque;
use std::sync::Arc;
use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta};
use common_base::bit_vec::prelude::*;
use greptime_proto::v1::index::InvertedIndexMeta;
use super::*;
use crate::bitmap::Bitmap;
use crate::inverted_index::error::Error;
use crate::inverted_index::format::reader::MockInvertedIndexReader;
use crate::inverted_index::search::fst_apply::MockFstApplier;
@@ -190,7 +190,6 @@ mod tests {
let meta = InvertedIndexMeta {
name: s(tag),
relative_fst_offset: idx,
bitmap_type: BitmapType::Roaring.into(),
..Default::default()
};
metas.metas.insert(s(tag), meta);
@@ -230,16 +229,10 @@ mod tests {
.unwrap()])
});
mock_reader.expect_bitmap_deque().returning(|arg| {
assert_eq!(arg.len(), 1);
let range = &arg[0].0;
let bitmap_type = arg[0].1;
assert_eq!(*range, 2..3);
assert_eq!(bitmap_type, BitmapType::Roaring);
Ok(VecDeque::from([Bitmap::from_lsb0_bytes(
&[0b10101010],
bitmap_type,
)]))
mock_reader.expect_bitmap_deque().returning(|range| {
assert_eq!(range.len(), 1);
assert_eq!(range[0], 2..3);
Ok(VecDeque::from([bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]]))
});
let output = applier
.apply(SearchContext::default(), &mut mock_reader)
@@ -247,7 +240,7 @@ mod tests {
.unwrap();
assert_eq!(
output.matched_segment_ids,
Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring)
bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]
);
// An index reader with a single tag "tag-0" but without value "tag-0_value-0"
@@ -299,16 +292,12 @@ mod tests {
});
mock_reader.expect_bitmap_deque().returning(|ranges| {
let mut output = VecDeque::new();
for (range, bitmap_type) in ranges {
for range in ranges {
let offset = range.start;
let size = range.end - range.start;
match (offset, size, bitmap_type) {
(1, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type))
}
(2, 1, BitmapType::Roaring) => {
output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type))
}
match (offset, size) {
(1, 1) => output.push_back(bitvec![u8, Lsb0; 1, 0, 1, 0, 1, 0, 1, 0]),
(2, 1) => output.push_back(bitvec![u8, Lsb0; 1, 1, 0, 1, 1, 0, 1, 1]),
_ => unreachable!(),
}
}
@@ -322,7 +311,7 @@ mod tests {
.unwrap();
assert_eq!(
output.matched_segment_ids,
Bitmap::from_lsb0_bytes(&[0b10001010], BitmapType::Roaring)
bitvec![u8, Lsb0; 1, 0, 0, 0, 1, 0, 1, 0]
);
}
@@ -341,7 +330,10 @@ mod tests {
.apply(SearchContext::default(), &mut mock_reader)
.await
.unwrap();
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
); // full range to scan
}
#[tokio::test]
@@ -413,7 +405,10 @@ mod tests {
)
.await
.unwrap();
assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8));
assert_eq!(
output.matched_segment_ids,
bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1]
);
}
#[test]

View File

@@ -15,7 +15,6 @@
#![feature(iter_partition_in_place)]
#![feature(assert_matches)]
pub mod bitmap;
pub mod bloom_filter;
pub mod error;
pub mod external_provider;

23
src/ingester/Cargo.toml Normal file
View File

@@ -0,0 +1,23 @@
[package]
name = "ingester"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
clap.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datanode.workspace = true
meta-client.workspace = true
mito2.workspace = true
object-store.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
sst-convert.workspace = true
tokio.workspace = true
toml.workspace = true
[lints]
workspace = true

294
src/ingester/src/main.rs Normal file
View File

@@ -0,0 +1,294 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use clap::Parser;
use common_telemetry::info;
use common_time::timestamp::TimeUnit;
use datanode::config::StorageConfig;
use meta_client::MetaClientOptions;
use mito2::config::MitoConfig;
use mito2::sst::file::IndexType;
use mito2::sst::parquet::SstInfo;
use serde::{Deserialize, Serialize};
use sst_convert::converter::{InputFile, InputFileType, SstConverterBuilder};
use tokio::sync::oneshot;
#[derive(Parser, Debug)]
#[command(version, about = "Greptime Ingester", long_about = None)]
struct Args {
/// Input directory
#[arg(short, long)]
input_dir: String,
/// Directory of input parquet files, relative to input_dir
#[arg(short, long)]
parquet_dir: Option<String>,
/// Directory of input json files, relative to input_dir
#[arg(short, long)]
remote_write_dir: Option<String>,
/// Config file
#[arg(short, long)]
cfg: String,
/// DB HTTP address
#[arg(short, long)]
db_http_addr: String,
/// Output path for the converted SST files.
/// If it is not None, the converted SST files will be written to the specified path
/// in the `input_store`.
/// This is for debugging purposes.
#[arg(short, long)]
sst_output_path: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct IngesterConfig {
meta_client: MetaClientOptions,
storage: StorageConfig,
mito: MitoConfig,
}
pub const APP_NAME: &str = "greptime-ingester";
#[tokio::main]
async fn main() {
let _guard = common_telemetry::init_global_logging(
APP_NAME,
&Default::default(),
&Default::default(),
None,
);
let args = Args::parse();
let cfg_file = std::fs::read_to_string(&args.cfg).expect("Failed to read config file");
let cfg: IngesterConfig = toml::from_str(&cfg_file).expect("Failed to parse config");
let sst_builder = {
let mut builder = SstConverterBuilder::new_fs(args.input_dir)
.with_meta_options(cfg.meta_client)
.with_storage_config(cfg.storage)
.with_config(cfg.mito);
if let Some(output_path) = args.sst_output_path {
builder = builder.with_output_path(output_path);
}
builder
};
let sst_converter = sst_builder
.clone()
.build()
.await
.expect("Failed to build sst converter");
let input_store = sst_converter.input_store.clone();
if let Some(parquet_dir) = args.parquet_dir {
// using opendal to read parquet files in the given input object store
let all_parquets = input_store
.list(&parquet_dir)
.await
.expect("Failed to list parquet files");
info!("Listed all files in parquet directory: {:?}", all_parquets);
let all_parquets = all_parquets
.iter()
.filter(|parquet| parquet.name().ends_with(".parquet") && parquet.metadata().is_file())
.collect::<Vec<_>>();
let input_files = all_parquets
.iter()
.map(|parquet| {
let full_table_name = parquet.name().split("-").next().unwrap();
let (catalog_name, schema_name, table_name) = extract_name(full_table_name);
info!(
"catalog: {}, schema: {}, table: {}",
catalog_name, schema_name, table_name
);
InputFile {
catalog: catalog_name.to_string(),
schema: schema_name.to_string(),
table: table_name.to_string(),
path: parquet.path().to_string(),
file_type: InputFileType::Parquet,
}
})
.collect::<Vec<_>>();
convert_and_send(&input_files, sst_builder.clone(), &args.db_http_addr).await;
}
if let Some(remote_write_dir) = args.remote_write_dir {
// using opendal to read remote write parquet files in the given input object store
let all_parquets = input_store
.list(&remote_write_dir)
.await
.expect("Failed to list parquet files");
let all_parquets = all_parquets
.iter()
.filter(|parquet| parquet.name().ends_with(".parquet") && parquet.metadata().is_file())
.collect::<Vec<_>>();
let input_files = all_parquets
.iter()
.map(|parquet| {
let full_table_name = parquet.name().split("-").next().unwrap();
let (catalog_name, schema_name, table_name) = extract_name(full_table_name);
info!(
"catalog: {}, schema: {}, table: {}",
catalog_name, schema_name, table_name
);
InputFile {
catalog: catalog_name.to_string(),
schema: schema_name.to_string(),
table: table_name.to_string(),
path: parquet.path().to_string(),
file_type: InputFileType::RemoteWrite,
}
})
.collect::<Vec<_>>();
convert_and_send(&input_files, sst_builder.clone(), &args.db_http_addr).await;
}
}
async fn convert_and_send(
input_files: &[InputFile],
sst_builder: SstConverterBuilder,
db_http_addr: &str,
) {
let table_names = input_files
.iter()
.map(|f| (f.schema.clone(), f.table.clone()))
.collect::<Vec<_>>();
let mut rxs = Vec::new();
// Spawn a task for each input file
info!("Spawning tasks for {} input files", input_files.len());
for input_file in input_files.iter() {
let (tx, rx) = oneshot::channel();
let sst_builder = sst_builder.clone();
let input_file = (*input_file).clone();
tokio::task::spawn(async move {
let mut sst_converter = sst_builder
.build()
.await
.expect("Failed to build sst converter");
let sst_info = sst_converter
.convert_one(&input_file)
.await
.expect("Failed to convert parquet files");
tx.send(sst_info).unwrap();
});
rxs.push(rx);
}
let mut sst_infos = Vec::new();
for rx in rxs {
sst_infos.push(rx.await.unwrap());
}
info!("Converted {} input files", sst_infos.len());
let ingest_reqs = table_names
.iter()
.zip(sst_infos.iter())
.flat_map(|(schema_name, sst_info)| {
sst_info
.ssts
.iter()
.map(|sst| to_ingest_sst_req(&schema_name.0, &schema_name.1, sst))
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
// send ingest requests to DB
send_ingest_requests(db_http_addr, ingest_reqs)
.await
.unwrap();
}
fn extract_name(full_table_name: &str) -> (String, String, String) {
let mut names = full_table_name.split('.').rev();
let table_name = names.next().unwrap();
let schema_name = names.next().unwrap_or("public");
let catalog_name = names.next().unwrap_or("greptime");
(
catalog_name.to_string(),
schema_name.to_string(),
table_name.to_string(),
)
}
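// Illustrative sketch, not part of the original change: how `extract_name` falls
// back to the default schema ("public") and catalog ("greptime") when the file
// name does not carry a fully qualified table name. The table names are made up.
#[cfg(test)]
mod extract_name_tests {
    use super::extract_name;
    #[test]
    fn resolves_missing_parts_with_defaults() {
        assert_eq!(
            extract_name("cpu_usage"),
            (
                "greptime".to_string(),
                "public".to_string(),
                "cpu_usage".to_string()
            )
        );
        assert_eq!(
            extract_name("my_db.cpu_usage"),
            (
                "greptime".to_string(),
                "my_db".to_string(),
                "cpu_usage".to_string()
            )
        );
        assert_eq!(
            extract_name("my_catalog.my_db.cpu_usage"),
            (
                "my_catalog".to_string(),
                "my_db".to_string(),
                "cpu_usage".to_string()
            )
        );
    }
}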
async fn send_ingest_requests(
addr: &str,
reqs: Vec<ClientIngestSstRequest>,
) -> Result<(), Box<dyn std::error::Error>> {
let client = reqwest::Client::new();
for req in reqs {
info!("ingesting sst: {req:?}");
let req = client.post(addr).json(&req);
let resp = req.send().await?;
info!("ingest response: {resp:?}");
}
Ok(())
}
#[derive(Debug, Serialize, Deserialize)]
pub(crate) struct ClientIngestSstRequest {
schema: Option<String>,
table: String,
pub(crate) file_id: String,
pub(crate) min_ts: i64,
pub(crate) max_ts: i64,
pub(crate) file_size: u64,
pub(crate) rows: u32,
pub(crate) row_groups: u32,
/// Available indexes of the file.
pub available_indexes: Vec<IndexType>,
/// Size of the index file.
pub index_file_size: u64,
pub time_unit: u32,
}
fn to_ingest_sst_req(
schema_name: &str,
table_name: &str,
sst_info: &SstInfo,
) -> ClientIngestSstRequest {
let index_file_size = sst_info.index_metadata.file_size;
let available_indexes = sst_info.index_metadata.build_available_indexes();
ClientIngestSstRequest {
schema: Some(schema_name.to_string()),
table: table_name.to_string(),
file_id: sst_info.file_id.to_string(),
min_ts: sst_info.time_range.0.value(),
max_ts: sst_info.time_range.1.value(),
file_size: sst_info.file_size,
rows: sst_info.num_rows as _,
row_groups: sst_info.num_row_groups as _,
available_indexes: available_indexes.to_vec(),
index_file_size,
time_unit: match sst_info.time_range.0.unit() {
TimeUnit::Second => 0,
TimeUnit::Millisecond => 3,
TimeUnit::Microsecond => 6,
TimeUnit::Nanosecond => 9,
},
}
}
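// Illustrative sketch, not part of the original file: the `time_unit` field above
// encodes the timestamp unit as the decimal exponent of its sub-second precision
// (0 = second, 3 = millisecond, 6 = microsecond, 9 = nanosecond). A receiver could
// map it back like this; `decode_time_unit` is a made-up name for illustration.
#[allow(dead_code)]
fn decode_time_unit(code: u32) -> Option<TimeUnit> {
    match code {
        0 => Some(TimeUnit::Second),
        3 => Some(TimeUnit::Millisecond),
        6 => Some(TimeUnit::Microsecond),
        9 => Some(TimeUnit::Nanosecond),
        _ => None,
    }
}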

View File

@@ -40,17 +40,15 @@ pub enum Error {
actual: String,
},
#[snafu(display("Failed to start log store task: {}", name))]
StartWalTask {
name: String,
#[snafu(display("Failed to start log store gc task"))]
StartGcTask {
#[snafu(implicit)]
location: Location,
source: RuntimeError,
},
#[snafu(display("Failed to stop log store task: {}", name))]
StopWalTask {
name: String,
#[snafu(display("Failed to stop log store gc task"))]
StopGcTask {
#[snafu(implicit)]
location: Location,
source: RuntimeError,

View File

@@ -35,7 +35,7 @@ use common_runtime::RepeatedTask;
use raft_engine::{Config, Engine, LogBatch, ReadableSize, RecoveryMode};
use snafu::{IntoError, ResultExt};
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartWalTaskSnafu};
use crate::error::{self, Error, IoSnafu, RaftEngineSnafu, StartGcTaskSnafu};
use crate::raft_engine::log_store::PurgeExpiredFilesFunction;
pub(crate) const SYSTEM_NAMESPACE: u64 = 0;
@@ -93,8 +93,7 @@ impl RaftEngineBackend {
);
gc_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "gc_task" })?;
.context(StartGcTaskSnafu)?;
Ok(Self {
engine: RwLock::new(engine),
_gc_task: gc_task,

View File

@@ -14,6 +14,7 @@
use std::collections::{hash_map, HashMap};
use std::fmt::{Debug, Formatter};
use std::sync::atomic::{AtomicI64, Ordering};
use std::sync::Arc;
use std::time::Duration;
@@ -31,7 +32,7 @@ use store_api::storage::RegionId;
use crate::error::{
AddEntryLogBatchSnafu, DiscontinuousLogIndexSnafu, Error, FetchEntrySnafu,
IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, OverrideCompactedEntrySnafu,
RaftEngineSnafu, Result, StartWalTaskSnafu, StopWalTaskSnafu,
RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu,
};
use crate::metrics;
use crate::raft_engine::backend::SYSTEM_NAMESPACE;
@@ -45,7 +46,7 @@ pub struct RaftEngineLogStore {
read_batch_size: usize,
engine: Arc<Engine>,
gc_task: RepeatedTask<Error>,
sync_task: RepeatedTask<Error>,
last_sync_time: AtomicI64,
}
pub struct PurgeExpiredFilesFunction {
@@ -82,31 +83,6 @@ impl TaskFunction<Error> for PurgeExpiredFilesFunction {
}
}
pub struct SyncWalTaskFunction {
engine: Arc<Engine>,
}
#[async_trait::async_trait]
impl TaskFunction<Error> for SyncWalTaskFunction {
async fn call(&mut self) -> std::result::Result<(), Error> {
let engine = self.engine.clone();
if let Err(e) = tokio::task::spawn_blocking(move || engine.sync()).await {
error!(e; "Failed to sync raft engine log files");
};
Ok(())
}
fn name(&self) -> &str {
"SyncWalTaskFunction"
}
}
impl SyncWalTaskFunction {
pub fn new(engine: Arc<Engine>) -> Self {
Self { engine }
}
}
impl RaftEngineLogStore {
pub async fn try_new(dir: String, config: &RaftEngineConfig) -> Result<Self> {
let raft_engine_config = Config {
@@ -128,18 +104,13 @@ impl RaftEngineLogStore {
}),
);
let sync_task = RepeatedTask::new(
config.sync_period.unwrap_or(Duration::from_secs(5)),
Box::new(SyncWalTaskFunction::new(engine.clone())),
);
let log_store = Self {
sync_write: config.sync_write,
sync_period: config.sync_period,
read_batch_size: config.read_batch_size,
engine,
gc_task,
sync_task,
last_sync_time: AtomicI64::new(0),
};
log_store.start()?;
Ok(log_store)
@@ -152,10 +123,7 @@ impl RaftEngineLogStore {
fn start(&self) -> Result<()> {
self.gc_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "gc_task" })?;
self.sync_task
.start(common_runtime::global_runtime())
.context(StartWalTaskSnafu { name: "sync_task" })
.context(StartGcTaskSnafu)
}
fn span(&self, provider: &RaftEngineProvider) -> (Option<u64>, Option<u64>) {
@@ -252,14 +220,7 @@ impl LogStore for RaftEngineLogStore {
type Error = Error;
async fn stop(&self) -> Result<()> {
self.gc_task
.stop()
.await
.context(StopWalTaskSnafu { name: "gc_task" })?;
self.sync_task
.stop()
.await
.context(StopWalTaskSnafu { name: "sync_task" })
self.gc_task.stop().await.context(StopGcTaskSnafu)
}
/// Appends a batch of entries to logstore. `RaftEngineLogStore` assures the atomicity of
@@ -279,9 +240,20 @@ impl LogStore for RaftEngineLogStore {
}
let (mut batch, last_entry_ids) = self.entries_to_batch(entries)?;
let mut sync = self.sync_write;
if let Some(sync_period) = &self.sync_period {
let now = common_time::util::current_time_millis();
if now - self.last_sync_time.load(Ordering::Relaxed) >= sync_period.as_millis() as i64 {
self.last_sync_time.store(now, Ordering::Relaxed);
sync = true;
}
}
let _ = self
.engine
.write(&mut batch, self.sync_write)
.write(&mut batch, sync)
.context(RaftEngineSnafu)?;
Ok(AppendBatchResponse { last_entry_ids })
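// Illustrative sketch, not the actual implementation: the periodic-sync decision
// above in isolation. `last_sync_ms` stands in for the `last_sync_time` atomic and
// timestamps are plain milliseconds; a write is synced either because `sync_write`
// is enabled or because at least one `sync_period` has elapsed since the last sync.
//
// fn should_sync(sync_write: bool, sync_period: Option<Duration>, now_ms: i64, last_sync_ms: i64) -> bool {
//     if sync_write {
//         return true;
//     }
//     match sync_period {
//         Some(period) => now_ms - last_sync_ms >= period.as_millis() as i64,
//         None => false,
//     }
// }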

View File

@@ -111,7 +111,6 @@ impl MetaClientBuilder {
.enable_store()
.enable_heartbeat()
.enable_procedure()
.enable_access_cluster_info()
}
pub fn enable_heartbeat(self) -> Self {

View File

@@ -7,7 +7,6 @@ license.workspace = true
[features]
mock = []
pg_kvbackend = ["dep:tokio-postgres", "common-meta/pg_kvbackend"]
mysql_kvbackend = [] # placeholder features so CI can compile
[lints]
workspace = true

View File

@@ -335,10 +335,6 @@ impl MetricEngine {
}
}
pub fn mito(&self) -> MitoEngine {
self.inner.mito.clone()
}
pub async fn logical_regions(&self, physical_region_id: RegionId) -> Result<Vec<RegionId>> {
self.inner
.metadata_region

View File

@@ -59,7 +59,7 @@ pub mod engine;
pub mod error;
mod metadata_region;
mod metrics;
mod row_modifier;
pub mod row_modifier;
#[cfg(test)]
mod test_util;
mod utils;

View File

@@ -338,7 +338,6 @@ impl MetadataRegion {
limit: None,
series_row_selector: None,
sequence: None,
distribution: None,
}
}
@@ -528,7 +527,6 @@ impl MetadataRegion {
limit: None,
series_row_selector: None,
sequence: None,
distribution: None,
};
let record_batch_stream = self
.mito

View File

@@ -40,7 +40,7 @@ const TSID_HASH_SEED: u32 = 846793005;
///
/// - For [`PrimaryKeyEncoding::Dense`] encoding,
/// it adds two columns(`__table_id`, `__tsid`) to the row.
pub struct RowModifier {
pub(crate) struct RowModifier {
codec: SparsePrimaryKeyCodec,
}
@@ -52,7 +52,7 @@ impl RowModifier {
}
/// Modify rows with the given primary key encoding.
pub fn modify_rows(
pub(crate) fn modify_rows(
&self,
iter: RowsIter,
table_id: TableId,
@@ -145,16 +145,14 @@ impl RowModifier {
/// Fills internal columns of a row with table name and a hash of tag values.
fn fill_internal_columns(&self, table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
let mut hasher = mur3::Hasher128::with_seed(TSID_HASH_SEED);
let mut hasher = TsidGenerator::default();
for (name, value) in iter.primary_keys_with_name() {
// The type is checked before. So only null is ignored.
if let Some(ValueData::StringValue(string)) = &value.value_data {
name.hash(&mut hasher);
string.hash(&mut hasher);
hasher.write_label(name, string);
}
}
// TSID is 64 bits, simply truncate the 128 bits hash
let (hash, _) = hasher.finish128();
let hash = hasher.finish();
(
ValueData::U32Value(table_id).into(),
@@ -163,6 +161,34 @@ impl RowModifier {
}
}
/// Tsid generator.
pub struct TsidGenerator {
hasher: mur3::Hasher128,
}
impl Default for TsidGenerator {
fn default() -> Self {
Self {
hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED),
}
}
}
impl TsidGenerator {
/// Writes a label pair to the generator.
pub fn write_label(&mut self, name: &str, value: &str) {
name.hash(&mut self.hasher);
value.hash(&mut self.hasher);
}
/// Generates a new TSID.
pub fn finish(&mut self) -> u64 {
// TSID is 64 bits, simply truncate the 128 bits hash
let (hash, _) = self.hasher.finish128();
hash
}
}
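// Illustrative usage, not part of this diff: the generator is deterministic for a
// given sequence of label pairs, so the same series always maps to the same TSID.
// Callers are expected to feed labels in a stable (e.g. lexicographically sorted) order.
//
// let mut generator = TsidGenerator::default();
// generator.write_label("host", "h1");
// generator.write_label("idc", "idc-1");
// let tsid: u64 = generator.finish();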
/// Index of a value.
#[derive(Debug, Clone, Copy)]
struct ValueIndex {

View File

@@ -121,7 +121,7 @@ impl AccessLayer {
/// Writes a SST with specific `file_id` and `metadata` to the layer.
///
/// Returns the info of the SST. If no data written, returns None.
pub(crate) async fn write_sst(
pub async fn write_sst(
&self,
request: SstWriteRequest,
write_opts: &WriteOptions,
@@ -191,26 +191,26 @@ impl AccessLayer {
/// `OperationType` represents the origin of the `SstWriteRequest`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum OperationType {
pub enum OperationType {
Flush,
Compact,
}
/// Contents to build a SST.
pub(crate) struct SstWriteRequest {
pub(crate) op_type: OperationType,
pub(crate) metadata: RegionMetadataRef,
pub(crate) source: Source,
pub(crate) cache_manager: CacheManagerRef,
pub struct SstWriteRequest {
pub op_type: OperationType,
pub metadata: RegionMetadataRef,
pub source: Source,
pub cache_manager: CacheManagerRef,
#[allow(dead_code)]
pub(crate) storage: Option<String>,
pub(crate) max_sequence: Option<SequenceNumber>,
pub storage: Option<String>,
pub max_sequence: Option<SequenceNumber>,
/// Configs for index
pub(crate) index_options: IndexOptions,
pub(crate) inverted_index_config: InvertedIndexConfig,
pub(crate) fulltext_index_config: FulltextIndexConfig,
pub(crate) bloom_filter_index_config: BloomFilterConfig,
pub index_options: IndexOptions,
pub inverted_index_config: InvertedIndexConfig,
pub fulltext_index_config: FulltextIndexConfig,
pub bloom_filter_index_config: BloomFilterConfig,
}
pub(crate) async fn new_fs_cache_store(root: &str) -> Result<ObjectStore> {

View File

@@ -127,8 +127,8 @@ impl<R: InvertedIndexReader> InvertedIndexReader for CachedInvertedIndexBlobRead
mod test {
use std::num::NonZeroUsize;
use common_base::BitVec;
use futures::stream;
use index::bitmap::{Bitmap, BitmapType};
use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
use index::inverted_index::format::writer::{InvertedIndexBlobWriter, InvertedIndexWriter};
use index::Bytes;
@@ -191,44 +191,24 @@ mod test {
writer
.add_index(
"tag0".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("a"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("b"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("c"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
])),
index::bitmap::BitmapType::Roaring,
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
Bitmap::from_lsb0_bytes(&[0b0000_0001, 0b0000_0000], BitmapType::Roaring),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((
Bytes::from("x"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((
Bytes::from("y"),
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring),
)),
Ok((
Bytes::from("z"),
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring),
)),
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
])),
index::bitmap::BitmapType::Roaring,
)
.await
.unwrap();
@@ -287,31 +267,22 @@ mod test {
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = cached_reader
.bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
@@ -330,31 +301,22 @@ mod test {
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0010_0000], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = cached_reader
.bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring)
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(
bitmap,
Bitmap::from_lsb0_bytes(&[0b0000_0001], BitmapType::Roaring)
);
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// fuzz test
let mut rng = rand::thread_rng();

View File

@@ -46,6 +46,7 @@ const INDEX_CREATE_MEM_THRESHOLD_FACTOR: u64 = 16;
pub(crate) const FETCH_OPTION_TIMEOUT: Duration = Duration::from_secs(3);
/// Configuration for [MitoEngine](crate::engine::MitoEngine).
/// Before using the config, make sure to call `MitoConfig::validate()` to check if the config is valid.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
#[serde(default)]
pub struct MitoConfig {

View File

@@ -80,7 +80,6 @@ async fn test_scan_projection() {
limit: None,
series_row_selector: None,
sequence: None,
distribution: None,
};
let stream = engine.scan_to_stream(region_id, request).await.unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();

View File

@@ -42,6 +42,13 @@ use crate::worker::WorkerId;
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("External error, context: {}", context))]
External {
source: BoxedError,
context: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to encode sparse primary key, reason: {}", reason))]
EncodeSparsePrimaryKey {
reason: String,
@@ -1085,7 +1092,7 @@ impl ErrorExt for Error {
| PuffinPurgeStager { source, .. } => source.status_code(),
CleanDir { .. } => StatusCode::Unexpected,
InvalidConfig { .. } => StatusCode::InvalidArguments,
StaleLogEntry { .. } => StatusCode::Unexpected,
StaleLogEntry { .. } | External { .. } => StatusCode::Unexpected,
FilterRecordBatch { source, .. } => source.status_code(),

View File

@@ -23,8 +23,8 @@
#[cfg_attr(feature = "test", allow(unused))]
pub mod test_util;
mod access_layer;
mod cache;
pub mod access_layer;
pub mod cache;
pub mod compaction;
pub mod config;
pub mod engine;

View File

@@ -21,7 +21,6 @@ use common_time::Timestamp;
use parquet::arrow::arrow_reader::RowSelection;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;
use store_api::storage::TimeSeriesDistribution;
use crate::cache::CacheStrategy;
use crate::error::Result;
@@ -99,8 +98,8 @@ impl RangeMeta {
Self::push_seq_file_ranges(input.memtables.len(), &input.files, &mut ranges);
let ranges = group_ranges_for_seq_scan(ranges);
if compaction || input.distribution == Some(TimeSeriesDistribution::PerSeries) {
// We don't split ranges in compaction or TimeSeriesDistribution::PerSeries.
if compaction {
// We don't split ranges in compaction.
return ranges;
}
maybe_split_ranges_for_seq_scan(ranges)

View File

@@ -31,7 +31,7 @@ use datafusion_expr::Expr;
use smallvec::SmallVec;
use store_api::metadata::RegionMetadata;
use store_api::region_engine::{PartitionRange, RegionScannerRef};
use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::{ScanRequest, TimeSeriesRowSelector};
use table::predicate::{build_time_range_predicate, Predicate};
use tokio::sync::{mpsc, Semaphore};
use tokio_stream::wrappers::ReceiverStream;
@@ -287,16 +287,9 @@ impl ScanRegion {
/// Returns true if the region can use unordered scan for current request.
fn use_unordered_scan(&self) -> bool {
// We use unordered scan when:
// 1. The region is in append mode.
// 2. There is no series row selector.
// 3. The required distribution is None or TimeSeriesDistribution::TimeWindowed.
//
// If table is append only and there is no series row selector, we use unordered scan in query.
// We still use seq scan in compaction.
self.version.options.append_mode
&& self.request.series_row_selector.is_none()
&& (self.request.distribution.is_none()
|| self.request.distribution == Some(TimeSeriesDistribution::TimeWindowed))
self.version.options.append_mode && self.request.series_row_selector.is_none()
}
/// Creates a scan input.
@@ -384,8 +377,7 @@ impl ScanRegion {
.with_append_mode(self.version.options.append_mode)
.with_filter_deleted(filter_deleted)
.with_merge_mode(self.version.options.merge_mode())
.with_series_row_selector(self.request.series_row_selector)
.with_distribution(self.request.distribution);
.with_series_row_selector(self.request.series_row_selector);
Ok(input)
}
@@ -565,8 +557,6 @@ pub(crate) struct ScanInput {
pub(crate) merge_mode: MergeMode,
/// Hint to select rows from time series.
pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
/// Hint for the required distribution of the scanner.
pub(crate) distribution: Option<TimeSeriesDistribution>,
}
impl ScanInput {
@@ -591,7 +581,6 @@ impl ScanInput {
filter_deleted: true,
merge_mode: MergeMode::default(),
series_row_selector: None,
distribution: None,
}
}
@@ -704,16 +693,6 @@ impl ScanInput {
self
}
/// Sets the distribution hint.
#[must_use]
pub(crate) fn with_distribution(
mut self,
distribution: Option<TimeSeriesDistribution>,
) -> Self {
self.distribution = distribution;
self
}
/// Sets the time series row selector.
#[must_use]
pub(crate) fn with_series_row_selector(

View File

@@ -29,7 +29,7 @@ use datatypes::schema::SchemaRef;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{PartitionRange, PrepareRequest, RegionScanner, ScannerProperties};
use store_api::storage::{TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::TimeSeriesRowSelector;
use tokio::sync::Semaphore;
use crate::error::{PartitionOutOfRangeSnafu, Result};
@@ -206,16 +206,32 @@ impl SeqScan {
));
}
if self.stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
return self.scan_partition_by_series(partition);
}
let stream_ctx = self.stream_ctx.clone();
let semaphore = self.new_semaphore();
let semaphore = if self.properties.target_partitions() > self.properties.num_partitions() {
// We can use additional tasks to read the data if we have more target partitions than actual partitions.
// This semaphore is partition level.
// We don't use a global semaphore to avoid a partition waiting for others. The final concurrency
// of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of
// files in a part range.
Some(Arc::new(Semaphore::new(
self.properties.target_partitions() - self.properties.num_partitions() + 1,
)))
} else {
None
};
let partition_ranges = self.properties.partitions[partition].clone();
let compaction = self.compaction;
let distinguish_range = self.properties.distinguish_partition_range;
let part_metrics = self.new_partition_metrics(partition);
let part_metrics = PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
partition,
get_scanner_type(self.compaction),
stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
);
let stream = try_stream! {
part_metrics.on_first_poll();
@@ -305,124 +321,6 @@ impl SeqScan {
Ok(stream)
}
/// Scans all ranges in the given partition and merges them by time series.
/// Otherwise the returned stream might not contain any data.
fn scan_partition_by_series(
&self,
partition: usize,
) -> Result<SendableRecordBatchStream, BoxedError> {
let stream_ctx = self.stream_ctx.clone();
let semaphore = self.new_semaphore();
let partition_ranges = self.properties.partitions[partition].clone();
let distinguish_range = self.properties.distinguish_partition_range;
let part_metrics = self.new_partition_metrics(partition);
debug_assert!(!self.compaction);
let stream = try_stream! {
part_metrics.on_first_poll();
let range_builder_list = Arc::new(RangeBuilderList::new(
stream_ctx.input.num_memtables(),
stream_ctx.input.num_files(),
));
// Scans all parts.
let mut sources = Vec::with_capacity(partition_ranges.len());
for part_range in partition_ranges {
build_sources(
&stream_ctx,
&part_range,
false,
&part_metrics,
range_builder_list.clone(),
&mut sources,
);
}
// Builds a reader that merge sources from all parts.
let mut reader =
Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone())
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let cache = &stream_ctx.input.cache_strategy;
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
while let Some(batch) = reader
.next_batch()
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
{
metrics.scan_cost += fetch_start.elapsed();
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
debug_assert!(!batch.is_empty());
if batch.is_empty() {
continue;
}
let convert_start = Instant::now();
let record_batch = stream_ctx.input.mapper.convert(&batch, cache)?;
metrics.convert_cost += convert_start.elapsed();
let yield_start = Instant::now();
yield record_batch;
metrics.yield_cost += yield_start.elapsed();
fetch_start = Instant::now();
}
// Yields an empty part to indicate this range is terminated.
// The query engine can use this to optimize some queries.
if distinguish_range {
let yield_start = Instant::now();
yield stream_ctx.input.mapper.empty_record_batch();
metrics.yield_cost += yield_start.elapsed();
}
metrics.scan_cost += fetch_start.elapsed();
part_metrics.merge_metrics(&metrics);
part_metrics.on_finish();
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.stream_ctx.input.mapper.output_schema(),
Box::pin(stream),
));
Ok(stream)
}
fn new_semaphore(&self) -> Option<Arc<Semaphore>> {
if self.properties.target_partitions() > self.properties.num_partitions() {
// We can use additional tasks to read the data if we have more target partitions than actual partitions.
// This semaphore is partition level.
// We don't use a global semaphore to avoid a partition waiting for others. The final concurrency
// of tasks usually won't exceed the target partitions a lot as compaction can reduce the number of
// files in a part range.
Some(Arc::new(Semaphore::new(
self.properties.target_partitions() - self.properties.num_partitions() + 1,
)))
} else {
None
}
}
fn new_partition_metrics(&self, partition: usize) -> PartitionMetrics {
PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
partition,
get_scanner_type(self.compaction),
self.stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
)
}
}
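// Worked example for the semaphore sizing above (illustrative, not part of the change):
// with target_partitions = 8 and num_partitions = 3, each partition gets a semaphore of
// 8 - 3 + 1 = 6 permits; when the target does not exceed the actual partition count, no
// semaphore is created and no additional read tasks are spawned for that partition.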
impl RegionScanner for SeqScan {
@@ -472,7 +370,7 @@ impl fmt::Debug for SeqScan {
}
}
/// Builds sources for the partition range and pushes them to the `sources` vector.
/// Builds sources for the partition range.
fn build_sources(
stream_ctx: &Arc<StreamContext>,
part_range: &PartitionRange,
@@ -484,8 +382,8 @@ fn build_sources(
// Gets range meta.
let range_meta = &stream_ctx.ranges[part_range.identifier];
#[cfg(debug_assertions)]
if compaction || stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
// Compaction or per-series distribution expects that input sources have not been split.
if compaction {
// Compaction expects that input sources have not been split.
debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len());
for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() {
// It should scan all row groups.

View File

@@ -14,7 +14,7 @@
//! Mito region.
pub(crate) mod opener;
pub mod opener;
pub mod options;
pub(crate) mod version;

View File

@@ -15,7 +15,7 @@
//! Region opener.
use std::collections::HashMap;
use std::sync::atomic::AtomicI64;
use std::sync::atomic::{AtomicI64, AtomicU64};
use std::sync::Arc;
use common_telemetry::{debug, error, info, warn};
@@ -27,7 +27,9 @@ use object_store::util::{join_dir, normalize_dir};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::logstore::provider::Provider;
use store_api::logstore::LogStore;
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::region_engine::RegionRole;
use store_api::storage::{ColumnId, RegionId};
@@ -38,6 +40,7 @@ use crate::error::{
EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
Result, StaleLogEntrySnafu,
};
use crate::manifest::action::RegionManifest;
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
use crate::manifest::storage::manifest_compress_type;
use crate::memtable::time_partition::TimePartitions;
@@ -203,11 +206,16 @@ impl RegionOpener {
}
// Safety: must be set before calling this method.
let options = self.options.take().unwrap();
let object_store = self.object_store(&options.storage)?.clone();
let object_store = get_object_store(&options.storage, &self.object_store_manager)?.clone();
let provider = self.provider(&options.wal_options);
let metadata = Arc::new(metadata);
// Create a manifest manager for this region and writes regions to the manifest file.
let region_manifest_options = self.manifest_options(config, &options)?;
let region_manifest_options = Self::manifest_options(
config,
&options,
&self.region_dir,
&self.object_store_manager,
)?;
let manifest_manager = RegionManifestManager::new(
metadata.clone(),
region_manifest_options,
@@ -312,7 +320,12 @@ impl RegionOpener {
) -> Result<Option<MitoRegion>> {
let region_options = self.options.as_ref().unwrap().clone();
let region_manifest_options = self.manifest_options(config, &region_options)?;
let region_manifest_options = Self::manifest_options(
config,
&region_options,
&self.region_dir,
&self.object_store_manager,
)?;
let Some(manifest_manager) = RegionManifestManager::open(
region_manifest_options,
self.stats.total_manifest_size.clone(),
@@ -332,7 +345,7 @@ impl RegionOpener {
.take()
.unwrap_or_else(|| wal.wal_entry_reader(&provider, region_id, None));
let on_region_opened = wal.on_region_opened();
let object_store = self.object_store(&region_options.storage)?.clone();
let object_store = get_object_store(&region_options.storage, &self.object_store_manager)?;
debug!("Open region {} with options: {:?}", region_id, self.options);
@@ -422,13 +435,14 @@ impl RegionOpener {
/// Returns a new manifest options.
fn manifest_options(
&self,
config: &MitoConfig,
options: &RegionOptions,
region_dir: &str,
object_store_manager: &ObjectStoreManagerRef,
) -> Result<RegionManifestOptions> {
let object_store = self.object_store(&options.storage)?.clone();
let object_store = get_object_store(&options.storage, object_store_manager)?;
Ok(RegionManifestOptions {
manifest_dir: new_manifest_dir(&self.region_dir),
manifest_dir: new_manifest_dir(region_dir),
object_store,
// We don't allow users to set the compression algorithm as we use it as a file suffix.
// Currently, the manifest storage doesn't have good support for changing compression algorithms.
@@ -436,20 +450,72 @@ impl RegionOpener {
checkpoint_distance: config.manifest_checkpoint_distance,
})
}
}
/// Returns an object store corresponding to `name`. If `name` is `None`, this method returns the default object store.
fn object_store(&self, name: &Option<String>) -> Result<&object_store::ObjectStore> {
if let Some(name) = name {
Ok(self
.object_store_manager
.find(name)
.context(ObjectStoreNotFoundSnafu {
object_store: name.to_string(),
})?)
} else {
Ok(self.object_store_manager.default_object_store())
/// Returns an object store corresponding to `name`. If `name` is `None`, this method returns the default object store.
pub fn get_object_store(
name: &Option<String>,
object_store_manager: &ObjectStoreManagerRef,
) -> Result<object_store::ObjectStore> {
if let Some(name) = name {
Ok(object_store_manager
.find(name)
.context(ObjectStoreNotFoundSnafu {
object_store: name.to_string(),
})?
.clone())
} else {
Ok(object_store_manager.default_object_store().clone())
}
}
/// A loader for loading metadata from a region dir.
pub struct RegionMetadataLoader {
config: Arc<MitoConfig>,
object_store_manager: ObjectStoreManagerRef,
}
impl RegionMetadataLoader {
/// Creates a new `RegionMetadataLoader`.
pub fn new(config: Arc<MitoConfig>, object_store_manager: ObjectStoreManagerRef) -> Self {
Self {
config,
object_store_manager,
}
}
/// Loads the metadata of the region from the region dir.
pub async fn load(
&self,
region_dir: &str,
region_options: &RegionOptions,
) -> Result<Option<RegionMetadataRef>> {
let manifest = self.load_manifest(region_dir, region_options).await?;
Ok(manifest.map(|m| m.metadata.clone()))
}
/// Loads the manifest of the region from the region dir.
pub async fn load_manifest(
&self,
region_dir: &str,
region_options: &RegionOptions,
) -> Result<Option<Arc<RegionManifest>>> {
let region_manifest_options = RegionOpener::manifest_options(
&self.config,
region_options,
region_dir,
&self.object_store_manager,
)?;
let Some(manifest_manager) =
RegionManifestManager::open(region_manifest_options, Arc::new(AtomicU64::new(0)))
.await?
else {
return Ok(None);
};
let manifest = manifest_manager.manifest();
Ok(Some(manifest))
}
}
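// Illustrative usage sketch, not part of the change: loading region metadata straight
// from a region directory. `config`, `object_store_manager` and `region_dir` are assumed
// to be provided by the caller, and `RegionOptions::default()` is used here only for
// illustration.
//
// let loader = RegionMetadataLoader::new(config.clone(), object_store_manager.clone());
// if let Some(metadata) = loader.load(region_dir, &RegionOptions::default()).await? {
//     // `metadata` is a `RegionMetadataRef` describing the region's columns and schema.
// }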
/// Checks whether the recovered region has the same schema as region to create.

View File

@@ -33,6 +33,8 @@ use crate::row_converter::dense::SortField;
use crate::row_converter::{CompositeValues, PrimaryKeyCodec, PrimaryKeyFilter};
/// A codec for sparse key of metrics.
/// It requires that the input primary key columns are sorted by column name in lexicographical order.
/// It encodes the column id of the physical region.
#[derive(Clone, Debug)]
pub struct SparsePrimaryKeyCodec {
inner: Arc<SparsePrimaryKeyCodecInner>,

View File

@@ -16,9 +16,9 @@ pub(crate) mod bloom_filter;
mod codec;
pub(crate) mod fulltext_index;
mod indexer;
pub(crate) mod intermediate;
pub mod intermediate;
pub(crate) mod inverted_index;
pub(crate) mod puffin_manager;
pub mod puffin_manager;
mod statistics;
pub(crate) mod store;

View File

@@ -49,6 +49,11 @@ impl IntermediateManager {
/// Create a new `IntermediateManager` with the given root path.
/// It will clean up all garbage intermediate files from previous runs.
pub async fn init_fs(aux_path: impl AsRef<str>) -> Result<Self> {
common_telemetry::info!(
"Initializing intermediate manager, aux_path: {}",
aux_path.as_ref()
);
let store = new_fs_cache_store(&normalize_dir(aux_path.as_ref())).await?;
let store = InstrumentedStore::new(store);

View File

@@ -228,8 +228,8 @@ impl Drop for InvertedIndexApplier {
#[cfg(test)]
mod tests {
use common_base::BitVec;
use futures::io::Cursor;
use index::bitmap::Bitmap;
use index::inverted_index::search::index_apply::MockIndexApplier;
use object_store::services::Memory;
use puffin::puffin_manager::PuffinWriter;
@@ -259,7 +259,7 @@ mod tests {
mock_index_applier.expect_memory_usage().returning(|| 100);
mock_index_applier.expect_apply().returning(|_, _| {
Ok(ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
matched_segment_ids: BitVec::EMPTY,
total_row_count: 100,
segment_row_count: 10,
})
@@ -276,7 +276,7 @@ mod tests {
assert_eq!(
output,
ApplyOutput {
matched_segment_ids: Bitmap::new_bitvec(),
matched_segment_ids: BitVec::EMPTY,
total_row_count: 100,
segment_row_count: 10,
}

View File

@@ -277,9 +277,7 @@ impl InvertedIndexer {
let mut index_writer = InvertedIndexBlobWriter::new(tx.compat_write());
let (index_finish, puffin_add_blob) = futures::join!(
// TODO(zhongzc): config bitmap type
self.index_creator
.finish(&mut index_writer, index::bitmap::BitmapType::Roaring),
self.index_creator.finish(&mut index_writer),
puffin_writer.put_blob(INDEX_BLOB_TYPE, rx.compat(), PutOptions::default())
);

View File

@@ -61,6 +61,7 @@ impl Default for WriteOptions {
}
/// Parquet SST info returned by the writer.
#[derive(Debug)]
pub struct SstInfo {
/// SST file id.
pub file_id: FileId,

View File

@@ -583,8 +583,6 @@ type RequestBuffer = Vec<WorkerRequest>;
#[derive(Default)]
pub(crate) struct StalledRequests {
/// Stalled requests.
/// Remember to use `StalledRequests::stalled_count()` to get the total number of stalled requests
/// instead of `StalledRequests::requests.len()`.
///
/// Key: RegionId
/// Value: (estimated size, stalled requests)
@@ -619,11 +617,6 @@ impl StalledRequests {
vec![]
}
}
/// Returns the total number of all stalled requests.
pub(crate) fn stalled_count(&self) -> usize {
self.requests.values().map(|reqs| reqs.1.len()).sum()
}
}
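// Illustrative: with two regions holding 3 and 5 stalled requests respectively,
// `stalled_count()` returns 8, while `requests.len()` would only return 2
// (one entry per region), which is what the doc comment on `requests` points out.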
/// Background worker loop to handle requests.

View File

@@ -329,15 +329,6 @@ async fn edit_region(
let index_key = IndexKey::new(region_id, file_meta.file_id, FileType::Parquet);
let remote_path = location::sst_file_path(layer.region_dir(), file_meta.file_id);
let is_index_exist = file_meta.exists_index();
let index_file_size = file_meta.index_file_size();
let index_file_index_key =
IndexKey::new(region_id, file_meta.file_id, FileType::Puffin);
let index_remote_path =
location::index_file_path(layer.region_dir(), file_meta.file_id);
let file_size = file_meta.file_size;
common_runtime::spawn_global(async move {
if write_cache
@@ -354,22 +345,6 @@ async fn edit_region(
listener.on_file_cache_filled(index_key.file_id);
}
if is_index_exist {
// also download puffin file
if let Err(err) = write_cache
.download(
index_file_index_key,
&index_remote_path,
layer.object_store(),
index_file_size,
)
.await
{
common_telemetry::error!(
err; "Failed to download puffin file, region_id: {}, index_file_index_key: {:?}, index_remote_path: {}", region_id, index_file_index_key, index_remote_path
);
}
}
});
}
}

View File

@@ -147,7 +147,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) async fn handle_stalled_requests(&mut self) {
// Handle stalled requests.
let stalled = std::mem::take(&mut self.stalled_requests);
self.stalled_count.sub(stalled.stalled_count() as i64);
self.stalled_count.sub(stalled.requests.len() as i64);
// We already stalled these requests, don't stall them again.
for (_, (_, mut requests)) in stalled.requests {
self.handle_write_requests(&mut requests, false).await;
@@ -157,7 +157,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
/// Rejects all stalled requests.
pub(crate) fn reject_stalled_requests(&mut self) {
let stalled = std::mem::take(&mut self.stalled_requests);
self.stalled_count.sub(stalled.stalled_count() as i64);
self.stalled_count.sub(stalled.requests.len() as i64);
for (_, (_, mut requests)) in stalled.requests {
reject_write_requests(&mut requests);
}

View File

@@ -74,7 +74,6 @@ pub struct Inserter {
catalog_manager: CatalogManagerRef,
partition_manager: PartitionRuleManagerRef,
node_manager: NodeManagerRef,
#[allow(unused)]
table_flownode_set_cache: TableFlownodeSetCacheRef,
}
@@ -363,8 +362,6 @@ impl Inserter {
instant_requests,
} = requests;
// TODO(discord9): mirror some
// Mirror requests for source table to flownode asynchronously
let flow_mirror_task = FlowMirrorTask::new(
&self.table_flownode_set_cache,
@@ -898,14 +895,12 @@ struct CreateAlterTableResult {
table_infos: HashMap<TableId, Arc<TableInfo>>,
}
#[allow(unused)]
struct FlowMirrorTask {
requests: HashMap<Peer, RegionInsertRequests>,
num_rows: usize,
}
impl FlowMirrorTask {
#[allow(unused)]
async fn new(
cache: &TableFlownodeSetCacheRef,
requests: impl Iterator<Item = &RegionInsertRequest>,
@@ -979,7 +974,6 @@ impl FlowMirrorTask {
})
}
#[allow(unused)]
fn detach(self, node_manager: NodeManagerRef) -> Result<()> {
crate::metrics::DIST_MIRROR_PENDING_ROW_COUNT.add(self.num_rows as i64);
for (peer, inserts) in self.requests {

View File

@@ -41,7 +41,7 @@ futures.workspace = true
greptime-proto.workspace = true
itertools.workspace = true
jsonb.workspace = true
jsonpath-rust = "0.7.5"
jsonpath-rust = "0.7.3"
lazy_static.workspace = true
moka = { workspace = true, features = ["sync"] }
once_cell.workspace = true

View File

@@ -16,13 +16,10 @@ pub mod array;
pub mod map;
pub mod time;
use std::result::Result as StdResult;
pub use array::Array;
use jsonb::{Number as JsonbNumber, Object as JsonbObject, Value as JsonbValue};
use jsonpath_rust::parser::{parse_json_path, JsonPathIndex};
use jsonpath_rust::path::{JsonLike, Path};
use jsonpath_rust::{jsp_idx, jsp_obj, JsonPath, JsonPathParserError, JsonPathStr};
use jsonpath_rust::{jsp_idx, jsp_obj};
pub use map::Map;
use regex::Regex;
use snafu::{OptionExt, ResultExt};
@@ -289,52 +286,6 @@ impl Value {
_ => None,
}
}
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L779
pub fn pointer(&self, pointer: &str) -> Option<&Value> {
if pointer.is_empty() {
return Some(self);
}
if !pointer.starts_with('/') {
return None;
}
pointer
.split('/')
.skip(1)
.map(|x| x.replace("~1", "/").replace("~0", "~"))
.try_fold(self, |target, token| match target {
Value::Map(map) => map.get(&token),
Value::Array(list) => parse_index(&token).and_then(|x| list.get(x)),
_ => None,
})
}
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L834
pub fn pointer_mut(&mut self, pointer: &str) -> Option<&mut Value> {
if pointer.is_empty() {
return Some(self);
}
if !pointer.starts_with('/') {
return None;
}
pointer
.split('/')
.skip(1)
.map(|x| x.replace("~1", "/").replace("~0", "~"))
.try_fold(self, |target, token| match target {
Value::Map(map) => map.get_mut(&token),
Value::Array(list) => parse_index(&token).and_then(move |x| list.get_mut(x)),
_ => None,
})
}
}
// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L259
fn parse_index(s: &str) -> Option<usize> {
if s.starts_with('+') || (s.starts_with('0') && s.len() != 1) {
return None;
}
s.parse().ok()
}
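// Illustrative notes, not part of the original code: `pointer` and `pointer_mut`
// follow RFC 6901 escaping as implemented above, so "~1" decodes to "/" and "~0"
// to "~", while `parse_index` rejects indices with a leading '+' or leading zeros.
// For a map {"a/b": ["x", "y"]}:
//   pointer("/a~1b/1")  resolves to the element "y",
//   pointer("/a~1b/01") resolves to None,
//   pointer("")         resolves to the value itself.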
impl std::fmt::Display for Value {
@@ -863,46 +814,4 @@ impl JsonLike for Value {
fn null() -> Self {
Value::Null
}
// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L423
fn reference<T>(
&self,
path: T,
) -> std::result::Result<std::option::Option<&Value>, JsonPathParserError>
where
T: Into<JsonPathStr>,
{
Ok(self.pointer(&path_to_json_path(path.into())?))
}
// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L430
fn reference_mut<T>(
&mut self,
path: T,
) -> std::result::Result<std::option::Option<&mut Value>, JsonPathParserError>
where
T: Into<JsonPathStr>,
{
Ok(self.pointer_mut(&path_to_json_path(path.into())?))
}
}
// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L438
fn path_to_json_path(path: JsonPathStr) -> StdResult<String, JsonPathParserError> {
convert_part(&parse_json_path(path.as_str())?)
}
// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L442
fn convert_part(path: &JsonPath) -> StdResult<String, JsonPathParserError> {
match path {
JsonPath::Chain(elems) => elems
.iter()
.map(convert_part)
.collect::<StdResult<String, JsonPathParserError>>(),
JsonPath::Index(JsonPathIndex::Single(v)) => Ok(format!("/{}", v)),
JsonPath::Field(e) => Ok(format!("/{}", e)),
JsonPath::Root => Ok("".to_string()),
e => Err(JsonPathParserError::InvalidJsonPath(e.to_string())),
}
}

View File

@@ -16,7 +16,6 @@ common-macro.workspace = true
common-recordbatch.workspace = true
common-telemetry.workspace = true
datafusion.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
datatypes.workspace = true
futures.workspace = true

View File

@@ -20,7 +20,6 @@ mod holt_winters;
mod idelta;
mod predict_linear;
mod quantile;
mod quantile_aggr;
mod resets;
mod round;
#[cfg(test)]
@@ -40,7 +39,6 @@ pub use holt_winters::HoltWinters;
pub use idelta::IDelta;
pub use predict_linear::PredictLinear;
pub use quantile::QuantileOverTime;
pub use quantile_aggr::quantile_udaf;
pub use resets::Resets;
pub use round::Round;

View File

@@ -125,7 +125,7 @@ impl QuantileOverTime {
}
/// Refer to <https://github.com/prometheus/prometheus/blob/6e2905a4d4ff9b47b1f6d201333f5bd53633f921/promql/quantile.go#L357-L386>
pub(crate) fn quantile_impl(values: &[f64], quantile: f64) -> Option<f64> {
fn quantile_impl(values: &[f64], quantile: f64) -> Option<f64> {
if quantile.is_nan() || values.is_empty() {
return Some(f64::NAN);
}
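// Worked example (illustrative), matching the quantile_aggr tests shown below:
// quantile_impl(&[1.0, 2.0, 3.0, 4.0, 5.0], 0.5) yields Some(3.0) and the 0.25
// quantile yields Some(2.0), following the Prometheus interpolation linked above.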

View File

@@ -1,297 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use datafusion::arrow::array::{ArrayRef, AsArray};
use datafusion::common::cast::{as_list_array, as_primitive_array, as_struct_array};
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::{Accumulator as DfAccumulator, AggregateUDF, Volatility};
use datafusion::prelude::create_udaf;
use datafusion_common::ScalarValue;
use datatypes::arrow::array::{ListArray, StructArray};
use datatypes::arrow::datatypes::{DataType, Field, Float64Type};
use crate::functions::quantile::quantile_impl;
const QUANTILE_NAME: &str = "quantile";
const VALUES_FIELD_NAME: &str = "values";
const DEFAULT_LIST_FIELD_NAME: &str = "item";
#[derive(Debug, Default)]
pub struct QuantileAccumulator {
q: f64,
values: Vec<Option<f64>>,
}
/// Create a quantile `AggregateUDF` for PromQL quantile operator,
/// which calculates φ-quantile (0 ≤ φ ≤ 1) over dimensions
pub fn quantile_udaf(q: f64) -> Arc<AggregateUDF> {
Arc::new(create_udaf(
QUANTILE_NAME,
// Input type: (values)
vec![DataType::Float64],
// Output type: the φ-quantile
Arc::new(DataType::Float64),
Volatility::Immutable,
// Create the accumulator
Arc::new(move |_| Ok(Box::new(QuantileAccumulator::new(q)))),
// Intermediate state types
Arc::new(vec![DataType::Struct(
vec![Field::new(
VALUES_FIELD_NAME,
DataType::List(Arc::new(Field::new(
DEFAULT_LIST_FIELD_NAME,
DataType::Float64,
true,
))),
false,
)]
.into(),
)]),
))
}
impl QuantileAccumulator {
pub fn new(q: f64) -> Self {
Self {
q,
..Default::default()
}
}
}
impl DfAccumulator for QuantileAccumulator {
fn update_batch(&mut self, values: &[ArrayRef]) -> DfResult<()> {
let f64_array = values[0].as_primitive::<Float64Type>();
self.values.extend(f64_array);
Ok(())
}
fn evaluate(&mut self) -> DfResult<ScalarValue> {
let values: Vec<_> = self.values.iter().map(|v| v.unwrap_or(0.0)).collect();
let result = quantile_impl(&values, self.q);
ScalarValue::new_primitive::<Float64Type>(result, &DataType::Float64)
}
fn size(&self) -> usize {
std::mem::size_of::<Self>() + self.values.capacity() * std::mem::size_of::<Option<f64>>()
}
fn state(&mut self) -> DfResult<Vec<ScalarValue>> {
let values_array = Arc::new(ListArray::from_iter_primitive::<Float64Type, _, _>(vec![
Some(self.values.clone()),
]));
let state_struct = StructArray::new(
vec![Field::new(
VALUES_FIELD_NAME,
DataType::List(Arc::new(Field::new(
DEFAULT_LIST_FIELD_NAME,
DataType::Float64,
true,
))),
false,
)]
.into(),
vec![values_array],
None,
);
Ok(vec![ScalarValue::Struct(Arc::new(state_struct))])
}
fn merge_batch(&mut self, states: &[ArrayRef]) -> DfResult<()> {
if states.is_empty() {
return Ok(());
}
for state in states {
let state = as_struct_array(state)?;
for list in as_list_array(state.column(0))?.iter().flatten() {
let f64_array = as_primitive_array::<Float64Type>(&list)?.clone();
self.values.extend(&f64_array);
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion::arrow::array::{ArrayRef, Float64Array};
use datafusion_common::ScalarValue;
use super::*;
fn create_f64_array(values: Vec<Option<f64>>) -> ArrayRef {
Arc::new(Float64Array::from(values)) as ArrayRef
}
#[test]
fn test_quantile_accumulator_empty() {
let mut accumulator = QuantileAccumulator::new(0.5);
let result = accumulator.evaluate().unwrap();
match result {
ScalarValue::Float64(_) => (),
_ => panic!("Expected Float64 scalar value"),
}
}
#[test]
fn test_quantile_accumulator_single_value() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(10.0)]);
accumulator.update_batch(&[input]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(10.0)));
}
#[test]
fn test_quantile_accumulator_multiple_values() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]);
accumulator.update_batch(&[input]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
}
#[test]
fn test_quantile_accumulator_with_nulls() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(1.0), None, Some(3.0), Some(4.0), Some(5.0)]);
accumulator.update_batch(&[input]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
}
#[test]
fn test_quantile_accumulator_multiple_batches() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input1 = create_f64_array(vec![Some(1.0), Some(2.0)]);
let input2 = create_f64_array(vec![Some(3.0), Some(4.0), Some(5.0)]);
accumulator.update_batch(&[input1]).unwrap();
accumulator.update_batch(&[input2]).unwrap();
let result = accumulator.evaluate().unwrap();
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
}
#[test]
fn test_quantile_accumulator_different_quantiles() {
let mut min_accumulator = QuantileAccumulator::new(0.0);
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]);
min_accumulator.update_batch(&[input.clone()]).unwrap();
assert_eq!(
min_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(1.0))
);
let mut q1_accumulator = QuantileAccumulator::new(0.25);
q1_accumulator.update_batch(&[input.clone()]).unwrap();
assert_eq!(
q1_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(2.0))
);
let mut q3_accumulator = QuantileAccumulator::new(0.75);
q3_accumulator.update_batch(&[input.clone()]).unwrap();
assert_eq!(
q3_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(4.0))
);
let mut max_accumulator = QuantileAccumulator::new(1.0);
max_accumulator.update_batch(&[input]).unwrap();
assert_eq!(
max_accumulator.evaluate().unwrap(),
ScalarValue::Float64(Some(5.0))
);
}
#[test]
fn test_quantile_accumulator_size() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(1.0), Some(2.0), Some(3.0)]);
let initial_size = accumulator.size();
accumulator.update_batch(&[input]).unwrap();
let after_update_size = accumulator.size();
assert!(after_update_size >= initial_size);
}
#[test]
fn test_quantile_accumulator_state_and_merge() -> DfResult<()> {
let mut acc1 = QuantileAccumulator::new(0.5);
let input1 = create_f64_array(vec![Some(1.0), Some(2.0)]);
acc1.update_batch(&[input1])?;
let state1 = acc1.state()?;
let mut acc2 = QuantileAccumulator::new(0.5);
let input2 = create_f64_array(vec![Some(3.0), Some(4.0), Some(5.0)]);
acc2.update_batch(&[input2])?;
let mut struct_builders = vec![];
for scalar in &state1 {
if let ScalarValue::Struct(struct_array) = scalar {
struct_builders.push(struct_array.clone() as ArrayRef);
}
}
acc2.merge_batch(&struct_builders)?;
let result = acc2.evaluate()?;
assert_eq!(result, ScalarValue::Float64(Some(3.0)));
Ok(())
}
#[test]
fn test_quantile_accumulator_with_extreme_values() {
let mut accumulator = QuantileAccumulator::new(0.5);
let input = create_f64_array(vec![Some(f64::MAX), Some(f64::MIN), Some(0.0)]);
accumulator.update_batch(&[input]).unwrap();
let _result = accumulator.evaluate().unwrap();
}
#[test]
fn test_quantile_udaf_creation() {
let q = 0.5;
let udaf = quantile_udaf(q);
assert_eq!(udaf.name(), QUANTILE_NAME);
assert_eq!(udaf.return_type(&[]).unwrap(), DataType::Float64);
}
}
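A minimal, hypothetical sketch of driving the quantile UDAF above directly through DataFusion's DataFrame API, outside the PromQL planner: `quantile_udaf` is the function defined in this file, while the table name `t`, the column `v`, the in-memory batch, and the tokio runtime are illustrative assumptions, and exact signatures may differ across DataFusion versions. The median of 1.0..5.0 comes out as 3.0, matching `test_quantile_accumulator_multiple_values`.
use std::sync::Arc;
use datafusion::arrow::array::{ArrayRef, Float64Array};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::prelude::{col, SessionContext};
// Assumes `quantile_udaf` from the module above is in scope.
#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // One Float64 value column, mirroring the accumulator tests above.
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Float64, true)]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0, 5.0])) as ArrayRef],
    )?;
    let ctx = SessionContext::new();
    ctx.register_batch("t", batch)?;
    // Build the φ = 0.5 aggregate and apply it with no GROUP BY keys.
    let quantile = quantile_udaf(0.5);
    let df = ctx
        .table("t")
        .await?
        .aggregate(vec![], vec![quantile.call(vec![col("v")])])?;
    df.show().await?;
    Ok(())
}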

View File

@@ -13,7 +13,6 @@
// limitations under the License.
use datafusion::dataframe::DataFrame as DfDataFrame;
use datafusion_expr::LogicalPlan;
/// DataFrame represents a logical set of rows with the same named columns.
/// Similar to a Pandas DataFrame or Spark DataFrame
@@ -21,11 +20,3 @@ use datafusion_expr::LogicalPlan;
pub enum DataFrame {
DataFusion(DfDataFrame),
}
impl DataFrame {
pub fn into_logical_plan(self) -> LogicalPlan {
match self {
Self::DataFusion(dataframe) => dataframe.into_parts().1,
}
}
}

View File

@@ -31,7 +31,7 @@ use datatypes::arrow::datatypes::SchemaRef;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::RegionEngineRef;
use store_api::storage::{RegionId, ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::{RegionId, ScanRequest, TimeSeriesRowSelector};
use table::table::scan::RegionScanExec;
use crate::error::{GetRegionMetadataSnafu, Result};
@@ -175,10 +175,10 @@ impl TableProvider for DummyTableProvider {
let scanner = self
.engine
.handle_query(self.region_id, request.clone())
.handle_query(self.region_id, request)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
Ok(Arc::new(RegionScanExec::new(scanner, request)?))
Ok(Arc::new(RegionScanExec::new(scanner)))
}
fn supports_filters_pushdown(
@@ -233,11 +233,6 @@ impl DummyTableProvider {
self.scan_request.lock().unwrap().output_ordering = Some(order_opts.to_vec());
}
/// Sets the distribution hint of the query to the provider.
pub fn with_distribution(&self, distribution: TimeSeriesDistribution) {
self.scan_request.lock().unwrap().distribution = Some(distribution);
}
/// Sets the time series selector hint of the query to the provider.
pub fn with_time_series_selector_hint(&self, selector: TimeSeriesRowSelector) {
self.scan_request.lock().unwrap().series_row_selector = Some(selector);

View File

@@ -23,7 +23,6 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{DataFusionError, Result};
use store_api::region_engine::PartitionRange;
use store_api::storage::TimeSeriesDistribution;
use table::table::scan::RegionScanExec;
#[derive(Debug)]
@@ -66,14 +65,6 @@ impl ParallelizeScan {
return Ok(Transformed::no(plan));
}
// don't parallelize if we want per series distribution
if matches!(
region_scan_exec.distribution(),
Some(TimeSeriesDistribution::PerSeries)
) {
return Ok(Transformed::no(plan));
}
let ranges = region_scan_exec.get_partition_ranges();
let total_range_num = ranges.len();
let expected_partition_num = config.execution.target_partitions;

View File

@@ -23,7 +23,7 @@ use datafusion_common::{Column, Result};
use datafusion_expr::expr::Sort;
use datafusion_expr::{utils, Expr, LogicalPlan};
use datafusion_optimizer::{OptimizerConfig, OptimizerRule};
use store_api::storage::{TimeSeriesDistribution, TimeSeriesRowSelector};
use store_api::storage::TimeSeriesRowSelector;
use crate::dummy_catalog::DummyTableProvider;
@@ -121,36 +121,6 @@ impl ScanHintRule {
});
}
adapter.with_ordering_hint(&opts);
let mut sort_expr_cursor = order_expr.iter().filter_map(|s| s.expr.try_as_col());
let region_metadata = adapter.region_metadata();
// ignore table without pk
if region_metadata.primary_key.is_empty() {
return;
}
let mut pk_column_iter = region_metadata.primary_key_columns();
let mut curr_sort_expr = sort_expr_cursor.next();
let mut curr_pk_col = pk_column_iter.next();
while let (Some(sort_expr), Some(pk_col)) = (curr_sort_expr, curr_pk_col) {
if sort_expr.name == pk_col.column_schema.name {
curr_sort_expr = sort_expr_cursor.next();
curr_pk_col = pk_column_iter.next();
} else {
return;
}
}
let next_remaining = sort_expr_cursor.next();
match (curr_sort_expr, next_remaining) {
(Some(expr), None)
if expr.name == region_metadata.time_index_column().column_schema.name =>
{
adapter.with_distribution(TimeSeriesDistribution::PerSeries);
}
(None, _) => adapter.with_distribution(TimeSeriesDistribution::PerSeries),
(Some(_), _) => {}
}
}
fn set_time_series_row_selector_hint(

View File

@@ -188,7 +188,7 @@ impl QueryLanguageParser {
Ok(QueryStatement::Promql(eval_stmt))
}
pub fn parse_promql_timestamp(timestamp: &str) -> Result<SystemTime> {
fn parse_promql_timestamp(timestamp: &str) -> Result<SystemTime> {
// try rfc3339 format
let rfc3339_result = DateTime::parse_from_rfc3339(timestamp)
.context(ParseTimestampSnafu { raw: timestamp })
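Only the RFC 3339 branch of `parse_promql_timestamp` is visible in the hunk above. Below is a minimal, self-contained sketch of the parsing order the comment suggests; the fallback to Unix seconds is an assumption based on what the Prometheus HTTP API accepts rather than something this diff shows, and the helper name is hypothetical.
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use chrono::DateTime;
// Hypothetical helper: try RFC 3339 first, then fall back to a Unix timestamp
// in (possibly fractional) seconds.
fn parse_promql_timestamp_sketch(timestamp: &str) -> Option<SystemTime> {
    // RFC 3339, e.g. "2015-07-01T20:10:51.781Z".
    if let Ok(dt) = DateTime::parse_from_rfc3339(timestamp) {
        let millis = u64::try_from(dt.timestamp_millis()).ok()?;
        return UNIX_EPOCH.checked_add(Duration::from_millis(millis));
    }
    // Unix seconds, e.g. "1678600000" or "1678600000.5".
    let secs: f64 = timestamp.parse().ok()?;
    if !secs.is_finite() || secs < 0.0 {
        return None;
    }
    UNIX_EPOCH.checked_add(Duration::from_secs_f64(secs))
}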

View File

@@ -12,6 +12,5 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod error;
pub mod label_values;
pub(crate) mod error;
pub mod planner;

View File

@@ -1,107 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::{SystemTime, UNIX_EPOCH};
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::expr::Alias;
use datafusion_expr::utils::conjunction;
use datafusion_expr::{col, Cast, Expr, LogicalPlan, LogicalPlanBuilder};
use datafusion_sql::TableReference;
use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
use datatypes::prelude::ConcreteDataType;
use snafu::{OptionExt, ResultExt};
use table::TableRef;
use crate::promql::error::{DataFusionPlanningSnafu, Result, TimeIndexNotFoundSnafu};
fn build_time_filter(time_index_expr: Expr, start: i64, end: i64) -> Expr {
time_index_expr
.clone()
.gt_eq(Expr::Literal(ScalarValue::TimestampMillisecond(
Some(start),
None,
)))
.and(
time_index_expr.lt_eq(Expr::Literal(ScalarValue::TimestampMillisecond(
Some(end),
None,
))),
)
}
/// Rewrite label values query to DataFusion logical plan.
pub fn rewrite_label_values_query(
table: TableRef,
mut scan_plan: LogicalPlan,
mut conditions: Vec<Expr>,
label_name: String,
start: SystemTime,
end: SystemTime,
) -> Result<LogicalPlan> {
let table_ref = TableReference::partial(
table.table_info().schema_name.as_str(),
table.table_info().name.as_str(),
);
let schema = table.schema();
let ts_column = schema
.timestamp_column()
.with_context(|| TimeIndexNotFoundSnafu {
table: table.table_info().full_table_name(),
})?;
let is_time_index_ms =
ts_column.data_type == ConcreteDataType::timestamp_millisecond_datatype();
let time_index_expr = col(Column::from_name(ts_column.name.clone()));
if !is_time_index_ms {
// cast to ms if time_index not in Millisecond precision
let expr = vec![
col(Column::from_name(label_name.clone())),
Expr::Alias(Alias {
expr: Box::new(Expr::Cast(Cast {
expr: Box::new(time_index_expr.clone()),
data_type: ArrowDataType::Timestamp(ArrowTimeUnit::Millisecond, None),
})),
relation: Some(table_ref),
name: ts_column.name.clone(),
}),
];
scan_plan = LogicalPlanBuilder::from(scan_plan)
.project(expr)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
};
let start = start.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;
let end = end.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;
conditions.push(build_time_filter(time_index_expr, start, end));
// Safety: `conditions` is not empty.
let filter = conjunction(conditions).unwrap();
// Builds time filter
let logical_plan = LogicalPlanBuilder::from(scan_plan)
.filter(filter)
.context(DataFusionPlanningSnafu)?
.project(vec![col(Column::from_name(label_name))])
.context(DataFusionPlanningSnafu)?
.distinct()
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)?;
Ok(logical_plan)
}

View File

@@ -51,8 +51,8 @@ use promql::extension_plan::{
RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn,
};
use promql::functions::{
quantile_udaf, AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters,
IDelta, Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters, IDelta,
Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime,
QuantileOverTime, Rate, Resets, Round, StddevOverTime, StdvarOverTime, SumOverTime,
};
use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME};
@@ -266,10 +266,7 @@ impl PromPlanner {
aggr_expr: &AggregateExpr,
) -> Result<LogicalPlan> {
let AggregateExpr {
op,
expr,
modifier,
param,
op, expr, modifier, ..
} = aggr_expr;
let input = self.prom_expr_to_plan(expr, session_state).await?;
@@ -280,40 +277,19 @@ impl PromPlanner {
_ => {
// calculate columns to group by
// Need to append time index column into group by columns
let mut group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, true)?;
// convert op and value columns to aggregate exprs
let (aggr_exprs, prev_field_exprs) =
self.create_aggregate_exprs(*op, param, &input)?;
let aggr_exprs = self.create_aggregate_exprs(*op, &input)?;
// create plan
let builder = LogicalPlanBuilder::from(input);
let builder = if op.id() == token::T_COUNT_VALUES {
let label = Self::get_param_value_as_str(*op, param)?;
// `count_values` must be grouped by fields,
// and project the fields to the new label.
group_exprs.extend(prev_field_exprs.clone());
let project_fields = self
.create_field_column_exprs()?
.into_iter()
.chain(self.create_tag_column_exprs()?)
.chain(Some(self.create_time_index_column_expr()?))
.chain(prev_field_exprs.into_iter().map(|expr| expr.alias(label)));
builder
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
.project(project_fields)
.context(DataFusionPlanningSnafu)?
} else {
builder
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
};
let sort_expr = group_exprs.into_iter().map(|expr| expr.sort(true, false));
builder
.sort(sort_expr)
let group_sort_expr = group_exprs
.clone()
.into_iter()
.map(|expr| expr.sort(true, false));
LogicalPlanBuilder::from(input)
.aggregate(group_exprs.clone(), aggr_exprs)
.context(DataFusionPlanningSnafu)?
.sort(group_sort_expr)
.context(DataFusionPlanningSnafu)?
.build()
.context(DataFusionPlanningSnafu)
@@ -336,7 +312,18 @@ impl PromPlanner {
let group_exprs = self.agg_modifier_to_col(input.schema(), modifier, false)?;
let val = Self::get_param_value_as_f64(*op, param)?;
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: (*op).to_string(),
})?;
let PromExpr::NumberLiteral(NumberLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: (*op).to_string(),
}
.fail();
};
// convert op and value columns to window exprs.
let window_exprs = self.create_window_exprs(*op, group_exprs.clone(), &input)?;
@@ -354,7 +341,7 @@ impl PromPlanner {
let predicate = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(col(rank)),
op: Operator::LtEq,
right: Box::new(lit(val)),
right: Box::new(lit(*val)),
});
match expr {
@@ -939,7 +926,7 @@ impl PromPlanner {
Some(Offset::Neg(duration)) => -(duration.as_millis() as Millisecond),
None => 0,
};
let mut scan_filters = Self::matchers_to_expr(label_matchers.clone(), table_schema)?;
let mut scan_filters = self.matchers_to_expr(label_matchers.clone(), table_schema)?;
if let Some(time_index_filter) = self.build_time_index_filter(offset_duration)? {
scan_filters.push(time_index_filter);
}
@@ -1135,7 +1122,8 @@ impl PromPlanner {
}
// TODO(ruihang): ignore `MetricNameLabel` (`__name__`) matcher
pub fn matchers_to_expr(
fn matchers_to_expr(
&self,
label_matchers: Matchers,
table_schema: &DFSchemaRef,
) -> Result<Vec<DfExpr>> {
@@ -1943,44 +1931,32 @@ impl PromPlanner {
})
}
/// Creates a set of DataFusion `DfExpr::AggregateFunction` expressions for each value column using the specified aggregate function.
/// Create [DfExpr::AggregateFunction] expr for each value column with given aggregate function.
///
/// # Side Effects
///
/// This method modifies the value columns in the context by replacing them with the new columns
/// created by the aggregate function application.
///
/// # Returns
///
/// Returns a tuple of `(aggregate_expressions, previous_field_expressions)` where:
/// - `aggregate_expressions`: Expressions that apply the aggregate function to the original fields
/// - `previous_field_expressions`: Original field expressions before aggregation. This is non-empty
/// only when the operation is `count_values`, as this operation requires preserving the original
/// values for grouping.
/// # Side effect
///
/// This method will update value columns in context to the new value columns created by
/// aggregate function.
fn create_aggregate_exprs(
&mut self,
op: TokenType,
param: &Option<Box<PromExpr>>,
input_plan: &LogicalPlan,
) -> Result<(Vec<DfExpr>, Vec<DfExpr>)> {
) -> Result<Vec<DfExpr>> {
let aggr = match op.id() {
token::T_SUM => sum_udaf(),
token::T_QUANTILE => {
let q = Self::get_param_value_as_f64(op, param)?;
quantile_udaf(q)
}
token::T_AVG => avg_udaf(),
token::T_COUNT_VALUES | token::T_COUNT => count_udaf(),
token::T_COUNT => count_udaf(),
token::T_MIN => min_udaf(),
token::T_MAX => max_udaf(),
token::T_GROUP => grouping_udaf(),
token::T_STDDEV => stddev_pop_udaf(),
token::T_STDVAR => var_pop_udaf(),
token::T_TOPK | token::T_BOTTOMK => UnsupportedExprSnafu {
name: format!("{op:?}"),
token::T_TOPK | token::T_BOTTOMK | token::T_COUNT_VALUES | token::T_QUANTILE => {
UnsupportedExprSnafu {
name: format!("{op:?}"),
}
.fail()?
}
.fail()?,
_ => UnexpectedTokenSnafu { token: op }.fail()?,
};
@@ -1990,41 +1966,19 @@ impl PromPlanner {
.field_columns
.iter()
.map(|col| {
Ok(DfExpr::AggregateFunction(AggregateFunction {
DfExpr::AggregateFunction(AggregateFunction {
func: aggr.clone(),
args: vec![DfExpr::Column(Column::from_name(col))],
distinct: false,
filter: None,
order_by: None,
null_treatment: None,
}))
})
})
.collect::<Result<Vec<_>>>()?;
.collect();
// if the aggregator is `count_values`, it must be grouped by current fields.
let prev_field_exprs = if op.id() == token::T_COUNT_VALUES {
let prev_field_exprs: Vec<_> = self
.ctx
.field_columns
.iter()
.map(|col| DfExpr::Column(Column::from_name(col)))
.collect();
ensure!(
self.ctx.field_columns.len() == 1,
UnsupportedExprSnafu {
name: "count_values on multi-value input"
}
);
prev_field_exprs
} else {
vec![]
};
// update value column name according to the aggregators,
// update value column name according to the aggregators
let mut new_field_columns = Vec::with_capacity(self.ctx.field_columns.len());
let normalized_exprs =
normalize_cols(exprs.iter().cloned(), input_plan).context(DataFusionPlanningSnafu)?;
for expr in normalized_exprs {
@@ -2032,39 +1986,7 @@ impl PromPlanner {
}
self.ctx.field_columns = new_field_columns;
Ok((exprs, prev_field_exprs))
}
fn get_param_value_as_str(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<&str> {
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
})?;
let PromExpr::StringLiteral(StringLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
}
.fail();
};
Ok(val)
}
fn get_param_value_as_f64(op: TokenType, param: &Option<Box<PromExpr>>) -> Result<f64> {
let param = param
.as_deref()
.with_context(|| FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
})?;
let PromExpr::NumberLiteral(NumberLiteral { val }) = param else {
return FunctionInvalidArgumentSnafu {
fn_name: op.to_string(),
}
.fail();
};
Ok(*val)
Ok(exprs)
}
/// Create [DfExpr::WindowFunction] expr for each value column with given window function.
@@ -3420,6 +3342,30 @@ mod test {
do_aggregate_expr_plan("stdvar", "var_pop").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_top_k() {
do_aggregate_expr_plan("topk", "").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_bottom_k() {
do_aggregate_expr_plan("bottomk", "").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_count_values() {
do_aggregate_expr_plan("count_values", "").await;
}
#[tokio::test]
#[should_panic]
async fn aggregate_quantile() {
do_aggregate_expr_plan("quantile", "").await;
}
// TODO(ruihang): add range fn tests once exprs are ready.
// {
@@ -4302,98 +4248,4 @@ mod test {
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
#[tokio::test]
async fn test_count_values_expr() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
let case = r#"count_values('series', prometheus_tsdb_head_series{ip=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}) by (ip)"#;
let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
let table_provider = build_test_table_provider_with_fields(
&[
(
DEFAULT_SCHEMA_NAME.to_string(),
"prometheus_tsdb_head_series".to_string(),
),
(
DEFAULT_SCHEMA_NAME.to_string(),
"http_server_requests_seconds_count".to_string(),
),
],
&["ip"],
)
.await;
let plan = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state())
.await
.unwrap();
let expected = r#"Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, series [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N]
Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST, prometheus_tsdb_head_series.greptime_value ASC NULLS LAST [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N, greptime_value:Float64;N]
Projection: count(prometheus_tsdb_head_series.greptime_value), prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value AS series, prometheus_tsdb_head_series.greptime_value [count(prometheus_tsdb_head_series.greptime_value):Int64, ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), series:Float64;N, greptime_value:Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp, prometheus_tsdb_head_series.greptime_value]], aggr=[[count(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N, count(prometheus_tsdb_head_series.greptime_value):Int64]
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [false] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Sort: prometheus_tsdb_head_series.ip DESC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp DESC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Filter: prometheus_tsdb_head_series.ip ~ Utf8("(10\.0\.160\.237:8080|10\.0\.160\.237:9090)") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-1000, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100001000, None) [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]"#;
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
#[tokio::test]
async fn test_quantile_expr() {
let mut eval_stmt = EvalStmt {
expr: PromExpr::NumberLiteral(NumberLiteral { val: 1.0 }),
start: UNIX_EPOCH,
end: UNIX_EPOCH
.checked_add(Duration::from_secs(100_000))
.unwrap(),
interval: Duration::from_secs(5),
lookback_delta: Duration::from_secs(1),
};
let case = r#"quantile(0.3, sum(prometheus_tsdb_head_series{ip=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}) by (ip))"#;
let prom_expr = parser::parse(case).unwrap();
eval_stmt.expr = prom_expr;
let table_provider = build_test_table_provider_with_fields(
&[
(
DEFAULT_SCHEMA_NAME.to_string(),
"prometheus_tsdb_head_series".to_string(),
),
(
DEFAULT_SCHEMA_NAME.to_string(),
"http_server_requests_seconds_count".to_string(),
),
],
&["ip"],
)
.await;
let plan = PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_session_state())
.await
.unwrap();
let expected = r#"Sort: prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [greptime_timestamp:Timestamp(Millisecond, None), quantile(sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[quantile(sum(prometheus_tsdb_head_series.greptime_value))]] [greptime_timestamp:Timestamp(Millisecond, None), quantile(sum(prometheus_tsdb_head_series.greptime_value)):Float64;N]
Sort: prometheus_tsdb_head_series.ip ASC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp ASC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]
Aggregate: groupBy=[[prometheus_tsdb_head_series.ip, prometheus_tsdb_head_series.greptime_timestamp]], aggr=[[sum(prometheus_tsdb_head_series.greptime_value)]] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), sum(prometheus_tsdb_head_series.greptime_value):Float64;N]
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [false] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
PromSeriesDivide: tags=["ip"] [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Sort: prometheus_tsdb_head_series.ip DESC NULLS LAST, prometheus_tsdb_head_series.greptime_timestamp DESC NULLS LAST [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
Filter: prometheus_tsdb_head_series.ip ~ Utf8("(10\.0\.160\.237:8080|10\.0\.160\.237:9090)") AND prometheus_tsdb_head_series.greptime_timestamp >= TimestampMillisecond(-1000, None) AND prometheus_tsdb_head_series.greptime_timestamp <= TimestampMillisecond(100001000, None) [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]
TableScan: prometheus_tsdb_head_series [ip:Utf8, greptime_timestamp:Timestamp(Millisecond, None), greptime_value:Float64;N]"#;
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
}

View File

@@ -410,15 +410,6 @@ pub enum Error {
source: query::error::Error,
},
#[snafu(display("Failed to parse timestamp: {}", timestamp))]
ParseTimestamp {
timestamp: String,
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: query::error::Error,
},
#[snafu(display("{}", reason))]
UnexpectedResult {
reason: String,
@@ -694,8 +685,7 @@ impl ErrorExt for Error {
| PrepareStatementNotFound { .. }
| FailedToParseQuery { .. }
| InvalidElasticsearchInput { .. }
| InvalidJaegerQuery { .. }
| ParseTimestamp { .. } => StatusCode::InvalidArguments,
| InvalidJaegerQuery { .. } => StatusCode::InvalidArguments,
Catalog { source, .. } => source.status_code(),
RowWriter { source, .. } => source.status_code(),

View File

@@ -29,7 +29,7 @@ use common_time::util::{current_time_rfc3339, yesterday_rfc3339};
use common_version::OwnedBuildInfo;
use datatypes::prelude::ConcreteDataType;
use datatypes::scalars::ScalarVector;
use datatypes::vectors::Float64Vector;
use datatypes::vectors::{Float64Vector, StringVector};
use futures::future::join_all;
use futures::StreamExt;
use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME};
@@ -38,7 +38,7 @@ use promql_parser::parser::{
AggregateExpr, BinaryExpr, Call, Expr as PromqlExpr, MatrixSelector, ParenExpr, SubqueryExpr,
UnaryExpr, VectorSelector,
};
use query::parser::{PromQuery, QueryLanguageParser, DEFAULT_LOOKBACK_STRING};
use query::parser::{PromQuery, DEFAULT_LOOKBACK_STRING};
use query::promql::planner::normalize_matcher;
use serde::de::{self, MapAccess, Visitor};
use serde::{Deserialize, Serialize};
@@ -51,8 +51,8 @@ use store_api::metric_engine_consts::{
pub use super::result::prometheus_resp::PrometheusJsonResponse;
use crate::error::{
CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, ParseTimestampSnafu, Result,
TableNotFoundSnafu, UnexpectedResultSnafu,
CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, Result, TableNotFoundSnafu,
UnexpectedResultSnafu,
};
use crate::http::header::collect_plan_metrics;
use crate::prom_store::{FIELD_NAME_LABEL, METRIC_NAME_LABEL};
@@ -994,58 +994,44 @@ pub async fn label_values_query(
let start = params.start.unwrap_or_else(yesterday_rfc3339);
let end = params.end.unwrap_or_else(current_time_rfc3339);
let lookback = params
.lookback
.unwrap_or_else(|| DEFAULT_LOOKBACK_STRING.to_string());
let mut label_values = HashSet::new();
let start = try_call_return_response!(QueryLanguageParser::parse_promql_timestamp(&start)
.context(ParseTimestampSnafu { timestamp: &start }));
let end = try_call_return_response!(QueryLanguageParser::parse_promql_timestamp(&end)
.context(ParseTimestampSnafu { timestamp: &end }));
let mut merge_map = HashMap::new();
for query in queries {
let promql_expr = try_call_return_response!(promql_parser::parser::parse(&query));
let PromqlExpr::VectorSelector(VectorSelector { name, matchers, .. }) = promql_expr else {
return PrometheusJsonResponse::error(
StatusCode::InvalidArguments,
"expected vector selector",
);
let prom_query = PromQuery {
query,
start: start.clone(),
end: end.clone(),
step: DEFAULT_LOOKBACK_STRING.to_string(),
lookback: lookback.clone(),
};
let Some(name) = name else {
return PrometheusJsonResponse::error(
StatusCode::InvalidArguments,
"expected metric name",
);
};
// Only the matchers are used for filtering.
let matchers = matchers.matchers;
let result = handler
.query_label_values(
name,
label_name.to_string(),
matchers,
start,
end,
&query_ctx,
)
.await;
match result {
Ok(result) => {
label_values.extend(result.into_iter());
}
Err(err) => {
// Prometheus won't report an error when querying a nonexistent label or metric
if err.status_code() != StatusCode::TableNotFound
&& err.status_code() != StatusCode::TableColumnNotFound
{
return PrometheusJsonResponse::error(err.status_code(), err.output_msg());
}
let result = handler.do_query(&prom_query, query_ctx.clone()).await;
if let Err(err) =
retrieve_label_values(result, &label_name, &mut label_values, &mut merge_map).await
{
// Prometheus won't report an error when querying a nonexistent label or metric
if err.status_code() != StatusCode::TableNotFound
&& err.status_code() != StatusCode::TableColumnNotFound
{
return PrometheusJsonResponse::error(err.status_code(), err.output_msg());
}
}
}
let merge_map = merge_map
.into_iter()
.map(|(k, v)| (k, Value::from(v)))
.collect();
let mut label_values: Vec<_> = label_values.into_iter().collect();
label_values.sort_unstable();
PrometheusJsonResponse::success(PrometheusResponse::LabelValues(label_values))
let mut resp = PrometheusJsonResponse::success(PrometheusResponse::LabelValues(label_values));
resp.resp_metrics = merge_map;
resp
}
async fn retrieve_field_names(
@@ -1090,6 +1076,71 @@ async fn retrieve_field_names(
Ok(field_columns)
}
async fn retrieve_label_values(
result: Result<Output>,
label_name: &str,
labels_values: &mut HashSet<String>,
metrics: &mut HashMap<String, u64>,
) -> Result<()> {
let result = result?;
match result.data {
OutputData::RecordBatches(batches) => {
retrieve_label_values_from_record_batch(batches, label_name, labels_values).await
}
OutputData::Stream(stream) => {
let batches = RecordBatches::try_collect(stream)
.await
.context(CollectRecordbatchSnafu)?;
retrieve_label_values_from_record_batch(batches, label_name, labels_values).await
}
OutputData::AffectedRows(_) => UnexpectedResultSnafu {
reason: "expected data result, but got affected rows".to_string(),
}
.fail(),
}?;
if let Some(ref plan) = result.meta.plan {
collect_plan_metrics(plan, &mut [metrics]);
}
Ok(())
}
async fn retrieve_label_values_from_record_batch(
batches: RecordBatches,
label_name: &str,
labels_values: &mut HashSet<String>,
) -> Result<()> {
let Some(label_col_idx) = batches.schema().column_index_by_name(label_name) else {
return Ok(());
};
// check whether label_name belongs to tag column
match batches
.schema()
.column_schema_by_name(label_name)
.unwrap()
.data_type
{
ConcreteDataType::String(_) => {}
_ => return Ok(()),
}
for batch in batches.iter() {
let label_column = batch
.column(label_col_idx)
.as_any()
.downcast_ref::<StringVector>()
.unwrap();
for row_index in 0..batch.num_rows() {
if let Some(label_value) = label_column.get_data(row_index) {
let _ = labels_values.insert(label_value.to_string());
}
}
}
Ok(())
}
/// Tries to parse and extract the name of the referenced metric from the PromQL query.
///
/// Returns the metric name if a single metric is referenced, otherwise `None`.
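The body of this helper is not shown in the hunk. A minimal sketch of the simplest case, using the same `promql_parser` pattern that appears in the `label_values_query` diff above, could look like the following; the function name is hypothetical, and the real implementation presumably also walks nested expressions, which this sketch does not attempt.
use promql_parser::parser::{self, Expr as PromqlExpr, VectorSelector};
// Hypothetical sketch: return the metric name only when the query is a plain
// vector selector such as `http_requests_total{job="api"}`.
fn referenced_metric_name(query: &str) -> Option<String> {
    match parser::parse(query).ok()? {
        PromqlExpr::VectorSelector(VectorSelector { name, .. }) => name,
        _ => None,
    }
}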

Some files were not shown because too many files have changed in this diff.