Mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2026-01-04 04:12:55 +00:00)

Commit: Merge remote-tracking branch 'origin/main' into zhongzc/repartition-procedure-scaffold
.github/scripts/pr-review-reminder.js (vendored, 8 changed lines)

@@ -57,14 +57,6 @@
   return days;
 }
 
-// Get urgency emoji based on PR age
-function getAgeEmoji(days) {
-  if (days >= 14) return "🔴"; // 14+ days - critical
-  if (days >= 7) return "🟠"; // 7+ days - urgent
-  if (days >= 3) return "🟡"; // 3+ days - needs attention
-  return "🟢"; // < 3 days - fresh
-}
-
 // Build Slack notification message from PR list
 function buildSlackMessage(prs) {
   if (prs.length === 0) {
.github/workflows/pr-review-reminder.yml (vendored, 4 changed lines)

@@ -2,8 +2,8 @@ name: PR Review Reminder
 
 on:
   schedule:
-    # Run at 9:00 AM UTC+8 (01:00 AM UTC) every day
-    - cron: '0 1 * * *'
+    # Run at 9:00 AM UTC+8 (01:00 AM UTC) on Monday, Wednesday, Friday
+    - cron: '0 1 * * 1,3,5'
   workflow_dispatch:
 
 jobs:
Cargo.lock (generated, 835 changed lines): file diff suppressed because it is too large.

Cargo.toml (70 changed lines)
@@ -99,12 +99,12 @@ rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
 # See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
 ahash = { version = "0.8", features = ["compile-time-rng"] }
 aquamarine = "0.6"
-arrow = { version = "56.0", features = ["prettyprint"] }
-arrow-array = { version = "56.0", default-features = false, features = ["chrono-tz"] }
-arrow-buffer = "56.0"
-arrow-flight = "56.0"
-arrow-ipc = { version = "56.0", default-features = false, features = ["lz4", "zstd"] }
-arrow-schema = { version = "56.0", features = ["serde"] }
+arrow = { version = "56.2", features = ["prettyprint"] }
+arrow-array = { version = "56.2", default-features = false, features = ["chrono-tz"] }
+arrow-buffer = "56.2"
+arrow-flight = "56.2"
+arrow-ipc = { version = "56.2", default-features = false, features = ["lz4", "zstd"] }
+arrow-schema = { version = "56.2", features = ["serde"] }
 async-stream = "0.3"
 async-trait = "0.1"
 # Remember to update axum-extra, axum-macros when updating axum
@@ -123,18 +123,18 @@ clap = { version = "4.4", features = ["derive"] }
 config = "0.13.0"
 crossbeam-utils = "0.8"
 dashmap = "6.1"
-datafusion = "49"
-datafusion-common = "49"
-datafusion-expr = "49"
-datafusion-functions = "49"
-datafusion-functions-aggregate-common = "49"
-datafusion-optimizer = "49"
-datafusion-orc = { git = "https://github.com/GreptimeTeam/datafusion-orc", rev = "a0a5f902158f153119316eaeec868cff3fc8a99d" }
-datafusion-pg-catalog = { git = "https://github.com/datafusion-contrib/datafusion-postgres", rev = "3d1b7c7d5b82dd49bafc2803259365e633f654fa" }
-datafusion-physical-expr = "49"
-datafusion-physical-plan = "49"
-datafusion-sql = "49"
-datafusion-substrait = "49"
+datafusion = "50"
+datafusion-common = "50"
+datafusion-expr = "50"
+datafusion-functions = "50"
+datafusion-functions-aggregate-common = "50"
+datafusion-optimizer = "50"
+datafusion-orc = "0.5"
+datafusion-pg-catalog = "0.11"
+datafusion-physical-expr = "50"
+datafusion-physical-plan = "50"
+datafusion-sql = "50"
+datafusion-substrait = "50"
 deadpool = "0.12"
 deadpool-postgres = "0.14"
 derive_builder = "0.20"
@@ -147,7 +147,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "69a6089933daa573c96808ec4bbc48f447ec6e8c" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "14b9dc40bdc8288742b0cefc7bb024303b7429ef" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
@@ -180,7 +180,7 @@ otel-arrow-rust = { git = "https://github.com/GreptimeTeam/otel-arrow", rev = "2
     "server",
 ] }
 parking_lot = "0.12"
-parquet = { version = "56.0", default-features = false, features = ["arrow", "async", "object_store"] }
+parquet = { version = "56.2", default-features = false, features = ["arrow", "async", "object_store"] }
 paste = "1.0"
 pin-project = "1.0"
 pretty_assertions = "1.4.0"
@@ -191,7 +191,7 @@ prost-types = "0.13"
 raft-engine = { version = "0.4.1", default-features = false }
 rand = "0.9"
 ratelimit = "0.10"
-regex = "1.8"
+regex = "1.12"
 regex-automata = "0.4"
 reqwest = { version = "0.12", default-features = false, features = [
     "json",
@@ -217,10 +217,7 @@ simd-json = "0.15"
 similar-asserts = "1.6.0"
 smallvec = { version = "1", features = ["serde"] }
 snafu = "0.8"
-sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "39e4fc94c3c741981f77e9d63b5ce8c02e0a27ea", features = [
-    "visitor",
-    "serde",
-] } # branch = "v0.55.x"
+sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] }
 sqlx = { version = "0.8", features = [
     "runtime-tokio-rustls",
     "mysql",
@@ -322,16 +319,19 @@ git = "https://github.com/GreptimeTeam/greptime-meter.git"
|
||||
rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"
|
||||
|
||||
[patch.crates-io]
|
||||
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
|
||||
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
|
||||
|
||||
[profile.release]
|
||||
debug = 1
|
||||
|
||||
@@ -25,12 +25,14 @@
|
||||
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
|
||||
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
|
||||
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
|
||||
| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `http.enable_cors` | Bool | `true` | HTTP CORS support, it's turned on by default<br/>This allows browser to access http APIs without CORS restrictions |
|
||||
| `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
|
||||
| `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not valid strings. |
|
||||
| `grpc` | -- | -- | The gRPC server options. |
|
||||
| `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
|
||||
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
|
||||
| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
|
||||
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
|
||||
| `grpc.tls.mode` | String | `disable` | TLS mode. |
|
||||
@@ -235,6 +237,7 @@
|
||||
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
|
||||
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
|
||||
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
|
||||
| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `http.enable_cors` | Bool | `true` | HTTP CORS support, it's turned on by default<br/>This allows browser to access http APIs without CORS restrictions |
|
||||
| `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
|
||||
| `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not valid strings. |
|
||||
@@ -242,6 +245,7 @@
|
||||
| `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
|
||||
| `grpc.server_addr` | String | `127.0.0.1:4001` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
|
||||
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
|
||||
| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `grpc.flight_compression` | String | `arrow_ipc` | Compression mode for frontend side Arrow IPC service. Available options:<br/>- `none`: disable all compression<br/>- `transport`: only enable gRPC transport compression (zstd)<br/>- `arrow_ipc`: only enable Arrow IPC compression (lz4)<br/>- `all`: enable all compression.<br/>Default to `none` |
|
||||
| `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
|
||||
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
|
||||
|
||||
@@ -31,6 +31,10 @@ timeout = "0s"
|
||||
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
|
||||
## Set to 0 to disable limit.
|
||||
body_limit = "64MB"
|
||||
## Maximum total memory for all concurrent HTTP request bodies.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_body_memory = "1GB"
|
||||
## HTTP CORS support, it's turned on by default
|
||||
## This allows browser to access http APIs without CORS restrictions
|
||||
enable_cors = true
|
||||
@@ -54,6 +58,10 @@ bind_addr = "127.0.0.1:4001"
|
||||
server_addr = "127.0.0.1:4001"
|
||||
## The number of server worker threads.
|
||||
runtime_size = 8
|
||||
## Maximum total memory for all concurrent gRPC request messages.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_message_memory = "1GB"
|
||||
## Compression mode for frontend side Arrow IPC service. Available options:
|
||||
## - `none`: disable all compression
|
||||
## - `transport`: only enable gRPC transport compression (zstd)
|
||||
|
||||
@@ -36,6 +36,10 @@ timeout = "0s"
|
||||
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
|
||||
## Set to 0 to disable limit.
|
||||
body_limit = "64MB"
|
||||
## Maximum total memory for all concurrent HTTP request bodies.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_body_memory = "1GB"
|
||||
## HTTP CORS support, it's turned on by default
|
||||
## This allows browser to access http APIs without CORS restrictions
|
||||
enable_cors = true
|
||||
@@ -56,6 +60,10 @@ prom_validation_mode = "strict"
|
||||
bind_addr = "127.0.0.1:4001"
|
||||
## The number of server worker threads.
|
||||
runtime_size = 8
|
||||
## Maximum total memory for all concurrent gRPC request messages.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_message_memory = "1GB"
|
||||
## The maximum connection age for gRPC connection.
|
||||
## The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.
|
||||
## Refer to https://grpc.io/docs/guides/keepalive/ for more details.
|
||||
|
||||
@@ -16,8 +16,8 @@ use std::collections::HashMap;
 
 use datatypes::schema::{
     COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
-    FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions,
-    SkippingIndexType,
+    FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY,
+    SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType,
 };
 use greptime_proto::v1::{
     Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -68,6 +68,9 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
         if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
             metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned());
         }
+        if let Some(settings) = options.options.get(JSON_STRUCTURE_SETTINGS_KEY) {
+            metadata.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
+        }
     }
 
     ColumnSchema::new(&column_def.name, data_type.into(), column_def.is_nullable)
@@ -139,6 +142,11 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
             .options
             .insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone());
     }
+    if let Some(settings) = column_schema.metadata().get(JSON_STRUCTURE_SETTINGS_KEY) {
+        options
+            .options
+            .insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
+    }
 
     (!options.options.is_empty()).then_some(options)
 }
@@ -33,7 +33,6 @@ use datatypes::timestamp::TimestampMillisecond;
 use datatypes::value::Value;
 use datatypes::vectors::{
     Int64VectorBuilder, StringVectorBuilder, TimestampMillisecondVectorBuilder,
-    UInt32VectorBuilder, UInt64VectorBuilder,
 };
 use serde::Serialize;
 use snafu::ResultExt;
@@ -53,6 +52,8 @@ const PEER_ADDR: &str = "peer_addr";
 const PEER_HOSTNAME: &str = "peer_hostname";
 const TOTAL_CPU_MILLICORES: &str = "total_cpu_millicores";
 const TOTAL_MEMORY_BYTES: &str = "total_memory_bytes";
+const CPU_USAGE_MILLICORES: &str = "cpu_usage_millicores";
+const MEMORY_USAGE_BYTES: &str = "memory_usage_bytes";
 const VERSION: &str = "version";
 const GIT_COMMIT: &str = "git_commit";
 const START_TIME: &str = "start_time";
@@ -67,15 +68,17 @@ const INIT_CAPACITY: usize = 42;
/// - `peer_id`: the peer server id.
/// - `peer_type`: the peer type, such as `datanode`, `frontend`, `metasrv` etc.
/// - `peer_addr`: the peer gRPC address.
/// - `peer_hostname`: the hostname of the peer.
/// - `total_cpu_millicores`: the total CPU millicores of the peer.
/// - `total_memory_bytes`: the total memory bytes of the peer.
/// - `cpu_usage_millicores`: the CPU usage millicores of the peer.
/// - `memory_usage_bytes`: the memory usage bytes of the peer.
/// - `version`: the build package version of the peer.
/// - `git_commit`: the build git commit hash of the peer.
/// - `start_time`: the starting time of the peer.
/// - `uptime`: the uptime of the peer.
/// - `active_time`: the time since the last activity of the peer.
/// - `node_status`: the status info of the peer.
/// - `peer_hostname`: the hostname of the peer.
///
#[derive(Debug)]
pub(super) struct InformationSchemaClusterInfo {
@@ -99,12 +102,22 @@ impl InformationSchemaClusterInfo {
             ColumnSchema::new(PEER_HOSTNAME, ConcreteDataType::string_datatype(), true),
             ColumnSchema::new(
                 TOTAL_CPU_MILLICORES,
-                ConcreteDataType::uint32_datatype(),
+                ConcreteDataType::int64_datatype(),
                 false,
             ),
             ColumnSchema::new(
                 TOTAL_MEMORY_BYTES,
-                ConcreteDataType::uint64_datatype(),
+                ConcreteDataType::int64_datatype(),
                 false,
             ),
+            ColumnSchema::new(
+                CPU_USAGE_MILLICORES,
+                ConcreteDataType::int64_datatype(),
+                false,
+            ),
+            ColumnSchema::new(
+                MEMORY_USAGE_BYTES,
+                ConcreteDataType::int64_datatype(),
+                false,
+            ),
             ColumnSchema::new(VERSION, ConcreteDataType::string_datatype(), false),
@@ -167,8 +180,10 @@ struct InformationSchemaClusterInfoBuilder {
     peer_types: StringVectorBuilder,
     peer_addrs: StringVectorBuilder,
     peer_hostnames: StringVectorBuilder,
-    cpus: UInt32VectorBuilder,
-    memory_bytes: UInt64VectorBuilder,
+    total_cpu_millicores: Int64VectorBuilder,
+    total_memory_bytes: Int64VectorBuilder,
+    cpu_usage_millicores: Int64VectorBuilder,
+    memory_usage_bytes: Int64VectorBuilder,
     versions: StringVectorBuilder,
     git_commits: StringVectorBuilder,
     start_times: TimestampMillisecondVectorBuilder,
@@ -186,8 +201,10 @@ impl InformationSchemaClusterInfoBuilder {
             peer_types: StringVectorBuilder::with_capacity(INIT_CAPACITY),
             peer_addrs: StringVectorBuilder::with_capacity(INIT_CAPACITY),
             peer_hostnames: StringVectorBuilder::with_capacity(INIT_CAPACITY),
-            cpus: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
-            memory_bytes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
+            total_cpu_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
+            total_memory_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
+            cpu_usage_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
+            memory_usage_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
             versions: StringVectorBuilder::with_capacity(INIT_CAPACITY),
             git_commits: StringVectorBuilder::with_capacity(INIT_CAPACITY),
             start_times: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY),
@@ -243,8 +260,14 @@ impl InformationSchemaClusterInfoBuilder {
             self.start_times.push(None);
             self.uptimes.push(None);
         }
-        self.cpus.push(Some(node_info.cpus));
-        self.memory_bytes.push(Some(node_info.memory_bytes));
+        self.total_cpu_millicores
+            .push(Some(node_info.total_cpu_millicores));
+        self.total_memory_bytes
+            .push(Some(node_info.total_memory_bytes));
+        self.cpu_usage_millicores
+            .push(Some(node_info.cpu_usage_millicores));
+        self.memory_usage_bytes
+            .push(Some(node_info.memory_usage_bytes));
 
         if node_info.last_activity_ts > 0 {
             self.active_times.push(Some(
@@ -269,8 +292,10 @@ impl InformationSchemaClusterInfoBuilder {
             Arc::new(self.peer_types.finish()),
             Arc::new(self.peer_addrs.finish()),
             Arc::new(self.peer_hostnames.finish()),
-            Arc::new(self.cpus.finish()),
-            Arc::new(self.memory_bytes.finish()),
+            Arc::new(self.total_cpu_millicores.finish()),
+            Arc::new(self.total_memory_bytes.finish()),
+            Arc::new(self.cpu_usage_millicores.finish()),
+            Arc::new(self.memory_usage_bytes.finish()),
             Arc::new(self.versions.finish()),
             Arc::new(self.git_commits.finish()),
             Arc::new(self.start_times.finish()),
@@ -27,6 +27,7 @@ use datafusion::error::DataFusionError;
 use datafusion::execution::TaskContext;
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter;
 use datafusion_pg_catalog::pg_catalog::catalog_info::CatalogInfo;
+use datafusion_pg_catalog::pg_catalog::context::EmptyContextProvider;
 use datafusion_pg_catalog::pg_catalog::{
     PG_CATALOG_TABLES, PgCatalogSchemaProvider, PgCatalogStaticTables, PgCatalogTable,
 };
@@ -44,7 +45,7 @@ use crate::system_schema::{
 /// [`PGCatalogProvider`] is the provider for a schema named `pg_catalog`, it is not a catalog.
 pub struct PGCatalogProvider {
     catalog_name: String,
-    inner: PgCatalogSchemaProvider<CatalogManagerWrapper>,
+    inner: PgCatalogSchemaProvider<CatalogManagerWrapper, EmptyContextProvider>,
     tables: HashMap<String, TableRef>,
     table_ids: HashMap<&'static str, u32>,
 }
@@ -69,6 +70,7 @@ impl PGCatalogProvider {
                 catalog_manager,
             },
             Arc::new(static_tables),
+            EmptyContextProvider,
         )
         .expect("Failed to initialize PgCatalogSchemaProvider");
 
@@ -30,6 +30,7 @@ use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHand
|
||||
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
|
||||
use common_meta::key::TableMetadataManager;
|
||||
use common_meta::key::flow::FlowMetadataManager;
|
||||
use common_stat::ResourceStatImpl;
|
||||
use common_telemetry::info;
|
||||
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions};
|
||||
use common_version::{short_version, verbose_version};
|
||||
@@ -372,11 +373,15 @@ impl StartCommand {
|
||||
Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
|
||||
]);
|
||||
|
||||
let mut resource_stat = ResourceStatImpl::default();
|
||||
resource_stat.start_collect_cpu_usage();
|
||||
|
||||
let heartbeat_task = flow::heartbeat::HeartbeatTask::new(
|
||||
&opts,
|
||||
meta_client.clone(),
|
||||
opts.heartbeat.clone(),
|
||||
Arc::new(executor),
|
||||
Arc::new(resource_stat),
|
||||
);
|
||||
|
||||
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
|
||||
|
||||
@@ -30,6 +30,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
|
||||
use common_meta::heartbeat::handler::HandlerGroupExecutor;
|
||||
use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
|
||||
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
|
||||
use common_stat::ResourceStatImpl;
|
||||
use common_telemetry::info;
|
||||
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions};
|
||||
use common_time::timezone::set_default_timezone;
|
||||
@@ -421,11 +422,15 @@ impl StartCommand {
|
||||
Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
|
||||
]);
|
||||
|
||||
let mut resource_stat = ResourceStatImpl::default();
|
||||
resource_stat.start_collect_cpu_usage();
|
||||
|
||||
let heartbeat_task = HeartbeatTask::new(
|
||||
&opts,
|
||||
meta_client.clone(),
|
||||
opts.heartbeat.clone(),
|
||||
Arc::new(executor),
|
||||
Arc::new(resource_stat),
|
||||
);
|
||||
let heartbeat_task = Some(heartbeat_task);
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ workspace = true
 common-base.workspace = true
 common-error.workspace = true
 common-macro.workspace = true
-common-stat.workspace = true
 config.workspace = true
 humantime-serde.workspace = true
 object-store.workspace = true

@@ -14,7 +14,6 @@

pub mod config;
pub mod error;
pub mod utils;

use std::time::Duration;

(deleted file, 34 lines removed)

@@ -1,34 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use common_base::readable_size::ReadableSize;
-use common_stat::{get_total_cpu_millicores, get_total_memory_readable};
-
-/// `ResourceSpec` holds the static resource specifications of a node,
-/// such as CPU cores and memory capacity. These values are fixed
-/// at startup and do not change dynamically during runtime.
-#[derive(Debug, Clone, Copy)]
-pub struct ResourceSpec {
-    pub cpus: i64,
-    pub memory: Option<ReadableSize>,
-}
-
-impl Default for ResourceSpec {
-    fn default() -> Self {
-        Self {
-            cpus: get_total_cpu_millicores(),
-            memory: get_total_memory_readable(),
-        }
-    }
-}
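The deleted `ResourceSpec` above only captured static totals at startup. Elsewhere in this commit the frontend and flownode start commands construct a `ResourceStatImpl`, call `start_collect_cpu_usage()`, and pass it into their heartbeat tasks, and `NodeInfo` gains `cpu_usage_millicores` / `memory_usage_bytes` fields, which suggests usage is now sampled periodically rather than read once. A minimal sketch of that background-sampling pattern follows; the gauge type and the probe closure are illustrative stand-ins, not the actual common-stat API.

use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};
use std::thread;
use std::time::Duration;

#[derive(Default)]
struct CpuUsageGauge {
    millicores: AtomicI64,
}

impl CpuUsageGauge {
    /// Spawn a sampler loop; `read_usage` stands in for whatever
    /// platform-specific probe the real collector uses.
    fn start_collect(self: Arc<Self>, read_usage: impl Fn() -> i64 + Send + 'static) {
        thread::spawn(move || loop {
            self.millicores.store(read_usage(), Ordering::Relaxed);
            thread::sleep(Duration::from_secs(5));
        });
    }

    /// Heartbeat code reads the most recent sample without blocking.
    fn cpu_usage_millicores(&self) -> i64 {
        self.millicores.load(Ordering::Relaxed)
    }
}

fn main() {
    let gauge = Arc::new(CpuUsageGauge::default());
    // The probe is a stand-in; a real one would read OS counters.
    Arc::clone(&gauge).start_collect(|| 250);
    thread::sleep(Duration::from_millis(50));
    println!("cpu usage: {} millicores", gauge.cpu_usage_millicores());
}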
@@ -36,7 +36,7 @@ object_store_opendal.workspace = true
 orc-rust = { version = "0.6.3", default-features = false, features = ["async"] }
 parquet.workspace = true
 paste.workspace = true
-regex = "1.7"
+regex.workspace = true
 serde.workspace = true
 snafu.workspace = true
 strum.workspace = true

@@ -33,7 +33,7 @@ use bytes::{Buf, Bytes};
 use datafusion::datasource::physical_plan::FileOpenFuture;
 use datafusion::error::{DataFusionError, Result as DataFusionResult};
 use datafusion::physical_plan::SendableRecordBatchStream;
-use futures::StreamExt;
+use futures::{StreamExt, TryStreamExt};
 use object_store::ObjectStore;
 use snafu::ResultExt;
 use tokio_util::compat::FuturesAsyncWriteCompatExt;
@@ -179,7 +179,7 @@ pub fn open_with_decoder<T: ArrowDecoder, F: Fn() -> DataFusionResult<T>>(
             Poll::Ready(decoder.flush().transpose())
         });
 
-        Ok(stream.boxed())
+        Ok(stream.map_err(Into::into).boxed())
     }))
 }
 
@@ -51,6 +51,7 @@ nalgebra.workspace = true
 num = "0.4"
 num-traits = "0.2"
 paste.workspace = true
+regex.workspace = true
 s2 = { version = "0.0.12", optional = true }
 serde.workspace = true
 serde_json.workspace = true

@@ -22,6 +22,7 @@
 //! `foo_merge`'s input arg is the same as `foo_state`'s output, and its output is the same as `foo`'s input.
 //!
 
+use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
 use arrow::array::StructArray;
@@ -272,7 +273,7 @@ impl StateMergeHelper {
 }
 
 /// Wrapper to make an aggregate function out of a state function.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct StateWrapper {
     inner: AggregateUDF,
     name: String,
@@ -616,6 +617,20 @@ impl AggregateUDFImpl for MergeWrapper {
     }
 }
 
+impl PartialEq for MergeWrapper {
+    fn eq(&self, other: &Self) -> bool {
+        self.inner == other.inner
+    }
+}
+
+impl Eq for MergeWrapper {}
+
+impl Hash for MergeWrapper {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.inner.hash(state);
+    }
+}
+
 /// The merge accumulator, which modify `update_batch`'s behavior to accept one struct array which
 /// include the state fields of original aggregate function, and merge said states into original accumulator
 /// the output is the same as original aggregate function
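The `Hash` added to `StateWrapper`'s derive and the hand-written `PartialEq`/`Eq`/`Hash` for `MergeWrapper` both delegate identity to the wrapped `AggregateUDF`, which appears to be what the DataFusion 50 upgrade expects of UDF wrapper types. A small self-contained sketch of that delegate-to-inner pattern (toy types, not the DataFusion traits):

use std::hash::{Hash, Hasher};

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Inner {
    name: String,
}

#[derive(Debug, Clone)]
struct Wrapper {
    inner: Inner,
    // State that does not define identity (caches, counters, ...) is simply
    // left out of eq/hash below.
    call_count: u64,
}

impl PartialEq for Wrapper {
    fn eq(&self, other: &Self) -> bool {
        self.inner == other.inner
    }
}

impl Eq for Wrapper {}

impl Hash for Wrapper {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.inner.hash(state);
    }
}

fn main() {
    let a = Wrapper { inner: Inner { name: "count".into() }, call_count: 0 };
    let b = Wrapper { inner: Inner { name: "count".into() }, call_count: 7 };
    // Different bookkeeping, same identity: they compare equal and hash equally.
    assert_eq!(a, b);
}

Delegating `eq` and `hash` to the same field keeps the usual invariant that equal values hash equally, while letting incidental fields stay out of both.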
@@ -39,8 +39,7 @@ use datafusion::prelude::SessionContext;
 use datafusion_common::arrow::array::AsArray;
 use datafusion_common::arrow::datatypes::{Float64Type, UInt64Type};
 use datafusion_common::{Column, TableReference};
-use datafusion_expr::expr::AggregateFunction;
-use datafusion_expr::sqlparser::ast::NullTreatment;
+use datafusion_expr::expr::{AggregateFunction, NullTreatment};
 use datafusion_expr::{
     Aggregate, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, TableScan, lit,
 };
@@ -68,7 +68,7 @@ impl CountHash {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
 pub struct CountHash {
     signature: Signature,
 }
@@ -34,6 +34,7 @@ use crate::scalars::json::JsonFunction;
 use crate::scalars::matches::MatchesFunction;
 use crate::scalars::matches_term::MatchesTermFunction;
 use crate::scalars::math::MathFunction;
+use crate::scalars::string::register_string_functions;
 use crate::scalars::timestamp::TimestampFunction;
 use crate::scalars::uddsketch_calc::UddSketchCalcFunction;
 use crate::scalars::vector::VectorFunction as VectorScalarFunction;
@@ -154,6 +155,9 @@ pub static FUNCTION_REGISTRY: LazyLock<Arc<FunctionRegistry>> = LazyLock::new(||
     // Json related functions
     JsonFunction::register(&function_registry);
 
+    // String related functions
+    register_string_functions(&function_registry);
+
     // Vector related functions
     VectorScalarFunction::register(&function_registry);
     VectorAggrFunction::register(&function_registry);
@@ -20,6 +20,7 @@ pub mod json;
 pub mod matches;
 pub mod matches_term;
 pub mod math;
+pub(crate) mod string;
 pub mod vector;
 
 pub(crate) mod hll_count;
@@ -20,7 +20,9 @@ use common_query::error;
|
||||
use common_time::{Date, Timestamp};
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder};
|
||||
use datafusion_common::arrow::datatypes::{ArrowTimestampType, DataType, Date32Type, TimeUnit};
|
||||
use datafusion_common::arrow::datatypes::{
|
||||
ArrowTimestampType, DataType, Date32Type, Date64Type, TimeUnit,
|
||||
};
|
||||
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
|
||||
use snafu::ResultExt;
|
||||
|
||||
@@ -40,6 +42,7 @@ impl Default for DateFormatFunction {
|
||||
signature: helper::one_of_sigs2(
|
||||
vec![
|
||||
DataType::Date32,
|
||||
DataType::Date64,
|
||||
DataType::Timestamp(TimeUnit::Second, None),
|
||||
DataType::Timestamp(TimeUnit::Millisecond, None),
|
||||
DataType::Timestamp(TimeUnit::Microsecond, None),
|
||||
@@ -115,6 +118,29 @@ impl Function for DateFormatFunction {
|
||||
builder.append_option(result.as_deref());
|
||||
}
|
||||
}
|
||||
DataType::Date64 => {
|
||||
let left = left.as_primitive::<Date64Type>();
|
||||
for i in 0..size {
|
||||
let date = left.is_valid(i).then(|| {
|
||||
let ms = left.value(i);
|
||||
Timestamp::new_millisecond(ms)
|
||||
});
|
||||
let format = formats.is_valid(i).then(|| formats.value(i));
|
||||
|
||||
let result = match (date, format) {
|
||||
(Some(ts), Some(fmt)) => {
|
||||
Some(ts.as_formatted_string(fmt, Some(timezone)).map_err(|e| {
|
||||
DataFusionError::Execution(format!(
|
||||
"cannot format {ts:?} as '{fmt}': {e}"
|
||||
))
|
||||
})?)
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
|
||||
builder.append_option(result.as_deref());
|
||||
}
|
||||
}
|
||||
x => {
|
||||
return Err(DataFusionError::Execution(format!(
|
||||
"unsupported input data type {x}"
|
||||
@@ -137,7 +163,9 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::Field;
|
||||
use datafusion_common::arrow::array::{Date32Array, StringArray, TimestampSecondArray};
|
||||
use datafusion_common::arrow::array::{
|
||||
Date32Array, Date64Array, StringArray, TimestampSecondArray,
|
||||
};
|
||||
use datafusion_common::config::ConfigOptions;
|
||||
use datafusion_expr::{TypeSignature, Volatility};
|
||||
|
||||
@@ -166,7 +194,7 @@ mod tests {
|
||||
Signature {
|
||||
type_signature: TypeSignature::OneOf(sigs),
|
||||
volatility: Volatility::Immutable
|
||||
} if sigs.len() == 5));
|
||||
} if sigs.len() == 6));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -213,6 +241,50 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_date64_date_format() {
|
||||
let f = DateFormatFunction::default();
|
||||
|
||||
let dates = vec![Some(123000), None, Some(42000), None];
|
||||
let formats = vec![
|
||||
"%Y-%m-%d %T.%3f",
|
||||
"%Y-%m-%d %T.%3f",
|
||||
"%Y-%m-%d %T.%3f",
|
||||
"%Y-%m-%d %T.%3f",
|
||||
];
|
||||
let results = [
|
||||
Some("1970-01-01 00:02:03.000"),
|
||||
None,
|
||||
Some("1970-01-01 00:00:42.000"),
|
||||
None,
|
||||
];
|
||||
|
||||
let mut config_options = ConfigOptions::default();
|
||||
config_options.extensions.insert(FunctionContext::default());
|
||||
let config_options = Arc::new(config_options);
|
||||
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(Arc::new(Date64Array::from(dates))),
|
||||
ColumnarValue::Array(Arc::new(StringArray::from_iter_values(formats))),
|
||||
],
|
||||
arg_fields: vec![],
|
||||
number_rows: 4,
|
||||
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
|
||||
config_options,
|
||||
};
|
||||
let result = f
|
||||
.invoke_with_args(args)
|
||||
.and_then(|x| x.to_array(4))
|
||||
.unwrap();
|
||||
let vector = result.as_string_view();
|
||||
|
||||
assert_eq!(4, vector.len());
|
||||
for (actual, expect) in vector.iter().zip(results) {
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_date_date_format() {
|
||||
let f = DateFormatFunction::default();
|
||||
|
||||
@@ -76,7 +76,7 @@ impl Function for GeohashFunction {
|
||||
}
|
||||
|
||||
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
|
||||
Ok(DataType::Utf8)
|
||||
Ok(DataType::Utf8View)
|
||||
}
|
||||
|
||||
fn signature(&self) -> &Signature {
|
||||
@@ -176,7 +176,7 @@ impl Function for GeohashNeighboursFunction {
|
||||
Ok(DataType::List(Arc::new(Field::new(
|
||||
"item",
|
||||
DataType::Utf8View,
|
||||
false,
|
||||
true,
|
||||
))))
|
||||
}
|
||||
|
||||
|
||||
@@ -355,9 +355,9 @@ impl Function for H3CellCenterLatLng {
|
||||
|
||||
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
|
||||
Ok(DataType::List(Arc::new(Field::new(
|
||||
"x",
|
||||
"item",
|
||||
DataType::Float64,
|
||||
false,
|
||||
true,
|
||||
))))
|
||||
}
|
||||
|
||||
|
||||
src/common/function/src/scalars/string.rs (new file, 26 lines)

@@ -0,0 +1,26 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! String scalar functions
+
+mod regexp_extract;
+
+pub(crate) use regexp_extract::RegexpExtractFunction;
+
+use crate::function_registry::FunctionRegistry;
+
+/// Register all string functions
+pub fn register_string_functions(registry: &FunctionRegistry) {
+    RegexpExtractFunction::register(registry);
+}
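The `regexp_extract` implementation that follows compiles user-supplied patterns at query time, so it guards compilation with a maximum pattern length and caps on the compiled program and lazy-DFA sizes. A short sketch of that bounded-compilation approach, assuming the regex crate's `RegexBuilder` (the limit values here are illustrative, not the exact constants used below):

use regex::{Regex, RegexBuilder};

// Compile an untrusted pattern with hard caps so a hostile pattern cannot
// blow up memory during compilation or matching.
fn compile_bounded(pattern: &str) -> Result<Regex, String> {
    const MAX_PATTERN_LEN: usize = 10_000; // reject absurdly long patterns up front
    const MAX_REGEX_SIZE: usize = 1 << 20;  // cap the compiled program's heap usage
    const MAX_DFA_SIZE: usize = 2 << 20;    // cap the lazy DFA cache

    if pattern.len() > MAX_PATTERN_LEN {
        return Err(format!("pattern too long ({} bytes)", pattern.len()));
    }
    RegexBuilder::new(pattern)
        .size_limit(MAX_REGEX_SIZE)
        .dfa_size_limit(MAX_DFA_SIZE)
        .build()
        .map_err(|e| format!("invalid pattern '{pattern}': {e}"))
}

fn main() {
    assert!(compile_bounded(r"\d+\.\d+\.\d+").is_ok());
    assert!(compile_bounded(&"a".repeat(20_000)).is_err());
}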
src/common/function/src/scalars/string/regexp_extract.rs (new file, 339 lines)
@@ -0,0 +1,339 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Implementation of REGEXP_EXTRACT function
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
|
||||
use datafusion_common::arrow::compute::cast;
|
||||
use datafusion_common::arrow::datatypes::DataType;
|
||||
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
|
||||
use regex::{Regex, RegexBuilder};
|
||||
|
||||
use crate::function::Function;
|
||||
use crate::function_registry::FunctionRegistry;
|
||||
|
||||
const NAME: &str = "regexp_extract";
|
||||
|
||||
// Safety limits
|
||||
const MAX_REGEX_SIZE: usize = 1024 * 1024; // compiled regex heap cap
|
||||
const MAX_DFA_SIZE: usize = 2 * 1024 * 1024; // lazy DFA cap
|
||||
const MAX_TOTAL_RESULT_SIZE: usize = 64 * 1024 * 1024; // total batch cap
|
||||
const MAX_SINGLE_MATCH: usize = 1024 * 1024; // per-row cap
|
||||
const MAX_PATTERN_LEN: usize = 10_000; // pattern text length cap
|
||||
|
||||
/// REGEXP_EXTRACT function implementation
|
||||
/// Extracts the first substring matching the given regular expression pattern.
|
||||
/// If no match is found, returns NULL.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct RegexpExtractFunction {
|
||||
signature: Signature,
|
||||
}
|
||||
|
||||
impl RegexpExtractFunction {
|
||||
pub fn register(registry: &FunctionRegistry) {
|
||||
registry.register_scalar(RegexpExtractFunction::default());
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RegexpExtractFunction {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
signature: Signature::one_of(
|
||||
vec![
|
||||
TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8]),
|
||||
TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8View]),
|
||||
TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8View]),
|
||||
TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8View]),
|
||||
TypeSignature::Exact(vec![DataType::Utf8View, DataType::LargeUtf8]),
|
||||
TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8]),
|
||||
TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8]),
|
||||
TypeSignature::Exact(vec![DataType::Utf8, DataType::LargeUtf8]),
|
||||
TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for RegexpExtractFunction {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}", NAME.to_ascii_uppercase())
|
||||
}
|
||||
}
|
||||
|
||||
impl Function for RegexpExtractFunction {
|
||||
fn name(&self) -> &str {
|
||||
NAME
|
||||
}
|
||||
|
||||
// Always return LargeUtf8 for simplicity and safety
|
||||
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
|
||||
Ok(DataType::LargeUtf8)
|
||||
}
|
||||
|
||||
fn signature(&self) -> &Signature {
|
||||
&self.signature
|
||||
}
|
||||
|
||||
fn invoke_with_args(
|
||||
&self,
|
||||
args: ScalarFunctionArgs,
|
||||
) -> datafusion_common::Result<ColumnarValue> {
|
||||
if args.args.len() != 2 {
|
||||
return Err(DataFusionError::Execution(
|
||||
"REGEXP_EXTRACT requires exactly two arguments (text, pattern)".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// Keep original ColumnarValue variants for scalar-pattern fast path
|
||||
let pattern_is_scalar = matches!(args.args[1], ColumnarValue::Scalar(_));
|
||||
|
||||
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
|
||||
let text_array = &arrays[0];
|
||||
let pattern_array = &arrays[1];
|
||||
|
||||
// Cast both to LargeUtf8 for uniform access (supports Utf8/Utf8View/Dictionary<String>)
|
||||
let text_large = cast(text_array.as_ref(), &DataType::LargeUtf8).map_err(|e| {
|
||||
DataFusionError::Execution(format!("REGEXP_EXTRACT: text cast failed: {e}"))
|
||||
})?;
|
||||
let pattern_large = cast(pattern_array.as_ref(), &DataType::LargeUtf8).map_err(|e| {
|
||||
DataFusionError::Execution(format!("REGEXP_EXTRACT: pattern cast failed: {e}"))
|
||||
})?;
|
||||
|
||||
let text = text_large.as_string::<i64>();
|
||||
let pattern = pattern_large.as_string::<i64>();
|
||||
let len = text.len();
|
||||
|
||||
// Pre-size result builder with conservative estimate
|
||||
let mut estimated_total = 0usize;
|
||||
for i in 0..len {
|
||||
if !text.is_null(i) {
|
||||
estimated_total = estimated_total.saturating_add(text.value_length(i) as usize);
|
||||
if estimated_total > MAX_TOTAL_RESULT_SIZE {
|
||||
return Err(DataFusionError::ResourcesExhausted(format!(
|
||||
"REGEXP_EXTRACT total output exceeds {} bytes",
|
||||
MAX_TOTAL_RESULT_SIZE
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut builder = LargeStringBuilder::with_capacity(len, estimated_total);
|
||||
|
||||
// Fast path: if pattern is scalar, compile once
|
||||
let compiled_scalar: Option<Regex> = if pattern_is_scalar && len > 0 && !pattern.is_null(0)
|
||||
{
|
||||
Some(compile_regex_checked(pattern.value(0))?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
for i in 0..len {
|
||||
if text.is_null(i) || pattern.is_null(i) {
|
||||
builder.append_null();
|
||||
continue;
|
||||
}
|
||||
|
||||
let s = text.value(i);
|
||||
let pat = pattern.value(i);
|
||||
|
||||
// Compile or reuse regex
|
||||
let re = if let Some(ref compiled) = compiled_scalar {
|
||||
compiled
|
||||
} else {
|
||||
// TODO: For performance-critical applications with repeating patterns,
|
||||
// consider adding a small LRU cache here
|
||||
&compile_regex_checked(pat)?
|
||||
};
|
||||
|
||||
// First match only
|
||||
if let Some(m) = re.find(s) {
|
||||
let m_str = m.as_str();
|
||||
if m_str.len() > MAX_SINGLE_MATCH {
|
||||
return Err(DataFusionError::Execution(
|
||||
"REGEXP_EXTRACT match exceeds per-row limit (1MB)".to_string(),
|
||||
));
|
||||
}
|
||||
builder.append_value(m_str);
|
||||
} else {
|
||||
builder.append_null();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
|
||||
}
|
||||
}
|
||||
|
||||
// Compile a regex with safety checks
|
||||
fn compile_regex_checked(pattern: &str) -> datafusion_common::Result<Regex> {
|
||||
if pattern.len() > MAX_PATTERN_LEN {
|
||||
return Err(DataFusionError::Execution(format!(
|
||||
"REGEXP_EXTRACT pattern too long (> {} chars)",
|
||||
MAX_PATTERN_LEN
|
||||
)));
|
||||
}
|
||||
RegexBuilder::new(pattern)
|
||||
.size_limit(MAX_REGEX_SIZE)
|
||||
.dfa_size_limit(MAX_DFA_SIZE)
|
||||
.build()
|
||||
.map_err(|e| {
|
||||
DataFusionError::Execution(format!("REGEXP_EXTRACT invalid pattern '{}': {e}", pattern))
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use datafusion_common::arrow::array::StringArray;
|
||||
use datafusion_common::arrow::datatypes::Field;
|
||||
use datafusion_expr::ScalarFunctionArgs;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_regexp_extract_function_basic() {
|
||||
let text_array = Arc::new(StringArray::from(vec!["version 1.2.3", "no match here"]));
|
||||
let pattern_array = Arc::new(StringArray::from(vec!["\\d+\\.\\d+\\.\\d+", "\\d+"]));
|
||||
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(text_array),
|
||||
ColumnarValue::Array(pattern_array),
|
||||
],
|
||||
arg_fields: vec![
|
||||
Arc::new(Field::new("arg_0", DataType::Utf8, false)),
|
||||
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
|
||||
],
|
||||
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
|
||||
number_rows: 2,
|
||||
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
|
||||
};
|
||||
|
||||
let function = RegexpExtractFunction::default();
|
||||
let result = function.invoke_with_args(args).unwrap();
|
||||
|
||||
if let ColumnarValue::Array(array) = result {
|
||||
let string_array = array.as_string::<i64>();
|
||||
assert_eq!(string_array.value(0), "1.2.3");
|
||||
assert!(string_array.is_null(1)); // no match should return NULL
|
||||
} else {
|
||||
panic!("Expected array result");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_regexp_extract_phone_number() {
|
||||
let text_array = Arc::new(StringArray::from(vec!["Phone: 123-456-7890", "No phone"]));
|
||||
let pattern_array = Arc::new(StringArray::from(vec![
|
||||
"\\d{3}-\\d{3}-\\d{4}",
|
||||
"\\d{3}-\\d{3}-\\d{4}",
|
||||
]));
|
||||
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(text_array),
|
||||
ColumnarValue::Array(pattern_array),
|
||||
],
|
||||
arg_fields: vec![
|
||||
Arc::new(Field::new("arg_0", DataType::Utf8, false)),
|
||||
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
|
||||
],
|
||||
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
|
||||
number_rows: 2,
|
||||
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
|
||||
};
|
||||
|
||||
let function = RegexpExtractFunction::default();
|
||||
let result = function.invoke_with_args(args).unwrap();
|
||||
|
||||
if let ColumnarValue::Array(array) = result {
|
||||
let string_array = array.as_string::<i64>();
|
||||
assert_eq!(string_array.value(0), "123-456-7890");
|
||||
assert!(string_array.is_null(1)); // no match should return NULL
|
||||
} else {
|
||||
panic!("Expected array result");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_regexp_extract_email() {
|
||||
let text_array = Arc::new(StringArray::from(vec![
|
||||
"Email: user@domain.com",
|
||||
"Invalid email",
|
||||
]));
|
||||
let pattern_array = Arc::new(StringArray::from(vec![
|
||||
"[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+",
|
||||
"[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+",
|
||||
]));
|
||||
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(text_array),
|
||||
ColumnarValue::Array(pattern_array),
|
||||
],
|
||||
arg_fields: vec![
|
||||
Arc::new(Field::new("arg_0", DataType::Utf8, false)),
|
||||
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
|
||||
],
|
||||
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
|
||||
number_rows: 2,
|
||||
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
|
||||
};
|
||||
|
||||
let function = RegexpExtractFunction::default();
|
||||
let result = function.invoke_with_args(args).unwrap();
|
||||
|
||||
if let ColumnarValue::Array(array) = result {
|
||||
let string_array = array.as_string::<i64>();
|
||||
assert_eq!(string_array.value(0), "user@domain.com");
|
||||
assert!(string_array.is_null(1)); // no match should return NULL
|
||||
} else {
|
||||
panic!("Expected array result");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_regexp_extract_with_nulls() {
|
||||
let text_array = Arc::new(StringArray::from(vec![Some("test 123"), None]));
|
||||
let pattern_array = Arc::new(StringArray::from(vec![Some("\\d+"), Some("\\d+")]));
|
||||
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(text_array),
|
||||
ColumnarValue::Array(pattern_array),
|
||||
],
|
||||
arg_fields: vec![
|
||||
Arc::new(Field::new("arg_0", DataType::Utf8, true)),
|
||||
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
|
||||
],
|
||||
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
|
||||
number_rows: 2,
|
||||
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
|
||||
};
|
||||
|
||||
let function = RegexpExtractFunction::default();
|
||||
let result = function.invoke_with_args(args).unwrap();
|
||||
|
||||
if let ColumnarValue::Array(array) = result {
|
||||
let string_array = array.as_string::<i64>();
|
||||
assert_eq!(string_array.value(0), "123");
|
||||
assert!(string_array.is_null(1)); // NULL input should return NULL
|
||||
} else {
|
||||
panic!("Expected array result");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -14,6 +14,7 @@
 
 use std::any::Any;
 use std::fmt::{Debug, Formatter};
+use std::hash::{Hash, Hasher};
 
 use datafusion::arrow::datatypes::DataType;
 use datafusion::logical_expr::{ScalarFunctionArgs, ScalarUDFImpl};
@@ -33,6 +34,20 @@ impl Debug for ScalarUdf {
     }
 }
 
+impl PartialEq for ScalarUdf {
+    fn eq(&self, other: &Self) -> bool {
+        self.function.signature() == other.function.signature()
+    }
+}
+
+impl Eq for ScalarUdf {}
+
+impl Hash for ScalarUdf {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.function.signature().hash(state)
+    }
+}
+
 impl ScalarUDFImpl for ScalarUdf {
     fn as_any(&self) -> &dyn Any {
         self
@@ -32,10 +32,36 @@ use crate::system::define_nullary_udf;
|
||||
const CURRENT_SCHEMA_FUNCTION_NAME: &str = "current_schema";
|
||||
const CURRENT_SCHEMAS_FUNCTION_NAME: &str = "current_schemas";
|
||||
const SESSION_USER_FUNCTION_NAME: &str = "session_user";
|
||||
const CURRENT_DATABASE_FUNCTION_NAME: &str = "current_database";
|
||||
|
||||
define_nullary_udf!(CurrentSchemaFunction);
|
||||
define_nullary_udf!(CurrentSchemasFunction);
|
||||
define_nullary_udf!(SessionUserFunction);
|
||||
define_nullary_udf!(CurrentDatabaseFunction);
|
||||
|
||||
impl Function for CurrentDatabaseFunction {
|
||||
fn name(&self) -> &str {
|
||||
CURRENT_DATABASE_FUNCTION_NAME
|
||||
}
|
||||
|
||||
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
|
||||
Ok(DataType::Utf8View)
|
||||
}
|
||||
|
||||
fn signature(&self) -> &Signature {
|
||||
&self.signature
|
||||
}
|
||||
|
||||
fn invoke_with_args(
|
||||
&self,
|
||||
args: ScalarFunctionArgs,
|
||||
) -> datafusion_common::Result<ColumnarValue> {
|
||||
let func_ctx = find_function_context(&args)?;
|
||||
let db = func_ctx.query_ctx.current_catalog().to_string();
|
||||
|
||||
Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(db))))
|
||||
}
|
||||
}
|
||||
|
||||
// Though "current_schema" can be aliased to "database", to not cause any breaking changes,
|
||||
// we are not doing it: not until https://github.com/apache/datafusion/issues/17469 is resolved.
|
||||
@@ -141,6 +167,7 @@ impl PGCatalogFunction {
|
||||
registry.register_scalar(CurrentSchemaFunction::default());
|
||||
registry.register_scalar(CurrentSchemasFunction::default());
|
||||
registry.register_scalar(SessionUserFunction::default());
|
||||
registry.register_scalar(CurrentDatabaseFunction::default());
|
||||
registry.register(pg_catalog::format_type::create_format_type_udf());
|
||||
registry.register(pg_catalog::create_pg_get_partkeydef_udf());
|
||||
registry.register(pg_catalog::has_privilege_udf::create_has_privilege_udf(
|
||||
|
||||
@@ -345,6 +345,20 @@ fn build_struct(
                 Ok(datafusion_expr::ColumnarValue::Array(result_vector.to_arrow_array()))
             }
         }
+
+        impl PartialEq for #name {
+            fn eq(&self, other: &Self) -> bool {
+                self.signature == other.signature
+            }
+        }
+
+        impl Eq for #name {}
+
+        impl std::hash::Hash for #name {
+            fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+                self.signature.hash(state)
+            }
+        }
     }
     .into()
 }
@@ -120,10 +120,16 @@ pub struct NodeInfo {
     pub start_time_ms: u64,
     // The node build cpus
     #[serde(default)]
-    pub cpus: u32,
+    pub total_cpu_millicores: i64,
     // The node build memory bytes
     #[serde(default)]
-    pub memory_bytes: u64,
+    pub total_memory_bytes: i64,
+    // The node build cpu usage millicores
+    #[serde(default)]
+    pub cpu_usage_millicores: i64,
+    // The node build memory usage bytes
+    #[serde(default)]
+    pub memory_usage_bytes: i64,
     // The node build hostname
     #[serde(default)]
     pub hostname: String,
@@ -333,8 +339,10 @@ mod tests {
             version: "".to_string(),
             git_commit: "".to_string(),
             start_time_ms: 1,
-            cpus: 0,
-            memory_bytes: 0,
+            total_cpu_millicores: 0,
+            total_memory_bytes: 0,
+            cpu_usage_millicores: 0,
+            memory_usage_bytes: 0,
             hostname: "test_hostname".to_string(),
         };
 
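The renamed and newly added `NodeInfo` fields all carry `#[serde(default)]`, so a heartbeat payload serialized by an older node that lacks them still deserializes, with the missing numbers falling back to zero. A tiny sketch of that behavior with a toy struct (not the real `NodeInfo`):

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Info {
    hostname: String,
    // Absent in payloads from older nodes; defaults to 0 instead of erroring.
    #[serde(default)]
    cpu_usage_millicores: i64,
}

fn main() {
    let old_payload = r#"{ "hostname": "node-1" }"#;
    let info: Info = serde_json::from_str(old_payload).unwrap();
    assert_eq!(info.cpu_usage_millicores, 0);
}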
@@ -55,6 +55,10 @@ impl Display for RegionIdent {
/// The result of downgrading a leader region.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct DowngradeRegionReply {
    /// The [RegionId].
    /// For compatibility, it defaults to [RegionId::new(0, 0)].
    #[serde(default)]
    pub region_id: RegionId,
    /// Returns the `last_entry_id` if available.
    pub last_entry_id: Option<u64>,
    /// Returns the `metadata_last_entry_id` if available (only available for the metric engine).
@@ -423,14 +427,60 @@ pub enum Instruction {
    CloseRegions(Vec<RegionIdent>),
    /// Upgrades a region.
    UpgradeRegion(UpgradeRegion),
    #[serde(
        deserialize_with = "single_or_multiple_from",
        alias = "DowngradeRegion"
    )]
    /// Downgrades a region.
    DowngradeRegion(DowngradeRegion),
    DowngradeRegions(Vec<DowngradeRegion>),
    /// Invalidates batch cache.
    InvalidateCaches(Vec<CacheIdent>),
    /// Flushes regions.
    FlushRegions(FlushRegions),
}
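
The `single_or_multiple_from` helper (its definition is outside this hunk) together with the `DowngradeRegion` alias keeps older metasrv payloads decodable. A minimal sketch of what that compatibility path accepts, reusing the sample values from the deserialization test further down in this file:

    // Old senders emit a single object; it still decodes into the batched variant.
    let legacy = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#;
    let instruction: Instruction = serde_json::from_str(legacy).unwrap();
    assert_eq!(
        instruction,
        Instruction::DowngradeRegions(vec![DowngradeRegion {
            region_id: RegionId::new(1024, 1),
            flush_timeout: Some(Duration::from_millis(1000)),
        }])
    );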

impl Instruction {
    /// Converts the instruction into a vector of [OpenRegion].
    pub fn into_open_regions(self) -> Option<Vec<OpenRegion>> {
        match self {
            Self::OpenRegions(open_regions) => Some(open_regions),
            _ => None,
        }
    }

    /// Converts the instruction into a vector of [RegionIdent].
    pub fn into_close_regions(self) -> Option<Vec<RegionIdent>> {
        match self {
            Self::CloseRegions(close_regions) => Some(close_regions),
            _ => None,
        }
    }

    /// Converts the instruction into a [FlushRegions].
    pub fn into_flush_regions(self) -> Option<FlushRegions> {
        match self {
            Self::FlushRegions(flush_regions) => Some(flush_regions),
            _ => None,
        }
    }

    /// Converts the instruction into a vector of [DowngradeRegion].
    pub fn into_downgrade_regions(self) -> Option<Vec<DowngradeRegion>> {
        match self {
            Self::DowngradeRegions(downgrade_region) => Some(downgrade_region),
            _ => None,
        }
    }

    /// Converts the instruction into an [UpgradeRegion].
    pub fn into_upgrade_regions(self) -> Option<UpgradeRegion> {
        match self {
            Self::UpgradeRegion(upgrade_region) => Some(upgrade_region),
            _ => None,
        }
    }
}

/// The reply of [UpgradeRegion].
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionReply {
@@ -452,6 +502,39 @@ impl Display for UpgradeRegionReply {
    }
}

#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct DowngradeRegionsReply {
    pub replies: Vec<DowngradeRegionReply>,
}

impl DowngradeRegionsReply {
    pub fn new(replies: Vec<DowngradeRegionReply>) -> Self {
        Self { replies }
    }

    pub fn single(reply: DowngradeRegionReply) -> Self {
        Self::new(vec![reply])
    }
}

#[derive(Deserialize)]
#[serde(untagged)]
enum DowngradeRegionsCompat {
    Single(DowngradeRegionReply),
    Multiple(DowngradeRegionsReply),
}

fn downgrade_regions_compat_from<'de, D>(deserializer: D) -> Result<DowngradeRegionsReply, D::Error>
where
    D: Deserializer<'de>,
{
    let helper = DowngradeRegionsCompat::deserialize(deserializer)?;
    Ok(match helper {
        DowngradeRegionsCompat::Single(x) => DowngradeRegionsReply::new(vec![x]),
        DowngradeRegionsCompat::Multiple(reply) => reply,
    })
}
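
Because the `DowngradeRegions` reply variant below carries `alias = "downgrade_region"` and routes through this compat deserializer, replies from older datanodes still decode. A hedged sketch mirroring the serde tests further down (same sample payloads, nothing new):

    let legacy = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#;
    let current = r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#;
    let a: InstructionReply = serde_json::from_str(legacy).unwrap();
    let b: InstructionReply = serde_json::from_str(current).unwrap();
    // Both forms decode to DowngradeRegions carrying a single reply.
    assert_eq!(a, b);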

#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum InstructionReply {
@@ -460,7 +543,11 @@ pub enum InstructionReply {
    #[serde(alias = "close_region")]
    CloseRegions(SimpleReply),
    UpgradeRegion(UpgradeRegionReply),
    DowngradeRegion(DowngradeRegionReply),
    #[serde(
        alias = "downgrade_region",
        deserialize_with = "downgrade_regions_compat_from"
    )]
    DowngradeRegions(DowngradeRegionsReply),
    FlushRegions(FlushRegionReply),
}

@@ -470,8 +557,8 @@ impl Display for InstructionReply {
            Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply),
            Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply),
            Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply),
            Self::DowngradeRegion(reply) => {
                write!(f, "InstructionReply::DowngradeRegion({})", reply)
            Self::DowngradeRegions(reply) => {
                write!(f, "InstructionReply::DowngradeRegions({:?})", reply)
            }
            Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply),
        }
@@ -493,6 +580,27 @@ impl InstructionReply {
            _ => panic!("Expected OpenRegions reply"),
        }
    }

    pub fn expect_upgrade_region_reply(self) -> UpgradeRegionReply {
        match self {
            Self::UpgradeRegion(reply) => reply,
            _ => panic!("Expected UpgradeRegion reply"),
        }
    }

    pub fn expect_downgrade_regions_reply(self) -> Vec<DowngradeRegionReply> {
        match self {
            Self::DowngradeRegions(reply) => reply.replies,
            _ => panic!("Expected DowngradeRegions reply"),
        }
    }

    pub fn expect_flush_regions_reply(self) -> FlushRegionReply {
        match self {
            Self::FlushRegions(reply) => reply,
            _ => panic!("Expected FlushRegions reply"),
        }
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -532,11 +640,27 @@ mod tests {
|
||||
r#"{"CloseRegions":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#,
|
||||
serialized
|
||||
);
|
||||
|
||||
let downgrade_region = InstructionReply::DowngradeRegions(DowngradeRegionsReply::single(
|
||||
DowngradeRegionReply {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: None,
|
||||
},
|
||||
));
|
||||
|
||||
let serialized = serde_json::to_string(&downgrade_region).unwrap();
|
||||
assert_eq!(
|
||||
r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#,
|
||||
serialized
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_deserialize_instruction() {
|
||||
let open_region_instruction = r#"{"OpenRegion":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#;
|
||||
let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#;
|
||||
let open_region_instruction: Instruction =
|
||||
serde_json::from_str(open_region_instruction).unwrap();
|
||||
let open_region = Instruction::OpenRegions(vec![OpenRegion::new(
|
||||
@@ -553,7 +677,7 @@ mod tests {
|
||||
)]);
|
||||
assert_eq!(open_region_instruction, open_region);
|
||||
|
||||
let close_region_instruction = r#"{"CloseRegion":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#;
|
||||
let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#;
|
||||
let close_region_instruction: Instruction =
|
||||
serde_json::from_str(close_region_instruction).unwrap();
|
||||
let close_region = Instruction::CloseRegions(vec![RegionIdent {
|
||||
@@ -564,6 +688,15 @@ mod tests {
|
||||
}]);
|
||||
assert_eq!(close_region_instruction, close_region);
|
||||
|
||||
let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#;
|
||||
let downgrade_region_instruction: Instruction =
|
||||
serde_json::from_str(downgrade_region_instruction).unwrap();
|
||||
let downgrade_region = Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
flush_timeout: Some(Duration::from_millis(1000)),
|
||||
}]);
|
||||
assert_eq!(downgrade_region_instruction, downgrade_region);
|
||||
|
||||
let close_region_instruction_reply =
|
||||
r#"{"result":true,"error":null,"type":"close_region"}"#;
|
||||
let close_region_instruction_reply: InstructionReply =
|
||||
@@ -582,6 +715,20 @@ mod tests {
|
||||
error: None,
|
||||
});
|
||||
assert_eq!(open_region_instruction_reply, open_region_reply);
|
||||
|
||||
let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#;
|
||||
let downgrade_region_instruction_reply: InstructionReply =
|
||||
serde_json::from_str(downgrade_region_instruction_reply).unwrap();
|
||||
let downgrade_region_reply = InstructionReply::DowngradeRegions(
|
||||
DowngradeRegionsReply::single(DowngradeRegionReply {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: None,
|
||||
}),
|
||||
);
|
||||
assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -6,11 +6,14 @@ license.workspace = true

[dependencies]
common-base.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
lazy_static.workspace = true
nix.workspace = true
num_cpus.workspace = true
prometheus.workspace = true
sysinfo.workspace = true
tokio.workspace = true

[lints]
workspace = true

@@ -117,7 +117,10 @@ pub fn get_cpu_limit_from_cgroups() -> Option<i64> {
    None
}

fn get_cpu_usage() -> Option<i64> {
/// Get the cumulative CPU usage (`usage_usec`, in microseconds) from the cgroups filesystem.
///
/// - Returns `None` if not running in a cgroups v2 environment or if the CPU usage cannot be read.
pub fn get_cpu_usage_from_cgroups() -> Option<i64> {
    // In certain bare-metal environments, the `/sys/fs/cgroup/cpu.stat` file may be present and reflect system-wide CPU usage rather than container-specific metrics.
    // To ensure accurate collection of container-level CPU usage, verify the existence of the `/sys/fs/cgroup/memory.current` file.
    // The presence of this file typically indicates execution within a containerized environment, thereby validating the relevance of the collected CPU usage data.
@@ -142,6 +145,22 @@ fn get_cpu_usage() -> Option<i64> {
    fields[1].trim().parse::<i64>().ok()
}

// Calculate the CPU usage in millicores from the cgroups filesystem.
//
// - Returns `0` if the current CPU usage equals the last CPU usage or the interval is 0.
pub(crate) fn calculate_cpu_usage(
    current_cpu_usage_usecs: i64,
    last_cpu_usage_usecs: i64,
    interval_milliseconds: i64,
) -> i64 {
    let diff = current_cpu_usage_usecs - last_cpu_usage_usecs;
    if diff > 0 && interval_milliseconds > 0 {
        ((diff as f64 / interval_milliseconds as f64).round() as i64).max(1)
    } else {
        0
    }
}
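
The arithmetic is simply "accumulated `usage_usec` delta divided by elapsed milliseconds", which lands directly in millicores. A small sanity sketch (plain arithmetic, no extra assumptions):

    // One core fully busy for 5s accrues 5_000_000 usecs in cpu.stat's usage_usec.
    assert_eq!(calculate_cpu_usage(5_000_000, 0, 5_000), 1000); // one full core
    assert_eq!(calculate_cpu_usage(2_500_000, 0, 5_000), 500);  // half a core
    assert_eq!(calculate_cpu_usage(1_000, 1_000, 5_000), 0);    // no progress -> 0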

// Check whether the cgroup is v2.
// - Return `true` if the cgroup is v2, otherwise return `false`.
// - Return `None` if the detection fails or not on linux.
@@ -230,7 +249,7 @@ impl Collector for CgroupsMetricsCollector {
    }

    fn collect(&self) -> Vec<MetricFamily> {
        if let Some(cpu_usage) = get_cpu_usage() {
        if let Some(cpu_usage) = get_cpu_usage_from_cgroups() {
            self.cpu_usage.set(cpu_usage);
        }

@@ -13,66 +13,7 @@
// limitations under the License.

mod cgroups;
mod resource;

pub use cgroups::*;
use common_base::readable_size::ReadableSize;
use sysinfo::System;

/// Get the total CPU in millicores.
pub fn get_total_cpu_millicores() -> i64 {
    // Get CPU limit from cgroups filesystem.
    if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() {
        cgroup_cpu_limit
    } else {
        // Get total CPU cores from host system.
        num_cpus::get() as i64 * 1000
    }
}

/// Get the total memory in bytes.
pub fn get_total_memory_bytes() -> i64 {
    // Get memory limit from cgroups filesystem.
    if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() {
        cgroup_memory_limit
    } else {
        // Get total memory from host system.
        if sysinfo::IS_SUPPORTED_SYSTEM {
            let mut sys_info = System::new();
            sys_info.refresh_memory();
            sys_info.total_memory() as i64
        } else {
            // If the system is not supported, return -1.
            -1
        }
    }
}

/// Get the total CPU cores. The result will be rounded to the nearest integer.
/// For example, if the total CPU is 1.5 cores (1500 millicores), the result will be 2.
pub fn get_total_cpu_cores() -> usize {
    ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize
}

/// Get the total memory in readable size.
pub fn get_total_memory_readable() -> Option<ReadableSize> {
    if get_total_memory_bytes() > 0 {
        Some(ReadableSize(get_total_memory_bytes() as u64))
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_total_cpu_cores() {
        assert!(get_total_cpu_cores() > 0);
    }

    #[test]
    fn test_get_total_memory_readable() {
        assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0));
    }
}
pub use resource::*;

187
src/common/stat/src/resource.rs
Normal file
@@ -0,0 +1,187 @@

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};
use std::time::Duration;

use common_base::readable_size::ReadableSize;
use common_runtime::JoinHandle;
use common_telemetry::info;
use sysinfo::System;
use tokio::time::sleep;

use crate::cgroups::calculate_cpu_usage;
use crate::{
    get_cpu_limit_from_cgroups, get_cpu_usage_from_cgroups, get_memory_limit_from_cgroups,
    get_memory_usage_from_cgroups,
};

/// Get the total CPU in millicores. If the CPU limit is unset, it returns the total CPU cores of the host system.
pub fn get_total_cpu_millicores() -> i64 {
    // Get CPU limit from cgroups filesystem.
    if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() {
        cgroup_cpu_limit
    } else {
        // Get total CPU cores from host system.
        num_cpus::get() as i64 * 1000
    }
}

/// Get the total memory in bytes. If the memory limit is unset, it returns the total memory of the host system.
/// If the host's total memory cannot be determined, it returns 0.
pub fn get_total_memory_bytes() -> i64 {
    // Get memory limit from cgroups filesystem.
    if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() {
        cgroup_memory_limit
    } else {
        // Get total memory from host system.
        if sysinfo::IS_SUPPORTED_SYSTEM {
            let mut sys_info = System::new();
            sys_info.refresh_memory();
            sys_info.total_memory() as i64
        } else {
            // If the system is not supported, return 0.
            0
        }
    }
}

/// Get the total CPU cores. The result will be rounded to the nearest integer.
/// For example, if the total CPU is 1.5 cores (1500 millicores), the result will be 2.
pub fn get_total_cpu_cores() -> usize {
    ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize
}

/// Get the total memory in readable size.
pub fn get_total_memory_readable() -> Option<ReadableSize> {
    if get_total_memory_bytes() > 0 {
        Some(ReadableSize(get_total_memory_bytes() as u64))
    } else {
        None
    }
}

/// A reference to a `ResourceStat` implementation.
pub type ResourceStatRef = Arc<dyn ResourceStat + Send + Sync>;

/// A trait for getting resource statistics.
pub trait ResourceStat {
    /// Get the total CPU in millicores.
    fn get_total_cpu_millicores(&self) -> i64;
    /// Get the total memory in bytes.
    fn get_total_memory_bytes(&self) -> i64;
    /// Get the CPU usage in millicores.
    fn get_cpu_usage_millicores(&self) -> i64;
    /// Get the memory usage in bytes.
    fn get_memory_usage_bytes(&self) -> i64;
}

/// An implementation of the `ResourceStat` trait.
pub struct ResourceStatImpl {
    cpu_usage_millicores: Arc<AtomicI64>,
    last_cpu_usage_usecs: Arc<AtomicI64>,
    calculate_interval: Duration,
    handler: Option<JoinHandle<()>>,
}

impl Default for ResourceStatImpl {
    fn default() -> Self {
        Self {
            cpu_usage_millicores: Arc::new(AtomicI64::new(0)),
            last_cpu_usage_usecs: Arc::new(AtomicI64::new(0)),
            calculate_interval: Duration::from_secs(5),
            handler: None,
        }
    }
}

impl ResourceStatImpl {
    /// Start collecting CPU usage periodically. It computes the CPU usage in millicores from the rate of change of `usage_usec` in `/sys/fs/cgroup/cpu.stat`.
    /// It ONLY works in a cgroup v2 environment.
    pub fn start_collect_cpu_usage(&mut self) {
        if self.handler.is_some() {
            return;
        }

        let cpu_usage_millicores = self.cpu_usage_millicores.clone();
        let last_cpu_usage_usecs = self.last_cpu_usage_usecs.clone();
        let calculate_interval = self.calculate_interval;

        let handler = common_runtime::spawn_global(async move {
            info!(
                "Starting to collect CPU usage periodically for every {} seconds",
                calculate_interval.as_secs()
            );
            loop {
                let current_cpu_usage_usecs = get_cpu_usage_from_cgroups();
                if let Some(current_cpu_usage_usecs) = current_cpu_usage_usecs {
                    // Skip the first sample; a previous value is needed to compute a delta.
                    if last_cpu_usage_usecs.load(Ordering::Relaxed) == 0 {
                        last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed);
                        continue;
                    }
                    let cpu_usage = calculate_cpu_usage(
                        current_cpu_usage_usecs,
                        last_cpu_usage_usecs.load(Ordering::Relaxed),
                        calculate_interval.as_millis() as i64,
                    );
                    cpu_usage_millicores.store(cpu_usage, Ordering::Relaxed);
                    last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed);
                }
                sleep(calculate_interval).await;
            }
        });

        self.handler = Some(handler);
    }
}

impl ResourceStat for ResourceStatImpl {
    /// Get the total CPU in millicores.
    fn get_total_cpu_millicores(&self) -> i64 {
        get_total_cpu_millicores()
    }

    /// Get the total memory in bytes.
    fn get_total_memory_bytes(&self) -> i64 {
        get_total_memory_bytes()
    }

    /// Get the CPU usage in millicores.
    fn get_cpu_usage_millicores(&self) -> i64 {
        self.cpu_usage_millicores.load(Ordering::Relaxed)
    }

    /// Get the memory usage in bytes.
    /// It ONLY works in a cgroup v2 environment.
    fn get_memory_usage_bytes(&self) -> i64 {
        get_memory_usage_from_cgroups().unwrap_or_default()
    }
}
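
For orientation, this is roughly how the datanode wires the collector up later in this change (a condensed sketch of the `DatanodeBuilder` hunk further down, not a separate API):

    let mut resource_stat = ResourceStatImpl::default();
    // Spawns the periodic cgroup v2 sampler; a no-op if already started.
    resource_stat.start_collect_cpu_usage();
    let resource_stat: ResourceStatRef = Arc::new(resource_stat);
    // The heartbeat task then reads get_cpu_usage_millicores() and
    // get_memory_usage_bytes() from it for every heartbeat it sends.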

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_total_cpu_cores() {
        assert!(get_total_cpu_cores() > 0);
    }

    #[test]
    fn test_get_total_memory_readable() {
        assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0));
    }
}

@@ -28,7 +28,7 @@ pub async fn check_output_stream(output: OutputData, expected: &str) {
        _ => unreachable!(),
    };
    let pretty_print = recordbatches.pretty_print().unwrap();
    assert_eq!(pretty_print, expected, "actual: \n{}", pretty_print);
    assert_eq!(pretty_print, expected.trim(), "actual: \n{}", pretty_print);
}

pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {

@@ -30,6 +30,7 @@ common-procedure.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
common-runtime.workspace = true
common-stat.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
common-version.workspace = true

@@ -27,6 +27,7 @@ use common_meta::key::runtime_switch::RuntimeSwitchManager;
use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
pub use common_procedure::options::ProcedureConfig;
use common_stat::ResourceStatImpl;
use common_telemetry::{error, info, warn};
use common_wal::config::DatanodeWalConfig;
use common_wal::config::kafka::DatanodeKafkaConfig;
@@ -282,6 +283,9 @@ impl DatanodeBuilder {
            open_all_regions.await?;
        }

        let mut resource_stat = ResourceStatImpl::default();
        resource_stat.start_collect_cpu_usage();

        let heartbeat_task = if let Some(meta_client) = meta_client {
            Some(
                HeartbeatTask::try_new(
@@ -290,6 +294,7 @@ impl DatanodeBuilder {
                    meta_client,
                    cache_registry,
                    self.plugins.clone(),
                    Arc::new(resource_stat),
                )
                .await?,
            )

@@ -20,7 +20,6 @@ use std::time::Duration;
use api::v1::meta::heartbeat_request::NodeWorkloads;
use api::v1::meta::{DatanodeWorkloads, HeartbeatRequest, NodeInfo, Peer, RegionRole, RegionStat};
use common_base::Plugins;
use common_config::utils::ResourceSpec;
use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::datanode::REGION_STATISTIC_KEY;
use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
@@ -31,6 +30,7 @@ use common_meta::heartbeat::handler::{
};
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef};
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
use common_stat::ResourceStatRef;
use common_telemetry::{debug, error, info, trace, warn};
use common_workload::DatanodeWorkloadType;
use meta_client::MetaClientRef;
@@ -63,7 +63,7 @@ pub struct HeartbeatTask {
    interval: u64,
    resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
    region_alive_keeper: Arc<RegionAliveKeeper>,
    resource_spec: ResourceSpec,
    resource_stat: ResourceStatRef,
}

impl Drop for HeartbeatTask {
@@ -80,6 +80,7 @@ impl HeartbeatTask {
        meta_client: MetaClientRef,
        cache_invalidator: CacheInvalidatorRef,
        plugins: Plugins,
        resource_stat: ResourceStatRef,
    ) -> Result<Self> {
        let countdown_task_handler_ext = plugins.get::<CountdownTaskHandlerExtRef>();
        let region_alive_keeper = Arc::new(RegionAliveKeeper::new(
@@ -109,7 +110,7 @@ impl HeartbeatTask {
            interval: opts.heartbeat.interval.as_millis() as u64,
            resp_handler_executor,
            region_alive_keeper,
            resource_spec: Default::default(),
            resource_stat,
        })
    }

@@ -186,6 +187,7 @@ impl HeartbeatTask {
            .context(error::HandleHeartbeatResponseSnafu)
    }

    #[allow(deprecated)]
    /// Start the heartbeat task by spawning a background task.
    pub async fn start(
        &self,
@@ -237,8 +239,9 @@ impl HeartbeatTask {

        self.region_alive_keeper.start(Some(event_receiver)).await?;
        let mut last_sent = Instant::now();
        let cpus = self.resource_spec.cpus as u32;
        let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
        let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
        let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
        let resource_stat = self.resource_stat.clone();

        common_runtime::spawn_hb(async move {
            let sleep = tokio::time::sleep(Duration::from_millis(0));
@@ -252,8 +255,13 @@ impl HeartbeatTask {
                version: build_info.version.to_string(),
                git_commit: build_info.commit_short.to_string(),
                start_time_ms: node_epoch,
                cpus,
                memory_bytes,
                total_cpu_millicores,
                total_memory_bytes,
                cpu_usage_millicores: 0,
                memory_usage_bytes: 0,
                // TODO(zyy17): Remove these once the deprecated fields are dropped from the proto.
                cpus: total_cpu_millicores as u32,
                memory_bytes: total_memory_bytes as u64,
                hostname: hostname::get()
                    .unwrap_or_default()
                    .to_string_lossy()
@@ -297,12 +305,18 @@ impl HeartbeatTask {
                let topic_stats = region_server_clone.topic_stats();
                let now = Instant::now();
                let duration_since_epoch = (now - epoch).as_millis() as u64;
                let req = HeartbeatRequest {
                let mut req = HeartbeatRequest {
                    region_stats,
                    topic_stats,
                    duration_since_epoch,
                    ..heartbeat_request.clone()
                };

                if let Some(info) = req.info.as_mut() {
                    info.cpu_usage_millicores = resource_stat.get_cpu_usage_millicores();
                    info.memory_usage_bytes = resource_stat.get_memory_usage_bytes();
                }

                sleep.as_mut().reset(now + Duration::from_millis(interval));
                Some(req)
            }

@@ -13,16 +13,13 @@
|
||||
// limitations under the License.
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_meta::RegionIdent;
|
||||
use common_meta::error::{InvalidHeartbeatResponseSnafu, Result as MetaResult};
|
||||
use common_meta::heartbeat::handler::{
|
||||
HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext,
|
||||
};
|
||||
use common_meta::instruction::{Instruction, InstructionReply};
|
||||
use common_telemetry::error;
|
||||
use futures::future::BoxFuture;
|
||||
use snafu::OptionExt;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
mod close_region;
|
||||
mod downgrade_region;
|
||||
@@ -30,10 +27,15 @@ mod flush_region;
|
||||
mod open_region;
|
||||
mod upgrade_region;
|
||||
|
||||
use crate::heartbeat::handler::close_region::CloseRegionsHandler;
|
||||
use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler;
|
||||
use crate::heartbeat::handler::flush_region::FlushRegionsHandler;
|
||||
use crate::heartbeat::handler::open_region::OpenRegionsHandler;
|
||||
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
|
||||
use crate::heartbeat::task_tracker::TaskTracker;
|
||||
use crate::region_server::RegionServer;
|
||||
|
||||
/// Handler for [Instruction::OpenRegion] and [Instruction::CloseRegion].
|
||||
/// The handler for [`Instruction`]s.
|
||||
#[derive(Clone)]
|
||||
pub struct RegionHeartbeatResponseHandler {
|
||||
region_server: RegionServer,
|
||||
@@ -43,9 +45,14 @@ pub struct RegionHeartbeatResponseHandler {
|
||||
open_region_parallelism: usize,
|
||||
}
|
||||
|
||||
/// Handler of the instruction.
|
||||
pub type InstructionHandler =
|
||||
Box<dyn FnOnce(HandlerContext) -> BoxFuture<'static, Option<InstructionReply>> + Send>;
|
||||
#[async_trait::async_trait]
|
||||
pub trait InstructionHandler: Send + Sync {
|
||||
async fn handle(
|
||||
&self,
|
||||
ctx: &HandlerContext,
|
||||
instruction: Instruction,
|
||||
) -> Option<InstructionReply>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct HandlerContext {
|
||||
@@ -56,10 +63,6 @@ pub struct HandlerContext {
|
||||
}
|
||||
|
||||
impl HandlerContext {
|
||||
fn region_ident_to_region_id(region_ident: &RegionIdent) -> RegionId {
|
||||
RegionId::new(region_ident.table_id, region_ident.region_number)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new_for_test(region_server: RegionServer) -> Self {
|
||||
Self {
|
||||
@@ -90,31 +93,16 @@ impl RegionHeartbeatResponseHandler {
|
||||
self
|
||||
}
|
||||
|
||||
/// Builds the [InstructionHandler].
|
||||
fn build_handler(&self, instruction: Instruction) -> MetaResult<InstructionHandler> {
|
||||
fn build_handler(&self, instruction: &Instruction) -> MetaResult<Box<dyn InstructionHandler>> {
|
||||
match instruction {
|
||||
Instruction::OpenRegions(open_regions) => {
|
||||
let open_region_parallelism = self.open_region_parallelism;
|
||||
Ok(Box::new(move |handler_context| {
|
||||
handler_context
|
||||
.handle_open_regions_instruction(open_regions, open_region_parallelism)
|
||||
}))
|
||||
}
|
||||
Instruction::CloseRegions(close_regions) => Ok(Box::new(move |handler_context| {
|
||||
handler_context.handle_close_regions_instruction(close_regions)
|
||||
})),
|
||||
Instruction::DowngradeRegion(downgrade_region) => {
|
||||
Ok(Box::new(move |handler_context| {
|
||||
handler_context.handle_downgrade_region_instruction(downgrade_region)
|
||||
}))
|
||||
}
|
||||
Instruction::UpgradeRegion(upgrade_region) => Ok(Box::new(move |handler_context| {
|
||||
handler_context.handle_upgrade_region_instruction(upgrade_region)
|
||||
Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler)),
|
||||
Instruction::OpenRegions(_) => Ok(Box::new(OpenRegionsHandler {
|
||||
open_region_parallelism: self.open_region_parallelism,
|
||||
})),
|
||||
Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler)),
|
||||
Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler)),
|
||||
Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler)),
|
||||
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
|
||||
Instruction::FlushRegions(flush_regions) => Ok(Box::new(move |handler_context| {
|
||||
handler_context.handle_flush_regions_instruction(flush_regions)
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -124,7 +112,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
|
||||
fn is_acceptable(&self, ctx: &HeartbeatResponseHandlerContext) -> bool {
|
||||
matches!(ctx.incoming_message.as_ref(), |Some((
|
||||
_,
|
||||
Instruction::DowngradeRegion { .. },
|
||||
Instruction::DowngradeRegions { .. },
|
||||
))| Some((
|
||||
_,
|
||||
Instruction::UpgradeRegion { .. }
|
||||
@@ -151,15 +139,19 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
|
||||
let catchup_tasks = self.catchup_tasks.clone();
|
||||
let downgrade_tasks = self.downgrade_tasks.clone();
|
||||
let flush_tasks = self.flush_tasks.clone();
|
||||
let handler = self.build_handler(instruction)?;
|
||||
let handler = self.build_handler(&instruction)?;
|
||||
let _handle = common_runtime::spawn_global(async move {
|
||||
let reply = handler(HandlerContext {
|
||||
region_server,
|
||||
catchup_tasks,
|
||||
downgrade_tasks,
|
||||
flush_tasks,
|
||||
})
|
||||
.await;
|
||||
let reply = handler
|
||||
.handle(
|
||||
&HandlerContext {
|
||||
region_server,
|
||||
catchup_tasks,
|
||||
downgrade_tasks,
|
||||
flush_tasks,
|
||||
},
|
||||
instruction,
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Some(reply) = reply
|
||||
&& let Err(e) = mailbox.send((meta, reply)).await
|
||||
@@ -179,6 +171,7 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::RegionIdent;
|
||||
use common_meta::heartbeat::mailbox::{
|
||||
HeartbeatMailbox, IncomingMessage, MailboxRef, MessageMeta,
|
||||
};
|
||||
@@ -249,10 +242,10 @@ mod tests {
|
||||
);
|
||||
|
||||
// Downgrade region
|
||||
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
|
||||
let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id: RegionId::new(2048, 1),
|
||||
flush_timeout: Some(Duration::from_secs(1)),
|
||||
});
|
||||
}]);
|
||||
assert!(
|
||||
heartbeat_handler
|
||||
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction)))
|
||||
@@ -447,10 +440,10 @@ mod tests {
|
||||
// Should be ok, if we try to downgrade it twice.
|
||||
for _ in 0..2 {
|
||||
let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0");
|
||||
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
|
||||
let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(Duration::from_secs(1)),
|
||||
});
|
||||
}]);
|
||||
|
||||
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
|
||||
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
|
||||
@@ -458,33 +451,27 @@ mod tests {
|
||||
|
||||
let (_, reply) = heartbeat_env.receiver.recv().await.unwrap();
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply {
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert_eq!(reply.last_entry_id.unwrap(), 0);
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
let reply = &reply.expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert_eq!(reply.last_entry_id.unwrap(), 0);
|
||||
}
|
||||
|
||||
// Downgrades a not exists region.
|
||||
let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0");
|
||||
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
|
||||
let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id: RegionId::new(2048, 1),
|
||||
flush_timeout: Some(Duration::from_secs(1)),
|
||||
});
|
||||
}]);
|
||||
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
|
||||
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
|
||||
assert_matches!(control, HandleControl::Continue);
|
||||
|
||||
let (_, reply) = heartbeat_env.receiver.recv().await.unwrap();
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply {
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
let reply = reply.expect_downgrade_regions_reply();
|
||||
assert!(!reply[0].exists);
|
||||
assert!(reply[0].error.is_none());
|
||||
assert!(reply[0].last_entry_id.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,60 +12,64 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::RegionIdent;
|
||||
use common_meta::instruction::{InstructionReply, SimpleReply};
|
||||
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
|
||||
use common_telemetry::warn;
|
||||
use futures::future::join_all;
|
||||
use futures_util::future::BoxFuture;
|
||||
use store_api::region_request::{RegionCloseRequest, RegionRequest};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::error;
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
|
||||
impl HandlerContext {
|
||||
pub(crate) fn handle_close_regions_instruction(
|
||||
self,
|
||||
region_idents: Vec<RegionIdent>,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
let region_ids = region_idents
|
||||
.into_iter()
|
||||
.map(|region_ident| Self::region_ident_to_region_id(®ion_ident))
|
||||
.collect::<Vec<_>>();
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct CloseRegionsHandler;
|
||||
|
||||
let futs = region_ids.iter().map(|region_id| {
|
||||
self.region_server
|
||||
.handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {}))
|
||||
});
|
||||
#[async_trait::async_trait]
|
||||
impl InstructionHandler for CloseRegionsHandler {
|
||||
async fn handle(
|
||||
&self,
|
||||
ctx: &HandlerContext,
|
||||
instruction: Instruction,
|
||||
) -> Option<InstructionReply> {
|
||||
// Safety: must be `Instruction::CloseRegions` instruction.
|
||||
let region_idents = instruction.into_close_regions().unwrap();
|
||||
let region_ids = region_idents
|
||||
.into_iter()
|
||||
.map(|region_ident| RegionId::new(region_ident.table_id, region_ident.region_number))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let results = join_all(futs).await;
|
||||
let futs = region_ids.iter().map(|region_id| {
|
||||
ctx.region_server
|
||||
.handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {}))
|
||||
});
|
||||
|
||||
let mut errors = vec![];
|
||||
for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) {
|
||||
match result {
|
||||
Ok(_) => (),
|
||||
Err(error::Error::RegionNotFound { .. }) => {
|
||||
warn!(
|
||||
"Received a close regions instruction from meta, but target region:{} is not found.",
|
||||
region_id
|
||||
);
|
||||
}
|
||||
Err(err) => errors.push(format!("region:{region_id}: {err:?}")),
|
||||
let results = join_all(futs).await;
|
||||
|
||||
let mut errors = vec![];
|
||||
for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) {
|
||||
match result {
|
||||
Ok(_) => (),
|
||||
Err(error::Error::RegionNotFound { .. }) => {
|
||||
warn!(
|
||||
"Received a close regions instruction from meta, but target region:{} is not found.",
|
||||
region_id
|
||||
);
|
||||
}
|
||||
Err(err) => errors.push(format!("region:{region_id}: {err:?}")),
|
||||
}
|
||||
}
|
||||
|
||||
if errors.is_empty() {
|
||||
return Some(InstructionReply::CloseRegions(SimpleReply {
|
||||
result: true,
|
||||
error: None,
|
||||
}));
|
||||
}
|
||||
if errors.is_empty() {
|
||||
return Some(InstructionReply::CloseRegions(SimpleReply {
|
||||
result: true,
|
||||
error: None,
|
||||
}));
|
||||
}
|
||||
|
||||
Some(InstructionReply::CloseRegions(SimpleReply {
|
||||
result: false,
|
||||
error: Some(errors.join("; ")),
|
||||
}))
|
||||
})
|
||||
Some(InstructionReply::CloseRegions(SimpleReply {
|
||||
result: false,
|
||||
error: Some(errors.join("; ")),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,209 +12,242 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::instruction::{DowngradeRegion, DowngradeRegionReply, InstructionReply};
|
||||
use common_meta::instruction::{
|
||||
DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply,
|
||||
};
|
||||
use common_telemetry::tracing::info;
|
||||
use common_telemetry::{error, warn};
|
||||
use futures_util::future::BoxFuture;
|
||||
use futures::future::join_all;
|
||||
use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState};
|
||||
use store_api::region_request::{RegionFlushRequest, RegionRequest};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
use crate::heartbeat::task_tracker::WaitResult;
|
||||
|
||||
impl HandlerContext {
|
||||
async fn downgrade_to_follower_gracefully(
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct DowngradeRegionsHandler;
|
||||
|
||||
impl DowngradeRegionsHandler {
|
||||
async fn handle_downgrade_region(
|
||||
ctx: &HandlerContext,
|
||||
DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
}: DowngradeRegion,
|
||||
) -> DowngradeRegionReply {
|
||||
let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
|
||||
warn!("Region: {region_id} is not found");
|
||||
return DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: false,
|
||||
error: None,
|
||||
};
|
||||
};
|
||||
|
||||
let region_server_moved = ctx.region_server.clone();
|
||||
|
||||
// Ignores flush request
|
||||
if !writable {
|
||||
warn!(
|
||||
"Region: {region_id} is not writable, flush_timeout: {:?}",
|
||||
flush_timeout
|
||||
);
|
||||
return ctx.downgrade_to_follower_gracefully(region_id).await;
|
||||
}
|
||||
|
||||
// If flush_timeout is not set, directly convert region to follower.
|
||||
let Some(flush_timeout) = flush_timeout else {
|
||||
return ctx.downgrade_to_follower_gracefully(region_id).await;
|
||||
};
|
||||
|
||||
// Sets region to downgrading,
|
||||
// the downgrading region will reject all write requests.
|
||||
// However, the downgrading region will still accept read, flush requests.
|
||||
match ctx
|
||||
.region_server
|
||||
.set_region_role_state_gracefully(region_id, SettableRegionRoleState::DowngradingLeader)
|
||||
.await
|
||||
{
|
||||
Ok(SetRegionRoleStateResponse::Success { .. }) => {}
|
||||
Ok(SetRegionRoleStateResponse::NotFound) => {
|
||||
warn!("Region: {region_id} is not found");
|
||||
return DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: false,
|
||||
error: None,
|
||||
};
|
||||
}
|
||||
Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => {
|
||||
error!(err; "Failed to convert region to downgrading leader - invalid transition");
|
||||
return DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
};
|
||||
}
|
||||
Err(err) => {
|
||||
error!(err; "Failed to convert region to downgrading leader");
|
||||
return DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
let register_result = ctx
|
||||
.downgrade_tasks
|
||||
.try_register(
|
||||
region_id,
|
||||
Box::pin(async move {
|
||||
info!("Flush region: {region_id} before converting region to follower");
|
||||
region_server_moved
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Flush(RegionFlushRequest {
|
||||
row_group_size: None,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
|
||||
if register_result.is_busy() {
|
||||
warn!("Another flush task is running for the region: {region_id}");
|
||||
}
|
||||
|
||||
let mut watcher = register_result.into_watcher();
|
||||
let result = ctx.downgrade_tasks.wait(&mut watcher, flush_timeout).await;
|
||||
|
||||
match result {
|
||||
WaitResult::Timeout => DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!(
|
||||
"Flush region timeout, region: {region_id}, timeout: {:?}",
|
||||
flush_timeout
|
||||
)),
|
||||
},
|
||||
WaitResult::Finish(Ok(_)) => ctx.downgrade_to_follower_gracefully(region_id).await,
|
||||
WaitResult::Finish(Err(err)) => DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl InstructionHandler for DowngradeRegionsHandler {
|
||||
async fn handle(
|
||||
&self,
|
||||
region_id: RegionId,
|
||||
ctx: &HandlerContext,
|
||||
instruction: Instruction,
|
||||
) -> Option<InstructionReply> {
|
||||
// Safety: must be `Instruction::DowngradeRegion` instruction.
|
||||
let downgrade_regions = instruction.into_downgrade_regions().unwrap();
|
||||
let futures = downgrade_regions
|
||||
.into_iter()
|
||||
.map(|downgrade_region| Self::handle_downgrade_region(ctx, downgrade_region));
|
||||
// Join all futures; parallelism is governed by the underlying flush scheduler.
|
||||
let results = join_all(futures).await;
|
||||
|
||||
Some(InstructionReply::DowngradeRegions(
|
||||
DowngradeRegionsReply::new(results),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl HandlerContext {
|
||||
async fn downgrade_to_follower_gracefully(&self, region_id: RegionId) -> DowngradeRegionReply {
|
||||
match self
|
||||
.region_server
|
||||
.set_region_role_state_gracefully(region_id, SettableRegionRoleState::Follower)
|
||||
.await
|
||||
{
|
||||
Ok(SetRegionRoleStateResponse::Success(success)) => {
|
||||
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: success.last_entry_id(),
|
||||
metadata_last_entry_id: success.metadata_last_entry_id(),
|
||||
exists: true,
|
||||
error: None,
|
||||
}))
|
||||
}
|
||||
Ok(SetRegionRoleStateResponse::Success(success)) => DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: success.last_entry_id(),
|
||||
metadata_last_entry_id: success.metadata_last_entry_id(),
|
||||
exists: true,
|
||||
error: None,
|
||||
},
|
||||
Ok(SetRegionRoleStateResponse::NotFound) => {
|
||||
warn!("Region: {region_id} is not found");
|
||||
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: false,
|
||||
error: None,
|
||||
}))
|
||||
}
|
||||
}
|
||||
Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => {
|
||||
error!(err; "Failed to convert region to follower - invalid transition");
|
||||
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}))
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!(err; "Failed to convert region to {}", SettableRegionRoleState::Follower);
|
||||
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
DowngradeRegionReply {
|
||||
region_id,
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn handle_downgrade_region_instruction(
|
||||
self,
|
||||
DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
}: DowngradeRegion,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
let Some(writable) = self.region_server.is_region_leader(region_id) else {
|
||||
warn!("Region: {region_id} is not found");
|
||||
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: false,
|
||||
error: None,
|
||||
}));
|
||||
};
|
||||
|
||||
let region_server_moved = self.region_server.clone();
|
||||
|
||||
// Ignores flush request
|
||||
if !writable {
|
||||
warn!(
|
||||
"Region: {region_id} is not writable, flush_timeout: {:?}",
|
||||
flush_timeout
|
||||
);
|
||||
return self.downgrade_to_follower_gracefully(region_id).await;
|
||||
}
|
||||
|
||||
// If flush_timeout is not set, directly convert region to follower.
|
||||
let Some(flush_timeout) = flush_timeout else {
|
||||
return self.downgrade_to_follower_gracefully(region_id).await;
|
||||
};
|
||||
|
||||
// Sets region to downgrading,
|
||||
// the downgrading region will reject all write requests.
|
||||
// However, the downgrading region will still accept read, flush requests.
|
||||
match self
|
||||
.region_server
|
||||
.set_region_role_state_gracefully(
|
||||
region_id,
|
||||
SettableRegionRoleState::DowngradingLeader,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(SetRegionRoleStateResponse::Success { .. }) => {}
|
||||
Ok(SetRegionRoleStateResponse::NotFound) => {
|
||||
warn!("Region: {region_id} is not found");
|
||||
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: false,
|
||||
error: None,
|
||||
}));
|
||||
}
|
||||
Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => {
|
||||
error!(err; "Failed to convert region to downgrading leader - invalid transition");
|
||||
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}));
|
||||
}
|
||||
Err(err) => {
|
||||
error!(err; "Failed to convert region to downgrading leader");
|
||||
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
let register_result = self
|
||||
.downgrade_tasks
|
||||
.try_register(
|
||||
region_id,
|
||||
Box::pin(async move {
|
||||
info!("Flush region: {region_id} before converting region to follower");
|
||||
region_server_moved
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Flush(RegionFlushRequest {
|
||||
row_group_size: None,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
|
||||
if register_result.is_busy() {
|
||||
warn!("Another flush task is running for the region: {region_id}");
|
||||
}
|
||||
|
||||
let mut watcher = register_result.into_watcher();
|
||||
let result = self.downgrade_tasks.wait(&mut watcher, flush_timeout).await;
|
||||
|
||||
match result {
|
||||
WaitResult::Timeout => {
|
||||
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!(
|
||||
"Flush region timeout, region: {region_id}, timeout: {:?}",
|
||||
flush_timeout
|
||||
)),
|
||||
}))
|
||||
}
|
||||
WaitResult::Finish(Ok(_)) => self.downgrade_to_follower_gracefully(region_id).await,
|
||||
WaitResult::Finish(Err(err)) => {
|
||||
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
|
||||
last_entry_id: None,
|
||||
metadata_last_entry_id: None,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::assert_matches::assert_matches;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::instruction::{DowngradeRegion, InstructionReply};
|
||||
use common_meta::heartbeat::handler::{HandleControl, HeartbeatResponseHandler};
|
||||
use common_meta::heartbeat::mailbox::MessageMeta;
|
||||
use common_meta::instruction::{DowngradeRegion, Instruction};
|
||||
use mito2::config::MitoConfig;
|
||||
use mito2::engine::MITO_ENGINE_NAME;
|
||||
use mito2::test_util::{CreateRequestBuilder, TestEnv};
|
||||
use store_api::region_engine::{
|
||||
RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
|
||||
RegionEngine, RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
|
||||
};
|
||||
use store_api::region_request::RegionRequest;
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::error;
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler;
|
||||
use crate::heartbeat::handler::tests::HeartbeatResponseTestEnv;
|
||||
use crate::heartbeat::handler::{
|
||||
HandlerContext, InstructionHandler, RegionHeartbeatResponseHandler,
|
||||
};
|
||||
use crate::tests::{MockRegionEngine, mock_region_server};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -227,20 +260,20 @@ mod tests {
|
||||
let waits = vec![None, Some(Duration::from_millis(100u64))];
|
||||
|
||||
for flush_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -270,20 +303,20 @@ mod tests {
|
||||
|
||||
let waits = vec![None, Some(Duration::from_millis(100u64))];
|
||||
for flush_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert_eq!(reply.last_entry_id.unwrap(), 1024);
|
||||
}
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert_eq!(reply.last_entry_id.unwrap(), 1024);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -305,20 +338,20 @@ mod tests {
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
|
||||
let flush_timeout = Duration::from_millis(100);
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(flush_timeout),
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(flush_timeout),
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.unwrap().contains("timeout"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.as_ref().unwrap().contains("timeout"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -344,36 +377,38 @@ mod tests {
|
||||
];
|
||||
|
||||
for flush_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.unwrap().contains("timeout"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.as_ref().unwrap().contains("timeout"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
let timer = Instant::now();
|
||||
let reply = handler_context
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(Duration::from_millis(500)),
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(Duration::from_millis(500)),
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
// Must be less than 300 ms.
|
||||
assert!(timer.elapsed().as_millis() < 300);
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert_eq!(reply.last_entry_id.unwrap(), 1024);
|
||||
}
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert_eq!(reply.last_entry_id.unwrap(), 1024);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -405,36 +440,36 @@ mod tests {
|
||||
];
|
||||
|
||||
for flush_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout,
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.unwrap().contains("timeout"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
}
|
||||
let timer = Instant::now();
|
||||
let reply = handler_context
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(Duration::from_millis(500)),
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
// Must be less than 300 ms.
|
||||
assert!(timer.elapsed().as_millis() < 300);
|
||||
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.unwrap().contains("flush failed"));
|
||||
assert!(reply.error.as_ref().unwrap().contains("timeout"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
let timer = Instant::now();
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(Duration::from_millis(500)),
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
// Must be less than 300 ms.
|
||||
assert!(timer.elapsed().as_millis() < 300);
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.as_ref().unwrap().contains("flush failed"));
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -449,19 +484,19 @@ mod tests {
|
||||
});
|
||||
mock_region_server.register_test_region(region_id, mock_engine);
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: None,
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: None,
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -480,23 +515,77 @@ mod tests {
|
||||
});
|
||||
mock_region_server.register_test_region(region_id, mock_engine);
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_downgrade_region_instruction(DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: None,
|
||||
})
|
||||
let reply = DowngradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::DowngradeRegions(vec![DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: None,
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
|
||||
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.exists);
|
||||
assert!(
|
||||
reply
|
||||
.error
|
||||
.unwrap()
|
||||
.contains("Failed to set region to readonly")
|
||||
);
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
|
||||
assert!(reply.exists);
|
||||
assert!(
|
||||
reply
|
||||
.error
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.contains("Failed to set region to readonly")
|
||||
);
|
||||
assert!(reply.last_entry_id.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_downgrade_regions() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
|
||||
let mut region_server = mock_region_server();
|
||||
let heartbeat_handler = RegionHeartbeatResponseHandler::new(region_server.clone());
|
||||
let mut engine_env = TestEnv::with_prefix("downgrade-regions").await;
|
||||
let engine = engine_env.create_engine(MitoConfig::default()).await;
|
||||
region_server.register_engine(Arc::new(engine.clone()));
|
||||
let region_id = RegionId::new(1024, 1);
|
||||
let region_id1 = RegionId::new(1024, 2);
|
||||
let builder = CreateRequestBuilder::new();
|
||||
let create_req = builder.build();
|
||||
region_server
|
||||
.handle_request(region_id, RegionRequest::Create(create_req))
|
||||
.await
|
||||
.unwrap();
|
||||
let create_req1 = builder.build();
|
||||
region_server
|
||||
.handle_request(region_id1, RegionRequest::Create(create_req1))
|
||||
.await
|
||||
.unwrap();
|
||||
let meta = MessageMeta::new_test(1, "test", "dn-1", "meta-0");
|
||||
let instruction = Instruction::DowngradeRegions(vec![
|
||||
DowngradeRegion {
|
||||
region_id,
|
||||
flush_timeout: Some(Duration::from_secs(1)),
|
||||
},
|
||||
DowngradeRegion {
|
||||
region_id: region_id1,
|
||||
flush_timeout: Some(Duration::from_secs(1)),
|
||||
},
|
||||
]);
|
||||
let mut heartbeat_env = HeartbeatResponseTestEnv::new();
|
||||
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
|
||||
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
|
||||
assert_matches!(control, HandleControl::Continue);
|
||||
|
||||
let (_, reply) = heartbeat_env.receiver.recv().await.unwrap();
|
||||
let reply = reply.expect_downgrade_regions_reply();
|
||||
assert_eq!(reply[0].region_id, region_id);
|
||||
assert!(reply[0].exists);
|
||||
assert!(reply[0].error.is_none());
|
||||
assert_eq!(reply[0].last_entry_id, Some(0));
|
||||
assert_eq!(reply[1].region_id, region_id1);
|
||||
assert!(reply[1].exists);
|
||||
assert!(reply[1].error.is_none());
|
||||
assert_eq!(reply[1].last_entry_id, Some(0));
|
||||
|
||||
assert_eq!(engine.role(region_id).unwrap(), RegionRole::Follower);
|
||||
assert_eq!(engine.role(region_id1).unwrap(), RegionRole::Follower);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,19 +15,53 @@
|
||||
use std::time::Instant;
|
||||
|
||||
use common_meta::instruction::{
|
||||
FlushErrorStrategy, FlushRegionReply, FlushRegions, FlushStrategy, InstructionReply,
|
||||
FlushErrorStrategy, FlushRegionReply, FlushStrategy, Instruction, InstructionReply,
|
||||
};
|
||||
use common_telemetry::{debug, warn};
|
||||
use futures_util::future::BoxFuture;
|
||||
use store_api::region_request::{RegionFlushRequest, RegionRequest};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, UnexpectedSnafu};
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, Result, UnexpectedSnafu};
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
|
||||
pub struct FlushRegionsHandler;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl InstructionHandler for FlushRegionsHandler {
|
||||
async fn handle(
|
||||
&self,
|
||||
ctx: &HandlerContext,
|
||||
instruction: Instruction,
|
||||
) -> Option<InstructionReply> {
|
||||
let start_time = Instant::now();
|
||||
let flush_regions = instruction.into_flush_regions().unwrap();
|
||||
let strategy = flush_regions.strategy;
|
||||
let region_ids = flush_regions.region_ids;
|
||||
let error_strategy = flush_regions.error_strategy;
|
||||
|
||||
let reply = if matches!(strategy, FlushStrategy::Async) {
|
||||
// Asynchronous hint mode: fire-and-forget, no reply expected
|
||||
ctx.handle_flush_hint(region_ids).await;
|
||||
None
|
||||
} else {
|
||||
// Synchronous mode: return reply with results
|
||||
let reply = ctx.handle_flush_sync(region_ids, error_strategy).await;
|
||||
Some(InstructionReply::FlushRegions(reply))
|
||||
};
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
debug!(
|
||||
"FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}",
|
||||
strategy, elapsed, reply
|
||||
);
|
||||
|
||||
reply
|
||||
}
|
||||
}
|
||||
|
||||
impl HandlerContext {
|
||||
/// Performs the actual region flush operation.
|
||||
async fn perform_region_flush(&self, region_id: RegionId) -> Result<(), error::Error> {
|
||||
async fn perform_region_flush(&self, region_id: RegionId) -> Result<()> {
|
||||
let request = RegionRequest::Flush(RegionFlushRequest {
|
||||
row_group_size: None,
|
||||
});
|
||||
@@ -92,7 +126,7 @@ impl HandlerContext {
|
||||
}
|
||||
|
||||
/// Flushes a single region synchronously with proper error handling.
|
||||
async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<(), error::Error> {
|
||||
async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<()> {
|
||||
// Check if region is leader and writable
|
||||
let Some(writable) = self.region_server.is_region_leader(region_id) else {
|
||||
return Err(RegionNotFoundSnafu { region_id }.build());
|
||||
@@ -135,37 +169,6 @@ impl HandlerContext {
|
||||
.build()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Unified handler for FlushRegions with all flush semantics.
|
||||
pub(crate) fn handle_flush_regions_instruction(
|
||||
self,
|
||||
flush_regions: FlushRegions,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
let start_time = Instant::now();
|
||||
let strategy = flush_regions.strategy;
|
||||
let region_ids = flush_regions.region_ids;
|
||||
let error_strategy = flush_regions.error_strategy;
|
||||
|
||||
let reply = if matches!(strategy, FlushStrategy::Async) {
|
||||
// Asynchronous hint mode: fire-and-forget, no reply expected
|
||||
self.handle_flush_hint(region_ids).await;
|
||||
None
|
||||
} else {
|
||||
// Synchronous mode: return reply with results
|
||||
let reply = self.handle_flush_sync(region_ids, error_strategy).await;
|
||||
Some(InstructionReply::FlushRegions(reply))
|
||||
};
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
debug!(
|
||||
"FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}",
|
||||
strategy, elapsed, reply
|
||||
);
|
||||
|
||||
reply
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -201,9 +204,11 @@ mod tests {
|
||||
|
||||
// Async hint mode
|
||||
let flush_instruction = FlushRegions::async_batch(region_ids.clone());
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_flush_regions_instruction(flush_instruction)
|
||||
let reply = FlushRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::FlushRegions(flush_instruction),
|
||||
)
|
||||
.await;
|
||||
assert!(reply.is_none()); // Hint mode returns no reply
|
||||
assert_eq!(*flushed_region_ids.read().unwrap(), region_ids);
|
||||
@@ -212,8 +217,11 @@ mod tests {
|
||||
flushed_region_ids.write().unwrap().clear();
|
||||
let not_found_region_ids = (0..2).map(|i| RegionId::new(2048, i)).collect::<Vec<_>>();
|
||||
let flush_instruction = FlushRegions::async_batch(not_found_region_ids);
|
||||
let reply = handler_context
|
||||
.handle_flush_regions_instruction(flush_instruction)
|
||||
let reply = FlushRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::FlushRegions(flush_instruction),
|
||||
)
|
||||
.await;
|
||||
assert!(reply.is_none());
|
||||
assert!(flushed_region_ids.read().unwrap().is_empty());
|
||||
@@ -238,20 +246,17 @@ mod tests {
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
|
||||
let flush_instruction = FlushRegions::sync_single(region_id);
|
||||
let reply = handler_context
|
||||
.handle_flush_regions_instruction(flush_instruction)
|
||||
let reply = FlushRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::FlushRegions(flush_instruction),
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(reply.is_some());
|
||||
if let Some(InstructionReply::FlushRegions(flush_reply)) = reply {
|
||||
assert!(flush_reply.overall_success);
|
||||
assert_eq!(flush_reply.results.len(), 1);
|
||||
assert_eq!(flush_reply.results[0].0, region_id);
|
||||
assert!(flush_reply.results[0].1.is_ok());
|
||||
} else {
|
||||
panic!("Expected FlushRegions reply");
|
||||
}
|
||||
|
||||
let flush_reply = reply.unwrap().expect_flush_regions_reply();
|
||||
assert!(flush_reply.overall_success);
|
||||
assert_eq!(flush_reply.results.len(), 1);
|
||||
assert_eq!(flush_reply.results[0].0, region_id);
|
||||
assert!(flush_reply.results[0].1.is_ok());
|
||||
assert_eq!(*flushed_region_ids.read().unwrap(), vec![region_id]);
|
||||
}
|
||||
|
||||
@@ -281,18 +286,16 @@ mod tests {
|
||||
// Sync batch with fail-fast strategy
|
||||
let flush_instruction =
|
||||
FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::FailFast);
|
||||
let reply = handler_context
|
||||
.handle_flush_regions_instruction(flush_instruction)
|
||||
let reply = FlushRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::FlushRegions(flush_instruction),
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(reply.is_some());
|
||||
if let Some(InstructionReply::FlushRegions(flush_reply)) = reply {
|
||||
assert!(!flush_reply.overall_success); // Should fail due to non-existent regions
|
||||
// With fail-fast, regions are only processed until the first failure
|
||||
assert!(flush_reply.results.len() <= region_ids.len());
|
||||
} else {
|
||||
panic!("Expected FlushRegions reply");
|
||||
}
|
||||
let flush_reply = reply.unwrap().expect_flush_regions_reply();
|
||||
assert!(!flush_reply.overall_success); // Should fail due to non-existent regions
|
||||
// With fail-fast, regions are only processed until the first failure
|
||||
assert!(flush_reply.results.len() <= region_ids.len());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -317,20 +320,18 @@ mod tests {
|
||||
// Sync batch with try-all strategy
|
||||
let flush_instruction =
|
||||
FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::TryAll);
|
||||
let reply = handler_context
|
||||
.handle_flush_regions_instruction(flush_instruction)
|
||||
let reply = FlushRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::FlushRegions(flush_instruction),
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(reply.is_some());
|
||||
if let Some(InstructionReply::FlushRegions(flush_reply)) = reply {
|
||||
assert!(!flush_reply.overall_success); // Should fail due to one non-existent region
|
||||
// With try-all, should process all regions
|
||||
assert_eq!(flush_reply.results.len(), region_ids.len());
|
||||
// First should succeed, second should fail
|
||||
assert!(flush_reply.results[0].1.is_ok());
|
||||
assert!(flush_reply.results[1].1.is_err());
|
||||
} else {
|
||||
panic!("Expected FlushRegions reply");
|
||||
}
|
||||
let flush_reply = reply.unwrap().expect_flush_regions_reply();
|
||||
assert!(!flush_reply.overall_success); // Should fail due to one non-existent region
|
||||
// With try-all, should process all regions
|
||||
assert_eq!(flush_reply.results.len(), region_ids.len());
|
||||
// First should succeed, second should fail
|
||||
assert!(flush_reply.results[0].1.is_ok());
|
||||
assert!(flush_reply.results[1].1.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,56 +12,62 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::wal_options_allocator::prepare_wal_options;
|
||||
use futures_util::future::BoxFuture;
|
||||
use store_api::path_utils::table_dir;
|
||||
use store_api::region_request::{PathType, RegionOpenRequest};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
|
||||
impl HandlerContext {
|
||||
pub(crate) fn handle_open_regions_instruction(
|
||||
self,
|
||||
open_regions: Vec<OpenRegion>,
|
||||
open_region_parallelism: usize,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
let requests = open_regions
|
||||
.into_iter()
|
||||
.map(|open_region| {
|
||||
let OpenRegion {
|
||||
region_ident,
|
||||
region_storage_path,
|
||||
mut region_options,
|
||||
region_wal_options,
|
||||
skip_wal_replay,
|
||||
} = open_region;
|
||||
let region_id = Self::region_ident_to_region_id(®ion_ident);
|
||||
prepare_wal_options(&mut region_options, region_id, ®ion_wal_options);
|
||||
let request = RegionOpenRequest {
|
||||
engine: region_ident.engine,
|
||||
table_dir: table_dir(®ion_storage_path, region_id.table_id()),
|
||||
path_type: PathType::Bare,
|
||||
options: region_options,
|
||||
skip_wal_replay,
|
||||
checkpoint: None,
|
||||
};
|
||||
(region_id, request)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
pub struct OpenRegionsHandler {
|
||||
pub open_region_parallelism: usize,
|
||||
}
|
||||
|
||||
let result = self
|
||||
.region_server
|
||||
.handle_batch_open_requests(open_region_parallelism, requests, false)
|
||||
.await;
|
||||
let success = result.is_ok();
|
||||
let error = result.as_ref().map_err(|e| format!("{e:?}")).err();
|
||||
Some(InstructionReply::OpenRegions(SimpleReply {
|
||||
result: success,
|
||||
error,
|
||||
}))
|
||||
})
|
||||
#[async_trait::async_trait]
|
||||
impl InstructionHandler for OpenRegionsHandler {
|
||||
async fn handle(
|
||||
&self,
|
||||
ctx: &HandlerContext,
|
||||
instruction: Instruction,
|
||||
) -> Option<InstructionReply> {
|
||||
let open_regions = instruction.into_open_regions().unwrap();
|
||||
|
||||
let requests = open_regions
|
||||
.into_iter()
|
||||
.map(|open_region| {
|
||||
let OpenRegion {
|
||||
region_ident,
|
||||
region_storage_path,
|
||||
mut region_options,
|
||||
region_wal_options,
|
||||
skip_wal_replay,
|
||||
} = open_region;
|
||||
let region_id = RegionId::new(region_ident.table_id, region_ident.region_number);
|
||||
prepare_wal_options(&mut region_options, region_id, ®ion_wal_options);
|
||||
let request = RegionOpenRequest {
|
||||
engine: region_ident.engine,
|
||||
table_dir: table_dir(®ion_storage_path, region_id.table_id()),
|
||||
path_type: PathType::Bare,
|
||||
options: region_options,
|
||||
skip_wal_replay,
|
||||
checkpoint: None,
|
||||
};
|
||||
(region_id, request)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let result = ctx
|
||||
.region_server
|
||||
.handle_batch_open_requests(self.open_region_parallelism, requests, false)
|
||||
.await;
|
||||
let success = result.is_ok();
|
||||
let error = result.as_ref().map_err(|e| format!("{e:?}")).err();
|
||||
|
||||
Some(InstructionReply::OpenRegions(SimpleReply {
|
||||
result: success,
|
||||
error,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,18 +12,24 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
|
||||
use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply};
|
||||
use common_telemetry::{info, warn};
|
||||
use futures_util::future::BoxFuture;
|
||||
use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint};
|
||||
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
use crate::heartbeat::task_tracker::WaitResult;
|
||||
|
||||
impl HandlerContext {
|
||||
pub(crate) fn handle_upgrade_region_instruction(
|
||||
self,
|
||||
UpgradeRegion {
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct UpgradeRegionsHandler;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl InstructionHandler for UpgradeRegionsHandler {
|
||||
async fn handle(
|
||||
&self,
|
||||
ctx: &HandlerContext,
|
||||
instruction: Instruction,
|
||||
) -> Option<InstructionReply> {
|
||||
let UpgradeRegion {
|
||||
region_id,
|
||||
last_entry_id,
|
||||
metadata_last_entry_id,
|
||||
@@ -31,116 +37,116 @@ impl HandlerContext {
|
||||
location_id,
|
||||
replay_entry_id,
|
||||
metadata_replay_entry_id,
|
||||
}: UpgradeRegion,
|
||||
) -> BoxFuture<'static, Option<InstructionReply>> {
|
||||
Box::pin(async move {
|
||||
let Some(writable) = self.region_server.is_region_leader(region_id) else {
|
||||
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: false,
|
||||
error: None,
|
||||
}));
|
||||
};
|
||||
} = instruction.into_upgrade_regions().unwrap();
|
||||
|
||||
if writable {
|
||||
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
|
||||
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: false,
|
||||
error: None,
|
||||
}));
|
||||
};
|
||||
|
||||
if writable {
|
||||
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: true,
|
||||
exists: true,
|
||||
error: None,
|
||||
}));
|
||||
}
|
||||
|
||||
let region_server_moved = ctx.region_server.clone();
|
||||
|
||||
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
|
||||
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
|
||||
entry_id,
|
||||
metadata_entry_id,
|
||||
}),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// The catchup task is almost zero cost if the underlying region is writable.
|
||||
// Therefore, it always registers a new catchup task.
|
||||
let register_result = ctx
|
||||
.catchup_tasks
|
||||
.try_register(
|
||||
region_id,
|
||||
Box::pin(async move {
|
||||
info!(
|
||||
"Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"
|
||||
);
|
||||
region_server_moved
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: last_entry_id,
|
||||
metadata_entry_id: metadata_last_entry_id,
|
||||
location_id,
|
||||
checkpoint,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
|
||||
if register_result.is_busy() {
|
||||
warn!("Another catchup task is running for the region: {region_id}");
|
||||
}
|
||||
|
||||
// Returns immediately
|
||||
let Some(replay_timeout) = replay_timeout else {
|
||||
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: None,
|
||||
}));
|
||||
};
|
||||
|
||||
// We don't care whether it returns a newly registered task or an already-running one.
|
||||
let mut watcher = register_result.into_watcher();
|
||||
let result = ctx.catchup_tasks.wait(&mut watcher, replay_timeout).await;
|
||||
|
||||
match result {
|
||||
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: None,
|
||||
})),
|
||||
WaitResult::Finish(Ok(_)) => {
|
||||
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: true,
|
||||
exists: true,
|
||||
error: None,
|
||||
}));
|
||||
}))
|
||||
}
|
||||
|
||||
let region_server_moved = self.region_server.clone();
|
||||
|
||||
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
|
||||
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
|
||||
entry_id,
|
||||
metadata_entry_id,
|
||||
}),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// The catchup task is almost zero cost if the underlying region is writable.
|
||||
// Therefore, it always registers a new catchup task.
|
||||
let register_result = self
|
||||
.catchup_tasks
|
||||
.try_register(
|
||||
region_id,
|
||||
Box::pin(async move {
|
||||
info!("Executing region: {region_id} catchup to: last entry id {last_entry_id:?}");
|
||||
region_server_moved
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::Catchup(RegionCatchupRequest {
|
||||
set_writable: true,
|
||||
entry_id: last_entry_id,
|
||||
metadata_entry_id: metadata_last_entry_id,
|
||||
location_id,
|
||||
checkpoint,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
|
||||
if register_result.is_busy() {
|
||||
warn!("Another catchup task is running for the region: {region_id}");
|
||||
}
|
||||
|
||||
// Returns immediately
|
||||
let Some(replay_timeout) = replay_timeout else {
|
||||
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
WaitResult::Finish(Err(err)) => {
|
||||
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: None,
|
||||
}));
|
||||
};
|
||||
|
||||
// We don't care whether it returns a newly registered task or an already-running one.
|
||||
let mut watcher = register_result.into_watcher();
|
||||
let result = self.catchup_tasks.wait(&mut watcher, replay_timeout).await;
|
||||
|
||||
match result {
|
||||
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: None,
|
||||
})),
|
||||
WaitResult::Finish(Ok(_)) => {
|
||||
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: true,
|
||||
exists: true,
|
||||
error: None,
|
||||
}))
|
||||
}
|
||||
WaitResult::Finish(Err(err)) => {
|
||||
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}))
|
||||
}
|
||||
error: Some(format!("{err:?}")),
|
||||
}))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::assert_matches::assert_matches;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::instruction::{InstructionReply, UpgradeRegion};
|
||||
use common_meta::instruction::{Instruction, UpgradeRegion};
|
||||
use mito2::engine::MITO_ENGINE_NAME;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::error;
|
||||
use crate::heartbeat::handler::HandlerContext;
|
||||
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
use crate::tests::{MockRegionEngine, mock_region_server};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -155,20 +161,20 @@ mod tests {
|
||||
let waits = vec![None, Some(Duration::from_millis(100u64))];
|
||||
|
||||
for replay_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
})
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -192,21 +198,21 @@ mod tests {
|
||||
let waits = vec![None, Some(Duration::from_millis(100u64))];
|
||||
|
||||
for replay_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
})
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -230,21 +236,21 @@ mod tests {
|
||||
let waits = vec![None, Some(Duration::from_millis(100u64))];
|
||||
|
||||
for replay_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
})
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,40 +277,41 @@ mod tests {
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
|
||||
for replay_timeout in waits {
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
})
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
let timer = Instant::now();
|
||||
let reply = handler_context
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout: Some(Duration::from_millis(500)),
|
||||
..Default::default()
|
||||
})
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
// Must be less than 300 ms.
|
||||
assert!(timer.elapsed().as_millis() < 300);
|
||||
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(reply.ready);
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
|
||||
let timer = Instant::now();
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout: Some(Duration::from_millis(500)),
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
// Must be less than 300 ms.
|
||||
assert!(timer.elapsed().as_millis() < 300);
|
||||
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -329,37 +336,37 @@ mod tests {
|
||||
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
..Default::default()
|
||||
})
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
// It didn't wait for the handle to return, so it had no idea about the error.
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
}
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
|
||||
let reply = handler_context
|
||||
.clone()
|
||||
.handle_upgrade_region_instruction(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout: Some(Duration::from_millis(200)),
|
||||
..Default::default()
|
||||
})
|
||||
let reply = UpgradeRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
Instruction::UpgradeRegion(UpgradeRegion {
|
||||
region_id,
|
||||
replay_timeout: Some(Duration::from_millis(200)),
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
|
||||
|
||||
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_some());
|
||||
assert!(reply.error.unwrap().contains("mock_error"));
|
||||
}
|
||||
let reply = reply.unwrap().expect_upgrade_region_reply();
|
||||
assert!(!reply.ready);
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_some());
|
||||
assert!(reply.error.unwrap().contains("mock_error"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,7 @@ use std::sync::Arc;
|
||||
|
||||
use common_base::bytes::StringBytes;
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value as Json};
|
||||
use snafu::{ResultExt, ensure};
|
||||
|
||||
@@ -45,7 +46,7 @@ use crate::value::{ListValue, StructValue, Value};
|
||||
/// convert them to fully structured StructValue for user-facing APIs: the UI protocol and the UDF interface.
|
||||
///
|
||||
/// **Important**: This setting only controls the internal form of JSON encoding.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum JsonStructureSettings {
|
||||
// TODO(sunng87): provide a limit
|
||||
Structured(Option<StructType>),
|
||||
@@ -111,6 +112,12 @@ impl JsonStructureSettings {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for JsonStructureSettings {
|
||||
fn default() -> Self {
|
||||
Self::Structured(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> JsonContext<'a> {
|
||||
/// Create a new context with an updated key path
|
||||
pub fn with_key(&self, key: &str) -> JsonContext<'a> {
|
||||
|
||||
@@ -32,8 +32,9 @@ pub use crate::schema::column_schema::{
|
||||
COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY,
|
||||
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
|
||||
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
|
||||
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
|
||||
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
|
||||
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY,
|
||||
JSON_STRUCTURE_SETTINGS_KEY, Metadata, SKIPPING_INDEX_KEY, SkippingIndexOptions,
|
||||
SkippingIndexType, TIME_INDEX_KEY,
|
||||
};
|
||||
pub use crate::schema::constraint::ColumnDefaultConstraint;
|
||||
pub use crate::schema::raw::RawSchema;
|
||||
@@ -368,8 +369,7 @@ impl TryFrom<DFSchemaRef> for Schema {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(value: DFSchemaRef) -> Result<Self> {
|
||||
let s: ArrowSchema = value.as_ref().into();
|
||||
s.try_into()
|
||||
value.inner().clone().try_into()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@ use sqlparser_derive::{Visit, VisitMut};
|
||||
|
||||
use crate::data_type::{ConcreteDataType, DataType};
|
||||
use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result};
|
||||
use crate::json::JsonStructureSettings;
|
||||
use crate::schema::TYPE_KEY;
|
||||
use crate::schema::constraint::ColumnDefaultConstraint;
|
||||
use crate::value::Value;
|
||||
@@ -41,6 +42,7 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
|
||||
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
|
||||
/// Key used to store skip options in arrow field's metadata.
|
||||
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
|
||||
pub const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings";
|
||||
|
||||
/// Keys used in fulltext options
|
||||
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
|
||||
@@ -391,6 +393,21 @@ impl ColumnSchema {
|
||||
self.metadata.remove(SKIPPING_INDEX_KEY);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
|
||||
self.metadata
|
||||
.get(JSON_STRUCTURE_SETTINGS_KEY)
|
||||
.map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json }))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn with_json_structure_settings(&mut self, settings: &JsonStructureSettings) -> Result<()> {
|
||||
self.metadata.insert(
|
||||
JSON_STRUCTURE_SETTINGS_KEY.to_string(),
|
||||
serde_json::to_string(settings).context(error::SerializeSnafu)?,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Column extended type set in column schema's metadata.
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use arrow::datatypes::DataType as ArrowDataType;
|
||||
use arrow_schema::Fields;
|
||||
use common_base::bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::ResultExt;
|
||||
@@ -63,7 +64,10 @@ impl DataType for JsonType {
|
||||
}
|
||||
|
||||
fn as_arrow_type(&self) -> ArrowDataType {
|
||||
ArrowDataType::Binary
|
||||
match self.format {
|
||||
JsonFormat::Jsonb => ArrowDataType::Binary,
|
||||
JsonFormat::Native(_) => ArrowDataType::Struct(Fields::empty()),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
|
||||
|
||||
@@ -1208,7 +1208,9 @@ impl TryFrom<ScalarValue> for Value {
|
||||
.collect::<Result<Vec<Value>>>()?;
|
||||
Value::Struct(StructValue::try_new(items, struct_type)?)
|
||||
}
|
||||
ScalarValue::Decimal256(_, _, _)
|
||||
ScalarValue::Decimal32(_, _, _)
|
||||
| ScalarValue::Decimal64(_, _, _)
|
||||
| ScalarValue::Decimal256(_, _, _)
|
||||
| ScalarValue::FixedSizeList(_)
|
||||
| ScalarValue::LargeList(_)
|
||||
| ScalarValue::Dictionary(_, _)
|
||||
|
||||
@@ -245,7 +245,9 @@ impl Helper {
|
||||
length,
|
||||
)
|
||||
}
|
||||
ScalarValue::Decimal256(_, _, _)
|
||||
ScalarValue::Decimal32(_, _, _)
|
||||
| ScalarValue::Decimal64(_, _, _)
|
||||
| ScalarValue::Decimal256(_, _, _)
|
||||
| ScalarValue::FixedSizeList(_)
|
||||
| ScalarValue::LargeList(_)
|
||||
| ScalarValue::Dictionary(_, _)
|
||||
|
||||
@@ -427,7 +427,7 @@ fn expand_tumble_analyzer(
|
||||
|
||||
/// This is a placeholder for the tumble_start and tumble_end functions, so that datafusion can
|
||||
/// recognize them as scalar functions
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq, Eq, Hash)]
|
||||
pub struct TumbleExpand {
|
||||
signature: Signature,
|
||||
name: String,
|
||||
|
||||
@@ -18,7 +18,6 @@ use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use api::v1::meta::{HeartbeatRequest, Peer};
|
||||
use common_config::utils::ResourceSpec;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::heartbeat::handler::{
|
||||
HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
|
||||
@@ -26,6 +25,7 @@ use common_meta::heartbeat::handler::{
|
||||
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
|
||||
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
|
||||
use common_meta::key::flow::flow_state::FlowStat;
|
||||
use common_stat::ResourceStatRef;
|
||||
use common_telemetry::{debug, error, info, warn};
|
||||
use greptime_proto::v1::meta::NodeInfo;
|
||||
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
|
||||
@@ -69,7 +69,7 @@ pub struct HeartbeatTask {
|
||||
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
|
||||
running: Arc<AtomicBool>,
|
||||
query_stat_size: Option<SizeReportSender>,
|
||||
resource_spec: ResourceSpec,
|
||||
resource_stat: ResourceStatRef,
|
||||
}
|
||||
|
||||
impl HeartbeatTask {
|
||||
@@ -77,11 +77,13 @@ impl HeartbeatTask {
|
||||
self.query_stat_size = Some(query_stat_size);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
opts: &FlownodeOptions,
|
||||
meta_client: Arc<MetaClient>,
|
||||
heartbeat_opts: HeartbeatOptions,
|
||||
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
|
||||
resource_stat: ResourceStatRef,
|
||||
) -> Self {
|
||||
Self {
|
||||
node_id: opts.node_id.unwrap_or(0),
|
||||
@@ -93,7 +95,7 @@ impl HeartbeatTask {
|
||||
resp_handler_executor,
|
||||
running: Arc::new(AtomicBool::new(false)),
|
||||
query_stat_size: None,
|
||||
resource_spec: Default::default(),
|
||||
resource_stat,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -146,6 +148,8 @@ impl HeartbeatTask {
|
||||
heartbeat_request: &HeartbeatRequest,
|
||||
message: Option<OutgoingMessage>,
|
||||
latest_report: &Option<FlowStat>,
|
||||
cpu_usage: i64,
|
||||
memory_usage: i64,
|
||||
) -> Option<HeartbeatRequest> {
|
||||
let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
|
||||
Some(Ok(message)) => Some(message),
|
||||
@@ -170,21 +174,38 @@ impl HeartbeatTask {
|
||||
.collect(),
|
||||
});
|
||||
|
||||
Some(HeartbeatRequest {
|
||||
let mut heartbeat_request = HeartbeatRequest {
|
||||
mailbox_message,
|
||||
flow_stat,
|
||||
..heartbeat_request.clone()
|
||||
})
|
||||
};
|
||||
|
||||
if let Some(info) = heartbeat_request.info.as_mut() {
|
||||
info.cpu_usage_millicores = cpu_usage;
|
||||
info.memory_usage_bytes = memory_usage;
|
||||
}
|
||||
|
||||
Some(heartbeat_request)
|
||||
}
|
||||
|
||||
fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option<NodeInfo> {
|
||||
#[allow(deprecated)]
|
||||
fn build_node_info(
|
||||
start_time_ms: u64,
|
||||
total_cpu_millicores: i64,
|
||||
total_memory_bytes: i64,
|
||||
) -> Option<NodeInfo> {
|
||||
let build_info = common_version::build_info();
|
||||
Some(NodeInfo {
|
||||
version: build_info.version.to_string(),
|
||||
git_commit: build_info.commit_short.to_string(),
|
||||
start_time_ms,
|
||||
cpus,
|
||||
memory_bytes,
|
||||
total_cpu_millicores,
|
||||
total_memory_bytes,
|
||||
cpu_usage_millicores: 0,
|
||||
memory_usage_bytes: 0,
|
||||
// TODO(zyy17): Remove these once the deprecated fields are removed from the proto.
|
||||
cpus: total_cpu_millicores as u32,
|
||||
memory_bytes: total_memory_bytes as u64,
|
||||
hostname: hostname::get()
|
||||
.unwrap_or_default()
|
||||
.to_string_lossy()
|
||||
@@ -203,9 +224,9 @@ impl HeartbeatTask {
|
||||
id: self.node_id,
|
||||
addr: self.peer_addr.clone(),
|
||||
});
|
||||
let cpus = self.resource_spec.cpus as u32;
|
||||
let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
|
||||
|
||||
let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
|
||||
let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
|
||||
let resource_stat = self.resource_stat.clone();
|
||||
let query_stat_size = self.query_stat_size.clone();
|
||||
|
||||
common_runtime::spawn_hb(async move {
|
||||
@@ -218,7 +239,7 @@ impl HeartbeatTask {
|
||||
let heartbeat_request = HeartbeatRequest {
|
||||
peer: self_peer,
|
||||
node_epoch,
|
||||
info: Self::build_node_info(node_epoch, cpus, memory_bytes),
|
||||
info: Self::build_node_info(node_epoch, total_cpu_millicores, total_memory_bytes),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
@@ -226,7 +247,7 @@ impl HeartbeatTask {
|
||||
let req = tokio::select! {
|
||||
message = outgoing_rx.recv() => {
|
||||
if let Some(message) = message {
|
||||
Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report)
|
||||
Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report, 0, 0)
|
||||
} else {
|
||||
warn!("Sender has been dropped, exiting the heartbeat loop");
|
||||
// Receiving None means the Sender was dropped; we need to break out of the current loop
|
||||
@@ -234,7 +255,7 @@ impl HeartbeatTask {
|
||||
}
|
||||
}
|
||||
_ = interval.tick() => {
|
||||
Self::new_heartbeat_request(&heartbeat_request, None, &latest_report)
|
||||
Self::new_heartbeat_request(&heartbeat_request, None, &latest_report, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -490,6 +490,7 @@ impl<'a> FlownodeServiceBuilder<'a> {
|
||||
let config = GrpcServerConfig {
|
||||
max_recv_message_size: opts.grpc.max_recv_message_size.as_bytes() as usize,
|
||||
max_send_message_size: opts.grpc.max_send_message_size.as_bytes() as usize,
|
||||
max_total_message_memory: opts.grpc.max_total_message_memory.as_bytes() as usize,
|
||||
tls: opts.grpc.tls.clone(),
|
||||
max_connection_age: opts.grpc.max_connection_age,
|
||||
};
|
||||
|
||||
@@ -37,6 +37,7 @@ common-procedure.workspace = true
|
||||
common-query.workspace = true
|
||||
common-recordbatch.workspace = true
|
||||
common-runtime.workspace = true
|
||||
common-stat.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
common-time.workspace = true
|
||||
common-version.workspace = true
|
||||
|
||||
@@ -18,12 +18,12 @@ mod tests;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::meta::{HeartbeatRequest, NodeInfo, Peer};
|
||||
use common_config::utils::ResourceSpec;
|
||||
use common_meta::heartbeat::handler::{
|
||||
HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
|
||||
};
|
||||
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
|
||||
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
|
||||
use common_stat::ResourceStatRef;
|
||||
use common_telemetry::{debug, error, info, warn};
|
||||
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
|
||||
use servers::addrs;
|
||||
@@ -47,7 +47,7 @@ pub struct HeartbeatTask {
|
||||
retry_interval: Duration,
|
||||
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
|
||||
start_time_ms: u64,
|
||||
resource_spec: ResourceSpec,
|
||||
resource_stat: ResourceStatRef,
|
||||
}
|
||||
|
||||
impl HeartbeatTask {
|
||||
@@ -56,6 +56,7 @@ impl HeartbeatTask {
|
||||
meta_client: Arc<MetaClient>,
|
||||
heartbeat_opts: HeartbeatOptions,
|
||||
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
|
||||
resource_stat: ResourceStatRef,
|
||||
) -> Self {
|
||||
HeartbeatTask {
|
||||
// if internal grpc is configured, use its address as the peer address
|
||||
@@ -71,7 +72,7 @@ impl HeartbeatTask {
|
||||
retry_interval: heartbeat_opts.retry_interval,
|
||||
resp_handler_executor,
|
||||
start_time_ms: common_time::util::current_time_millis() as u64,
|
||||
resource_spec: Default::default(),
|
||||
resource_stat,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,6 +134,8 @@ impl HeartbeatTask {
|
||||
fn new_heartbeat_request(
|
||||
heartbeat_request: &HeartbeatRequest,
|
||||
message: Option<OutgoingMessage>,
|
||||
cpu_usage: i64,
|
||||
memory_usage: i64,
|
||||
) -> Option<HeartbeatRequest> {
|
||||
let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
|
||||
Some(Ok(message)) => Some(message),
|
||||
@@ -143,21 +146,38 @@ impl HeartbeatTask {
|
||||
None => None,
|
||||
};
|
||||
|
||||
Some(HeartbeatRequest {
|
||||
let mut heartbeat_request = HeartbeatRequest {
|
||||
mailbox_message,
|
||||
..heartbeat_request.clone()
|
||||
})
|
||||
};
|
||||
|
||||
if let Some(info) = heartbeat_request.info.as_mut() {
|
||||
info.memory_usage_bytes = memory_usage;
|
||||
info.cpu_usage_millicores = cpu_usage;
|
||||
}
|
||||
|
||||
Some(heartbeat_request)
|
||||
}
|
||||
|
||||
fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option<NodeInfo> {
|
||||
#[allow(deprecated)]
|
||||
fn build_node_info(
|
||||
start_time_ms: u64,
|
||||
total_cpu_millicores: i64,
|
||||
total_memory_bytes: i64,
|
||||
) -> Option<NodeInfo> {
|
||||
let build_info = common_version::build_info();
|
||||
|
||||
Some(NodeInfo {
|
||||
version: build_info.version.to_string(),
|
||||
git_commit: build_info.commit_short.to_string(),
|
||||
start_time_ms,
|
||||
cpus,
|
||||
memory_bytes,
|
||||
total_cpu_millicores,
|
||||
total_memory_bytes,
|
||||
cpu_usage_millicores: 0,
|
||||
memory_usage_bytes: 0,
|
||||
// TODO(zyy17): Remove these once the deprecated fields are removed from the proto.
|
||||
cpus: total_cpu_millicores as u32,
|
||||
memory_bytes: total_memory_bytes as u64,
|
||||
hostname: hostname::get()
|
||||
.unwrap_or_default()
|
||||
.to_string_lossy()
|
||||
@@ -177,16 +197,20 @@ impl HeartbeatTask {
|
||||
id: 0,
|
||||
addr: self.peer_addr.clone(),
|
||||
});
|
||||
let cpus = self.resource_spec.cpus as u32;
|
||||
let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
|
||||
|
||||
let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
|
||||
let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
|
||||
let resource_stat = self.resource_stat.clone();
|
||||
common_runtime::spawn_hb(async move {
|
||||
let sleep = tokio::time::sleep(Duration::from_millis(0));
|
||||
tokio::pin!(sleep);
|
||||
|
||||
let heartbeat_request = HeartbeatRequest {
|
||||
peer: self_peer,
|
||||
info: Self::build_node_info(start_time_ms, cpus, memory_bytes),
|
||||
info: Self::build_node_info(
|
||||
start_time_ms,
|
||||
total_cpu_millicores,
|
||||
total_memory_bytes,
|
||||
),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
@@ -194,7 +218,7 @@ impl HeartbeatTask {
|
||||
let req = tokio::select! {
|
||||
message = outgoing_rx.recv() => {
|
||||
if let Some(message) = message {
|
||||
Self::new_heartbeat_request(&heartbeat_request, Some(message))
|
||||
Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0)
|
||||
} else {
|
||||
warn!("Sender has been dropped, exiting the heartbeat loop");
|
||||
// Receiving None means the Sender was dropped; we need to break out of the current loop
|
||||
@@ -202,8 +226,8 @@ impl HeartbeatTask {
|
||||
}
|
||||
}
|
||||
_ = &mut sleep => {
|
||||
sleep.as_mut().reset(Instant::now() + report_interval);
|
||||
Self::new_heartbeat_request(&heartbeat_request, None)
|
||||
sleep.as_mut().reset(Instant::now() + report_interval);
|
||||
Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -24,7 +24,9 @@ mod util;
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role};
|
||||
use api::v1::meta::{
|
||||
MetasrvNodeInfo, ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role,
|
||||
};
|
||||
pub use ask_leader::{AskLeader, LeaderProvider, LeaderProviderRef};
|
||||
use cluster::Client as ClusterClient;
|
||||
pub use cluster::ClusterKvBackend;
|
||||
@@ -371,7 +373,8 @@ impl ClusterInfo for MetaClient {
|
||||
let mut nodes = if get_metasrv_nodes {
|
||||
let last_activity_ts = -1; // Metasrv does not provide this information.
|
||||
|
||||
let (leader, followers) = cluster_client.get_metasrv_peers().await?;
|
||||
let (leader, followers): (Option<MetasrvNodeInfo>, Vec<MetasrvNodeInfo>) =
|
||||
cluster_client.get_metasrv_peers().await?;
|
||||
followers
|
||||
.into_iter()
|
||||
.map(|node| {
|
||||
@@ -383,8 +386,10 @@ impl ClusterInfo for MetaClient {
|
||||
version: node_info.version,
|
||||
git_commit: node_info.git_commit,
|
||||
start_time_ms: node_info.start_time_ms,
|
||||
cpus: node_info.cpus,
|
||||
memory_bytes: node_info.memory_bytes,
|
||||
total_cpu_millicores: node_info.total_cpu_millicores,
|
||||
total_memory_bytes: node_info.total_memory_bytes,
|
||||
cpu_usage_millicores: node_info.cpu_usage_millicores,
|
||||
memory_usage_bytes: node_info.memory_usage_bytes,
|
||||
hostname: node_info.hostname,
|
||||
}
|
||||
} else {
|
||||
@@ -396,8 +401,10 @@ impl ClusterInfo for MetaClient {
|
||||
version: node.version,
|
||||
git_commit: node.git_commit,
|
||||
start_time_ms: node.start_time_ms,
|
||||
cpus: node.cpus,
|
||||
memory_bytes: node.memory_bytes,
|
||||
total_cpu_millicores: node.cpus as i64,
|
||||
total_memory_bytes: node.memory_bytes as i64,
|
||||
cpu_usage_millicores: 0,
|
||||
memory_usage_bytes: 0,
|
||||
hostname: "".to_string(),
|
||||
}
|
||||
}
|
||||
@@ -411,8 +418,10 @@ impl ClusterInfo for MetaClient {
|
||||
version: node_info.version,
|
||||
git_commit: node_info.git_commit,
|
||||
start_time_ms: node_info.start_time_ms,
|
||||
cpus: node_info.cpus,
|
||||
memory_bytes: node_info.memory_bytes,
|
||||
total_cpu_millicores: node_info.total_cpu_millicores,
|
||||
total_memory_bytes: node_info.total_memory_bytes,
|
||||
cpu_usage_millicores: node_info.cpu_usage_millicores,
|
||||
memory_usage_bytes: node_info.memory_usage_bytes,
|
||||
hostname: node_info.hostname,
|
||||
}
|
||||
} else {
|
||||
@@ -424,8 +433,10 @@ impl ClusterInfo for MetaClient {
|
||||
version: node.version,
|
||||
git_commit: node.git_commit,
|
||||
start_time_ms: node.start_time_ms,
|
||||
cpus: node.cpus,
|
||||
memory_bytes: node.memory_bytes,
|
||||
total_cpu_millicores: node.cpus as i64,
|
||||
total_memory_bytes: node.memory_bytes as i64,
|
||||
cpu_usage_millicores: 0,
|
||||
memory_usage_bytes: 0,
|
||||
hostname: "".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ common-meta.workspace = true
|
||||
common-options.workspace = true
|
||||
common-procedure.workspace = true
|
||||
common-runtime.workspace = true
|
||||
common-stat.workspace = true
|
||||
common-telemetry.workspace = true
|
||||
common-time.workspace = true
|
||||
common-version.workspace = true
|
||||
|
||||
@@ -243,8 +243,10 @@ mod tests {
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: current_time_millis() as u64,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};

@@ -269,8 +271,10 @@ mod tests {
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: current_time_millis() as u64,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};

@@ -307,8 +311,10 @@ mod tests {
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: last_activity_ts as u64,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};

@@ -1161,8 +1161,10 @@ mod tests {
version: "test_version".to_string(),
git_commit: "test_git_commit".to_string(),
start_time_ms: 0,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
mysql_election.register_candidate(&node_info).await.unwrap();

@@ -1000,8 +1000,10 @@ mod tests {
version: "test_version".to_string(),
git_commit: "test_git_commit".to_string(),
start_time_ms: 0,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
pg_election.register_candidate(&node_info).await.unwrap();

@@ -52,8 +52,10 @@ impl HeartbeatHandler for CollectFrontendClusterInfoHandler {
version: info.version,
git_commit: info.git_commit,
start_time_ms: info.start_time_ms,
cpus: info.cpus,
memory_bytes: info.memory_bytes,
total_cpu_millicores: info.total_cpu_millicores,
total_memory_bytes: info.total_memory_bytes,
cpu_usage_millicores: info.cpu_usage_millicores,
memory_usage_bytes: info.memory_usage_bytes,
hostname: info.hostname,
};

@@ -88,8 +90,10 @@ impl HeartbeatHandler for CollectFlownodeClusterInfoHandler {
version: info.version,
git_commit: info.git_commit,
start_time_ms: info.start_time_ms,
cpus: info.cpus,
memory_bytes: info.memory_bytes,
total_cpu_millicores: info.total_cpu_millicores,
total_memory_bytes: info.total_memory_bytes,
cpu_usage_millicores: info.cpu_usage_millicores,
memory_usage_bytes: info.memory_usage_bytes,
hostname: info.hostname,
};

@@ -142,8 +146,10 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler {
version: info.version,
git_commit: info.git_commit,
start_time_ms: info.start_time_ms,
cpus: info.cpus,
memory_bytes: info.memory_bytes,
total_cpu_millicores: info.total_cpu_millicores,
total_memory_bytes: info.total_memory_bytes,
cpu_usage_millicores: info.cpu_usage_millicores,
memory_usage_bytes: info.memory_usage_bytes,
hostname: info.hostname,
};

@@ -22,7 +22,6 @@ use std::time::Duration;
use clap::ValueEnum;
use common_base::Plugins;
use common_base::readable_size::ReadableSize;
use common_config::utils::ResourceSpec;
use common_config::{Configurable, DEFAULT_DATA_HOME};
use common_event_recorder::EventRecorderOptions;
use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
@@ -47,6 +46,7 @@ use common_options::datanode::DatanodeClientOptions;
use common_options::memory::MemoryOptions;
use common_procedure::ProcedureManagerRef;
use common_procedure::options::ProcedureConfig;
use common_stat::ResourceStatRef;
use common_telemetry::logging::{LoggingOptions, TracingOptions};
use common_telemetry::{error, info, warn};
use common_wal::config::MetasrvWalConfig;
@@ -372,12 +372,16 @@ pub struct MetasrvNodeInfo {
pub git_commit: String,
// The node start timestamp in milliseconds
pub start_time_ms: u64,
// The node cpus
// The node total cpu millicores
#[serde(default)]
pub cpus: u32,
// The node memory bytes
pub total_cpu_millicores: i64,
#[serde(default)]
pub memory_bytes: u64,
// The node total memory bytes
pub total_memory_bytes: i64,
/// The node build cpu usage millicores
pub cpu_usage_millicores: i64,
/// The node build memory usage bytes
pub memory_usage_bytes: i64,
// The node hostname
#[serde(default)]
pub hostname: String,
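In the struct above, the legacy cpus, memory_bytes, and hostname fields carry #[serde(default)], so a serialized MetasrvNodeInfo that omits them still deserializes. A small illustration of that attribute's effect on a toy struct (assuming serde and serde_json are available; this is not the real type):

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct ToyNodeInfo {
    version: String,
    // Mirrors the hunk above: the legacy field may be missing and defaults to 0.
    #[serde(default)]
    cpus: u32,
    total_cpu_millicores: i64,
}

fn main() {
    // A payload without the legacy `cpus` field still parses.
    let v: ToyNodeInfo =
        serde_json::from_str(r#"{"version":"0.10.0","total_cpu_millicores":8000}"#).unwrap();
    assert_eq!(v.cpus, 0);
}
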
@@ -397,15 +401,19 @@ impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
version: node_info.version.clone(),
git_commit: node_info.git_commit.clone(),
start_time_ms: node_info.start_time_ms,
cpus: node_info.cpus,
memory_bytes: node_info.memory_bytes,
cpus: node_info.total_cpu_millicores as u32,
memory_bytes: node_info.total_memory_bytes as u64,
// The canonical location for node information.
info: Some(api::v1::meta::NodeInfo {
version: node_info.version,
git_commit: node_info.git_commit,
start_time_ms: node_info.start_time_ms,
cpus: node_info.cpus,
memory_bytes: node_info.memory_bytes,
total_cpu_millicores: node_info.total_cpu_millicores,
total_memory_bytes: node_info.total_memory_bytes,
cpu_usage_millicores: node_info.cpu_usage_millicores,
memory_usage_bytes: node_info.memory_usage_bytes,
cpus: node_info.total_cpu_millicores as u32,
memory_bytes: node_info.total_memory_bytes as u64,
hostname: node_info.hostname,
}),
}
@@ -517,7 +525,7 @@ pub struct Metasrv {
region_flush_ticker: Option<RegionFlushTickerRef>,
table_id_sequence: SequenceRef,
reconciliation_manager: ReconciliationManagerRef,
resource_spec: ResourceSpec,
resource_stat: ResourceStatRef,

plugins: Plugins,
}
@@ -699,8 +707,8 @@ impl Metasrv {
self.start_time_ms
}

pub fn resource_spec(&self) -> &ResourceSpec {
&self.resource_spec
pub fn resource_stat(&self) -> &ResourceStatRef {
&self.resource_stat
}

pub fn node_info(&self) -> MetasrvNodeInfo {
@@ -710,8 +718,10 @@ impl Metasrv {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms: self.start_time_ms(),
cpus: self.resource_spec().cpus as u32,
memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(),
total_cpu_millicores: self.resource_stat.get_total_cpu_millicores(),
total_memory_bytes: self.resource_stat.get_total_memory_bytes(),
cpu_usage_millicores: self.resource_stat.get_cpu_usage_millicores(),
memory_usage_bytes: self.resource_stat.get_memory_usage_bytes(),
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()

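The rewritten node_info above fills the new fields from a shared ResourceStatRef instead of the static ResourceSpec. A rough sketch of the shape this relies on (trait and method names taken from the calls above, everything else simplified):

use std::sync::Arc;

// Simplified view of what the calls above expect from common_stat.
trait ResourceStat: Send + Sync {
    fn get_total_cpu_millicores(&self) -> i64;
    fn get_total_memory_bytes(&self) -> i64;
    fn get_cpu_usage_millicores(&self) -> i64;
    fn get_memory_usage_bytes(&self) -> i64;
}

type ResourceStatRef = Arc<dyn ResourceStat>;

// Hypothetical helper mirroring the hunk: snapshot the current resource numbers.
fn resource_snapshot(stat: &ResourceStatRef) -> (i64, i64, i64, i64) {
    (
        stat.get_total_cpu_millicores(),
        stat.get_total_memory_bytes(),
        stat.get_cpu_usage_millicores(),
        stat.get_memory_usage_bytes(),
    )
}
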
@@ -46,6 +46,7 @@ use common_meta::stats::topic::TopicStatsRegistry;
use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator};
use common_procedure::ProcedureManagerRef;
use common_procedure::local::{LocalManager, ManagerConfig};
use common_stat::ResourceStatImpl;
use common_telemetry::{info, warn};
use snafu::{ResultExt, ensure};
use store_api::storage::MAX_REGION_SEQ;
@@ -517,6 +518,9 @@ impl MetasrvBuilder {
.try_start()
.context(error::InitReconciliationManagerSnafu)?;

let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();

Ok(Metasrv {
state,
started: Arc::new(AtomicBool::new(false)),
@@ -556,7 +560,7 @@ impl MetasrvBuilder {
table_id_sequence,
reconciliation_manager,
topic_stats_registry,
resource_spec: Default::default(),
resource_stat: Arc::new(resource_stat),
})
}
}

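The builder change above creates a ResourceStatImpl, kicks off CPU-usage sampling, and then hands the collector to Metasrv behind an Arc. A hedged sketch of that start-then-share pattern with a stand-in collector type (not the real common_stat implementation):

use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};

// Stand-in for common_stat::ResourceStatImpl; only the pattern matters here.
#[derive(Default)]
struct StatCollector {
    cpu_usage_millicores: AtomicI64,
}

impl StatCollector {
    // In the real builder this would start background CPU sampling.
    fn start_collect_cpu_usage(&mut self) {
        self.cpu_usage_millicores.store(0, Ordering::Relaxed);
    }
}

fn build_collector() -> Arc<StatCollector> {
    let mut resource_stat = StatCollector::default();
    resource_stat.start_collect_cpu_usage();
    // Shared with the server afterwards, mirroring `resource_stat: Arc::new(resource_stat)`.
    Arc::new(resource_stat)
}
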
@@ -19,7 +19,7 @@ use api::v1::meta::MailboxMessage;
use common_error::ext::BoxedError;
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
use common_meta::instruction::{
DowngradeRegion, DowngradeRegionReply, Instruction, InstructionReply,
DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply,
};
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::{error, info, warn};
@@ -120,10 +120,10 @@ impl DowngradeLeaderRegion {
) -> Instruction {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
Instruction::DowngradeRegion(DowngradeRegion {
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: Some(flush_timeout),
})
}])
}

/// Tries to downgrade a leader region.
@@ -173,12 +173,7 @@ impl DowngradeLeaderRegion {
region_id,
now.elapsed()
);
let InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
metadata_last_entry_id,
exists,
error,
}) = reply
let InstructionReply::DowngradeRegions(DowngradeRegionsReply { replies }) = reply
else {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),
@@ -187,6 +182,15 @@ impl DowngradeLeaderRegion {
.fail();
};

// TODO(weny): handle multiple replies.
let DowngradeRegionReply {
region_id,
last_entry_id,
metadata_last_entry_id,
exists,
error,
} = &replies[0];

if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
@@ -216,12 +220,12 @@ impl DowngradeLeaderRegion {
}

if let Some(last_entry_id) = last_entry_id {
ctx.volatile_ctx.set_last_entry_id(last_entry_id);
ctx.volatile_ctx.set_last_entry_id(*last_entry_id);
}

if let Some(metadata_last_entry_id) = metadata_last_entry_id {
ctx.volatile_ctx
.set_metadata_last_entry_id(metadata_last_entry_id);
.set_metadata_last_entry_id(*metadata_last_entry_id);
}

Ok(())

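The migration procedure now sends Instruction::DowngradeRegions with a vector of regions and expects a DowngradeRegionsReply, but for the moment only the first entry is consumed (see the TODO above). A simplified sketch of unpacking such a batched reply (stand-in types, not the real common_meta::instruction definitions):

// Stand-ins for the instruction types referenced above.
struct DowngradeRegionReply {
    region_id: u64,
    last_entry_id: Option<u64>,
    error: Option<String>,
}

struct DowngradeRegionsReply {
    replies: Vec<DowngradeRegionReply>,
}

// Mirrors the current single-region handling: inspect replies[0] and surface its error.
fn first_reply(reply: &DowngradeRegionsReply) -> Result<(u64, Option<u64>), String> {
    let first = reply.replies.first().ok_or_else(|| "empty reply".to_string())?;
    if let Some(err) = &first.error {
        return Err(format!("downgrade of region {} failed: {err}", first.region_id));
    }
    Ok((first.region_id, first.last_entry_id))
}
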
@@ -17,7 +17,8 @@ use std::collections::HashMap;
use api::v1::meta::mailbox_message::Payload;
use api::v1::meta::{HeartbeatResponse, MailboxMessage};
use common_meta::instruction::{
DowngradeRegionReply, FlushRegionReply, InstructionReply, SimpleReply, UpgradeRegionReply,
DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply,
UpgradeRegionReply,
};
use common_meta::key::TableMetadataManagerRef;
use common_meta::key::table_route::TableRouteValue;
@@ -183,12 +184,15 @@ pub fn new_downgrade_region_reply(
to: "meta".to_string(),
timestamp_millis: current_time_millis(),
payload: Some(Payload::Json(
serde_json::to_string(&InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
metadata_last_entry_id: None,
exists: exist,
error,
}))
serde_json::to_string(&InstructionReply::DowngradeRegions(
DowngradeRegionsReply::new(vec![DowngradeRegionReply {
region_id: RegionId::new(0, 0),
last_entry_id,
metadata_last_entry_id: None,
exists: exist,
error,
}]),
))
.unwrap(),
)),
}

@@ -97,8 +97,10 @@ impl Metasrv {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms: self.start_time_ms(),
cpus: self.resource_spec().cpus as u32,
memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(),
total_cpu_millicores: self.resource_stat().get_total_cpu_millicores(),
total_memory_bytes: self.resource_stat().get_total_memory_bytes(),
cpu_usage_millicores: self.resource_stat().get_cpu_usage_millicores(),
memory_usage_bytes: self.resource_stat().get_memory_usage_bytes(),
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()

@@ -127,12 +127,12 @@ mod tests {
|
||||
assert_eq!(
|
||||
debug_format,
|
||||
r#"
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
|
||||
);
|
||||
// list from storage
|
||||
let storage_entries = mito
|
||||
|
||||
@@ -65,7 +65,7 @@ partition.workspace = true
puffin.workspace = true
rand.workspace = true
rayon = "1.10"
regex = "1.5"
regex.workspace = true
rskafka = { workspace = true, optional = true }
rstest = { workspace = true, optional = true }
rstest_reuse = { workspace = true, optional = true }

@@ -433,6 +433,7 @@ impl Compactor for DefaultCompactor {
num_row_groups: sst_info.num_row_groups,
sequence: max_sequence,
partition_expr: partition_expr.clone(),
num_series: sst_info.num_series,
})
.collect::<Vec<_>>();
let output_file_names =

@@ -78,6 +78,7 @@ pub fn new_file_handle_with_size_and_sequence(
index_file_size: 0,
num_rows: 0,
num_row_groups: 0,
num_series: 0,
sequence: NonZeroU64::new(sequence),
partition_expr: None,
},

@@ -859,9 +859,9 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) {
|
||||
#[tokio::test]
|
||||
async fn test_list_ssts() {
|
||||
test_list_ssts_with_format(false, r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#"
|
||||
StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000002/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
@@ -869,9 +869,9 @@ StorageSstEntry { file_path: "test/11_0000000002/index/<file_id>.puffin", file_s
|
||||
StorageSstEntry { file_path: "test/22_0000000042/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/22_0000000042/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await;
|
||||
test_list_ssts_with_format(true, r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#"
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
|
||||
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#"
|
||||
StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
|
||||
StorageSstEntry { file_path: "test/11_0000000002/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
|
||||
@@ -972,17 +972,17 @@ async fn test_list_ssts_with_format(
|
||||
#[tokio::test]
|
||||
async fn test_all_index_metas_list_all_types() {
|
||||
test_all_index_metas_list_all_types_with_format(false, r#"
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "bloom_filter", target_type: "column", target_key: "3", target_json: "{\"column\":3}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "fulltext_bloom", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 87, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "fulltext_tantivy", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 1104, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "inverted", target_type: "column", target_key: "0", target_json: "{\"column\":0}", blob_size: 70, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":44,\"inverted_index_size\":70,\"null_bitmap_size\":8,\"relative_fst_offset\":26,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "inverted", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "bloom_filter", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_bloom", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_tantivy", target_type: "column", target_key: "5", target_json: "{\"column\":5}", blob_size: 1100, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 518, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":150,\"inverted_index_size\":518,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
|
||||
test_all_index_metas_list_all_types_with_format(true, r#"
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "bloom_filter", target_type: "column", target_key: "3", target_json: "{\"column\":3}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "fulltext_bloom", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "fulltext_tantivy", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 1104, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "inverted", target_type: "column", target_key: "0", target_json: "{\"column\":0}", blob_size: 92, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":66,\"inverted_index_size\":92,\"null_bitmap_size\":8,\"relative_fst_offset\":26,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "inverted", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "bloom_filter", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_bloom", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_tantivy", target_type: "column", target_key: "5", target_json: "{\"column\":5}", blob_size: 1100, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 518, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":150,\"inverted_index_size\":518,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
|
||||
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
|
||||
}
|
||||
|
||||
async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expect_format: &str) {
|
||||
@@ -1001,12 +1001,33 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
// One region with both fulltext backends and inverted index enabled, plus bloom skipping index
|
||||
let region_id = RegionId::new(11, 1);
|
||||
|
||||
let mut request = CreateRequestBuilder::new().tag_num(3).field_num(2).build();
|
||||
// inverted index on tag_0
|
||||
request.column_metadatas[0]
|
||||
let mut request = CreateRequestBuilder::new().tag_num(1).field_num(2).build();
|
||||
// bloom filter skipping index on field_1
|
||||
let skipping = SkippingIndexOptions::new_unchecked(2, 0.01, SkippingIndexType::BloomFilter);
|
||||
request.column_metadatas[1]
|
||||
.column_schema
|
||||
.set_skipping_options(&skipping)
|
||||
.unwrap();
|
||||
|
||||
// inverted index on field_1
|
||||
request.column_metadatas[2]
|
||||
.column_schema
|
||||
.set_inverted_index(true);
|
||||
// fulltext bloom on tag_1
|
||||
// inverted index on tag_0
|
||||
request.column_metadatas[1]
|
||||
.column_schema
|
||||
.set_inverted_index(true);
|
||||
|
||||
request.column_metadatas.push(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new(
|
||||
"field_2".to_string(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
),
|
||||
semantic_type: SemanticType::Field,
|
||||
column_id: 4,
|
||||
});
|
||||
// fulltext bloom on field_2
|
||||
let ft_bloom = FulltextOptions::new_unchecked(
|
||||
true,
|
||||
FulltextAnalyzer::English,
|
||||
@@ -1015,11 +1036,24 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
4,
|
||||
0.001,
|
||||
);
|
||||
request.column_metadatas[1]
|
||||
request
|
||||
.column_metadatas
|
||||
.last_mut()
|
||||
.unwrap()
|
||||
.column_schema
|
||||
.set_fulltext_options(&ft_bloom)
|
||||
.unwrap();
|
||||
// fulltext tantivy on tag_2
|
||||
|
||||
request.column_metadatas.push(ColumnMetadata {
|
||||
column_schema: ColumnSchema::new(
|
||||
"field_3".to_string(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
true,
|
||||
),
|
||||
semantic_type: SemanticType::Field,
|
||||
column_id: 5,
|
||||
});
|
||||
// fulltext tantivy on field_3
|
||||
let ft_tantivy = FulltextOptions::new_unchecked(
|
||||
true,
|
||||
FulltextAnalyzer::Chinese,
|
||||
@@ -1028,28 +1062,20 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
2,
|
||||
0.01,
|
||||
);
|
||||
request.column_metadatas[2]
|
||||
request
|
||||
.column_metadatas
|
||||
.last_mut()
|
||||
.unwrap()
|
||||
.column_schema
|
||||
.set_fulltext_options(&ft_tantivy)
|
||||
.unwrap();
|
||||
// bloom filter skipping index on field_1 (which is at index 3)
|
||||
let skipping = SkippingIndexOptions::new_unchecked(2, 0.01, SkippingIndexType::BloomFilter);
|
||||
request.column_metadatas[3]
|
||||
.column_schema
|
||||
.set_skipping_options(&skipping)
|
||||
.unwrap();
|
||||
|
||||
// inverted index on field_1
|
||||
request.column_metadatas[4]
|
||||
.column_schema
|
||||
.set_inverted_index(true);
|
||||
|
||||
engine
|
||||
.handle_request(region_id, RegionRequest::Create(request.clone()))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// write some rows (schema: tag_0, tag_1, tag_2, field_0, field_1, ts)
|
||||
// write some rows (schema: tag_0, field_0, field_1, field_2, field_3, ts)
|
||||
let column_schemas = rows_schema(&request);
|
||||
let rows_vec: Vec<api::v1::Row> = (0..20)
|
||||
.map(|ts| api::v1::Row {
|
||||
@@ -1057,12 +1083,6 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
api::v1::Value {
|
||||
value_data: Some(api::v1::value::ValueData::StringValue("x".to_string())),
|
||||
},
|
||||
api::v1::Value {
|
||||
value_data: Some(api::v1::value::ValueData::StringValue("y".to_string())),
|
||||
},
|
||||
api::v1::Value {
|
||||
value_data: Some(api::v1::value::ValueData::StringValue("z".to_string())),
|
||||
},
|
||||
api::v1::Value {
|
||||
value_data: Some(api::v1::value::ValueData::F64Value(ts as f64)),
|
||||
},
|
||||
@@ -1074,6 +1094,12 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
ts as i64 * 1000,
|
||||
)),
|
||||
},
|
||||
api::v1::Value {
|
||||
value_data: Some(api::v1::value::ValueData::StringValue("y".to_string())),
|
||||
},
|
||||
api::v1::Value {
|
||||
value_data: Some(api::v1::value::ValueData::StringValue("z".to_string())),
|
||||
},
|
||||
],
|
||||
})
|
||||
.collect();
|
||||
@@ -1095,7 +1121,7 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
.unwrap();
|
||||
|
||||
fn bucket_size(size: u64) -> u64 {
|
||||
if size < 512 { size } else { (size / 16) * 16 }
|
||||
if size < 512 { size } else { (size / 100) * 100 }
|
||||
}
|
||||
|
||||
let mut metas = engine.all_index_metas().await;
|
||||
@@ -1125,5 +1151,5 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
|
||||
.map(|entry| format!("\n{:?}", entry))
|
||||
.collect::<String>();
|
||||
|
||||
assert_eq!(debug_format, expect_format);
|
||||
assert_eq!(expect_format, debug_format);
|
||||
}
|
||||
|
||||
@@ -32,11 +32,6 @@ use crate::test_util::{
|
||||
CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, reopen_region, rows_schema,
|
||||
};
|
||||
|
||||
// wait listener receives enough success count.
|
||||
async fn wait_finish(listener: &IndexBuildListener, times: usize) {
|
||||
listener.wait_finish(times).await;
|
||||
}
|
||||
|
||||
fn async_build_mode_config(is_create_on_flush: bool) -> MitoConfig {
|
||||
let mut config = MitoConfig::default();
|
||||
config.index.build_mode = IndexBuildMode::Async;
|
||||
@@ -84,7 +79,7 @@ fn assert_listener_counts(
|
||||
expected_success_count: usize,
|
||||
) {
|
||||
assert_eq!(listener.begin_count(), expected_begin_count);
|
||||
assert_eq!(listener.success_count(), expected_success_count);
|
||||
assert_eq!(listener.finish_count(), expected_success_count);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -155,7 +150,7 @@ async fn test_index_build_type_flush() {
|
||||
flush_region(&engine, region_id, None).await;
|
||||
|
||||
// After 2 index build task are finished, 2 index files should exist.
|
||||
wait_finish(&listener, 2).await;
|
||||
listener.wait_finish(2).await;
|
||||
let scanner = engine
|
||||
.scanner(region_id, ScanRequest::default())
|
||||
.await
|
||||
@@ -204,6 +199,8 @@ async fn test_index_build_type_compact() {
|
||||
put_and_flush(&engine, region_id, &column_schemas, 15..25).await;
|
||||
put_and_flush(&engine, region_id, &column_schemas, 40..50).await;
|
||||
|
||||
// all index build tasks begin means flush tasks are all finished.
|
||||
listener.wait_begin(4).await;
|
||||
// Before compaction is triggered, files should be 4, and not all index files are built.
|
||||
let scanner = engine
|
||||
.scanner(region_id, ScanRequest::default())
|
||||
@@ -216,8 +213,8 @@ async fn test_index_build_type_compact() {
|
||||
// This explicit compaction call serves to make the process deterministic for the test.
|
||||
compact(&engine, region_id).await;
|
||||
|
||||
listener.wait_begin(5).await; // 4 flush + 1 compaction begin
|
||||
// Before compaction is triggered, files should be 2, and not all index files are built.
|
||||
listener.clear_success_count();
|
||||
let scanner = engine
|
||||
.scanner(region_id, ScanRequest::default())
|
||||
.await
|
||||
@@ -226,7 +223,7 @@ async fn test_index_build_type_compact() {
|
||||
assert!(num_of_index_files(&engine, &scanner, region_id).await < 2);
|
||||
|
||||
// Wait a while to make sure index build tasks are finished.
|
||||
wait_finish(&listener, 2).await;
|
||||
listener.wait_stop(5).await; // 4 flush + 1 compaction = some abort + some finish
|
||||
let scanner = engine
|
||||
.scanner(region_id, ScanRequest::default())
|
||||
.await
|
||||
@@ -292,7 +289,7 @@ async fn test_index_build_type_schema_change() {
|
||||
.handle_request(region_id, RegionRequest::Alter(set_index_request))
|
||||
.await
|
||||
.unwrap();
|
||||
wait_finish(&listener, 1).await;
|
||||
listener.wait_finish(1).await;
|
||||
let scanner = engine
|
||||
.scanner(region_id, ScanRequest::default())
|
||||
.await
|
||||
|
||||
@@ -75,10 +75,13 @@ pub trait EventListener: Send + Sync {
|
||||
async fn on_notify_region_change_result_begin(&self, _region_id: RegionId) {}
|
||||
|
||||
/// Notifies the listener that the index build task is executed successfully.
|
||||
async fn on_index_build_success(&self, _region_file_id: RegionFileId) {}
|
||||
async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {}
|
||||
|
||||
/// Notifies the listener that the index build task is started.
|
||||
async fn on_index_build_begin(&self, _region_file_id: RegionFileId) {}
|
||||
|
||||
/// Notifies the listener that the index build task is aborted.
|
||||
async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {}
|
||||
}
|
||||
|
||||
pub type EventListenerRef = Arc<dyn EventListener>;
|
||||
@@ -309,45 +312,75 @@ impl EventListener for NotifyRegionChangeResultListener {
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct IndexBuildListener {
|
||||
notify: Notify,
|
||||
success_count: AtomicUsize,
|
||||
start_count: AtomicUsize,
|
||||
begin_count: AtomicUsize,
|
||||
begin_notify: Notify,
|
||||
finish_count: AtomicUsize,
|
||||
finish_notify: Notify,
|
||||
abort_count: AtomicUsize,
|
||||
abort_notify: Notify,
|
||||
// stop means finished or aborted
|
||||
stop_notify: Notify,
|
||||
}
|
||||
|
||||
impl IndexBuildListener {
|
||||
/// Wait until index build is done for `times` times.
|
||||
pub async fn wait_finish(&self, times: usize) {
|
||||
while self.success_count.load(Ordering::Relaxed) < times {
|
||||
self.notify.notified().await;
|
||||
while self.finish_count.load(Ordering::Relaxed) < times {
|
||||
self.finish_notify.notified().await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait until index build is stopped for `times` times.
|
||||
pub async fn wait_stop(&self, times: usize) {
|
||||
while self.finish_count.load(Ordering::Relaxed) + self.abort_count.load(Ordering::Relaxed)
|
||||
< times
|
||||
{
|
||||
self.stop_notify.notified().await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait until index build is begun for `times` times.
|
||||
pub async fn wait_begin(&self, times: usize) {
|
||||
while self.begin_count.load(Ordering::Relaxed) < times {
|
||||
self.begin_notify.notified().await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Clears the success count.
|
||||
pub fn clear_success_count(&self) {
|
||||
self.success_count.store(0, Ordering::Relaxed);
|
||||
pub fn clear_finish_count(&self) {
|
||||
self.finish_count.store(0, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns the success count.
|
||||
pub fn success_count(&self) -> usize {
|
||||
self.success_count.load(Ordering::Relaxed)
|
||||
pub fn finish_count(&self) -> usize {
|
||||
self.finish_count.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Returns the start count.
|
||||
pub fn begin_count(&self) -> usize {
|
||||
self.start_count.load(Ordering::Relaxed)
|
||||
self.begin_count.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl EventListener for IndexBuildListener {
|
||||
async fn on_index_build_success(&self, region_file_id: RegionFileId) {
|
||||
async fn on_index_build_finish(&self, region_file_id: RegionFileId) {
|
||||
info!("Region {} index build successfully", region_file_id);
|
||||
self.success_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.notify.notify_one();
|
||||
self.finish_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.finish_notify.notify_one();
|
||||
self.stop_notify.notify_one();
|
||||
}
|
||||
|
||||
async fn on_index_build_begin(&self, region_file_id: RegionFileId) {
|
||||
info!("Region {} index build begin", region_file_id);
|
||||
self.start_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.begin_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.begin_notify.notify_one();
|
||||
}
|
||||
|
||||
async fn on_index_build_abort(&self, region_file_id: RegionFileId) {
|
||||
info!("Region {} index build aborted", region_file_id);
|
||||
self.abort_count.fetch_add(1, Ordering::Relaxed);
|
||||
self.abort_notify.notify_one();
|
||||
self.stop_notify.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -641,6 +641,7 @@ impl RegionFlushTask {
|
||||
num_row_groups: sst_info.num_row_groups,
|
||||
sequence: NonZeroU64::new(max_sequence),
|
||||
partition_expr,
|
||||
num_series: sst_info.num_series,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -269,6 +269,7 @@ async fn checkpoint_with_different_compression_types() {
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
partition_expr: None,
|
||||
num_series: 0,
|
||||
};
|
||||
let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit {
|
||||
files_to_add: vec![file_meta],
|
||||
@@ -334,6 +335,7 @@ fn generate_action_lists(num: usize) -> (Vec<FileId>, Vec<RegionMetaActionList>)
|
||||
num_row_groups: 0,
|
||||
sequence: None,
|
||||
partition_expr: None,
|
||||
num_series: 0,
|
||||
};
|
||||
let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit {
|
||||
files_to_add: vec![file_meta],
|
||||
|
||||
@@ -69,7 +69,7 @@ use crate::sst::parquet::flat_format::primary_key_column_index;
|
||||
use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder, ReadFormat};
|
||||
use crate::sst::parquet::helper::parse_parquet_metadata;
|
||||
use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo};
|
||||
use crate::sst::to_sst_arrow_schema;
|
||||
use crate::sst::{SeriesEstimator, to_sst_arrow_schema};
|
||||
|
||||
const INIT_DICT_VALUE_CAPACITY: usize = 8;
|
||||
|
||||
@@ -563,6 +563,7 @@ impl EncodedBulkPart {
|
||||
num_row_groups: self.metadata.parquet_metadata.num_row_groups() as u64,
file_metadata: Some(self.metadata.parquet_metadata.clone()),
index_metadata: IndexOutput::default(),
num_series: self.metadata.num_series,
}
}

@@ -602,6 +603,8 @@ pub struct BulkPartMeta {
pub parquet_metadata: Arc<ParquetMetaData>,
/// Part region schema.
pub region_metadata: RegionMetadataRef,
/// Number of series.
pub num_series: u64,
}

/// Metrics for encoding a part.
@@ -669,6 +672,7 @@ impl BulkPartEncoder {
let mut writer = ArrowWriter::try_new(&mut buf, arrow_schema, self.writer_props.clone())
.context(EncodeMemtableSnafu)?;
let mut total_rows = 0;
let mut series_estimator = SeriesEstimator::default();

// Process each batch from the iterator
let mut iter_start = Instant::now();
@@ -679,6 +683,7 @@ impl BulkPartEncoder {
continue;
}

series_estimator.update_flat(&batch);
metrics.raw_size += record_batch_estimated_size(&batch);
let write_start = Instant::now();
writer.write(&batch).context(EncodeMemtableSnafu)?;
@@ -701,6 +706,7 @@ impl BulkPartEncoder {

let buf = Bytes::from(buf);
let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?);
let num_series = series_estimator.finish();

Ok(Some(EncodedBulkPart {
data: buf,
@@ -710,6 +716,7 @@ impl BulkPartEncoder {
min_timestamp,
parquet_metadata,
region_metadata: self.metadata.clone(),
num_series,
},
}))
}
@@ -742,6 +749,7 @@ impl BulkPartEncoder {
min_timestamp: part.min_timestamp,
parquet_metadata,
region_metadata: self.metadata.clone(),
num_series: part.estimated_series_count() as u64,
},
}))
}

@@ -13,12 +13,10 @@
// limitations under the License.

use std::collections::VecDeque;
use std::ops::BitAnd;
use std::sync::Arc;

use bytes::Bytes;
use datatypes::arrow::array::BooleanArray;
use datatypes::arrow::buffer::BooleanBuffer;
use datatypes::arrow::record_batch::RecordBatch;
use parquet::arrow::ProjectionMask;
use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
@@ -30,7 +28,7 @@ use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu};
use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef};
use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder;
use crate::sst::parquet::flat_format::sequence_column_index;
use crate::sst::parquet::reader::{MaybeFilter, RowGroupReaderContext};
use crate::sst::parquet::reader::RowGroupReaderContext;

/// Iterator for reading data inside a bulk part.
pub struct EncodedBulkPartIter {
@@ -191,38 +189,13 @@ fn apply_combined_filters(
let num_rows = record_batch.num_rows();
let mut combined_filter = None;

// First, apply predicate filters.
// First, apply predicate filters using the shared method.
if !context.base.filters.is_empty() {
let num_rows = record_batch.num_rows();
let mut mask = BooleanBuffer::new_set(num_rows);

// Run filter one by one and combine them result, similar to RangeBase::precise_filter
for filter_ctx in &context.base.filters {
let filter = match filter_ctx.filter() {
MaybeFilter::Filter(f) => f,
// Column matches.
MaybeFilter::Matched => continue,
// Column doesn't match, filter the entire batch.
MaybeFilter::Pruned => return Ok(None),
};

// Safety: We checked the format type in new().
let Some(column_index) = context
.read_format()
.as_flat()
.unwrap()
.projected_index_by_id(filter_ctx.column_id())
else {
continue;
};
let array = record_batch.column(column_index);
let result = filter
.evaluate_array(array)
.context(crate::error::RecordBatchSnafu)?;

mask = mask.bitand(&result);
}
// Convert the mask to BooleanArray
let predicate_mask = context.base.compute_filter_mask_flat(&record_batch)?;
// If predicate filters out the entire batch, return None early
let Some(mask) = predicate_mask else {
return Ok(None);
};
combined_filter = Some(BooleanArray::from(mask));
}


@@ -386,7 +386,8 @@ impl FlatCompatBatch {
/// Repeats the vector value `to_len` times.
fn repeat_vector(vector: &VectorRef, to_len: usize, is_tag: bool) -> Result<ArrayRef> {
assert_eq!(1, vector.len());
if is_tag {
let data_type = vector.data_type();
if is_tag && data_type.is_string() {
let values = vector.to_arrow_array();
if values.is_null(0) {
// Creates a dictionary array with `to_len` null keys.

@@ -48,6 +48,8 @@ pub struct FlatProjectionMapper {
/// Ids of columns to project. It keeps ids in the same order as the `projection`
/// indices to build the mapper.
/// The mapper won't deduplicate the column ids.
///
/// Note that this doesn't contain the `__table_id` and `__tsid`.
column_ids: Vec<ColumnId>,
/// Ids and DataTypes of columns of the expected batch.
/// We can use this to check if the batch is compatible with the expected schema.

@@ -608,6 +608,7 @@ impl MitoRegion {
index_file_size,
num_rows: meta.num_rows,
num_row_groups: meta.num_row_groups,
num_series: Some(meta.num_series),
min_ts: meta.time_range.0,
max_ts: meta.time_range.1,
sequence: meta.sequence.map(|s| s.get()),

@@ -431,6 +431,7 @@ mod tests {
num_row_groups: 1,
sequence: NonZeroU64::new(1),
partition_expr,
num_series: 1,
}
}


@@ -21,7 +21,9 @@ use common_base::readable_size::ReadableSize;
use datatypes::arrow::datatypes::{
DataType as ArrowDataType, Field, FieldRef, Fields, Schema, SchemaRef,
};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::ConcreteDataType;
use datatypes::timestamp::timestamp_array_to_primitive;
use serde::{Deserialize, Serialize};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadata;
@@ -29,6 +31,9 @@ use store_api::storage::consts::{
OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME,
};

use crate::read::Batch;
use crate::sst::parquet::flat_format::time_index_column_index;

pub mod file;
pub mod file_purger;
pub mod file_ref;
@@ -241,3 +246,426 @@ fn plain_internal_fields() -> [FieldRef; 2] {
Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)),
]
}

/// Gets the estimated number of series from record batches.
///
/// This struct tracks the last timestamp value to detect series boundaries
/// by observing when timestamps decrease (indicating a new series).
#[derive(Default)]
pub(crate) struct SeriesEstimator {
/// The last timestamp value seen
last_timestamp: Option<i64>,
/// The estimated number of series
series_count: u64,
}

impl SeriesEstimator {
/// Updates the estimator with a new Batch.
///
/// Since each Batch contains only one series, this increments the series count
/// and updates the last timestamp.
pub(crate) fn update(&mut self, batch: &Batch) {
let Some(last_ts) = batch.last_timestamp() else {
return;
};

// Checks if there's a boundary between the last batch and this batch
if let Some(prev_last_ts) = self.last_timestamp {
// If the first timestamp of this batch is less than the last timestamp
// we've seen, it indicates a new series
if let Some(first_ts) = batch.first_timestamp()
&& first_ts.value() <= prev_last_ts
{
self.series_count += 1;
}
} else {
// First batch, counts as first series
self.series_count = 1;
}

// Updates the last timestamp
self.last_timestamp = Some(last_ts.value());
}

/// Updates the estimator with a new record batch in flat format.
///
/// This method examines the time index column to detect series boundaries.
pub(crate) fn update_flat(&mut self, record_batch: &RecordBatch) {
let batch_rows = record_batch.num_rows();
if batch_rows == 0 {
return;
}

let time_index_pos = time_index_column_index(record_batch.num_columns());
let timestamps = record_batch.column(time_index_pos);
let Some((ts_values, _unit)) = timestamp_array_to_primitive(timestamps) else {
return;
};
let values = ts_values.values();

// Checks if there's a boundary between the last batch and this batch
if let Some(last_ts) = self.last_timestamp {
if values[0] <= last_ts {
self.series_count += 1;
}
} else {
// First batch, counts as first series
self.series_count = 1;
}

// Counts series boundaries within this batch.
for i in 0..batch_rows - 1 {
// We assume the same timestamp indicates a new series, which is different from
// how we split batches.
if values[i] >= values[i + 1] {
self.series_count += 1;
}
}

// Updates the last timestamp
self.last_timestamp = Some(values[batch_rows - 1]);
}

/// Returns the estimated number of series.
pub(crate) fn finish(&mut self) -> u64 {
self.last_timestamp = None;
let count = self.series_count;
self.series_count = 0;

count
}
}
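
// Illustrative aside (not part of the patch): the estimator above counts a new
// series whenever the time index stops strictly increasing. A minimal standalone
// sketch of the same idea over a plain slice of i64 timestamps (the function name
// is hypothetical, for illustration only):
//
//     fn estimate_series(timestamps: &[i64]) -> u64 {
//         if timestamps.is_empty() {
//             return 0;
//         }
//         // Start with one series, then add one at every non-increasing step.
//         1 + timestamps.windows(2).filter(|w| w[0] >= w[1]).count() as u64
//     }
//
// For example, [1, 2, 5, 4, 6, 3, 7] yields 3, matching the flat-format tests below.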

#[cfg(test)]
mod tests {
use std::sync::Arc;

use api::v1::OpType;
use datatypes::arrow::array::{
BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder,
UInt32Array, UInt64Array,
};
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
use datatypes::arrow::record_batch::RecordBatch;

use super::*;
use crate::read::{Batch, BatchBuilder};

fn new_batch(
primary_key: &[u8],
timestamps: &[i64],
sequences: &[u64],
op_types: &[OpType],
) -> Batch {
let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
let sequences = Arc::new(UInt64Array::from(sequences.to_vec()));
let mut op_type_builder = UInt8Builder::with_capacity(op_types.len());
for op_type in op_types {
op_type_builder.append_value(*op_type as u8);
}
let op_types = Arc::new(UInt8Array::from(
op_types.iter().map(|op| *op as u8).collect::<Vec<_>>(),
));

let mut builder = BatchBuilder::new(primary_key.to_vec());
builder
.timestamps_array(timestamps)
.unwrap()
.sequences_array(sequences)
.unwrap()
.op_types_array(op_types)
.unwrap();
builder.build().unwrap()
}

fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch {
// Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type]
let num_cols = 4; // time_index + 3 internal columns
let time_index_pos = time_index_column_index(num_cols);
assert_eq!(time_index_pos, 0); // For 4 columns, time index should be at position 0

let time_array = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
let pk_array = Arc::new(DictionaryArray::new(
UInt32Array::from(vec![0; timestamps.len()]),
Arc::new(BinaryArray::from(vec![b"test".as_slice()])),
));
let seq_array = Arc::new(UInt64Array::from(vec![1; timestamps.len()]));
let op_array = Arc::new(UInt8Array::from(vec![1; timestamps.len()]));

let schema = Arc::new(Schema::new(vec![
Field::new(
"time",
ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
false,
),
Field::new_dictionary(
"__primary_key",
ArrowDataType::UInt32,
ArrowDataType::Binary,
false,
),
Field::new("__sequence", ArrowDataType::UInt64, false),
Field::new("__op_type", ArrowDataType::UInt8, false),
]));

RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap()
}

#[test]
fn test_series_estimator_empty_batch() {
let mut estimator = SeriesEstimator::default();
let batch = new_batch(b"test", &[], &[], &[]);
estimator.update(&batch);
assert_eq!(0, estimator.finish());
}

#[test]
fn test_series_estimator_single_batch() {
let mut estimator = SeriesEstimator::default();
let batch = new_batch(
b"test",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch);
assert_eq!(1, estimator.finish());
}

#[test]
fn test_series_estimator_multiple_batches_same_series() {
let mut estimator = SeriesEstimator::default();

// First batch with timestamps 1, 2, 3
let batch1 = new_batch(
b"test",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);

// Second batch with timestamps 4, 5, 6 (continuation)
let batch2 = new_batch(
b"test",
&[4, 5, 6],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);

assert_eq!(1, estimator.finish());
}

#[test]
fn test_series_estimator_new_series_detected() {
let mut estimator = SeriesEstimator::default();

// First batch with timestamps 1, 2, 3
let batch1 = new_batch(
b"pk0",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);

// Second batch with timestamps 2, 3, 4 (timestamp goes back, new series)
let batch2 = new_batch(
b"pk1",
&[2, 3, 4],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);

assert_eq!(2, estimator.finish());
}

#[test]
fn test_series_estimator_equal_timestamp_boundary() {
let mut estimator = SeriesEstimator::default();

// First batch ending at timestamp 5
let batch1 = new_batch(
b"test",
&[1, 2, 5],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);

// Second batch starting at timestamp 5 (equal, indicates new series)
let batch2 = new_batch(
b"test",
&[5, 6, 7],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);

assert_eq!(2, estimator.finish());
}

#[test]
fn test_series_estimator_finish_resets_state() {
let mut estimator = SeriesEstimator::default();

let batch1 = new_batch(
b"test",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);

assert_eq!(1, estimator.finish());

// After finish, state should be reset
let batch2 = new_batch(
b"test",
&[4, 5, 6],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);

assert_eq!(1, estimator.finish());
}

#[test]
fn test_series_estimator_flat_empty_batch() {
let mut estimator = SeriesEstimator::default();
let record_batch = new_flat_record_batch(&[]);
estimator.update_flat(&record_batch);
assert_eq!(0, estimator.finish());
}

#[test]
fn test_series_estimator_flat_single_batch() {
let mut estimator = SeriesEstimator::default();
let record_batch = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&record_batch);
assert_eq!(1, estimator.finish());
}

#[test]
fn test_series_estimator_flat_series_boundary_within_batch() {
let mut estimator = SeriesEstimator::default();
// Timestamps decrease from 3 to 2, indicating a series boundary
let record_batch = new_flat_record_batch(&[1, 2, 3, 2, 4, 5]);
estimator.update_flat(&record_batch);
// Should detect boundary at position 3 (3 >= 2)
assert_eq!(2, estimator.finish());
}

#[test]
fn test_series_estimator_flat_multiple_boundaries_within_batch() {
let mut estimator = SeriesEstimator::default();
// Multiple series boundaries: 5>=4, 6>=3
let record_batch = new_flat_record_batch(&[1, 2, 5, 4, 6, 3, 7]);
estimator.update_flat(&record_batch);
assert_eq!(3, estimator.finish());
}

#[test]
fn test_series_estimator_flat_equal_timestamps() {
let mut estimator = SeriesEstimator::default();
// Equal timestamps are considered as new series
let record_batch = new_flat_record_batch(&[1, 2, 2, 3, 3, 3, 4]);
estimator.update_flat(&record_batch);
// Boundaries at: 2>=2, 3>=3, 3>=3
assert_eq!(4, estimator.finish());
}

#[test]
fn test_series_estimator_flat_multiple_batches_continuation() {
let mut estimator = SeriesEstimator::default();

// First batch: timestamps 1, 2, 3
let batch1 = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&batch1);

// Second batch: timestamps 4, 5, 6 (continuation)
let batch2 = new_flat_record_batch(&[4, 5, 6]);
estimator.update_flat(&batch2);

assert_eq!(1, estimator.finish());
}

#[test]
fn test_series_estimator_flat_multiple_batches_new_series() {
let mut estimator = SeriesEstimator::default();

// First batch: timestamps 1, 2, 3
let batch1 = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&batch1);

// Second batch: timestamps 2, 3, 4 (goes back to 2, new series)
let batch2 = new_flat_record_batch(&[2, 3, 4]);
estimator.update_flat(&batch2);

assert_eq!(2, estimator.finish());
}

#[test]
fn test_series_estimator_flat_boundary_at_batch_edge_equal() {
let mut estimator = SeriesEstimator::default();

// First batch ending at 5
let batch1 = new_flat_record_batch(&[1, 2, 5]);
estimator.update_flat(&batch1);

// Second batch starting at 5 (equal timestamp, new series)
let batch2 = new_flat_record_batch(&[5, 6, 7]);
estimator.update_flat(&batch2);

assert_eq!(2, estimator.finish());
}

#[test]
fn test_series_estimator_flat_mixed_batches() {
let mut estimator = SeriesEstimator::default();

// Batch 1: single series [10, 20, 30]
let batch1 = new_flat_record_batch(&[10, 20, 30]);
estimator.update_flat(&batch1);

// Batch 2: starts new series [5, 15], boundary within batch [15, 10, 25]
let batch2 = new_flat_record_batch(&[5, 15, 10, 25]);
estimator.update_flat(&batch2);

// Batch 3: continues from 25 to [30, 35]
let batch3 = new_flat_record_batch(&[30, 35]);
estimator.update_flat(&batch3);

// Expected: 1 (batch1) + 1 (batch2 start) + 1 (within batch2) = 3
assert_eq!(3, estimator.finish());
}

#[test]
fn test_series_estimator_flat_descending_timestamps() {
let mut estimator = SeriesEstimator::default();
// Strictly descending timestamps - each pair creates a boundary
let record_batch = new_flat_record_batch(&[10, 9, 8, 7, 6]);
estimator.update_flat(&record_batch);
// Boundaries: 10>=9, 9>=8, 8>=7, 7>=6 = 4 boundaries + 1 initial = 5 series
assert_eq!(5, estimator.finish());
}

#[test]
fn test_series_estimator_flat_finish_resets_state() {
let mut estimator = SeriesEstimator::default();

let batch1 = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&batch1);

assert_eq!(1, estimator.finish());

// After finish, state should be reset
let batch2 = new_flat_record_batch(&[4, 5, 6]);
estimator.update_flat(&batch2);

assert_eq!(1, estimator.finish());
}
}

@@ -175,6 +175,10 @@ pub struct FileMeta {
deserialize_with = "deserialize_partition_expr"
)]
pub partition_expr: Option<PartitionExpr>,
/// Number of series in the file.
///
/// The number is 0 if the series number is not available.
pub num_series: u64,
}

impl Debug for FileMeta {
@@ -210,6 +214,7 @@ impl Debug for FileMeta {
}
})
.field("partition_expr", &self.partition_expr)
.field("num_series", &self.num_series)
.finish()
}
}
@@ -458,6 +463,7 @@ mod tests {
num_row_groups: 0,
sequence: None,
partition_expr: None,
num_series: 0,
}
}

@@ -503,6 +509,7 @@ mod tests {
num_row_groups: 0,
sequence: None,
partition_expr: Some(partition_expr.clone()),
num_series: 0,
};

// Test serialization/deserialization

@@ -236,6 +236,7 @@ mod tests {
num_row_groups: 0,
sequence: None,
partition_expr: None,
num_series: 0,
},
file_purger,
);
@@ -302,6 +303,7 @@ mod tests {
num_row_groups: 1,
sequence: NonZeroU64::new(4096),
partition_expr: None,
num_series: 0,
},
file_purger,
);

@@ -259,6 +259,7 @@ mod tests {
num_row_groups: 1,
sequence: NonZeroU64::new(4096),
partition_expr: None,
num_series: 0,
};

file_ref_mgr.add_file(&file_meta);

@@ -26,10 +26,13 @@ use std::sync::Arc;

use bloom_filter::creator::BloomFilterIndexer;
use common_telemetry::{debug, info, warn};
use datatypes::arrow::array::BinaryArray;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::index::IndexValuesCodec;
use mito_codec::row_converter::CompositeValues;
use puffin_manager::SstPuffinManager;
use smallvec::{SmallVec, smallvec};
use snafu::ResultExt;
use snafu::{OptionExt, ResultExt};
use statistics::{ByteCount, RowCount};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId, RegionId};
@@ -40,7 +43,7 @@ use crate::access_layer::{AccessLayerRef, FilePathProvider, OperationType, Regio
use crate::cache::file_cache::{FileType, IndexKey};
use crate::cache::write_cache::{UploadTracker, WriteCacheRef};
use crate::config::{BloomFilterConfig, FulltextIndexConfig, InvertedIndexConfig};
use crate::error::{BuildIndexAsyncSnafu, Error, Result};
use crate::error::{BuildIndexAsyncSnafu, DecodeSnafu, Error, InvalidRecordBatchSnafu, Result};
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::INDEX_CREATE_MEMORY_USAGE;
use crate::read::{Batch, BatchReader};
@@ -57,6 +60,9 @@ use crate::sst::index::fulltext_index::creator::FulltextIndexer;
use crate::sst::index::intermediate::IntermediateManager;
use crate::sst::index::inverted_index::creator::InvertedIndexer;
use crate::sst::parquet::SstInfo;
use crate::sst::parquet::flat_format::primary_key_column_index;
use crate::sst::parquet::format::PrimaryKeyArray;
use crate::worker::WorkerListener;

pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index";
pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index";
@@ -446,6 +452,7 @@ pub struct IndexBuildTask {
pub file_meta: FileMeta,
pub reason: IndexBuildType,
pub access_layer: AccessLayerRef,
pub(crate) listener: WorkerListener,
pub(crate) manifest_ctx: ManifestContextRef,
pub write_cache: Option<WriteCacheRef>,
pub file_purger: FilePurgerRef,
@@ -481,6 +488,12 @@ impl IndexBuildTask {
}

async fn do_index_build(&mut self, version_control: VersionControlRef) {
self.listener
.on_index_build_begin(RegionFileId::new(
self.file_meta.region_id,
self.file_meta.file_id,
))
.await;
match self.index_build(version_control).await {
Ok(outcome) => self.on_success(outcome).await,
Err(e) => {
@@ -535,6 +548,12 @@ impl IndexBuildTask {
if !self.check_sst_file_exists(&version_control).await {
// Calls abort to clean up index files.
indexer.abort().await;
self.listener
.on_index_build_abort(RegionFileId::new(
self.file_meta.region_id,
self.file_meta.file_id,
))
.await;
return Ok(IndexBuildOutcome::Aborted(format!(
"SST file not found during index build, region: {}, file_id: {}",
self.file_meta.region_id, self.file_meta.file_id
@@ -570,6 +589,12 @@ impl IndexBuildTask {
if !self.check_sst_file_exists(&version_control).await {
// Calls abort to clean up index files.
indexer.abort().await;
self.listener
.on_index_build_abort(RegionFileId::new(
self.file_meta.region_id,
self.file_meta.file_id,
))
.await;
return Ok(IndexBuildOutcome::Aborted(format!(
"SST file not found during index build, region: {}, file_id: {}",
self.file_meta.region_id, self.file_meta.file_id
@@ -698,6 +723,56 @@ impl IndexBuildScheduler {
}
}

/// Decodes primary keys from a flat format RecordBatch.
/// Returns a list of (decoded_pk_value, count) tuples where count is the number of occurrences.
pub(crate) fn decode_primary_keys_with_counts(
batch: &RecordBatch,
codec: &IndexValuesCodec,
) -> Result<Vec<(CompositeValues, usize)>> {
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.context(InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array",
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.context(InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array",
})?;
let keys = pk_dict_array.keys();

// Decodes primary keys and count consecutive occurrences
let mut result: Vec<(CompositeValues, usize)> = Vec::new();
let mut prev_key: Option<u32> = None;

for i in 0..keys.len() {
let current_key = keys.value(i);

// Checks if current key is the same as previous key
if let Some(prev) = prev_key
&& prev == current_key
{
// Safety: We already have a key in the result vector.
result.last_mut().unwrap().1 += 1;
continue;
}

// New key, decodes it.
let pk_bytes = pk_values_array.value(current_key as usize);
let decoded_value = codec.decoder().decode(pk_bytes).context(DecodeSnafu)?;

result.push((decoded_value, 1));
prev_key = Some(current_key);
}

Ok(result)
}
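
// Illustrative aside (not part of the patch): the run-length grouping above only
// merges *consecutive* equal dictionary keys, so each primary key is decoded once
// per run rather than once per row. A minimal sketch of the same counting step over
// a plain slice of keys (the decode call is replaced by the key itself here; the
// function name is hypothetical):
//
//     fn run_length_counts(keys: &[u32]) -> Vec<(u32, usize)> {
//         let mut result: Vec<(u32, usize)> = Vec::new();
//         for &key in keys {
//             match result.last_mut() {
//                 // Same key as the previous row: extend the current run.
//                 Some((prev, count)) if *prev == key => *count += 1,
//                 // New run: start a fresh (key, count) entry.
//                 _ => result.push((key, 1)),
//             }
//         }
//         result
//     }
//
// For example, [0, 0, 1, 1, 1, 0] becomes [(0, 2), (1, 3), (0, 1)]: the trailing run
// of key 0 is a separate entry, mirroring how the function above re-decodes a key
// when it reappears after a different one.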

#[cfg(test)]
mod tests {
use std::sync::Arc;
@@ -1137,6 +1212,7 @@ mod tests {
},
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1187,6 +1263,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1254,6 +1331,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1350,6 +1428,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1430,6 +1509,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: Some(write_cache.clone()),
file_purger,

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;

use api::v1::SemanticType;
use common_telemetry::{debug, warn};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::schema::SkippingIndexType;
@@ -23,9 +24,10 @@ use datatypes::vectors::Helper;
use index::bloom_filter::creator::BloomFilterCreator;
use index::target::IndexTarget;
use mito_codec::index::{IndexValueCodec, IndexValuesCodec};
use mito_codec::row_converter::SortField;
use mito_codec::row_converter::{CompositeValues, SortField};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId};
use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt};
@@ -35,13 +37,13 @@ use crate::error::{
OperateAbortedIndexSnafu, PuffinAddBlobSnafu, PushBloomFilterValueSnafu, Result,
};
use crate::read::Batch;
use crate::sst::index::TYPE_BLOOM_FILTER_INDEX;
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::{TYPE_BLOOM_FILTER_INDEX, decode_primary_keys_with_counts};

/// The buffer size for the pipe used to send index data to the puffin blob.
const PIPE_BUFFER_SIZE_FOR_SENDING_BLOB: usize = 8192;
@@ -289,47 +291,81 @@ impl BloomFilterIndexer {
let n = batch.num_rows();
guard.inc_row_count(n);

let is_sparse = self.metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse;
let mut decoded_pks: Option<Vec<(CompositeValues, usize)>> = None;

for (col_id, creator) in &mut self.creators {
// Get the column name from metadata
if let Some(column_meta) = self.metadata.column_by_id(*col_id) {
let column_name = &column_meta.column_schema.name;
// Safety: `creators` are created from the metadata so it won't be None.
let column_meta = self.metadata.column_by_id(*col_id).unwrap();
let column_name = &column_meta.column_schema.name;
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());

// Find the column in the RecordBatch by name
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
for i in 0..n {
let value = vector.get_ref(i);
let elems = (!value.is_null())
.then(|| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut buf)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;

for i in 0..n {
let value = vector.get_ref(i);
let elems = (!value.is_null())
.then(|| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut buf)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
creator
.push_row_elems(elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else if is_sparse && column_meta.semantic_type == SemanticType::Tag {
// Column not found in batch, tries to decode from primary keys for sparse encoding.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys_with_counts(batch, &self.codec)?);
}

creator
.push_row_elems(elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else {
let pk_values_with_counts = decoded_pks.as_ref().unwrap();
let Some(col_info) = self.codec.pk_col_info(*col_id) else {
debug!(
"Column {} not found in the batch during building bloom filter index",
"Column {} not found in primary key during building bloom filter index",
column_name
);
// Push empty elements to maintain alignment
for _ in 0..n {
creator
.push_row_elems(None)
.await
.context(PushBloomFilterValueSnafu)?;
}
continue;
};
let pk_index = col_info.idx;
let field = &col_info.field;
for (decoded, count) in pk_values_with_counts {
let value = match decoded {
CompositeValues::Dense(dense) => dense.get(pk_index).map(|v| &v.1),
CompositeValues::Sparse(sparse) => sparse.get(col_id),
};

let elems = value
.filter(|v| !v.is_null())
.map(|v| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(
v.as_value_ref(),
field,
&mut buf,
)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;

creator
.push_n_row_elems(*count, elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else {
debug!(
"Column {} not found in the batch during building bloom filter index",
column_name
);
}
}


@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;

use api::v1::SemanticType;
use common_telemetry::warn;
use datatypes::arrow::array::{Array, LargeStringArray, StringArray};
use datatypes::arrow::datatypes::DataType;
@@ -69,6 +70,17 @@ impl FulltextIndexer {
let mut creators = HashMap::new();

for column in &metadata.column_metadatas {
// Tag columns don't support fulltext index now.
// If we need to support fulltext index for tag columns, we also need to parse
// the codec and handle sparse encoding for flat format specially.
if column.semantic_type == SemanticType::Tag {
common_telemetry::debug!(
"Skip creating fulltext index for tag column {}",
column.column_schema.name
);
continue;
}

let options = column
.column_schema
.fulltext_options()

@@ -17,6 +17,7 @@ use std::num::NonZeroUsize;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;

use api::v1::SemanticType;
use common_telemetry::{debug, warn};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::vectors::Helper;
@@ -26,9 +27,10 @@ use index::inverted_index::create::sort_create::SortIndexCreator;
use index::inverted_index::format::writer::InvertedIndexBlobWriter;
use index::target::IndexTarget;
use mito_codec::index::{IndexValueCodec, IndexValuesCodec};
use mito_codec::row_converter::SortField;
use mito_codec::row_converter::{CompositeValues, SortField};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId};
use tokio::io::duplex;
@@ -39,13 +41,13 @@ use crate::error::{
PushIndexValueSnafu, Result,
};
use crate::read::Batch;
use crate::sst::index::TYPE_INVERTED_INDEX;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::inverted_index::INDEX_BLOB_TYPE;
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::{TYPE_INVERTED_INDEX, decode_primary_keys_with_counts};

/// The minimum memory usage threshold for one column.
const MIN_MEMORY_USAGE_THRESHOLD_PER_COLUMN: usize = 1024 * 1024; // 1MB
@@ -78,9 +80,6 @@ pub struct InvertedIndexer {

/// Region metadata for column lookups.
metadata: RegionMetadataRef,
/// Cache for mapping indexed column positions to their indices in the RecordBatch.
/// Aligns with indexed_column_ids. Initialized lazily when first batch is processed.
column_index_cache: Option<Vec<Option<usize>>>,
}

impl InvertedIndexer {
@@ -130,7 +129,6 @@ impl InvertedIndexer {
memory_usage,
indexed_column_ids,
metadata: metadata.clone(),
column_index_cache: None,
}
}

@@ -170,29 +168,29 @@ impl InvertedIndexer {
}

async fn do_update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
// Initialize column index cache if not already done
if self.column_index_cache.is_none() {
self.initialize_column_index_cache(batch);
}

let mut guard = self.stats.record_update();

let n = batch.num_rows();
guard.inc_row_count(n);
guard.inc_row_count(batch.num_rows());

let column_indices = self.column_index_cache.as_ref().unwrap();
let is_sparse = self.metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse;
let mut decoded_pks: Option<Vec<(CompositeValues, usize)>> = None;

for ((col_id, target_key), &column_index) in
self.indexed_column_ids.iter().zip(column_indices.iter())
{
if let Some(index) = column_index {
let column_array = batch.column(index);
for (col_id, target_key) in &self.indexed_column_ids {
let Some(column_meta) = self.metadata.column_by_id(*col_id) else {
debug!(
"Column {} not found in the metadata during building inverted index",
col_id
);
continue;
};
let column_name = &column_meta.column_schema.name;
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef using Helper
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());

for row in 0..n {
for row in 0..batch.num_rows() {
self.value_buf.clear();
let value_ref = vector.get_ref(row);

@@ -214,6 +212,47 @@ impl InvertedIndexer {
.context(PushIndexValueSnafu)?;
}
}
} else if is_sparse && column_meta.semantic_type == SemanticType::Tag {
// Column not found in batch, tries to decode from primary keys for sparse encoding.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys_with_counts(batch, &self.codec)?);
}

let pk_values_with_counts = decoded_pks.as_ref().unwrap();
let Some(col_info) = self.codec.pk_col_info(*col_id) else {
debug!(
"Column {} not found in primary key during building bloom filter index",
column_name
);
continue;
};
let pk_index = col_info.idx;
let field = &col_info.field;
for (decoded, count) in pk_values_with_counts {
let value = match decoded {
CompositeValues::Dense(dense) => dense.get(pk_index).map(|v| &v.1),
CompositeValues::Sparse(sparse) => sparse.get(col_id),
};

let elem = value
.filter(|v| !v.is_null())
.map(|v| {
self.value_buf.clear();
IndexValueCodec::encode_nonnull_value(
v.as_value_ref(),
field,
&mut self.value_buf,
)
.context(EncodeSnafu)?;
Ok(self.value_buf.as_slice())
})
.transpose()?;

self.index_creator
.push_with_name_n(target_key, elem, *count)
.await
.context(PushIndexValueSnafu)?;
}
} else {
debug!(
"Column {} not found in the batch during building inverted index",
@@ -225,26 +264,6 @@ impl InvertedIndexer {
Ok(())
}

/// Initializes the column index cache by mapping indexed column ids to their positions in the RecordBatch.
fn initialize_column_index_cache(&mut self, batch: &RecordBatch) {
let mut column_indices = Vec::with_capacity(self.indexed_column_ids.len());

for (col_id, _) in &self.indexed_column_ids {
let column_index = if let Some(column_meta) = self.metadata.column_by_id(*col_id) {
let column_name = &column_meta.column_schema.name;
batch
.schema()
.column_with_name(column_name)
.map(|(index, _)| index)
} else {
None
};
column_indices.push(column_index);
}

self.column_index_cache = Some(column_indices);
}

/// Finishes index creation and cleans up garbage.
/// Returns the number of rows and bytes written.
pub async fn finish(

@@ -84,6 +84,8 @@ pub struct SstInfo {
pub file_metadata: Option<Arc<ParquetMetaData>>,
/// Index Meta Data
pub index_metadata: IndexOutput,
/// Number of series
pub num_series: u64,
}

#[cfg(test)]
@@ -766,6 +768,7 @@ mod tests {
.expect("partition expression should be valid JSON"),
None => None,
},
num_series: 0,
},
Arc::new(NoopFilePurger),
);

@@ -15,18 +15,20 @@
//! Structs and functions for reading ranges from a parquet file. A file range
//! is usually a row group in a parquet file.

use std::collections::HashMap;
use std::ops::BitAnd;
use std::sync::Arc;

use api::v1::{OpType, SemanticType};
use common_telemetry::error;
use datatypes::arrow::array::BooleanArray;
use datatypes::arrow::array::{ArrayRef, BooleanArray};
use datatypes::arrow::buffer::BooleanBuffer;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
use parquet::arrow::arrow_reader::RowSelection;
use snafu::{OptionExt, ResultExt};
use store_api::storage::TimeSeriesRowSelector;
use store_api::codec::PrimaryKeyEncoding;
use store_api::storage::{ColumnId, TimeSeriesRowSelector};

use crate::error::{
ComputeArrowSnafu, DataTypeMismatchSnafu, DecodeSnafu, DecodeStatsSnafu, RecordBatchSnafu,
@@ -37,11 +39,11 @@ use crate::read::compat::CompatBatch;
use crate::read::last_row::RowGroupLastRowCachedReader;
use crate::read::prune::{FlatPruneReader, PruneReader};
use crate::sst::file::FileHandle;
use crate::sst::parquet::flat_format::{DecodedPrimaryKeys, decode_primary_keys};
use crate::sst::parquet::format::ReadFormat;
use crate::sst::parquet::reader::{
FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext,
};

/// A range of a parquet SST. Now it is a row group.
/// We can read different file ranges in parallel.
#[derive(Clone)]
@@ -357,7 +359,34 @@ impl RangeBase {
}

/// Filters the input RecordBatch by the pushed down predicate and returns RecordBatch.
///
/// It assumes all necessary tags are already decoded from the primary key.
pub(crate) fn precise_filter_flat(&self, input: RecordBatch) -> Result<Option<RecordBatch>> {
let mask = self.compute_filter_mask_flat(&input)?;

// If mask is None, the entire batch is filtered out
let Some(mask) = mask else {
return Ok(None);
};

let filtered_batch =
datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
.context(ComputeArrowSnafu)?;

if filtered_batch.num_rows() > 0 {
Ok(Some(filtered_batch))
} else {
Ok(None)
}
}

/// Computes the filter mask for the input RecordBatch based on pushed down predicates.
///
/// Returns `None` if the entire batch is filtered out, otherwise returns the boolean mask.
pub(crate) fn compute_filter_mask_flat(
&self,
input: &RecordBatch,
) -> Result<Option<BooleanBuffer>> {
let mut mask = BooleanBuffer::new_set(input.num_rows());

let flat_format = self
@@ -367,6 +396,11 @@ impl RangeBase {
reason: "Expected flat format for precise_filter_flat",
})?;

// Decodes primary keys once if we have any tag filters not in projection
let mut decoded_pks: Option<DecodedPrimaryKeys> = None;
// Cache decoded tag arrays by column id to avoid redundant decoding
let mut decoded_tag_cache: HashMap<ColumnId, ArrayRef> = HashMap::new();

// Run filter one by one and combine them result
for filter_ctx in &self.filters {
let filter = match filter_ctx.filter() {
@@ -383,20 +417,53 @@ impl RangeBase {
let column = &input.columns()[idx];
let result = filter.evaluate_array(column).context(RecordBatchSnafu)?;
mask = mask.bitand(&result);
} else {
// Column not found in projection, continue
continue;
} else if filter_ctx.semantic_type() == SemanticType::Tag {
// Column not found in projection, it may be a tag column.
// Decodes primary keys if not already decoded.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys(self.codec.as_ref(), input)?);
}

let metadata = flat_format.metadata();
let column_id = filter_ctx.column_id();

// Check cache first
let tag_column = if let Some(cached_column) = decoded_tag_cache.get(&column_id) {
cached_column.clone()
} else {
// For dense encoding, we need pk_index. For sparse encoding, pk_index is None.
let pk_index = if self.codec.encoding() == PrimaryKeyEncoding::Sparse {
None
} else {
metadata.primary_key_index(column_id)
};
let column_index = metadata.column_index_by_id(column_id);

if let (Some(column_index), Some(decoded)) =
(column_index, decoded_pks.as_ref())
{
let column_metadata = &metadata.column_metadatas[column_index];
let tag_column = decoded.get_tag_column(
column_id,
pk_index,
&column_metadata.column_schema.data_type,
)?;
// Cache the decoded tag column
decoded_tag_cache.insert(column_id, tag_column.clone());
tag_column
} else {
continue;
}
};

let result = filter
.evaluate_array(&tag_column)
.context(RecordBatchSnafu)?;
mask = mask.bitand(&result);
}
// Non-tag column not found in projection.
}

let filtered_batch =
datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
.context(ComputeArrowSnafu)?;

if filtered_batch.num_rows() > 0 {
Ok(Some(filtered_batch))
} else {
Ok(None)
}
Ok(Some(mask))
}
}

@@ -127,7 +127,9 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize {
num_columns - 1
}

// TODO(yingwen): Add an option to skip reading internal columns.
// TODO(yingwen): Add an option to skip reading internal columns if the region is
// append only and doesn't use sparse encoding (We need to check the table id under
// sparse encoding).
/// Helper for reading the flat SST format with projection.
///
/// It only supports flat format that stores primary keys additionally.
@@ -528,6 +530,125 @@ pub(crate) fn sst_column_id_indices(metadata: &RegionMetadata) -> HashMap<Column
id_to_index
}

/// Decodes primary keys from a batch and returns decoded primary key information.
///
/// The batch must contain a primary key column at the expected index.
pub(crate) fn decode_primary_keys(
codec: &dyn PrimaryKeyCodec,
batch: &RecordBatch,
) -> Result<DecodedPrimaryKeys> {
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array".to_string(),
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array".to_string(),
})?;

let keys = pk_dict_array.keys();

// Decodes primary key values by iterating through keys, reusing decoded values for duplicate keys.
// Maps original key index -> new decoded value index
let mut key_to_decoded_index = Vec::with_capacity(keys.len());
let mut decoded_pk_values = Vec::new();
let mut prev_key: Option<u32> = None;

// The parquet reader may read the whole dictionary page into the dictionary values, so
// we may decode many primary keys not in this batch if we decode the values array directly.
for i in 0..keys.len() {
let current_key = keys.value(i);

// Check if current key is the same as previous key
if let Some(prev) = prev_key
&& prev == current_key
{
// Reuse the last decoded index
key_to_decoded_index.push((decoded_pk_values.len() - 1) as u32);
continue;
}

// New key, decodes the value
let pk_bytes = pk_values_array.value(current_key as usize);
let decoded_value = codec.decode(pk_bytes).context(DecodeSnafu)?;

decoded_pk_values.push(decoded_value);
key_to_decoded_index.push((decoded_pk_values.len() - 1) as u32);
prev_key = Some(current_key);
}

// Create the keys array from key_to_decoded_index
let keys_array = UInt32Array::from(key_to_decoded_index);

Ok(DecodedPrimaryKeys {
decoded_pk_values,
keys_array,
})
}

/// Holds decoded primary key values and their indices.
pub(crate) struct DecodedPrimaryKeys {
/// Decoded primary key values for unique keys in the dictionary.
decoded_pk_values: Vec<CompositeValues>,
/// Prebuilt keys array for creating dictionary arrays.
keys_array: UInt32Array,
}

impl DecodedPrimaryKeys {
/// Gets a tag column array by column id and data type.
///
/// For sparse encoding, uses column_id to lookup values.
/// For dense encoding, uses pk_index to get values.
pub(crate) fn get_tag_column(
&self,
column_id: ColumnId,
pk_index: Option<usize>,
column_type: &ConcreteDataType,
) -> Result<ArrayRef> {
// Gets values from the primary key.
let mut builder = column_type.create_mutable_vector(self.decoded_pk_values.len());
for decoded in &self.decoded_pk_values {
match decoded {
CompositeValues::Dense(dense) => {
let pk_idx = pk_index.expect("pk_index required for dense encoding");
if pk_idx < dense.len() {
builder.push_value_ref(&dense[pk_idx].1.as_value_ref());
} else {
builder.push_null();
}
}
CompositeValues::Sparse(sparse) => {
let value = sparse.get_or_null(column_id);
builder.push_value_ref(&value.as_value_ref());
}
};
}

let values_vector = builder.to_vector();
let values_array = values_vector.to_arrow_array();

// Only creates dictionary array for string types, otherwise take values by keys
if column_type.is_string() {
// Creates dictionary array using the same keys for string types
// Note that the dictionary values may have nulls.
let dict_array = DictionaryArray::new(self.keys_array.clone(), values_array);
Ok(Arc::new(dict_array))
} else {
// For non-string types, takes values by keys indices to create a regular array
let taken_array =
take(&values_array, &self.keys_array, None).context(ComputeArrowSnafu)?;
Ok(taken_array)
}
}
}
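
// Illustrative aside (not part of the patch): `get_tag_column` above keeps string
// tags dictionary-encoded and materializes other types with `take`. A minimal sketch
// of that `take` step on toy arrays, assuming the arrow re-export under
// `datatypes::arrow` (paths and values here are for illustration only):
//
//     use datatypes::arrow::array::{Int64Array, UInt32Array};
//     use datatypes::arrow::compute::take;
//
//     fn expand_values() -> datatypes::arrow::error::Result<()> {
//         // One decoded value per unique primary key...
//         let values = Int64Array::from(vec![10, 20]);
//         // ...and one key per row in the batch.
//         let keys = UInt32Array::from(vec![0, 0, 1, 1, 1]);
//         let expanded = take(&values, &keys, None)?;
//         assert_eq!(expanded.len(), 5);
//         Ok(())
//     }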
|
||||
|
||||
/// Converts a batch that doesn't have decoded primary key columns into a batch that has decoded
|
||||
/// primary key columns in flat format.
|
||||
pub(crate) struct FlatConvertFormat {
|
||||
@@ -577,53 +698,22 @@ impl FlatConvertFormat {
|
||||
|
||||
/// Converts a batch to have decoded primary key columns in flat format.
|
||||
///
|
||||
/// The primary key array in the batch is a dictionary array. We decode each value which is a
|
||||
/// primary key and reuse the keys array to build a dictionary array for each tag column.
|
||||
/// The decoded columns are inserted in front of other columns.
|
||||
/// The primary key array in the batch is a dictionary array.
|
||||
pub(crate) fn convert(&self, batch: RecordBatch) -> Result<RecordBatch> {
|
||||
if self.projected_primary_keys.is_empty() {
|
||||
return Ok(batch);
|
||||
}
|
||||
|
||||
let primary_key_index = primary_key_column_index(batch.num_columns());
|
||||
let pk_dict_array = batch
|
||||
.column(primary_key_index)
|
||||
.as_any()
|
||||
.downcast_ref::<PrimaryKeyArray>()
|
||||
.with_context(|| InvalidRecordBatchSnafu {
|
||||
reason: "Primary key column is not a dictionary array".to_string(),
|
||||
})?;
|
||||
|
||||
let pk_values_array = pk_dict_array
|
||||
.values()
|
||||
.as_any()
|
||||
.downcast_ref::<BinaryArray>()
|
||||
.with_context(|| InvalidRecordBatchSnafu {
|
||||
reason: "Primary key values are not binary array".to_string(),
|
||||
})?;
|
||||
|
||||
// Decodes all primary key values
|
||||
let mut decoded_pk_values = Vec::with_capacity(pk_values_array.len());
|
||||
for i in 0..pk_values_array.len() {
|
||||
if pk_values_array.is_null(i) {
|
||||
decoded_pk_values.push(None);
|
||||
} else {
|
||||
let pk_bytes = pk_values_array.value(i);
|
||||
let decoded = self.codec.decode(pk_bytes).context(DecodeSnafu)?;
|
||||
decoded_pk_values.push(Some(decoded));
|
||||
}
|
||||
}
|
||||
let decoded_pks = decode_primary_keys(self.codec.as_ref(), &batch)?;
|
||||
|
||||
// Builds decoded tag column arrays.
|
||||
let mut decoded_columns = Vec::new();
|
||||
for (column_id, pk_index, column_index) in &self.projected_primary_keys {
|
||||
let column_metadata = &self.metadata.column_metadatas[*column_index];
|
||||
let tag_column = self.build_primary_key_column(
|
||||
let tag_column = decoded_pks.get_tag_column(
|
||||
*column_id,
|
||||
*pk_index,
|
||||
Some(*pk_index),
|
||||
&column_metadata.column_schema.data_type,
|
||||
pk_dict_array.keys(),
|
||||
&decoded_pk_values,
|
||||
)?;
|
||||
decoded_columns.push(tag_column);
|
||||
}
|
||||
@@ -648,57 +738,6 @@ impl FlatConvertFormat {
|
||||
let new_schema = Arc::new(Schema::new(new_fields));
|
||||
RecordBatch::try_new(new_schema, new_columns).context(NewRecordBatchSnafu)
|
||||
}
|
||||
|
||||
/// Builds an array for a specific tag column.
|
||||
///
|
||||
/// It may build a dictionary array if the type is string. Note that the dictionary
|
||||
/// array may have null values, although keys are not null.
|
||||
fn build_primary_key_column(
|
||||
&self,
|
||||
column_id: ColumnId,
|
||||
pk_index: usize,
|
||||
column_type: &ConcreteDataType,
|
||||
keys: &UInt32Array,
|
||||
decoded_pk_values: &[Option<CompositeValues>],
|
||||
) -> Result<ArrayRef> {
|
||||
// Gets values from the primary key.
|
||||
let mut builder = column_type.create_mutable_vector(decoded_pk_values.len());
|
||||
for decoded_opt in decoded_pk_values {
|
||||
match decoded_opt {
|
||||
Some(decoded) => {
|
||||
match decoded {
|
||||
CompositeValues::Dense(dense) => {
|
||||
if pk_index < dense.len() {
|
||||
builder.push_value_ref(&dense[pk_index].1.as_value_ref());
|
||||
} else {
|
||||
builder.push_null();
|
||||
}
|
||||
}
|
||||
CompositeValues::Sparse(sparse) => {
|
||||
let value = sparse.get_or_null(column_id);
|
||||
builder.push_value_ref(&value.as_value_ref());
|
||||
}
|
||||
};
|
||||
}
|
||||
None => builder.push_null(),
|
||||
}
|
||||
}
|
||||
|
||||
let values_vector = builder.to_vector();
|
||||
let values_array = values_vector.to_arrow_array();
|
||||
|
||||
// Only creates dictionary array for string types, otherwise take values by keys
|
||||
if column_type.is_string() {
|
||||
// Creates dictionary array using the same keys for string types
|
||||
// Note that the dictionary values may have nulls.
|
||||
let dict_array = DictionaryArray::new(keys.clone(), values_array);
|
||||
Ok(Arc::new(dict_array))
|
||||
} else {
|
||||
// For non-string types, takes values by keys indices to create a regular array
|
||||
let taken_array = take(&values_array, keys, None).context(ComputeArrowSnafu)?;
|
||||
Ok(taken_array)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1397,6 +1397,7 @@ impl FlatRowGroupReader {
|
||||
let record_batch = batch_result.context(ArrowReaderSnafu {
|
||||
path: self.context.file_path(),
|
||||
})?;
|
||||
|
||||
// Safety: Only flat format use FlatRowGroupReader.
|
||||
let flat_format = self.context.read_format().as_flat().unwrap();
|
||||
let record_batch =
|
||||
|
||||
@@ -57,7 +57,9 @@ use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index}
use crate::sst::parquet::format::PrimaryKeyWriteFormat;
use crate::sst::parquet::helper::parse_parquet_metadata;
use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo, WriteOptions};
use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions};
use crate::sst::{
    DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator,
};

/// Parquet SST writer.
pub struct ParquetWriter<F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> {

@@ -176,7 +178,7 @@ where
    ) -> Result<()> {
        // maybe_init_writer will re-create a new file.
        if let Some(mut current_writer) = mem::take(&mut self.writer) {
            let stats = mem::take(stats);
            let mut stats = mem::take(stats);
            // At least one row has been written.
            assert!(stats.num_rows > 0);

@@ -211,6 +213,7 @@ where

            // convert FileMetaData to ParquetMetaData
            let parquet_metadata = parse_parquet_metadata(file_meta)?;
            let num_series = stats.series_estimator.finish();
            ssts.push(SstInfo {
                file_id: self.current_file,
                time_range,

@@ -219,6 +222,7 @@ where
                num_row_groups: parquet_metadata.num_row_groups() as u64,
                file_metadata: Some(Arc::new(parquet_metadata)),
                index_metadata: index_output,
                num_series,
            });
            self.current_file = FileId::random();
            self.bytes_written.store(0, Ordering::Relaxed)
@@ -496,6 +500,8 @@ struct SourceStats {
    num_rows: usize,
    /// Time range of fetched batches.
    time_range: Option<(Timestamp, Timestamp)>,
    /// Series estimator for computing num_series.
    series_estimator: SeriesEstimator,
}

impl SourceStats {

@@ -505,6 +511,7 @@ impl SourceStats {
        }

        self.num_rows += batch.num_rows();
        self.series_estimator.update(batch);
        // Safety: batch is not empty.
        let (min_in_batch, max_in_batch) = (
            batch.first_timestamp().unwrap(),

@@ -524,6 +531,7 @@ impl SourceStats {
        }

        self.num_rows += record_batch.num_rows();
        self.series_estimator.update_flat(record_batch);

        // Get the timestamp column by index
        let time_index_col_idx = time_index_column_index(record_batch.num_columns());
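SourceStats accumulates per-batch statistics for the file being written: the row count, the overall time range, and (new in this change) a SeriesEstimator that feeds num_series. A minimal standalone sketch of that accumulation pattern follows, with simplified types (plain i64 millisecond timestamps, no series estimator); the Stats struct here is invented for illustration and is not the crate's type.

// Illustration only: fold per-batch row counts and min/max timestamps,
// the same shape of accumulation SourceStats performs above.
#[derive(Default, Debug)]
struct Stats {
    num_rows: usize,
    time_range: Option<(i64, i64)>,
}

impl Stats {
    fn update(&mut self, batch_rows: usize, min_ts: i64, max_ts: i64) {
        if batch_rows == 0 {
            return;
        }
        self.num_rows += batch_rows;
        // Widen the time range to cover this batch.
        self.time_range = Some(match self.time_range {
            Some((min, max)) => (min.min(min_ts), max.max(max_ts)),
            None => (min_ts, max_ts),
        });
    }
}

fn main() {
    let mut stats = Stats::default();
    stats.update(3, 1_000, 2_000);
    stats.update(2, 500, 1_500);
    assert_eq!(stats.num_rows, 5);
    assert_eq!(stats.time_range, Some((500, 2_000)));
    println!("{stats:?}");
}
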
@@ -127,6 +127,7 @@ pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64)
        index_file_size: 0,
        num_rows: 0,
        num_row_groups: 0,
        num_series: 0,
        sequence: None,
        partition_expr: None,
    },

@@ -105,6 +105,7 @@ impl VersionControlBuilder {
            index_file_size: 0,
            num_rows: 0,
            num_row_groups: 0,
            num_series: 0,
            sequence: NonZeroU64::new(start_ms as u64),
            partition_expr: match &self.metadata.partition_expr {
                Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str)

@@ -193,6 +194,7 @@ pub(crate) fn apply_edit(
            index_file_size: 0,
            num_rows: 0,
            num_row_groups: 0,
            num_series: 0,
            sequence: NonZeroU64::new(*start_ms as u64),
            partition_expr: match &version_control.current().version.metadata.partition_expr {
                Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str)
@@ -1220,10 +1220,10 @@ impl WorkerListener {
        }
    }

    pub(crate) async fn on_index_build_success(&self, _region_file_id: RegionFileId) {
    pub(crate) async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {
        #[cfg(any(test, feature = "test"))]
        if let Some(listener) = &self.listener {
            listener.on_index_build_success(_region_file_id).await;
            listener.on_index_build_finish(_region_file_id).await;
        }
    }

@@ -1233,6 +1233,13 @@ impl WorkerListener {
            listener.on_index_build_begin(_region_file_id).await;
        }
    }

    pub(crate) async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {
        #[cfg(any(test, feature = "test"))]
        if let Some(listener) = &self.listener {
            listener.on_index_build_abort(_region_file_id).await;
        }
    }
}

#[cfg(test)]
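The hunks above rename the test-only hook on_index_build_success to on_index_build_finish and add an on_index_build_abort hook. As a rough, hypothetical sketch (not the crate's actual listener trait or worker plumbing) of how a test could use such hooks to wait for a background index build, assuming tokio:

// Hypothetical illustration only; a test-side watcher is notified from the
// finish/abort hooks so the test can block until the index build completes.
use std::sync::Arc;

use tokio::sync::Notify;

#[derive(Default)]
struct IndexBuildWatcher {
    done: Notify,
}

impl IndexBuildWatcher {
    // Would be called from on_index_build_finish / on_index_build_abort.
    fn mark_done(&self) {
        self.done.notify_one();
    }

    // Awaited from the test body before asserting on index files.
    async fn wait_done(&self) {
        self.done.notified().await;
    }
}

#[tokio::main]
async fn main() {
    let watcher = Arc::new(IndexBuildWatcher::default());
    let w = watcher.clone();
    tokio::spawn(async move {
        // ... background index build would run here ...
        w.mark_done();
    });
    watcher.wait_done().await;
}
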