Merge remote-tracking branch 'origin/main' into zhongzc/repartition-procedure-scaffold

Zhenchi
2025-10-26 10:56:09 +00:00
237 changed files with 12156 additions and 2632 deletions

View File

@@ -57,14 +57,6 @@
return days;
}
// Get urgency emoji based on PR age
function getAgeEmoji(days) {
if (days >= 14) return "🔴"; // 14+ days - critical
if (days >= 7) return "🟠"; // 7+ days - urgent
if (days >= 3) return "🟡"; // 3+ days - needs attention
return "🟢"; // < 3 days - fresh
}
// Build Slack notification message from PR list
function buildSlackMessage(prs) {
if (prs.length === 0) {

View File

@@ -2,8 +2,8 @@ name: PR Review Reminder
on:
schedule:
# Run at 9:00 AM UTC+8 (01:00 AM UTC) every day
- cron: '0 1 * * *'
# Run at 9:00 AM UTC+8 (01:00 AM UTC) on Monday, Wednesday, Friday
- cron: '0 1 * * 1,3,5'
workflow_dispatch:
jobs:

Cargo.lock (generated): 835 changed lines

File diff suppressed because it is too large.

View File

@@ -99,12 +99,12 @@ rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
# See for more details: https://github.com/rust-lang/cargo/issues/11329
ahash = { version = "0.8", features = ["compile-time-rng"] }
aquamarine = "0.6"
arrow = { version = "56.0", features = ["prettyprint"] }
arrow-array = { version = "56.0", default-features = false, features = ["chrono-tz"] }
arrow-buffer = "56.0"
arrow-flight = "56.0"
arrow-ipc = { version = "56.0", default-features = false, features = ["lz4", "zstd"] }
arrow-schema = { version = "56.0", features = ["serde"] }
arrow = { version = "56.2", features = ["prettyprint"] }
arrow-array = { version = "56.2", default-features = false, features = ["chrono-tz"] }
arrow-buffer = "56.2"
arrow-flight = "56.2"
arrow-ipc = { version = "56.2", default-features = false, features = ["lz4", "zstd"] }
arrow-schema = { version = "56.2", features = ["serde"] }
async-stream = "0.3"
async-trait = "0.1"
# Remember to update axum-extra, axum-macros when updating axum
@@ -123,18 +123,18 @@ clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
crossbeam-utils = "0.8"
dashmap = "6.1"
datafusion = "49"
datafusion-common = "49"
datafusion-expr = "49"
datafusion-functions = "49"
datafusion-functions-aggregate-common = "49"
datafusion-optimizer = "49"
datafusion-orc = { git = "https://github.com/GreptimeTeam/datafusion-orc", rev = "a0a5f902158f153119316eaeec868cff3fc8a99d" }
datafusion-pg-catalog = { git = "https://github.com/datafusion-contrib/datafusion-postgres", rev = "3d1b7c7d5b82dd49bafc2803259365e633f654fa" }
datafusion-physical-expr = "49"
datafusion-physical-plan = "49"
datafusion-sql = "49"
datafusion-substrait = "49"
datafusion = "50"
datafusion-common = "50"
datafusion-expr = "50"
datafusion-functions = "50"
datafusion-functions-aggregate-common = "50"
datafusion-optimizer = "50"
datafusion-orc = "0.5"
datafusion-pg-catalog = "0.11"
datafusion-physical-expr = "50"
datafusion-physical-plan = "50"
datafusion-sql = "50"
datafusion-substrait = "50"
deadpool = "0.12"
deadpool-postgres = "0.14"
derive_builder = "0.20"
@@ -147,7 +147,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "69a6089933daa573c96808ec4bbc48f447ec6e8c" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "14b9dc40bdc8288742b0cefc7bb024303b7429ef" }
hex = "0.4"
http = "1"
humantime = "2.1"
@@ -180,7 +180,7 @@ otel-arrow-rust = { git = "https://github.com/GreptimeTeam/otel-arrow", rev = "2
"server",
] }
parking_lot = "0.12"
parquet = { version = "56.0", default-features = false, features = ["arrow", "async", "object_store"] }
parquet = { version = "56.2", default-features = false, features = ["arrow", "async", "object_store"] }
paste = "1.0"
pin-project = "1.0"
pretty_assertions = "1.4.0"
@@ -191,7 +191,7 @@ prost-types = "0.13"
raft-engine = { version = "0.4.1", default-features = false }
rand = "0.9"
ratelimit = "0.10"
regex = "1.8"
regex = "1.12"
regex-automata = "0.4"
reqwest = { version = "0.12", default-features = false, features = [
"json",
@@ -217,10 +217,7 @@ simd-json = "0.15"
similar-asserts = "1.6.0"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.8"
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "39e4fc94c3c741981f77e9d63b5ce8c02e0a27ea", features = [
"visitor",
"serde",
] } # branch = "v0.55.x"
sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] }
sqlx = { version = "0.8", features = [
"runtime-tokio-rustls",
"mysql",
@@ -322,16 +319,19 @@ git = "https://github.com/GreptimeTeam/greptime-meter.git"
rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"
[patch.crates-io]
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "7d5214512740b4dfb742b6b3d91ed9affcc2c9d0" }
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-optimizer = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-physical-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-physical-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
[profile.release]
debug = 1

View File

@@ -25,12 +25,14 @@
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `http.enable_cors` | Bool | `true` | HTTP CORS support, it's turned on by default<br/>This allows browser to access http APIs without CORS restrictions |
| `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
| `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not validate strings. |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
| `grpc.tls.mode` | String | `disable` | TLS mode. |
@@ -235,6 +237,7 @@
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `http.enable_cors` | Bool | `true` | HTTP CORS support, it's turned on by default<br/>This allows browser to access http APIs without CORS restrictions |
| `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
| `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not validate strings. |
@@ -242,6 +245,7 @@
| `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
| `grpc.server_addr` | String | `127.0.0.1:4001` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `grpc.flight_compression` | String | `arrow_ipc` | Compression mode for frontend side Arrow IPC service. Available options:<br/>- `none`: disable all compression<br/>- `transport`: only enable gRPC transport compression (zstd)<br/>- `arrow_ipc`: only enable Arrow IPC compression (lz4)<br/>- `all`: enable all compression.<br/>Default to `none` |
| `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |

View File

@@ -31,6 +31,10 @@ timeout = "0s"
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
## Set to 0 to disable limit.
body_limit = "64MB"
## Maximum total memory for all concurrent HTTP request bodies.
## Set to 0 to disable the limit. Default: "0" (unlimited)
## @toml2docs:none-default
#+ max_total_body_memory = "1GB"
## HTTP CORS support, it's turned on by default
## This allows browser to access http APIs without CORS restrictions
enable_cors = true
@@ -54,6 +58,10 @@ bind_addr = "127.0.0.1:4001"
server_addr = "127.0.0.1:4001"
## The number of server worker threads.
runtime_size = 8
## Maximum total memory for all concurrent gRPC request messages.
## Set to 0 to disable the limit. Default: "0" (unlimited)
## @toml2docs:none-default
#+ max_total_message_memory = "1GB"
## Compression mode for frontend side Arrow IPC service. Available options:
## - `none`: disable all compression
## - `transport`: only enable gRPC transport compression (zstd)

View File

@@ -36,6 +36,10 @@ timeout = "0s"
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
## Set to 0 to disable limit.
body_limit = "64MB"
## Maximum total memory for all concurrent HTTP request bodies.
## Set to 0 to disable the limit. Default: "0" (unlimited)
## @toml2docs:none-default
#+ max_total_body_memory = "1GB"
## HTTP CORS support, it's turned on by default
## This allows browser to access http APIs without CORS restrictions
enable_cors = true
@@ -56,6 +60,10 @@ prom_validation_mode = "strict"
bind_addr = "127.0.0.1:4001"
## The number of server worker threads.
runtime_size = 8
## Maximum total memory for all concurrent gRPC request messages.
## Set to 0 to disable the limit. Default: "0" (unlimited)
## @toml2docs:none-default
#+ max_total_message_memory = "1GB"
## The maximum connection age for gRPC connection.
## The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.
## Refer to https://grpc.io/docs/guides/keepalive/ for more details.

View File

@@ -16,8 +16,8 @@ use std::collections::HashMap;
use datatypes::schema::{
COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType,
};
use greptime_proto::v1::{
Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -68,6 +68,9 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) {
metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned());
}
if let Some(settings) = options.options.get(JSON_STRUCTURE_SETTINGS_KEY) {
metadata.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
}
}
ColumnSchema::new(&column_def.name, data_type.into(), column_def.is_nullable)
@@ -139,6 +142,11 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<Column
.options
.insert(SKIPPING_INDEX_GRPC_KEY.to_string(), skipping_index.clone());
}
if let Some(settings) = column_schema.metadata().get(JSON_STRUCTURE_SETTINGS_KEY) {
options
.options
.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone());
}
(!options.options.is_empty()).then_some(options)
}

View File

@@ -33,7 +33,6 @@ use datatypes::timestamp::TimestampMillisecond;
use datatypes::value::Value;
use datatypes::vectors::{
Int64VectorBuilder, StringVectorBuilder, TimestampMillisecondVectorBuilder,
UInt32VectorBuilder, UInt64VectorBuilder,
};
use serde::Serialize;
use snafu::ResultExt;
@@ -53,6 +52,8 @@ const PEER_ADDR: &str = "peer_addr";
const PEER_HOSTNAME: &str = "peer_hostname";
const TOTAL_CPU_MILLICORES: &str = "total_cpu_millicores";
const TOTAL_MEMORY_BYTES: &str = "total_memory_bytes";
const CPU_USAGE_MILLICORES: &str = "cpu_usage_millicores";
const MEMORY_USAGE_BYTES: &str = "memory_usage_bytes";
const VERSION: &str = "version";
const GIT_COMMIT: &str = "git_commit";
const START_TIME: &str = "start_time";
@@ -67,15 +68,17 @@ const INIT_CAPACITY: usize = 42;
/// - `peer_id`: the peer server id.
/// - `peer_type`: the peer type, such as `datanode`, `frontend`, `metasrv` etc.
/// - `peer_addr`: the peer gRPC address.
/// - `peer_hostname`: the hostname of the peer.
/// - `total_cpu_millicores`: the total CPU millicores of the peer.
/// - `total_memory_bytes`: the total memory bytes of the peer.
/// - `cpu_usage_millicores`: the CPU usage millicores of the peer.
/// - `memory_usage_bytes`: the memory usage bytes of the peer.
/// - `version`: the build package version of the peer.
/// - `git_commit`: the build git commit hash of the peer.
/// - `start_time`: the starting time of the peer.
/// - `uptime`: the uptime of the peer.
/// - `active_time`: the time since the last activity of the peer.
/// - `node_status`: the status info of the peer.
/// - `peer_hostname`: the hostname of the peer.
///
#[derive(Debug)]
pub(super) struct InformationSchemaClusterInfo {
@@ -99,12 +102,22 @@ impl InformationSchemaClusterInfo {
ColumnSchema::new(PEER_HOSTNAME, ConcreteDataType::string_datatype(), true),
ColumnSchema::new(
TOTAL_CPU_MILLICORES,
ConcreteDataType::uint32_datatype(),
ConcreteDataType::int64_datatype(),
false,
),
ColumnSchema::new(
TOTAL_MEMORY_BYTES,
ConcreteDataType::uint64_datatype(),
ConcreteDataType::int64_datatype(),
false,
),
ColumnSchema::new(
CPU_USAGE_MILLICORES,
ConcreteDataType::int64_datatype(),
false,
),
ColumnSchema::new(
MEMORY_USAGE_BYTES,
ConcreteDataType::int64_datatype(),
false,
),
ColumnSchema::new(VERSION, ConcreteDataType::string_datatype(), false),
@@ -167,8 +180,10 @@ struct InformationSchemaClusterInfoBuilder {
peer_types: StringVectorBuilder,
peer_addrs: StringVectorBuilder,
peer_hostnames: StringVectorBuilder,
cpus: UInt32VectorBuilder,
memory_bytes: UInt64VectorBuilder,
total_cpu_millicores: Int64VectorBuilder,
total_memory_bytes: Int64VectorBuilder,
cpu_usage_millicores: Int64VectorBuilder,
memory_usage_bytes: Int64VectorBuilder,
versions: StringVectorBuilder,
git_commits: StringVectorBuilder,
start_times: TimestampMillisecondVectorBuilder,
@@ -186,8 +201,10 @@ impl InformationSchemaClusterInfoBuilder {
peer_types: StringVectorBuilder::with_capacity(INIT_CAPACITY),
peer_addrs: StringVectorBuilder::with_capacity(INIT_CAPACITY),
peer_hostnames: StringVectorBuilder::with_capacity(INIT_CAPACITY),
cpus: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
memory_bytes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
total_cpu_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
total_memory_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
cpu_usage_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
memory_usage_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
versions: StringVectorBuilder::with_capacity(INIT_CAPACITY),
git_commits: StringVectorBuilder::with_capacity(INIT_CAPACITY),
start_times: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY),
@@ -243,8 +260,14 @@ impl InformationSchemaClusterInfoBuilder {
self.start_times.push(None);
self.uptimes.push(None);
}
self.cpus.push(Some(node_info.cpus));
self.memory_bytes.push(Some(node_info.memory_bytes));
self.total_cpu_millicores
.push(Some(node_info.total_cpu_millicores));
self.total_memory_bytes
.push(Some(node_info.total_memory_bytes));
self.cpu_usage_millicores
.push(Some(node_info.cpu_usage_millicores));
self.memory_usage_bytes
.push(Some(node_info.memory_usage_bytes));
if node_info.last_activity_ts > 0 {
self.active_times.push(Some(
@@ -269,8 +292,10 @@ impl InformationSchemaClusterInfoBuilder {
Arc::new(self.peer_types.finish()),
Arc::new(self.peer_addrs.finish()),
Arc::new(self.peer_hostnames.finish()),
Arc::new(self.cpus.finish()),
Arc::new(self.memory_bytes.finish()),
Arc::new(self.total_cpu_millicores.finish()),
Arc::new(self.total_memory_bytes.finish()),
Arc::new(self.cpu_usage_millicores.finish()),
Arc::new(self.memory_usage_bytes.finish()),
Arc::new(self.versions.finish()),
Arc::new(self.git_commits.finish()),
Arc::new(self.start_times.finish()),

View File

@@ -27,6 +27,7 @@ use datafusion::error::DataFusionError;
use datafusion::execution::TaskContext;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter;
use datafusion_pg_catalog::pg_catalog::catalog_info::CatalogInfo;
use datafusion_pg_catalog::pg_catalog::context::EmptyContextProvider;
use datafusion_pg_catalog::pg_catalog::{
PG_CATALOG_TABLES, PgCatalogSchemaProvider, PgCatalogStaticTables, PgCatalogTable,
};
@@ -44,7 +45,7 @@ use crate::system_schema::{
/// [`PGCatalogProvider`] is the provider for a schema named `pg_catalog`, it is not a catalog.
pub struct PGCatalogProvider {
catalog_name: String,
inner: PgCatalogSchemaProvider<CatalogManagerWrapper>,
inner: PgCatalogSchemaProvider<CatalogManagerWrapper, EmptyContextProvider>,
tables: HashMap<String, TableRef>,
table_ids: HashMap<&'static str, u32>,
}
@@ -69,6 +70,7 @@ impl PGCatalogProvider {
catalog_manager,
},
Arc::new(static_tables),
EmptyContextProvider,
)
.expect("Failed to initialize PgCatalogSchemaProvider");

View File

@@ -30,6 +30,7 @@ use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHand
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::key::TableMetadataManager;
use common_meta::key::flow::FlowMetadataManager;
use common_stat::ResourceStatImpl;
use common_telemetry::info;
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions};
use common_version::{short_version, verbose_version};
@@ -372,11 +373,15 @@ impl StartCommand {
Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
]);
let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();
let heartbeat_task = flow::heartbeat::HeartbeatTask::new(
&opts,
meta_client.clone(),
opts.heartbeat.clone(),
Arc::new(executor),
Arc::new(resource_stat),
);
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));

View File

@@ -30,6 +30,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
use common_meta::heartbeat::handler::HandlerGroupExecutor;
use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler;
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_stat::ResourceStatImpl;
use common_telemetry::info;
use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions};
use common_time::timezone::set_default_timezone;
@@ -421,11 +422,15 @@ impl StartCommand {
Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())),
]);
let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();
let heartbeat_task = HeartbeatTask::new(
&opts,
meta_client.clone(),
opts.heartbeat.clone(),
Arc::new(executor),
Arc::new(resource_stat),
);
let heartbeat_task = Some(heartbeat_task);

View File

@@ -11,7 +11,6 @@ workspace = true
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-stat.workspace = true
config.workspace = true
humantime-serde.workspace = true
object-store.workspace = true

View File

@@ -14,7 +14,6 @@
pub mod config;
pub mod error;
pub mod utils;
use std::time::Duration;

View File

@@ -1,34 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_base::readable_size::ReadableSize;
use common_stat::{get_total_cpu_millicores, get_total_memory_readable};
/// `ResourceSpec` holds the static resource specifications of a node,
/// such as CPU cores and memory capacity. These values are fixed
/// at startup and do not change dynamically during runtime.
#[derive(Debug, Clone, Copy)]
pub struct ResourceSpec {
pub cpus: i64,
pub memory: Option<ReadableSize>,
}
impl Default for ResourceSpec {
fn default() -> Self {
Self {
cpus: get_total_cpu_millicores(),
memory: get_total_memory_readable(),
}
}
}

View File

@@ -36,7 +36,7 @@ object_store_opendal.workspace = true
orc-rust = { version = "0.6.3", default-features = false, features = ["async"] }
parquet.workspace = true
paste.workspace = true
regex = "1.7"
regex.workspace = true
serde.workspace = true
snafu.workspace = true
strum.workspace = true

View File

@@ -33,7 +33,7 @@ use bytes::{Buf, Bytes};
use datafusion::datasource::physical_plan::FileOpenFuture;
use datafusion::error::{DataFusionError, Result as DataFusionResult};
use datafusion::physical_plan::SendableRecordBatchStream;
use futures::StreamExt;
use futures::{StreamExt, TryStreamExt};
use object_store::ObjectStore;
use snafu::ResultExt;
use tokio_util::compat::FuturesAsyncWriteCompatExt;
@@ -179,7 +179,7 @@ pub fn open_with_decoder<T: ArrowDecoder, F: Fn() -> DataFusionResult<T>>(
Poll::Ready(decoder.flush().transpose())
});
Ok(stream.boxed())
Ok(stream.map_err(Into::into).boxed())
}))
}

View File

@@ -51,6 +51,7 @@ nalgebra.workspace = true
num = "0.4"
num-traits = "0.2"
paste.workspace = true
regex.workspace = true
s2 = { version = "0.0.12", optional = true }
serde.workspace = true
serde_json.workspace = true

View File

@@ -22,6 +22,7 @@
//! `foo_merge`'s input arg is the same as `foo_state`'s output, and its output is the same as `foo`'s input.
//!
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use arrow::array::StructArray;
@@ -272,7 +273,7 @@ impl StateMergeHelper {
}
/// Wrapper to make an aggregate function out of a state function.
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct StateWrapper {
inner: AggregateUDF,
name: String,
@@ -616,6 +617,20 @@ impl AggregateUDFImpl for MergeWrapper {
}
}
impl PartialEq for MergeWrapper {
fn eq(&self, other: &Self) -> bool {
self.inner == other.inner
}
}
impl Eq for MergeWrapper {}
impl Hash for MergeWrapper {
fn hash<H: Hasher>(&self, state: &mut H) {
self.inner.hash(state);
}
}
/// The merge accumulator, which modifies `update_batch`'s behavior to accept a single struct array
/// containing the state fields of the original aggregate function and merge those states into the
/// original accumulator; the output is the same as that of the original aggregate function.

View File

@@ -39,8 +39,7 @@ use datafusion::prelude::SessionContext;
use datafusion_common::arrow::array::AsArray;
use datafusion_common::arrow::datatypes::{Float64Type, UInt64Type};
use datafusion_common::{Column, TableReference};
use datafusion_expr::expr::AggregateFunction;
use datafusion_expr::sqlparser::ast::NullTreatment;
use datafusion_expr::expr::{AggregateFunction, NullTreatment};
use datafusion_expr::{
Aggregate, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, TableScan, lit,
};

View File

@@ -68,7 +68,7 @@ impl CountHash {
}
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct CountHash {
signature: Signature,
}

View File

@@ -34,6 +34,7 @@ use crate::scalars::json::JsonFunction;
use crate::scalars::matches::MatchesFunction;
use crate::scalars::matches_term::MatchesTermFunction;
use crate::scalars::math::MathFunction;
use crate::scalars::string::register_string_functions;
use crate::scalars::timestamp::TimestampFunction;
use crate::scalars::uddsketch_calc::UddSketchCalcFunction;
use crate::scalars::vector::VectorFunction as VectorScalarFunction;
@@ -154,6 +155,9 @@ pub static FUNCTION_REGISTRY: LazyLock<Arc<FunctionRegistry>> = LazyLock::new(||
// Json related functions
JsonFunction::register(&function_registry);
// String related functions
register_string_functions(&function_registry);
// Vector related functions
VectorScalarFunction::register(&function_registry);
VectorAggrFunction::register(&function_registry);

View File

@@ -20,6 +20,7 @@ pub mod json;
pub mod matches;
pub mod matches_term;
pub mod math;
pub(crate) mod string;
pub mod vector;
pub(crate) mod hll_count;

View File

@@ -20,7 +20,9 @@ use common_query::error;
use common_time::{Date, Timestamp};
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder};
use datafusion_common::arrow::datatypes::{ArrowTimestampType, DataType, Date32Type, TimeUnit};
use datafusion_common::arrow::datatypes::{
ArrowTimestampType, DataType, Date32Type, Date64Type, TimeUnit,
};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
use snafu::ResultExt;
@@ -40,6 +42,7 @@ impl Default for DateFormatFunction {
signature: helper::one_of_sigs2(
vec![
DataType::Date32,
DataType::Date64,
DataType::Timestamp(TimeUnit::Second, None),
DataType::Timestamp(TimeUnit::Millisecond, None),
DataType::Timestamp(TimeUnit::Microsecond, None),
@@ -115,6 +118,29 @@ impl Function for DateFormatFunction {
builder.append_option(result.as_deref());
}
}
DataType::Date64 => {
let left = left.as_primitive::<Date64Type>();
for i in 0..size {
let date = left.is_valid(i).then(|| {
let ms = left.value(i);
Timestamp::new_millisecond(ms)
});
let format = formats.is_valid(i).then(|| formats.value(i));
let result = match (date, format) {
(Some(ts), Some(fmt)) => {
Some(ts.as_formatted_string(fmt, Some(timezone)).map_err(|e| {
DataFusionError::Execution(format!(
"cannot format {ts:?} as '{fmt}': {e}"
))
})?)
}
_ => None,
};
builder.append_option(result.as_deref());
}
}
x => {
return Err(DataFusionError::Execution(format!(
"unsupported input data type {x}"
@@ -137,7 +163,9 @@ mod tests {
use std::sync::Arc;
use arrow_schema::Field;
use datafusion_common::arrow::array::{Date32Array, StringArray, TimestampSecondArray};
use datafusion_common::arrow::array::{
Date32Array, Date64Array, StringArray, TimestampSecondArray,
};
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{TypeSignature, Volatility};
@@ -166,7 +194,7 @@ mod tests {
Signature {
type_signature: TypeSignature::OneOf(sigs),
volatility: Volatility::Immutable
} if sigs.len() == 5));
} if sigs.len() == 6));
}
#[test]
@@ -213,6 +241,50 @@ mod tests {
}
}
#[test]
fn test_date64_date_format() {
let f = DateFormatFunction::default();
let dates = vec![Some(123000), None, Some(42000), None];
let formats = vec![
"%Y-%m-%d %T.%3f",
"%Y-%m-%d %T.%3f",
"%Y-%m-%d %T.%3f",
"%Y-%m-%d %T.%3f",
];
let results = [
Some("1970-01-01 00:02:03.000"),
None,
Some("1970-01-01 00:00:42.000"),
None,
];
let mut config_options = ConfigOptions::default();
config_options.extensions.insert(FunctionContext::default());
let config_options = Arc::new(config_options);
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(Arc::new(Date64Array::from(dates))),
ColumnarValue::Array(Arc::new(StringArray::from_iter_values(formats))),
],
arg_fields: vec![],
number_rows: 4,
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
config_options,
};
let result = f
.invoke_with_args(args)
.and_then(|x| x.to_array(4))
.unwrap();
let vector = result.as_string_view();
assert_eq!(4, vector.len());
for (actual, expect) in vector.iter().zip(results) {
assert_eq!(actual, expect);
}
}
#[test]
fn test_date_date_format() {
let f = DateFormatFunction::default();

View File

@@ -76,7 +76,7 @@ impl Function for GeohashFunction {
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Utf8)
Ok(DataType::Utf8View)
}
fn signature(&self) -> &Signature {
@@ -176,7 +176,7 @@ impl Function for GeohashNeighboursFunction {
Ok(DataType::List(Arc::new(Field::new(
"item",
DataType::Utf8View,
false,
true,
))))
}

View File

@@ -355,9 +355,9 @@ impl Function for H3CellCenterLatLng {
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::List(Arc::new(Field::new(
"x",
"item",
DataType::Float64,
false,
true,
))))
}

View File

@@ -0,0 +1,26 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! String scalar functions
mod regexp_extract;
pub(crate) use regexp_extract::RegexpExtractFunction;
use crate::function_registry::FunctionRegistry;
/// Register all string functions
pub fn register_string_functions(registry: &FunctionRegistry) {
RegexpExtractFunction::register(registry);
}

View File

@@ -0,0 +1,339 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Implementation of REGEXP_EXTRACT function
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use regex::{Regex, RegexBuilder};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "regexp_extract";
// Safety limits
const MAX_REGEX_SIZE: usize = 1024 * 1024; // compiled regex heap cap
const MAX_DFA_SIZE: usize = 2 * 1024 * 1024; // lazy DFA cap
const MAX_TOTAL_RESULT_SIZE: usize = 64 * 1024 * 1024; // total batch cap
const MAX_SINGLE_MATCH: usize = 1024 * 1024; // per-row cap
const MAX_PATTERN_LEN: usize = 10_000; // pattern text length cap
/// REGEXP_EXTRACT function implementation
/// Extracts the first substring matching the given regular expression pattern.
/// If no match is found, returns NULL.
///
#[derive(Debug)]
pub struct RegexpExtractFunction {
signature: Signature,
}
impl RegexpExtractFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(RegexpExtractFunction::default());
}
}
impl Default for RegexpExtractFunction {
fn default() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8]),
TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8View]),
TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8View]),
TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8View]),
TypeSignature::Exact(vec![DataType::Utf8View, DataType::LargeUtf8]),
TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8]),
TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8]),
TypeSignature::Exact(vec![DataType::Utf8, DataType::LargeUtf8]),
TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]),
],
Volatility::Immutable,
),
}
}
}
impl fmt::Display for RegexpExtractFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for RegexpExtractFunction {
fn name(&self) -> &str {
NAME
}
// Always return LargeUtf8 for simplicity and safety
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::LargeUtf8)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
if args.args.len() != 2 {
return Err(DataFusionError::Execution(
"REGEXP_EXTRACT requires exactly two arguments (text, pattern)".to_string(),
));
}
// Keep original ColumnarValue variants for scalar-pattern fast path
let pattern_is_scalar = matches!(args.args[1], ColumnarValue::Scalar(_));
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
let text_array = &arrays[0];
let pattern_array = &arrays[1];
// Cast both to LargeUtf8 for uniform access (supports Utf8/Utf8View/Dictionary<String>)
let text_large = cast(text_array.as_ref(), &DataType::LargeUtf8).map_err(|e| {
DataFusionError::Execution(format!("REGEXP_EXTRACT: text cast failed: {e}"))
})?;
let pattern_large = cast(pattern_array.as_ref(), &DataType::LargeUtf8).map_err(|e| {
DataFusionError::Execution(format!("REGEXP_EXTRACT: pattern cast failed: {e}"))
})?;
let text = text_large.as_string::<i64>();
let pattern = pattern_large.as_string::<i64>();
let len = text.len();
// Pre-size result builder with conservative estimate
let mut estimated_total = 0usize;
for i in 0..len {
if !text.is_null(i) {
estimated_total = estimated_total.saturating_add(text.value_length(i) as usize);
if estimated_total > MAX_TOTAL_RESULT_SIZE {
return Err(DataFusionError::ResourcesExhausted(format!(
"REGEXP_EXTRACT total output exceeds {} bytes",
MAX_TOTAL_RESULT_SIZE
)));
}
}
}
let mut builder = LargeStringBuilder::with_capacity(len, estimated_total);
// Fast path: if pattern is scalar, compile once
let compiled_scalar: Option<Regex> = if pattern_is_scalar && len > 0 && !pattern.is_null(0)
{
Some(compile_regex_checked(pattern.value(0))?)
} else {
None
};
for i in 0..len {
if text.is_null(i) || pattern.is_null(i) {
builder.append_null();
continue;
}
let s = text.value(i);
let pat = pattern.value(i);
// Compile or reuse regex
let re = if let Some(ref compiled) = compiled_scalar {
compiled
} else {
// TODO: For performance-critical applications with repeating patterns,
// consider adding a small LRU cache here
&compile_regex_checked(pat)?
};
// First match only
if let Some(m) = re.find(s) {
let m_str = m.as_str();
if m_str.len() > MAX_SINGLE_MATCH {
return Err(DataFusionError::Execution(
"REGEXP_EXTRACT match exceeds per-row limit (1MB)".to_string(),
));
}
builder.append_value(m_str);
} else {
builder.append_null();
}
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
// Compile a regex with safety checks
fn compile_regex_checked(pattern: &str) -> datafusion_common::Result<Regex> {
if pattern.len() > MAX_PATTERN_LEN {
return Err(DataFusionError::Execution(format!(
"REGEXP_EXTRACT pattern too long (> {} chars)",
MAX_PATTERN_LEN
)));
}
RegexBuilder::new(pattern)
.size_limit(MAX_REGEX_SIZE)
.dfa_size_limit(MAX_DFA_SIZE)
.build()
.map_err(|e| {
DataFusionError::Execution(format!("REGEXP_EXTRACT invalid pattern '{}': {e}", pattern))
})
}
#[cfg(test)]
mod tests {
use datafusion_common::arrow::array::StringArray;
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
#[test]
fn test_regexp_extract_function_basic() {
let text_array = Arc::new(StringArray::from(vec!["version 1.2.3", "no match here"]));
let pattern_array = Arc::new(StringArray::from(vec!["\\d+\\.\\d+\\.\\d+", "\\d+"]));
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(text_array),
ColumnarValue::Array(pattern_array),
],
arg_fields: vec![
Arc::new(Field::new("arg_0", DataType::Utf8, false)),
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
],
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: 2,
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
};
let function = RegexpExtractFunction::default();
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let string_array = array.as_string::<i64>();
assert_eq!(string_array.value(0), "1.2.3");
assert!(string_array.is_null(1)); // no match should return NULL
} else {
panic!("Expected array result");
}
}
#[test]
fn test_regexp_extract_phone_number() {
let text_array = Arc::new(StringArray::from(vec!["Phone: 123-456-7890", "No phone"]));
let pattern_array = Arc::new(StringArray::from(vec![
"\\d{3}-\\d{3}-\\d{4}",
"\\d{3}-\\d{3}-\\d{4}",
]));
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(text_array),
ColumnarValue::Array(pattern_array),
],
arg_fields: vec![
Arc::new(Field::new("arg_0", DataType::Utf8, false)),
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
],
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: 2,
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
};
let function = RegexpExtractFunction::default();
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let string_array = array.as_string::<i64>();
assert_eq!(string_array.value(0), "123-456-7890");
assert!(string_array.is_null(1)); // no match should return NULL
} else {
panic!("Expected array result");
}
}
#[test]
fn test_regexp_extract_email() {
let text_array = Arc::new(StringArray::from(vec![
"Email: user@domain.com",
"Invalid email",
]));
let pattern_array = Arc::new(StringArray::from(vec![
"[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+",
"[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+",
]));
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(text_array),
ColumnarValue::Array(pattern_array),
],
arg_fields: vec![
Arc::new(Field::new("arg_0", DataType::Utf8, false)),
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
],
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: 2,
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
};
let function = RegexpExtractFunction::default();
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let string_array = array.as_string::<i64>();
assert_eq!(string_array.value(0), "user@domain.com");
assert!(string_array.is_null(1)); // no match should return NULL
} else {
panic!("Expected array result");
}
}
#[test]
fn test_regexp_extract_with_nulls() {
let text_array = Arc::new(StringArray::from(vec![Some("test 123"), None]));
let pattern_array = Arc::new(StringArray::from(vec![Some("\\d+"), Some("\\d+")]));
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(text_array),
ColumnarValue::Array(pattern_array),
],
arg_fields: vec![
Arc::new(Field::new("arg_0", DataType::Utf8, true)),
Arc::new(Field::new("arg_1", DataType::Utf8, false)),
],
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: 2,
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
};
let function = RegexpExtractFunction::default();
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let string_array = array.as_string::<i64>();
assert_eq!(string_array.value(0), "123");
assert!(string_array.is_null(1)); // NULL input should return NULL
} else {
panic!("Expected array result");
}
}
}
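A note on the TODO inside `invoke_with_args` above: when the pattern column is not a scalar but repeats values, every row currently recompiles its regex. A minimal sketch of the per-batch cache hinted at there is shown below; `extract_with_cache` is a hypothetical helper (not part of this commit) that reuses the file's own `compile_regex_checked` and its existing `regex::Regex` import, and a bounded LRU could replace the plain `HashMap` for very long batches.

// Hypothetical sketch: reuse compiled regexes across rows of a single batch.
fn extract_with_cache(
    texts: &[&str],
    patterns: &[&str],
) -> datafusion_common::Result<Vec<Option<String>>> {
    use std::collections::HashMap;
    let mut cache: HashMap<String, Regex> = HashMap::new();
    let mut out = Vec::with_capacity(texts.len());
    for (&text, &pat) in texts.iter().zip(patterns) {
        // Compile each distinct pattern at most once per batch.
        if !cache.contains_key(pat) {
            cache.insert(pat.to_string(), compile_regex_checked(pat)?);
        }
        let re = &cache[pat];
        out.push(re.find(text).map(|m| m.as_str().to_string()));
    }
    Ok(out)
}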

View File

@@ -14,6 +14,7 @@
use std::any::Any;
use std::fmt::{Debug, Formatter};
use std::hash::{Hash, Hasher};
use datafusion::arrow::datatypes::DataType;
use datafusion::logical_expr::{ScalarFunctionArgs, ScalarUDFImpl};
@@ -33,6 +34,20 @@ impl Debug for ScalarUdf {
}
}
impl PartialEq for ScalarUdf {
fn eq(&self, other: &Self) -> bool {
self.function.signature() == other.function.signature()
}
}
impl Eq for ScalarUdf {}
impl Hash for ScalarUdf {
fn hash<H: Hasher>(&self, state: &mut H) {
self.function.signature().hash(state)
}
}
impl ScalarUDFImpl for ScalarUdf {
fn as_any(&self) -> &dyn Any {
self

View File

@@ -32,10 +32,36 @@ use crate::system::define_nullary_udf;
const CURRENT_SCHEMA_FUNCTION_NAME: &str = "current_schema";
const CURRENT_SCHEMAS_FUNCTION_NAME: &str = "current_schemas";
const SESSION_USER_FUNCTION_NAME: &str = "session_user";
const CURRENT_DATABASE_FUNCTION_NAME: &str = "current_database";
define_nullary_udf!(CurrentSchemaFunction);
define_nullary_udf!(CurrentSchemasFunction);
define_nullary_udf!(SessionUserFunction);
define_nullary_udf!(CurrentDatabaseFunction);
impl Function for CurrentDatabaseFunction {
fn name(&self) -> &str {
CURRENT_DATABASE_FUNCTION_NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Utf8View)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let func_ctx = find_function_context(&args)?;
let db = func_ctx.query_ctx.current_catalog().to_string();
Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(db))))
}
}
// Though "current_schema" can be aliased to "database", to not cause any breaking changes,
// we are not doing it: not until https://github.com/apache/datafusion/issues/17469 is resolved.
@@ -141,6 +167,7 @@ impl PGCatalogFunction {
registry.register_scalar(CurrentSchemaFunction::default());
registry.register_scalar(CurrentSchemasFunction::default());
registry.register_scalar(SessionUserFunction::default());
registry.register_scalar(CurrentDatabaseFunction::default());
registry.register(pg_catalog::format_type::create_format_type_udf());
registry.register(pg_catalog::create_pg_get_partkeydef_udf());
registry.register(pg_catalog::has_privilege_udf::create_has_privilege_udf(

View File

@@ -345,6 +345,20 @@ fn build_struct(
Ok(datafusion_expr::ColumnarValue::Array(result_vector.to_arrow_array()))
}
}
impl PartialEq for #name {
fn eq(&self, other: &Self) -> bool {
self.signature == other.signature
}
}
impl Eq for #name {}
impl std::hash::Hash for #name {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.signature.hash(state)
}
}
}
.into()
}

View File

@@ -120,10 +120,16 @@ pub struct NodeInfo {
pub start_time_ms: u64,
// The node build cpus
#[serde(default)]
pub cpus: u32,
pub total_cpu_millicores: i64,
// The node build memory bytes
#[serde(default)]
pub memory_bytes: u64,
pub total_memory_bytes: i64,
// The node build cpu usage millicores
#[serde(default)]
pub cpu_usage_millicores: i64,
// The node build memory usage bytes
#[serde(default)]
pub memory_usage_bytes: i64,
// The node build hostname
#[serde(default)]
pub hostname: String,
@@ -333,8 +339,10 @@ mod tests {
version: "".to_string(),
git_commit: "".to_string(),
start_time_ms: 1,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
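Since the renamed and added counters all carry `#[serde(default)]`, a heartbeat payload from an older node that still sends `cpus`/`memory_bytes` (or omits these fields entirely) continues to deserialize; the new fields simply come out as `0`. A minimal illustration of that serde behavior, using a hypothetical probe struct rather than `NodeInfo` itself:

#[derive(serde::Deserialize)]
struct NodeInfoCompatProbe {
    #[serde(default)]
    total_cpu_millicores: i64,
    #[serde(default)]
    cpu_usage_millicores: i64,
}

#[test]
fn old_payload_without_new_fields_still_parses() {
    // Unknown fields (such as the old `cpus`) are ignored by default; missing fields fall back to 0.
    let probe: NodeInfoCompatProbe =
        serde_json::from_str(r#"{"cpus": 8, "hostname": "n1"}"#).unwrap();
    assert_eq!(probe.total_cpu_millicores, 0);
    assert_eq!(probe.cpu_usage_millicores, 0);
}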

View File

@@ -55,6 +55,10 @@ impl Display for RegionIdent {
/// The result of downgrade leader region.
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct DowngradeRegionReply {
/// The [RegionId].
/// For compatibility, it is defaulted to [RegionId::new(0, 0)].
#[serde(default)]
pub region_id: RegionId,
/// Returns the `last_entry_id` if available.
pub last_entry_id: Option<u64>,
/// Returns the `metadata_last_entry_id` if available (Only available for metric engine).
@@ -423,14 +427,60 @@ pub enum Instruction {
CloseRegions(Vec<RegionIdent>),
/// Upgrades a region.
UpgradeRegion(UpgradeRegion),
#[serde(
deserialize_with = "single_or_multiple_from",
alias = "DowngradeRegion"
)]
/// Downgrades a region.
DowngradeRegion(DowngradeRegion),
DowngradeRegions(Vec<DowngradeRegion>),
/// Invalidates batch cache.
InvalidateCaches(Vec<CacheIdent>),
/// Flushes regions.
FlushRegions(FlushRegions),
}
impl Instruction {
/// Converts the instruction into a vector of [OpenRegion].
pub fn into_open_regions(self) -> Option<Vec<OpenRegion>> {
match self {
Self::OpenRegions(open_regions) => Some(open_regions),
_ => None,
}
}
/// Converts the instruction into a vector of [RegionIdent].
pub fn into_close_regions(self) -> Option<Vec<RegionIdent>> {
match self {
Self::CloseRegions(close_regions) => Some(close_regions),
_ => None,
}
}
/// Converts the instruction into a [FlushRegions].
pub fn into_flush_regions(self) -> Option<FlushRegions> {
match self {
Self::FlushRegions(flush_regions) => Some(flush_regions),
_ => None,
}
}
/// Converts the instruction into a [DowngradeRegion].
pub fn into_downgrade_regions(self) -> Option<Vec<DowngradeRegion>> {
match self {
Self::DowngradeRegions(downgrade_region) => Some(downgrade_region),
_ => None,
}
}
/// Converts the instruction into a [UpgradeRegion].
pub fn into_upgrade_regions(self) -> Option<UpgradeRegion> {
match self {
Self::UpgradeRegion(upgrade_region) => Some(upgrade_region),
_ => None,
}
}
}
/// The reply of [UpgradeRegion].
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct UpgradeRegionReply {
@@ -452,6 +502,39 @@ impl Display for UpgradeRegionReply {
}
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct DowngradeRegionsReply {
pub replies: Vec<DowngradeRegionReply>,
}
impl DowngradeRegionsReply {
pub fn new(replies: Vec<DowngradeRegionReply>) -> Self {
Self { replies }
}
pub fn single(reply: DowngradeRegionReply) -> Self {
Self::new(vec![reply])
}
}
#[derive(Deserialize)]
#[serde(untagged)]
enum DowngradeRegionsCompat {
Single(DowngradeRegionReply),
Multiple(DowngradeRegionsReply),
}
fn downgrade_regions_compat_from<'de, D>(deserializer: D) -> Result<DowngradeRegionsReply, D::Error>
where
D: Deserializer<'de>,
{
let helper = DowngradeRegionsCompat::deserialize(deserializer)?;
Ok(match helper {
DowngradeRegionsCompat::Single(x) => DowngradeRegionsReply::new(vec![x]),
DowngradeRegionsCompat::Multiple(reply) => reply,
})
}
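// Editor's note, not part of this commit: `single_or_multiple_from`, referenced on the
// `DowngradeRegions` variant above, is not shown in this diff. Assuming it mirrors the
// untagged Single/Multiple pattern used for `DowngradeRegionsReply`, the instruction-side
// helper could be sketched like this (names are hypothetical):
#[derive(Deserialize)]
#[serde(untagged)]
enum SingleOrMultipleSketch {
    Single(DowngradeRegion),
    Multiple(Vec<DowngradeRegion>),
}
fn single_or_multiple_from_sketch<'de, D>(deserializer: D) -> Result<Vec<DowngradeRegion>, D::Error>
where
    D: Deserializer<'de>,
{
    Ok(match SingleOrMultipleSketch::deserialize(deserializer)? {
        SingleOrMultipleSketch::Single(x) => vec![x],
        SingleOrMultipleSketch::Multiple(xs) => xs,
    })
}
// This keeps the legacy single-object payload (and the `DowngradeRegion` alias) deserializable
// while the new variant carries a batch of regions.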
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum InstructionReply {
@@ -460,7 +543,11 @@ pub enum InstructionReply {
#[serde(alias = "close_region")]
CloseRegions(SimpleReply),
UpgradeRegion(UpgradeRegionReply),
DowngradeRegion(DowngradeRegionReply),
#[serde(
alias = "downgrade_region",
deserialize_with = "downgrade_regions_compat_from"
)]
DowngradeRegions(DowngradeRegionsReply),
FlushRegions(FlushRegionReply),
}
@@ -470,8 +557,8 @@ impl Display for InstructionReply {
Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply),
Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply),
Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply),
Self::DowngradeRegion(reply) => {
write!(f, "InstructionReply::DowngradeRegion({})", reply)
Self::DowngradeRegions(reply) => {
write!(f, "InstructionReply::DowngradeRegions({:?})", reply)
}
Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply),
}
@@ -493,6 +580,27 @@ impl InstructionReply {
_ => panic!("Expected OpenRegions reply"),
}
}
pub fn expect_upgrade_region_reply(self) -> UpgradeRegionReply {
match self {
Self::UpgradeRegion(reply) => reply,
_ => panic!("Expected UpgradeRegion reply"),
}
}
pub fn expect_downgrade_regions_reply(self) -> Vec<DowngradeRegionReply> {
match self {
Self::DowngradeRegions(reply) => reply.replies,
_ => panic!("Expected DowngradeRegion reply"),
}
}
pub fn expect_flush_regions_reply(self) -> FlushRegionReply {
match self {
Self::FlushRegions(reply) => reply,
_ => panic!("Expected FlushRegions reply"),
}
}
}
#[cfg(test)]
@@ -532,11 +640,27 @@ mod tests {
r#"{"CloseRegions":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#,
serialized
);
let downgrade_region = InstructionReply::DowngradeRegions(DowngradeRegionsReply::single(
DowngradeRegionReply {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: None,
},
));
let serialized = serde_json::to_string(&downgrade_region).unwrap();
assert_eq!(
r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#,
serialized
)
}
#[test]
fn test_deserialize_instruction() {
let open_region_instruction = r#"{"OpenRegion":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#;
let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#;
let open_region_instruction: Instruction =
serde_json::from_str(open_region_instruction).unwrap();
let open_region = Instruction::OpenRegions(vec![OpenRegion::new(
@@ -553,7 +677,7 @@ mod tests {
)]);
assert_eq!(open_region_instruction, open_region);
let close_region_instruction = r#"{"CloseRegion":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#;
let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#;
let close_region_instruction: Instruction =
serde_json::from_str(close_region_instruction).unwrap();
let close_region = Instruction::CloseRegions(vec![RegionIdent {
@@ -564,6 +688,15 @@ mod tests {
}]);
assert_eq!(close_region_instruction, close_region);
let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#;
let downgrade_region_instruction: Instruction =
serde_json::from_str(downgrade_region_instruction).unwrap();
let downgrade_region = Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id: RegionId::new(1024, 1),
flush_timeout: Some(Duration::from_millis(1000)),
}]);
assert_eq!(downgrade_region_instruction, downgrade_region);
let close_region_instruction_reply =
r#"{"result":true,"error":null,"type":"close_region"}"#;
let close_region_instruction_reply: InstructionReply =
@@ -582,6 +715,20 @@ mod tests {
error: None,
});
assert_eq!(open_region_instruction_reply, open_region_reply);
let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#;
let downgrade_region_instruction_reply: InstructionReply =
serde_json::from_str(downgrade_region_instruction_reply).unwrap();
let downgrade_region_reply = InstructionReply::DowngradeRegions(
DowngradeRegionsReply::single(DowngradeRegionReply {
region_id: RegionId::new(1024, 1),
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: None,
}),
);
assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply);
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -6,11 +6,14 @@ license.workspace = true
[dependencies]
common-base.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
lazy_static.workspace = true
nix.workspace = true
num_cpus.workspace = true
prometheus.workspace = true
sysinfo.workspace = true
tokio.workspace = true
[lints]
workspace = true

View File

@@ -117,7 +117,10 @@ pub fn get_cpu_limit_from_cgroups() -> Option<i64> {
None
}
fn get_cpu_usage() -> Option<i64> {
/// Get the CPU usage in millicores from the cgroups filesystem.
///
/// - Returns `None` if not running in a cgroups v2 environment or if the CPU usage cannot be read.
pub fn get_cpu_usage_from_cgroups() -> Option<i64> {
// In certain bare-metal environments, the `/sys/fs/cgroup/cpu.stat` file may be present and reflect system-wide CPU usage rather than container-specific metrics.
// To ensure accurate collection of container-level CPU usage, verify the existence of the `/sys/fs/cgroup/memory.current` file.
// The presence of this file typically indicates execution within a containerized environment, thereby validating the relevance of the collected CPU usage data.
@@ -142,6 +145,22 @@ fn get_cpu_usage() -> Option<i64> {
fields[1].trim().parse::<i64>().ok()
}
// Calculate the cpu usage in millicores from cgroups filesystem.
//
// - Return `0` if the current cpu usage is equal to the last cpu usage or the interval is 0.
pub(crate) fn calculate_cpu_usage(
current_cpu_usage_usecs: i64,
last_cpu_usage_usecs: i64,
interval_milliseconds: i64,
) -> i64 {
let diff = current_cpu_usage_usecs - last_cpu_usage_usecs;
if diff > 0 && interval_milliseconds > 0 {
((diff as f64 / interval_milliseconds as f64).round() as i64).max(1)
} else {
0
}
}
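// A quick sanity check of the conversion above (an illustrative test sketch, not part of
// this patch): 500_000 µs of CPU time consumed over a 5_000 ms window is 0.1 core,
// i.e. 100 millicores; no progress or a zero interval yields 0.
#[cfg(test)]
mod calculate_cpu_usage_example {
    use super::calculate_cpu_usage;
    #[test]
    fn millicore_conversion() {
        // 500_000 µs over 5_000 ms => 100 millicores.
        assert_eq!(calculate_cpu_usage(1_500_000, 1_000_000, 5_000), 100);
        // No increase in usage => 0.
        assert_eq!(calculate_cpu_usage(1_000_000, 1_000_000, 5_000), 0);
        // Zero interval => 0.
        assert_eq!(calculate_cpu_usage(1_500_000, 1_000_000, 0), 0);
    }
}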
// Check whether the cgroup is v2.
// - Return `true` if the cgroup is v2, otherwise return `false`.
// - Return `None` if detection fails or the platform is not Linux.
@@ -230,7 +249,7 @@ impl Collector for CgroupsMetricsCollector {
}
fn collect(&self) -> Vec<MetricFamily> {
if let Some(cpu_usage) = get_cpu_usage() {
if let Some(cpu_usage) = get_cpu_usage_from_cgroups() {
self.cpu_usage.set(cpu_usage);
}

View File

@@ -13,66 +13,7 @@
// limitations under the License.
mod cgroups;
mod resource;
pub use cgroups::*;
use common_base::readable_size::ReadableSize;
use sysinfo::System;
/// Get the total CPU in millicores.
pub fn get_total_cpu_millicores() -> i64 {
// Get CPU limit from cgroups filesystem.
if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() {
cgroup_cpu_limit
} else {
// Get total CPU cores from host system.
num_cpus::get() as i64 * 1000
}
}
/// Get the total memory in bytes.
pub fn get_total_memory_bytes() -> i64 {
// Get memory limit from cgroups filesystem.
if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() {
cgroup_memory_limit
} else {
// Get total memory from host system.
if sysinfo::IS_SUPPORTED_SYSTEM {
let mut sys_info = System::new();
sys_info.refresh_memory();
sys_info.total_memory() as i64
} else {
// If the system is not supported, return -1.
-1
}
}
}
/// Get the total CPU cores. The result will be rounded to the nearest integer.
/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2.
pub fn get_total_cpu_cores() -> usize {
((get_total_cpu_millicores() as f64) / 1000.0).round() as usize
}
/// Get the total memory in readable size.
pub fn get_total_memory_readable() -> Option<ReadableSize> {
if get_total_memory_bytes() > 0 {
Some(ReadableSize(get_total_memory_bytes() as u64))
} else {
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_get_total_cpu_cores() {
assert!(get_total_cpu_cores() > 0);
}
#[test]
fn test_get_total_memory_readable() {
assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0));
}
}
pub use resource::*;

View File

@@ -0,0 +1,187 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use std::sync::atomic::{AtomicI64, Ordering};
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_runtime::JoinHandle;
use common_telemetry::info;
use sysinfo::System;
use tokio::time::sleep;
use crate::cgroups::calculate_cpu_usage;
use crate::{
get_cpu_limit_from_cgroups, get_cpu_usage_from_cgroups, get_memory_limit_from_cgroups,
get_memory_usage_from_cgroups,
};
/// Get the total CPU in millicores. If no CPU limit is set in cgroups, it falls back to the total CPU of the host system, converted to millicores.
pub fn get_total_cpu_millicores() -> i64 {
// Get CPU limit from cgroups filesystem.
if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() {
cgroup_cpu_limit
} else {
// Get total CPU cores from host system.
num_cpus::get() as i64 * 1000
}
}
/// Get the total memory in bytes. If no memory limit is set in cgroups, it falls back to the total memory of the host system.
/// Returns 0 if the total host memory cannot be determined on this platform.
pub fn get_total_memory_bytes() -> i64 {
// Get memory limit from cgroups filesystem.
if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() {
cgroup_memory_limit
} else {
// Get total memory from host system.
if sysinfo::IS_SUPPORTED_SYSTEM {
let mut sys_info = System::new();
sys_info.refresh_memory();
sys_info.total_memory() as i64
} else {
// If the system is not supported, return 0
0
}
}
}
/// Get the total CPU cores. The result will be rounded to the nearest integer.
/// For example, if the total CPU is 1.5 cores (1500 millicores), the result will be 2.
pub fn get_total_cpu_cores() -> usize {
((get_total_cpu_millicores() as f64) / 1000.0).round() as usize
}
/// Get the total memory in readable size.
pub fn get_total_memory_readable() -> Option<ReadableSize> {
if get_total_memory_bytes() > 0 {
Some(ReadableSize(get_total_memory_bytes() as u64))
} else {
None
}
}
/// A reference to a `ResourceStat` implementation.
pub type ResourceStatRef = Arc<dyn ResourceStat + Send + Sync>;
/// A trait for getting resource statistics.
pub trait ResourceStat {
/// Get the total CPU in millicores.
fn get_total_cpu_millicores(&self) -> i64;
/// Get the total memory in bytes.
fn get_total_memory_bytes(&self) -> i64;
/// Get the CPU usage in millicores.
fn get_cpu_usage_millicores(&self) -> i64;
/// Get the memory usage in bytes.
fn get_memory_usage_bytes(&self) -> i64;
}
/// An implementation of the `ResourceStat` trait.
pub struct ResourceStatImpl {
cpu_usage_millicores: Arc<AtomicI64>,
last_cpu_usage_usecs: Arc<AtomicI64>,
calculate_interval: Duration,
handler: Option<JoinHandle<()>>,
}
impl Default for ResourceStatImpl {
fn default() -> Self {
Self {
cpu_usage_millicores: Arc::new(AtomicI64::new(0)),
last_cpu_usage_usecs: Arc::new(AtomicI64::new(0)),
calculate_interval: Duration::from_secs(5),
handler: None,
}
}
}
impl ResourceStatImpl {
/// Start collecting CPU usage periodically. It derives the CPU usage in millicores from the rate of change of `usage_usec` in `/sys/fs/cgroup/cpu.stat`.
/// It ONLY works in a cgroup v2 environment.
pub fn start_collect_cpu_usage(&mut self) {
if self.handler.is_some() {
return;
}
let cpu_usage_millicores = self.cpu_usage_millicores.clone();
let last_cpu_usage_usecs = self.last_cpu_usage_usecs.clone();
let calculate_interval = self.calculate_interval;
let handler = common_runtime::spawn_global(async move {
info!(
"Starting to collect CPU usage periodically for every {} seconds",
calculate_interval.as_secs()
);
loop {
let current_cpu_usage_usecs = get_cpu_usage_from_cgroups();
if let Some(current_cpu_usage_usecs) = current_cpu_usage_usecs {
// Skip the first sample; a previous reading is needed to compute the rate.
if last_cpu_usage_usecs.load(Ordering::Relaxed) == 0 {
last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed);
continue;
}
let cpu_usage = calculate_cpu_usage(
current_cpu_usage_usecs,
last_cpu_usage_usecs.load(Ordering::Relaxed),
calculate_interval.as_millis() as i64,
);
cpu_usage_millicores.store(cpu_usage, Ordering::Relaxed);
last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed);
}
sleep(calculate_interval).await;
}
});
self.handler = Some(handler);
}
}
impl ResourceStat for ResourceStatImpl {
/// Get the total CPU in millicores.
fn get_total_cpu_millicores(&self) -> i64 {
get_total_cpu_millicores()
}
/// Get the total memory in bytes.
fn get_total_memory_bytes(&self) -> i64 {
get_total_memory_bytes()
}
/// Get the CPU usage in millicores.
fn get_cpu_usage_millicores(&self) -> i64 {
self.cpu_usage_millicores.load(Ordering::Relaxed)
}
/// Get the memory usage in bytes.
/// It ONLY works in a cgroup v2 environment.
fn get_memory_usage_bytes(&self) -> i64 {
get_memory_usage_from_cgroups().unwrap_or_default()
}
}
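// A minimal usage sketch (hypothetical; mirrors how the datanode builder wires this up):
// create the stat, start the background CPU sampler, then share it behind the
// `ResourceStatRef` alias. The reported CPU usage stays 0 until two samples have been taken.
#[allow(dead_code)]
fn resource_stat_usage_sketch() -> ResourceStatRef {
    let mut stat = ResourceStatImpl::default();
    // Spawns a background task that refreshes `cpu_usage_millicores` on each interval.
    stat.start_collect_cpu_usage();
    Arc::new(stat)
}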
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_get_total_cpu_cores() {
assert!(get_total_cpu_cores() > 0);
}
#[test]
fn test_get_total_memory_readable() {
assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0));
}
}

View File

@@ -28,7 +28,7 @@ pub async fn check_output_stream(output: OutputData, expected: &str) {
_ => unreachable!(),
};
let pretty_print = recordbatches.pretty_print().unwrap();
assert_eq!(pretty_print, expected, "actual: \n{}", pretty_print);
assert_eq!(pretty_print, expected.trim(), "actual: \n{}", pretty_print);
}
pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {

View File

@@ -30,6 +30,7 @@ common-procedure.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
common-runtime.workspace = true
common-stat.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
common-version.workspace = true

View File

@@ -27,6 +27,7 @@ use common_meta::key::runtime_switch::RuntimeSwitchManager;
use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
pub use common_procedure::options::ProcedureConfig;
use common_stat::ResourceStatImpl;
use common_telemetry::{error, info, warn};
use common_wal::config::DatanodeWalConfig;
use common_wal::config::kafka::DatanodeKafkaConfig;
@@ -282,6 +283,9 @@ impl DatanodeBuilder {
open_all_regions.await?;
}
let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();
let heartbeat_task = if let Some(meta_client) = meta_client {
Some(
HeartbeatTask::try_new(
@@ -290,6 +294,7 @@ impl DatanodeBuilder {
meta_client,
cache_registry,
self.plugins.clone(),
Arc::new(resource_stat),
)
.await?,
)

View File

@@ -20,7 +20,6 @@ use std::time::Duration;
use api::v1::meta::heartbeat_request::NodeWorkloads;
use api::v1::meta::{DatanodeWorkloads, HeartbeatRequest, NodeInfo, Peer, RegionRole, RegionStat};
use common_base::Plugins;
use common_config::utils::ResourceSpec;
use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::datanode::REGION_STATISTIC_KEY;
use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
@@ -31,6 +30,7 @@ use common_meta::heartbeat::handler::{
};
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef};
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
use common_stat::ResourceStatRef;
use common_telemetry::{debug, error, info, trace, warn};
use common_workload::DatanodeWorkloadType;
use meta_client::MetaClientRef;
@@ -63,7 +63,7 @@ pub struct HeartbeatTask {
interval: u64,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
region_alive_keeper: Arc<RegionAliveKeeper>,
resource_spec: ResourceSpec,
resource_stat: ResourceStatRef,
}
impl Drop for HeartbeatTask {
@@ -80,6 +80,7 @@ impl HeartbeatTask {
meta_client: MetaClientRef,
cache_invalidator: CacheInvalidatorRef,
plugins: Plugins,
resource_stat: ResourceStatRef,
) -> Result<Self> {
let countdown_task_handler_ext = plugins.get::<CountdownTaskHandlerExtRef>();
let region_alive_keeper = Arc::new(RegionAliveKeeper::new(
@@ -109,7 +110,7 @@ impl HeartbeatTask {
interval: opts.heartbeat.interval.as_millis() as u64,
resp_handler_executor,
region_alive_keeper,
resource_spec: Default::default(),
resource_stat,
})
}
@@ -186,6 +187,7 @@ impl HeartbeatTask {
.context(error::HandleHeartbeatResponseSnafu)
}
#[allow(deprecated)]
/// Start heartbeat task, spawn background task.
pub async fn start(
&self,
@@ -237,8 +239,9 @@ impl HeartbeatTask {
self.region_alive_keeper.start(Some(event_receiver)).await?;
let mut last_sent = Instant::now();
let cpus = self.resource_spec.cpus as u32;
let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
let resource_stat = self.resource_stat.clone();
common_runtime::spawn_hb(async move {
let sleep = tokio::time::sleep(Duration::from_millis(0));
@@ -252,8 +255,13 @@ impl HeartbeatTask {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms: node_epoch,
cpus,
memory_bytes,
total_cpu_millicores,
total_memory_bytes,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
// TODO(zyy17): Remove these once the deprecated fields are removed from the proto.
cpus: total_cpu_millicores as u32,
memory_bytes: total_memory_bytes as u64,
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()
@@ -297,12 +305,18 @@ impl HeartbeatTask {
let topic_stats = region_server_clone.topic_stats();
let now = Instant::now();
let duration_since_epoch = (now - epoch).as_millis() as u64;
let req = HeartbeatRequest {
let mut req = HeartbeatRequest {
region_stats,
topic_stats,
duration_since_epoch,
..heartbeat_request.clone()
};
if let Some(info) = req.info.as_mut() {
info.cpu_usage_millicores = resource_stat.get_cpu_usage_millicores();
info.memory_usage_bytes = resource_stat.get_memory_usage_bytes();
}
sleep.as_mut().reset(now + Duration::from_millis(interval));
Some(req)
}

View File

@@ -13,16 +13,13 @@
// limitations under the License.
use async_trait::async_trait;
use common_meta::RegionIdent;
use common_meta::error::{InvalidHeartbeatResponseSnafu, Result as MetaResult};
use common_meta::heartbeat::handler::{
HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext,
};
use common_meta::instruction::{Instruction, InstructionReply};
use common_telemetry::error;
use futures::future::BoxFuture;
use snafu::OptionExt;
use store_api::storage::RegionId;
mod close_region;
mod downgrade_region;
@@ -30,10 +27,15 @@ mod flush_region;
mod open_region;
mod upgrade_region;
use crate::heartbeat::handler::close_region::CloseRegionsHandler;
use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler;
use crate::heartbeat::handler::flush_region::FlushRegionsHandler;
use crate::heartbeat::handler::open_region::OpenRegionsHandler;
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
use crate::heartbeat::task_tracker::TaskTracker;
use crate::region_server::RegionServer;
/// Handler for [Instruction::OpenRegion] and [Instruction::CloseRegion].
/// The handler for [`Instruction`]s.
#[derive(Clone)]
pub struct RegionHeartbeatResponseHandler {
region_server: RegionServer,
@@ -43,9 +45,14 @@ pub struct RegionHeartbeatResponseHandler {
open_region_parallelism: usize,
}
/// Handler of the instruction.
pub type InstructionHandler =
Box<dyn FnOnce(HandlerContext) -> BoxFuture<'static, Option<InstructionReply>> + Send>;
#[async_trait::async_trait]
pub trait InstructionHandler: Send + Sync {
async fn handle(
&self,
ctx: &HandlerContext,
instruction: Instruction,
) -> Option<InstructionReply>;
}
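// Illustrative only (not part of this patch): handlers are now trait objects rather than
// boxed closures, so adding a handler means implementing the trait. A hypothetical no-op
// handler would look like this:
#[allow(dead_code)]
struct NoopHandler;
#[async_trait::async_trait]
impl InstructionHandler for NoopHandler {
    async fn handle(
        &self,
        _ctx: &HandlerContext,
        _instruction: Instruction,
    ) -> Option<InstructionReply> {
        // A real handler would match on the instruction and reply accordingly.
        None
    }
}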
#[derive(Clone)]
pub struct HandlerContext {
@@ -56,10 +63,6 @@ pub struct HandlerContext {
}
impl HandlerContext {
fn region_ident_to_region_id(region_ident: &RegionIdent) -> RegionId {
RegionId::new(region_ident.table_id, region_ident.region_number)
}
#[cfg(test)]
pub fn new_for_test(region_server: RegionServer) -> Self {
Self {
@@ -90,31 +93,16 @@ impl RegionHeartbeatResponseHandler {
self
}
/// Builds the [InstructionHandler].
fn build_handler(&self, instruction: Instruction) -> MetaResult<InstructionHandler> {
fn build_handler(&self, instruction: &Instruction) -> MetaResult<Box<dyn InstructionHandler>> {
match instruction {
Instruction::OpenRegions(open_regions) => {
let open_region_parallelism = self.open_region_parallelism;
Ok(Box::new(move |handler_context| {
handler_context
.handle_open_regions_instruction(open_regions, open_region_parallelism)
}))
}
Instruction::CloseRegions(close_regions) => Ok(Box::new(move |handler_context| {
handler_context.handle_close_regions_instruction(close_regions)
})),
Instruction::DowngradeRegion(downgrade_region) => {
Ok(Box::new(move |handler_context| {
handler_context.handle_downgrade_region_instruction(downgrade_region)
}))
}
Instruction::UpgradeRegion(upgrade_region) => Ok(Box::new(move |handler_context| {
handler_context.handle_upgrade_region_instruction(upgrade_region)
Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler)),
Instruction::OpenRegions(_) => Ok(Box::new(OpenRegionsHandler {
open_region_parallelism: self.open_region_parallelism,
})),
Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler)),
Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler)),
Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler)),
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
Instruction::FlushRegions(flush_regions) => Ok(Box::new(move |handler_context| {
handler_context.handle_flush_regions_instruction(flush_regions)
})),
}
}
}
@@ -124,7 +112,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
fn is_acceptable(&self, ctx: &HeartbeatResponseHandlerContext) -> bool {
matches!(ctx.incoming_message.as_ref(), |Some((
_,
Instruction::DowngradeRegion { .. },
Instruction::DowngradeRegions { .. },
))| Some((
_,
Instruction::UpgradeRegion { .. }
@@ -151,15 +139,19 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
let catchup_tasks = self.catchup_tasks.clone();
let downgrade_tasks = self.downgrade_tasks.clone();
let flush_tasks = self.flush_tasks.clone();
let handler = self.build_handler(instruction)?;
let handler = self.build_handler(&instruction)?;
let _handle = common_runtime::spawn_global(async move {
let reply = handler(HandlerContext {
region_server,
catchup_tasks,
downgrade_tasks,
flush_tasks,
})
.await;
let reply = handler
.handle(
&HandlerContext {
region_server,
catchup_tasks,
downgrade_tasks,
flush_tasks,
},
instruction,
)
.await;
if let Some(reply) = reply
&& let Err(e) = mailbox.send((meta, reply)).await
@@ -179,6 +171,7 @@ mod tests {
use std::sync::Arc;
use std::time::Duration;
use common_meta::RegionIdent;
use common_meta::heartbeat::mailbox::{
HeartbeatMailbox, IncomingMessage, MailboxRef, MessageMeta,
};
@@ -249,10 +242,10 @@ mod tests {
);
// Downgrade region
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id: RegionId::new(2048, 1),
flush_timeout: Some(Duration::from_secs(1)),
});
}]);
assert!(
heartbeat_handler
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction)))
@@ -447,10 +440,10 @@ mod tests {
// Should be ok, if we try to downgrade it twice.
for _ in 0..2 {
let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0");
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_secs(1)),
});
}]);
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
@@ -458,33 +451,27 @@ mod tests {
let (_, reply) = heartbeat_env.receiver.recv().await.unwrap();
if let InstructionReply::DowngradeRegion(reply) = reply {
assert!(reply.exists);
assert!(reply.error.is_none());
assert_eq!(reply.last_entry_id.unwrap(), 0);
} else {
unreachable!()
}
let reply = &reply.expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.is_none());
assert_eq!(reply.last_entry_id.unwrap(), 0);
}
// Downgrades a not exists region.
let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0");
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id: RegionId::new(2048, 1),
flush_timeout: Some(Duration::from_secs(1)),
});
}]);
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
assert_matches!(control, HandleControl::Continue);
let (_, reply) = heartbeat_env.receiver.recv().await.unwrap();
if let InstructionReply::DowngradeRegion(reply) = reply {
assert!(!reply.exists);
assert!(reply.error.is_none());
assert!(reply.last_entry_id.is_none());
} else {
unreachable!()
}
let reply = reply.expect_downgrade_regions_reply();
assert!(!reply[0].exists);
assert!(reply[0].error.is_none());
assert!(reply[0].last_entry_id.is_none());
}
}

View File

@@ -12,60 +12,64 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_meta::RegionIdent;
use common_meta::instruction::{InstructionReply, SimpleReply};
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_telemetry::warn;
use futures::future::join_all;
use futures_util::future::BoxFuture;
use store_api::region_request::{RegionCloseRequest, RegionRequest};
use store_api::storage::RegionId;
use crate::error;
use crate::heartbeat::handler::HandlerContext;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
impl HandlerContext {
pub(crate) fn handle_close_regions_instruction(
self,
region_idents: Vec<RegionIdent>,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
let region_ids = region_idents
.into_iter()
.map(|region_ident| Self::region_ident_to_region_id(&region_ident))
.collect::<Vec<_>>();
#[derive(Debug, Clone, Copy, Default)]
pub struct CloseRegionsHandler;
let futs = region_ids.iter().map(|region_id| {
self.region_server
.handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {}))
});
#[async_trait::async_trait]
impl InstructionHandler for CloseRegionsHandler {
async fn handle(
&self,
ctx: &HandlerContext,
instruction: Instruction,
) -> Option<InstructionReply> {
// Safety: must be `Instruction::CloseRegions` instruction.
let region_idents = instruction.into_close_regions().unwrap();
let region_ids = region_idents
.into_iter()
.map(|region_ident| RegionId::new(region_ident.table_id, region_ident.region_number))
.collect::<Vec<_>>();
let results = join_all(futs).await;
let futs = region_ids.iter().map(|region_id| {
ctx.region_server
.handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {}))
});
let mut errors = vec![];
for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) {
match result {
Ok(_) => (),
Err(error::Error::RegionNotFound { .. }) => {
warn!(
"Received a close regions instruction from meta, but target region:{} is not found.",
region_id
);
}
Err(err) => errors.push(format!("region:{region_id}: {err:?}")),
let results = join_all(futs).await;
let mut errors = vec![];
for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) {
match result {
Ok(_) => (),
Err(error::Error::RegionNotFound { .. }) => {
warn!(
"Received a close regions instruction from meta, but target region:{} is not found.",
region_id
);
}
Err(err) => errors.push(format!("region:{region_id}: {err:?}")),
}
}
if errors.is_empty() {
return Some(InstructionReply::CloseRegions(SimpleReply {
result: true,
error: None,
}));
}
if errors.is_empty() {
return Some(InstructionReply::CloseRegions(SimpleReply {
result: true,
error: None,
}));
}
Some(InstructionReply::CloseRegions(SimpleReply {
result: false,
error: Some(errors.join("; ")),
}))
})
Some(InstructionReply::CloseRegions(SimpleReply {
result: false,
error: Some(errors.join("; ")),
}))
}
}

View File

@@ -12,209 +12,242 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_meta::instruction::{DowngradeRegion, DowngradeRegionReply, InstructionReply};
use common_meta::instruction::{
DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply,
};
use common_telemetry::tracing::info;
use common_telemetry::{error, warn};
use futures_util::future::BoxFuture;
use futures::future::join_all;
use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState};
use store_api::region_request::{RegionFlushRequest, RegionRequest};
use store_api::storage::RegionId;
use crate::heartbeat::handler::HandlerContext;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
use crate::heartbeat::task_tracker::WaitResult;
impl HandlerContext {
async fn downgrade_to_follower_gracefully(
#[derive(Debug, Clone, Copy, Default)]
pub struct DowngradeRegionsHandler;
impl DowngradeRegionsHandler {
async fn handle_downgrade_region(
ctx: &HandlerContext,
DowngradeRegion {
region_id,
flush_timeout,
}: DowngradeRegion,
) -> DowngradeRegionReply {
let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
warn!("Region: {region_id} is not found");
return DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: false,
error: None,
};
};
let region_server_moved = ctx.region_server.clone();
// Ignores flush request
if !writable {
warn!(
"Region: {region_id} is not writable, flush_timeout: {:?}",
flush_timeout
);
return ctx.downgrade_to_follower_gracefully(region_id).await;
}
// If flush_timeout is not set, directly convert region to follower.
let Some(flush_timeout) = flush_timeout else {
return ctx.downgrade_to_follower_gracefully(region_id).await;
};
// Set the region to downgrading; a downgrading region rejects all write requests
// but still accepts read and flush requests.
match ctx
.region_server
.set_region_role_state_gracefully(region_id, SettableRegionRoleState::DowngradingLeader)
.await
{
Ok(SetRegionRoleStateResponse::Success { .. }) => {}
Ok(SetRegionRoleStateResponse::NotFound) => {
warn!("Region: {region_id} is not found");
return DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: false,
error: None,
};
}
Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => {
error!(err; "Failed to convert region to downgrading leader - invalid transition");
return DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
};
}
Err(err) => {
error!(err; "Failed to convert region to downgrading leader");
return DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
};
}
}
let register_result = ctx
.downgrade_tasks
.try_register(
region_id,
Box::pin(async move {
info!("Flush region: {region_id} before converting region to follower");
region_server_moved
.handle_request(
region_id,
RegionRequest::Flush(RegionFlushRequest {
row_group_size: None,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another flush task is running for the region: {region_id}");
}
let mut watcher = register_result.into_watcher();
let result = ctx.downgrade_tasks.wait(&mut watcher, flush_timeout).await;
match result {
WaitResult::Timeout => DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!(
"Flush region timeout, region: {region_id}, timeout: {:?}",
flush_timeout
)),
},
WaitResult::Finish(Ok(_)) => ctx.downgrade_to_follower_gracefully(region_id).await,
WaitResult::Finish(Err(err)) => DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
},
}
}
}
#[async_trait::async_trait]
impl InstructionHandler for DowngradeRegionsHandler {
async fn handle(
&self,
region_id: RegionId,
ctx: &HandlerContext,
instruction: Instruction,
) -> Option<InstructionReply> {
// Safety: must be an `Instruction::DowngradeRegions` instruction.
let downgrade_regions = instruction.into_downgrade_regions().unwrap();
let futures = downgrade_regions
.into_iter()
.map(|downgrade_region| Self::handle_downgrade_region(ctx, downgrade_region));
// Join all futures; parallelism is governed by the underlying flush scheduler.
let results = join_all(futures).await;
Some(InstructionReply::DowngradeRegions(
DowngradeRegionsReply::new(results),
))
}
}
impl HandlerContext {
async fn downgrade_to_follower_gracefully(&self, region_id: RegionId) -> DowngradeRegionReply {
match self
.region_server
.set_region_role_state_gracefully(region_id, SettableRegionRoleState::Follower)
.await
{
Ok(SetRegionRoleStateResponse::Success(success)) => {
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: success.last_entry_id(),
metadata_last_entry_id: success.metadata_last_entry_id(),
exists: true,
error: None,
}))
}
Ok(SetRegionRoleStateResponse::Success(success)) => DowngradeRegionReply {
region_id,
last_entry_id: success.last_entry_id(),
metadata_last_entry_id: success.metadata_last_entry_id(),
exists: true,
error: None,
},
Ok(SetRegionRoleStateResponse::NotFound) => {
warn!("Region: {region_id} is not found");
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: false,
error: None,
}))
}
}
Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => {
error!(err; "Failed to convert region to follower - invalid transition");
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
}))
}
}
Err(err) => {
error!(err; "Failed to convert region to {}", SettableRegionRoleState::Follower);
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
DowngradeRegionReply {
region_id,
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
}))
}
}
}
}
pub(crate) fn handle_downgrade_region_instruction(
self,
DowngradeRegion {
region_id,
flush_timeout,
}: DowngradeRegion,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
let Some(writable) = self.region_server.is_region_leader(region_id) else {
warn!("Region: {region_id} is not found");
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
exists: false,
error: None,
}));
};
let region_server_moved = self.region_server.clone();
// Ignores flush request
if !writable {
warn!(
"Region: {region_id} is not writable, flush_timeout: {:?}",
flush_timeout
);
return self.downgrade_to_follower_gracefully(region_id).await;
}
// If flush_timeout is not set, directly convert region to follower.
let Some(flush_timeout) = flush_timeout else {
return self.downgrade_to_follower_gracefully(region_id).await;
};
// Sets region to downgrading,
// the downgrading region will reject all write requests.
// However, the downgrading region will still accept read, flush requests.
match self
.region_server
.set_region_role_state_gracefully(
region_id,
SettableRegionRoleState::DowngradingLeader,
)
.await
{
Ok(SetRegionRoleStateResponse::Success { .. }) => {}
Ok(SetRegionRoleStateResponse::NotFound) => {
warn!("Region: {region_id} is not found");
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
exists: false,
error: None,
}));
}
Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => {
error!(err; "Failed to convert region to downgrading leader - invalid transition");
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
}));
}
Err(err) => {
error!(err; "Failed to convert region to downgrading leader");
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
}));
}
}
let register_result = self
.downgrade_tasks
.try_register(
region_id,
Box::pin(async move {
info!("Flush region: {region_id} before converting region to follower");
region_server_moved
.handle_request(
region_id,
RegionRequest::Flush(RegionFlushRequest {
row_group_size: None,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another flush task is running for the region: {region_id}");
}
let mut watcher = register_result.into_watcher();
let result = self.downgrade_tasks.wait(&mut watcher, flush_timeout).await;
match result {
WaitResult::Timeout => {
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!(
"Flush region timeout, region: {region_id}, timeout: {:?}",
flush_timeout
)),
}))
}
WaitResult::Finish(Ok(_)) => self.downgrade_to_follower_gracefully(region_id).await,
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
}))
}
}
})
}
}
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::sync::Arc;
use std::time::Duration;
use common_meta::instruction::{DowngradeRegion, InstructionReply};
use common_meta::heartbeat::handler::{HandleControl, HeartbeatResponseHandler};
use common_meta::heartbeat::mailbox::MessageMeta;
use common_meta::instruction::{DowngradeRegion, Instruction};
use mito2::config::MitoConfig;
use mito2::engine::MITO_ENGINE_NAME;
use mito2::test_util::{CreateRequestBuilder, TestEnv};
use store_api::region_engine::{
RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
RegionEngine, RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
};
use store_api::region_request::RegionRequest;
use store_api::storage::RegionId;
use tokio::time::Instant;
use crate::error;
use crate::heartbeat::handler::HandlerContext;
use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler;
use crate::heartbeat::handler::tests::HeartbeatResponseTestEnv;
use crate::heartbeat::handler::{
HandlerContext, InstructionHandler, RegionHeartbeatResponseHandler,
};
use crate::tests::{MockRegionEngine, mock_region_server};
#[tokio::test]
@@ -227,20 +260,20 @@ mod tests {
let waits = vec![None, Some(Duration::from_millis(100u64))];
for flush_timeout in waits {
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout,
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(!reply.exists);
assert!(reply.error.is_none());
assert!(reply.last_entry_id.is_none());
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(!reply.exists);
assert!(reply.error.is_none());
assert!(reply.last_entry_id.is_none());
}
}
@@ -270,20 +303,20 @@ mod tests {
let waits = vec![None, Some(Duration::from_millis(100u64))];
for flush_timeout in waits {
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout,
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(reply.exists);
assert!(reply.error.is_none());
assert_eq!(reply.last_entry_id.unwrap(), 1024);
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.is_none());
assert_eq!(reply.last_entry_id.unwrap(), 1024);
}
}
@@ -305,20 +338,20 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server);
let flush_timeout = Duration::from_millis(100);
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: Some(flush_timeout),
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: Some(flush_timeout),
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(reply.exists);
assert!(reply.error.unwrap().contains("timeout"));
assert!(reply.last_entry_id.is_none());
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.as_ref().unwrap().contains("timeout"));
assert!(reply.last_entry_id.is_none());
}
#[tokio::test]
@@ -344,36 +377,38 @@ mod tests {
];
for flush_timeout in waits {
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout,
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(reply.exists);
assert!(reply.error.unwrap().contains("timeout"));
assert!(reply.last_entry_id.is_none());
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.as_ref().unwrap().contains("timeout"));
assert!(reply.last_entry_id.is_none());
}
let timer = Instant::now();
let reply = handler_context
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_millis(500)),
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_millis(500)),
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
// Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(reply.exists);
assert!(reply.error.is_none());
assert_eq!(reply.last_entry_id.unwrap(), 1024);
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.is_none());
assert_eq!(reply.last_entry_id.unwrap(), 1024);
}
#[tokio::test]
@@ -405,36 +440,36 @@ mod tests {
];
for flush_timeout in waits {
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout,
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(reply.exists);
assert!(reply.error.unwrap().contains("timeout"));
assert!(reply.last_entry_id.is_none());
}
}
let timer = Instant::now();
let reply = handler_context
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_millis(500)),
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
// Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.unwrap().contains("flush failed"));
assert!(reply.error.as_ref().unwrap().contains("timeout"));
assert!(reply.last_entry_id.is_none());
}
let timer = Instant::now();
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_millis(500)),
}]),
)
.await;
// Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(reply.error.as_ref().unwrap().contains("flush failed"));
assert!(reply.last_entry_id.is_none());
}
#[tokio::test]
@@ -449,19 +484,19 @@ mod tests {
});
mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server);
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: None,
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: None,
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(!reply.exists);
assert!(reply.error.is_none());
assert!(reply.last_entry_id.is_none());
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(!reply.exists);
assert!(reply.error.is_none());
assert!(reply.last_entry_id.is_none());
}
#[tokio::test]
@@ -480,23 +515,77 @@ mod tests {
});
mock_region_server.register_test_region(region_id, mock_engine);
let handler_context = HandlerContext::new_for_test(mock_region_server);
let reply = handler_context
.clone()
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: None,
})
let reply = DowngradeRegionsHandler
.handle(
&handler_context,
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: None,
}]),
)
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() {
assert!(reply.exists);
assert!(
reply
.error
.unwrap()
.contains("Failed to set region to readonly")
);
assert!(reply.last_entry_id.is_none());
}
let reply = &reply.unwrap().expect_downgrade_regions_reply()[0];
assert!(reply.exists);
assert!(
reply
.error
.as_ref()
.unwrap()
.contains("Failed to set region to readonly")
);
assert!(reply.last_entry_id.is_none());
}
#[tokio::test]
async fn test_downgrade_regions() {
common_telemetry::init_default_ut_logging();
let mut region_server = mock_region_server();
let heartbeat_handler = RegionHeartbeatResponseHandler::new(region_server.clone());
let mut engine_env = TestEnv::with_prefix("downgrade-regions").await;
let engine = engine_env.create_engine(MitoConfig::default()).await;
region_server.register_engine(Arc::new(engine.clone()));
let region_id = RegionId::new(1024, 1);
let region_id1 = RegionId::new(1024, 2);
let builder = CreateRequestBuilder::new();
let create_req = builder.build();
region_server
.handle_request(region_id, RegionRequest::Create(create_req))
.await
.unwrap();
let create_req1 = builder.build();
region_server
.handle_request(region_id1, RegionRequest::Create(create_req1))
.await
.unwrap();
let meta = MessageMeta::new_test(1, "test", "dn-1", "meta-0");
let instruction = Instruction::DowngradeRegions(vec![
DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_secs(1)),
},
DowngradeRegion {
region_id: region_id1,
flush_timeout: Some(Duration::from_secs(1)),
},
]);
let mut heartbeat_env = HeartbeatResponseTestEnv::new();
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
assert_matches!(control, HandleControl::Continue);
let (_, reply) = heartbeat_env.receiver.recv().await.unwrap();
let reply = reply.expect_downgrade_regions_reply();
assert_eq!(reply[0].region_id, region_id);
assert!(reply[0].exists);
assert!(reply[0].error.is_none());
assert_eq!(reply[0].last_entry_id, Some(0));
assert_eq!(reply[1].region_id, region_id1);
assert!(reply[1].exists);
assert!(reply[1].error.is_none());
assert_eq!(reply[1].last_entry_id, Some(0));
assert_eq!(engine.role(region_id).unwrap(), RegionRole::Follower);
assert_eq!(engine.role(region_id1).unwrap(), RegionRole::Follower);
}
}

View File

@@ -15,19 +15,53 @@
use std::time::Instant;
use common_meta::instruction::{
FlushErrorStrategy, FlushRegionReply, FlushRegions, FlushStrategy, InstructionReply,
FlushErrorStrategy, FlushRegionReply, FlushStrategy, Instruction, InstructionReply,
};
use common_telemetry::{debug, warn};
use futures_util::future::BoxFuture;
use store_api::region_request::{RegionFlushRequest, RegionRequest};
use store_api::storage::RegionId;
use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, UnexpectedSnafu};
use crate::heartbeat::handler::HandlerContext;
use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, Result, UnexpectedSnafu};
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
pub struct FlushRegionsHandler;
#[async_trait::async_trait]
impl InstructionHandler for FlushRegionsHandler {
async fn handle(
&self,
ctx: &HandlerContext,
instruction: Instruction,
) -> Option<InstructionReply> {
let start_time = Instant::now();
let flush_regions = instruction.into_flush_regions().unwrap();
let strategy = flush_regions.strategy;
let region_ids = flush_regions.region_ids;
let error_strategy = flush_regions.error_strategy;
let reply = if matches!(strategy, FlushStrategy::Async) {
// Asynchronous hint mode: fire-and-forget, no reply expected
ctx.handle_flush_hint(region_ids).await;
None
} else {
// Synchronous mode: return reply with results
let reply = ctx.handle_flush_sync(region_ids, error_strategy).await;
Some(InstructionReply::FlushRegions(reply))
};
let elapsed = start_time.elapsed();
debug!(
"FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}",
strategy, elapsed, reply
);
reply
}
}
impl HandlerContext {
/// Performs the actual region flush operation.
async fn perform_region_flush(&self, region_id: RegionId) -> Result<(), error::Error> {
async fn perform_region_flush(&self, region_id: RegionId) -> Result<()> {
let request = RegionRequest::Flush(RegionFlushRequest {
row_group_size: None,
});
@@ -92,7 +126,7 @@ impl HandlerContext {
}
/// Flushes a single region synchronously with proper error handling.
async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<(), error::Error> {
async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<()> {
// Check if region is leader and writable
let Some(writable) = self.region_server.is_region_leader(region_id) else {
return Err(RegionNotFoundSnafu { region_id }.build());
@@ -135,37 +169,6 @@ impl HandlerContext {
.build()),
}
}
/// Unified handler for FlushRegions with all flush semantics.
pub(crate) fn handle_flush_regions_instruction(
self,
flush_regions: FlushRegions,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
let start_time = Instant::now();
let strategy = flush_regions.strategy;
let region_ids = flush_regions.region_ids;
let error_strategy = flush_regions.error_strategy;
let reply = if matches!(strategy, FlushStrategy::Async) {
// Asynchronous hint mode: fire-and-forget, no reply expected
self.handle_flush_hint(region_ids).await;
None
} else {
// Synchronous mode: return reply with results
let reply = self.handle_flush_sync(region_ids, error_strategy).await;
Some(InstructionReply::FlushRegions(reply))
};
let elapsed = start_time.elapsed();
debug!(
"FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}",
strategy, elapsed, reply
);
reply
})
}
}
#[cfg(test)]
@@ -201,9 +204,11 @@ mod tests {
// Async hint mode
let flush_instruction = FlushRegions::async_batch(region_ids.clone());
let reply = handler_context
.clone()
.handle_flush_regions_instruction(flush_instruction)
let reply = FlushRegionsHandler
.handle(
&handler_context,
Instruction::FlushRegions(flush_instruction),
)
.await;
assert!(reply.is_none()); // Hint mode returns no reply
assert_eq!(*flushed_region_ids.read().unwrap(), region_ids);
@@ -212,8 +217,11 @@ mod tests {
flushed_region_ids.write().unwrap().clear();
let not_found_region_ids = (0..2).map(|i| RegionId::new(2048, i)).collect::<Vec<_>>();
let flush_instruction = FlushRegions::async_batch(not_found_region_ids);
let reply = handler_context
.handle_flush_regions_instruction(flush_instruction)
let reply = FlushRegionsHandler
.handle(
&handler_context,
Instruction::FlushRegions(flush_instruction),
)
.await;
assert!(reply.is_none());
assert!(flushed_region_ids.read().unwrap().is_empty());
@@ -238,20 +246,17 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server);
let flush_instruction = FlushRegions::sync_single(region_id);
let reply = handler_context
.handle_flush_regions_instruction(flush_instruction)
let reply = FlushRegionsHandler
.handle(
&handler_context,
Instruction::FlushRegions(flush_instruction),
)
.await;
assert!(reply.is_some());
if let Some(InstructionReply::FlushRegions(flush_reply)) = reply {
assert!(flush_reply.overall_success);
assert_eq!(flush_reply.results.len(), 1);
assert_eq!(flush_reply.results[0].0, region_id);
assert!(flush_reply.results[0].1.is_ok());
} else {
panic!("Expected FlushRegions reply");
}
let flush_reply = reply.unwrap().expect_flush_regions_reply();
assert!(flush_reply.overall_success);
assert_eq!(flush_reply.results.len(), 1);
assert_eq!(flush_reply.results[0].0, region_id);
assert!(flush_reply.results[0].1.is_ok());
assert_eq!(*flushed_region_ids.read().unwrap(), vec![region_id]);
}
@@ -281,18 +286,16 @@ mod tests {
// Sync batch with fail-fast strategy
let flush_instruction =
FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::FailFast);
let reply = handler_context
.handle_flush_regions_instruction(flush_instruction)
let reply = FlushRegionsHandler
.handle(
&handler_context,
Instruction::FlushRegions(flush_instruction),
)
.await;
assert!(reply.is_some());
if let Some(InstructionReply::FlushRegions(flush_reply)) = reply {
assert!(!flush_reply.overall_success); // Should fail due to non-existent regions
// With fail-fast, only process regions until first failure
assert!(flush_reply.results.len() <= region_ids.len());
} else {
panic!("Expected FlushRegions reply");
}
let flush_reply = reply.unwrap().expect_flush_regions_reply();
assert!(!flush_reply.overall_success); // Should fail due to non-existent regions
// With fail-fast, only process regions until first failure
assert!(flush_reply.results.len() <= region_ids.len());
}
#[tokio::test]
@@ -317,20 +320,18 @@ mod tests {
// Sync batch with try-all strategy
let flush_instruction =
FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::TryAll);
let reply = handler_context
.handle_flush_regions_instruction(flush_instruction)
let reply = FlushRegionsHandler
.handle(
&handler_context,
Instruction::FlushRegions(flush_instruction),
)
.await;
assert!(reply.is_some());
if let Some(InstructionReply::FlushRegions(flush_reply)) = reply {
assert!(!flush_reply.overall_success); // Should fail due to one non-existent region
// With try-all, should process all regions
assert_eq!(flush_reply.results.len(), region_ids.len());
// First should succeed, second should fail
assert!(flush_reply.results[0].1.is_ok());
assert!(flush_reply.results[1].1.is_err());
} else {
panic!("Expected FlushRegions reply");
}
let flush_reply = reply.unwrap().expect_flush_regions_reply();
assert!(!flush_reply.overall_success); // Should fail due to one non-existent region
// With try-all, should process all regions
assert_eq!(flush_reply.results.len(), region_ids.len());
// First should succeed, second should fail
assert!(flush_reply.results[0].1.is_ok());
assert!(flush_reply.results[1].1.is_err());
}
}

View File

@@ -12,56 +12,62 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
use common_meta::wal_options_allocator::prepare_wal_options;
use futures_util::future::BoxFuture;
use store_api::path_utils::table_dir;
use store_api::region_request::{PathType, RegionOpenRequest};
use store_api::storage::RegionId;
use crate::heartbeat::handler::HandlerContext;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
impl HandlerContext {
pub(crate) fn handle_open_regions_instruction(
self,
open_regions: Vec<OpenRegion>,
open_region_parallelism: usize,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
let requests = open_regions
.into_iter()
.map(|open_region| {
let OpenRegion {
region_ident,
region_storage_path,
mut region_options,
region_wal_options,
skip_wal_replay,
} = open_region;
let region_id = Self::region_ident_to_region_id(&region_ident);
prepare_wal_options(&mut region_options, region_id, &region_wal_options);
let request = RegionOpenRequest {
engine: region_ident.engine,
table_dir: table_dir(&region_storage_path, region_id.table_id()),
path_type: PathType::Bare,
options: region_options,
skip_wal_replay,
checkpoint: None,
};
(region_id, request)
})
.collect::<Vec<_>>();
pub struct OpenRegionsHandler {
pub open_region_parallelism: usize,
}
let result = self
.region_server
.handle_batch_open_requests(open_region_parallelism, requests, false)
.await;
let success = result.is_ok();
let error = result.as_ref().map_err(|e| format!("{e:?}")).err();
Some(InstructionReply::OpenRegions(SimpleReply {
result: success,
error,
}))
})
#[async_trait::async_trait]
impl InstructionHandler for OpenRegionsHandler {
async fn handle(
&self,
ctx: &HandlerContext,
instruction: Instruction,
) -> Option<InstructionReply> {
let open_regions = instruction.into_open_regions().unwrap();
let requests = open_regions
.into_iter()
.map(|open_region| {
let OpenRegion {
region_ident,
region_storage_path,
mut region_options,
region_wal_options,
skip_wal_replay,
} = open_region;
let region_id = RegionId::new(region_ident.table_id, region_ident.region_number);
prepare_wal_options(&mut region_options, region_id, &region_wal_options);
let request = RegionOpenRequest {
engine: region_ident.engine,
table_dir: table_dir(&region_storage_path, region_id.table_id()),
path_type: PathType::Bare,
options: region_options,
skip_wal_replay,
checkpoint: None,
};
(region_id, request)
})
.collect::<Vec<_>>();
let result = ctx
.region_server
.handle_batch_open_requests(self.open_region_parallelism, requests, false)
.await;
let success = result.is_ok();
let error = result.as_ref().map_err(|e| format!("{e:?}")).err();
Some(InstructionReply::OpenRegions(SimpleReply {
result: success,
error,
}))
}
}

View File

@@ -12,18 +12,24 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_telemetry::{info, warn};
use futures_util::future::BoxFuture;
use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint};
use crate::heartbeat::handler::HandlerContext;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
use crate::heartbeat::task_tracker::WaitResult;
impl HandlerContext {
pub(crate) fn handle_upgrade_region_instruction(
self,
UpgradeRegion {
#[derive(Debug, Clone, Copy, Default)]
pub struct UpgradeRegionsHandler;
#[async_trait::async_trait]
impl InstructionHandler for UpgradeRegionsHandler {
async fn handle(
&self,
ctx: &HandlerContext,
instruction: Instruction,
) -> Option<InstructionReply> {
let UpgradeRegion {
region_id,
last_entry_id,
metadata_last_entry_id,
@@ -31,116 +37,116 @@ impl HandlerContext {
location_id,
replay_entry_id,
metadata_replay_entry_id,
}: UpgradeRegion,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
let Some(writable) = self.region_server.is_region_leader(region_id) else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: false,
error: None,
}));
};
} = instruction.into_upgrade_regions().unwrap();
if writable {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: false,
error: None,
}));
};
if writable {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}));
}
let region_server_moved = ctx.region_server.clone();
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
// The catchup task is almost zero cost if the underlying region is writable.
// Therefore, it always registers a new catchup task.
let register_result = ctx
.catchup_tasks
.try_register(
region_id,
Box::pin(async move {
info!(
"Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"
);
region_server_moved
.handle_request(
region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another catchup task is running for the region: {region_id}");
}
// Returns immediately
let Some(replay_timeout) = replay_timeout else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
}));
};
// We don't care whether the task is newly registered or already running.
let mut watcher = register_result.into_watcher();
let result = ctx.catchup_tasks.wait(&mut watcher, replay_timeout).await;
match result {
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
})),
WaitResult::Finish(Ok(_)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}));
}))
}
let region_server_moved = self.region_server.clone();
let checkpoint = match (replay_entry_id, metadata_replay_entry_id) {
(Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint {
entry_id,
metadata_entry_id,
}),
_ => None,
};
// The catchup task is almost zero cost if the underlying region is already writable.
// Therefore, it always registers a new catchup task.
let register_result = self
.catchup_tasks
.try_register(
region_id,
Box::pin(async move {
info!("Executing region: {region_id} catchup to: last entry id {last_entry_id:?}");
region_server_moved
.handle_request(
region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
metadata_entry_id: metadata_last_entry_id,
location_id,
checkpoint,
}),
)
.await?;
Ok(())
}),
)
.await;
if register_result.is_busy() {
warn!("Another catchup task is running for the region: {region_id}");
}
// Returns immediately
let Some(replay_timeout) = replay_timeout else {
return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
}));
};
// We don't care whether it returns a newly registered task or an already running one.
let mut watcher = register_result.into_watcher();
let result = self.catchup_tasks.wait(&mut watcher, replay_timeout).await;
match result {
WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: None,
})),
WaitResult::Finish(Ok(_)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: true,
exists: true,
error: None,
}))
}
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
ready: false,
exists: true,
error: Some(format!("{err:?}")),
}))
}
error: Some(format!("{err:?}")),
}))
}
})
}
}
}
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::time::Duration;
use common_meta::instruction::{InstructionReply, UpgradeRegion};
use common_meta::instruction::{Instruction, UpgradeRegion};
use mito2::engine::MITO_ENGINE_NAME;
use store_api::region_engine::RegionRole;
use store_api::storage::RegionId;
use tokio::time::Instant;
use crate::error;
use crate::heartbeat::handler::HandlerContext;
use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler;
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
use crate::tests::{MockRegionEngine, mock_region_server};
#[tokio::test]
@@ -155,20 +161,20 @@ mod tests {
let waits = vec![None, Some(Duration::from_millis(100u64))];
for replay_timeout in waits {
let reply = handler_context
.clone()
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
})
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}),
)
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(!reply.exists);
assert!(reply.error.is_none());
}
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.exists);
assert!(reply.error.is_none());
}
}
@@ -192,21 +198,21 @@ mod tests {
let waits = vec![None, Some(Duration::from_millis(100u64))];
for replay_timeout in waits {
let reply = handler_context
.clone()
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
})
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}),
)
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
}
@@ -230,21 +236,21 @@ mod tests {
let waits = vec![None, Some(Duration::from_millis(100u64))];
for replay_timeout in waits {
let reply = handler_context
.clone()
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
})
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}),
)
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
}
@@ -271,40 +277,41 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server);
for replay_timeout in waits {
let reply = handler_context
.clone()
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
})
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
replay_timeout,
..Default::default()
}),
)
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
}
let timer = Instant::now();
let reply = handler_context
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
replay_timeout: Some(Duration::from_millis(500)),
..Default::default()
})
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
// Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(reply.ready);
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
let timer = Instant::now();
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
replay_timeout: Some(Duration::from_millis(500)),
..Default::default()
}),
)
.await;
// Must be less than 300 ms.
assert!(timer.elapsed().as_millis() < 300);
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
#[tokio::test]
@@ -329,37 +336,37 @@ mod tests {
let handler_context = HandlerContext::new_for_test(mock_region_server);
let reply = handler_context
.clone()
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
..Default::default()
})
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
..Default::default()
}),
)
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
// It didn't wait for the handler to return, so it has no idea about the error.
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
}
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_none());
let reply = handler_context
.clone()
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
replay_timeout: Some(Duration::from_millis(200)),
..Default::default()
})
let reply = UpgradeRegionsHandler
.handle(
&handler_context,
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
replay_timeout: Some(Duration::from_millis(200)),
..Default::default()
}),
)
.await;
assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_)));
if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() {
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_some());
assert!(reply.error.unwrap().contains("mock_error"));
}
let reply = reply.unwrap().expect_upgrade_region_reply();
assert!(!reply.ready);
assert!(reply.exists);
assert!(reply.error.is_some());
assert!(reply.error.unwrap().contains("mock_error"));
}
}
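
For orientation, here is a minimal, self-contained sketch of the handler shape this hunk migrates to, inferred only from the calls visible above (`UpgradeRegionsHandler.handle(&ctx, instruction)` returning `Option<InstructionReply>`, plus the `expect_upgrade_region_reply()` test helper). All types below are stand-ins, not the crate's real definitions.

```rust
// Hypothetical stand-ins for common_meta::instruction and heartbeat::handler
// types; only the overall shape mirrors the diff above.
#[derive(Debug, Default)]
struct HandlerContext;

#[derive(Debug)]
enum Instruction {
    UpgradeRegion { region_id: u64 },
}

#[derive(Debug)]
struct UpgradeRegionReply {
    ready: bool,
    exists: bool,
    error: Option<String>,
}

#[derive(Debug)]
enum InstructionReply {
    UpgradeRegion(UpgradeRegionReply),
}

impl InstructionReply {
    // Mirrors the `expect_upgrade_region_reply()` helper the tests now use.
    fn expect_upgrade_region_reply(self) -> UpgradeRegionReply {
        match self {
            InstructionReply::UpgradeRegion(reply) => reply,
        }
    }
}

#[async_trait::async_trait]
trait InstructionHandler: Send + Sync {
    async fn handle(
        &self,
        ctx: &HandlerContext,
        instruction: Instruction,
    ) -> Option<InstructionReply>;
}

#[derive(Debug, Clone, Copy, Default)]
struct UpgradeRegionsHandler;

#[async_trait::async_trait]
impl InstructionHandler for UpgradeRegionsHandler {
    async fn handle(
        &self,
        _ctx: &HandlerContext,
        instruction: Instruction,
    ) -> Option<InstructionReply> {
        // The real handler registers a catchup task and optionally waits for it;
        // this stub simply acknowledges the region.
        let Instruction::UpgradeRegion { region_id: _ } = instruction;
        Some(InstructionReply::UpgradeRegion(UpgradeRegionReply {
            ready: true,
            exists: true,
            error: None,
        }))
    }
}

#[tokio::main]
async fn main() {
    let reply = UpgradeRegionsHandler
        .handle(&HandlerContext, Instruction::UpgradeRegion { region_id: 1 })
        .await
        .unwrap()
        .expect_upgrade_region_reply();
    assert!(reply.ready && reply.exists && reply.error.is_none());
}
```

Dispatching through a trait object rather than a `HandlerContext` method keeps the context free of per-instruction logic, which is presumably the point of the refactor.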

View File

@@ -24,6 +24,7 @@ use std::sync::Arc;
use common_base::bytes::StringBytes;
use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value as Json};
use snafu::{ResultExt, ensure};
@@ -45,7 +46,7 @@ use crate::value::{ListValue, StructValue, Value};
/// convert them to fully structured StructValue for user-facing APIs: the UI protocol and the UDF interface.
///
/// **Important**: This settings only controls the internal form of JSON encoding.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum JsonStructureSettings {
// TODO(sunng87): provide a limit
Structured(Option<StructType>),
@@ -111,6 +112,12 @@ impl JsonStructureSettings {
}
}
impl Default for JsonStructureSettings {
fn default() -> Self {
Self::Structured(None)
}
}
impl<'a> JsonContext<'a> {
/// Create a new context with an updated key path
pub fn with_key(&self, key: &str) -> JsonContext<'a> {

View File

@@ -32,8 +32,9 @@ pub use crate::schema::column_schema::{
COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata,
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY,
FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY,
JSON_STRUCTURE_SETTINGS_KEY, Metadata, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType, TIME_INDEX_KEY,
};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;
@@ -368,8 +369,7 @@ impl TryFrom<DFSchemaRef> for Schema {
type Error = Error;
fn try_from(value: DFSchemaRef) -> Result<Self> {
let s: ArrowSchema = value.as_ref().into();
s.try_into()
value.inner().clone().try_into()
}
}
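
The one-line change above replaces rebuilding an `ArrowSchema` from the `DFSchema` with `DFSchema::inner()`, which hands back the underlying `Arc<arrow::Schema>` directly. A small hedged sketch of that accessor in isolation, assuming the `datafusion-common` and `arrow-schema` crates pinned in this workspace:

```rust
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use datafusion_common::{DFSchema, Result};

fn main() -> Result<()> {
    let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new(
        "ts",
        DataType::Int64,
        false,
    )]));
    // DFSchema wraps the arrow schema; `inner()` exposes that wrapped Arc, so a
    // consumer can clone it and convert onward without reconstructing the fields.
    let df_schema = DFSchema::try_from(arrow_schema.clone())?;
    assert_eq!(df_schema.inner().fields().len(), 1);
    assert_eq!(df_schema.inner().field(0).name(), "ts");
    Ok(())
}
```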

View File

@@ -23,6 +23,7 @@ use sqlparser_derive::{Visit, VisitMut};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result};
use crate::json::JsonStructureSettings;
use crate::schema::TYPE_KEY;
use crate::schema::constraint::ColumnDefaultConstraint;
use crate::value::Value;
@@ -41,6 +42,7 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext";
pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index";
/// Key used to store skip options in arrow field's metadata.
pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index";
pub const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings";
/// Keys used in fulltext options
pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable";
@@ -391,6 +393,21 @@ impl ColumnSchema {
self.metadata.remove(SKIPPING_INDEX_KEY);
Ok(())
}
pub fn json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
self.metadata
.get(JSON_STRUCTURE_SETTINGS_KEY)
.map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json }))
.transpose()
}
pub fn with_json_structure_settings(&mut self, settings: &JsonStructureSettings) -> Result<()> {
self.metadata.insert(
JSON_STRUCTURE_SETTINGS_KEY.to_string(),
serde_json::to_string(settings).context(error::SerializeSnafu)?,
);
Ok(())
}
}
/// Column extended type set in column schema's metadata.
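
Taken together with the serde derives added to `JsonStructureSettings` a few files up, the two new accessors simply round-trip the settings as a JSON string under `JSON_STRUCTURE_SETTINGS_KEY` in the column's metadata map. A hedged, self-contained sketch of that round trip; the enum below is a trimmed stand-in (only the `Structured(None)` default case, with `String` in place of `StructType`), and a plain `HashMap` stands in for `ColumnSchema`'s metadata.

```rust
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

// Trimmed stand-in for JsonStructureSettings; the real enum wraps an optional
// StructType and may carry more variants.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
enum JsonStructureSettings {
    Structured(Option<String>),
}

impl Default for JsonStructureSettings {
    fn default() -> Self {
        Self::Structured(None)
    }
}

const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings";

fn main() -> Result<(), serde_json::Error> {
    let mut metadata: HashMap<String, String> = HashMap::new();

    // with_json_structure_settings: serialize and store under the metadata key.
    let settings = JsonStructureSettings::default();
    metadata.insert(
        JSON_STRUCTURE_SETTINGS_KEY.to_string(),
        serde_json::to_string(&settings)?,
    );

    // json_structure_settings: read it back, yielding None if the key is absent.
    let decoded: Option<JsonStructureSettings> = metadata
        .get(JSON_STRUCTURE_SETTINGS_KEY)
        .map(|json| serde_json::from_str(json))
        .transpose()?;
    assert_eq!(decoded, Some(settings));
    Ok(())
}
```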

View File

@@ -15,6 +15,7 @@
use std::str::FromStr;
use arrow::datatypes::DataType as ArrowDataType;
use arrow_schema::Fields;
use common_base::bytes::Bytes;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
@@ -63,7 +64,10 @@ impl DataType for JsonType {
}
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Binary
match self.format {
JsonFormat::Jsonb => ArrowDataType::Binary,
JsonFormat::Native(_) => ArrowDataType::Struct(Fields::empty()),
}
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {

View File

@@ -1208,7 +1208,9 @@ impl TryFrom<ScalarValue> for Value {
.collect::<Result<Vec<Value>>>()?;
Value::Struct(StructValue::try_new(items, struct_type)?)
}
ScalarValue::Decimal256(_, _, _)
ScalarValue::Decimal32(_, _, _)
| ScalarValue::Decimal64(_, _, _)
| ScalarValue::Decimal256(_, _, _)
| ScalarValue::FixedSizeList(_)
| ScalarValue::LargeList(_)
| ScalarValue::Dictionary(_, _)

View File

@@ -245,7 +245,9 @@ impl Helper {
length,
)
}
ScalarValue::Decimal256(_, _, _)
ScalarValue::Decimal32(_, _, _)
| ScalarValue::Decimal64(_, _, _)
| ScalarValue::Decimal256(_, _, _)
| ScalarValue::FixedSizeList(_)
| ScalarValue::LargeList(_)
| ScalarValue::Dictionary(_, _)

View File

@@ -427,7 +427,7 @@ fn expand_tumble_analyzer(
/// This is a placeholder for tumble_start and tumble_end function, so that datafusion can
/// recognize them as scalar function
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct TumbleExpand {
signature: Signature,
name: String,

View File

@@ -18,7 +18,6 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use api::v1::meta::{HeartbeatRequest, Peer};
use common_config::utils::ResourceSpec;
use common_error::ext::BoxedError;
use common_meta::heartbeat::handler::{
HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
@@ -26,6 +25,7 @@ use common_meta::heartbeat::handler::{
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
use common_meta::key::flow::flow_state::FlowStat;
use common_stat::ResourceStatRef;
use common_telemetry::{debug, error, info, warn};
use greptime_proto::v1::meta::NodeInfo;
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
@@ -69,7 +69,7 @@ pub struct HeartbeatTask {
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
running: Arc<AtomicBool>,
query_stat_size: Option<SizeReportSender>,
resource_spec: ResourceSpec,
resource_stat: ResourceStatRef,
}
impl HeartbeatTask {
@@ -77,11 +77,13 @@ impl HeartbeatTask {
self.query_stat_size = Some(query_stat_size);
self
}
pub fn new(
opts: &FlownodeOptions,
meta_client: Arc<MetaClient>,
heartbeat_opts: HeartbeatOptions,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
resource_stat: ResourceStatRef,
) -> Self {
Self {
node_id: opts.node_id.unwrap_or(0),
@@ -93,7 +95,7 @@ impl HeartbeatTask {
resp_handler_executor,
running: Arc::new(AtomicBool::new(false)),
query_stat_size: None,
resource_spec: Default::default(),
resource_stat,
}
}
@@ -146,6 +148,8 @@ impl HeartbeatTask {
heartbeat_request: &HeartbeatRequest,
message: Option<OutgoingMessage>,
latest_report: &Option<FlowStat>,
cpu_usage: i64,
memory_usage: i64,
) -> Option<HeartbeatRequest> {
let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
Some(Ok(message)) => Some(message),
@@ -170,21 +174,38 @@ impl HeartbeatTask {
.collect(),
});
Some(HeartbeatRequest {
let mut heartbeat_request = HeartbeatRequest {
mailbox_message,
flow_stat,
..heartbeat_request.clone()
})
};
if let Some(info) = heartbeat_request.info.as_mut() {
info.cpu_usage_millicores = cpu_usage;
info.memory_usage_bytes = memory_usage;
}
Some(heartbeat_request)
}
fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option<NodeInfo> {
#[allow(deprecated)]
fn build_node_info(
start_time_ms: u64,
total_cpu_millicores: i64,
total_memory_bytes: i64,
) -> Option<NodeInfo> {
let build_info = common_version::build_info();
Some(NodeInfo {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms,
cpus,
memory_bytes,
total_cpu_millicores,
total_memory_bytes,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
// TODO(zyy17): Remove these assignments once the deprecated fields are dropped from the proto.
cpus: total_cpu_millicores as u32,
memory_bytes: total_memory_bytes as u64,
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()
@@ -203,9 +224,9 @@ impl HeartbeatTask {
id: self.node_id,
addr: self.peer_addr.clone(),
});
let cpus = self.resource_spec.cpus as u32;
let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
let resource_stat = self.resource_stat.clone();
let query_stat_size = self.query_stat_size.clone();
common_runtime::spawn_hb(async move {
@@ -218,7 +239,7 @@ impl HeartbeatTask {
let heartbeat_request = HeartbeatRequest {
peer: self_peer,
node_epoch,
info: Self::build_node_info(node_epoch, cpus, memory_bytes),
info: Self::build_node_info(node_epoch, total_cpu_millicores, total_memory_bytes),
..Default::default()
};
@@ -226,7 +247,7 @@ impl HeartbeatTask {
let req = tokio::select! {
message = outgoing_rx.recv() => {
if let Some(message) = message {
Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report)
Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report, 0, 0)
} else {
warn!("Sender has been dropped, exiting the heartbeat loop");
// Receives None that means Sender was dropped, we need to break the current loop
@@ -234,7 +255,7 @@ impl HeartbeatTask {
}
}
_ = interval.tick() => {
Self::new_heartbeat_request(&heartbeat_request, None, &latest_report)
Self::new_heartbeat_request(&heartbeat_request, None, &latest_report, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
}
};
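
This flownode heartbeat task (and the frontend one further below) now pulls its numbers from a `ResourceStatRef` with four getters. A minimal sketch of that assumed shape, using the 1 core = 1000 millicores convention the field names imply; the real trait and its collector live in `common_stat` and may differ.

```rust
use std::sync::Arc;

// Assumed shape of the common_stat abstraction; the getter names come from the
// calls in this diff, the trait itself is a stand-in.
trait ResourceStat: Send + Sync {
    fn get_total_cpu_millicores(&self) -> i64;
    fn get_total_memory_bytes(&self) -> i64;
    fn get_cpu_usage_millicores(&self) -> i64;
    fn get_memory_usage_bytes(&self) -> i64;
}

type ResourceStatRef = Arc<dyn ResourceStat>;

// Toy implementation reporting fixed numbers. A real implementation would sample
// the process periodically (compare ResourceStatImpl::start_collect_cpu_usage in
// the metasrv builder hunk further below).
struct StaticStat {
    cores: i64,
    memory_bytes: i64,
}

impl ResourceStat for StaticStat {
    fn get_total_cpu_millicores(&self) -> i64 {
        self.cores * 1000
    }
    fn get_total_memory_bytes(&self) -> i64 {
        self.memory_bytes
    }
    fn get_cpu_usage_millicores(&self) -> i64 {
        250 // a quarter of one core
    }
    fn get_memory_usage_bytes(&self) -> i64 {
        512 * 1024 * 1024
    }
}

fn main() {
    let stat: ResourceStatRef = Arc::new(StaticStat {
        cores: 8,
        memory_bytes: 16 * 1024 * 1024 * 1024,
    });
    assert_eq!(stat.get_total_cpu_millicores(), 8000);
    println!(
        "usage: {} millicores, {} bytes",
        stat.get_cpu_usage_millicores(),
        stat.get_memory_usage_bytes()
    );
}
```

In the diff, the totals are captured once before the heartbeat loop starts, while the usage getters are polled on every interval tick; the mailbox-message path passes `0, 0` instead of sampling.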

View File

@@ -490,6 +490,7 @@ impl<'a> FlownodeServiceBuilder<'a> {
let config = GrpcServerConfig {
max_recv_message_size: opts.grpc.max_recv_message_size.as_bytes() as usize,
max_send_message_size: opts.grpc.max_send_message_size.as_bytes() as usize,
max_total_message_memory: opts.grpc.max_total_message_memory.as_bytes() as usize,
tls: opts.grpc.tls.clone(),
max_connection_age: opts.grpc.max_connection_age,
};

View File

@@ -37,6 +37,7 @@ common-procedure.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
common-runtime.workspace = true
common-stat.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
common-version.workspace = true

View File

@@ -18,12 +18,12 @@ mod tests;
use std::sync::Arc;
use api::v1::meta::{HeartbeatRequest, NodeInfo, Peer};
use common_config::utils::ResourceSpec;
use common_meta::heartbeat::handler::{
HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
};
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
use common_stat::ResourceStatRef;
use common_telemetry::{debug, error, info, warn};
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
use servers::addrs;
@@ -47,7 +47,7 @@ pub struct HeartbeatTask {
retry_interval: Duration,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
start_time_ms: u64,
resource_spec: ResourceSpec,
resource_stat: ResourceStatRef,
}
impl HeartbeatTask {
@@ -56,6 +56,7 @@ impl HeartbeatTask {
meta_client: Arc<MetaClient>,
heartbeat_opts: HeartbeatOptions,
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
resource_stat: ResourceStatRef,
) -> Self {
HeartbeatTask {
// if internal grpc is configured, use its address as the peer address
@@ -71,7 +72,7 @@ impl HeartbeatTask {
retry_interval: heartbeat_opts.retry_interval,
resp_handler_executor,
start_time_ms: common_time::util::current_time_millis() as u64,
resource_spec: Default::default(),
resource_stat,
}
}
@@ -133,6 +134,8 @@ impl HeartbeatTask {
fn new_heartbeat_request(
heartbeat_request: &HeartbeatRequest,
message: Option<OutgoingMessage>,
cpu_usage: i64,
memory_usage: i64,
) -> Option<HeartbeatRequest> {
let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
Some(Ok(message)) => Some(message),
@@ -143,21 +146,38 @@ impl HeartbeatTask {
None => None,
};
Some(HeartbeatRequest {
let mut heartbeat_request = HeartbeatRequest {
mailbox_message,
..heartbeat_request.clone()
})
};
if let Some(info) = heartbeat_request.info.as_mut() {
info.memory_usage_bytes = memory_usage;
info.cpu_usage_millicores = cpu_usage;
}
Some(heartbeat_request)
}
fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option<NodeInfo> {
#[allow(deprecated)]
fn build_node_info(
start_time_ms: u64,
total_cpu_millicores: i64,
total_memory_bytes: i64,
) -> Option<NodeInfo> {
let build_info = common_version::build_info();
Some(NodeInfo {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms,
cpus,
memory_bytes,
total_cpu_millicores,
total_memory_bytes,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
// TODO(zyy17): Remove these assignments once the deprecated fields are dropped from the proto.
cpus: total_cpu_millicores as u32,
memory_bytes: total_memory_bytes as u64,
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()
@@ -177,16 +197,20 @@ impl HeartbeatTask {
id: 0,
addr: self.peer_addr.clone(),
});
let cpus = self.resource_spec.cpus as u32;
let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
let resource_stat = self.resource_stat.clone();
common_runtime::spawn_hb(async move {
let sleep = tokio::time::sleep(Duration::from_millis(0));
tokio::pin!(sleep);
let heartbeat_request = HeartbeatRequest {
peer: self_peer,
info: Self::build_node_info(start_time_ms, cpus, memory_bytes),
info: Self::build_node_info(
start_time_ms,
total_cpu_millicores,
total_memory_bytes,
),
..Default::default()
};
@@ -194,7 +218,7 @@ impl HeartbeatTask {
let req = tokio::select! {
message = outgoing_rx.recv() => {
if let Some(message) = message {
Self::new_heartbeat_request(&heartbeat_request, Some(message))
Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0)
} else {
warn!("Sender has been dropped, exiting the heartbeat loop");
// Receives None that means Sender was dropped, we need to break the current loop
@@ -202,8 +226,8 @@ impl HeartbeatTask {
}
}
_ = &mut sleep => {
sleep.as_mut().reset(Instant::now() + report_interval);
Self::new_heartbeat_request(&heartbeat_request, None)
sleep.as_mut().reset(Instant::now() + report_interval);
Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
}
};

View File

@@ -24,7 +24,9 @@ mod util;
use std::fmt::Debug;
use std::sync::Arc;
use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role};
use api::v1::meta::{
MetasrvNodeInfo, ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role,
};
pub use ask_leader::{AskLeader, LeaderProvider, LeaderProviderRef};
use cluster::Client as ClusterClient;
pub use cluster::ClusterKvBackend;
@@ -371,7 +373,8 @@ impl ClusterInfo for MetaClient {
let mut nodes = if get_metasrv_nodes {
let last_activity_ts = -1; // Metasrv does not provide this information.
let (leader, followers) = cluster_client.get_metasrv_peers().await?;
let (leader, followers): (Option<MetasrvNodeInfo>, Vec<MetasrvNodeInfo>) =
cluster_client.get_metasrv_peers().await?;
followers
.into_iter()
.map(|node| {
@@ -383,8 +386,10 @@ impl ClusterInfo for MetaClient {
version: node_info.version,
git_commit: node_info.git_commit,
start_time_ms: node_info.start_time_ms,
cpus: node_info.cpus,
memory_bytes: node_info.memory_bytes,
total_cpu_millicores: node_info.total_cpu_millicores,
total_memory_bytes: node_info.total_memory_bytes,
cpu_usage_millicores: node_info.cpu_usage_millicores,
memory_usage_bytes: node_info.memory_usage_bytes,
hostname: node_info.hostname,
}
} else {
@@ -396,8 +401,10 @@ impl ClusterInfo for MetaClient {
version: node.version,
git_commit: node.git_commit,
start_time_ms: node.start_time_ms,
cpus: node.cpus,
memory_bytes: node.memory_bytes,
total_cpu_millicores: node.cpus as i64,
total_memory_bytes: node.memory_bytes as i64,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "".to_string(),
}
}
@@ -411,8 +418,10 @@ impl ClusterInfo for MetaClient {
version: node_info.version,
git_commit: node_info.git_commit,
start_time_ms: node_info.start_time_ms,
cpus: node_info.cpus,
memory_bytes: node_info.memory_bytes,
total_cpu_millicores: node_info.total_cpu_millicores,
total_memory_bytes: node_info.total_memory_bytes,
cpu_usage_millicores: node_info.cpu_usage_millicores,
memory_usage_bytes: node_info.memory_usage_bytes,
hostname: node_info.hostname,
}
} else {
@@ -424,8 +433,10 @@ impl ClusterInfo for MetaClient {
version: node.version,
git_commit: node.git_commit,
start_time_ms: node.start_time_ms,
cpus: node.cpus,
memory_bytes: node.memory_bytes,
total_cpu_millicores: node.cpus as i64,
total_memory_bytes: node.memory_bytes as i64,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "".to_string(),
}
}

View File

@@ -39,6 +39,7 @@ common-meta.workspace = true
common-options.workspace = true
common-procedure.workspace = true
common-runtime.workspace = true
common-stat.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
common-version.workspace = true

View File

@@ -243,8 +243,10 @@ mod tests {
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: current_time_millis() as u64,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
@@ -269,8 +271,10 @@ mod tests {
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: current_time_millis() as u64,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
@@ -307,8 +311,10 @@ mod tests {
version: "1.0.0".to_string(),
git_commit: "1234567890".to_string(),
start_time_ms: last_activity_ts as u64,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};

View File

@@ -1161,8 +1161,10 @@ mod tests {
version: "test_version".to_string(),
git_commit: "test_git_commit".to_string(),
start_time_ms: 0,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
mysql_election.register_candidate(&node_info).await.unwrap();

View File

@@ -1000,8 +1000,10 @@ mod tests {
version: "test_version".to_string(),
git_commit: "test_git_commit".to_string(),
start_time_ms: 0,
cpus: 0,
memory_bytes: 0,
total_cpu_millicores: 0,
total_memory_bytes: 0,
cpu_usage_millicores: 0,
memory_usage_bytes: 0,
hostname: "test_hostname".to_string(),
};
pg_election.register_candidate(&node_info).await.unwrap();

View File

@@ -52,8 +52,10 @@ impl HeartbeatHandler for CollectFrontendClusterInfoHandler {
version: info.version,
git_commit: info.git_commit,
start_time_ms: info.start_time_ms,
cpus: info.cpus,
memory_bytes: info.memory_bytes,
total_cpu_millicores: info.total_cpu_millicores,
total_memory_bytes: info.total_memory_bytes,
cpu_usage_millicores: info.cpu_usage_millicores,
memory_usage_bytes: info.memory_usage_bytes,
hostname: info.hostname,
};
@@ -88,8 +90,10 @@ impl HeartbeatHandler for CollectFlownodeClusterInfoHandler {
version: info.version,
git_commit: info.git_commit,
start_time_ms: info.start_time_ms,
cpus: info.cpus,
memory_bytes: info.memory_bytes,
total_cpu_millicores: info.total_cpu_millicores,
total_memory_bytes: info.total_memory_bytes,
cpu_usage_millicores: info.cpu_usage_millicores,
memory_usage_bytes: info.memory_usage_bytes,
hostname: info.hostname,
};
@@ -142,8 +146,10 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler {
version: info.version,
git_commit: info.git_commit,
start_time_ms: info.start_time_ms,
cpus: info.cpus,
memory_bytes: info.memory_bytes,
total_cpu_millicores: info.total_cpu_millicores,
total_memory_bytes: info.total_memory_bytes,
cpu_usage_millicores: info.cpu_usage_millicores,
memory_usage_bytes: info.memory_usage_bytes,
hostname: info.hostname,
};

View File

@@ -22,7 +22,6 @@ use std::time::Duration;
use clap::ValueEnum;
use common_base::Plugins;
use common_base::readable_size::ReadableSize;
use common_config::utils::ResourceSpec;
use common_config::{Configurable, DEFAULT_DATA_HOME};
use common_event_recorder::EventRecorderOptions;
use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
@@ -47,6 +46,7 @@ use common_options::datanode::DatanodeClientOptions;
use common_options::memory::MemoryOptions;
use common_procedure::ProcedureManagerRef;
use common_procedure::options::ProcedureConfig;
use common_stat::ResourceStatRef;
use common_telemetry::logging::{LoggingOptions, TracingOptions};
use common_telemetry::{error, info, warn};
use common_wal::config::MetasrvWalConfig;
@@ -372,12 +372,16 @@ pub struct MetasrvNodeInfo {
pub git_commit: String,
// The node start timestamp in milliseconds
pub start_time_ms: u64,
// The node cpus
// The node total cpu millicores
#[serde(default)]
pub cpus: u32,
// The node memory bytes
pub total_cpu_millicores: i64,
#[serde(default)]
pub memory_bytes: u64,
// The node total memory bytes
pub total_memory_bytes: i64,
/// The node cpu usage in millicores
pub cpu_usage_millicores: i64,
/// The node memory usage in bytes
pub memory_usage_bytes: i64,
// The node hostname
#[serde(default)]
pub hostname: String,
@@ -397,15 +401,19 @@ impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
version: node_info.version.clone(),
git_commit: node_info.git_commit.clone(),
start_time_ms: node_info.start_time_ms,
cpus: node_info.cpus,
memory_bytes: node_info.memory_bytes,
cpus: node_info.total_cpu_millicores as u32,
memory_bytes: node_info.total_memory_bytes as u64,
// The canonical location for node information.
info: Some(api::v1::meta::NodeInfo {
version: node_info.version,
git_commit: node_info.git_commit,
start_time_ms: node_info.start_time_ms,
cpus: node_info.cpus,
memory_bytes: node_info.memory_bytes,
total_cpu_millicores: node_info.total_cpu_millicores,
total_memory_bytes: node_info.total_memory_bytes,
cpu_usage_millicores: node_info.cpu_usage_millicores,
memory_usage_bytes: node_info.memory_usage_bytes,
cpus: node_info.total_cpu_millicores as u32,
memory_bytes: node_info.total_memory_bytes as u64,
hostname: node_info.hostname,
}),
}
@@ -517,7 +525,7 @@ pub struct Metasrv {
region_flush_ticker: Option<RegionFlushTickerRef>,
table_id_sequence: SequenceRef,
reconciliation_manager: ReconciliationManagerRef,
resource_spec: ResourceSpec,
resource_stat: ResourceStatRef,
plugins: Plugins,
}
@@ -699,8 +707,8 @@ impl Metasrv {
self.start_time_ms
}
pub fn resource_spec(&self) -> &ResourceSpec {
&self.resource_spec
pub fn resource_stat(&self) -> &ResourceStatRef {
&self.resource_stat
}
pub fn node_info(&self) -> MetasrvNodeInfo {
@@ -710,8 +718,10 @@ impl Metasrv {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms: self.start_time_ms(),
cpus: self.resource_spec().cpus as u32,
memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(),
total_cpu_millicores: self.resource_stat.get_total_cpu_millicores(),
total_memory_bytes: self.resource_stat.get_total_memory_bytes(),
cpu_usage_millicores: self.resource_stat.get_cpu_usage_millicores(),
memory_usage_bytes: self.resource_stat.get_memory_usage_bytes(),
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()

View File

@@ -46,6 +46,7 @@ use common_meta::stats::topic::TopicStatsRegistry;
use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator};
use common_procedure::ProcedureManagerRef;
use common_procedure::local::{LocalManager, ManagerConfig};
use common_stat::ResourceStatImpl;
use common_telemetry::{info, warn};
use snafu::{ResultExt, ensure};
use store_api::storage::MAX_REGION_SEQ;
@@ -517,6 +518,9 @@ impl MetasrvBuilder {
.try_start()
.context(error::InitReconciliationManagerSnafu)?;
let mut resource_stat = ResourceStatImpl::default();
resource_stat.start_collect_cpu_usage();
Ok(Metasrv {
state,
started: Arc::new(AtomicBool::new(false)),
@@ -556,7 +560,7 @@ impl MetasrvBuilder {
table_id_sequence,
reconciliation_manager,
topic_stats_registry,
resource_spec: Default::default(),
resource_stat: Arc::new(resource_stat),
})
}
}

View File

@@ -19,7 +19,7 @@ use api::v1::meta::MailboxMessage;
use common_error::ext::BoxedError;
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
use common_meta::instruction::{
DowngradeRegion, DowngradeRegionReply, Instruction, InstructionReply,
DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply,
};
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::{error, info, warn};
@@ -120,10 +120,10 @@ impl DowngradeLeaderRegion {
) -> Instruction {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
Instruction::DowngradeRegion(DowngradeRegion {
Instruction::DowngradeRegions(vec![DowngradeRegion {
region_id,
flush_timeout: Some(flush_timeout),
})
}])
}
/// Tries to downgrade a leader region.
@@ -173,12 +173,7 @@ impl DowngradeLeaderRegion {
region_id,
now.elapsed()
);
let InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
metadata_last_entry_id,
exists,
error,
}) = reply
let InstructionReply::DowngradeRegions(DowngradeRegionsReply { replies }) = reply
else {
return error::UnexpectedInstructionReplySnafu {
mailbox_message: msg.to_string(),
@@ -187,6 +182,15 @@ impl DowngradeLeaderRegion {
.fail();
};
// TODO(weny): handle multiple replies.
let DowngradeRegionReply {
region_id,
last_entry_id,
metadata_last_entry_id,
exists,
error,
} = &replies[0];
if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
@@ -216,12 +220,12 @@ impl DowngradeLeaderRegion {
}
if let Some(last_entry_id) = last_entry_id {
ctx.volatile_ctx.set_last_entry_id(last_entry_id);
ctx.volatile_ctx.set_last_entry_id(*last_entry_id);
}
if let Some(metadata_last_entry_id) = metadata_last_entry_id {
ctx.volatile_ctx
.set_metadata_last_entry_id(metadata_last_entry_id);
.set_metadata_last_entry_id(*metadata_last_entry_id);
}
Ok(())
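
The downgrade path now wraps the single region in a batched `DowngradeRegions` instruction and, per the TODO above, consumes only the first entry of `DowngradeRegionsReply`. A compact sketch of that request/reply pairing with simplified stand-in types:

```rust
use std::time::Duration;

// Simplified stand-ins; field names follow the hunk above.
#[derive(Debug)]
struct DowngradeRegion {
    region_id: u64,
    flush_timeout: Option<Duration>,
}

#[derive(Debug)]
enum Instruction {
    DowngradeRegions(Vec<DowngradeRegion>),
}

#[derive(Debug)]
struct DowngradeRegionReply {
    region_id: u64,
    last_entry_id: Option<u64>,
    metadata_last_entry_id: Option<u64>,
    exists: bool,
    error: Option<String>,
}

#[derive(Debug)]
struct DowngradeRegionsReply {
    replies: Vec<DowngradeRegionReply>,
}

fn main() {
    // Building the instruction: a single-element batch, as in the
    // DowngradeLeaderRegion hunk above.
    let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion {
        region_id: 42,
        flush_timeout: Some(Duration::from_secs(1)),
    }]);
    println!("send {instruction:?}");

    // Handling the reply: only replies[0] is consumed for now.
    let reply = DowngradeRegionsReply {
        replies: vec![DowngradeRegionReply {
            region_id: 42,
            last_entry_id: Some(100),
            metadata_last_entry_id: None,
            exists: true,
            error: None,
        }],
    };
    let first = &reply.replies[0];
    assert!(first.exists && first.error.is_none());
    println!(
        "last_entry_id = {:?}, metadata_last_entry_id = {:?}",
        first.last_entry_id, first.metadata_last_entry_id
    );
}
```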

View File

@@ -17,7 +17,8 @@ use std::collections::HashMap;
use api::v1::meta::mailbox_message::Payload;
use api::v1::meta::{HeartbeatResponse, MailboxMessage};
use common_meta::instruction::{
DowngradeRegionReply, FlushRegionReply, InstructionReply, SimpleReply, UpgradeRegionReply,
DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply,
UpgradeRegionReply,
};
use common_meta::key::TableMetadataManagerRef;
use common_meta::key::table_route::TableRouteValue;
@@ -183,12 +184,15 @@ pub fn new_downgrade_region_reply(
to: "meta".to_string(),
timestamp_millis: current_time_millis(),
payload: Some(Payload::Json(
serde_json::to_string(&InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
metadata_last_entry_id: None,
exists: exist,
error,
}))
serde_json::to_string(&InstructionReply::DowngradeRegions(
DowngradeRegionsReply::new(vec![DowngradeRegionReply {
region_id: RegionId::new(0, 0),
last_entry_id,
metadata_last_entry_id: None,
exists: exist,
error,
}]),
))
.unwrap(),
)),
}

View File

@@ -97,8 +97,10 @@ impl Metasrv {
version: build_info.version.to_string(),
git_commit: build_info.commit_short.to_string(),
start_time_ms: self.start_time_ms(),
cpus: self.resource_spec().cpus as u32,
memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(),
total_cpu_millicores: self.resource_stat().get_total_cpu_millicores(),
total_memory_bytes: self.resource_stat().get_total_memory_bytes(),
cpu_usage_millicores: self.resource_stat().get_cpu_usage_millicores(),
memory_usage_bytes: self.resource_stat().get_memory_usage_bytes(),
hostname: hostname::get()
.unwrap_or_default()
.to_string_lossy()

View File

@@ -127,12 +127,12 @@ mod tests {
assert_eq!(
debug_format,
r#"
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000001/metadata/<file_id>.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test_metric_region/11_0000000002/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/data/<file_id>.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/<file_id>.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test_metric_region/22_0000000042/metadata/<file_id>.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#
);
// list from storage
let storage_entries = mito

View File

@@ -65,7 +65,7 @@ partition.workspace = true
puffin.workspace = true
rand.workspace = true
rayon = "1.10"
regex = "1.5"
regex.workspace = true
rskafka = { workspace = true, optional = true }
rstest = { workspace = true, optional = true }
rstest_reuse = { workspace = true, optional = true }

View File

@@ -433,6 +433,7 @@ impl Compactor for DefaultCompactor {
num_row_groups: sst_info.num_row_groups,
sequence: max_sequence,
partition_expr: partition_expr.clone(),
num_series: sst_info.num_series,
})
.collect::<Vec<_>>();
let output_file_names =

View File

@@ -78,6 +78,7 @@ pub fn new_file_handle_with_size_and_sequence(
index_file_size: 0,
num_rows: 0,
num_row_groups: 0,
num_series: 0,
sequence: NonZeroU64::new(sequence),
partition_expr: None,
},

View File

@@ -859,9 +859,9 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) {
#[tokio::test]
async fn test_list_ssts() {
test_list_ssts_with_format(false, r#"
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#"
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#"
StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
StorageSstEntry { file_path: "test/11_0000000002/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
@@ -869,9 +869,9 @@ StorageSstEntry { file_path: "test/11_0000000002/index/<file_id>.puffin", file_s
StorageSstEntry { file_path: "test/22_0000000042/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
StorageSstEntry { file_path: "test/22_0000000042/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await;
test_list_ssts_with_format(true, r#"
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#"
ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", level: 0, file_path: "test/11_0000000001/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "<file_id>", level: 0, file_path: "test/11_0000000002/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true }
ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "<file_id>", level: 0, file_path: "test/22_0000000042/<file_id>.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/<file_id>.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#"
StorageSstEntry { file_path: "test/11_0000000001/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
StorageSstEntry { file_path: "test/11_0000000001/index/<file_id>.puffin", file_size: None, last_modified_ms: None, node_id: None }
StorageSstEntry { file_path: "test/11_0000000002/<file_id>.parquet", file_size: None, last_modified_ms: None, node_id: None }
@@ -972,17 +972,17 @@ async fn test_list_ssts_with_format(
#[tokio::test]
async fn test_all_index_metas_list_all_types() {
test_all_index_metas_list_all_types_with_format(false, r#"
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "bloom_filter", target_type: "column", target_key: "3", target_json: "{\"column\":3}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "fulltext_bloom", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 87, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "fulltext_tantivy", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 1104, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "inverted", target_type: "column", target_key: "0", target_json: "{\"column\":0}", blob_size: 70, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":44,\"inverted_index_size\":70,\"null_bitmap_size\":8,\"relative_fst_offset\":26,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6032), index_type: "inverted", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "bloom_filter", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_bloom", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_tantivy", target_type: "column", target_key: "5", target_json: "{\"column\":5}", blob_size: 1100, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 518, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":150,\"inverted_index_size\":518,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
test_all_index_metas_list_all_types_with_format(true, r#"
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "bloom_filter", target_type: "column", target_key: "3", target_json: "{\"column\":3}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "fulltext_bloom", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "fulltext_tantivy", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 1104, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "inverted", target_type: "column", target_key: "0", target_json: "{\"column\":0}", blob_size: 92, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":66,\"inverted_index_size\":92,\"null_bitmap_size\":8,\"relative_fst_offset\":26,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6144), index_type: "inverted", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "bloom_filter", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 751, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":640,\"row_count\":20,\"rows_per_segment\":2,\"segment_count\":10}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_bloom", target_type: "column", target_key: "4", target_json: "{\"column\":4}", blob_size: 89, meta_json: Some("{\"bloom\":{\"bloom_filter_size\":64,\"row_count\":20,\"rows_per_segment\":4,\"segment_count\":5},\"fulltext\":{\"analyzer\":\"English\",\"case_sensitive\":false}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "fulltext_tantivy", target_type: "column", target_key: "5", target_json: "{\"column\":5}", blob_size: 1100, meta_json: Some("{\"fulltext\":{\"analyzer\":\"Chinese\",\"case_sensitive\":true}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "1", target_json: "{\"column\":1}", blob_size: 518, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":150,\"inverted_index_size\":518,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }
PuffinIndexMetaEntry { table_dir: "test/", index_file_path: "test/11_0000000001/index/<file_id>.puffin", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "<file_id>", index_file_size: Some(6500), index_type: "inverted", target_type: "column", target_key: "2", target_json: "{\"column\":2}", blob_size: 515, meta_json: Some("{\"inverted\":{\"base_offset\":0,\"bitmap_type\":\"Roaring\",\"fst_size\":147,\"inverted_index_size\":515,\"null_bitmap_size\":8,\"relative_fst_offset\":368,\"relative_null_bitmap_offset\":0,\"segment_row_count\":1024,\"total_row_count\":20}}"), node_id: None }"#).await;
}
async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expect_format: &str) {
@@ -1001,12 +1001,33 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
// One region with both fulltext backends and inverted index enabled, plus bloom skipping index
let region_id = RegionId::new(11, 1);
let mut request = CreateRequestBuilder::new().tag_num(3).field_num(2).build();
// inverted index on tag_0
request.column_metadatas[0]
let mut request = CreateRequestBuilder::new().tag_num(1).field_num(2).build();
// bloom filter skipping index on field_1
let skipping = SkippingIndexOptions::new_unchecked(2, 0.01, SkippingIndexType::BloomFilter);
request.column_metadatas[1]
.column_schema
.set_skipping_options(&skipping)
.unwrap();
// inverted index on field_1
request.column_metadatas[2]
.column_schema
.set_inverted_index(true);
// fulltext bloom on tag_1
// inverted index on tag_0
request.column_metadatas[1]
.column_schema
.set_inverted_index(true);
request.column_metadatas.push(ColumnMetadata {
column_schema: ColumnSchema::new(
"field_2".to_string(),
ConcreteDataType::string_datatype(),
true,
),
semantic_type: SemanticType::Field,
column_id: 4,
});
// fulltext bloom on field_2
let ft_bloom = FulltextOptions::new_unchecked(
true,
FulltextAnalyzer::English,
@@ -1015,11 +1036,24 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
4,
0.001,
);
request.column_metadatas[1]
request
.column_metadatas
.last_mut()
.unwrap()
.column_schema
.set_fulltext_options(&ft_bloom)
.unwrap();
// fulltext tantivy on tag_2
request.column_metadatas.push(ColumnMetadata {
column_schema: ColumnSchema::new(
"field_3".to_string(),
ConcreteDataType::string_datatype(),
true,
),
semantic_type: SemanticType::Field,
column_id: 5,
});
// fulltext tantivy on field_3
let ft_tantivy = FulltextOptions::new_unchecked(
true,
FulltextAnalyzer::Chinese,
@@ -1028,28 +1062,20 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
2,
0.01,
);
request.column_metadatas[2]
request
.column_metadatas
.last_mut()
.unwrap()
.column_schema
.set_fulltext_options(&ft_tantivy)
.unwrap();
// bloom filter skipping index on field_1 (which is at index 3)
let skipping = SkippingIndexOptions::new_unchecked(2, 0.01, SkippingIndexType::BloomFilter);
request.column_metadatas[3]
.column_schema
.set_skipping_options(&skipping)
.unwrap();
// inverted index on field_1
request.column_metadatas[4]
.column_schema
.set_inverted_index(true);
engine
.handle_request(region_id, RegionRequest::Create(request.clone()))
.await
.unwrap();
// write some rows (schema: tag_0, tag_1, tag_2, field_0, field_1, ts)
// write some rows (schema: tag_0, field_0, field_1, field_2, field_3, ts)
let column_schemas = rows_schema(&request);
let rows_vec: Vec<api::v1::Row> = (0..20)
.map(|ts| api::v1::Row {
@@ -1057,12 +1083,6 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("x".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("y".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("z".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::F64Value(ts as f64)),
},
@@ -1074,6 +1094,12 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
ts as i64 * 1000,
)),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("y".to_string())),
},
api::v1::Value {
value_data: Some(api::v1::value::ValueData::StringValue("z".to_string())),
},
],
})
.collect();
@@ -1095,7 +1121,7 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
.unwrap();
fn bucket_size(size: u64) -> u64 {
if size < 512 { size } else { (size / 16) * 16 }
if size < 512 { size } else { (size / 100) * 100 }
}
let mut metas = engine.all_index_metas().await;
@@ -1125,5 +1151,5 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe
.map(|entry| format!("\n{:?}", entry))
.collect::<String>();
assert_eq!(debug_format, expect_format);
assert_eq!(expect_format, debug_format);
}

View File

@@ -32,11 +32,6 @@ use crate::test_util::{
CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, reopen_region, rows_schema,
};
// wait listener receives enough success count.
async fn wait_finish(listener: &IndexBuildListener, times: usize) {
listener.wait_finish(times).await;
}
fn async_build_mode_config(is_create_on_flush: bool) -> MitoConfig {
let mut config = MitoConfig::default();
config.index.build_mode = IndexBuildMode::Async;
@@ -84,7 +79,7 @@ fn assert_listener_counts(
expected_success_count: usize,
) {
assert_eq!(listener.begin_count(), expected_begin_count);
assert_eq!(listener.success_count(), expected_success_count);
assert_eq!(listener.finish_count(), expected_success_count);
}
#[tokio::test]
@@ -155,7 +150,7 @@ async fn test_index_build_type_flush() {
flush_region(&engine, region_id, None).await;
// After 2 index build task are finished, 2 index files should exist.
wait_finish(&listener, 2).await;
listener.wait_finish(2).await;
let scanner = engine
.scanner(region_id, ScanRequest::default())
.await
@@ -204,6 +199,8 @@ async fn test_index_build_type_compact() {
put_and_flush(&engine, region_id, &column_schemas, 15..25).await;
put_and_flush(&engine, region_id, &column_schemas, 40..50).await;
// All index build tasks beginning means that all flush tasks have finished.
listener.wait_begin(4).await;
// Before compaction is triggered, files should be 4, and not all index files are built.
let scanner = engine
.scanner(region_id, ScanRequest::default())
@@ -216,8 +213,8 @@ async fn test_index_build_type_compact() {
// This explicit compaction call serves to make the process deterministic for the test.
compact(&engine, region_id).await;
listener.wait_begin(5).await; // 4 flush + 1 compaction begin
// Before compaction is triggered, files should be 2, and not all index files are built.
listener.clear_success_count();
let scanner = engine
.scanner(region_id, ScanRequest::default())
.await
@@ -226,7 +223,7 @@ async fn test_index_build_type_compact() {
assert!(num_of_index_files(&engine, &scanner, region_id).await < 2);
// Wait a while to make sure index build tasks are finished.
wait_finish(&listener, 2).await;
listener.wait_stop(5).await; // 4 flush + 1 compaction = some abort + some finish
let scanner = engine
.scanner(region_id, ScanRequest::default())
.await
@@ -292,7 +289,7 @@ async fn test_index_build_type_schema_change() {
.handle_request(region_id, RegionRequest::Alter(set_index_request))
.await
.unwrap();
wait_finish(&listener, 1).await;
listener.wait_finish(1).await;
let scanner = engine
.scanner(region_id, ScanRequest::default())
.await

View File

@@ -75,10 +75,13 @@ pub trait EventListener: Send + Sync {
async fn on_notify_region_change_result_begin(&self, _region_id: RegionId) {}
/// Notifies the listener that the index build task is executed successfully.
async fn on_index_build_success(&self, _region_file_id: RegionFileId) {}
async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {}
/// Notifies the listener that the index build task is started.
async fn on_index_build_begin(&self, _region_file_id: RegionFileId) {}
/// Notifies the listener that the index build task is aborted.
async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {}
}
pub type EventListenerRef = Arc<dyn EventListener>;
@@ -309,45 +312,75 @@ impl EventListener for NotifyRegionChangeResultListener {
#[derive(Default)]
pub struct IndexBuildListener {
notify: Notify,
success_count: AtomicUsize,
start_count: AtomicUsize,
begin_count: AtomicUsize,
begin_notify: Notify,
finish_count: AtomicUsize,
finish_notify: Notify,
abort_count: AtomicUsize,
abort_notify: Notify,
// stop means finished or aborted
stop_notify: Notify,
}
impl IndexBuildListener {
/// Wait until index build is done for `times` times.
pub async fn wait_finish(&self, times: usize) {
while self.success_count.load(Ordering::Relaxed) < times {
self.notify.notified().await;
while self.finish_count.load(Ordering::Relaxed) < times {
self.finish_notify.notified().await;
}
}
/// Wait until index build is stopped for `times` times.
pub async fn wait_stop(&self, times: usize) {
while self.finish_count.load(Ordering::Relaxed) + self.abort_count.load(Ordering::Relaxed)
< times
{
self.stop_notify.notified().await;
}
}
/// Wait until index build is begun for `times` times.
pub async fn wait_begin(&self, times: usize) {
while self.begin_count.load(Ordering::Relaxed) < times {
self.begin_notify.notified().await;
}
}
/// Clears the finish count.
pub fn clear_success_count(&self) {
self.success_count.store(0, Ordering::Relaxed);
pub fn clear_finish_count(&self) {
self.finish_count.store(0, Ordering::Relaxed);
}
/// Returns the finish count.
pub fn success_count(&self) -> usize {
self.success_count.load(Ordering::Relaxed)
pub fn finish_count(&self) -> usize {
self.finish_count.load(Ordering::Relaxed)
}
/// Returns the begin count.
pub fn begin_count(&self) -> usize {
self.start_count.load(Ordering::Relaxed)
self.begin_count.load(Ordering::Relaxed)
}
}
#[async_trait]
impl EventListener for IndexBuildListener {
async fn on_index_build_success(&self, region_file_id: RegionFileId) {
async fn on_index_build_finish(&self, region_file_id: RegionFileId) {
info!("Region {} index build successfully", region_file_id);
self.success_count.fetch_add(1, Ordering::Relaxed);
self.notify.notify_one();
self.finish_count.fetch_add(1, Ordering::Relaxed);
self.finish_notify.notify_one();
self.stop_notify.notify_one();
}
async fn on_index_build_begin(&self, region_file_id: RegionFileId) {
info!("Region {} index build begin", region_file_id);
self.start_count.fetch_add(1, Ordering::Relaxed);
self.begin_count.fetch_add(1, Ordering::Relaxed);
self.begin_notify.notify_one();
}
async fn on_index_build_abort(&self, region_file_id: RegionFileId) {
info!("Region {} index build aborted", region_file_id);
self.abort_count.fetch_add(1, Ordering::Relaxed);
self.abort_notify.notify_one();
self.stop_notify.notify_one();
}
}

View File

@@ -641,6 +641,7 @@ impl RegionFlushTask {
num_row_groups: sst_info.num_row_groups,
sequence: NonZeroU64::new(max_sequence),
partition_expr,
num_series: sst_info.num_series,
}
}

View File

@@ -269,6 +269,7 @@ async fn checkpoint_with_different_compression_types() {
num_row_groups: 0,
sequence: None,
partition_expr: None,
num_series: 0,
};
let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit {
files_to_add: vec![file_meta],
@@ -334,6 +335,7 @@ fn generate_action_lists(num: usize) -> (Vec<FileId>, Vec<RegionMetaActionList>)
num_row_groups: 0,
sequence: None,
partition_expr: None,
num_series: 0,
};
let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit {
files_to_add: vec![file_meta],

View File

@@ -69,7 +69,7 @@ use crate::sst::parquet::flat_format::primary_key_column_index;
use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder, ReadFormat};
use crate::sst::parquet::helper::parse_parquet_metadata;
use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo};
use crate::sst::to_sst_arrow_schema;
use crate::sst::{SeriesEstimator, to_sst_arrow_schema};
const INIT_DICT_VALUE_CAPACITY: usize = 8;
@@ -563,6 +563,7 @@ impl EncodedBulkPart {
num_row_groups: self.metadata.parquet_metadata.num_row_groups() as u64,
file_metadata: Some(self.metadata.parquet_metadata.clone()),
index_metadata: IndexOutput::default(),
num_series: self.metadata.num_series,
}
}
@@ -602,6 +603,8 @@ pub struct BulkPartMeta {
pub parquet_metadata: Arc<ParquetMetaData>,
/// Part region schema.
pub region_metadata: RegionMetadataRef,
/// Number of series.
pub num_series: u64,
}
/// Metrics for encoding a part.
@@ -669,6 +672,7 @@ impl BulkPartEncoder {
let mut writer = ArrowWriter::try_new(&mut buf, arrow_schema, self.writer_props.clone())
.context(EncodeMemtableSnafu)?;
let mut total_rows = 0;
let mut series_estimator = SeriesEstimator::default();
// Process each batch from the iterator
let mut iter_start = Instant::now();
@@ -679,6 +683,7 @@ impl BulkPartEncoder {
continue;
}
series_estimator.update_flat(&batch);
metrics.raw_size += record_batch_estimated_size(&batch);
let write_start = Instant::now();
writer.write(&batch).context(EncodeMemtableSnafu)?;
@@ -701,6 +706,7 @@ impl BulkPartEncoder {
let buf = Bytes::from(buf);
let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?);
let num_series = series_estimator.finish();
Ok(Some(EncodedBulkPart {
data: buf,
@@ -710,6 +716,7 @@ impl BulkPartEncoder {
min_timestamp,
parquet_metadata,
region_metadata: self.metadata.clone(),
num_series,
},
}))
}
@@ -742,6 +749,7 @@ impl BulkPartEncoder {
min_timestamp: part.min_timestamp,
parquet_metadata,
region_metadata: self.metadata.clone(),
num_series: part.estimated_series_count() as u64,
},
}))
}

View File

@@ -13,12 +13,10 @@
// limitations under the License.
use std::collections::VecDeque;
use std::ops::BitAnd;
use std::sync::Arc;
use bytes::Bytes;
use datatypes::arrow::array::BooleanArray;
use datatypes::arrow::buffer::BooleanBuffer;
use datatypes::arrow::record_batch::RecordBatch;
use parquet::arrow::ProjectionMask;
use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
@@ -30,7 +28,7 @@ use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu};
use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef};
use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder;
use crate::sst::parquet::flat_format::sequence_column_index;
use crate::sst::parquet::reader::{MaybeFilter, RowGroupReaderContext};
use crate::sst::parquet::reader::RowGroupReaderContext;
/// Iterator for reading data inside a bulk part.
pub struct EncodedBulkPartIter {
@@ -191,38 +189,13 @@ fn apply_combined_filters(
let num_rows = record_batch.num_rows();
let mut combined_filter = None;
// First, apply predicate filters.
// First, apply predicate filters using the shared method.
if !context.base.filters.is_empty() {
let num_rows = record_batch.num_rows();
let mut mask = BooleanBuffer::new_set(num_rows);
// Run filter one by one and combine them result, similar to RangeBase::precise_filter
for filter_ctx in &context.base.filters {
let filter = match filter_ctx.filter() {
MaybeFilter::Filter(f) => f,
// Column matches.
MaybeFilter::Matched => continue,
// Column doesn't match, filter the entire batch.
MaybeFilter::Pruned => return Ok(None),
};
// Safety: We checked the format type in new().
let Some(column_index) = context
.read_format()
.as_flat()
.unwrap()
.projected_index_by_id(filter_ctx.column_id())
else {
continue;
};
let array = record_batch.column(column_index);
let result = filter
.evaluate_array(array)
.context(crate::error::RecordBatchSnafu)?;
mask = mask.bitand(&result);
}
// Convert the mask to BooleanArray
let predicate_mask = context.base.compute_filter_mask_flat(&record_batch)?;
// If predicate filters out the entire batch, return None early
let Some(mask) = predicate_mask else {
return Ok(None);
};
combined_filter = Some(BooleanArray::from(mask));
}

View File

@@ -386,7 +386,8 @@ impl FlatCompatBatch {
/// Repeats the vector value `to_len` times.
fn repeat_vector(vector: &VectorRef, to_len: usize, is_tag: bool) -> Result<ArrayRef> {
assert_eq!(1, vector.len());
if is_tag {
let data_type = vector.data_type();
if is_tag && data_type.is_string() {
let values = vector.to_arrow_array();
if values.is_null(0) {
// Creates a dictionary array with `to_len` null keys.

View File

@@ -48,6 +48,8 @@ pub struct FlatProjectionMapper {
/// Ids of columns to project. It keeps ids in the same order as the `projection`
/// indices to build the mapper.
/// The mapper won't deduplicate the column ids.
///
/// Note that this doesn't contain the `__table_id` and `__tsid`.
column_ids: Vec<ColumnId>,
/// Ids and DataTypes of columns of the expected batch.
/// We can use this to check if the batch is compatible with the expected schema.

View File

@@ -608,6 +608,7 @@ impl MitoRegion {
index_file_size,
num_rows: meta.num_rows,
num_row_groups: meta.num_row_groups,
num_series: Some(meta.num_series),
min_ts: meta.time_range.0,
max_ts: meta.time_range.1,
sequence: meta.sequence.map(|s| s.get()),

View File

@@ -431,6 +431,7 @@ mod tests {
num_row_groups: 1,
sequence: NonZeroU64::new(1),
partition_expr,
num_series: 1,
}
}

View File

@@ -21,7 +21,9 @@ use common_base::readable_size::ReadableSize;
use datatypes::arrow::datatypes::{
DataType as ArrowDataType, Field, FieldRef, Fields, Schema, SchemaRef,
};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::ConcreteDataType;
use datatypes::timestamp::timestamp_array_to_primitive;
use serde::{Deserialize, Serialize};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadata;
@@ -29,6 +31,9 @@ use store_api::storage::consts::{
OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME,
};
use crate::read::Batch;
use crate::sst::parquet::flat_format::time_index_column_index;
pub mod file;
pub mod file_purger;
pub mod file_ref;
@@ -241,3 +246,426 @@ fn plain_internal_fields() -> [FieldRef; 2] {
Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)),
]
}
/// Gets the estimated number of series from record batches.
///
/// This struct tracks the last timestamp value to detect series boundaries
/// by observing when timestamps decrease (indicating a new series).
#[derive(Default)]
pub(crate) struct SeriesEstimator {
/// The last timestamp value seen
last_timestamp: Option<i64>,
/// The estimated number of series
series_count: u64,
}
impl SeriesEstimator {
/// Updates the estimator with a new Batch.
///
/// Since each Batch contains only one series, this increments the series count
/// when a new series boundary is detected and updates the last timestamp.
pub(crate) fn update(&mut self, batch: &Batch) {
let Some(last_ts) = batch.last_timestamp() else {
return;
};
// Checks if there's a boundary between the last batch and this batch
if let Some(prev_last_ts) = self.last_timestamp {
// If the first timestamp of this batch is less than or equal to the last
// timestamp we've seen, it indicates a new series
if let Some(first_ts) = batch.first_timestamp()
&& first_ts.value() <= prev_last_ts
{
self.series_count += 1;
}
} else {
// First batch, counts as first series
self.series_count = 1;
}
// Updates the last timestamp
self.last_timestamp = Some(last_ts.value());
}
/// Updates the estimator with a new record batch in flat format.
///
/// This method examines the time index column to detect series boundaries.
pub(crate) fn update_flat(&mut self, record_batch: &RecordBatch) {
let batch_rows = record_batch.num_rows();
if batch_rows == 0 {
return;
}
let time_index_pos = time_index_column_index(record_batch.num_columns());
let timestamps = record_batch.column(time_index_pos);
let Some((ts_values, _unit)) = timestamp_array_to_primitive(timestamps) else {
return;
};
let values = ts_values.values();
// Checks if there's a boundary between the last batch and this batch
if let Some(last_ts) = self.last_timestamp {
if values[0] <= last_ts {
self.series_count += 1;
}
} else {
// First batch, counts as first series
self.series_count = 1;
}
// Counts series boundaries within this batch.
for i in 0..batch_rows - 1 {
// We treat an equal timestamp as the start of a new series, which is different from
// how we split batches.
if values[i] >= values[i + 1] {
self.series_count += 1;
}
}
// Updates the last timestamp
self.last_timestamp = Some(values[batch_rows - 1]);
}
/// Returns the estimated number of series.
pub(crate) fn finish(&mut self) -> u64 {
self.last_timestamp = None;
let count = self.series_count;
self.series_count = 0;
count
}
}
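For reference, a minimal, self-contained sketch of the boundary rule that `update_flat` applies: plain `i64` vectors stand in for the decoded time index column, and the `estimate_series` helper below is illustrative only, not part of the crate.
// Mirrors the SeriesEstimator rule: a timestamp that does not strictly
// increase (across or within batches) marks the start of a new series.
fn estimate_series(batches: &[Vec<i64>]) -> u64 {
    let mut last_ts: Option<i64> = None;
    let mut series = 0u64;
    for values in batches {
        if values.is_empty() {
            continue;
        }
        match last_ts {
            // A non-increasing timestamp across the batch boundary starts a new series.
            Some(prev) if values[0] <= prev => series += 1,
            Some(_) => {}
            // The first non-empty batch counts as the first series.
            None => series = 1,
        }
        // Non-increasing neighbors inside the batch also start new series.
        for w in values.windows(2) {
            if w[0] >= w[1] {
                series += 1;
            }
        }
        last_ts = Some(*values.last().unwrap());
    }
    series
}
fn main() {
    // [10, 20, 30] then [5, 15, 10, 25]: the second batch starts a new series
    // (5 <= 30) and contains one more boundary (15 >= 10), so 3 series in total.
    assert_eq!(3, estimate_series(&[vec![10, 20, 30], vec![5, 15, 10, 25]]));
    // A strictly increasing continuation stays in the same series.
    assert_eq!(1, estimate_series(&[vec![1, 2, 3], vec![4, 5, 6]]));
}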
#[cfg(test)]
mod tests {
use std::sync::Arc;
use api::v1::OpType;
use datatypes::arrow::array::{
BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder,
UInt32Array, UInt64Array,
};
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
use datatypes::arrow::record_batch::RecordBatch;
use super::*;
use crate::read::{Batch, BatchBuilder};
fn new_batch(
primary_key: &[u8],
timestamps: &[i64],
sequences: &[u64],
op_types: &[OpType],
) -> Batch {
let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
let sequences = Arc::new(UInt64Array::from(sequences.to_vec()));
let mut op_type_builder = UInt8Builder::with_capacity(op_types.len());
for op_type in op_types {
op_type_builder.append_value(*op_type as u8);
}
let op_types = Arc::new(UInt8Array::from(
op_types.iter().map(|op| *op as u8).collect::<Vec<_>>(),
));
let mut builder = BatchBuilder::new(primary_key.to_vec());
builder
.timestamps_array(timestamps)
.unwrap()
.sequences_array(sequences)
.unwrap()
.op_types_array(op_types)
.unwrap();
builder.build().unwrap()
}
fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch {
// Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type]
let num_cols = 4; // time_index + 3 internal columns
let time_index_pos = time_index_column_index(num_cols);
assert_eq!(time_index_pos, 0); // For 4 columns, time index should be at position 0
let time_array = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
let pk_array = Arc::new(DictionaryArray::new(
UInt32Array::from(vec![0; timestamps.len()]),
Arc::new(BinaryArray::from(vec![b"test".as_slice()])),
));
let seq_array = Arc::new(UInt64Array::from(vec![1; timestamps.len()]));
let op_array = Arc::new(UInt8Array::from(vec![1; timestamps.len()]));
let schema = Arc::new(Schema::new(vec![
Field::new(
"time",
ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
false,
),
Field::new_dictionary(
"__primary_key",
ArrowDataType::UInt32,
ArrowDataType::Binary,
false,
),
Field::new("__sequence", ArrowDataType::UInt64, false),
Field::new("__op_type", ArrowDataType::UInt8, false),
]));
RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap()
}
#[test]
fn test_series_estimator_empty_batch() {
let mut estimator = SeriesEstimator::default();
let batch = new_batch(b"test", &[], &[], &[]);
estimator.update(&batch);
assert_eq!(0, estimator.finish());
}
#[test]
fn test_series_estimator_single_batch() {
let mut estimator = SeriesEstimator::default();
let batch = new_batch(
b"test",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch);
assert_eq!(1, estimator.finish());
}
#[test]
fn test_series_estimator_multiple_batches_same_series() {
let mut estimator = SeriesEstimator::default();
// First batch with timestamps 1, 2, 3
let batch1 = new_batch(
b"test",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);
// Second batch with timestamps 4, 5, 6 (continuation)
let batch2 = new_batch(
b"test",
&[4, 5, 6],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);
assert_eq!(1, estimator.finish());
}
#[test]
fn test_series_estimator_new_series_detected() {
let mut estimator = SeriesEstimator::default();
// First batch with timestamps 1, 2, 3
let batch1 = new_batch(
b"pk0",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);
// Second batch with timestamps 2, 3, 4 (timestamp goes back, new series)
let batch2 = new_batch(
b"pk1",
&[2, 3, 4],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);
assert_eq!(2, estimator.finish());
}
#[test]
fn test_series_estimator_equal_timestamp_boundary() {
let mut estimator = SeriesEstimator::default();
// First batch ending at timestamp 5
let batch1 = new_batch(
b"test",
&[1, 2, 5],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);
// Second batch starting at timestamp 5 (equal, indicates new series)
let batch2 = new_batch(
b"test",
&[5, 6, 7],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);
assert_eq!(2, estimator.finish());
}
#[test]
fn test_series_estimator_finish_resets_state() {
let mut estimator = SeriesEstimator::default();
let batch1 = new_batch(
b"test",
&[1, 2, 3],
&[1, 2, 3],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch1);
assert_eq!(1, estimator.finish());
// After finish, state should be reset
let batch2 = new_batch(
b"test",
&[4, 5, 6],
&[4, 5, 6],
&[OpType::Put, OpType::Put, OpType::Put],
);
estimator.update(&batch2);
assert_eq!(1, estimator.finish());
}
#[test]
fn test_series_estimator_flat_empty_batch() {
let mut estimator = SeriesEstimator::default();
let record_batch = new_flat_record_batch(&[]);
estimator.update_flat(&record_batch);
assert_eq!(0, estimator.finish());
}
#[test]
fn test_series_estimator_flat_single_batch() {
let mut estimator = SeriesEstimator::default();
let record_batch = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&record_batch);
assert_eq!(1, estimator.finish());
}
#[test]
fn test_series_estimator_flat_series_boundary_within_batch() {
let mut estimator = SeriesEstimator::default();
// Timestamps decrease from 3 to 2, indicating a series boundary
let record_batch = new_flat_record_batch(&[1, 2, 3, 2, 4, 5]);
estimator.update_flat(&record_batch);
// Should detect boundary at position 3 (3 >= 2)
assert_eq!(2, estimator.finish());
}
#[test]
fn test_series_estimator_flat_multiple_boundaries_within_batch() {
let mut estimator = SeriesEstimator::default();
// Multiple series boundaries: 5>=4, 6>=3
let record_batch = new_flat_record_batch(&[1, 2, 5, 4, 6, 3, 7]);
estimator.update_flat(&record_batch);
assert_eq!(3, estimator.finish());
}
#[test]
fn test_series_estimator_flat_equal_timestamps() {
let mut estimator = SeriesEstimator::default();
// Equal timestamps are considered as new series
let record_batch = new_flat_record_batch(&[1, 2, 2, 3, 3, 3, 4]);
estimator.update_flat(&record_batch);
// Boundaries at: 2>=2, 3>=3, 3>=3
assert_eq!(4, estimator.finish());
}
#[test]
fn test_series_estimator_flat_multiple_batches_continuation() {
let mut estimator = SeriesEstimator::default();
// First batch: timestamps 1, 2, 3
let batch1 = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&batch1);
// Second batch: timestamps 4, 5, 6 (continuation)
let batch2 = new_flat_record_batch(&[4, 5, 6]);
estimator.update_flat(&batch2);
assert_eq!(1, estimator.finish());
}
#[test]
fn test_series_estimator_flat_multiple_batches_new_series() {
let mut estimator = SeriesEstimator::default();
// First batch: timestamps 1, 2, 3
let batch1 = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&batch1);
// Second batch: timestamps 2, 3, 4 (goes back to 2, new series)
let batch2 = new_flat_record_batch(&[2, 3, 4]);
estimator.update_flat(&batch2);
assert_eq!(2, estimator.finish());
}
#[test]
fn test_series_estimator_flat_boundary_at_batch_edge_equal() {
let mut estimator = SeriesEstimator::default();
// First batch ending at 5
let batch1 = new_flat_record_batch(&[1, 2, 5]);
estimator.update_flat(&batch1);
// Second batch starting at 5 (equal timestamp, new series)
let batch2 = new_flat_record_batch(&[5, 6, 7]);
estimator.update_flat(&batch2);
assert_eq!(2, estimator.finish());
}
#[test]
fn test_series_estimator_flat_mixed_batches() {
let mut estimator = SeriesEstimator::default();
// Batch 1: single series [10, 20, 30]
let batch1 = new_flat_record_batch(&[10, 20, 30]);
estimator.update_flat(&batch1);
// Batch 2: starts a new series [5, 15], with another boundary within the batch at [10, 25]
let batch2 = new_flat_record_batch(&[5, 15, 10, 25]);
estimator.update_flat(&batch2);
// Batch 3: continues from 25 to [30, 35]
let batch3 = new_flat_record_batch(&[30, 35]);
estimator.update_flat(&batch3);
// Expected: 1 (batch1) + 1 (batch2 start) + 1 (within batch2) = 3
assert_eq!(3, estimator.finish());
}
#[test]
fn test_series_estimator_flat_descending_timestamps() {
let mut estimator = SeriesEstimator::default();
// Strictly descending timestamps - each pair creates a boundary
let record_batch = new_flat_record_batch(&[10, 9, 8, 7, 6]);
estimator.update_flat(&record_batch);
// Boundaries: 10>=9, 9>=8, 8>=7, 7>=6 = 4 boundaries + 1 initial = 5 series
assert_eq!(5, estimator.finish());
}
#[test]
fn test_series_estimator_flat_finish_resets_state() {
let mut estimator = SeriesEstimator::default();
let batch1 = new_flat_record_batch(&[1, 2, 3]);
estimator.update_flat(&batch1);
assert_eq!(1, estimator.finish());
// After finish, state should be reset
let batch2 = new_flat_record_batch(&[4, 5, 6]);
estimator.update_flat(&batch2);
assert_eq!(1, estimator.finish());
}
}

View File

@@ -175,6 +175,10 @@ pub struct FileMeta {
deserialize_with = "deserialize_partition_expr"
)]
pub partition_expr: Option<PartitionExpr>,
/// Number of series in the file.
///
/// The number is 0 if the series number is not available.
pub num_series: u64,
}
impl Debug for FileMeta {
@@ -210,6 +214,7 @@ impl Debug for FileMeta {
}
})
.field("partition_expr", &self.partition_expr)
.field("num_series", &self.num_series)
.finish()
}
}
@@ -458,6 +463,7 @@ mod tests {
num_row_groups: 0,
sequence: None,
partition_expr: None,
num_series: 0,
}
}
@@ -503,6 +509,7 @@ mod tests {
num_row_groups: 0,
sequence: None,
partition_expr: Some(partition_expr.clone()),
num_series: 0,
};
// Test serialization/deserialization

View File

@@ -236,6 +236,7 @@ mod tests {
num_row_groups: 0,
sequence: None,
partition_expr: None,
num_series: 0,
},
file_purger,
);
@@ -302,6 +303,7 @@ mod tests {
num_row_groups: 1,
sequence: NonZeroU64::new(4096),
partition_expr: None,
num_series: 0,
},
file_purger,
);

View File

@@ -259,6 +259,7 @@ mod tests {
num_row_groups: 1,
sequence: NonZeroU64::new(4096),
partition_expr: None,
num_series: 0,
};
file_ref_mgr.add_file(&file_meta);

View File

@@ -26,10 +26,13 @@ use std::sync::Arc;
use bloom_filter::creator::BloomFilterIndexer;
use common_telemetry::{debug, info, warn};
use datatypes::arrow::array::BinaryArray;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::index::IndexValuesCodec;
use mito_codec::row_converter::CompositeValues;
use puffin_manager::SstPuffinManager;
use smallvec::{SmallVec, smallvec};
use snafu::ResultExt;
use snafu::{OptionExt, ResultExt};
use statistics::{ByteCount, RowCount};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId, RegionId};
@@ -40,7 +43,7 @@ use crate::access_layer::{AccessLayerRef, FilePathProvider, OperationType, Regio
use crate::cache::file_cache::{FileType, IndexKey};
use crate::cache::write_cache::{UploadTracker, WriteCacheRef};
use crate::config::{BloomFilterConfig, FulltextIndexConfig, InvertedIndexConfig};
use crate::error::{BuildIndexAsyncSnafu, Error, Result};
use crate::error::{BuildIndexAsyncSnafu, DecodeSnafu, Error, InvalidRecordBatchSnafu, Result};
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
use crate::metrics::INDEX_CREATE_MEMORY_USAGE;
use crate::read::{Batch, BatchReader};
@@ -57,6 +60,9 @@ use crate::sst::index::fulltext_index::creator::FulltextIndexer;
use crate::sst::index::intermediate::IntermediateManager;
use crate::sst::index::inverted_index::creator::InvertedIndexer;
use crate::sst::parquet::SstInfo;
use crate::sst::parquet::flat_format::primary_key_column_index;
use crate::sst::parquet::format::PrimaryKeyArray;
use crate::worker::WorkerListener;
pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index";
pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index";
@@ -446,6 +452,7 @@ pub struct IndexBuildTask {
pub file_meta: FileMeta,
pub reason: IndexBuildType,
pub access_layer: AccessLayerRef,
pub(crate) listener: WorkerListener,
pub(crate) manifest_ctx: ManifestContextRef,
pub write_cache: Option<WriteCacheRef>,
pub file_purger: FilePurgerRef,
@@ -481,6 +488,12 @@ impl IndexBuildTask {
}
async fn do_index_build(&mut self, version_control: VersionControlRef) {
self.listener
.on_index_build_begin(RegionFileId::new(
self.file_meta.region_id,
self.file_meta.file_id,
))
.await;
match self.index_build(version_control).await {
Ok(outcome) => self.on_success(outcome).await,
Err(e) => {
@@ -535,6 +548,12 @@ impl IndexBuildTask {
if !self.check_sst_file_exists(&version_control).await {
// Calls abort to clean up index files.
indexer.abort().await;
self.listener
.on_index_build_abort(RegionFileId::new(
self.file_meta.region_id,
self.file_meta.file_id,
))
.await;
return Ok(IndexBuildOutcome::Aborted(format!(
"SST file not found during index build, region: {}, file_id: {}",
self.file_meta.region_id, self.file_meta.file_id
@@ -570,6 +589,12 @@ impl IndexBuildTask {
if !self.check_sst_file_exists(&version_control).await {
// Calls abort to clean up index files.
indexer.abort().await;
self.listener
.on_index_build_abort(RegionFileId::new(
self.file_meta.region_id,
self.file_meta.file_id,
))
.await;
return Ok(IndexBuildOutcome::Aborted(format!(
"SST file not found during index build, region: {}, file_id: {}",
self.file_meta.region_id, self.file_meta.file_id
@@ -698,6 +723,56 @@ impl IndexBuildScheduler {
}
}
/// Decodes primary keys from a flat format RecordBatch.
/// Returns a list of (decoded_pk_value, count) tuples where count is the number of consecutive occurrences.
pub(crate) fn decode_primary_keys_with_counts(
batch: &RecordBatch,
codec: &IndexValuesCodec,
) -> Result<Vec<(CompositeValues, usize)>> {
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.context(InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array",
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.context(InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array",
})?;
let keys = pk_dict_array.keys();
// Decodes primary keys and count consecutive occurrences
let mut result: Vec<(CompositeValues, usize)> = Vec::new();
let mut prev_key: Option<u32> = None;
for i in 0..keys.len() {
let current_key = keys.value(i);
// Checks if current key is the same as previous key
if let Some(prev) = prev_key
&& prev == current_key
{
// Safety: We already have a key in the result vector.
result.last_mut().unwrap().1 += 1;
continue;
}
// New key, decodes it.
let pk_bytes = pk_values_array.value(current_key as usize);
let decoded_value = codec.decoder().decode(pk_bytes).context(DecodeSnafu)?;
result.push((decoded_value, 1));
prev_key = Some(current_key);
}
Ok(result)
}
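For reference, a simplified, self-contained illustration of the run-length grouping above: consecutive equal dictionary keys share one decoded value, so a decode only happens when the key changes. Plain `u32` keys and `&str` values stand in for `PrimaryKeyArray` and the decoded `CompositeValues`; the helper below is hypothetical and not part of the crate.
// Groups consecutive equal dictionary keys, "decoding" (here, looking up) the
// value only once per run.
fn group_consecutive_keys<'a>(keys: &[u32], values: &[&'a str]) -> Vec<(&'a str, usize)> {
    let mut result: Vec<(&'a str, usize)> = Vec::new();
    let mut prev_key: Option<u32> = None;
    for &key in keys {
        if prev_key == Some(key) {
            // Same key as the previous row: just bump the count of the current run.
            result.last_mut().unwrap().1 += 1;
        } else {
            // New run: decode the value once and start a new (value, count) entry.
            result.push((values[key as usize], 1));
            prev_key = Some(key);
        }
    }
    result
}
fn main() {
    let values = ["host=a", "host=b"];
    // Rows sorted by primary key: three rows of "host=a", two of "host=b",
    // then "host=a" again (a new run, so it is decoded again).
    let keys = [0, 0, 0, 1, 1, 0];
    assert_eq!(
        vec![("host=a", 3), ("host=b", 2), ("host=a", 1)],
        group_consecutive_keys(&keys, &values)
    );
}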
#[cfg(test)]
mod tests {
use std::sync::Arc;
@@ -1137,6 +1212,7 @@ mod tests {
},
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1187,6 +1263,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1254,6 +1331,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1350,6 +1428,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: None,
file_purger,
@@ -1430,6 +1509,7 @@ mod tests {
file_meta: file_meta.clone(),
reason: IndexBuildType::Flush,
access_layer: env.access_layer.clone(),
listener: WorkerListener::default(),
manifest_ctx,
write_cache: Some(write_cache.clone()),
file_purger,

View File

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use api::v1::SemanticType;
use common_telemetry::{debug, warn};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::schema::SkippingIndexType;
@@ -23,9 +24,10 @@ use datatypes::vectors::Helper;
use index::bloom_filter::creator::BloomFilterCreator;
use index::target::IndexTarget;
use mito_codec::index::{IndexValueCodec, IndexValuesCodec};
use mito_codec::row_converter::SortField;
use mito_codec::row_converter::{CompositeValues, SortField};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId};
use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt};
@@ -35,13 +37,13 @@ use crate::error::{
OperateAbortedIndexSnafu, PuffinAddBlobSnafu, PushBloomFilterValueSnafu, Result,
};
use crate::read::Batch;
use crate::sst::index::TYPE_BLOOM_FILTER_INDEX;
use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::{TYPE_BLOOM_FILTER_INDEX, decode_primary_keys_with_counts};
/// The buffer size for the pipe used to send index data to the puffin blob.
const PIPE_BUFFER_SIZE_FOR_SENDING_BLOB: usize = 8192;
@@ -289,47 +291,81 @@ impl BloomFilterIndexer {
let n = batch.num_rows();
guard.inc_row_count(n);
let is_sparse = self.metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse;
let mut decoded_pks: Option<Vec<(CompositeValues, usize)>> = None;
for (col_id, creator) in &mut self.creators {
// Get the column name from metadata
if let Some(column_meta) = self.metadata.column_by_id(*col_id) {
let column_name = &column_meta.column_schema.name;
// Safety: `creators` are created from the metadata so it won't be None.
let column_meta = self.metadata.column_by_id(*col_id).unwrap();
let column_name = &column_meta.column_schema.name;
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
// Find the column in the RecordBatch by name
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
for i in 0..n {
let value = vector.get_ref(i);
let elems = (!value.is_null())
.then(|| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut buf)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
for i in 0..n {
let value = vector.get_ref(i);
let elems = (!value.is_null())
.then(|| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(value, &sort_field, &mut buf)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
creator
.push_row_elems(elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else if is_sparse && column_meta.semantic_type == SemanticType::Tag {
// Column not found in batch, tries to decode from primary keys for sparse encoding.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys_with_counts(batch, &self.codec)?);
}
creator
.push_row_elems(elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else {
let pk_values_with_counts = decoded_pks.as_ref().unwrap();
let Some(col_info) = self.codec.pk_col_info(*col_id) else {
debug!(
"Column {} not found in the batch during building bloom filter index",
"Column {} not found in primary key during building bloom filter index",
column_name
);
// Push empty elements to maintain alignment
for _ in 0..n {
creator
.push_row_elems(None)
.await
.context(PushBloomFilterValueSnafu)?;
}
continue;
};
let pk_index = col_info.idx;
let field = &col_info.field;
for (decoded, count) in pk_values_with_counts {
let value = match decoded {
CompositeValues::Dense(dense) => dense.get(pk_index).map(|v| &v.1),
CompositeValues::Sparse(sparse) => sparse.get(col_id),
};
let elems = value
.filter(|v| !v.is_null())
.map(|v| {
let mut buf = vec![];
IndexValueCodec::encode_nonnull_value(
v.as_value_ref(),
field,
&mut buf,
)
.context(EncodeSnafu)?;
Ok(buf)
})
.transpose()?;
creator
.push_n_row_elems(*count, elems)
.await
.context(PushBloomFilterValueSnafu)?;
}
} else {
debug!(
"Column {} not found in the batch during building bloom filter index",
column_name
);
}
}

View File

@@ -16,6 +16,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use api::v1::SemanticType;
use common_telemetry::warn;
use datatypes::arrow::array::{Array, LargeStringArray, StringArray};
use datatypes::arrow::datatypes::DataType;
@@ -69,6 +70,17 @@ impl FulltextIndexer {
let mut creators = HashMap::new();
for column in &metadata.column_metadatas {
// Tag columns don't support fulltext index now.
// If we need to support fulltext index for tag columns, we also need to parse
// the codec and handle sparse encoding for flat format specially.
if column.semantic_type == SemanticType::Tag {
common_telemetry::debug!(
"Skip creating fulltext index for tag column {}",
column.column_schema.name
);
continue;
}
let options = column
.column_schema
.fulltext_options()

View File

@@ -17,6 +17,7 @@ use std::num::NonZeroUsize;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use api::v1::SemanticType;
use common_telemetry::{debug, warn};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::vectors::Helper;
@@ -26,9 +27,10 @@ use index::inverted_index::create::sort_create::SortIndexCreator;
use index::inverted_index::format::writer::InvertedIndexBlobWriter;
use index::target::IndexTarget;
use mito_codec::index::{IndexValueCodec, IndexValuesCodec};
use mito_codec::row_converter::SortField;
use mito_codec::row_converter::{CompositeValues, SortField};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId};
use tokio::io::duplex;
@@ -39,13 +41,13 @@ use crate::error::{
PushIndexValueSnafu, Result,
};
use crate::read::Batch;
use crate::sst::index::TYPE_INVERTED_INDEX;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::inverted_index::INDEX_BLOB_TYPE;
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::{TYPE_INVERTED_INDEX, decode_primary_keys_with_counts};
/// The minimum memory usage threshold for one column.
const MIN_MEMORY_USAGE_THRESHOLD_PER_COLUMN: usize = 1024 * 1024; // 1MB
@@ -78,9 +80,6 @@ pub struct InvertedIndexer {
/// Region metadata for column lookups.
metadata: RegionMetadataRef,
/// Cache for mapping indexed column positions to their indices in the RecordBatch.
/// Aligns with indexed_column_ids. Initialized lazily when first batch is processed.
column_index_cache: Option<Vec<Option<usize>>>,
}
impl InvertedIndexer {
@@ -130,7 +129,6 @@ impl InvertedIndexer {
memory_usage,
indexed_column_ids,
metadata: metadata.clone(),
column_index_cache: None,
}
}
@@ -170,29 +168,29 @@ impl InvertedIndexer {
}
async fn do_update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
// Initialize column index cache if not already done
if self.column_index_cache.is_none() {
self.initialize_column_index_cache(batch);
}
let mut guard = self.stats.record_update();
let n = batch.num_rows();
guard.inc_row_count(n);
guard.inc_row_count(batch.num_rows());
let column_indices = self.column_index_cache.as_ref().unwrap();
let is_sparse = self.metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse;
let mut decoded_pks: Option<Vec<(CompositeValues, usize)>> = None;
for ((col_id, target_key), &column_index) in
self.indexed_column_ids.iter().zip(column_indices.iter())
{
if let Some(index) = column_index {
let column_array = batch.column(index);
for (col_id, target_key) in &self.indexed_column_ids {
let Some(column_meta) = self.metadata.column_by_id(*col_id) else {
debug!(
"Column {} not found in the metadata during building inverted index",
col_id
);
continue;
};
let column_name = &column_meta.column_schema.name;
if let Some(column_array) = batch.column_by_name(column_name) {
// Convert Arrow array to VectorRef using Helper
let vector = Helper::try_into_vector(column_array.clone())
.context(crate::error::ConvertVectorSnafu)?;
let sort_field = SortField::new(vector.data_type());
for row in 0..n {
for row in 0..batch.num_rows() {
self.value_buf.clear();
let value_ref = vector.get_ref(row);
@@ -214,6 +212,47 @@ impl InvertedIndexer {
.context(PushIndexValueSnafu)?;
}
}
} else if is_sparse && column_meta.semantic_type == SemanticType::Tag {
// Column not found in batch, tries to decode from primary keys for sparse encoding.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys_with_counts(batch, &self.codec)?);
}
let pk_values_with_counts = decoded_pks.as_ref().unwrap();
let Some(col_info) = self.codec.pk_col_info(*col_id) else {
debug!(
"Column {} not found in primary key during building bloom filter index",
column_name
);
continue;
};
let pk_index = col_info.idx;
let field = &col_info.field;
for (decoded, count) in pk_values_with_counts {
let value = match decoded {
CompositeValues::Dense(dense) => dense.get(pk_index).map(|v| &v.1),
CompositeValues::Sparse(sparse) => sparse.get(col_id),
};
let elem = value
.filter(|v| !v.is_null())
.map(|v| {
self.value_buf.clear();
IndexValueCodec::encode_nonnull_value(
v.as_value_ref(),
field,
&mut self.value_buf,
)
.context(EncodeSnafu)?;
Ok(self.value_buf.as_slice())
})
.transpose()?;
self.index_creator
.push_with_name_n(target_key, elem, *count)
.await
.context(PushIndexValueSnafu)?;
}
} else {
debug!(
"Column {} not found in the batch during building inverted index",
@@ -225,26 +264,6 @@ impl InvertedIndexer {
Ok(())
}
/// Initializes the column index cache by mapping indexed column ids to their positions in the RecordBatch.
fn initialize_column_index_cache(&mut self, batch: &RecordBatch) {
let mut column_indices = Vec::with_capacity(self.indexed_column_ids.len());
for (col_id, _) in &self.indexed_column_ids {
let column_index = if let Some(column_meta) = self.metadata.column_by_id(*col_id) {
let column_name = &column_meta.column_schema.name;
batch
.schema()
.column_with_name(column_name)
.map(|(index, _)| index)
} else {
None
};
column_indices.push(column_index);
}
self.column_index_cache = Some(column_indices);
}
/// Finishes index creation and cleans up garbage.
/// Returns the number of rows and bytes written.
pub async fn finish(

View File

@@ -84,6 +84,8 @@ pub struct SstInfo {
pub file_metadata: Option<Arc<ParquetMetaData>>,
/// Index Meta Data
pub index_metadata: IndexOutput,
/// Number of series
pub num_series: u64,
}
#[cfg(test)]
@@ -766,6 +768,7 @@ mod tests {
.expect("partition expression should be valid JSON"),
None => None,
},
num_series: 0,
},
Arc::new(NoopFilePurger),
);

View File

@@ -15,18 +15,20 @@
//! Structs and functions for reading ranges from a parquet file. A file range
//! is usually a row group in a parquet file.
use std::collections::HashMap;
use std::ops::BitAnd;
use std::sync::Arc;
use api::v1::{OpType, SemanticType};
use common_telemetry::error;
use datatypes::arrow::array::BooleanArray;
use datatypes::arrow::array::{ArrayRef, BooleanArray};
use datatypes::arrow::buffer::BooleanBuffer;
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
use parquet::arrow::arrow_reader::RowSelection;
use snafu::{OptionExt, ResultExt};
use store_api::storage::TimeSeriesRowSelector;
use store_api::codec::PrimaryKeyEncoding;
use store_api::storage::{ColumnId, TimeSeriesRowSelector};
use crate::error::{
ComputeArrowSnafu, DataTypeMismatchSnafu, DecodeSnafu, DecodeStatsSnafu, RecordBatchSnafu,
@@ -37,11 +39,11 @@ use crate::read::compat::CompatBatch;
use crate::read::last_row::RowGroupLastRowCachedReader;
use crate::read::prune::{FlatPruneReader, PruneReader};
use crate::sst::file::FileHandle;
use crate::sst::parquet::flat_format::{DecodedPrimaryKeys, decode_primary_keys};
use crate::sst::parquet::format::ReadFormat;
use crate::sst::parquet::reader::{
FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext,
};
/// A range of a parquet SST. Now it is a row group.
/// We can read different file ranges in parallel.
#[derive(Clone)]
@@ -357,7 +359,34 @@ impl RangeBase {
}
/// Filters the input RecordBatch by the pushed down predicate and returns RecordBatch.
///
/// It assumes all necessary tags are already decoded from the primary key.
pub(crate) fn precise_filter_flat(&self, input: RecordBatch) -> Result<Option<RecordBatch>> {
let mask = self.compute_filter_mask_flat(&input)?;
// If mask is None, the entire batch is filtered out
let Some(mask) = mask else {
return Ok(None);
};
let filtered_batch =
datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
.context(ComputeArrowSnafu)?;
if filtered_batch.num_rows() > 0 {
Ok(Some(filtered_batch))
} else {
Ok(None)
}
}
/// Computes the filter mask for the input RecordBatch based on pushed down predicates.
///
/// Returns `None` if the entire batch is filtered out, otherwise returns the boolean mask.
pub(crate) fn compute_filter_mask_flat(
&self,
input: &RecordBatch,
) -> Result<Option<BooleanBuffer>> {
let mut mask = BooleanBuffer::new_set(input.num_rows());
let flat_format = self
@@ -367,6 +396,11 @@ impl RangeBase {
reason: "Expected flat format for precise_filter_flat",
})?;
// Decodes primary keys once if we have any tag filters not in projection
let mut decoded_pks: Option<DecodedPrimaryKeys> = None;
// Cache decoded tag arrays by column id to avoid redundant decoding
let mut decoded_tag_cache: HashMap<ColumnId, ArrayRef> = HashMap::new();
// Run filters one by one and combine their results
for filter_ctx in &self.filters {
let filter = match filter_ctx.filter() {
@@ -383,20 +417,53 @@ impl RangeBase {
let column = &input.columns()[idx];
let result = filter.evaluate_array(column).context(RecordBatchSnafu)?;
mask = mask.bitand(&result);
} else {
// Column not found in projection, continue
continue;
} else if filter_ctx.semantic_type() == SemanticType::Tag {
// Column not found in projection, it may be a tag column.
// Decodes primary keys if not already decoded.
if decoded_pks.is_none() {
decoded_pks = Some(decode_primary_keys(self.codec.as_ref(), input)?);
}
let metadata = flat_format.metadata();
let column_id = filter_ctx.column_id();
// Check cache first
let tag_column = if let Some(cached_column) = decoded_tag_cache.get(&column_id) {
cached_column.clone()
} else {
// For dense encoding, we need pk_index. For sparse encoding, pk_index is None.
let pk_index = if self.codec.encoding() == PrimaryKeyEncoding::Sparse {
None
} else {
metadata.primary_key_index(column_id)
};
let column_index = metadata.column_index_by_id(column_id);
if let (Some(column_index), Some(decoded)) =
(column_index, decoded_pks.as_ref())
{
let column_metadata = &metadata.column_metadatas[column_index];
let tag_column = decoded.get_tag_column(
column_id,
pk_index,
&column_metadata.column_schema.data_type,
)?;
// Cache the decoded tag column
decoded_tag_cache.insert(column_id, tag_column.clone());
tag_column
} else {
continue;
}
};
let result = filter
.evaluate_array(&tag_column)
.context(RecordBatchSnafu)?;
mask = mask.bitand(&result);
}
// Non-tag column not found in projection.
}
let filtered_batch =
datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
.context(ComputeArrowSnafu)?;
if filtered_batch.num_rows() > 0 {
Ok(Some(filtered_batch))
} else {
Ok(None)
}
Ok(Some(mask))
}
}

View File

@@ -127,7 +127,9 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize {
num_columns - 1
}
// TODO(yingwen): Add an option to skip reading internal columns.
// TODO(yingwen): Add an option to skip reading internal columns if the region is
// append-only and doesn't use sparse encoding (we need to check the table id under
// sparse encoding).
/// Helper for reading the flat SST format with projection.
///
/// It only supports the flat format that additionally stores primary keys.
@@ -528,6 +530,125 @@ pub(crate) fn sst_column_id_indices(metadata: &RegionMetadata) -> HashMap<Column
id_to_index
}
/// Decodes primary keys from a batch and returns decoded primary key information.
///
/// The batch must contain a primary key column at the expected index.
pub(crate) fn decode_primary_keys(
codec: &dyn PrimaryKeyCodec,
batch: &RecordBatch,
) -> Result<DecodedPrimaryKeys> {
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array".to_string(),
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array".to_string(),
})?;
let keys = pk_dict_array.keys();
// Decodes primary key values by iterating through keys, reusing decoded values for duplicate keys.
// Maps original key index -> new decoded value index
let mut key_to_decoded_index = Vec::with_capacity(keys.len());
let mut decoded_pk_values = Vec::new();
let mut prev_key: Option<u32> = None;
// The parquet reader may load the whole dictionary page into the dictionary values, so decoding
// the values array directly could decode many primary keys that don't appear in this batch.
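// Worked example with hypothetical keys: dictionary keys [3, 3, 7, 7, 7] decode only two
// primary keys and yield key_to_decoded_index [0, 0, 1, 1, 1], even if the dictionary values
// array holds many more entries. The reuse only applies to consecutive duplicates, which is
// sufficient because rows in a batch are expected to be sorted by primary key.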
for i in 0..keys.len() {
let current_key = keys.value(i);
// Check if current key is the same as previous key
if let Some(prev) = prev_key
&& prev == current_key
{
// Reuse the last decoded index
key_to_decoded_index.push((decoded_pk_values.len() - 1) as u32);
continue;
}
// A new key; decodes its value
let pk_bytes = pk_values_array.value(current_key as usize);
let decoded_value = codec.decode(pk_bytes).context(DecodeSnafu)?;
decoded_pk_values.push(decoded_value);
key_to_decoded_index.push((decoded_pk_values.len() - 1) as u32);
prev_key = Some(current_key);
}
// Create the keys array from key_to_decoded_index
let keys_array = UInt32Array::from(key_to_decoded_index);
Ok(DecodedPrimaryKeys {
decoded_pk_values,
keys_array,
})
}
/// Holds decoded primary key values and their indices.
pub(crate) struct DecodedPrimaryKeys {
/// Decoded primary key values for unique keys in the dictionary.
decoded_pk_values: Vec<CompositeValues>,
/// Prebuilt keys array for creating dictionary arrays.
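/// Each key maps a row of the batch to an index in `decoded_pk_values`.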
keys_array: UInt32Array,
}
impl DecodedPrimaryKeys {
/// Gets a tag column array by column id and data type.
///
/// For sparse encoding, uses `column_id` to look up values.
/// For dense encoding, uses `pk_index` to get values.
pub(crate) fn get_tag_column(
&self,
column_id: ColumnId,
pk_index: Option<usize>,
column_type: &ConcreteDataType,
) -> Result<ArrayRef> {
// Gets values from the primary key.
let mut builder = column_type.create_mutable_vector(self.decoded_pk_values.len());
for decoded in &self.decoded_pk_values {
match decoded {
CompositeValues::Dense(dense) => {
let pk_idx = pk_index.expect("pk_index required for dense encoding");
if pk_idx < dense.len() {
builder.push_value_ref(&dense[pk_idx].1.as_value_ref());
} else {
builder.push_null();
}
}
CompositeValues::Sparse(sparse) => {
let value = sparse.get_or_null(column_id);
builder.push_value_ref(&value.as_value_ref());
}
};
}
let values_vector = builder.to_vector();
let values_array = values_vector.to_arrow_array();
// Only creates a dictionary array for string types; otherwise takes values by keys
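// Illustration with hypothetical data: keys [0, 0, 1] over decoded string values
// ["host-a", "host-b"] stay deduplicated as a dictionary array, while a numeric tag is simply
// gathered into a plain array of `keys_array.len()` values via `take`.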
if column_type.is_string() {
// Creates dictionary array using the same keys for string types
// Note that the dictionary values may have nulls.
let dict_array = DictionaryArray::new(self.keys_array.clone(), values_array);
Ok(Arc::new(dict_array))
} else {
// For non-string types, takes values by key indices to create a regular array
let taken_array =
take(&values_array, &self.keys_array, None).context(ComputeArrowSnafu)?;
Ok(taken_array)
}
}
}
/// Converts a batch that doesn't have decoded primary key columns into a batch that has decoded
/// primary key columns in flat format.
pub(crate) struct FlatConvertFormat {
@@ -577,53 +698,22 @@ impl FlatConvertFormat {
/// Converts a batch to have decoded primary key columns in flat format.
///
/// The primary key array in the batch is a dictionary array. We decode each value which is a
/// primary key and reuse the keys array to build a dictionary array for each tag column.
/// The decoded columns are inserted in front of other columns.
/// The primary key array in the batch is a dictionary array.
pub(crate) fn convert(&self, batch: RecordBatch) -> Result<RecordBatch> {
if self.projected_primary_keys.is_empty() {
return Ok(batch);
}
let primary_key_index = primary_key_column_index(batch.num_columns());
let pk_dict_array = batch
.column(primary_key_index)
.as_any()
.downcast_ref::<PrimaryKeyArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key column is not a dictionary array".to_string(),
})?;
let pk_values_array = pk_dict_array
.values()
.as_any()
.downcast_ref::<BinaryArray>()
.with_context(|| InvalidRecordBatchSnafu {
reason: "Primary key values are not binary array".to_string(),
})?;
// Decodes all primary key values
let mut decoded_pk_values = Vec::with_capacity(pk_values_array.len());
for i in 0..pk_values_array.len() {
if pk_values_array.is_null(i) {
decoded_pk_values.push(None);
} else {
let pk_bytes = pk_values_array.value(i);
let decoded = self.codec.decode(pk_bytes).context(DecodeSnafu)?;
decoded_pk_values.push(Some(decoded));
}
}
let decoded_pks = decode_primary_keys(self.codec.as_ref(), &batch)?;
// Builds decoded tag column arrays.
let mut decoded_columns = Vec::new();
for (column_id, pk_index, column_index) in &self.projected_primary_keys {
let column_metadata = &self.metadata.column_metadatas[*column_index];
let tag_column = self.build_primary_key_column(
let tag_column = decoded_pks.get_tag_column(
*column_id,
*pk_index,
Some(*pk_index),
&column_metadata.column_schema.data_type,
pk_dict_array.keys(),
&decoded_pk_values,
)?;
decoded_columns.push(tag_column);
}
@@ -648,57 +738,6 @@ impl FlatConvertFormat {
let new_schema = Arc::new(Schema::new(new_fields));
RecordBatch::try_new(new_schema, new_columns).context(NewRecordBatchSnafu)
}
/// Builds an array for a specific tag column.
///
/// It may build a dictionary array if the type is string. Note that the dictionary
/// array may have null values, although keys are not null.
fn build_primary_key_column(
&self,
column_id: ColumnId,
pk_index: usize,
column_type: &ConcreteDataType,
keys: &UInt32Array,
decoded_pk_values: &[Option<CompositeValues>],
) -> Result<ArrayRef> {
// Gets values from the primary key.
let mut builder = column_type.create_mutable_vector(decoded_pk_values.len());
for decoded_opt in decoded_pk_values {
match decoded_opt {
Some(decoded) => {
match decoded {
CompositeValues::Dense(dense) => {
if pk_index < dense.len() {
builder.push_value_ref(&dense[pk_index].1.as_value_ref());
} else {
builder.push_null();
}
}
CompositeValues::Sparse(sparse) => {
let value = sparse.get_or_null(column_id);
builder.push_value_ref(&value.as_value_ref());
}
};
}
None => builder.push_null(),
}
}
let values_vector = builder.to_vector();
let values_array = values_vector.to_arrow_array();
// Only creates dictionary array for string types, otherwise take values by keys
if column_type.is_string() {
// Creates dictionary array using the same keys for string types
// Note that the dictionary values may have nulls.
let dict_array = DictionaryArray::new(keys.clone(), values_array);
Ok(Arc::new(dict_array))
} else {
// For non-string types, takes values by keys indices to create a regular array
let taken_array = take(&values_array, keys, None).context(ComputeArrowSnafu)?;
Ok(taken_array)
}
}
}
#[cfg(test)]

View File

@@ -1397,6 +1397,7 @@ impl FlatRowGroupReader {
let record_batch = batch_result.context(ArrowReaderSnafu {
path: self.context.file_path(),
})?;
// Safety: only the flat format uses FlatRowGroupReader.
let flat_format = self.context.read_format().as_flat().unwrap();
let record_batch =

View File

@@ -57,7 +57,9 @@ use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index}
use crate::sst::parquet::format::PrimaryKeyWriteFormat;
use crate::sst::parquet::helper::parse_parquet_metadata;
use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo, WriteOptions};
use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions};
use crate::sst::{
DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator,
};
/// Parquet SST writer.
pub struct ParquetWriter<F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> {
@@ -176,7 +178,7 @@ where
) -> Result<()> {
// maybe_init_writer will create a new file.
if let Some(mut current_writer) = mem::take(&mut self.writer) {
let stats = mem::take(stats);
let mut stats = mem::take(stats);
// At least one row has been written.
assert!(stats.num_rows > 0);
@@ -211,6 +213,7 @@ where
// convert FileMetaData to ParquetMetaData
let parquet_metadata = parse_parquet_metadata(file_meta)?;
let num_series = stats.series_estimator.finish();
ssts.push(SstInfo {
file_id: self.current_file,
time_range,
@@ -219,6 +222,7 @@ where
num_row_groups: parquet_metadata.num_row_groups() as u64,
file_metadata: Some(Arc::new(parquet_metadata)),
index_metadata: index_output,
num_series,
});
self.current_file = FileId::random();
self.bytes_written.store(0, Ordering::Relaxed)
@@ -496,6 +500,8 @@ struct SourceStats {
num_rows: usize,
/// Time range of fetched batches.
time_range: Option<(Timestamp, Timestamp)>,
/// Series estimator for computing num_series.
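/// (Assumption: the estimator approximates the number of distinct series, i.e. distinct
/// primary keys, seen across the written batches; its result fills `num_series` in `SstInfo`.)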
series_estimator: SeriesEstimator,
}
impl SourceStats {
@@ -505,6 +511,7 @@ impl SourceStats {
}
self.num_rows += batch.num_rows();
self.series_estimator.update(batch);
// Safety: batch is not empty.
let (min_in_batch, max_in_batch) = (
batch.first_timestamp().unwrap(),
@@ -524,6 +531,7 @@ impl SourceStats {
}
self.num_rows += record_batch.num_rows();
self.series_estimator.update_flat(record_batch);
// Get the timestamp column by index
let time_index_col_idx = time_index_column_index(record_batch.num_columns());

View File

@@ -127,6 +127,7 @@ pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64)
index_file_size: 0,
num_rows: 0,
num_row_groups: 0,
num_series: 0,
sequence: None,
partition_expr: None,
},

View File

@@ -105,6 +105,7 @@ impl VersionControlBuilder {
index_file_size: 0,
num_rows: 0,
num_row_groups: 0,
num_series: 0,
sequence: NonZeroU64::new(start_ms as u64),
partition_expr: match &self.metadata.partition_expr {
Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str)
@@ -193,6 +194,7 @@ pub(crate) fn apply_edit(
index_file_size: 0,
num_rows: 0,
num_row_groups: 0,
num_series: 0,
sequence: NonZeroU64::new(*start_ms as u64),
partition_expr: match &version_control.current().version.metadata.partition_expr {
Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str)

View File

@@ -1220,10 +1220,10 @@ impl WorkerListener {
}
}
pub(crate) async fn on_index_build_success(&self, _region_file_id: RegionFileId) {
pub(crate) async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {
#[cfg(any(test, feature = "test"))]
if let Some(listener) = &self.listener {
listener.on_index_build_success(_region_file_id).await;
listener.on_index_build_finish(_region_file_id).await;
}
}
@@ -1233,6 +1233,13 @@ impl WorkerListener {
listener.on_index_build_begin(_region_file_id).await;
}
}
pub(crate) async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {
#[cfg(any(test, feature = "test"))]
if let Some(listener) = &self.listener {
listener.on_index_build_abort(_region_file_id).await;
}
}
}
#[cfg(test)]

Some files were not shown because too many files have changed in this diff.