mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-01-04 12:22:55 +00:00
Compare commits
32 Commits
v0.14.0-ni
...
v0.14.0-ni
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c5b55fd8cf | ||
|
|
8051dbbc31 | ||
|
|
2d3192984d | ||
|
|
bef45ed0e8 | ||
|
|
a9e990768d | ||
|
|
7e1ba49d3d | ||
|
|
737558ef53 | ||
|
|
dbc25dd8da | ||
|
|
76a58a07e1 | ||
|
|
c2ba7fb16c | ||
|
|
09ef24fd75 | ||
|
|
9b7b012620 | ||
|
|
898e0bd828 | ||
|
|
2b4ed43692 | ||
|
|
8f2ae4e136 | ||
|
|
0cd219a5d2 | ||
|
|
2b2ea5bf72 | ||
|
|
e107bd5529 | ||
|
|
a31f0e255b | ||
|
|
40b52f3b13 | ||
|
|
f13a43647a | ||
|
|
7bcb01d269 | ||
|
|
e81213728b | ||
|
|
d88482b996 | ||
|
|
3b547d9d13 | ||
|
|
278553fc3f | ||
|
|
a36901a653 | ||
|
|
c4ac242c69 | ||
|
|
9f9307de73 | ||
|
|
c77ce958a3 | ||
|
|
5ad2d8b3b8 | ||
|
|
2724c3c142 |
@@ -47,7 +47,6 @@ runs:
|
||||
shell: pwsh
|
||||
run: make test sqlness-test
|
||||
env:
|
||||
RUSTUP_WINDOWS_PATH_ADD_BIN: 1 # Workaround for https://github.com/nextest-rs/nextest/issues/1493
|
||||
RUST_BACKTRACE: 1
|
||||
SQLNESS_OPTS: "--preserve-state"
|
||||
|
||||
|
||||
1
.github/workflows/nightly-ci.yml
vendored
1
.github/workflows/nightly-ci.yml
vendored
@@ -107,7 +107,6 @@ jobs:
|
||||
CARGO_BUILD_RUSTFLAGS: "-C linker=lld-link"
|
||||
RUST_BACKTRACE: 1
|
||||
CARGO_INCREMENTAL: 0
|
||||
RUSTUP_WINDOWS_PATH_ADD_BIN: 1 # Workaround for https://github.com/nextest-rs/nextest/issues/1493
|
||||
GT_S3_BUCKET: ${{ vars.AWS_CI_TEST_BUCKET }}
|
||||
GT_S3_ACCESS_KEY_ID: ${{ secrets.AWS_CI_TEST_ACCESS_KEY_ID }}
|
||||
GT_S3_ACCESS_KEY: ${{ secrets.AWS_CI_TEST_SECRET_ACCESS_KEY }}
|
||||
|
||||
707
Cargo.lock
generated
707
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
34
Cargo.toml
34
Cargo.toml
@@ -88,7 +88,7 @@ rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
|
||||
#
|
||||
# See for more detaiils: https://github.com/rust-lang/cargo/issues/11329
|
||||
ahash = { version = "0.8", features = ["compile-time-rng"] }
|
||||
aquamarine = "0.3"
|
||||
aquamarine = "0.6"
|
||||
arrow = { version = "53.0.0", features = ["prettyprint"] }
|
||||
arrow-array = { version = "53.0.0", default-features = false, features = ["chrono-tz"] }
|
||||
arrow-flight = "53.0"
|
||||
@@ -99,9 +99,9 @@ async-trait = "0.1"
|
||||
# Remember to update axum-extra, axum-macros when updating axum
|
||||
axum = "0.8"
|
||||
axum-extra = "0.10"
|
||||
axum-macros = "0.4"
|
||||
axum-macros = "0.5"
|
||||
backon = "1"
|
||||
base64 = "0.21"
|
||||
base64 = "0.22"
|
||||
bigdecimal = "0.4.2"
|
||||
bitflags = "2.4.1"
|
||||
bytemuck = "1.12"
|
||||
@@ -111,7 +111,7 @@ chrono-tz = "0.10.1"
|
||||
clap = { version = "4.4", features = ["derive"] }
|
||||
config = "0.13.0"
|
||||
crossbeam-utils = "0.8"
|
||||
dashmap = "5.4"
|
||||
dashmap = "6.1"
|
||||
datafusion = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
|
||||
datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
|
||||
datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
|
||||
@@ -121,31 +121,31 @@ datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", r
|
||||
datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
|
||||
datafusion-sql = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
|
||||
datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "2464703c84c400a09cc59277018813f0e797bb4e" }
|
||||
deadpool = "0.10"
|
||||
deadpool-postgres = "0.12"
|
||||
derive_builder = "0.12"
|
||||
deadpool = "0.12"
|
||||
deadpool-postgres = "0.14"
|
||||
derive_builder = "0.20"
|
||||
dotenv = "0.15"
|
||||
etcd-client = "0.14"
|
||||
fst = "0.4.7"
|
||||
futures = "0.3"
|
||||
futures-util = "0.3"
|
||||
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "a7274ddce299f33d23dbe8af5bbe6219f07c559a" }
|
||||
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "97e298d119fdb9499bc6ba9e03f375cfa7cdf130" }
|
||||
hex = "0.4"
|
||||
http = "1"
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1"
|
||||
hyper = "1.1"
|
||||
hyper-util = "0.1"
|
||||
itertools = "0.10"
|
||||
itertools = "0.14"
|
||||
jsonb = { git = "https://github.com/databendlabs/jsonb.git", rev = "8c8d2fc294a39f3ff08909d60f718639cfba3875", default-features = false }
|
||||
lazy_static = "1.4"
|
||||
local-ip-address = "0.6"
|
||||
loki-proto = { git = "https://github.com/GreptimeTeam/loki-proto.git", rev = "1434ecf23a2654025d86188fb5205e7a74b225d3" }
|
||||
meter-core = { git = "https://github.com/GreptimeTeam/greptime-meter.git", rev = "5618e779cf2bb4755b499c630fba4c35e91898cb" }
|
||||
mockall = "0.11.4"
|
||||
mockall = "0.13"
|
||||
moka = "0.12"
|
||||
nalgebra = "0.33"
|
||||
notify = "6.1"
|
||||
notify = "8.0"
|
||||
num_cpus = "1.16"
|
||||
once_cell = "1.18"
|
||||
opentelemetry-proto = { version = "0.27", features = [
|
||||
@@ -163,8 +163,8 @@ prometheus = { version = "0.13.3", features = ["process"] }
|
||||
promql-parser = { version = "0.5", features = ["ser"] }
|
||||
prost = "0.13"
|
||||
raft-engine = { version = "0.4.1", default-features = false }
|
||||
rand = "0.8"
|
||||
ratelimit = "0.9"
|
||||
rand = "0.9"
|
||||
ratelimit = "0.10"
|
||||
regex = "1.8"
|
||||
regex-automata = "0.4"
|
||||
reqwest = { version = "0.12", default-features = false, features = [
|
||||
@@ -176,7 +176,7 @@ reqwest = { version = "0.12", default-features = false, features = [
|
||||
rskafka = { git = "https://github.com/influxdata/rskafka.git", rev = "75535b5ad9bae4a5dbb582c82e44dfd81ec10105", features = [
|
||||
"transport-tls",
|
||||
] }
|
||||
rstest = "0.21"
|
||||
rstest = "0.25"
|
||||
rstest_reuse = "0.7"
|
||||
rust_decimal = "1.33"
|
||||
rustc-hash = "2.0"
|
||||
@@ -184,7 +184,7 @@ rustls = { version = "0.23.20", default-features = false } # override by patch,
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = { version = "1.0", features = ["float_roundtrip"] }
|
||||
serde_with = "3"
|
||||
shadow-rs = "0.38"
|
||||
shadow-rs = "1.1"
|
||||
similar-asserts = "1.6.0"
|
||||
smallvec = { version = "1", features = ["serde"] }
|
||||
snafu = "0.8"
|
||||
@@ -194,13 +194,13 @@ sqlx = { version = "0.8", features = [
|
||||
"postgres",
|
||||
"chrono",
|
||||
] }
|
||||
sysinfo = "0.30"
|
||||
sysinfo = "0.33"
|
||||
# on branch v0.52.x
|
||||
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "71dd86058d2af97b9925093d40c4e03360403170", features = [
|
||||
"visitor",
|
||||
"serde",
|
||||
] } # on branch v0.44.x
|
||||
strum = { version = "0.25", features = ["derive"] }
|
||||
strum = { version = "0.27", features = ["derive"] }
|
||||
tempfile = "3"
|
||||
tokio = { version = "1.40", features = ["full"] }
|
||||
tokio-postgres = "0.7"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
This document introduces how to write fuzz tests in GreptimeDB.
|
||||
|
||||
## What is a fuzz test
|
||||
Fuzz test is tool that leverage deterministic random generation to assist in finding bugs. The goal of fuzz tests is to identify inputs generated by the fuzzer that cause system panics, crashes, or unexpected behaviors to occur. And we are using the [cargo-fuzz](https://github.com/rust-fuzz/cargo-fuzz) to run our fuzz test targets.
|
||||
Fuzz test is tool that leverage deterministic random generation to assist in finding bugs. The goal of fuzz tests is to identify inputs generated by the fuzzer that cause system panics, crashes, or unexpected behaviors to occur. And we are using the [cargo-fuzz](https://github.com/rust-fuzz/cargo-fuzz) to run our fuzz test targets.
|
||||
|
||||
## Why we need them
|
||||
- Find bugs by leveraging random generation
|
||||
@@ -13,7 +13,7 @@ Fuzz test is tool that leverage deterministic random generation to assist in fin
|
||||
All fuzz test-related resources are located in the `/tests-fuzz` directory.
|
||||
There are two types of resources: (1) fundamental components and (2) test targets.
|
||||
|
||||
### Fundamental components
|
||||
### Fundamental components
|
||||
They are located in the `/tests-fuzz/src` directory. The fundamental components define how to generate SQLs (including dialects for different protocols) and validate execution results (e.g., column attribute validation), etc.
|
||||
|
||||
### Test targets
|
||||
@@ -21,25 +21,25 @@ They are located in the `/tests-fuzz/targets` directory, with each file represen
|
||||
|
||||
Figure 1 illustrates the fundamental components of the fuzz test provide the ability to generate random SQLs. It utilizes a Random Number Generator (Rng) to generate the Intermediate Representation (IR), then employs a DialectTranslator to produce specified dialects for different protocols. Finally, the fuzz tests send the generated SQL via the specified protocol and verify that the execution results meet expectations.
|
||||
```
|
||||
Rng
|
||||
|
|
||||
|
|
||||
v
|
||||
ExprGenerator
|
||||
|
|
||||
|
|
||||
v
|
||||
Intermediate representation (IR)
|
||||
|
|
||||
|
|
||||
+----------------------+----------------------+
|
||||
| | |
|
||||
v v v
|
||||
Rng
|
||||
|
|
||||
|
|
||||
v
|
||||
ExprGenerator
|
||||
|
|
||||
|
|
||||
v
|
||||
Intermediate representation (IR)
|
||||
|
|
||||
|
|
||||
+----------------------+----------------------+
|
||||
| | |
|
||||
v v v
|
||||
MySQLTranslator PostgreSQLTranslator OtherDialectTranslator
|
||||
| | |
|
||||
| | |
|
||||
v v v
|
||||
SQL(MySQL Dialect) ..... .....
|
||||
| | |
|
||||
| | |
|
||||
v v v
|
||||
SQL(MySQL Dialect) ..... .....
|
||||
|
|
||||
|
|
||||
v
|
||||
@@ -133,4 +133,4 @@ fuzz_target!(|input: FuzzInput| {
|
||||
cargo fuzz run <fuzz-target> --fuzz-dir tests-fuzz
|
||||
```
|
||||
|
||||
For more details, please refer to this [document](/tests-fuzz/README.md).
|
||||
For more details, please refer to this [document](/tests-fuzz/README.md).
|
||||
|
||||
77
docs/rfcs/2025-02-06-remote-wal-purge.md
Normal file
77
docs/rfcs/2025-02-06-remote-wal-purge.md
Normal file
@@ -0,0 +1,77 @@
|
||||
---
|
||||
Feature Name: Remote WAL Purge
|
||||
Tracking Issue: https://github.com/GreptimeTeam/greptimedb/issues/5474
|
||||
Date: 2025-02-06
|
||||
Author: "Yuhan Wang <profsyb@gmail.com>"
|
||||
---
|
||||
|
||||
# Summary
|
||||
|
||||
This RFC proposes a method for purging remote WAL in the database.
|
||||
|
||||
# Motivation
|
||||
|
||||
Currently only local wal entries are purged when flushing, while remote wal does nothing.
|
||||
|
||||
# Details
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
Region0->>Kafka: Last entry id of the topic in use
|
||||
Region0->>WALPruner: Heartbeat with last entry id
|
||||
WALPruner->>+WALPruner: Time Loop
|
||||
WALPruner->>+ProcedureManager: Submit purge procedure
|
||||
ProcedureManager->>Region0: Flush request
|
||||
ProcedureManager->>Kafka: Prune WAL entries
|
||||
Region0->>Region0: Flush
|
||||
```
|
||||
|
||||
## Steps
|
||||
|
||||
### Before purge
|
||||
|
||||
Before purging remote WAL, metasrv needs to know:
|
||||
|
||||
1. `last_entry_id` of each region.
|
||||
2. `kafka_topic_last_entry_id` which is the last entry id of the topic in use. Can be lazily updated and needed when region has empty memtable.
|
||||
3. Kafka topics that each region uses.
|
||||
|
||||
The states are maintained through:
|
||||
1. Heartbeat: Datanode sends `last_entry_id` to metasrv in heartbeat. As for regions with empty memtable, `last_entry_id` should equals to `kafka_topic_last_entry_id`.
|
||||
2. Metasrv maintains a topic-region map to know which region uses which topic.
|
||||
|
||||
`kafka_topic_last_entry_id` will be maintained by the region itself. Region will update the value after `k` heartbeats if the memtable is empty.
|
||||
|
||||
### Purge procedure
|
||||
|
||||
We can better handle locks utilizing current procedure. It's quite similar to the region migration procedure.
|
||||
|
||||
After a period of time, metasrv will submit a purge procedure to ProcedureManager. The purge will apply to all topics.
|
||||
|
||||
The procedure is divided into following stages:
|
||||
|
||||
1. Preparation:
|
||||
- Retrieve `last_entry_id` of each region kvbackend.
|
||||
- Choose regions that have a relatively small `last_entry_id` as candidate regions, which means we need to send a flush request to these regions.
|
||||
2. Communication:
|
||||
- Send flush requests to candidate regions.
|
||||
3. Purge:
|
||||
- Choose proper entry id to delete for each topic. The entry should be the smallest `last_entry_id - 1` among all regions.
|
||||
- Delete legacy entries in Kafka.
|
||||
- Store the `last_purged_entry_id` in kvbackend. It should be locked to prevent other regions from replaying the purged entries.
|
||||
|
||||
### After purge
|
||||
|
||||
After purge, there may be some regions that have `last_entry_id` smaller than the entry we just deleted. It's legal since we only delete the entries that are not needed anymore.
|
||||
|
||||
When restarting a region, it should query the `last_purged_entry_id` from metasrv and replay from `min(last_entry_id, last_purged_entry_id)`.
|
||||
|
||||
### Error handling
|
||||
|
||||
No persisted states are needed since all states are maintained in kvbackend.
|
||||
|
||||
Retry when failed to retrieving metadata from kvbackend.
|
||||
|
||||
# Alternatives
|
||||
|
||||
Purge time can depend on the size of the WAL entries instead of a fixed period of time, which may be more efficient.
|
||||
@@ -19,7 +19,7 @@ mod information_memory_table;
|
||||
pub mod key_column_usage;
|
||||
mod partitions;
|
||||
mod procedure_info;
|
||||
mod region_peers;
|
||||
pub mod region_peers;
|
||||
mod region_statistics;
|
||||
mod runtime_metrics;
|
||||
pub mod schemata;
|
||||
|
||||
@@ -56,6 +56,8 @@ pub const TABLE_CATALOG: &str = "table_catalog";
|
||||
pub const TABLE_SCHEMA: &str = "table_schema";
|
||||
pub const TABLE_NAME: &str = "table_name";
|
||||
pub const COLUMN_NAME: &str = "column_name";
|
||||
pub const REGION_ID: &str = "region_id";
|
||||
pub const PEER_ID: &str = "peer_id";
|
||||
const ORDINAL_POSITION: &str = "ordinal_position";
|
||||
const CHARACTER_MAXIMUM_LENGTH: &str = "character_maximum_length";
|
||||
const CHARACTER_OCTET_LENGTH: &str = "character_octet_length";
|
||||
|
||||
@@ -21,6 +21,7 @@ use common_error::ext::BoxedError;
|
||||
use common_meta::rpc::router::RegionRoute;
|
||||
use common_recordbatch::adapter::RecordBatchStreamAdapter;
|
||||
use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
|
||||
use datafusion::common::HashMap;
|
||||
use datafusion::execution::TaskContext;
|
||||
use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter;
|
||||
use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
|
||||
@@ -43,16 +44,22 @@ use crate::kvbackend::KvBackendCatalogManager;
|
||||
use crate::system_schema::information_schema::{InformationTable, Predicates};
|
||||
use crate::CatalogManager;
|
||||
|
||||
const REGION_ID: &str = "region_id";
|
||||
const PEER_ID: &str = "peer_id";
|
||||
pub const TABLE_CATALOG: &str = "table_catalog";
|
||||
pub const TABLE_SCHEMA: &str = "table_schema";
|
||||
pub const TABLE_NAME: &str = "table_name";
|
||||
pub const REGION_ID: &str = "region_id";
|
||||
pub const PEER_ID: &str = "peer_id";
|
||||
const PEER_ADDR: &str = "peer_addr";
|
||||
const IS_LEADER: &str = "is_leader";
|
||||
pub const IS_LEADER: &str = "is_leader";
|
||||
const STATUS: &str = "status";
|
||||
const DOWN_SECONDS: &str = "down_seconds";
|
||||
const INIT_CAPACITY: usize = 42;
|
||||
|
||||
/// The `REGION_PEERS` table provides information about the region distribution and routes. Including fields:
|
||||
///
|
||||
/// - `table_catalog`: the table catalog name
|
||||
/// - `table_schema`: the table schema name
|
||||
/// - `table_name`: the table name
|
||||
/// - `region_id`: the region id
|
||||
/// - `peer_id`: the region storage datanode peer id
|
||||
/// - `peer_addr`: the region storage datanode gRPC peer address
|
||||
@@ -77,6 +84,9 @@ impl InformationSchemaRegionPeers {
|
||||
|
||||
pub(crate) fn schema() -> SchemaRef {
|
||||
Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new(TABLE_CATALOG, ConcreteDataType::string_datatype(), false),
|
||||
ColumnSchema::new(TABLE_SCHEMA, ConcreteDataType::string_datatype(), false),
|
||||
ColumnSchema::new(TABLE_NAME, ConcreteDataType::string_datatype(), false),
|
||||
ColumnSchema::new(REGION_ID, ConcreteDataType::uint64_datatype(), false),
|
||||
ColumnSchema::new(PEER_ID, ConcreteDataType::uint64_datatype(), true),
|
||||
ColumnSchema::new(PEER_ADDR, ConcreteDataType::string_datatype(), true),
|
||||
@@ -134,6 +144,9 @@ struct InformationSchemaRegionPeersBuilder {
|
||||
catalog_name: String,
|
||||
catalog_manager: Weak<dyn CatalogManager>,
|
||||
|
||||
table_catalogs: StringVectorBuilder,
|
||||
table_schemas: StringVectorBuilder,
|
||||
table_names: StringVectorBuilder,
|
||||
region_ids: UInt64VectorBuilder,
|
||||
peer_ids: UInt64VectorBuilder,
|
||||
peer_addrs: StringVectorBuilder,
|
||||
@@ -152,6 +165,9 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
schema,
|
||||
catalog_name,
|
||||
catalog_manager,
|
||||
table_catalogs: StringVectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
table_schemas: StringVectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
table_names: StringVectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
region_ids: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
peer_ids: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
peer_addrs: StringVectorBuilder::with_capacity(INIT_CAPACITY),
|
||||
@@ -177,24 +193,28 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let table_id_stream = catalog_manager
|
||||
let table_stream = catalog_manager
|
||||
.tables(&catalog_name, &schema_name, None)
|
||||
.try_filter_map(|t| async move {
|
||||
let table_info = t.table_info();
|
||||
if table_info.table_type == TableType::Temporary {
|
||||
Ok(None)
|
||||
} else {
|
||||
Ok(Some(table_info.ident.table_id))
|
||||
Ok(Some((
|
||||
table_info.ident.table_id,
|
||||
table_info.name.to_string(),
|
||||
)))
|
||||
}
|
||||
});
|
||||
|
||||
const BATCH_SIZE: usize = 128;
|
||||
|
||||
// Split table ids into chunks
|
||||
let mut table_id_chunks = pin!(table_id_stream.ready_chunks(BATCH_SIZE));
|
||||
// Split tables into chunks
|
||||
let mut table_chunks = pin!(table_stream.ready_chunks(BATCH_SIZE));
|
||||
|
||||
while let Some(table_ids) = table_id_chunks.next().await {
|
||||
let table_ids = table_ids.into_iter().collect::<Result<Vec<_>>>()?;
|
||||
while let Some(tables) = table_chunks.next().await {
|
||||
let tables = tables.into_iter().collect::<Result<HashMap<_, _>>>()?;
|
||||
let table_ids = tables.keys().cloned().collect::<Vec<_>>();
|
||||
|
||||
let table_routes = if let Some(partition_manager) = &partition_manager {
|
||||
partition_manager
|
||||
@@ -206,7 +226,16 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
};
|
||||
|
||||
for (table_id, routes) in table_routes {
|
||||
self.add_region_peers(&predicates, table_id, &routes);
|
||||
// Safety: table_id is guaranteed to be in the map
|
||||
let table_name = tables.get(&table_id).unwrap();
|
||||
self.add_region_peers(
|
||||
&catalog_name,
|
||||
&schema_name,
|
||||
table_name,
|
||||
&predicates,
|
||||
table_id,
|
||||
&routes,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,6 +245,9 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
|
||||
fn add_region_peers(
|
||||
&mut self,
|
||||
table_catalog: &str,
|
||||
table_schema: &str,
|
||||
table_name: &str,
|
||||
predicates: &Predicates,
|
||||
table_id: TableId,
|
||||
routes: &[RegionRoute],
|
||||
@@ -231,13 +263,20 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
Some("ALIVE".to_string())
|
||||
};
|
||||
|
||||
let row = [(REGION_ID, &Value::from(region_id))];
|
||||
let row = [
|
||||
(TABLE_CATALOG, &Value::from(table_catalog)),
|
||||
(TABLE_SCHEMA, &Value::from(table_schema)),
|
||||
(TABLE_NAME, &Value::from(table_name)),
|
||||
(REGION_ID, &Value::from(region_id)),
|
||||
];
|
||||
|
||||
if !predicates.eval(&row) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(dennis): adds followers.
|
||||
self.table_catalogs.push(Some(table_catalog));
|
||||
self.table_schemas.push(Some(table_schema));
|
||||
self.table_names.push(Some(table_name));
|
||||
self.region_ids.push(Some(region_id));
|
||||
self.peer_ids.push(peer_id);
|
||||
self.peer_addrs.push(peer_addr.as_deref());
|
||||
@@ -245,11 +284,26 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
self.statuses.push(state.as_deref());
|
||||
self.down_seconds
|
||||
.push(route.leader_down_millis().map(|m| m / 1000));
|
||||
|
||||
for follower in &route.follower_peers {
|
||||
self.table_catalogs.push(Some(table_catalog));
|
||||
self.table_schemas.push(Some(table_schema));
|
||||
self.table_names.push(Some(table_name));
|
||||
self.region_ids.push(Some(region_id));
|
||||
self.peer_ids.push(Some(follower.id));
|
||||
self.peer_addrs.push(Some(follower.addr.as_str()));
|
||||
self.is_leaders.push(Some("No"));
|
||||
self.statuses.push(None);
|
||||
self.down_seconds.push(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finish(&mut self) -> Result<RecordBatch> {
|
||||
let columns: Vec<VectorRef> = vec![
|
||||
Arc::new(self.table_catalogs.finish()),
|
||||
Arc::new(self.table_schemas.finish()),
|
||||
Arc::new(self.table_names.finish()),
|
||||
Arc::new(self.region_ids.finish()),
|
||||
Arc::new(self.peer_ids.finish()),
|
||||
Arc::new(self.peer_addrs.finish()),
|
||||
|
||||
@@ -177,7 +177,7 @@ fn create_table_info(table_id: TableId, table_name: TableName) -> RawTableInfo {
|
||||
|
||||
fn create_region_routes(regions: Vec<RegionNumber>) -> Vec<RegionRoute> {
|
||||
let mut region_routes = Vec::with_capacity(100);
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
for region_id in regions.into_iter().map(u64::from) {
|
||||
region_routes.push(RegionRoute {
|
||||
@@ -188,7 +188,7 @@ fn create_region_routes(regions: Vec<RegionNumber>) -> Vec<RegionRoute> {
|
||||
attrs: BTreeMap::new(),
|
||||
},
|
||||
leader_peer: Some(Peer {
|
||||
id: rng.gen_range(0..10),
|
||||
id: rng.random_range(0..10),
|
||||
addr: String::new(),
|
||||
}),
|
||||
follower_peers: vec![],
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
mod client;
|
||||
pub mod client_manager;
|
||||
#[cfg(feature = "testing")]
|
||||
mod database;
|
||||
pub mod error;
|
||||
pub mod flow;
|
||||
@@ -34,7 +33,6 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
|
||||
use snafu::OptionExt;
|
||||
|
||||
pub use self::client::Client;
|
||||
#[cfg(feature = "testing")]
|
||||
pub use self::database::Database;
|
||||
pub use self::error::{Error, Result};
|
||||
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use enum_dispatch::enum_dispatch;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::seq::IndexedRandom;
|
||||
|
||||
#[enum_dispatch]
|
||||
pub trait LoadBalance {
|
||||
@@ -37,7 +37,7 @@ pub struct Random;
|
||||
|
||||
impl LoadBalance for Random {
|
||||
fn get_peer<'a>(&self, peers: &'a [String]) -> Option<&'a String> {
|
||||
peers.choose(&mut rand::thread_rng())
|
||||
peers.choose(&mut rand::rng())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -295,10 +295,13 @@ impl StartCommand {
|
||||
msg: "'meta_client_options'",
|
||||
})?;
|
||||
|
||||
let meta_client =
|
||||
meta_client::create_meta_client(MetaClientType::Datanode { member_id }, meta_config)
|
||||
.await
|
||||
.context(MetaClientInitSnafu)?;
|
||||
let meta_client = meta_client::create_meta_client(
|
||||
MetaClientType::Datanode { member_id },
|
||||
meta_config,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(MetaClientInitSnafu)?;
|
||||
|
||||
let meta_backend = Arc::new(MetaKvBackend {
|
||||
client: meta_client.clone(),
|
||||
|
||||
@@ -100,6 +100,13 @@ pub enum Error {
|
||||
source: flow::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Servers error"))]
|
||||
Servers {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
source: servers::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to start frontend"))]
|
||||
StartFrontend {
|
||||
#[snafu(implicit)]
|
||||
@@ -365,6 +372,7 @@ impl ErrorExt for Error {
|
||||
Error::ShutdownFrontend { source, .. } => source.status_code(),
|
||||
Error::StartMetaServer { source, .. } => source.status_code(),
|
||||
Error::ShutdownMetaServer { source, .. } => source.status_code(),
|
||||
Error::Servers { source, .. } => source.status_code(),
|
||||
Error::BuildMetaServer { source, .. } => source.status_code(),
|
||||
Error::UnsupportedSelectorType { source, .. } => source.status_code(),
|
||||
Error::BuildCli { source, .. } => source.status_code(),
|
||||
|
||||
@@ -249,10 +249,13 @@ impl StartCommand {
|
||||
msg: "'meta_client_options'",
|
||||
})?;
|
||||
|
||||
let meta_client =
|
||||
meta_client::create_meta_client(MetaClientType::Flownode { member_id }, meta_config)
|
||||
.await
|
||||
.context(MetaClientInitSnafu)?;
|
||||
let meta_client = meta_client::create_meta_client(
|
||||
MetaClientType::Flownode { member_id },
|
||||
meta_config,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(MetaClientInitSnafu)?;
|
||||
|
||||
let cache_max_capacity = meta_config.metadata_cache_max_capacity;
|
||||
let cache_ttl = meta_config.metadata_cache_ttl;
|
||||
|
||||
@@ -32,28 +32,25 @@ use common_telemetry::info;
|
||||
use common_telemetry::logging::TracingOptions;
|
||||
use common_time::timezone::set_default_timezone;
|
||||
use common_version::{short_version, version};
|
||||
use frontend::frontend::Frontend;
|
||||
use frontend::heartbeat::HeartbeatTask;
|
||||
use frontend::instance::builder::FrontendBuilder;
|
||||
use frontend::instance::{FrontendInstance, Instance as FeInstance};
|
||||
use frontend::server::Services;
|
||||
use meta_client::{MetaClientOptions, MetaClientType};
|
||||
use query::stats::StatementStatistics;
|
||||
use servers::export_metrics::ExportMetricsTask;
|
||||
use servers::tls::{TlsMode, TlsOption};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
|
||||
use crate::error::{
|
||||
self, InitTimezoneSnafu, LoadLayeredConfigSnafu, MetaClientInitSnafu, MissingConfigSnafu,
|
||||
Result, StartFrontendSnafu,
|
||||
};
|
||||
use crate::error::{self, Result};
|
||||
use crate::options::{GlobalOptions, GreptimeOptions};
|
||||
use crate::{log_versions, App};
|
||||
|
||||
type FrontendOptions = GreptimeOptions<frontend::frontend::FrontendOptions>;
|
||||
|
||||
pub struct Instance {
|
||||
frontend: FeInstance,
|
||||
|
||||
frontend: Frontend,
|
||||
// Keep the logging guard to prevent the worker from being dropped.
|
||||
_guard: Vec<WorkerGuard>,
|
||||
}
|
||||
@@ -61,20 +58,17 @@ pub struct Instance {
|
||||
pub const APP_NAME: &str = "greptime-frontend";
|
||||
|
||||
impl Instance {
|
||||
pub fn new(frontend: FeInstance, guard: Vec<WorkerGuard>) -> Self {
|
||||
Self {
|
||||
frontend,
|
||||
_guard: guard,
|
||||
}
|
||||
pub fn new(frontend: Frontend, _guard: Vec<WorkerGuard>) -> Self {
|
||||
Self { frontend, _guard }
|
||||
}
|
||||
|
||||
pub fn mut_inner(&mut self) -> &mut FeInstance {
|
||||
&mut self.frontend
|
||||
}
|
||||
|
||||
pub fn inner(&self) -> &FeInstance {
|
||||
pub fn inner(&self) -> &Frontend {
|
||||
&self.frontend
|
||||
}
|
||||
|
||||
pub fn mut_inner(&mut self) -> &mut Frontend {
|
||||
&mut self.frontend
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -84,11 +78,15 @@ impl App for Instance {
|
||||
}
|
||||
|
||||
async fn start(&mut self) -> Result<()> {
|
||||
plugins::start_frontend_plugins(self.frontend.plugins().clone())
|
||||
let plugins = self.frontend.instance.plugins().clone();
|
||||
plugins::start_frontend_plugins(plugins)
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
self.frontend.start().await.context(StartFrontendSnafu)
|
||||
self.frontend
|
||||
.start()
|
||||
.await
|
||||
.context(error::StartFrontendSnafu)
|
||||
}
|
||||
|
||||
async fn stop(&self) -> Result<()> {
|
||||
@@ -178,7 +176,7 @@ impl StartCommand {
|
||||
self.config_file.as_deref(),
|
||||
self.env_prefix.as_ref(),
|
||||
)
|
||||
.context(LoadLayeredConfigSnafu)?;
|
||||
.context(error::LoadLayeredConfigSnafu)?;
|
||||
|
||||
self.merge_with_cli_options(global_options, &mut opts)?;
|
||||
|
||||
@@ -283,22 +281,28 @@ impl StartCommand {
|
||||
let mut plugins = Plugins::new();
|
||||
plugins::setup_frontend_plugins(&mut plugins, &plugin_opts, &opts)
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
set_default_timezone(opts.default_timezone.as_deref()).context(InitTimezoneSnafu)?;
|
||||
set_default_timezone(opts.default_timezone.as_deref()).context(error::InitTimezoneSnafu)?;
|
||||
|
||||
let meta_client_options = opts.meta_client.as_ref().context(MissingConfigSnafu {
|
||||
msg: "'meta_client'",
|
||||
})?;
|
||||
let meta_client_options = opts
|
||||
.meta_client
|
||||
.as_ref()
|
||||
.context(error::MissingConfigSnafu {
|
||||
msg: "'meta_client'",
|
||||
})?;
|
||||
|
||||
let cache_max_capacity = meta_client_options.metadata_cache_max_capacity;
|
||||
let cache_ttl = meta_client_options.metadata_cache_ttl;
|
||||
let cache_tti = meta_client_options.metadata_cache_tti;
|
||||
|
||||
let meta_client =
|
||||
meta_client::create_meta_client(MetaClientType::Frontend, meta_client_options)
|
||||
.await
|
||||
.context(MetaClientInitSnafu)?;
|
||||
let meta_client = meta_client::create_meta_client(
|
||||
MetaClientType::Frontend,
|
||||
meta_client_options,
|
||||
Some(&plugins),
|
||||
)
|
||||
.await
|
||||
.context(error::MetaClientInitSnafu)?;
|
||||
|
||||
// TODO(discord9): add helper function to ease the creation of cache registry&such
|
||||
let cached_meta_backend =
|
||||
@@ -345,6 +349,7 @@ impl StartCommand {
|
||||
opts.heartbeat.clone(),
|
||||
Arc::new(executor),
|
||||
);
|
||||
let heartbeat_task = Some(heartbeat_task);
|
||||
|
||||
// frontend to datanode need not timeout.
|
||||
// Some queries are expected to take long time.
|
||||
@@ -356,7 +361,7 @@ impl StartCommand {
|
||||
};
|
||||
let client = NodeClients::new(channel_config);
|
||||
|
||||
let mut instance = FrontendBuilder::new(
|
||||
let instance = FrontendBuilder::new(
|
||||
opts.clone(),
|
||||
cached_meta_backend.clone(),
|
||||
layered_cache_registry.clone(),
|
||||
@@ -367,20 +372,27 @@ impl StartCommand {
|
||||
)
|
||||
.with_plugin(plugins.clone())
|
||||
.with_local_cache_invalidator(layered_cache_registry)
|
||||
.with_heartbeat_task(heartbeat_task)
|
||||
.try_build()
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
let instance = Arc::new(instance);
|
||||
|
||||
let servers = Services::new(opts, Arc::new(instance.clone()), plugins)
|
||||
let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins))
|
||||
.context(error::ServersSnafu)?;
|
||||
|
||||
let servers = Services::new(opts, instance.clone(), plugins)
|
||||
.build()
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
instance
|
||||
.build_servers(servers)
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
Ok(Instance::new(instance, guard))
|
||||
let frontend = Frontend {
|
||||
instance,
|
||||
servers,
|
||||
heartbeat_task,
|
||||
export_metrics_task,
|
||||
};
|
||||
|
||||
Ok(Instance::new(frontend, guard))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,9 +55,9 @@ use datanode::datanode::{Datanode, DatanodeBuilder};
|
||||
use datanode::region_server::RegionServer;
|
||||
use file_engine::config::EngineConfig as FileEngineConfig;
|
||||
use flow::{FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeOptions, FrontendInvoker};
|
||||
use frontend::frontend::FrontendOptions;
|
||||
use frontend::frontend::{Frontend, FrontendOptions};
|
||||
use frontend::instance::builder::FrontendBuilder;
|
||||
use frontend::instance::{FrontendInstance, Instance as FeInstance, StandaloneDatanodeManager};
|
||||
use frontend::instance::{Instance as FeInstance, StandaloneDatanodeManager};
|
||||
use frontend::server::Services;
|
||||
use frontend::service_config::{
|
||||
InfluxdbOptions, JaegerOptions, MysqlOptions, OpentsdbOptions, PostgresOptions,
|
||||
@@ -67,7 +67,7 @@ use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ};
|
||||
use mito2::config::MitoConfig;
|
||||
use query::stats::StatementStatistics;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use servers::export_metrics::ExportMetricsOption;
|
||||
use servers::export_metrics::{ExportMetricsOption, ExportMetricsTask};
|
||||
use servers::grpc::GrpcOptions;
|
||||
use servers::http::HttpOptions;
|
||||
use servers::tls::{TlsMode, TlsOption};
|
||||
@@ -76,15 +76,9 @@ use snafu::ResultExt;
|
||||
use tokio::sync::{broadcast, RwLock};
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
|
||||
use crate::error::{
|
||||
BuildCacheRegistrySnafu, BuildWalOptionsAllocatorSnafu, CreateDirSnafu, IllegalConfigSnafu,
|
||||
InitDdlManagerSnafu, InitMetadataSnafu, InitTimezoneSnafu, LoadLayeredConfigSnafu, OtherSnafu,
|
||||
Result, ShutdownDatanodeSnafu, ShutdownFlownodeSnafu, ShutdownFrontendSnafu,
|
||||
StartDatanodeSnafu, StartFlownodeSnafu, StartFrontendSnafu, StartProcedureManagerSnafu,
|
||||
StartWalOptionsAllocatorSnafu, StopProcedureManagerSnafu,
|
||||
};
|
||||
use crate::error::Result;
|
||||
use crate::options::{GlobalOptions, GreptimeOptions};
|
||||
use crate::{log_versions, App};
|
||||
use crate::{error, log_versions, App};
|
||||
|
||||
pub const APP_NAME: &str = "greptime-standalone";
|
||||
|
||||
@@ -251,13 +245,12 @@ impl StandaloneOptions {
|
||||
|
||||
pub struct Instance {
|
||||
datanode: Datanode,
|
||||
frontend: FeInstance,
|
||||
frontend: Frontend,
|
||||
// TODO(discord9): wrapped it in flownode instance instead
|
||||
flow_worker_manager: Arc<FlowWorkerManager>,
|
||||
flow_shutdown: broadcast::Sender<()>,
|
||||
procedure_manager: ProcedureManagerRef,
|
||||
wal_options_allocator: WalOptionsAllocatorRef,
|
||||
|
||||
// Keep the logging guard to prevent the worker from being dropped.
|
||||
_guard: Vec<WorkerGuard>,
|
||||
}
|
||||
@@ -281,21 +274,26 @@ impl App for Instance {
|
||||
self.procedure_manager
|
||||
.start()
|
||||
.await
|
||||
.context(StartProcedureManagerSnafu)?;
|
||||
.context(error::StartProcedureManagerSnafu)?;
|
||||
|
||||
self.wal_options_allocator
|
||||
.start()
|
||||
.await
|
||||
.context(StartWalOptionsAllocatorSnafu)?;
|
||||
.context(error::StartWalOptionsAllocatorSnafu)?;
|
||||
|
||||
plugins::start_frontend_plugins(self.frontend.plugins().clone())
|
||||
plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
self.frontend
|
||||
.start()
|
||||
.await
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
self.frontend.start().await.context(StartFrontendSnafu)?;
|
||||
self.flow_worker_manager
|
||||
.clone()
|
||||
.run_background(Some(self.flow_shutdown.subscribe()));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -303,17 +301,18 @@ impl App for Instance {
|
||||
self.frontend
|
||||
.shutdown()
|
||||
.await
|
||||
.context(ShutdownFrontendSnafu)?;
|
||||
.context(error::ShutdownFrontendSnafu)?;
|
||||
|
||||
self.procedure_manager
|
||||
.stop()
|
||||
.await
|
||||
.context(StopProcedureManagerSnafu)?;
|
||||
.context(error::StopProcedureManagerSnafu)?;
|
||||
|
||||
self.datanode
|
||||
.shutdown()
|
||||
.await
|
||||
.context(ShutdownDatanodeSnafu)?;
|
||||
.context(error::ShutdownDatanodeSnafu)?;
|
||||
|
||||
self.flow_shutdown
|
||||
.send(())
|
||||
.map_err(|_e| {
|
||||
@@ -322,7 +321,8 @@ impl App for Instance {
|
||||
}
|
||||
.build()
|
||||
})
|
||||
.context(ShutdownFlownodeSnafu)?;
|
||||
.context(error::ShutdownFlownodeSnafu)?;
|
||||
|
||||
info!("Datanode instance stopped.");
|
||||
|
||||
Ok(())
|
||||
@@ -368,7 +368,7 @@ impl StartCommand {
|
||||
self.config_file.as_deref(),
|
||||
self.env_prefix.as_ref(),
|
||||
)
|
||||
.context(LoadLayeredConfigSnafu)?;
|
||||
.context(error::LoadLayeredConfigSnafu)?;
|
||||
|
||||
self.merge_with_cli_options(global_options, &mut opts.component)?;
|
||||
|
||||
@@ -415,7 +415,7 @@ impl StartCommand {
|
||||
// frontend grpc addr conflict with datanode default grpc addr
|
||||
let datanode_grpc_addr = DatanodeOptions::default().grpc.bind_addr;
|
||||
if addr.eq(&datanode_grpc_addr) {
|
||||
return IllegalConfigSnafu {
|
||||
return error::IllegalConfigSnafu {
|
||||
msg: format!(
|
||||
"gRPC listen address conflicts with datanode reserved gRPC addr: {datanode_grpc_addr}",
|
||||
),
|
||||
@@ -474,18 +474,19 @@ impl StartCommand {
|
||||
|
||||
plugins::setup_frontend_plugins(&mut plugins, &plugin_opts, &fe_opts)
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
plugins::setup_datanode_plugins(&mut plugins, &plugin_opts, &dn_opts)
|
||||
.await
|
||||
.context(StartDatanodeSnafu)?;
|
||||
.context(error::StartDatanodeSnafu)?;
|
||||
|
||||
set_default_timezone(fe_opts.default_timezone.as_deref()).context(InitTimezoneSnafu)?;
|
||||
set_default_timezone(fe_opts.default_timezone.as_deref())
|
||||
.context(error::InitTimezoneSnafu)?;
|
||||
|
||||
let data_home = &dn_opts.storage.data_home;
|
||||
// Ensure the data_home directory exists.
|
||||
fs::create_dir_all(path::Path::new(data_home))
|
||||
.context(CreateDirSnafu { dir: data_home })?;
|
||||
.context(error::CreateDirSnafu { dir: data_home })?;
|
||||
|
||||
let metadata_dir = metadata_store_dir(data_home);
|
||||
let (kv_backend, procedure_manager) = FeInstance::try_build_standalone_components(
|
||||
@@ -494,7 +495,7 @@ impl StartCommand {
|
||||
opts.procedure,
|
||||
)
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
// Builds cache registry
|
||||
let layered_cache_builder = LayeredCacheRegistryBuilder::default();
|
||||
@@ -503,7 +504,7 @@ impl StartCommand {
|
||||
with_default_composite_cache_registry(
|
||||
layered_cache_builder.add_cache_registry(fundamental_cache_registry),
|
||||
)
|
||||
.context(BuildCacheRegistrySnafu)?
|
||||
.context(error::BuildCacheRegistrySnafu)?
|
||||
.build(),
|
||||
);
|
||||
|
||||
@@ -512,7 +513,7 @@ impl StartCommand {
|
||||
.with_cache_registry(layered_cache_registry.clone())
|
||||
.build()
|
||||
.await
|
||||
.context(StartDatanodeSnafu)?;
|
||||
.context(error::StartDatanodeSnafu)?;
|
||||
|
||||
let information_extension = Arc::new(StandaloneInformationExtension::new(
|
||||
datanode.region_server(),
|
||||
@@ -545,7 +546,7 @@ impl StartCommand {
|
||||
.build()
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
.context(error::OtherSnafu)?,
|
||||
);
|
||||
|
||||
// set the ref to query for the local flow state
|
||||
@@ -576,7 +577,7 @@ impl StartCommand {
|
||||
let kafka_options = opts.wal.clone().into();
|
||||
let wal_options_allocator = build_wal_options_allocator(&kafka_options, kv_backend.clone())
|
||||
.await
|
||||
.context(BuildWalOptionsAllocatorSnafu)?;
|
||||
.context(error::BuildWalOptionsAllocatorSnafu)?;
|
||||
let wal_options_allocator = Arc::new(wal_options_allocator);
|
||||
let table_meta_allocator = Arc::new(TableMetadataAllocator::new(
|
||||
table_id_sequence,
|
||||
@@ -597,8 +598,8 @@ impl StartCommand {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut frontend = FrontendBuilder::new(
|
||||
fe_opts,
|
||||
let fe_instance = FrontendBuilder::new(
|
||||
fe_opts.clone(),
|
||||
kv_backend.clone(),
|
||||
layered_cache_registry.clone(),
|
||||
catalog_manager.clone(),
|
||||
@@ -609,7 +610,8 @@ impl StartCommand {
|
||||
.with_plugin(plugins.clone())
|
||||
.try_build()
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
let fe_instance = Arc::new(fe_instance);
|
||||
|
||||
let flow_worker_manager = flownode.flow_worker_manager();
|
||||
// flow server need to be able to use frontend to write insert requests back
|
||||
@@ -622,18 +624,25 @@ impl StartCommand {
|
||||
node_manager,
|
||||
)
|
||||
.await
|
||||
.context(StartFlownodeSnafu)?;
|
||||
.context(error::StartFlownodeSnafu)?;
|
||||
flow_worker_manager.set_frontend_invoker(invoker).await;
|
||||
|
||||
let (tx, _rx) = broadcast::channel(1);
|
||||
|
||||
let servers = Services::new(opts, Arc::new(frontend.clone()), plugins)
|
||||
let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins))
|
||||
.context(error::ServersSnafu)?;
|
||||
|
||||
let servers = Services::new(opts, fe_instance.clone(), plugins)
|
||||
.build()
|
||||
.await
|
||||
.context(StartFrontendSnafu)?;
|
||||
frontend
|
||||
.build_servers(servers)
|
||||
.context(StartFrontendSnafu)?;
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
let frontend = Frontend {
|
||||
instance: fe_instance,
|
||||
servers,
|
||||
heartbeat_task: None,
|
||||
export_metrics_task,
|
||||
};
|
||||
|
||||
Ok(Instance {
|
||||
datanode,
|
||||
@@ -670,7 +679,7 @@ impl StartCommand {
|
||||
procedure_manager,
|
||||
true,
|
||||
)
|
||||
.context(InitDdlManagerSnafu)?,
|
||||
.context(error::InitDdlManagerSnafu)?,
|
||||
);
|
||||
|
||||
Ok(procedure_executor)
|
||||
@@ -684,7 +693,7 @@ impl StartCommand {
|
||||
table_metadata_manager
|
||||
.init()
|
||||
.await
|
||||
.context(InitMetadataSnafu)?;
|
||||
.context(error::InitMetadataSnafu)?;
|
||||
|
||||
Ok(table_metadata_manager)
|
||||
}
|
||||
|
||||
@@ -12,15 +12,19 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
mod add_region_follower;
|
||||
mod flush_compact_region;
|
||||
mod flush_compact_table;
|
||||
mod migrate_region;
|
||||
mod remove_region_follower;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use add_region_follower::AddRegionFollowerFunction;
|
||||
use flush_compact_region::{CompactRegionFunction, FlushRegionFunction};
|
||||
use flush_compact_table::{CompactTableFunction, FlushTableFunction};
|
||||
use migrate_region::MigrateRegionFunction;
|
||||
use remove_region_follower::RemoveRegionFollowerFunction;
|
||||
|
||||
use crate::flush_flow::FlushFlowFunction;
|
||||
use crate::function_registry::FunctionRegistry;
|
||||
@@ -32,6 +36,8 @@ impl AdminFunction {
|
||||
/// Register all table functions to [`FunctionRegistry`].
|
||||
pub fn register(registry: &FunctionRegistry) {
|
||||
registry.register_async(Arc::new(MigrateRegionFunction));
|
||||
registry.register_async(Arc::new(AddRegionFollowerFunction));
|
||||
registry.register_async(Arc::new(RemoveRegionFollowerFunction));
|
||||
registry.register_async(Arc::new(FlushRegionFunction));
|
||||
registry.register_async(Arc::new(CompactRegionFunction));
|
||||
registry.register_async(Arc::new(FlushTableFunction));
|
||||
|
||||
129
src/common/function/src/admin/add_region_follower.rs
Normal file
129
src/common/function/src/admin/add_region_follower.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_macro::admin_fn;
|
||||
use common_meta::rpc::procedure::AddRegionFollowerRequest;
|
||||
use common_query::error::{
|
||||
InvalidFuncArgsSnafu, MissingProcedureServiceHandlerSnafu, Result,
|
||||
UnsupportedInputDataTypeSnafu,
|
||||
};
|
||||
use common_query::prelude::{Signature, TypeSignature, Volatility};
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::value::{Value, ValueRef};
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ensure;
|
||||
|
||||
use crate::handlers::ProcedureServiceHandlerRef;
|
||||
use crate::helper::cast_u64;
|
||||
|
||||
/// A function to add a follower to a region.
|
||||
/// Only available in cluster mode.
|
||||
///
|
||||
/// - `add_region_follower(region_id, peer_id)`.
|
||||
///
|
||||
/// The parameters:
|
||||
/// - `region_id`: the region id
|
||||
/// - `peer_id`: the peer id
|
||||
#[admin_fn(
|
||||
name = AddRegionFollowerFunction,
|
||||
display_name = add_region_follower,
|
||||
sig_fn = signature,
|
||||
ret = uint64
|
||||
)]
|
||||
pub(crate) async fn add_region_follower(
|
||||
procedure_service_handler: &ProcedureServiceHandlerRef,
|
||||
_ctx: &QueryContextRef,
|
||||
params: &[ValueRef<'_>],
|
||||
) -> Result<Value> {
|
||||
ensure!(
|
||||
params.len() == 2,
|
||||
InvalidFuncArgsSnafu {
|
||||
err_msg: format!(
|
||||
"The length of the args is not correct, expect exactly 2, have: {}",
|
||||
params.len()
|
||||
),
|
||||
}
|
||||
);
|
||||
|
||||
let Some(region_id) = cast_u64(¶ms[0])? else {
|
||||
return UnsupportedInputDataTypeSnafu {
|
||||
function: "add_region_follower",
|
||||
datatypes: params.iter().map(|v| v.data_type()).collect::<Vec<_>>(),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
let Some(peer_id) = cast_u64(¶ms[1])? else {
|
||||
return UnsupportedInputDataTypeSnafu {
|
||||
function: "add_region_follower",
|
||||
datatypes: params.iter().map(|v| v.data_type()).collect::<Vec<_>>(),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
|
||||
procedure_service_handler
|
||||
.add_region_follower(AddRegionFollowerRequest { region_id, peer_id })
|
||||
.await?;
|
||||
|
||||
Ok(Value::from(0u64))
|
||||
}
|
||||
|
||||
fn signature() -> Signature {
|
||||
Signature::one_of(
|
||||
vec![
|
||||
// add_region_follower(region_id, peer)
|
||||
TypeSignature::Uniform(2, ConcreteDataType::numerics()),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_query::prelude::TypeSignature;
|
||||
use datatypes::vectors::{UInt64Vector, VectorRef};
|
||||
|
||||
use super::*;
|
||||
use crate::function::{AsyncFunction, FunctionContext};
|
||||
|
||||
#[test]
|
||||
fn test_add_region_follower_misc() {
|
||||
let f = AddRegionFollowerFunction;
|
||||
assert_eq!("add_region_follower", f.name());
|
||||
assert_eq!(
|
||||
ConcreteDataType::uint64_datatype(),
|
||||
f.return_type(&[]).unwrap()
|
||||
);
|
||||
assert!(matches!(f.signature(),
|
||||
Signature {
|
||||
type_signature: TypeSignature::OneOf(sigs),
|
||||
volatility: Volatility::Immutable
|
||||
} if sigs.len() == 1));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_add_region_follower() {
|
||||
let f = AddRegionFollowerFunction;
|
||||
let args = vec![1, 1];
|
||||
let args = args
|
||||
.into_iter()
|
||||
.map(|arg| Arc::new(UInt64Vector::from_slice([arg])) as _)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let result = f.eval(FunctionContext::mock(), &args).await.unwrap();
|
||||
let expect: VectorRef = Arc::new(UInt64Vector::from_slice([0u64]));
|
||||
assert_eq!(result, expect);
|
||||
}
|
||||
}
|
||||
129
src/common/function/src/admin/remove_region_follower.rs
Normal file
129
src/common/function/src/admin/remove_region_follower.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_macro::admin_fn;
|
||||
use common_meta::rpc::procedure::RemoveRegionFollowerRequest;
|
||||
use common_query::error::{
|
||||
InvalidFuncArgsSnafu, MissingProcedureServiceHandlerSnafu, Result,
|
||||
UnsupportedInputDataTypeSnafu,
|
||||
};
|
||||
use common_query::prelude::{Signature, TypeSignature, Volatility};
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::value::{Value, ValueRef};
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ensure;
|
||||
|
||||
use crate::handlers::ProcedureServiceHandlerRef;
|
||||
use crate::helper::cast_u64;
|
||||
|
||||
/// A function to remove a follower from a region.
|
||||
//// Only available in cluster mode.
|
||||
///
|
||||
/// - `remove_region_follower(region_id, peer_id)`.
|
||||
///
|
||||
/// The parameters:
|
||||
/// - `region_id`: the region id
|
||||
/// - `peer_id`: the peer id
|
||||
#[admin_fn(
|
||||
name = RemoveRegionFollowerFunction,
|
||||
display_name = remove_region_follower,
|
||||
sig_fn = signature,
|
||||
ret = uint64
|
||||
)]
|
||||
pub(crate) async fn remove_region_follower(
|
||||
procedure_service_handler: &ProcedureServiceHandlerRef,
|
||||
_ctx: &QueryContextRef,
|
||||
params: &[ValueRef<'_>],
|
||||
) -> Result<Value> {
|
||||
ensure!(
|
||||
params.len() == 2,
|
||||
InvalidFuncArgsSnafu {
|
||||
err_msg: format!(
|
||||
"The length of the args is not correct, expect exactly 2, have: {}",
|
||||
params.len()
|
||||
),
|
||||
}
|
||||
);
|
||||
|
||||
let Some(region_id) = cast_u64(¶ms[0])? else {
|
||||
return UnsupportedInputDataTypeSnafu {
|
||||
function: "add_region_follower",
|
||||
datatypes: params.iter().map(|v| v.data_type()).collect::<Vec<_>>(),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
let Some(peer_id) = cast_u64(¶ms[1])? else {
|
||||
return UnsupportedInputDataTypeSnafu {
|
||||
function: "add_region_follower",
|
||||
datatypes: params.iter().map(|v| v.data_type()).collect::<Vec<_>>(),
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
|
||||
procedure_service_handler
|
||||
.remove_region_follower(RemoveRegionFollowerRequest { region_id, peer_id })
|
||||
.await?;
|
||||
|
||||
Ok(Value::from(0u64))
|
||||
}
|
||||
|
||||
fn signature() -> Signature {
|
||||
Signature::one_of(
|
||||
vec![
|
||||
// remove_region_follower(region_id, peer_id)
|
||||
TypeSignature::Uniform(2, ConcreteDataType::numerics()),
|
||||
],
|
||||
Volatility::Immutable,
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_query::prelude::TypeSignature;
|
||||
use datatypes::vectors::{UInt64Vector, VectorRef};
|
||||
|
||||
use super::*;
|
||||
use crate::function::{AsyncFunction, FunctionContext};
|
||||
|
||||
#[test]
|
||||
fn test_remove_region_follower_misc() {
|
||||
let f = RemoveRegionFollowerFunction;
|
||||
assert_eq!("remove_region_follower", f.name());
|
||||
assert_eq!(
|
||||
ConcreteDataType::uint64_datatype(),
|
||||
f.return_type(&[]).unwrap()
|
||||
);
|
||||
assert!(matches!(f.signature(),
|
||||
Signature {
|
||||
type_signature: TypeSignature::OneOf(sigs),
|
||||
volatility: Volatility::Immutable
|
||||
} if sigs.len() == 1));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_remove_region_follower() {
|
||||
let f = RemoveRegionFollowerFunction;
|
||||
let args = vec![1, 1];
|
||||
let args = args
|
||||
.into_iter()
|
||||
.map(|arg| Arc::new(UInt64Vector::from_slice([arg])) as _)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let result = f.eval(FunctionContext::mock(), &args).await.unwrap();
|
||||
let expect: VectorRef = Arc::new(UInt64Vector::from_slice([0u64]));
|
||||
assert_eq!(result, expect);
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,10 @@ use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_base::AffectedRows;
|
||||
use common_meta::rpc::procedure::{MigrateRegionRequest, ProcedureStateResponse};
|
||||
use common_meta::rpc::procedure::{
|
||||
AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse,
|
||||
RemoveRegionFollowerRequest,
|
||||
};
|
||||
use common_query::error::Result;
|
||||
use common_query::Output;
|
||||
use session::context::QueryContextRef;
|
||||
@@ -63,6 +66,12 @@ pub trait ProcedureServiceHandler: Send + Sync {
|
||||
|
||||
/// Query the procedure' state by its id
|
||||
async fn query_procedure_state(&self, pid: &str) -> Result<ProcedureStateResponse>;
|
||||
|
||||
/// Add a region follower to a region.
|
||||
async fn add_region_follower(&self, request: AddRegionFollowerRequest) -> Result<()>;
|
||||
|
||||
/// Remove a region follower from a region.
|
||||
async fn remove_region_follower(&self, request: RemoveRegionFollowerRequest) -> Result<()>;
|
||||
}
|
||||
|
||||
/// This flow service handler is only use for flush flow for now.
|
||||
|
||||
@@ -35,7 +35,10 @@ impl FunctionState {
|
||||
use api::v1::meta::ProcedureStatus;
|
||||
use async_trait::async_trait;
|
||||
use common_base::AffectedRows;
|
||||
use common_meta::rpc::procedure::{MigrateRegionRequest, ProcedureStateResponse};
|
||||
use common_meta::rpc::procedure::{
|
||||
AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse,
|
||||
RemoveRegionFollowerRequest,
|
||||
};
|
||||
use common_query::error::Result;
|
||||
use common_query::Output;
|
||||
use session::context::QueryContextRef;
|
||||
@@ -66,6 +69,17 @@ impl FunctionState {
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
async fn add_region_follower(&self, _request: AddRegionFollowerRequest) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_region_follower(
|
||||
&self,
|
||||
_request: RemoveRegionFollowerRequest,
|
||||
) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -25,7 +25,7 @@ async fn do_bench_channel_manager() {
|
||||
let m_clone = m.clone();
|
||||
let join = tokio::spawn(async move {
|
||||
for _ in 0..10000 {
|
||||
let idx = rand::random::<usize>() % 100;
|
||||
let idx = rand::random::<u32>() % 100;
|
||||
let ret = m_clone.get(format!("{idx}"));
|
||||
let _ = ret.unwrap();
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ use crate::error::{
|
||||
DecodeJsonSnafu, EncodeJsonSnafu, Error, FromUtf8Snafu, InvalidNodeInfoKeySnafu,
|
||||
InvalidRoleSnafu, ParseNumSnafu, Result,
|
||||
};
|
||||
use crate::key::flow::flow_state::FlowStat;
|
||||
use crate::peer::Peer;
|
||||
|
||||
const CLUSTER_NODE_INFO_PREFIX: &str = "__meta_cluster_node_info";
|
||||
@@ -52,6 +53,9 @@ pub trait ClusterInfo {
|
||||
/// List all region stats in the cluster.
|
||||
async fn list_region_stats(&self) -> std::result::Result<Vec<RegionStat>, Self::Error>;
|
||||
|
||||
/// List all flow stats in the cluster.
|
||||
async fn list_flow_stats(&self) -> std::result::Result<Option<FlowStat>, Self::Error>;
|
||||
|
||||
// TODO(jeremy): Other info, like region status, etc.
|
||||
}
|
||||
|
||||
|
||||
@@ -22,14 +22,17 @@ use store_api::storage::{RegionId, RegionNumber, TableId};
|
||||
use crate::cache_invalidator::CacheInvalidatorRef;
|
||||
use crate::ddl::flow_meta::FlowMetadataAllocatorRef;
|
||||
use crate::ddl::table_meta::TableMetadataAllocatorRef;
|
||||
use crate::error::Result;
|
||||
use crate::error::{Result, UnsupportedSnafu};
|
||||
use crate::key::flow::FlowMetadataManagerRef;
|
||||
use crate::key::table_route::PhysicalTableRouteValue;
|
||||
use crate::key::TableMetadataManagerRef;
|
||||
use crate::node_manager::NodeManagerRef;
|
||||
use crate::region_keeper::MemoryRegionKeeperRef;
|
||||
use crate::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse};
|
||||
use crate::rpc::procedure::{MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse};
|
||||
use crate::rpc::procedure::{
|
||||
AddRegionFollowerRequest, MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
|
||||
RemoveRegionFollowerRequest,
|
||||
};
|
||||
use crate::DatanodeId;
|
||||
|
||||
pub mod alter_database;
|
||||
@@ -70,6 +73,30 @@ pub trait ProcedureExecutor: Send + Sync {
|
||||
request: SubmitDdlTaskRequest,
|
||||
) -> Result<SubmitDdlTaskResponse>;
|
||||
|
||||
/// Add a region follower
|
||||
async fn add_region_follower(
|
||||
&self,
|
||||
_ctx: &ExecutorContext,
|
||||
_request: AddRegionFollowerRequest,
|
||||
) -> Result<()> {
|
||||
UnsupportedSnafu {
|
||||
operation: "add_region_follower",
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
|
||||
/// Remove a region follower
|
||||
async fn remove_region_follower(
|
||||
&self,
|
||||
_ctx: &ExecutorContext,
|
||||
_request: RemoveRegionFollowerRequest,
|
||||
) -> Result<()> {
|
||||
UnsupportedSnafu {
|
||||
operation: "remove_region_follower",
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
|
||||
/// Submit a region migration task
|
||||
async fn migrate_region(
|
||||
&self,
|
||||
|
||||
@@ -98,13 +98,14 @@ impl TableMetadataAllocator {
|
||||
fn create_wal_options(
|
||||
&self,
|
||||
table_route: &PhysicalTableRouteValue,
|
||||
skip_wal: bool,
|
||||
) -> Result<HashMap<RegionNumber, String>> {
|
||||
let region_numbers = table_route
|
||||
.region_routes
|
||||
.iter()
|
||||
.map(|route| route.region.id.region_number())
|
||||
.collect();
|
||||
allocate_region_wal_options(region_numbers, &self.wal_options_allocator)
|
||||
allocate_region_wal_options(region_numbers, &self.wal_options_allocator, skip_wal)
|
||||
}
|
||||
|
||||
async fn create_table_route(
|
||||
@@ -158,7 +159,9 @@ impl TableMetadataAllocator {
|
||||
pub async fn create(&self, task: &CreateTableTask) -> Result<TableMetadata> {
|
||||
let table_id = self.allocate_table_id(&task.create_table.table_id).await?;
|
||||
let table_route = self.create_table_route(table_id, task).await?;
|
||||
let region_wal_options = self.create_wal_options(&table_route)?;
|
||||
|
||||
let region_wal_options =
|
||||
self.create_wal_options(&table_route, task.table_info.meta.options.skip_wal)?;
|
||||
|
||||
debug!(
|
||||
"Allocated region wal options {:?} for table {}",
|
||||
|
||||
@@ -1471,7 +1471,8 @@ mod tests {
|
||||
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
|
||||
let wal_allocator = WalOptionsAllocator::RaftEngine;
|
||||
let regions = (0..16).collect();
|
||||
let region_wal_options = allocate_region_wal_options(regions, &wal_allocator).unwrap();
|
||||
let region_wal_options =
|
||||
allocate_region_wal_options(regions, &wal_allocator, false).unwrap();
|
||||
create_physical_table_metadata(
|
||||
&table_metadata_manager,
|
||||
table_info.clone(),
|
||||
|
||||
@@ -224,6 +224,7 @@ impl TopicRegionManager {
|
||||
Some((region_id, kafka.topic.as_str()))
|
||||
}
|
||||
Some(WalOptions::RaftEngine) => None,
|
||||
Some(WalOptions::Noop) => None,
|
||||
None => None,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -1240,6 +1240,7 @@ impl From<QueryContext> for PbQueryContext {
|
||||
extensions,
|
||||
channel: channel as u32,
|
||||
snapshot_seqs: None,
|
||||
explain: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -290,13 +290,13 @@ mod tests {
|
||||
num_per_range: u32,
|
||||
max_bytes: u32,
|
||||
) {
|
||||
let num_cases = rand::thread_rng().gen_range(1..=8);
|
||||
let num_cases = rand::rng().random_range(1..=8);
|
||||
common_telemetry::info!("num_cases: {}", num_cases);
|
||||
let mut cases = Vec::with_capacity(num_cases);
|
||||
for i in 0..num_cases {
|
||||
let size = rand::thread_rng().gen_range(size_limit..=max_bytes);
|
||||
let size = rand::rng().random_range(size_limit..=max_bytes);
|
||||
let mut large_value = vec![0u8; size as usize];
|
||||
rand::thread_rng().fill_bytes(&mut large_value);
|
||||
rand::rng().fill_bytes(&mut large_value);
|
||||
|
||||
// Starts from `a`.
|
||||
let prefix = format!("{}/", std::char::from_u32(97 + i as u32).unwrap());
|
||||
@@ -354,8 +354,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_meta_state_store_split_value() {
|
||||
let size_limit = rand::thread_rng().gen_range(128..=512);
|
||||
let page_size = rand::thread_rng().gen_range(1..10);
|
||||
let size_limit = rand::rng().random_range(128..=512);
|
||||
let page_size = rand::rng().random_range(1..10);
|
||||
let kv_backend = Arc::new(MemoryKvBackend::new());
|
||||
test_meta_state_store_split_value_with_size_limit(kv_backend, size_limit, page_size, 8192)
|
||||
.await;
|
||||
@@ -388,7 +388,7 @@ mod tests {
|
||||
// However, some KvBackends, the `ChrootKvBackend`, will add the prefix to `key`;
|
||||
// we don't know the exact size of the key.
|
||||
let size_limit = 1536 * 1024 - key_size;
|
||||
let page_size = rand::thread_rng().gen_range(1..10);
|
||||
let page_size = rand::rng().random_range(1..10);
|
||||
test_meta_state_store_split_value_with_size_limit(
|
||||
kv_backend,
|
||||
size_limit,
|
||||
|
||||
@@ -53,21 +53,12 @@ impl WalOptionsAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocates a wal options for a region.
|
||||
pub fn alloc(&self) -> Result<WalOptions> {
|
||||
match self {
|
||||
Self::RaftEngine => Ok(WalOptions::RaftEngine),
|
||||
Self::Kafka(topic_manager) => {
|
||||
let topic = topic_manager.select()?;
|
||||
Ok(WalOptions::Kafka(KafkaWalOptions {
|
||||
topic: topic.clone(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Allocates a batch of wal options where each wal options goes to a region.
|
||||
pub fn alloc_batch(&self, num_regions: usize) -> Result<Vec<WalOptions>> {
|
||||
/// If skip_wal is true, the wal options will be set to Noop regardless of the allocator type.
|
||||
pub fn alloc_batch(&self, num_regions: usize, skip_wal: bool) -> Result<Vec<WalOptions>> {
|
||||
if skip_wal {
|
||||
return Ok(vec![WalOptions::Noop; num_regions]);
|
||||
}
|
||||
match self {
|
||||
WalOptionsAllocator::RaftEngine => Ok(vec![WalOptions::RaftEngine; num_regions]),
|
||||
WalOptionsAllocator::Kafka(topic_manager) => {
|
||||
@@ -130,9 +121,10 @@ pub async fn build_wal_options_allocator(
|
||||
pub fn allocate_region_wal_options(
|
||||
regions: Vec<RegionNumber>,
|
||||
wal_options_allocator: &WalOptionsAllocator,
|
||||
skip_wal: bool,
|
||||
) -> Result<HashMap<RegionNumber, String>> {
|
||||
let wal_options = wal_options_allocator
|
||||
.alloc_batch(regions.len())?
|
||||
.alloc_batch(regions.len(), skip_wal)?
|
||||
.into_iter()
|
||||
.map(|wal_options| {
|
||||
serde_json::to_string(&wal_options).context(EncodeWalOptionsSnafu { wal_options })
|
||||
@@ -177,7 +169,7 @@ mod tests {
|
||||
|
||||
let num_regions = 32;
|
||||
let regions = (0..num_regions).collect::<Vec<_>>();
|
||||
let got = allocate_region_wal_options(regions.clone(), &allocator).unwrap();
|
||||
let got = allocate_region_wal_options(regions.clone(), &allocator, false).unwrap();
|
||||
|
||||
let encoded_wal_options = serde_json::to_string(&WalOptions::RaftEngine).unwrap();
|
||||
let expected = regions
|
||||
@@ -237,7 +229,7 @@ mod tests {
|
||||
|
||||
let num_regions = 32;
|
||||
let regions = (0..num_regions).collect::<Vec<_>>();
|
||||
let got = allocate_region_wal_options(regions.clone(), &allocator).unwrap();
|
||||
let got = allocate_region_wal_options(regions.clone(), &allocator, false).unwrap();
|
||||
|
||||
// Check the allocated wal options contain the expected topics.
|
||||
let expected = (0..num_regions)
|
||||
@@ -253,4 +245,18 @@ mod tests {
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_allocator_with_skip_wal() {
|
||||
let allocator = WalOptionsAllocator::RaftEngine;
|
||||
allocator.start().await.unwrap();
|
||||
|
||||
let num_regions = 32;
|
||||
let regions = (0..num_regions).collect::<Vec<_>>();
|
||||
let got = allocate_region_wal_options(regions.clone(), &allocator, true).unwrap();
|
||||
assert_eq!(got.len(), num_regions as usize);
|
||||
for wal_options in got.values() {
|
||||
assert_eq!(wal_options, &"{\"wal.provider\":\"noop\"}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,7 +39,7 @@ impl RoundRobinTopicSelector {
|
||||
// The cursor in the round-robin selector is not persisted which may break the round-robin strategy cross crashes.
|
||||
// Introducing a shuffling strategy may help mitigate this issue.
|
||||
pub fn with_shuffle() -> Self {
|
||||
let offset = rand::thread_rng().gen_range(0..64);
|
||||
let offset = rand::rng().random_range(0..64);
|
||||
Self {
|
||||
cursor: AtomicUsize::new(offset),
|
||||
}
|
||||
|
||||
@@ -207,7 +207,7 @@ impl Runner {
|
||||
if let Some(d) = retry.next() {
|
||||
let millis = d.as_millis() as u64;
|
||||
// Add random noise to the retry delay to avoid retry storms.
|
||||
let noise = rand::thread_rng().gen_range(0..(millis / 4) + 1);
|
||||
let noise = rand::rng().random_range(0..(millis / 4) + 1);
|
||||
let d = d.add(Duration::from_millis(noise));
|
||||
|
||||
self.wait_on_err(d, retry_times).await;
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::Display;
|
||||
use std::fmt::{self, Display};
|
||||
use std::future::Future;
|
||||
use std::marker::PhantomData;
|
||||
use std::pin::Pin;
|
||||
@@ -28,7 +28,7 @@ use datafusion::logical_expr::Expr;
|
||||
use datafusion::physical_expr::create_physical_expr;
|
||||
use datafusion::physical_plan::metrics::{BaselineMetrics, MetricValue};
|
||||
use datafusion::physical_plan::{
|
||||
accept, displayable, ExecutionPlan, ExecutionPlanVisitor, PhysicalExpr,
|
||||
accept, DisplayFormatType, ExecutionPlan, ExecutionPlanVisitor, PhysicalExpr,
|
||||
RecordBatchStream as DfRecordBatchStream,
|
||||
};
|
||||
use datafusion_common::arrow::error::ArrowError;
|
||||
@@ -206,13 +206,16 @@ impl Stream for DfRecordBatchStreamAdapter {
|
||||
}
|
||||
|
||||
/// DataFusion [SendableRecordBatchStream](DfSendableRecordBatchStream) -> Greptime [RecordBatchStream].
|
||||
/// The reverse one is [DfRecordBatchStreamAdapter]
|
||||
/// The reverse one is [DfRecordBatchStreamAdapter].
|
||||
/// It can collect metrics from DataFusion execution plan.
|
||||
pub struct RecordBatchStreamAdapter {
|
||||
schema: SchemaRef,
|
||||
stream: DfSendableRecordBatchStream,
|
||||
metrics: Option<BaselineMetrics>,
|
||||
/// Aggregated plan-level metrics. Resolved after an [ExecutionPlan] is finished.
|
||||
metrics_2: Metrics,
|
||||
/// Display plan and metrics in verbose mode.
|
||||
explain_verbose: bool,
|
||||
}
|
||||
|
||||
/// Json encoded metrics. Contains metric from a whole plan tree.
|
||||
@@ -231,6 +234,7 @@ impl RecordBatchStreamAdapter {
|
||||
stream,
|
||||
metrics: None,
|
||||
metrics_2: Metrics::Unavailable,
|
||||
explain_verbose: false,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -246,12 +250,18 @@ impl RecordBatchStreamAdapter {
|
||||
stream,
|
||||
metrics: Some(metrics),
|
||||
metrics_2: Metrics::Unresolved(df_plan),
|
||||
explain_verbose: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn set_metrics2(&mut self, plan: Arc<dyn ExecutionPlan>) {
|
||||
self.metrics_2 = Metrics::Unresolved(plan)
|
||||
}
|
||||
|
||||
/// Set the verbose mode for displaying plan and metrics.
|
||||
pub fn set_explain_verbose(&mut self, verbose: bool) {
|
||||
self.explain_verbose = verbose;
|
||||
}
|
||||
}
|
||||
|
||||
impl RecordBatchStream for RecordBatchStreamAdapter {
|
||||
@@ -296,7 +306,7 @@ impl Stream for RecordBatchStreamAdapter {
|
||||
}
|
||||
Poll::Ready(None) => {
|
||||
if let Metrics::Unresolved(df_plan) = &self.metrics_2 {
|
||||
let mut metric_collector = MetricCollector::default();
|
||||
let mut metric_collector = MetricCollector::new(self.explain_verbose);
|
||||
accept(df_plan.as_ref(), &mut metric_collector).unwrap();
|
||||
self.metrics_2 = Metrics::Resolved(metric_collector.record_batch_metrics);
|
||||
}
|
||||
@@ -312,10 +322,20 @@ impl Stream for RecordBatchStreamAdapter {
|
||||
}
|
||||
|
||||
/// An [ExecutionPlanVisitor] to collect metrics from a [ExecutionPlan].
|
||||
#[derive(Default)]
|
||||
pub struct MetricCollector {
|
||||
current_level: usize,
|
||||
pub record_batch_metrics: RecordBatchMetrics,
|
||||
verbose: bool,
|
||||
}
|
||||
|
||||
impl MetricCollector {
|
||||
pub fn new(verbose: bool) -> Self {
|
||||
Self {
|
||||
current_level: 0,
|
||||
record_batch_metrics: RecordBatchMetrics::default(),
|
||||
verbose,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ExecutionPlanVisitor for MetricCollector {
|
||||
@@ -339,7 +359,7 @@ impl ExecutionPlanVisitor for MetricCollector {
|
||||
.sorted_for_display()
|
||||
.timestamps_removed();
|
||||
let mut plan_metric = PlanMetrics {
|
||||
plan: displayable(plan).one_line().to_string(),
|
||||
plan: one_line(plan, self.verbose).to_string(),
|
||||
level: self.current_level,
|
||||
metrics: Vec::with_capacity(metric.iter().size_hint().0),
|
||||
};
|
||||
@@ -371,6 +391,29 @@ impl ExecutionPlanVisitor for MetricCollector {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a single-line summary of the root of the plan.
|
||||
/// If the `verbose` flag is set, it will display detailed information about the plan.
|
||||
fn one_line(plan: &dyn ExecutionPlan, verbose: bool) -> impl fmt::Display + '_ {
|
||||
struct Wrapper<'a> {
|
||||
plan: &'a dyn ExecutionPlan,
|
||||
format_type: DisplayFormatType,
|
||||
}
|
||||
|
||||
impl fmt::Display for Wrapper<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.plan.fmt_as(self.format_type, f)?;
|
||||
writeln!(f)
|
||||
}
|
||||
}
|
||||
|
||||
let format_type = if verbose {
|
||||
DisplayFormatType::Verbose
|
||||
} else {
|
||||
DisplayFormatType::Default
|
||||
};
|
||||
Wrapper { plan, format_type }
|
||||
}
|
||||
|
||||
/// [`RecordBatchMetrics`] carrys metrics value
|
||||
/// from datanode to frontend through gRPC
|
||||
#[derive(serde::Serialize, serde::Deserialize, Default, Debug, Clone)]
|
||||
|
||||
@@ -22,10 +22,12 @@ use datafusion::physical_plan::PhysicalExpr;
|
||||
use datafusion_common::arrow::array::{ArrayRef, Datum, Scalar};
|
||||
use datafusion_common::arrow::buffer::BooleanBuffer;
|
||||
use datafusion_common::arrow::compute::kernels::cmp;
|
||||
use datafusion_common::cast::{as_boolean_array, as_null_array};
|
||||
use datafusion_common::cast::{as_boolean_array, as_null_array, as_string_array};
|
||||
use datafusion_common::{internal_err, DataFusionError, ScalarValue};
|
||||
use datatypes::arrow::array::{Array, BooleanArray, RecordBatch};
|
||||
use datatypes::arrow::compute::filter_record_batch;
|
||||
use datatypes::arrow::error::ArrowError;
|
||||
use datatypes::compute::kernels::regexp;
|
||||
use datatypes::compute::or_kleene;
|
||||
use datatypes::vectors::VectorRef;
|
||||
use snafu::ResultExt;
|
||||
@@ -36,7 +38,8 @@ use crate::error::{ArrowComputeSnafu, Result, ToArrowScalarSnafu, UnsupportedOpe
|
||||
/// - `col` `op` `literal`
|
||||
/// - `literal` `op` `col`
|
||||
///
|
||||
/// And the `op` is one of `=`, `!=`, `>`, `>=`, `<`, `<=`.
|
||||
/// And the `op` is one of `=`, `!=`, `>`, `>=`, `<`, `<=`,
|
||||
/// or regex operators: `~`, `~*`, `!~`, `!~*`.
|
||||
///
|
||||
/// This struct contains normalized predicate expr. In the form of
|
||||
/// `col` `op` `literal` where the `col` is provided from input.
|
||||
@@ -86,7 +89,11 @@ impl SimpleFilterEvaluator {
|
||||
| Operator::Lt
|
||||
| Operator::LtEq
|
||||
| Operator::Gt
|
||||
| Operator::GtEq => {}
|
||||
| Operator::GtEq
|
||||
| Operator::RegexMatch
|
||||
| Operator::RegexIMatch
|
||||
| Operator::RegexNotMatch
|
||||
| Operator::RegexNotIMatch => {}
|
||||
Operator::Or => {
|
||||
let lhs = Self::try_new(&binary.left)?;
|
||||
let rhs = Self::try_new(&binary.right)?;
|
||||
@@ -172,6 +179,10 @@ impl SimpleFilterEvaluator {
|
||||
Operator::LtEq => cmp::lt_eq(input, &self.literal),
|
||||
Operator::Gt => cmp::gt(input, &self.literal),
|
||||
Operator::GtEq => cmp::gt_eq(input, &self.literal),
|
||||
Operator::RegexMatch => self.regex_match(input, false, false),
|
||||
Operator::RegexIMatch => self.regex_match(input, true, false),
|
||||
Operator::RegexNotMatch => self.regex_match(input, false, true),
|
||||
Operator::RegexNotIMatch => self.regex_match(input, true, true),
|
||||
Operator::Or => {
|
||||
// OR operator stands for OR-chained EQs (or INLIST in other words)
|
||||
let mut result: BooleanArray = vec![false; input_len].into();
|
||||
@@ -192,6 +203,28 @@ impl SimpleFilterEvaluator {
|
||||
.context(ArrowComputeSnafu)
|
||||
.map(|array| array.values().clone())
|
||||
}
|
||||
|
||||
fn regex_match(
|
||||
&self,
|
||||
input: &impl Datum,
|
||||
ignore_case: bool,
|
||||
negative: bool,
|
||||
) -> std::result::Result<BooleanArray, ArrowError> {
|
||||
let flag = if ignore_case { Some("i") } else { None };
|
||||
let array = input.get().0;
|
||||
let string_array = as_string_array(array).map_err(|_| {
|
||||
ArrowError::CastError(format!("Cannot cast {:?} to StringArray", array))
|
||||
})?;
|
||||
let literal_array = self.literal.clone().into_inner();
|
||||
let regex_array = as_string_array(&literal_array).map_err(|_| {
|
||||
ArrowError::CastError(format!("Cannot cast {:?} to StringArray", literal_array))
|
||||
})?;
|
||||
let mut result = regexp::regexp_is_match_scalar(string_array, regex_array.value(0), flag)?;
|
||||
if negative {
|
||||
result = datatypes::compute::not(&result)?;
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate the predicate on the input [RecordBatch], and return a new [RecordBatch].
|
||||
|
||||
@@ -22,6 +22,6 @@ static PORTS: OnceCell<AtomicUsize> = OnceCell::new();
|
||||
/// Return a unique port(in runtime) for test
|
||||
pub fn get_port() -> usize {
|
||||
PORTS
|
||||
.get_or_init(|| AtomicUsize::new(rand::thread_rng().gen_range(13000..13800)))
|
||||
.get_or_init(|| AtomicUsize::new(rand::rng().random_range(13000..13800)))
|
||||
.fetch_add(1, Ordering::Relaxed)
|
||||
}
|
||||
|
||||
@@ -715,10 +715,10 @@ mod tests {
|
||||
TimeUnit::Microsecond,
|
||||
TimeUnit::Nanosecond,
|
||||
];
|
||||
let mut rng = rand::thread_rng();
|
||||
let unit_idx: usize = rng.gen_range(0..4);
|
||||
let mut rng = rand::rng();
|
||||
let unit_idx: usize = rng.random_range(0..4);
|
||||
let unit = units[unit_idx];
|
||||
let value: i64 = rng.gen();
|
||||
let value: i64 = rng.random();
|
||||
Timestamp::new(value, unit)
|
||||
}
|
||||
|
||||
@@ -745,8 +745,8 @@ mod tests {
|
||||
|
||||
/// Generate timestamp less than or equal to `threshold`
|
||||
fn gen_ts_le(threshold: &Timestamp) -> Timestamp {
|
||||
let mut rng = rand::thread_rng();
|
||||
let timestamp = rng.gen_range(i64::MIN..=threshold.value);
|
||||
let mut rng = rand::rng();
|
||||
let timestamp = rng.random_range(i64::MIN..=threshold.value);
|
||||
Timestamp::new(timestamp, threshold.unit)
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ pub enum WalOptions {
|
||||
RaftEngine,
|
||||
#[serde(with = "kafka_prefix")]
|
||||
Kafka(KafkaWalOptions),
|
||||
Noop,
|
||||
}
|
||||
|
||||
with_prefix!(kafka_prefix "wal.kafka.");
|
||||
@@ -62,5 +63,14 @@ mod tests {
|
||||
|
||||
let decoded: WalOptions = serde_json::from_str(&encoded).unwrap();
|
||||
assert_eq!(decoded, wal_options);
|
||||
|
||||
// Test serde noop wal options.
|
||||
let wal_options = WalOptions::Noop;
|
||||
let encoded = serde_json::to_string(&wal_options).unwrap();
|
||||
let expected = r#"{"wal.provider":"noop"}"#;
|
||||
assert_eq!(&encoded, expected);
|
||||
|
||||
let decoded: WalOptions = serde_json::from_str(&encoded).unwrap();
|
||||
assert_eq!(decoded, wal_options);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@ mod boolean;
|
||||
mod constant;
|
||||
mod date;
|
||||
mod decimal;
|
||||
mod dictionary;
|
||||
mod duration;
|
||||
mod eq;
|
||||
mod helper;
|
||||
@@ -48,6 +49,7 @@ pub use boolean::{BooleanVector, BooleanVectorBuilder};
|
||||
pub use constant::ConstantVector;
|
||||
pub use date::{DateVector, DateVectorBuilder};
|
||||
pub use decimal::{Decimal128Vector, Decimal128VectorBuilder};
|
||||
pub use dictionary::{DictionaryIter, DictionaryVector};
|
||||
pub use duration::{
|
||||
DurationMicrosecondVector, DurationMicrosecondVectorBuilder, DurationMillisecondVector,
|
||||
DurationMillisecondVectorBuilder, DurationNanosecondVector, DurationNanosecondVectorBuilder,
|
||||
|
||||
438
src/datatypes/src/vectors/dictionary.rs
Normal file
438
src/datatypes/src/vectors/dictionary.rs
Normal file
@@ -0,0 +1,438 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::Array;
|
||||
use arrow::datatypes::Int32Type;
|
||||
use arrow_array::{ArrayRef, DictionaryArray, Int32Array};
|
||||
use serde_json::Value as JsonValue;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use super::operations::VectorOp;
|
||||
use crate::data_type::ConcreteDataType;
|
||||
use crate::error::{self, Result};
|
||||
use crate::serialize::Serializable;
|
||||
use crate::types::DictionaryType;
|
||||
use crate::value::{Value, ValueRef};
|
||||
use crate::vectors::{self, Helper, Validity, Vector, VectorRef};
|
||||
|
||||
/// Vector of dictionaries, basically backed by Arrow's `DictionaryArray`.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct DictionaryVector {
|
||||
array: DictionaryArray<Int32Type>,
|
||||
/// The datatype of the items in the dictionary.
|
||||
item_type: ConcreteDataType,
|
||||
/// The vector of items in the dictionary.
|
||||
item_vector: VectorRef,
|
||||
}
|
||||
|
||||
impl DictionaryVector {
|
||||
/// Create a new instance of `DictionaryVector` from a dictionary array and item type
|
||||
pub fn new(array: DictionaryArray<Int32Type>, item_type: ConcreteDataType) -> Result<Self> {
|
||||
let item_vector = Helper::try_into_vector(array.values())?;
|
||||
|
||||
Ok(Self {
|
||||
array,
|
||||
item_type,
|
||||
item_vector,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the underlying Arrow dictionary array
|
||||
pub fn array(&self) -> &DictionaryArray<Int32Type> {
|
||||
&self.array
|
||||
}
|
||||
|
||||
/// Returns the keys array of this dictionary
|
||||
pub fn keys(&self) -> &arrow_array::PrimitiveArray<Int32Type> {
|
||||
self.array.keys()
|
||||
}
|
||||
|
||||
/// Returns the values array of this dictionary
|
||||
pub fn values(&self) -> &ArrayRef {
|
||||
self.array.values()
|
||||
}
|
||||
|
||||
pub fn as_arrow(&self) -> &dyn Array {
|
||||
&self.array
|
||||
}
|
||||
}
|
||||
|
||||
impl Vector for DictionaryVector {
|
||||
fn data_type(&self) -> ConcreteDataType {
|
||||
ConcreteDataType::Dictionary(DictionaryType::new(
|
||||
ConcreteDataType::int32_datatype(),
|
||||
self.item_type.clone(),
|
||||
))
|
||||
}
|
||||
|
||||
fn vector_type_name(&self) -> String {
|
||||
"DictionaryVector".to_string()
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.array.len()
|
||||
}
|
||||
|
||||
fn to_arrow_array(&self) -> ArrayRef {
|
||||
Arc::new(self.array.clone())
|
||||
}
|
||||
|
||||
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
|
||||
Box::new(self.array.clone())
|
||||
}
|
||||
|
||||
fn validity(&self) -> Validity {
|
||||
vectors::impl_validity_for_vector!(self.array)
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
self.array.get_buffer_memory_size()
|
||||
}
|
||||
|
||||
fn null_count(&self) -> usize {
|
||||
self.array.null_count()
|
||||
}
|
||||
|
||||
fn is_null(&self, row: usize) -> bool {
|
||||
self.array.is_null(row)
|
||||
}
|
||||
|
||||
fn slice(&self, offset: usize, length: usize) -> VectorRef {
|
||||
Arc::new(Self {
|
||||
array: self.array.slice(offset, length),
|
||||
item_type: self.item_type.clone(),
|
||||
item_vector: self.item_vector.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn get(&self, index: usize) -> Value {
|
||||
if !self.array.is_valid(index) {
|
||||
return Value::Null;
|
||||
}
|
||||
|
||||
let key = self.array.keys().value(index);
|
||||
self.item_vector.get(key as usize)
|
||||
}
|
||||
|
||||
fn get_ref(&self, index: usize) -> ValueRef {
|
||||
if !self.array.is_valid(index) {
|
||||
return ValueRef::Null;
|
||||
}
|
||||
|
||||
let key = self.array.keys().value(index);
|
||||
self.item_vector.get_ref(key as usize)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serializable for DictionaryVector {
|
||||
fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
|
||||
// Convert the dictionary array to JSON, where each element is either null or
|
||||
// the value it refers to in the dictionary
|
||||
let mut result = Vec::with_capacity(self.len());
|
||||
|
||||
for i in 0..self.len() {
|
||||
if self.is_null(i) {
|
||||
result.push(JsonValue::Null);
|
||||
} else {
|
||||
let key = self.array.keys().value(i);
|
||||
let value = self.item_vector.get(key as usize);
|
||||
let json_value = serde_json::to_value(value).context(error::SerializeSnafu)?;
|
||||
result.push(json_value);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<DictionaryArray<Int32Type>> for DictionaryVector {
|
||||
type Error = crate::error::Error;
|
||||
|
||||
fn try_from(array: DictionaryArray<Int32Type>) -> Result<Self> {
|
||||
let item_type = ConcreteDataType::from_arrow_type(array.values().data_type());
|
||||
let item_vector = Helper::try_into_vector(array.values())?;
|
||||
|
||||
Ok(Self {
|
||||
array,
|
||||
item_type,
|
||||
item_vector,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DictionaryIter<'a> {
|
||||
vector: &'a DictionaryVector,
|
||||
idx: usize,
|
||||
}
|
||||
|
||||
impl<'a> DictionaryIter<'a> {
|
||||
pub fn new(vector: &'a DictionaryVector) -> DictionaryIter<'a> {
|
||||
DictionaryIter { vector, idx: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for DictionaryIter<'a> {
|
||||
type Item = Option<ValueRef<'a>>;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.idx >= self.vector.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let idx = self.idx;
|
||||
self.idx += 1;
|
||||
|
||||
if self.vector.is_null(idx) {
|
||||
return Some(None);
|
||||
}
|
||||
|
||||
Some(Some(self.vector.item_vector.get_ref(self.idx)))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
(
|
||||
self.vector.len() - self.idx,
|
||||
Some(self.vector.len() - self.idx),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl VectorOp for DictionaryVector {
|
||||
fn replicate(&self, offsets: &[usize]) -> VectorRef {
|
||||
let keys = self.array.keys();
|
||||
let mut replicated_keys = Vec::with_capacity(offsets.len());
|
||||
|
||||
let mut previous_offset = 0;
|
||||
for (i, &offset) in offsets.iter().enumerate() {
|
||||
let key = if i < self.len() {
|
||||
if keys.is_valid(i) {
|
||||
Some(keys.value(i))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// repeat this key (offset - previous_offset) times
|
||||
let repeat_count = offset - previous_offset;
|
||||
if repeat_count > 0 {
|
||||
replicated_keys.resize(replicated_keys.len() + repeat_count, key);
|
||||
}
|
||||
|
||||
previous_offset = offset;
|
||||
}
|
||||
|
||||
let new_keys = Int32Array::from(replicated_keys);
|
||||
let new_array = DictionaryArray::try_new(new_keys, self.values().clone())
|
||||
.expect("Failed to create replicated dictionary array");
|
||||
|
||||
Arc::new(Self {
|
||||
array: new_array,
|
||||
item_type: self.item_type.clone(),
|
||||
item_vector: self.item_vector.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &vectors::BooleanVector) -> Result<VectorRef> {
|
||||
let key_array: ArrayRef = Arc::new(self.array.keys().clone());
|
||||
let key_vector = Helper::try_into_vector(&key_array)?;
|
||||
let filtered_key_vector = key_vector.filter(filter)?;
|
||||
let filtered_key_array = filtered_key_vector.to_arrow_array();
|
||||
let filtered_key_array = filtered_key_array
|
||||
.as_any()
|
||||
.downcast_ref::<Int32Array>()
|
||||
.unwrap();
|
||||
|
||||
let new_array = DictionaryArray::try_new(filtered_key_array.clone(), self.values().clone())
|
||||
.expect("Failed to create filtered dictionary array");
|
||||
|
||||
Ok(Arc::new(Self {
|
||||
array: new_array,
|
||||
item_type: self.item_type.clone(),
|
||||
item_vector: self.item_vector.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
|
||||
let new_items = self.item_vector.cast(to_type)?;
|
||||
let new_array =
|
||||
DictionaryArray::try_new(self.array.keys().clone(), new_items.to_arrow_array())
|
||||
.expect("Failed to create casted dictionary array");
|
||||
Ok(Arc::new(Self {
|
||||
array: new_array,
|
||||
item_type: to_type.clone(),
|
||||
item_vector: self.item_vector.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
fn take(&self, indices: &vectors::UInt32Vector) -> Result<VectorRef> {
|
||||
let key_array: ArrayRef = Arc::new(self.array.keys().clone());
|
||||
let key_vector = Helper::try_into_vector(&key_array)?;
|
||||
let new_key_vector = key_vector.take(indices)?;
|
||||
let new_key_array = new_key_vector.to_arrow_array();
|
||||
let new_key_array = new_key_array.as_any().downcast_ref::<Int32Array>().unwrap();
|
||||
|
||||
let new_array = DictionaryArray::try_new(new_key_array.clone(), self.values().clone())
|
||||
.expect("Failed to create filtered dictionary array");
|
||||
|
||||
Ok(Arc::new(Self {
|
||||
array: new_array,
|
||||
item_type: self.item_type.clone(),
|
||||
item_vector: self.item_vector.clone(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::StringArray;
|
||||
|
||||
use super::*;
|
||||
|
||||
// Helper function to create a test dictionary vector with string values
|
||||
fn create_test_dictionary() -> DictionaryVector {
|
||||
// Dictionary values: ["a", "b", "c", "d"]
|
||||
// Keys: [0, 1, 2, null, 1, 3]
|
||||
// Resulting in: ["a", "b", "c", null, "b", "d"]
|
||||
let values = StringArray::from(vec!["a", "b", "c", "d"]);
|
||||
let keys = Int32Array::from(vec![Some(0), Some(1), Some(2), None, Some(1), Some(3)]);
|
||||
let dict_array = DictionaryArray::new(keys, Arc::new(values));
|
||||
DictionaryVector::try_from(dict_array).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dictionary_vector_basics() {
|
||||
let dict_vec = create_test_dictionary();
|
||||
|
||||
// Test length and null count
|
||||
assert_eq!(dict_vec.len(), 6);
|
||||
assert_eq!(dict_vec.null_count(), 1);
|
||||
|
||||
// Test data type
|
||||
let data_type = dict_vec.data_type();
|
||||
if let ConcreteDataType::Dictionary(dict_type) = data_type {
|
||||
assert_eq!(*dict_type.value_type(), ConcreteDataType::string_datatype());
|
||||
} else {
|
||||
panic!("Expected Dictionary data type");
|
||||
}
|
||||
|
||||
// Test is_null
|
||||
assert!(!dict_vec.is_null(0));
|
||||
assert!(dict_vec.is_null(3));
|
||||
|
||||
// Test get values
|
||||
assert_eq!(dict_vec.get(0), Value::String("a".to_string().into()));
|
||||
assert_eq!(dict_vec.get(1), Value::String("b".to_string().into()));
|
||||
assert_eq!(dict_vec.get(3), Value::Null);
|
||||
assert_eq!(dict_vec.get(4), Value::String("b".to_string().into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slice() {
|
||||
let dict_vec = create_test_dictionary();
|
||||
let sliced = dict_vec.slice(1, 3);
|
||||
|
||||
assert_eq!(sliced.len(), 3);
|
||||
assert_eq!(sliced.get(0), Value::String("b".to_string().into()));
|
||||
assert_eq!(sliced.get(1), Value::String("c".to_string().into()));
|
||||
assert_eq!(sliced.get(2), Value::Null);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replicate() {
|
||||
let dict_vec = create_test_dictionary();
|
||||
|
||||
// Replicate with offsets [0, 2, 5] - should get values at these indices
|
||||
let offsets = vec![0, 2, 5];
|
||||
let replicated = dict_vec.replicate(&offsets);
|
||||
assert_eq!(replicated.len(), 5);
|
||||
assert_eq!(replicated.get(0), Value::String("b".to_string().into()));
|
||||
assert_eq!(replicated.get(1), Value::String("b".to_string().into()));
|
||||
assert_eq!(replicated.get(2), Value::String("c".to_string().into()));
|
||||
assert_eq!(replicated.get(3), Value::String("c".to_string().into()));
|
||||
assert_eq!(replicated.get(4), Value::String("c".to_string().into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_filter() {
|
||||
let dict_vec = create_test_dictionary();
|
||||
|
||||
// Keep only indices 0, 2, 4
|
||||
let filter_values = vec![true, false, true, false, true, false];
|
||||
let filter = vectors::BooleanVector::from(filter_values);
|
||||
|
||||
let filtered = dict_vec.filter(&filter).unwrap();
|
||||
assert_eq!(filtered.len(), 3);
|
||||
|
||||
// Check the values
|
||||
assert_eq!(filtered.get(0), Value::String("a".to_string().into()));
|
||||
assert_eq!(filtered.get(1), Value::String("c".to_string().into()));
|
||||
assert_eq!(filtered.get(2), Value::String("b".to_string().into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cast() {
|
||||
let dict_vec = create_test_dictionary();
|
||||
|
||||
// Cast to the same type should return an equivalent vector
|
||||
let casted = dict_vec.cast(&ConcreteDataType::string_datatype()).unwrap();
|
||||
|
||||
// The returned vector should have string values
|
||||
assert_eq!(
|
||||
casted.data_type(),
|
||||
ConcreteDataType::Dictionary(DictionaryType::new(
|
||||
ConcreteDataType::int32_datatype(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
))
|
||||
);
|
||||
assert_eq!(casted.len(), dict_vec.len());
|
||||
|
||||
// Values should match the original dictionary lookups
|
||||
assert_eq!(casted.get(0), Value::String("a".to_string().into()));
|
||||
assert_eq!(casted.get(1), Value::String("b".to_string().into()));
|
||||
assert_eq!(casted.get(2), Value::String("c".to_string().into()));
|
||||
assert_eq!(casted.get(3), Value::Null);
|
||||
assert_eq!(casted.get(4), Value::String("b".to_string().into()));
|
||||
assert_eq!(casted.get(5), Value::String("d".to_string().into()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_take() {
|
||||
let dict_vec = create_test_dictionary();
|
||||
|
||||
// Take indices 2, 0, 4
|
||||
let indices_vec = vec![Some(2u32), Some(0), Some(4)];
|
||||
let indices = vectors::UInt32Vector::from(indices_vec);
|
||||
|
||||
let taken = dict_vec.take(&indices).unwrap();
|
||||
assert_eq!(taken.len(), 3);
|
||||
|
||||
// Check the values
|
||||
assert_eq!(taken.get(0), Value::String("c".to_string().into()));
|
||||
assert_eq!(taken.get(1), Value::String("a".to_string().into()));
|
||||
assert_eq!(taken.get(2), Value::String("b".to_string().into()));
|
||||
}
|
||||
}
|
||||
@@ -20,7 +20,8 @@ use std::sync::Arc;
|
||||
use arrow::array::{Array, ArrayRef, StringArray};
|
||||
use arrow::compute;
|
||||
use arrow::compute::kernels::comparison;
|
||||
use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
|
||||
use arrow::datatypes::{DataType as ArrowDataType, Int32Type, TimeUnit};
|
||||
use arrow_array::DictionaryArray;
|
||||
use arrow_schema::IntervalUnit;
|
||||
use datafusion_common::ScalarValue;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
@@ -31,7 +32,7 @@ use crate::prelude::DataType;
|
||||
use crate::scalars::{Scalar, ScalarVectorBuilder};
|
||||
use crate::value::{ListValue, ListValueRef, Value};
|
||||
use crate::vectors::{
|
||||
BinaryVector, BooleanVector, ConstantVector, DateVector, Decimal128Vector,
|
||||
BinaryVector, BooleanVector, ConstantVector, DateVector, Decimal128Vector, DictionaryVector,
|
||||
DurationMicrosecondVector, DurationMillisecondVector, DurationNanosecondVector,
|
||||
DurationSecondVector, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector,
|
||||
Int8Vector, IntervalDayTimeVector, IntervalMonthDayNanoVector, IntervalYearMonthVector,
|
||||
@@ -347,6 +348,17 @@ impl Helper {
|
||||
ArrowDataType::Decimal128(_, _) => {
|
||||
Arc::new(Decimal128Vector::try_from_arrow_array(array)?)
|
||||
}
|
||||
ArrowDataType::Dictionary(key, value) if matches!(&**key, ArrowDataType::Int32) => {
|
||||
let array = array
|
||||
.as_ref()
|
||||
.as_any()
|
||||
.downcast_ref::<DictionaryArray<Int32Type>>()
|
||||
.unwrap(); // Safety: the type is guarded by match arm condition
|
||||
Arc::new(DictionaryVector::new(
|
||||
array.clone(),
|
||||
ConcreteDataType::try_from(value.as_ref())?,
|
||||
)?)
|
||||
}
|
||||
ArrowDataType::Float16
|
||||
| ArrowDataType::LargeList(_)
|
||||
| ArrowDataType::FixedSizeList(_, _)
|
||||
|
||||
@@ -14,14 +14,11 @@
|
||||
|
||||
mod cast;
|
||||
mod filter;
|
||||
mod find_unique;
|
||||
mod replicate;
|
||||
mod take;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_base::BitVec;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::types::LogicalPrimitiveType;
|
||||
use crate::vectors::constant::ConstantVector;
|
||||
@@ -40,23 +37,6 @@ pub trait VectorOp {
|
||||
/// Panics if `offsets.len() != self.len()`.
|
||||
fn replicate(&self, offsets: &[usize]) -> VectorRef;
|
||||
|
||||
/// Mark `i-th` bit of `selected` to `true` if the `i-th` element of `self` is unique, which
|
||||
/// means there is no elements behind it have same value as it.
|
||||
///
|
||||
/// The caller should ensure
|
||||
/// 1. the length of `selected` bitmap is equal to `vector.len()`.
|
||||
/// 2. `vector` and `prev_vector` are sorted.
|
||||
///
|
||||
/// If there are multiple duplicate elements, this function retains the **first** element.
|
||||
/// The first element is considered as unique if the first element of `self` is different
|
||||
/// from its previous element, that is the last element of `prev_vector`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `selected.len() < self.len()`.
|
||||
/// - `prev_vector` and `self` have different data types.
|
||||
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>);
|
||||
|
||||
/// Filters the vector, returns elements matching the `filter` (i.e. where the values are true).
|
||||
///
|
||||
/// Note that the nulls of `filter` are interpreted as `false` will lead to these elements being masked out.
|
||||
@@ -81,11 +61,6 @@ macro_rules! impl_scalar_vector_op {
|
||||
replicate::replicate_scalar(self, offsets)
|
||||
}
|
||||
|
||||
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap());
|
||||
find_unique::find_unique_scalar(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
filter::filter_non_constant!(self, $VectorType, filter)
|
||||
}
|
||||
@@ -121,11 +96,6 @@ impl VectorOp for Decimal128Vector {
|
||||
std::sync::Arc::new(replicate::replicate_decimal128(self, offsets))
|
||||
}
|
||||
|
||||
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<Decimal128Vector>());
|
||||
find_unique::find_unique_scalar(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
filter::filter_non_constant!(self, Decimal128Vector, filter)
|
||||
}
|
||||
@@ -144,12 +114,6 @@ impl<T: LogicalPrimitiveType> VectorOp for PrimitiveVector<T> {
|
||||
std::sync::Arc::new(replicate::replicate_primitive(self, offsets))
|
||||
}
|
||||
|
||||
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector =
|
||||
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
|
||||
find_unique::find_unique_scalar(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
filter::filter_non_constant!(self, PrimitiveVector<T>, filter)
|
||||
}
|
||||
@@ -168,11 +132,6 @@ impl VectorOp for NullVector {
|
||||
replicate::replicate_null(self, offsets)
|
||||
}
|
||||
|
||||
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<NullVector>());
|
||||
find_unique::find_unique_null(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
filter::filter_non_constant!(self, NullVector, filter)
|
||||
}
|
||||
@@ -195,11 +154,6 @@ impl VectorOp for ConstantVector {
|
||||
self.replicate_vector(offsets)
|
||||
}
|
||||
|
||||
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
|
||||
find_unique::find_unique_constant(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
self.filter_vector(filter)
|
||||
}
|
||||
|
||||
@@ -1,366 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_base::BitVec;
|
||||
|
||||
use crate::scalars::ScalarVector;
|
||||
use crate::vectors::constant::ConstantVector;
|
||||
use crate::vectors::{NullVector, Vector};
|
||||
|
||||
// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as
|
||||
// selected when it is different from the previous one, and leaves the `selected` unchanged
|
||||
// in any other case.
|
||||
pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(
|
||||
vector: &'a T,
|
||||
selected: &'a mut BitVec,
|
||||
prev_vector: Option<&'a T>,
|
||||
) where
|
||||
T::RefItem<'a>: PartialEq,
|
||||
{
|
||||
assert!(selected.len() >= vector.len());
|
||||
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for ((i, current), next) in vector
|
||||
.iter_data()
|
||||
.enumerate()
|
||||
.zip(vector.iter_data().skip(1))
|
||||
{
|
||||
if current != next {
|
||||
// If next element is a different element, we mark it as selected.
|
||||
selected.set(i + 1, true);
|
||||
}
|
||||
}
|
||||
|
||||
// Marks first element as selected if it is different from previous element, otherwise
|
||||
// keep selected bitmap unchanged.
|
||||
let is_first_not_duplicate = prev_vector
|
||||
.map(|pv| {
|
||||
if pv.is_empty() {
|
||||
true
|
||||
} else {
|
||||
let last = pv.get_data(pv.len() - 1);
|
||||
last != vector.get_data(0)
|
||||
}
|
||||
})
|
||||
.unwrap_or(true);
|
||||
if is_first_not_duplicate {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_unique_null(
|
||||
vector: &NullVector,
|
||||
selected: &mut BitVec,
|
||||
prev_vector: Option<&NullVector>,
|
||||
) {
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true);
|
||||
if is_first_not_duplicate {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_unique_constant(
|
||||
vector: &ConstantVector,
|
||||
selected: &mut BitVec,
|
||||
prev_vector: Option<&ConstantVector>,
|
||||
) {
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let is_first_not_duplicate = prev_vector
|
||||
.map(|pv| {
|
||||
if pv.is_empty() {
|
||||
true
|
||||
} else {
|
||||
vector.get_constant_ref() != pv.get_constant_ref()
|
||||
}
|
||||
})
|
||||
.unwrap_or(true);
|
||||
|
||||
if is_first_not_duplicate {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_time::Date;
|
||||
|
||||
use super::*;
|
||||
use crate::timestamp::*;
|
||||
use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp};
|
||||
|
||||
fn check_bitmap(expect: &[bool], selected: &BitVec) {
|
||||
let actual = selected.iter().collect::<Vec<_>>();
|
||||
assert_eq!(expect, actual);
|
||||
}
|
||||
|
||||
fn check_find_unique_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) {
|
||||
check_find_unique_scalar_opt(expect, input.iter().map(|v| Some(*v)), prev);
|
||||
}
|
||||
|
||||
fn check_find_unique_scalar_opt(
|
||||
expect: &[bool],
|
||||
input: impl Iterator<Item = Option<i32>>,
|
||||
prev: Option<&[i32]>,
|
||||
) {
|
||||
let input = Int32Vector::from(input.collect::<Vec<_>>());
|
||||
let prev = prev.map(Int32Vector::from_slice);
|
||||
|
||||
let mut selected = BitVec::repeat(false, input.len());
|
||||
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
|
||||
|
||||
check_bitmap(expect, &selected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_scalar() {
|
||||
check_find_unique_scalar(&[], &[], None);
|
||||
check_find_unique_scalar(&[true], &[1], None);
|
||||
check_find_unique_scalar(&[true, false], &[1, 1], None);
|
||||
check_find_unique_scalar(&[true, true], &[1, 2], None);
|
||||
check_find_unique_scalar(&[true, true, true, true], &[1, 2, 3, 4], None);
|
||||
check_find_unique_scalar(&[true, false, true, false], &[1, 1, 3, 3], None);
|
||||
check_find_unique_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None);
|
||||
|
||||
check_find_unique_scalar(&[true], &[5], Some(&[]));
|
||||
check_find_unique_scalar(&[true], &[5], Some(&[3]));
|
||||
check_find_unique_scalar(&[false], &[5], Some(&[5]));
|
||||
check_find_unique_scalar(&[false], &[5], Some(&[4, 5]));
|
||||
check_find_unique_scalar(&[false, true], &[5, 6], Some(&[4, 5]));
|
||||
check_find_unique_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5]));
|
||||
check_find_unique_scalar(
|
||||
&[false, true, false, true, true],
|
||||
&[5, 6, 6, 7, 8],
|
||||
Some(&[4, 5]),
|
||||
);
|
||||
|
||||
check_find_unique_scalar_opt(
|
||||
&[true, true, false, true, false],
|
||||
[Some(1), Some(2), Some(2), None, None].into_iter(),
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_scalar_multi_times_with_prev() {
|
||||
let prev = Int32Vector::from_slice([1]);
|
||||
|
||||
let v1 = Int32Vector::from_slice([2, 3, 4]);
|
||||
let mut selected = BitVec::repeat(false, v1.len());
|
||||
v1.find_unique(&mut selected, Some(&prev));
|
||||
|
||||
// Though element in v2 are the same as prev, but we should still keep them.
|
||||
let v2 = Int32Vector::from_slice([1, 1, 1]);
|
||||
v2.find_unique(&mut selected, Some(&prev));
|
||||
|
||||
check_bitmap(&[true, true, true], &selected);
|
||||
}
|
||||
|
||||
fn new_bitmap(bits: &[bool]) -> BitVec {
|
||||
BitVec::from_iter(bits)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_scalar_with_prev() {
|
||||
let prev = Int32Vector::from_slice([1]);
|
||||
|
||||
let mut selected = new_bitmap(&[true, false, true, false]);
|
||||
let v = Int32Vector::from_slice([2, 3, 4, 5]);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
// All elements are different.
|
||||
check_bitmap(&[true, true, true, true], &selected);
|
||||
|
||||
let mut selected = new_bitmap(&[true, false, true, false]);
|
||||
let v = Int32Vector::from_slice([1, 2, 3, 4]);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
// Though first element is duplicate, but we keep the flag unchanged.
|
||||
check_bitmap(&[true, true, true, true], &selected);
|
||||
|
||||
// Same case as above, but now `prev` is None.
|
||||
let mut selected = new_bitmap(&[true, false, true, false]);
|
||||
let v = Int32Vector::from_slice([1, 2, 3, 4]);
|
||||
v.find_unique(&mut selected, None);
|
||||
check_bitmap(&[true, true, true, true], &selected);
|
||||
|
||||
// Same case as above, but now `prev` is empty.
|
||||
let mut selected = new_bitmap(&[true, false, true, false]);
|
||||
let v = Int32Vector::from_slice([1, 2, 3, 4]);
|
||||
v.find_unique(&mut selected, Some(&Int32Vector::from_slice([])));
|
||||
check_bitmap(&[true, true, true, true], &selected);
|
||||
|
||||
let mut selected = new_bitmap(&[false, false, false, false]);
|
||||
let v = Int32Vector::from_slice([2, 2, 4, 5]);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
// only v[1] is duplicate.
|
||||
check_bitmap(&[true, false, true, true], &selected);
|
||||
}
|
||||
|
||||
fn check_find_unique_null(len: usize) {
|
||||
let input = NullVector::new(len);
|
||||
let mut selected = BitVec::repeat(false, input.len());
|
||||
input.find_unique(&mut selected, None);
|
||||
|
||||
let mut expect = vec![false; len];
|
||||
if !expect.is_empty() {
|
||||
expect[0] = true;
|
||||
}
|
||||
check_bitmap(&expect, &selected);
|
||||
|
||||
let mut selected = BitVec::repeat(false, input.len());
|
||||
let prev = Some(NullVector::new(1));
|
||||
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
|
||||
let expect = vec![false; len];
|
||||
check_bitmap(&expect, &selected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_null() {
|
||||
for len in 0..5 {
|
||||
check_find_unique_null(len);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_null_with_prev() {
|
||||
let prev = NullVector::new(1);
|
||||
|
||||
// Keep flags unchanged.
|
||||
let mut selected = new_bitmap(&[true, false, true, false]);
|
||||
let v = NullVector::new(4);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
|
||||
// Keep flags unchanged.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
check_bitmap(&[false, false, true, false], &selected);
|
||||
|
||||
// Prev is None, select first element.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
v.find_unique(&mut selected, None);
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
|
||||
// Prev is empty, select first element.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
v.find_unique(&mut selected, Some(&NullVector::new(0)));
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
}
|
||||
|
||||
fn check_find_unique_constant(len: usize) {
|
||||
let input = ConstantVector::new(Arc::new(Int32Vector::from_slice([8])), len);
|
||||
let mut selected = BitVec::repeat(false, len);
|
||||
input.find_unique(&mut selected, None);
|
||||
|
||||
let mut expect = vec![false; len];
|
||||
if !expect.is_empty() {
|
||||
expect[0] = true;
|
||||
}
|
||||
check_bitmap(&expect, &selected);
|
||||
|
||||
let mut selected = BitVec::repeat(false, len);
|
||||
let prev = Some(ConstantVector::new(
|
||||
Arc::new(Int32Vector::from_slice([8])),
|
||||
1,
|
||||
));
|
||||
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
|
||||
let expect = vec![false; len];
|
||||
check_bitmap(&expect, &selected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_constant() {
|
||||
for len in 0..5 {
|
||||
check_find_unique_constant(len);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_constant_with_prev() {
|
||||
let prev = ConstantVector::new(Arc::new(Int32Vector::from_slice([1])), 1);
|
||||
|
||||
// Keep flags unchanged.
|
||||
let mut selected = new_bitmap(&[true, false, true, false]);
|
||||
let v = ConstantVector::new(Arc::new(Int32Vector::from_slice([1])), 4);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
|
||||
// Keep flags unchanged.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
check_bitmap(&[false, false, true, false], &selected);
|
||||
|
||||
// Prev is None, select first element.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
v.find_unique(&mut selected, None);
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
|
||||
// Prev is empty, select first element.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
v.find_unique(
|
||||
&mut selected,
|
||||
Some(&ConstantVector::new(
|
||||
Arc::new(Int32Vector::from_slice([1])),
|
||||
0,
|
||||
)),
|
||||
);
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
|
||||
// Different constant vector.
|
||||
let mut selected = new_bitmap(&[false, false, true, false]);
|
||||
let v = ConstantVector::new(Arc::new(Int32Vector::from_slice([2])), 4);
|
||||
v.find_unique(&mut selected, Some(&prev));
|
||||
check_bitmap(&[true, false, true, false], &selected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_string() {
|
||||
let input = StringVector::from_slice(&["a", "a", "b", "c"]);
|
||||
let mut selected = BitVec::repeat(false, 4);
|
||||
input.find_unique(&mut selected, None);
|
||||
let expect = vec![true, false, true, true];
|
||||
check_bitmap(&expect, &selected);
|
||||
}
|
||||
|
||||
macro_rules! impl_find_unique_date_like_test {
|
||||
($VectorType: ident, $ValueType: ident, $method: ident) => {{
|
||||
use $crate::vectors::$VectorType;
|
||||
|
||||
let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
|
||||
let mut selected = BitVec::repeat(false, 4);
|
||||
v.find_unique(&mut selected, None);
|
||||
let expect = vec![true, false, true, true];
|
||||
check_bitmap(&expect, &selected);
|
||||
}};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_unique_date_like() {
|
||||
impl_find_unique_date_like_test!(DateVector, Date, new);
|
||||
impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from);
|
||||
impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from);
|
||||
impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from);
|
||||
impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from);
|
||||
}
|
||||
}
|
||||
@@ -20,7 +20,7 @@ use api::v1::{ColumnDataType, ColumnDataTypeExtension, CreateTableExpr, Semantic
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::key::table_info::TableInfoValue;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::ColumnSchema;
|
||||
use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema};
|
||||
use itertools::Itertools;
|
||||
use operator::expr_helper;
|
||||
use session::context::QueryContextBuilder;
|
||||
@@ -174,7 +174,15 @@ pub fn table_info_value_to_relation_desc(
|
||||
let default_values = raw_schema
|
||||
.column_schemas
|
||||
.iter()
|
||||
.map(|c| c.default_constraint().cloned())
|
||||
.map(|c| {
|
||||
c.default_constraint().cloned().or_else(|| {
|
||||
if c.is_nullable() {
|
||||
Some(ColumnDefaultConstraint::null_value())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
Ok(TableDesc::new(relation_desc, default_values))
|
||||
|
||||
@@ -179,7 +179,7 @@ impl Context<'_, '_> {
|
||||
) -> CollectionBundle<Batch> {
|
||||
let (send_port, recv_port) = self.df.make_edge::<_, Toff<Batch>>("constant_batch");
|
||||
let mut per_time: BTreeMap<repr::Timestamp, Vec<DiffRow>> = Default::default();
|
||||
for (key, group) in &rows.into_iter().group_by(|(_row, ts, _diff)| *ts) {
|
||||
for (key, group) in &rows.into_iter().chunk_by(|(_row, ts, _diff)| *ts) {
|
||||
per_time.entry(key).or_default().extend(group);
|
||||
}
|
||||
|
||||
@@ -233,7 +233,7 @@ impl Context<'_, '_> {
|
||||
pub fn render_constant(&mut self, rows: Vec<DiffRow>) -> CollectionBundle {
|
||||
let (send_port, recv_port) = self.df.make_edge::<_, Toff>("constant");
|
||||
let mut per_time: BTreeMap<repr::Timestamp, Vec<DiffRow>> = Default::default();
|
||||
for (key, group) in &rows.into_iter().group_by(|(_row, ts, _diff)| *ts) {
|
||||
for (key, group) in &rows.into_iter().chunk_by(|(_row, ts, _diff)| *ts) {
|
||||
per_time.entry(key).or_default().extend(group);
|
||||
}
|
||||
|
||||
|
||||
@@ -151,12 +151,12 @@ impl ScalarExpr {
|
||||
|
||||
/// apply optimization to the expression, like flatten variadic function
|
||||
pub fn optimize(&mut self) {
|
||||
self.flatten_varidic_fn();
|
||||
self.flatten_variadic_fn();
|
||||
}
|
||||
|
||||
/// Because Substrait's `And`/`Or` function is binary, but FlowPlan's
|
||||
/// `And`/`Or` function is variadic, we need to flatten the `And` function if multiple `And`/`Or` functions are nested.
|
||||
fn flatten_varidic_fn(&mut self) {
|
||||
fn flatten_variadic_fn(&mut self) {
|
||||
if let ScalarExpr::CallVariadic { func, exprs } = self {
|
||||
let mut new_exprs = vec![];
|
||||
for expr in std::mem::take(exprs) {
|
||||
@@ -167,7 +167,7 @@ impl ScalarExpr {
|
||||
{
|
||||
if *func == inner_func {
|
||||
for inner_expr in inner_exprs.iter_mut() {
|
||||
inner_expr.flatten_varidic_fn();
|
||||
inner_expr.flatten_variadic_fn();
|
||||
}
|
||||
new_exprs.extend(inner_exprs);
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ mod expr;
|
||||
pub mod heartbeat;
|
||||
mod metrics;
|
||||
mod plan;
|
||||
mod recording_rules;
|
||||
mod repr;
|
||||
mod server;
|
||||
mod transform;
|
||||
|
||||
24
src/flow/src/recording_rules.rs
Normal file
24
src/flow/src/recording_rules.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Run flow as recording rule which is time-window-aware normal query triggered when new data arrives
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
mod frontend_client;
|
||||
mod utils;
|
||||
|
||||
/// TODO(discord9): make those constants configurable
|
||||
/// The default rule engine query timeout is 10 minutes
|
||||
pub const DEFAULT_RULE_ENGINE_QUERY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
|
||||
148
src/flow/src/recording_rules/frontend_client.rs
Normal file
148
src/flow/src/recording_rules/frontend_client.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Frontend client to run flow as recording rule which is time-window-aware normal query triggered every tick set by user
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
|
||||
use common_error::ext::BoxedError;
|
||||
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
|
||||
use common_meta::cluster::{NodeInfo, NodeInfoKey, Role};
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::store::RangeRequest;
|
||||
use meta_client::client::MetaClient;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{ExternalSnafu, UnexpectedSnafu};
|
||||
use crate::recording_rules::DEFAULT_RULE_ENGINE_QUERY_TIMEOUT;
|
||||
use crate::Error;
|
||||
|
||||
fn default_channel_mgr() -> ChannelManager {
|
||||
let cfg = ChannelConfig::new().timeout(DEFAULT_RULE_ENGINE_QUERY_TIMEOUT);
|
||||
ChannelManager::with_config(cfg)
|
||||
}
|
||||
|
||||
fn client_from_urls(addrs: Vec<String>) -> Client {
|
||||
Client::with_manager_and_urls(default_channel_mgr(), addrs)
|
||||
}
|
||||
|
||||
/// A simple frontend client able to execute sql using grpc protocol
|
||||
#[derive(Debug)]
|
||||
pub enum FrontendClient {
|
||||
Distributed {
|
||||
meta_client: Arc<MetaClient>,
|
||||
},
|
||||
Standalone {
|
||||
/// for the sake of simplicity still use grpc even in standalone mode
|
||||
/// notice the client here should all be lazy, so that can wait after frontend is booted then make conn
|
||||
/// TODO(discord9): not use grpc under standalone mode
|
||||
database_client: DatabaseWithPeer,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DatabaseWithPeer {
|
||||
pub database: Database,
|
||||
pub peer: Peer,
|
||||
}
|
||||
|
||||
impl DatabaseWithPeer {
|
||||
fn new(database: Database, peer: Peer) -> Self {
|
||||
Self { database, peer }
|
||||
}
|
||||
}
|
||||
|
||||
impl FrontendClient {
|
||||
pub fn from_meta_client(meta_client: Arc<MetaClient>) -> Self {
|
||||
Self::Distributed { meta_client }
|
||||
}
|
||||
|
||||
pub fn from_static_grpc_addr(addr: String) -> Self {
|
||||
let peer = Peer {
|
||||
id: 0,
|
||||
addr: addr.clone(),
|
||||
};
|
||||
|
||||
let client = client_from_urls(vec![addr]);
|
||||
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
|
||||
Self::Standalone {
|
||||
database_client: DatabaseWithPeer::new(database, peer),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FrontendClient {
|
||||
async fn scan_for_frontend(&self) -> Result<Vec<(NodeInfoKey, NodeInfo)>, Error> {
|
||||
let Self::Distributed { meta_client, .. } = self else {
|
||||
return Ok(vec![]);
|
||||
};
|
||||
let cluster_client = meta_client
|
||||
.cluster_client()
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
let prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend);
|
||||
let req = RangeRequest::new().with_prefix(prefix);
|
||||
let resp = cluster_client
|
||||
.range(req)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let mut res = Vec::with_capacity(resp.kvs.len());
|
||||
for kv in resp.kvs {
|
||||
let key = NodeInfoKey::try_from(kv.key)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
|
||||
let val = NodeInfo::try_from(kv.value)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
res.push((key, val));
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Get the database with max `last_activity_ts`
|
||||
async fn get_last_active_frontend(&self) -> Result<DatabaseWithPeer, Error> {
|
||||
if let Self::Standalone { database_client } = self {
|
||||
return Ok(database_client.clone());
|
||||
}
|
||||
|
||||
let frontends = self.scan_for_frontend().await?;
|
||||
let mut peer = None;
|
||||
|
||||
if let Some((_, val)) = frontends.iter().max_by_key(|(_, val)| val.last_activity_ts) {
|
||||
peer = Some(val.peer.clone());
|
||||
}
|
||||
|
||||
let Some(peer) = peer else {
|
||||
UnexpectedSnafu {
|
||||
reason: format!("No frontend available: {:?}", frontends),
|
||||
}
|
||||
.fail()?
|
||||
};
|
||||
let client = client_from_urls(vec![peer.addr.clone()]);
|
||||
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
|
||||
Ok(DatabaseWithPeer::new(database, peer))
|
||||
}
|
||||
|
||||
/// Get a database client, and possibly update it before returning.
|
||||
pub async fn get_database_client(&self) -> Result<DatabaseWithPeer, Error> {
|
||||
match self {
|
||||
Self::Standalone { database_client } => Ok(database_client.clone()),
|
||||
Self::Distributed { meta_client: _ } => self.get_last_active_frontend().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
600
src/flow/src/recording_rules/utils.rs
Normal file
600
src/flow/src/recording_rules/utils.rs
Normal file
@@ -0,0 +1,600 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! some utils for helping with recording rule
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_error::ext::BoxedError;
|
||||
use common_telemetry::{debug, info};
|
||||
use datafusion::error::Result as DfResult;
|
||||
use datafusion::logical_expr::Expr;
|
||||
use datafusion::sql::unparser::Unparser;
|
||||
use datafusion_common::tree_node::{
|
||||
Transformed, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor,
|
||||
};
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_expr::{Distinct, LogicalPlan};
|
||||
use datatypes::schema::RawSchema;
|
||||
use query::parser::QueryLanguageParser;
|
||||
use query::QueryEngineRef;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::adapter::AUTO_CREATED_PLACEHOLDER_TS_COL;
|
||||
use crate::df_optimizer::apply_df_optimizer;
|
||||
use crate::error::{DatafusionSnafu, ExternalSnafu};
|
||||
use crate::Error;
|
||||
|
||||
/// Convert sql to datafusion logical plan
|
||||
pub async fn sql_to_df_plan(
|
||||
query_ctx: QueryContextRef,
|
||||
engine: QueryEngineRef,
|
||||
sql: &str,
|
||||
optimize: bool,
|
||||
) -> Result<LogicalPlan, Error> {
|
||||
let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx)
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let plan = engine
|
||||
.planner()
|
||||
.plan(&stmt, query_ctx)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(ExternalSnafu)?;
|
||||
let plan = if optimize {
|
||||
apply_df_optimizer(plan).await?
|
||||
} else {
|
||||
plan
|
||||
};
|
||||
Ok(plan)
|
||||
}
|
||||
|
||||
pub fn df_plan_to_sql(plan: &LogicalPlan) -> Result<String, Error> {
|
||||
/// A dialect that forces identifiers to be quoted when have uppercase
|
||||
struct ForceQuoteIdentifiers;
|
||||
impl datafusion::sql::unparser::dialect::Dialect for ForceQuoteIdentifiers {
|
||||
fn identifier_quote_style(&self, identifier: &str) -> Option<char> {
|
||||
if identifier.to_lowercase() != identifier {
|
||||
Some('`')
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
let unparser = Unparser::new(&ForceQuoteIdentifiers);
|
||||
// first make all column qualified
|
||||
let sql = unparser
|
||||
.plan_to_sql(plan)
|
||||
.with_context(|_e| DatafusionSnafu {
|
||||
context: format!("Failed to unparse logical plan {plan:?}"),
|
||||
})?;
|
||||
Ok(sql.to_string())
|
||||
}
|
||||
|
||||
/// Helper to find the innermost group by expr in schema, return None if no group by expr
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct FindGroupByFinalName {
|
||||
group_exprs: Option<HashSet<datafusion_expr::Expr>>,
|
||||
}
|
||||
|
||||
impl FindGroupByFinalName {
|
||||
pub fn get_group_expr_names(&self) -> Option<HashSet<String>> {
|
||||
self.group_exprs
|
||||
.as_ref()
|
||||
.map(|exprs| exprs.iter().map(|expr| expr.qualified_name().1).collect())
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeVisitor<'_> for FindGroupByFinalName {
|
||||
type Node = LogicalPlan;
|
||||
|
||||
fn f_down(&mut self, node: &Self::Node) -> datafusion_common::Result<TreeNodeRecursion> {
|
||||
if let LogicalPlan::Aggregate(aggregate) = node {
|
||||
self.group_exprs = Some(aggregate.group_expr.iter().cloned().collect());
|
||||
debug!("Group by exprs: {:?}", self.group_exprs);
|
||||
} else if let LogicalPlan::Distinct(distinct) = node {
|
||||
debug!("Distinct: {:#?}", distinct);
|
||||
match distinct {
|
||||
Distinct::All(input) => {
|
||||
if let LogicalPlan::TableScan(table_scan) = &**input {
|
||||
// get column from field_qualifier, projection and projected_schema:
|
||||
let len = table_scan.projected_schema.fields().len();
|
||||
let columns = (0..len)
|
||||
.map(|f| {
|
||||
let (qualifier, field) =
|
||||
table_scan.projected_schema.qualified_field(f);
|
||||
datafusion_common::Column::new(qualifier.cloned(), field.name())
|
||||
})
|
||||
.map(datafusion_expr::Expr::Column);
|
||||
self.group_exprs = Some(columns.collect());
|
||||
} else {
|
||||
self.group_exprs = Some(input.expressions().iter().cloned().collect())
|
||||
}
|
||||
}
|
||||
Distinct::On(distinct_on) => {
|
||||
self.group_exprs = Some(distinct_on.on_expr.iter().cloned().collect())
|
||||
}
|
||||
}
|
||||
debug!("Group by exprs: {:?}", self.group_exprs);
|
||||
}
|
||||
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
}
|
||||
|
||||
/// deal with projection when going up with group exprs
|
||||
fn f_up(&mut self, node: &Self::Node) -> datafusion_common::Result<TreeNodeRecursion> {
|
||||
if let LogicalPlan::Projection(projection) = node {
|
||||
for expr in &projection.expr {
|
||||
let Some(group_exprs) = &mut self.group_exprs else {
|
||||
return Ok(TreeNodeRecursion::Continue);
|
||||
};
|
||||
if let datafusion_expr::Expr::Alias(alias) = expr {
|
||||
// if a alias exist, replace with the new alias
|
||||
let mut new_group_exprs = group_exprs.clone();
|
||||
for group_expr in group_exprs.iter() {
|
||||
if group_expr.name_for_alias()? == alias.expr.name_for_alias()? {
|
||||
new_group_exprs.remove(group_expr);
|
||||
new_group_exprs.insert(expr.clone());
|
||||
break;
|
||||
}
|
||||
}
|
||||
*group_exprs = new_group_exprs;
|
||||
}
|
||||
}
|
||||
}
|
||||
debug!("Aliased group by exprs: {:?}", self.group_exprs);
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
}
|
||||
}
|
||||
|
||||
/// Add to the final select columns like `update_at`(which doesn't necessary need to have exact name just need to be a extra timestamp column) and `__ts_placeholder`(this column need to have exact this name and be a timestamp) with values like `now()` and `0`
|
||||
#[derive(Debug)]
|
||||
pub struct AddAutoColumnRewriter {
|
||||
pub schema: RawSchema,
|
||||
pub is_rewritten: bool,
|
||||
}
|
||||
|
||||
impl AddAutoColumnRewriter {
|
||||
pub fn new(schema: RawSchema) -> Self {
|
||||
Self {
|
||||
schema,
|
||||
is_rewritten: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for AddAutoColumnRewriter {
|
||||
type Node = LogicalPlan;
|
||||
fn f_down(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
|
||||
if self.is_rewritten {
|
||||
return Ok(Transformed::no(node));
|
||||
}
|
||||
|
||||
// if is distinct all, go one level down
|
||||
if let LogicalPlan::Distinct(Distinct::All(_)) = node {
|
||||
return Ok(Transformed::no(node));
|
||||
}
|
||||
|
||||
// FIXME(discord9): just read plan.expr and do stuffs
|
||||
let mut exprs = node.expressions();
|
||||
|
||||
// add columns if have different column count
|
||||
let query_col_cnt = exprs.len();
|
||||
let table_col_cnt = self.schema.column_schemas.len();
|
||||
info!("query_col_cnt={query_col_cnt}, table_col_cnt={table_col_cnt}");
|
||||
if query_col_cnt == table_col_cnt {
|
||||
self.is_rewritten = true;
|
||||
return Ok(Transformed::no(node));
|
||||
} else if query_col_cnt + 1 == table_col_cnt {
|
||||
let last_col_schema = self.schema.column_schemas.last().unwrap();
|
||||
|
||||
// if time index column is auto created add it
|
||||
if last_col_schema.name == AUTO_CREATED_PLACEHOLDER_TS_COL
|
||||
&& self.schema.timestamp_index == Some(table_col_cnt - 1)
|
||||
{
|
||||
exprs.push(datafusion::logical_expr::lit(0));
|
||||
} else if last_col_schema.data_type.is_timestamp() {
|
||||
// is the update at column
|
||||
exprs.push(datafusion::prelude::now());
|
||||
} else {
|
||||
// helpful error message
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect the last column in table to be timestamp column, found column {} with type {:?}",
|
||||
last_col_schema.name,
|
||||
last_col_schema.data_type
|
||||
)));
|
||||
}
|
||||
} else if query_col_cnt + 2 == table_col_cnt {
|
||||
let mut col_iter = self.schema.column_schemas.iter().rev();
|
||||
let last_col_schema = col_iter.next().unwrap();
|
||||
let second_last_col_schema = col_iter.next().unwrap();
|
||||
if second_last_col_schema.data_type.is_timestamp() {
|
||||
exprs.push(datafusion::prelude::now());
|
||||
} else {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect the second last column in the table to be timestamp column, found column {} with type {:?}",
|
||||
second_last_col_schema.name,
|
||||
second_last_col_schema.data_type
|
||||
)));
|
||||
}
|
||||
|
||||
if last_col_schema.name == AUTO_CREATED_PLACEHOLDER_TS_COL
|
||||
&& self.schema.timestamp_index == Some(table_col_cnt - 1)
|
||||
{
|
||||
exprs.push(datafusion::logical_expr::lit(0));
|
||||
} else {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect timestamp column {}, found {:?}",
|
||||
AUTO_CREATED_PLACEHOLDER_TS_COL, last_col_schema
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect table have 0,1 or 2 columns more than query columns, found {} query columns {:?}, {} table columns {:?}",
|
||||
query_col_cnt, node.expressions(), table_col_cnt, self.schema.column_schemas
|
||||
)));
|
||||
}
|
||||
|
||||
self.is_rewritten = true;
|
||||
let new_plan = node.with_new_exprs(exprs, node.inputs().into_iter().cloned().collect())?;
|
||||
Ok(Transformed::yes(new_plan))
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(discord9): a method to found out the precise time window
|
||||
|
||||
/// Find out the `Filter` Node corresponding to innermost(deepest) `WHERE` and add a new filter expr to it
|
||||
#[derive(Debug)]
|
||||
pub struct AddFilterRewriter {
|
||||
extra_filter: Expr,
|
||||
is_rewritten: bool,
|
||||
}
|
||||
|
||||
impl AddFilterRewriter {
|
||||
fn new(filter: Expr) -> Self {
|
||||
Self {
|
||||
extra_filter: filter,
|
||||
is_rewritten: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for AddFilterRewriter {
|
||||
type Node = LogicalPlan;
|
||||
fn f_up(&mut self, node: Self::Node) -> DfResult<Transformed<Self::Node>> {
|
||||
if self.is_rewritten {
|
||||
return Ok(Transformed::no(node));
|
||||
}
|
||||
match node {
|
||||
LogicalPlan::Filter(mut filter) if !filter.having => {
|
||||
filter.predicate = filter.predicate.and(self.extra_filter.clone());
|
||||
self.is_rewritten = true;
|
||||
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
|
||||
}
|
||||
LogicalPlan::TableScan(_) => {
|
||||
// add a new filter
|
||||
let filter =
|
||||
datafusion_expr::Filter::try_new(self.extra_filter.clone(), Arc::new(node))?;
|
||||
self.is_rewritten = true;
|
||||
Ok(Transformed::yes(LogicalPlan::Filter(filter)))
|
||||
}
|
||||
_ => Ok(Transformed::no(node)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use datafusion_common::tree_node::TreeNode as _;
    use datatypes::prelude::ConcreteDataType;
    use datatypes::schema::ColumnSchema;
    use pretty_assertions::assert_eq;
    use session::context::QueryContext;

    use super::*;
    use crate::test_utils::create_test_query_engine;

    /// Test if uppercase identifiers are handled correctly (with quoting).
    #[tokio::test]
    async fn test_sql_plan_convert() {
        let query_engine = create_test_query_engine();
        let ctx = QueryContext::arc();
        let old = r#"SELECT "NUMBER" FROM "UPPERCASE_NUMBERS_WITH_TS""#;
        let new = sql_to_df_plan(ctx.clone(), query_engine.clone(), old, false)
            .await
            .unwrap();
        let new_sql = df_plan_to_sql(&new).unwrap();

        assert_eq!(
            r#"SELECT `UPPERCASE_NUMBERS_WITH_TS`.`NUMBER` FROM `UPPERCASE_NUMBERS_WITH_TS`"#,
            new_sql
        );
    }

    /// Round-trips SQL through a logical plan, applies `AddFilterRewriter`
    /// with the extra predicate `number > 4`, and checks the regenerated SQL.
    #[tokio::test]
    async fn test_add_filter() {
        // (input SQL, expected SQL after the rewrite)
        let testcases = vec![
            (
                "SELECT number FROM numbers_with_ts GROUP BY number",
                "SELECT numbers_with_ts.number FROM numbers_with_ts WHERE (number > 4) GROUP BY numbers_with_ts.number"
            ),

            (
                "SELECT number FROM numbers_with_ts WHERE number < 2 OR number >10",
                "SELECT numbers_with_ts.number FROM numbers_with_ts WHERE ((numbers_with_ts.number < 2) OR (numbers_with_ts.number > 10)) AND (number > 4)"
            ),

            (
                "SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
                "SELECT date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts)"
            ),

            // subquery: the filter must land in the innermost SELECT
            (
                "SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
                "SELECT numbers_with_ts.number, time_window FROM (SELECT numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window FROM numbers_with_ts WHERE (number > 4) GROUP BY date_bin('5 minutes', numbers_with_ts.ts), numbers_with_ts.number)"
            ),

            // complex subquery without alias
            (
                "SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
                "SELECT sum(numbers_with_ts.number), numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts) AS time_window, bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE (number > 4)) GROUP BY numbers_with_ts.number, date_bin('5 minutes', numbers_with_ts.ts), bucket_name"
            ),

            // complex subquery with alias: outer WHERE stays, inner gets the extra filter
            (
                "SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte WHERE number > 1 GROUP BY number, time_window, bucket_name;",
                "SELECT sum(cte.number), cte.number, date_bin('5 minutes', cte.ts) AS time_window, cte.bucket_name FROM (SELECT numbers_with_ts.number, numbers_with_ts.ts, CASE WHEN (numbers_with_ts.number < 5) THEN 'bucket_0_5' WHEN (numbers_with_ts.number >= 5) THEN 'bucket_5_inf' END AS bucket_name FROM numbers_with_ts WHERE (number > 4)) AS cte WHERE (cte.number > 1) GROUP BY cte.number, date_bin('5 minutes', cte.ts), cte.bucket_name"
            )
        ];
        use datafusion_expr::{col, lit};
        let query_engine = create_test_query_engine();
        let ctx = QueryContext::arc();

        for (before, after) in testcases {
            let sql = before;
            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
                .await
                .unwrap();

            let mut add_filter = AddFilterRewriter::new(col("number").gt(lit(4u32)));
            let plan = plan.rewrite(&mut add_filter).unwrap().data;
            let new_sql = df_plan_to_sql(&plan).unwrap();
            assert_eq!(after, new_sql);
        }
    }

    /// Checks `AddAutoColumnRewriter` against various sink-table schemas: it
    /// should append `now()` and/or the `0` placeholder, or fail with a
    /// helpful error message when the schema does not match expectations.
    #[tokio::test]
    async fn test_add_auto_column_rewriter() {
        // (input SQL, expected SQL or expected error substring, sink table columns)
        let testcases = vec![
            // add update_at
            (
                "SELECT number FROM numbers_with_ts",
                Ok("SELECT numbers_with_ts.number, now() FROM numbers_with_ts"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "ts",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                ],
            ),
            // add ts placeholder
            (
                "SELECT number FROM numbers_with_ts",
                Ok("SELECT numbers_with_ts.number, 0 FROM numbers_with_ts"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        AUTO_CREATED_PLACEHOLDER_TS_COL,
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                ],
            ),
            // no modification needed: column counts already match
            (
                "SELECT number, ts FROM numbers_with_ts",
                Ok("SELECT numbers_with_ts.number, numbers_with_ts.ts FROM numbers_with_ts"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "ts",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                ],
            ),
            // add update_at and ts placeholder
            (
                "SELECT number FROM numbers_with_ts",
                Ok("SELECT numbers_with_ts.number, now(), 0 FROM numbers_with_ts"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "update_at",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    ),
                    ColumnSchema::new(
                        AUTO_CREATED_PLACEHOLDER_TS_COL,
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                ],
            ),
            // add ts placeholder (update_at already provided by the query's ts)
            (
                "SELECT number, ts FROM numbers_with_ts",
                Ok("SELECT numbers_with_ts.number, numbers_with_ts.ts, 0 FROM numbers_with_ts"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "update_at",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    ),
                    ColumnSchema::new(
                        AUTO_CREATED_PLACEHOLDER_TS_COL,
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                ],
            ),
            // add update_at after time index column
            (
                "SELECT number, ts FROM numbers_with_ts",
                Ok("SELECT numbers_with_ts.number, numbers_with_ts.ts, now() FROM numbers_with_ts"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "ts",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                    ColumnSchema::new(
                        // name is irrelevant for update_at column
                        "update_atat",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    ),
                ],
            ),
            // error: last column datatype mismatch
            (
                "SELECT number, ts FROM numbers_with_ts",
                Err("Expect the last column in table to be timestamp column, found column atat with type Int8"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "ts",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                    ColumnSchema::new(
                        // name is irrelevant for update_at column
                        "atat",
                        ConcreteDataType::int8_datatype(),
                        false,
                    ),
                ],
            ),
            // error: datatype mismatch on second-to-last column
            (
                "SELECT number FROM numbers_with_ts",
                Err("Expect the second last column in the table to be timestamp column, found column ts with type Int8"),
                vec![
                    ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                    ColumnSchema::new(
                        "ts",
                        ConcreteDataType::int8_datatype(),
                        false,
                    ),
                    ColumnSchema::new(
                        // name is irrelevant for update_at column
                        "atat",
                        ConcreteDataType::timestamp_millisecond_datatype(),
                        false,
                    )
                    .with_time_index(true),
                ],
            ),
        ];

        let query_engine = create_test_query_engine();
        let ctx = QueryContext::arc();
        for (before, after, column_schemas) in testcases {
            let raw_schema = RawSchema::new(column_schemas);
            let mut add_auto_column_rewriter = AddAutoColumnRewriter::new(raw_schema);

            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), before, false)
                .await
                .unwrap();
            // Capture either the regenerated SQL or the stringified error.
            let new_sql = (|| {
                let plan = plan
                    .rewrite(&mut add_auto_column_rewriter)
                    .map_err(|e| e.to_string())?
                    .data;
                df_plan_to_sql(&plan).map_err(|e| e.to_string())
            })();
            match (after, new_sql.clone()) {
                (Ok(after), Ok(new_sql)) => assert_eq!(after, new_sql),
                (Err(expected), Err(real_err_msg)) => assert!(
                    real_err_msg.contains(expected),
                    "expected: {expected}, real: {real_err_msg}"
                ),
                _ => panic!("expected: {:?}, real: {:?}", after, new_sql),
            }
        }
    }

    /// Checks that `FindGroupByFinalName` discovers the final (possibly
    /// aliased) group-by expression names for a variety of plans.
    #[tokio::test]
    async fn test_find_group_by_exprs() {
        let testcases = vec![
            (
                "SELECT arrow_cast(date_bin(INTERVAL '1 MINS', numbers_with_ts.ts), 'Timestamp(Second, None)') AS ts FROM numbers_with_ts GROUP BY ts;",
                vec!["ts"]
            ),
            (
                "SELECT number FROM numbers_with_ts GROUP BY number",
                vec!["number"]
            ),
            (
                "SELECT date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window",
                vec!["time_window"]
            ),
            // subquery
            (
                "SELECT number, time_window FROM (SELECT number, date_bin('5 minutes', ts) as time_window FROM numbers_with_ts GROUP BY time_window, number);",
                vec!["time_window", "number"]
            ),
            // complex subquery without alias
            (
                "SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) GROUP BY number, time_window, bucket_name;",
                vec!["number", "time_window", "bucket_name"]
            ),
            // complex subquery with alias
            (
                "SELECT sum(number), number, date_bin('5 minutes', ts) as time_window, bucket_name FROM (SELECT number, ts, case when number < 5 THEN 'bucket_0_5' when number >= 5 THEN 'bucket_5_inf' END as bucket_name FROM numbers_with_ts) as cte GROUP BY number, time_window, bucket_name;",
                vec!["number", "time_window", "bucket_name"]
            )
        ];

        let query_engine = create_test_query_engine();
        let ctx = QueryContext::arc();
        for (sql, expected) in testcases {
            // plans need to stay unoptimized for better readability
            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
                .await
                .unwrap();
            let mut group_by_exprs = FindGroupByFinalName::default();
            plan.visit(&mut group_by_exprs).unwrap();
            let expected: HashSet<String> = expected.into_iter().map(|s| s.to_string()).collect();
            assert_eq!(
                expected,
                group_by_exprs.get_group_expr_names().unwrap_or_default()
            );
        }
    }
}
|
||||
@@ -86,7 +86,8 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
|
||||
|
||||
let schema = vec![
|
||||
datatypes::schema::ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false),
|
||||
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
];
|
||||
let mut columns = vec![];
|
||||
let numbers = (1..=10).collect_vec();
|
||||
@@ -114,6 +115,37 @@ pub fn create_test_query_engine() -> Arc<dyn QueryEngine> {
|
||||
};
|
||||
catalog_list.register_table_sync(req_with_ts).unwrap();
|
||||
|
||||
let schema = vec![
|
||||
datatypes::schema::ColumnSchema::new("NUMBER", CDT::uint32_datatype(), false),
|
||||
datatypes::schema::ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
];
|
||||
let mut columns = vec![];
|
||||
let numbers = (1..=10).collect_vec();
|
||||
let column: VectorRef = Arc::new(<u32 as Scalar>::VectorType::from_vec(numbers));
|
||||
columns.push(column);
|
||||
|
||||
let ts = (1..=10).collect_vec();
|
||||
let mut builder = TimestampMillisecondVectorBuilder::with_capacity(10);
|
||||
ts.into_iter()
|
||||
.map(|v| builder.push(Some(TimestampMillisecond::new(v))))
|
||||
.count();
|
||||
let column: VectorRef = builder.to_vector_cloned();
|
||||
columns.push(column);
|
||||
|
||||
let schema = Arc::new(Schema::new(schema));
|
||||
let recordbatch = common_recordbatch::RecordBatch::new(schema, columns).unwrap();
|
||||
let table = MemTable::table("UPPERCASE_NUMBERS_WITH_TS", recordbatch);
|
||||
|
||||
let req_with_ts = RegisterTableRequest {
|
||||
catalog: DEFAULT_CATALOG_NAME.to_string(),
|
||||
schema: DEFAULT_SCHEMA_NAME.to_string(),
|
||||
table_name: "UPPERCASE_NUMBERS_WITH_TS".to_string(),
|
||||
table_id: 1025,
|
||||
table,
|
||||
};
|
||||
catalog_list.register_table_sync(req_with_ts).unwrap();
|
||||
|
||||
let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
|
||||
|
||||
let engine = factory.query_engine();
|
||||
|
||||
@@ -12,17 +12,26 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_config::config::Configurable;
|
||||
use common_options::datanode::DatanodeClientOptions;
|
||||
use common_telemetry::logging::{LoggingOptions, TracingOptions};
|
||||
use meta_client::MetaClientOptions;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use servers::export_metrics::ExportMetricsOption;
|
||||
use servers::export_metrics::{ExportMetricsOption, ExportMetricsTask};
|
||||
use servers::grpc::GrpcOptions;
|
||||
use servers::heartbeat_options::HeartbeatOptions;
|
||||
use servers::http::HttpOptions;
|
||||
use servers::server::ServerHandlers;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error;
|
||||
use crate::error::Result;
|
||||
use crate::heartbeat::HeartbeatTask;
|
||||
use crate::instance::prom_store::ExportMetricHandler;
|
||||
use crate::instance::Instance;
|
||||
use crate::service_config::{
|
||||
InfluxdbOptions, JaegerOptions, MysqlOptions, OpentsdbOptions, OtlpOptions, PostgresOptions,
|
||||
PromStoreOptions,
|
||||
@@ -84,6 +93,50 @@ impl Configurable for FrontendOptions {
|
||||
}
|
||||
}
|
||||
|
||||
/// The [`Frontend`] struct is the main entry point for the frontend service
|
||||
/// which contains server handlers, frontend instance and some background tasks.
|
||||
pub struct Frontend {
|
||||
pub instance: Arc<Instance>,
|
||||
pub servers: ServerHandlers,
|
||||
pub heartbeat_task: Option<HeartbeatTask>,
|
||||
pub export_metrics_task: Option<ExportMetricsTask>,
|
||||
}
|
||||
|
||||
impl Frontend {
|
||||
pub async fn start(&self) -> Result<()> {
|
||||
if let Some(t) = &self.heartbeat_task {
|
||||
t.start().await?;
|
||||
}
|
||||
|
||||
if let Some(t) = self.export_metrics_task.as_ref() {
|
||||
if t.send_by_handler {
|
||||
let inserter = self.instance.inserter().clone();
|
||||
let statement_executor = self.instance.statement_executor().clone();
|
||||
let handler = ExportMetricHandler::new_handler(inserter, statement_executor);
|
||||
t.start(Some(handler)).context(error::StartServerSnafu)?
|
||||
} else {
|
||||
t.start(None).context(error::StartServerSnafu)?;
|
||||
}
|
||||
}
|
||||
|
||||
self.servers
|
||||
.start_all()
|
||||
.await
|
||||
.context(error::StartServerSnafu)
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) -> Result<()> {
|
||||
self.servers
|
||||
.shutdown_all()
|
||||
.await
|
||||
.context(error::ShutdownServerSnafu)
|
||||
}
|
||||
|
||||
pub fn server_handlers(&self) -> &ServerHandlers {
|
||||
&self.servers
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -20,7 +20,7 @@ mod log_handler;
|
||||
mod logs;
|
||||
mod opentsdb;
|
||||
mod otlp;
|
||||
mod prom_store;
|
||||
pub mod prom_store;
|
||||
mod promql;
|
||||
mod region_query;
|
||||
pub mod standalone;
|
||||
@@ -47,7 +47,7 @@ use datafusion_expr::LogicalPlan;
|
||||
use log_store::raft_engine::RaftEngineBackend;
|
||||
use operator::delete::DeleterRef;
|
||||
use operator::insert::InserterRef;
|
||||
use operator::statement::StatementExecutor;
|
||||
use operator::statement::{StatementExecutor, StatementExecutorRef};
|
||||
use pipeline::pipeline_operator::PipelineOperator;
|
||||
use prometheus::HistogramTimer;
|
||||
use promql_parser::label::Matcher;
|
||||
@@ -59,18 +59,11 @@ use query::stats::StatementStatistics;
|
||||
use query::QueryEngineRef;
|
||||
use servers::error as server_error;
|
||||
use servers::error::{AuthSnafu, ExecuteQuerySnafu, ParsePromQLSnafu};
|
||||
use servers::export_metrics::ExportMetricsTask;
|
||||
use servers::interceptor::{
|
||||
PromQueryInterceptor, PromQueryInterceptorRef, SqlQueryInterceptor, SqlQueryInterceptorRef,
|
||||
};
|
||||
use servers::prometheus_handler::PrometheusHandler;
|
||||
use servers::query_handler::grpc::GrpcQueryHandler;
|
||||
use servers::query_handler::sql::SqlQueryHandler;
|
||||
use servers::query_handler::{
|
||||
InfluxdbLineProtocolHandler, JaegerQueryHandler, LogQueryHandler, OpenTelemetryProtocolHandler,
|
||||
OpentsdbProtocolHandler, PipelineHandler, PromStoreProtocolHandler,
|
||||
};
|
||||
use servers::server::ServerHandlers;
|
||||
use session::context::QueryContextRef;
|
||||
use session::table_name::table_idents_to_full_name;
|
||||
use snafu::prelude::*;
|
||||
@@ -81,50 +74,25 @@ use sql::statements::statement::Statement;
|
||||
use sqlparser::ast::ObjectName;
|
||||
pub use standalone::StandaloneDatanodeManager;
|
||||
|
||||
use self::prom_store::ExportMetricHandler;
|
||||
use crate::error::{
|
||||
self, Error, ExecLogicalPlanSnafu, ExecutePromqlSnafu, ExternalSnafu, InvalidSqlSnafu,
|
||||
ParseSqlSnafu, PermissionSnafu, PlanStatementSnafu, Result, SqlExecInterceptedSnafu,
|
||||
StartServerSnafu, TableOperationSnafu,
|
||||
TableOperationSnafu,
|
||||
};
|
||||
use crate::frontend::FrontendOptions;
|
||||
use crate::heartbeat::HeartbeatTask;
|
||||
use crate::limiter::LimiterRef;
|
||||
|
||||
#[async_trait]
|
||||
pub trait FrontendInstance:
|
||||
GrpcQueryHandler<Error = Error>
|
||||
+ SqlQueryHandler<Error = Error>
|
||||
+ OpentsdbProtocolHandler
|
||||
+ InfluxdbLineProtocolHandler
|
||||
+ PromStoreProtocolHandler
|
||||
+ OpenTelemetryProtocolHandler
|
||||
+ PrometheusHandler
|
||||
+ PipelineHandler
|
||||
+ LogQueryHandler
|
||||
+ JaegerQueryHandler
|
||||
+ Send
|
||||
+ Sync
|
||||
+ 'static
|
||||
{
|
||||
async fn start(&self) -> Result<()>;
|
||||
}
|
||||
|
||||
pub type FrontendInstanceRef = Arc<dyn FrontendInstance>;
|
||||
|
||||
/// The frontend instance contains necessary components, and implements many
|
||||
/// traits, like [`servers::query_handler::grpc::GrpcQueryHandler`],
|
||||
/// [`servers::query_handler::sql::SqlQueryHandler`], etc.
|
||||
#[derive(Clone)]
|
||||
pub struct Instance {
|
||||
options: FrontendOptions,
|
||||
catalog_manager: CatalogManagerRef,
|
||||
pipeline_operator: Arc<PipelineOperator>,
|
||||
statement_executor: Arc<StatementExecutor>,
|
||||
query_engine: QueryEngineRef,
|
||||
plugins: Plugins,
|
||||
servers: ServerHandlers,
|
||||
heartbeat_task: Option<HeartbeatTask>,
|
||||
inserter: InserterRef,
|
||||
deleter: DeleterRef,
|
||||
export_metrics_task: Option<ExportMetricsTask>,
|
||||
table_metadata_manager: TableMetadataManagerRef,
|
||||
stats: StatementStatistics,
|
||||
limiter: Option<LimiterRef>,
|
||||
@@ -157,15 +125,6 @@ impl Instance {
|
||||
Ok((kv_backend, procedure_manager))
|
||||
}
|
||||
|
||||
pub fn build_servers(&mut self, servers: ServerHandlers) -> Result<()> {
|
||||
self.export_metrics_task =
|
||||
ExportMetricsTask::try_new(&self.options.export_metrics, Some(&self.plugins))
|
||||
.context(StartServerSnafu)?;
|
||||
|
||||
self.servers = servers;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn catalog_manager(&self) -> &CatalogManagerRef {
|
||||
&self.catalog_manager
|
||||
}
|
||||
@@ -174,50 +133,20 @@ impl Instance {
|
||||
&self.query_engine
|
||||
}
|
||||
|
||||
pub fn plugins(&self) -> Plugins {
|
||||
self.plugins.clone()
|
||||
pub fn plugins(&self) -> &Plugins {
|
||||
&self.plugins
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) -> Result<()> {
|
||||
self.servers
|
||||
.shutdown_all()
|
||||
.await
|
||||
.context(error::ShutdownServerSnafu)
|
||||
}
|
||||
|
||||
pub fn server_handlers(&self) -> &ServerHandlers {
|
||||
&self.servers
|
||||
}
|
||||
|
||||
pub fn statement_executor(&self) -> Arc<StatementExecutor> {
|
||||
self.statement_executor.clone()
|
||||
pub fn statement_executor(&self) -> &StatementExecutorRef {
|
||||
&self.statement_executor
|
||||
}
|
||||
|
||||
pub fn table_metadata_manager(&self) -> &TableMetadataManagerRef {
|
||||
&self.table_metadata_manager
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl FrontendInstance for Instance {
|
||||
async fn start(&self) -> Result<()> {
|
||||
if let Some(heartbeat_task) = &self.heartbeat_task {
|
||||
heartbeat_task.start().await?;
|
||||
}
|
||||
|
||||
if let Some(t) = self.export_metrics_task.as_ref() {
|
||||
if t.send_by_handler {
|
||||
let handler = ExportMetricHandler::new_handler(
|
||||
self.inserter.clone(),
|
||||
self.statement_executor.clone(),
|
||||
);
|
||||
t.start(Some(handler)).context(StartServerSnafu)?
|
||||
} else {
|
||||
t.start(None).context(StartServerSnafu)?;
|
||||
}
|
||||
}
|
||||
|
||||
self.servers.start_all().await.context(StartServerSnafu)
|
||||
pub fn inserter(&self) -> &InserterRef {
|
||||
&self.inserter
|
||||
}
|
||||
}
|
||||
|
||||
@@ -595,6 +524,9 @@ pub fn check_permission(
|
||||
Statement::ShowIndex(stmt) => {
|
||||
validate_db_permission!(stmt, query_ctx);
|
||||
}
|
||||
Statement::ShowRegion(stmt) => {
|
||||
validate_db_permission!(stmt, query_ctx);
|
||||
}
|
||||
Statement::ShowViews(stmt) => {
|
||||
validate_db_permission!(stmt, query_ctx);
|
||||
}
|
||||
|
||||
@@ -35,12 +35,10 @@ use partition::manager::PartitionRuleManager;
|
||||
use pipeline::pipeline_operator::PipelineOperator;
|
||||
use query::stats::StatementStatistics;
|
||||
use query::QueryEngineFactory;
|
||||
use servers::server::ServerHandlers;
|
||||
use snafu::OptionExt;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::frontend::FrontendOptions;
|
||||
use crate::heartbeat::HeartbeatTask;
|
||||
use crate::instance::region_query::FrontendRegionQueryHandler;
|
||||
use crate::instance::Instance;
|
||||
use crate::limiter::Limiter;
|
||||
@@ -55,7 +53,6 @@ pub struct FrontendBuilder {
|
||||
node_manager: NodeManagerRef,
|
||||
plugins: Option<Plugins>,
|
||||
procedure_executor: ProcedureExecutorRef,
|
||||
heartbeat_task: Option<HeartbeatTask>,
|
||||
stats: StatementStatistics,
|
||||
}
|
||||
|
||||
@@ -78,7 +75,6 @@ impl FrontendBuilder {
|
||||
node_manager,
|
||||
plugins: None,
|
||||
procedure_executor,
|
||||
heartbeat_task: None,
|
||||
stats,
|
||||
}
|
||||
}
|
||||
@@ -97,13 +93,6 @@ impl FrontendBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_heartbeat_task(self, heartbeat_task: HeartbeatTask) -> Self {
|
||||
Self {
|
||||
heartbeat_task: Some(heartbeat_task),
|
||||
..self
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn try_build(self) -> Result<Instance> {
|
||||
let kv_backend = self.kv_backend;
|
||||
let node_manager = self.node_manager;
|
||||
@@ -202,17 +191,13 @@ impl FrontendBuilder {
|
||||
});
|
||||
|
||||
Ok(Instance {
|
||||
options: self.options,
|
||||
catalog_manager: self.catalog_manager,
|
||||
pipeline_operator,
|
||||
statement_executor,
|
||||
query_engine,
|
||||
plugins,
|
||||
servers: ServerHandlers::default(),
|
||||
heartbeat_task: self.heartbeat_task,
|
||||
inserter,
|
||||
deleter,
|
||||
export_metrics_task: None,
|
||||
table_metadata_manager: Arc::new(TableMetadataManager::new(kv_backend)),
|
||||
stats: self.stats,
|
||||
limiter,
|
||||
|
||||
@@ -20,7 +20,7 @@ use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
|
||||
use client::Output;
|
||||
use common_error::ext::BoxedError;
|
||||
use pipeline::pipeline_operator::PipelineOperator;
|
||||
use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion};
|
||||
use pipeline::{Pipeline, PipelineInfo, PipelineVersion};
|
||||
use servers::error::{
|
||||
AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, InFlightWriteBytesExceededSnafu,
|
||||
PipelineSnafu, Result as ServerResult,
|
||||
@@ -56,7 +56,7 @@ impl PipelineHandler for Instance {
|
||||
name: &str,
|
||||
version: PipelineVersion,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> ServerResult<Arc<Pipeline<GreptimeTransformer>>> {
|
||||
) -> ServerResult<Arc<Pipeline>> {
|
||||
self.pipeline_operator
|
||||
.get_pipeline(query_ctx, name, version)
|
||||
.await
|
||||
@@ -100,7 +100,7 @@ impl PipelineHandler for Instance {
|
||||
.await
|
||||
}
|
||||
|
||||
fn build_pipeline(&self, pipeline: &str) -> ServerResult<Pipeline<GreptimeTransformer>> {
|
||||
fn build_pipeline(&self, pipeline: &str) -> ServerResult<Pipeline> {
|
||||
PipelineOperator::build_pipeline(pipeline).context(PipelineSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,26 +36,24 @@ use snafu::ResultExt;
|
||||
|
||||
use crate::error::{self, Result, StartServerSnafu, TomlFormatSnafu};
|
||||
use crate::frontend::FrontendOptions;
|
||||
use crate::instance::FrontendInstance;
|
||||
use crate::instance::Instance;
|
||||
|
||||
pub struct Services<T, U>
|
||||
pub struct Services<T>
|
||||
where
|
||||
T: Into<FrontendOptions> + Configurable + Clone,
|
||||
U: FrontendInstance,
|
||||
{
|
||||
opts: T,
|
||||
instance: Arc<U>,
|
||||
instance: Arc<Instance>,
|
||||
grpc_server_builder: Option<GrpcServerBuilder>,
|
||||
http_server_builder: Option<HttpServerBuilder>,
|
||||
plugins: Plugins,
|
||||
}
|
||||
|
||||
impl<T, U> Services<T, U>
|
||||
impl<T> Services<T>
|
||||
where
|
||||
T: Into<FrontendOptions> + Configurable + Clone,
|
||||
U: FrontendInstance,
|
||||
{
|
||||
pub fn new(opts: T, instance: Arc<U>, plugins: Plugins) -> Self {
|
||||
pub fn new(opts: T, instance: Arc<Instance>, plugins: Plugins) -> Self {
|
||||
Self {
|
||||
opts,
|
||||
instance,
|
||||
|
||||
@@ -67,7 +67,7 @@ impl BloomFilterApplier {
|
||||
for ((_, mut group), bloom) in locs
|
||||
.iter()
|
||||
.zip(start_seg..end_seg)
|
||||
.group_by(|(x, _)| **x)
|
||||
.chunk_by(|(x, _)| **x)
|
||||
.into_iter()
|
||||
.zip(bfs.iter())
|
||||
{
|
||||
|
||||
@@ -91,7 +91,8 @@ impl FulltextIndexCreator for BloomFilterFulltextIndexCreator {
|
||||
|
||||
let (index_finish, puffin_add_blob) = futures::join!(
|
||||
creator.finish(tx.compat_write()),
|
||||
puffin_writer.put_blob(blob_key, rx.compat(), put_options)
|
||||
// TODO(zhongzc): add fulltext config properties
|
||||
puffin_writer.put_blob(blob_key, rx.compat(), put_options, Default::default())
|
||||
);
|
||||
|
||||
match (
|
||||
|
||||
@@ -164,6 +164,8 @@ impl FulltextIndexCreator for TantivyFulltextIndexCreator {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use common_test_util::temp_dir::create_temp_dir;
|
||||
use futures::AsyncRead;
|
||||
use tantivy::collector::DocSetCollector;
|
||||
@@ -182,6 +184,7 @@ mod tests {
|
||||
_key: &str,
|
||||
_raw_data: R,
|
||||
_options: PutOptions,
|
||||
_properties: HashMap<String, String>,
|
||||
) -> puffin::error::Result<u64>
|
||||
where
|
||||
R: AsyncRead + Send,
|
||||
|
||||
@@ -437,9 +437,9 @@ mod tests {
|
||||
}
|
||||
|
||||
fn random_option_bytes(size: usize) -> Option<Vec<u8>> {
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
if rng.gen() {
|
||||
if rng.random() {
|
||||
let mut buffer = vec![0u8; size];
|
||||
rng.fill(&mut buffer[..]);
|
||||
Some(buffer)
|
||||
@@ -469,11 +469,11 @@ mod tests {
|
||||
segment_row_count: usize,
|
||||
) -> (DictionaryValues, ValueSegIds) {
|
||||
let mut n = row_count;
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let mut dic_values = Vec::new();
|
||||
|
||||
while n > 0 {
|
||||
let size = rng.gen_range(1..=n);
|
||||
let size = rng.random_range(1..=n);
|
||||
let value = random_option_bytes(100);
|
||||
dic_values.push((value, size));
|
||||
n -= size;
|
||||
|
||||
@@ -535,7 +535,7 @@ mod tests {
|
||||
.flatten()
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
all_entries.shuffle(&mut rand::thread_rng());
|
||||
all_entries.shuffle(&mut rand::rng());
|
||||
|
||||
let response = logstore.append_batch(all_entries.clone()).await.unwrap();
|
||||
// 5 region
|
||||
@@ -575,7 +575,7 @@ mod tests {
|
||||
warn!("The endpoints is empty, skipping the test 'test_append_batch_basic_large'");
|
||||
return;
|
||||
};
|
||||
let data_size_kb = rand::thread_rng().gen_range(9..31usize);
|
||||
let data_size_kb = rand::rng().random_range(9..31usize);
|
||||
info!("Entry size: {}Ki", data_size_kb);
|
||||
let broker_endpoints = broker_endpoints
|
||||
.split(',')
|
||||
@@ -608,7 +608,7 @@ mod tests {
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
assert_matches!(all_entries[0], Entry::MultiplePart(_));
|
||||
all_entries.shuffle(&mut rand::thread_rng());
|
||||
all_entries.shuffle(&mut rand::rng());
|
||||
|
||||
let response = logstore.append_batch(all_entries.clone()).await.unwrap();
|
||||
// 5 region
|
||||
|
||||
@@ -24,6 +24,10 @@ use tokio::sync::mpsc::{self, Receiver, Sender};
|
||||
use crate::error::{self, Result};
|
||||
use crate::kafka::index::IndexCollector;
|
||||
use crate::kafka::worker::{BackgroundProducerWorker, ProduceResultHandle, WorkerRequest};
|
||||
use crate::metrics::{
|
||||
METRIC_KAFKA_CLIENT_BYTES_TOTAL, METRIC_KAFKA_CLIENT_PRODUCE_ELAPSED,
|
||||
METRIC_KAFKA_CLIENT_TRAFFIC_TOTAL,
|
||||
};
|
||||
|
||||
pub type OrderedBatchProducerRef = Arc<OrderedBatchProducer>;
|
||||
|
||||
@@ -106,6 +110,18 @@ impl ProducerClient for PartitionClient {
|
||||
records: Vec<Record>,
|
||||
compression: Compression,
|
||||
) -> rskafka::client::error::Result<Vec<i64>> {
|
||||
let total_size = records.iter().map(|r| r.approximate_size()).sum::<usize>();
|
||||
let partition = self.partition().to_string();
|
||||
METRIC_KAFKA_CLIENT_BYTES_TOTAL
|
||||
.with_label_values(&[self.topic(), &partition])
|
||||
.inc_by(total_size as u64);
|
||||
METRIC_KAFKA_CLIENT_TRAFFIC_TOTAL
|
||||
.with_label_values(&[self.topic(), &partition])
|
||||
.inc();
|
||||
let _timer = METRIC_KAFKA_CLIENT_PRODUCE_ELAPSED
|
||||
.with_label_values(&[self.topic(), &partition])
|
||||
.start_timer();
|
||||
|
||||
self.produce(records, compression).await
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,10 @@ use prometheus::*;
|
||||
pub const LOGSTORE_LABEL: &str = "logstore";
|
||||
/// Operation type label.
|
||||
pub const OPTYPE_LABEL: &str = "optype";
|
||||
/// Kafka topic label.
|
||||
pub const TOPIC_LABEL: &str = "topic";
|
||||
/// Kafka partition label.
|
||||
pub const PARTITION_LABEL: &str = "partition";
|
||||
|
||||
lazy_static! {
|
||||
/// Counters of bytes of each operation on a logstore.
|
||||
@@ -62,4 +66,23 @@ lazy_static! {
|
||||
/// Timer of the append_batch operation on the raft-engine logstore.
|
||||
/// This timer only measures the duration of the read operation, not measures the total duration of replay.
|
||||
pub static ref METRIC_RAFT_ENGINE_READ_ELAPSED: Histogram = METRIC_LOGSTORE_OP_ELAPSED.with_label_values(&["raft-engine", "read"]);
|
||||
|
||||
pub static ref METRIC_KAFKA_CLIENT_BYTES_TOTAL: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_logstore_kafka_client_bytes_total",
|
||||
"kafka logstore's bytes traffic total",
|
||||
&[LOGSTORE_LABEL, PARTITION_LABEL],
|
||||
)
|
||||
.unwrap();
|
||||
pub static ref METRIC_KAFKA_CLIENT_TRAFFIC_TOTAL: IntCounterVec = register_int_counter_vec!(
|
||||
"greptime_logstore_kafka_client_traffic_total",
|
||||
"kafka logstore's request count traffic total",
|
||||
&[LOGSTORE_LABEL, PARTITION_LABEL],
|
||||
)
|
||||
.unwrap();
|
||||
pub static ref METRIC_KAFKA_CLIENT_PRODUCE_ELAPSED: HistogramVec = register_histogram_vec!(
|
||||
"greptime_logstore_kafka_client_produce_elapsed",
|
||||
"kafka logstore produce operation elapsed",
|
||||
&[LOGSTORE_LABEL, PARTITION_LABEL],
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ workspace = true
|
||||
[dependencies]
|
||||
api.workspace = true
|
||||
async-trait.workspace = true
|
||||
common-base.workspace = true
|
||||
common-error.workspace = true
|
||||
common-grpc.workspace = true
|
||||
common-macro.workspace = true
|
||||
|
||||
@@ -21,9 +21,11 @@ mod cluster;
|
||||
mod store;
|
||||
mod util;
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
|
||||
use api::v1::meta::{ProcedureDetailResponse, Role};
|
||||
pub use ask_leader::AskLeader;
|
||||
use cluster::Client as ClusterClient;
|
||||
pub use cluster::ClusterKvBackend;
|
||||
use common_error::ext::BoxedError;
|
||||
@@ -33,13 +35,16 @@ use common_meta::cluster::{
|
||||
};
|
||||
use common_meta::datanode::{DatanodeStatKey, DatanodeStatValue, RegionStat};
|
||||
use common_meta::ddl::{ExecutorContext, ProcedureExecutor};
|
||||
use common_meta::error::{self as meta_error, ExternalSnafu, Result as MetaResult};
|
||||
use common_meta::error::{
|
||||
self as meta_error, ExternalSnafu, Result as MetaResult, UnsupportedSnafu,
|
||||
};
|
||||
use common_meta::key::flow::flow_state::{FlowStat, FlowStateManager};
|
||||
use common_meta::kv_backend::KvBackendRef;
|
||||
use common_meta::range_stream::PaginationStream;
|
||||
use common_meta::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse};
|
||||
use common_meta::rpc::procedure::{
|
||||
MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
|
||||
AddRegionFollowerRequest, MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
|
||||
RemoveRegionFollowerRequest,
|
||||
};
|
||||
use common_meta::rpc::store::{
|
||||
BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest,
|
||||
@@ -74,6 +79,7 @@ pub struct MetaClientBuilder {
|
||||
enable_store: bool,
|
||||
enable_procedure: bool,
|
||||
enable_access_cluster_info: bool,
|
||||
region_follower: Option<RegionFollowerClientRef>,
|
||||
channel_manager: Option<ChannelManager>,
|
||||
ddl_channel_manager: Option<ChannelManager>,
|
||||
heartbeat_channel_manager: Option<ChannelManager>,
|
||||
@@ -162,6 +168,13 @@ impl MetaClientBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_region_follower(self, region_follower: RegionFollowerClientRef) -> Self {
|
||||
Self {
|
||||
region_follower: Some(region_follower),
|
||||
..self
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> MetaClient {
|
||||
let mut client = if let Some(mgr) = self.channel_manager {
|
||||
MetaClient::with_channel_manager(self.id, mgr)
|
||||
@@ -204,6 +217,10 @@ impl MetaClientBuilder {
|
||||
))
|
||||
}
|
||||
|
||||
if let Some(region_follower) = self.region_follower {
|
||||
client.region_follower = Some(region_follower);
|
||||
}
|
||||
|
||||
client
|
||||
}
|
||||
}
|
||||
@@ -216,6 +233,19 @@ pub struct MetaClient {
|
||||
store: Option<StoreClient>,
|
||||
procedure: Option<ProcedureClient>,
|
||||
cluster: Option<ClusterClient>,
|
||||
region_follower: Option<RegionFollowerClientRef>,
|
||||
}
|
||||
|
||||
pub type RegionFollowerClientRef = Arc<dyn RegionFollowerClient>;
|
||||
|
||||
/// A trait for clients that can manage region followers.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RegionFollowerClient: Sync + Send + Debug {
|
||||
async fn add_region_follower(&self, request: AddRegionFollowerRequest) -> Result<()>;
|
||||
|
||||
async fn remove_region_follower(&self, request: RemoveRegionFollowerRequest) -> Result<()>;
|
||||
|
||||
async fn start(&self, urls: &[&str]) -> Result<()>;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -242,6 +272,44 @@ impl ProcedureExecutor for MetaClient {
|
||||
.context(meta_error::ExternalSnafu)
|
||||
}
|
||||
|
||||
async fn add_region_follower(
|
||||
&self,
|
||||
_ctx: &ExecutorContext,
|
||||
request: AddRegionFollowerRequest,
|
||||
) -> MetaResult<()> {
|
||||
if let Some(region_follower) = &self.region_follower {
|
||||
region_follower
|
||||
.add_region_follower(request)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(meta_error::ExternalSnafu)
|
||||
} else {
|
||||
UnsupportedSnafu {
|
||||
operation: "add_region_follower",
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
}
|
||||
|
||||
async fn remove_region_follower(
|
||||
&self,
|
||||
_ctx: &ExecutorContext,
|
||||
request: RemoveRegionFollowerRequest,
|
||||
) -> MetaResult<()> {
|
||||
if let Some(region_follower) = &self.region_follower {
|
||||
region_follower
|
||||
.remove_region_follower(request)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(meta_error::ExternalSnafu)
|
||||
} else {
|
||||
UnsupportedSnafu {
|
||||
operation: "remove_region_follower",
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
}
|
||||
|
||||
async fn query_procedure_state(
|
||||
&self,
|
||||
_ctx: &ExecutorContext,
|
||||
@@ -335,6 +403,15 @@ impl ClusterInfo for MetaClient {
|
||||
|
||||
Ok(region_stats)
|
||||
}
|
||||
|
||||
async fn list_flow_stats(&self) -> Result<Option<FlowStat>> {
|
||||
let cluster_backend = ClusterKvBackend::new(Arc::new(self.cluster_client()?));
|
||||
let cluster_backend = Arc::new(cluster_backend) as KvBackendRef;
|
||||
let flow_state_manager = FlowStateManager::new(cluster_backend);
|
||||
let res = flow_state_manager.get().await.context(GetFlowStatSnafu)?;
|
||||
|
||||
Ok(res.map(|r| r.into()))
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_stats(kv: KeyValue) -> MetaResult<DatanodeStatValue> {
|
||||
@@ -344,15 +421,6 @@ fn decode_stats(kv: KeyValue) -> MetaResult<DatanodeStatValue> {
|
||||
}
|
||||
|
||||
impl MetaClient {
|
||||
pub async fn list_flow_stats(&self) -> Result<Option<FlowStat>> {
|
||||
let cluster_backend = ClusterKvBackend::new(Arc::new(self.cluster_client()?));
|
||||
let cluster_backend = Arc::new(cluster_backend) as KvBackendRef;
|
||||
let flow_state_manager = FlowStateManager::new(cluster_backend);
|
||||
let res = flow_state_manager.get().await.context(GetFlowStatSnafu)?;
|
||||
|
||||
Ok(res.map(|r| r.into()))
|
||||
}
|
||||
|
||||
pub fn new(id: Id) -> Self {
|
||||
Self {
|
||||
id,
|
||||
@@ -375,6 +443,11 @@ impl MetaClient {
|
||||
{
|
||||
info!("MetaClient channel config: {:?}", self.channel_config());
|
||||
|
||||
if let Some(client) = &mut self.region_follower {
|
||||
let urls = urls.as_ref().iter().map(|u| u.as_ref()).collect::<Vec<_>>();
|
||||
client.start(&urls).await?;
|
||||
info!("Region follower client started");
|
||||
}
|
||||
if let Some(client) = &mut self.heartbeat {
|
||||
client.start(urls.clone()).await?;
|
||||
info!("Heartbeat client started");
|
||||
@@ -985,11 +1058,11 @@ mod tests {
|
||||
let tx = new_client("test_cluster_client").await;
|
||||
let in_memory = tx.in_memory().unwrap();
|
||||
let cluster_client = tx.client.cluster_client().unwrap();
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
|
||||
// Generates rough 10MB data, which is larger than the default grpc message size limit.
|
||||
for i in 0..10 {
|
||||
let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.gen()).collect();
|
||||
let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random()).collect();
|
||||
in_memory
|
||||
.put(
|
||||
PutRequest::new()
|
||||
|
||||
@@ -75,7 +75,7 @@ impl AskLeader {
|
||||
let leadership_group = self.leadership_group.read().unwrap();
|
||||
leadership_group.peers.clone()
|
||||
};
|
||||
peers.shuffle(&mut rand::thread_rng());
|
||||
peers.shuffle(&mut rand::rng());
|
||||
|
||||
let req = AskLeaderRequest {
|
||||
header: Some(RequestHeader::new(
|
||||
|
||||
@@ -22,8 +22,8 @@ where
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut rng = rand::thread_rng();
|
||||
let i = rng.gen_range(0..len);
|
||||
let mut rng = rand::rng();
|
||||
let i = rng.random_range(0..len);
|
||||
|
||||
func(i)
|
||||
}
|
||||
|
||||
@@ -15,8 +15,10 @@
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use client::RegionFollowerClientRef;
|
||||
use common_base::Plugins;
|
||||
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
|
||||
use common_telemetry::info;
|
||||
use common_telemetry::{debug, info};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::client::MetaClientBuilder;
|
||||
@@ -73,6 +75,7 @@ pub type MetaClientRef = Arc<client::MetaClient>;
|
||||
pub async fn create_meta_client(
|
||||
client_type: MetaClientType,
|
||||
meta_client_options: &MetaClientOptions,
|
||||
plugins: Option<&Plugins>,
|
||||
) -> error::Result<MetaClientRef> {
|
||||
info!(
|
||||
"Creating {:?} instance with Metasrv addrs {:?}",
|
||||
@@ -98,6 +101,13 @@ pub async fn create_meta_client(
|
||||
if let MetaClientType::Frontend = client_type {
|
||||
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);
|
||||
builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config));
|
||||
if let Some(plugins) = plugins {
|
||||
let region_follower = plugins.get::<RegionFollowerClientRef>();
|
||||
if let Some(region_follower) = region_follower {
|
||||
debug!("Region follower client found in plugins");
|
||||
builder = builder.with_region_follower(region_follower);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
builder = builder
|
||||
|
||||
@@ -20,6 +20,8 @@ use api::v1::meta::procedure_service_server::ProcedureServiceServer;
|
||||
use api::v1::meta::store_server::StoreServer;
|
||||
use common_base::Plugins;
|
||||
use common_config::Configurable;
|
||||
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
|
||||
use common_meta::distributed_time_constants::META_LEASE_SECS;
|
||||
use common_meta::kv_backend::chroot::ChrootKvBackend;
|
||||
use common_meta::kv_backend::etcd::EtcdStore;
|
||||
use common_meta::kv_backend::memory::MemoryKvBackend;
|
||||
@@ -249,6 +251,7 @@ pub async fn metasrv_builder(
|
||||
election_client,
|
||||
opts.store_key_prefix.clone(),
|
||||
CANDIDATE_LEASE_SECS,
|
||||
META_LEASE_SECS,
|
||||
&opts.meta_table_name,
|
||||
opts.meta_election_lock_id,
|
||||
)
|
||||
@@ -270,6 +273,7 @@ pub async fn metasrv_builder(
|
||||
election_client,
|
||||
opts.store_key_prefix.clone(),
|
||||
CANDIDATE_LEASE_SECS,
|
||||
META_LEASE_SECS,
|
||||
&election_table_name,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -16,7 +16,6 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
|
||||
use common_telemetry::{error, warn};
|
||||
use common_time::Timestamp;
|
||||
use itertools::Itertools;
|
||||
@@ -25,7 +24,7 @@ use sqlx::mysql::{MySqlArguments, MySqlRow};
|
||||
use sqlx::query::Query;
|
||||
use sqlx::{MySql, MySqlConnection, MySqlTransaction, Row};
|
||||
use tokio::sync::{broadcast, Mutex, MutexGuard};
|
||||
use tokio::time::{Interval, MissedTickBehavior};
|
||||
use tokio::time::MissedTickBehavior;
|
||||
|
||||
use crate::election::{
|
||||
listen_leader_change, Election, LeaderChangeMessage, LeaderKey, CANDIDATES_ROOT, ELECTION_KEY,
|
||||
@@ -41,7 +40,7 @@ const LEASE_SEP: &str = r#"||__metadata_lease_sep||"#;
|
||||
|
||||
/// Lease information.
|
||||
/// TODO(CookiePie): PgElection can also use this struct. Refactor it to a common module.
|
||||
#[derive(Default, Clone)]
|
||||
#[derive(Default, Clone, Debug)]
|
||||
struct Lease {
|
||||
leader_value: String,
|
||||
expire_time: Timestamp,
|
||||
@@ -52,6 +51,7 @@ struct Lease {
|
||||
|
||||
struct ElectionSqlFactory<'a> {
|
||||
table_name: &'a str,
|
||||
meta_lease_ttl_secs: u64,
|
||||
}
|
||||
|
||||
struct ElectionSqlSet {
|
||||
@@ -99,8 +99,11 @@ struct ElectionSqlSet {
|
||||
}
|
||||
|
||||
impl<'a> ElectionSqlFactory<'a> {
|
||||
fn new(table_name: &'a str) -> Self {
|
||||
Self { table_name }
|
||||
fn new(table_name: &'a str, meta_lease_ttl_secs: u64) -> Self {
|
||||
Self {
|
||||
table_name,
|
||||
meta_lease_ttl_secs,
|
||||
}
|
||||
}
|
||||
|
||||
fn build(self) -> ElectionSqlSet {
|
||||
@@ -117,7 +120,10 @@ impl<'a> ElectionSqlFactory<'a> {
|
||||
// Currently the session timeout is longer than the leader lease time.
|
||||
// So the leader will renew the lease twice before the session timeout if everything goes well.
|
||||
fn set_idle_session_timeout_sql(&self) -> String {
|
||||
format!("SET SESSION wait_timeout = {};", META_LEASE_SECS + 1)
|
||||
format!(
|
||||
"SET SESSION wait_timeout = {};",
|
||||
self.meta_lease_ttl_secs + 1
|
||||
)
|
||||
}
|
||||
|
||||
fn set_lock_wait_timeout_sql(&self) -> &str {
|
||||
@@ -147,6 +153,8 @@ impl<'a> ElectionSqlFactory<'a> {
|
||||
"SELECT @@version;"
|
||||
}
|
||||
|
||||
/// Use `SELECT FOR UPDATE` to lock for compatibility with other MySQL-compatible databases
|
||||
/// instead of directly using `GET_LOCK`.
|
||||
fn campaign_sql(&self) -> String {
|
||||
format!("SELECT * FROM `{}` FOR UPDATE;", self.table_name)
|
||||
}
|
||||
@@ -315,6 +323,7 @@ pub struct MySqlElection {
|
||||
leader_watcher: broadcast::Sender<LeaderChangeMessage>,
|
||||
store_key_prefix: String,
|
||||
candidate_lease_ttl_secs: u64,
|
||||
meta_lease_ttl_secs: u64,
|
||||
sql_set: ElectionSqlSet,
|
||||
}
|
||||
|
||||
@@ -324,9 +333,10 @@ impl MySqlElection {
|
||||
mut client: sqlx::MySqlConnection,
|
||||
store_key_prefix: String,
|
||||
candidate_lease_ttl_secs: u64,
|
||||
meta_lease_ttl_secs: u64,
|
||||
table_name: &str,
|
||||
) -> Result<ElectionRef> {
|
||||
let sql_factory = ElectionSqlFactory::new(table_name);
|
||||
let sql_factory = ElectionSqlFactory::new(table_name, meta_lease_ttl_secs);
|
||||
sqlx::query(&sql_factory.create_table_sql())
|
||||
.execute(&mut client)
|
||||
.await
|
||||
@@ -365,6 +375,7 @@ impl MySqlElection {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix,
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs,
|
||||
sql_set: sql_factory.build(),
|
||||
}))
|
||||
}
|
||||
@@ -452,8 +463,14 @@ impl Election for MySqlElection {
|
||||
}
|
||||
);
|
||||
|
||||
self.update_value_with_lease(&key, &lease.origin, &node_info, &mut executor)
|
||||
.await?;
|
||||
self.update_value_with_lease(
|
||||
&key,
|
||||
&lease.origin,
|
||||
&node_info,
|
||||
self.candidate_lease_ttl_secs,
|
||||
&mut executor,
|
||||
)
|
||||
.await?;
|
||||
std::mem::drop(executor);
|
||||
}
|
||||
}
|
||||
@@ -480,10 +497,11 @@ impl Election for MySqlElection {
|
||||
|
||||
async fn campaign(&self) -> Result<()> {
|
||||
let mut keep_alive_interval =
|
||||
tokio::time::interval(Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS));
|
||||
tokio::time::interval(Duration::from_secs(self.meta_lease_ttl_secs / 2));
|
||||
keep_alive_interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
loop {
|
||||
let _ = self.do_campaign(&mut keep_alive_interval).await;
|
||||
let _ = self.do_campaign().await;
|
||||
keep_alive_interval.tick().await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -514,7 +532,7 @@ impl Election for MySqlElection {
|
||||
}
|
||||
|
||||
impl MySqlElection {
|
||||
/// Returns value, expire time and current time. If `with_origin` is true, the origin string is also returned.
|
||||
/// Returns value, expire time and current time.
|
||||
async fn get_value_with_lease(
|
||||
&self,
|
||||
key: &str,
|
||||
@@ -587,6 +605,7 @@ impl MySqlElection {
|
||||
key: &str,
|
||||
prev: &str,
|
||||
updated: &str,
|
||||
lease_ttl: u64,
|
||||
executor: &mut Executor<'_>,
|
||||
) -> Result<()> {
|
||||
let key = key.as_bytes();
|
||||
@@ -595,7 +614,7 @@ impl MySqlElection {
|
||||
|
||||
let query = sqlx::query(&self.sql_set.update_value_with_lease)
|
||||
.bind(updated)
|
||||
.bind(self.candidate_lease_ttl_secs as f64)
|
||||
.bind(lease_ttl as f64)
|
||||
.bind(key)
|
||||
.bind(prev);
|
||||
let res = executor
|
||||
@@ -627,9 +646,9 @@ impl MySqlElection {
|
||||
.bind(value)
|
||||
.bind(lease_ttl_secs);
|
||||
let res = executor
|
||||
.query(query, &self.sql_set.put_value_with_lease)
|
||||
.execute(query, &self.sql_set.put_value_with_lease)
|
||||
.await?;
|
||||
Ok(res.is_empty())
|
||||
Ok(res == 1)
|
||||
}
|
||||
|
||||
/// Returns `true` if the deletion is successful.
|
||||
@@ -644,62 +663,82 @@ impl MySqlElection {
|
||||
|
||||
/// Attempts to acquire leadership by executing a campaign. This function continuously checks
|
||||
/// if the current lease is still valid.
|
||||
async fn do_campaign(&self, interval: &mut Interval) -> Result<()> {
|
||||
async fn do_campaign(&self) -> Result<()> {
|
||||
// Need to restrict the scope of the client to avoid ambiguous overloads.
|
||||
use sqlx::Acquire;
|
||||
|
||||
loop {
|
||||
let client = self.client.lock().await;
|
||||
let executor = Executor::Default(client);
|
||||
let mut lease = Lease::default();
|
||||
match (
|
||||
self.lease_check(executor, &mut lease).await,
|
||||
self.is_leader(),
|
||||
) {
|
||||
// If the leader lease is valid and I'm the leader, renew the lease.
|
||||
(Ok(_), true) => {
|
||||
let mut client = self.client.lock().await;
|
||||
let txn = client
|
||||
.begin()
|
||||
.await
|
||||
.context(MySqlExecutionSnafu { sql: "BEGIN" })?;
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let query = sqlx::query(&self.sql_set.campaign);
|
||||
executor.query(query, &self.sql_set.campaign).await?;
|
||||
self.renew_lease(executor, lease).await?;
|
||||
}
|
||||
// If the leader lease expires and I'm the leader, notify the leader watcher and step down.
|
||||
// Another instance should be elected as the leader in this case.
|
||||
(Err(_), true) => {
|
||||
warn!("Leader lease expired, re-initiate the campaign");
|
||||
self.step_down_without_lock().await?;
|
||||
}
|
||||
// If the leader lease expires and I'm not the leader, elect myself.
|
||||
(Err(_), false) => {
|
||||
warn!("Leader lease expired, re-initiate the campaign");
|
||||
let mut client = self.client.lock().await;
|
||||
let txn = client
|
||||
.begin()
|
||||
.await
|
||||
.context(MySqlExecutionSnafu { sql: "BEGIN" })?;
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let query = sqlx::query(&self.sql_set.campaign);
|
||||
executor.query(query, &self.sql_set.campaign).await?;
|
||||
self.elected(&mut executor).await?;
|
||||
executor.commit().await?;
|
||||
}
|
||||
// If the leader lease is valid and I'm not the leader, do nothing.
|
||||
(Ok(_), false) => {}
|
||||
let client = self.client.lock().await;
|
||||
let executor = Executor::Default(client);
|
||||
let mut lease = Lease::default();
|
||||
match (
|
||||
self.lease_check(executor, &mut lease).await,
|
||||
self.is_leader(),
|
||||
self.leader_value == lease.leader_value,
|
||||
) {
|
||||
// If the leader lease is valid and I'm the leader, renew the lease.
|
||||
(Ok(_), true, true) => {
|
||||
let mut client = self.client.lock().await;
|
||||
let txn = client
|
||||
.begin()
|
||||
.await
|
||||
.context(MySqlExecutionSnafu { sql: "BEGIN" })?;
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let query = sqlx::query(&self.sql_set.campaign);
|
||||
executor.query(query, &self.sql_set.campaign).await?;
|
||||
self.renew_lease(executor, lease).await?;
|
||||
}
|
||||
interval.tick().await;
|
||||
// If the leader lease expires and I'm the leader, notify the leader watcher and step down.
|
||||
// Another instance should be elected as the leader in this case.
|
||||
(Err(_), true, _) | (Ok(_), true, false) => {
|
||||
warn!("Leader lease expired, step down...");
|
||||
self.step_down_without_lock().await?;
|
||||
}
|
||||
// If the leader lease expires and I'm not the leader, elect myself.
|
||||
(Err(_), false, _) => {
|
||||
warn!("Leader lease expired, elected.");
|
||||
let mut client = self.client.lock().await;
|
||||
let txn = client
|
||||
.begin()
|
||||
.await
|
||||
.context(MySqlExecutionSnafu { sql: "BEGIN" })?;
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let query = sqlx::query(&self.sql_set.campaign);
|
||||
executor.query(query, &self.sql_set.campaign).await?;
|
||||
self.elected(&mut executor).await?;
|
||||
executor.commit().await?;
|
||||
}
|
||||
// If the leader lease is valid and I'm the leader, but I don't think I'm the leader.
|
||||
// Just re-elect myself.
|
||||
(Ok(_), false, true) => {
|
||||
warn!("I should be the leader, but I don't think so. Something went wrong.");
|
||||
let mut client = self.client.lock().await;
|
||||
let txn = client
|
||||
.begin()
|
||||
.await
|
||||
.context(MySqlExecutionSnafu { sql: "BEGIN" })?;
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let query = sqlx::query(&self.sql_set.campaign);
|
||||
executor.query(query, &self.sql_set.campaign).await?;
|
||||
self.elected(&mut executor).await?;
|
||||
executor.commit().await?;
|
||||
}
|
||||
// If the leader lease is valid and I'm not the leader, do nothing.
|
||||
(Ok(_), false, false) => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Renew the lease
|
||||
async fn renew_lease(&self, mut executor: Executor<'_>, lease: Lease) -> Result<()> {
|
||||
let key = self.election_key();
|
||||
self.update_value_with_lease(&key, &lease.origin, &self.leader_value, &mut executor)
|
||||
.await?;
|
||||
self.update_value_with_lease(
|
||||
&key,
|
||||
&lease.origin,
|
||||
&self.leader_value,
|
||||
self.meta_lease_ttl_secs,
|
||||
&mut executor,
|
||||
)
|
||||
.await?;
|
||||
executor.commit().await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -758,7 +797,7 @@ impl MySqlElection {
|
||||
..Default::default()
|
||||
};
|
||||
self.delete_value(&key, executor).await?;
|
||||
self.put_value_with_lease(&key, &self.leader_value, META_LEASE_SECS, executor)
|
||||
self.put_value_with_lease(&key, &self.leader_value, self.meta_lease_ttl_secs, executor)
|
||||
.await?;
|
||||
|
||||
if self
|
||||
@@ -784,7 +823,7 @@ impl MySqlElection {
|
||||
match query.fetch_one(client).await {
|
||||
Ok(row) => {
|
||||
let version: String = row.try_get(0).unwrap();
|
||||
if !version.starts_with("8.0") || !version.starts_with("5.7") {
|
||||
if !version.starts_with("8.0") && !version.starts_with("5.7") {
|
||||
warn!(
|
||||
"Unsupported MySQL version: {}, expected: [5.7, 8.0]",
|
||||
version
|
||||
@@ -798,3 +837,589 @@ impl MySqlElection {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::env;
|
||||
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use sqlx::Connection;
|
||||
|
||||
use super::*;
|
||||
use crate::error::MySqlExecutionSnafu;
|
||||
|
||||
async fn create_mysql_client(table_name: Option<&str>) -> Result<Mutex<MySqlConnection>> {
|
||||
init_default_ut_logging();
|
||||
let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default();
|
||||
if endpoint.is_empty() {
|
||||
return UnexpectedSnafu {
|
||||
violated: "MySQL endpoint is empty".to_string(),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
let mut client = MySqlConnection::connect(&endpoint).await.unwrap();
|
||||
if let Some(table_name) = table_name {
|
||||
let create_table_sql = format!(
|
||||
"CREATE TABLE IF NOT EXISTS {}(k VARCHAR(255) PRIMARY KEY, v BLOB);",
|
||||
table_name
|
||||
);
|
||||
sqlx::query(&create_table_sql)
|
||||
.execute(&mut client)
|
||||
.await
|
||||
.context(MySqlExecutionSnafu {
|
||||
sql: create_table_sql,
|
||||
})?;
|
||||
}
|
||||
Ok(Mutex::new(client))
|
||||
}
|
||||
|
||||
async fn drop_table(client: &Mutex<MySqlConnection>, table_name: &str) {
|
||||
let mut client = client.lock().await;
|
||||
let sql = format!("DROP TABLE IF EXISTS {};", table_name);
|
||||
sqlx::query(&sql)
|
||||
.execute(&mut *client)
|
||||
.await
|
||||
.context(MySqlExecutionSnafu { sql })
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_mysql_crud() {
|
||||
let key = "test_key".to_string();
|
||||
let value = "test_value".to_string();
|
||||
|
||||
let uuid = uuid::Uuid::new_v4().to_string();
|
||||
let table_name = "test_mysql_crud_greptime_metakv";
|
||||
let client = create_mysql_client(Some(table_name)).await.unwrap();
|
||||
|
||||
{
|
||||
let mut a = client.lock().await;
|
||||
let txn = a.begin().await.unwrap();
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let raw_query = format!("SELECT * FROM {} FOR UPDATE;", table_name);
|
||||
let query = sqlx::query(&raw_query);
|
||||
let _ = executor.query(query, &raw_query).await.unwrap();
|
||||
}
|
||||
|
||||
let (tx, _) = broadcast::channel(100);
|
||||
let mysql_election = MySqlElection {
|
||||
leader_value: "test_leader".to_string(),
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs: 10,
|
||||
meta_lease_ttl_secs: 1,
|
||||
sql_set: ElectionSqlFactory::new(table_name, 1).build(),
|
||||
};
|
||||
let client = mysql_election.client.lock().await;
|
||||
let mut executor = Executor::Default(client);
|
||||
let res = mysql_election
|
||||
.put_value_with_lease(&key, &value, 10, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res);
|
||||
|
||||
let lease = mysql_election
|
||||
.get_value_with_lease(&key, &mut executor)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(lease.leader_value, value);
|
||||
|
||||
mysql_election
|
||||
.update_value_with_lease(&key, &lease.origin, &value, 10, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let res = mysql_election
|
||||
.delete_value(&key, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res);
|
||||
|
||||
let res = mysql_election
|
||||
.get_value_with_lease(&key, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res.is_none());
|
||||
|
||||
for i in 0..10 {
|
||||
let key = format!("test_key_{}", i);
|
||||
let value = format!("test_value_{}", i);
|
||||
mysql_election
|
||||
.put_value_with_lease(&key, &value, 10, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let key_prefix = "test_key".to_string();
|
||||
let (res, _) = mysql_election
|
||||
.get_value_with_lease_by_prefix(&key_prefix, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(res.len(), 10);
|
||||
|
||||
for i in 0..10 {
|
||||
let key = format!("test_key_{}", i);
|
||||
let res = mysql_election
|
||||
.delete_value(&key, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res);
|
||||
}
|
||||
|
||||
let (res, current) = mysql_election
|
||||
.get_value_with_lease_by_prefix(&key_prefix, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res.is_empty());
|
||||
assert!(current == Timestamp::default());
|
||||
|
||||
// Should drop manually.
|
||||
std::mem::drop(executor);
|
||||
drop_table(&mysql_election.client, table_name).await;
|
||||
}
|
||||
|
||||
async fn candidate(
|
||||
leader_value: String,
|
||||
candidate_lease_ttl_secs: u64,
|
||||
store_key_prefix: String,
|
||||
table_name: String,
|
||||
) {
|
||||
let client = create_mysql_client(None).await.unwrap();
|
||||
|
||||
let (tx, _) = broadcast::channel(100);
|
||||
let mysql_election = MySqlElection {
|
||||
leader_value,
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix,
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs: 1,
|
||||
sql_set: ElectionSqlFactory::new(&table_name, 1).build(),
|
||||
};
|
||||
|
||||
let node_info = MetasrvNodeInfo {
|
||||
addr: "test_addr".to_string(),
|
||||
version: "test_version".to_string(),
|
||||
git_commit: "test_git_commit".to_string(),
|
||||
start_time_ms: 0,
|
||||
};
|
||||
mysql_election.register_candidate(&node_info).await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_candidate_registration() {
|
||||
let leader_value_prefix = "test_leader".to_string();
|
||||
let candidate_lease_ttl_secs = 2;
|
||||
let uuid = uuid::Uuid::new_v4().to_string();
|
||||
let table_name = "test_candidate_registration_greptime_metakv";
|
||||
let mut handles = vec![];
|
||||
let client = create_mysql_client(Some(table_name)).await.unwrap();
|
||||
|
||||
for i in 0..10 {
|
||||
let leader_value = format!("{}{}", leader_value_prefix, i);
|
||||
let handle = tokio::spawn(candidate(
|
||||
leader_value,
|
||||
candidate_lease_ttl_secs,
|
||||
uuid.clone(),
|
||||
table_name.to_string(),
|
||||
));
|
||||
handles.push(handle);
|
||||
}
|
||||
// Wait for candidates to registrate themselves and renew their leases at least once.
|
||||
tokio::time::sleep(Duration::from_secs(candidate_lease_ttl_secs / 2 + 1)).await;
|
||||
|
||||
let (tx, _) = broadcast::channel(100);
|
||||
let leader_value = "test_leader".to_string();
|
||||
let mysql_election = MySqlElection {
|
||||
leader_value,
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid.clone(),
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs: 1,
|
||||
sql_set: ElectionSqlFactory::new(table_name, 1).build(),
|
||||
};
|
||||
|
||||
let candidates = mysql_election.all_candidates().await.unwrap();
|
||||
assert_eq!(candidates.len(), 10);
|
||||
|
||||
for handle in handles {
|
||||
handle.abort();
|
||||
}
|
||||
|
||||
// Wait for the candidate leases to expire.
|
||||
tokio::time::sleep(Duration::from_secs(candidate_lease_ttl_secs + 1)).await;
|
||||
let candidates = mysql_election.all_candidates().await.unwrap();
|
||||
assert!(candidates.is_empty());
|
||||
|
||||
// Garbage collection
|
||||
let client = mysql_election.client.lock().await;
|
||||
let mut executor = Executor::Default(client);
|
||||
for i in 0..10 {
|
||||
let key = format!("{}{}{}{}", uuid, CANDIDATES_ROOT, leader_value_prefix, i);
|
||||
let res = mysql_election
|
||||
.delete_value(&key, &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(res);
|
||||
}
|
||||
|
||||
// Should drop manually.
|
||||
std::mem::drop(executor);
|
||||
drop_table(&mysql_election.client, table_name).await;
|
||||
}
|
||||
|
||||
async fn elected(election: &MySqlElection, table_name: &str) {
|
||||
let mut client = election.client.lock().await;
|
||||
let txn = client.begin().await.unwrap();
|
||||
let mut executor = Executor::Txn(txn);
|
||||
let raw_query = format!("SELECT * FROM {} FOR UPDATE;", table_name);
|
||||
let query = sqlx::query(&raw_query);
|
||||
let _ = executor.query(query, &raw_query).await.unwrap();
|
||||
election.elected(&mut executor).await.unwrap();
|
||||
executor.commit().await.unwrap();
|
||||
}
|
||||
|
||||
async fn get_lease(election: &MySqlElection) -> Option<Lease> {
|
||||
let client = election.client.lock().await;
|
||||
let mut executor = Executor::Default(client);
|
||||
election
|
||||
.get_value_with_lease(&election.election_key(), &mut executor)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_elected_and_step_down() {
|
||||
let leader_value = "test_leader".to_string();
|
||||
let candidate_lease_ttl_secs = 1;
|
||||
let uuid = uuid::Uuid::new_v4().to_string();
|
||||
let table_name = "test_elected_and_step_down_greptime_metakv";
|
||||
let client = create_mysql_client(Some(table_name)).await.unwrap();
|
||||
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let leader_mysql_election = MySqlElection {
|
||||
leader_value: leader_value.clone(),
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs: 1,
|
||||
sql_set: ElectionSqlFactory::new(table_name, 1).build(),
|
||||
};
|
||||
|
||||
elected(&leader_mysql_election, table_name).await;
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(lease.expire_time > lease.current);
|
||||
assert!(leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
leader_mysql_election
|
||||
.step_down_without_lock()
|
||||
.await
|
||||
.unwrap();
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(!leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
elected(&leader_mysql_election, table_name).await;
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(lease.expire_time > lease.current);
|
||||
assert!(leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
drop_table(&leader_mysql_election.client, table_name).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_campaign() {
|
||||
let leader_value = "test_leader".to_string();
|
||||
let uuid = uuid::Uuid::new_v4().to_string();
|
||||
let table_name = "test_leader_action_greptime_metakv";
|
||||
let candidate_lease_ttl_secs = 5;
|
||||
let meta_lease_ttl_secs = 1;
|
||||
let client = create_mysql_client(Some(table_name)).await.unwrap();
|
||||
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let leader_mysql_election = MySqlElection {
|
||||
leader_value: leader_value.clone(),
|
||||
client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(table_name, 1).build(),
|
||||
};
|
||||
|
||||
// Step 1: No leader exists, campaign and elected.
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(lease.expire_time > lease.current);
|
||||
assert!(leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
// Step 2: As a leader, renew the lease.
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let new_lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
// The lease should be renewed.
|
||||
assert!(new_lease.expire_time > lease.expire_time);
|
||||
assert!(new_lease.expire_time > new_lease.current);
|
||||
assert!(leader_mysql_election.is_leader());
|
||||
|
||||
// Step 3: Something wrong, the leader lease expired.
|
||||
tokio::time::sleep(Duration::from_secs(meta_lease_ttl_secs + 1)).await;
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(lease.expire_time <= lease.current);
|
||||
assert!(!leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
// Step 4: Re-elect itself.
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(lease.expire_time > lease.current);
|
||||
assert!(leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
// Step 5: Something wrong, the leader key is deleted by other followers.
|
||||
{
|
||||
let client = leader_mysql_election.client.lock().await;
|
||||
let mut executor = Executor::Default(client);
|
||||
leader_mysql_election
|
||||
.delete_value(&leader_mysql_election.election_key(), &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let res = get_lease(&leader_mysql_election).await;
|
||||
assert!(res.is_none());
|
||||
assert!(!leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
// Step 6: Re-elect itself.
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
assert_eq!(lease.leader_value, leader_value);
|
||||
assert!(lease.expire_time > lease.current);
|
||||
assert!(leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
// Step 7: Something wrong, the leader key changed by others.
|
||||
let another_leader_key = "another_leader";
|
||||
{
|
||||
let client = leader_mysql_election.client.lock().await;
|
||||
let mut executor = Executor::Default(client);
|
||||
leader_mysql_election
|
||||
.delete_value(&leader_mysql_election.election_key(), &mut executor)
|
||||
.await
|
||||
.unwrap();
|
||||
leader_mysql_election
|
||||
.put_value_with_lease(
|
||||
&leader_mysql_election.election_key(),
|
||||
another_leader_key,
|
||||
10,
|
||||
&mut executor,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
let lease = get_lease(&leader_mysql_election).await.unwrap();
|
||||
// Different from pg, mysql will not delete the key, just step down.
|
||||
assert_eq!(lease.leader_value, another_leader_key);
|
||||
assert!(lease.expire_time > lease.current);
|
||||
assert!(!leader_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::StepDown(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), leader_value);
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
leader_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::StepDown"),
|
||||
}
|
||||
|
||||
drop_table(&leader_mysql_election.client, table_name).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_follower_action() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let candidate_lease_ttl_secs = 5;
|
||||
let meta_lease_ttl_secs = 1;
|
||||
let uuid = uuid::Uuid::new_v4().to_string();
|
||||
let table_name = "test_follower_action_greptime_metakv";
|
||||
|
||||
let follower_client = create_mysql_client(Some(table_name)).await.unwrap();
|
||||
let (tx, mut rx) = broadcast::channel(100);
|
||||
let follower_mysql_election = MySqlElection {
|
||||
leader_value: "test_follower".to_string(),
|
||||
client: follower_client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid.clone(),
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(table_name, 1).build(),
|
||||
};
|
||||
|
||||
let leader_client = create_mysql_client(Some(table_name)).await.unwrap();
|
||||
let (tx, _) = broadcast::channel(100);
|
||||
let leader_mysql_election = MySqlElection {
|
||||
leader_value: "test_leader".to_string(),
|
||||
client: leader_client,
|
||||
is_leader: AtomicBool::new(false),
|
||||
leader_infancy: AtomicBool::new(true),
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(table_name, 1).build(),
|
||||
};
|
||||
|
||||
leader_mysql_election.do_campaign().await.unwrap();
|
||||
|
||||
// Step 1: As a follower, the leader exists and the lease is not expired. Do nothing.
|
||||
follower_mysql_election.do_campaign().await.unwrap();
|
||||
|
||||
// Step 2: As a follower, the leader exists but the lease expired. Re-elect itself.
|
||||
tokio::time::sleep(Duration::from_secs(meta_lease_ttl_secs + 1)).await;
|
||||
follower_mysql_election.do_campaign().await.unwrap();
|
||||
assert!(follower_mysql_election.is_leader());
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(LeaderChangeMessage::Elected(key)) => {
|
||||
assert_eq!(String::from_utf8_lossy(key.name()), "test_follower");
|
||||
assert_eq!(
|
||||
String::from_utf8_lossy(key.key()),
|
||||
follower_mysql_election.election_key()
|
||||
);
|
||||
assert_eq!(key.lease_id(), i64::default());
|
||||
assert_eq!(key.revision(), i64::default());
|
||||
}
|
||||
_ => panic!("Expected LeaderChangeMessage::Elected"),
|
||||
}
|
||||
|
||||
drop_table(&follower_mysql_election.client, table_name).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,6 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
|
||||
use common_telemetry::{error, warn};
|
||||
use common_time::Timestamp;
|
||||
use itertools::Itertools;
|
||||
@@ -41,6 +40,7 @@ const LEASE_SEP: &str = r#"||__metadata_lease_sep||"#;
|
||||
struct ElectionSqlFactory<'a> {
|
||||
lock_id: u64,
|
||||
table_name: &'a str,
|
||||
meta_lease_ttl_secs: u64,
|
||||
}
|
||||
|
||||
struct ElectionSqlSet {
|
||||
@@ -90,10 +90,11 @@ struct ElectionSqlSet {
|
||||
}
|
||||
|
||||
impl<'a> ElectionSqlFactory<'a> {
|
||||
fn new(lock_id: u64, table_name: &'a str) -> Self {
|
||||
fn new(lock_id: u64, table_name: &'a str, meta_lease_ttl_secs: u64) -> Self {
|
||||
Self {
|
||||
lock_id,
|
||||
table_name,
|
||||
meta_lease_ttl_secs,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,7 +113,10 @@ impl<'a> ElectionSqlFactory<'a> {
|
||||
// Currently the session timeout is longer than the leader lease time.
|
||||
// So the leader will renew the lease twice before the session timeout if everything goes well.
|
||||
fn set_idle_session_timeout_sql(&self) -> String {
|
||||
format!("SET idle_session_timeout = '{}s';", META_LEASE_SECS + 1)
|
||||
format!(
|
||||
"SET idle_session_timeout = '{}s';",
|
||||
self.meta_lease_ttl_secs + 1
|
||||
)
|
||||
}
|
||||
|
||||
fn campaign_sql(&self) -> String {
|
||||
@@ -226,6 +230,7 @@ pub struct PgElection {
|
||||
leader_watcher: broadcast::Sender<LeaderChangeMessage>,
|
||||
store_key_prefix: String,
|
||||
candidate_lease_ttl_secs: u64,
|
||||
meta_lease_ttl_secs: u64,
|
||||
sql_set: ElectionSqlSet,
|
||||
}
|
||||
|
||||
@@ -235,10 +240,11 @@ impl PgElection {
|
||||
client: Client,
|
||||
store_key_prefix: String,
|
||||
candidate_lease_ttl_secs: u64,
|
||||
meta_lease_ttl_secs: u64,
|
||||
table_name: &str,
|
||||
lock_id: u64,
|
||||
) -> Result<ElectionRef> {
|
||||
let sql_factory = ElectionSqlFactory::new(lock_id, table_name);
|
||||
let sql_factory = ElectionSqlFactory::new(lock_id, table_name, meta_lease_ttl_secs);
|
||||
// Set idle session timeout to IDLE_SESSION_TIMEOUT to avoid dead advisory lock.
|
||||
client
|
||||
.execute(&sql_factory.set_idle_session_timeout_sql(), &[])
|
||||
@@ -254,6 +260,7 @@ impl PgElection {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix,
|
||||
candidate_lease_ttl_secs,
|
||||
meta_lease_ttl_secs,
|
||||
sql_set: sql_factory.build(),
|
||||
}))
|
||||
}
|
||||
@@ -326,7 +333,7 @@ impl Election for PgElection {
|
||||
|
||||
// Safety: origin is Some since we are using `get_value_with_lease` with `true`.
|
||||
let origin = origin.unwrap();
|
||||
self.update_value_with_lease(&key, &origin, &node_info)
|
||||
self.update_value_with_lease(&key, &origin, &node_info, self.candidate_lease_ttl_secs)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
@@ -361,7 +368,7 @@ impl Election for PgElection {
|
||||
/// to perform actions as a follower.
|
||||
async fn campaign(&self) -> Result<()> {
|
||||
let mut keep_alive_interval =
|
||||
tokio::time::interval(Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS));
|
||||
tokio::time::interval(Duration::from_secs(self.meta_lease_ttl_secs / 2));
|
||||
keep_alive_interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
|
||||
loop {
|
||||
@@ -489,19 +496,20 @@ impl PgElection {
|
||||
Ok((values_with_leases, current))
|
||||
}
|
||||
|
||||
async fn update_value_with_lease(&self, key: &str, prev: &str, updated: &str) -> Result<()> {
|
||||
async fn update_value_with_lease(
|
||||
&self,
|
||||
key: &str,
|
||||
prev: &str,
|
||||
updated: &str,
|
||||
lease_ttl: u64,
|
||||
) -> Result<()> {
|
||||
let key = key.as_bytes();
|
||||
let prev = prev.as_bytes();
|
||||
let res = self
|
||||
.client
|
||||
.execute(
|
||||
&self.sql_set.update_value_with_lease,
|
||||
&[
|
||||
&key,
|
||||
&prev,
|
||||
&updated,
|
||||
&(self.candidate_lease_ttl_secs as f64),
|
||||
],
|
||||
&[&key, &prev, &updated, &(lease_ttl as f64)],
|
||||
)
|
||||
.await
|
||||
.context(PostgresExecutionSnafu)?;
|
||||
@@ -578,8 +586,13 @@ impl PgElection {
|
||||
(true, true) => {
|
||||
// Safety: prev is Some since we are using `get_value_with_lease` with `true`.
|
||||
let prev = prev.unwrap();
|
||||
self.update_value_with_lease(&key, &prev, &self.leader_value)
|
||||
.await?;
|
||||
self.update_value_with_lease(
|
||||
&key,
|
||||
&prev,
|
||||
&self.leader_value,
|
||||
self.meta_lease_ttl_secs,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
// Case 1.2
|
||||
(true, false) => {
|
||||
@@ -698,7 +711,7 @@ impl PgElection {
|
||||
..Default::default()
|
||||
};
|
||||
self.delete_value(&key).await?;
|
||||
self.put_value_with_lease(&key, &self.leader_value, META_LEASE_SECS)
|
||||
self.put_value_with_lease(&key, &self.leader_value, self.meta_lease_ttl_secs)
|
||||
.await?;
|
||||
|
||||
if self
|
||||
@@ -775,7 +788,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs: 10,
|
||||
sql_set: ElectionSqlFactory::new(28319, table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28319, table_name, 2).build(),
|
||||
};
|
||||
|
||||
let res = pg_election
|
||||
@@ -793,7 +807,7 @@ mod tests {
|
||||
|
||||
let prev = prev.unwrap();
|
||||
pg_election
|
||||
.update_value_with_lease(&key, &prev, &value)
|
||||
.update_value_with_lease(&key, &prev, &value, pg_election.meta_lease_ttl_secs)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -852,7 +866,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix,
|
||||
candidate_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(28319, &table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28319, &table_name, 2).build(),
|
||||
};
|
||||
|
||||
let node_info = MetasrvNodeInfo {
|
||||
@@ -896,7 +911,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid.clone(),
|
||||
candidate_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(28319, table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28319, table_name, 2).build(),
|
||||
};
|
||||
|
||||
let candidates = pg_election.all_candidates().await.unwrap();
|
||||
@@ -938,7 +954,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(28320, table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28320, table_name, 2).build(),
|
||||
};
|
||||
|
||||
leader_pg_election.elected().await.unwrap();
|
||||
@@ -1050,7 +1067,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(28321, table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28321, table_name, 2).build(),
|
||||
};
|
||||
|
||||
// Step 1: No leader exists, campaign and elected.
|
||||
@@ -1103,7 +1121,7 @@ mod tests {
|
||||
assert!(leader_pg_election.is_leader());
|
||||
|
||||
// Step 3: Something wrong, the leader lease expired.
|
||||
tokio::time::sleep(Duration::from_secs(META_LEASE_SECS)).await;
|
||||
tokio::time::sleep(Duration::from_secs(2)).await;
|
||||
|
||||
let res = leader_pg_election
|
||||
.client
|
||||
@@ -1284,7 +1302,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid.clone(),
|
||||
candidate_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(28322, table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28322, table_name, 2).build(),
|
||||
};
|
||||
|
||||
let leader_client = create_postgres_client(Some(table_name)).await.unwrap();
|
||||
@@ -1297,7 +1316,8 @@ mod tests {
|
||||
leader_watcher: tx,
|
||||
store_key_prefix: uuid,
|
||||
candidate_lease_ttl_secs,
|
||||
sql_set: ElectionSqlFactory::new(28322, table_name).build(),
|
||||
meta_lease_ttl_secs: 2,
|
||||
sql_set: ElectionSqlFactory::new(28322, table_name, 2).build(),
|
||||
};
|
||||
|
||||
leader_pg_election
|
||||
@@ -1311,7 +1331,7 @@ mod tests {
|
||||
follower_pg_election.follower_action().await.unwrap();
|
||||
|
||||
// Step 2: As a follower, the leader exists but the lease expired.
|
||||
tokio::time::sleep(Duration::from_secs(META_LEASE_SECS)).await;
|
||||
tokio::time::sleep(Duration::from_secs(2)).await;
|
||||
assert!(follower_pg_election.follower_action().await.is_err());
|
||||
|
||||
// Step 3: As a follower, the leader does not exist.
|
||||
|
||||
@@ -36,7 +36,7 @@ pub enum Error {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
#[snafu(source)]
|
||||
error: rand::distributions::WeightedError,
|
||||
error: rand::distr::weighted::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Exceeded deadline, operation: {}", operation))]
|
||||
@@ -787,41 +787,6 @@ pub enum Error {
|
||||
location: Location,
|
||||
source: common_meta::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Logical table cannot add follower: {table_id}"))]
|
||||
LogicalTableCannotAddFollower {
|
||||
table_id: TableId,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"A region follower cannot be placed on the same node as the leader: {region_id}, {peer_id}"
|
||||
))]
|
||||
RegionFollowerLeaderConflict {
|
||||
region_id: RegionId,
|
||||
peer_id: u64,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Multiple region followers cannot be placed on the same node: {region_id}, {peer_id}"
|
||||
))]
|
||||
MultipleRegionFollowersOnSameNode {
|
||||
region_id: RegionId,
|
||||
peer_id: u64,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Region follower not exists: {region_id}, {peer_id}"))]
|
||||
RegionFollowerNotExists {
|
||||
region_id: RegionId,
|
||||
peer_id: u64,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
}
|
||||
|
||||
impl Error {
|
||||
@@ -891,10 +856,6 @@ impl ErrorExt for Error {
|
||||
| Error::ProcedureNotFound { .. }
|
||||
| Error::TooManyPartitions { .. }
|
||||
| Error::TomlFormat { .. }
|
||||
| Error::LogicalTableCannotAddFollower { .. }
|
||||
| Error::RegionFollowerLeaderConflict { .. }
|
||||
| Error::MultipleRegionFollowersOnSameNode { .. }
|
||||
| Error::RegionFollowerNotExists { .. }
|
||||
| Error::HandlerNotFound { .. } => StatusCode::InvalidArguments,
|
||||
Error::LeaseKeyFromUtf8 { .. }
|
||||
| Error::LeaseValueFromUtf8 { .. }
|
||||
|
||||
@@ -55,8 +55,6 @@ use crate::lease::MetaPeerLookupService;
|
||||
use crate::metasrv::{
|
||||
ElectionRef, Metasrv, MetasrvInfo, MetasrvOptions, SelectorContext, SelectorRef, TABLE_ID_SEQ,
|
||||
};
|
||||
use crate::procedure::region_follower::manager::RegionFollowerManager;
|
||||
use crate::procedure::region_follower::Context as ArfContext;
|
||||
use crate::procedure::region_migration::manager::RegionMigrationManager;
|
||||
use crate::procedure::region_migration::DefaultContextFactory;
|
||||
use crate::region::supervisor::{
|
||||
@@ -345,19 +343,6 @@ impl MetasrvBuilder {
|
||||
.context(error::InitDdlManagerSnafu)?,
|
||||
);
|
||||
|
||||
// alter region follower manager
|
||||
let region_follower_manager = Arc::new(RegionFollowerManager::new(
|
||||
procedure_manager.clone(),
|
||||
ArfContext {
|
||||
table_metadata_manager: table_metadata_manager.clone(),
|
||||
mailbox: mailbox.clone(),
|
||||
server_addr: options.server_addr.clone(),
|
||||
cache_invalidator: cache_invalidator.clone(),
|
||||
meta_peer_client: meta_peer_client.clone(),
|
||||
},
|
||||
));
|
||||
region_follower_manager.try_start()?;
|
||||
|
||||
let handler_group_builder = match handler_group_builder {
|
||||
Some(handler_group_builder) => handler_group_builder,
|
||||
None => {
|
||||
|
||||
@@ -18,7 +18,6 @@ use common_meta::leadership_notifier::LeadershipChangeListener;
|
||||
use common_procedure::ProcedureManagerRef;
|
||||
use snafu::ResultExt;
|
||||
|
||||
pub mod region_follower;
|
||||
pub mod region_migration;
|
||||
#[cfg(test)]
|
||||
mod test_util;
|
||||
|
||||
@@ -1,229 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub mod manager;
|
||||
|
||||
pub mod add_region_follower;
|
||||
mod create;
|
||||
mod remove;
|
||||
pub mod remove_region_follower;
|
||||
#[cfg(test)]
|
||||
mod test_util;
|
||||
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::cache_invalidator::CacheInvalidatorRef;
|
||||
use common_meta::distributed_time_constants;
|
||||
use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue, RegionInfo};
|
||||
use common_meta::key::table_route::{PhysicalTableRouteValue, TableRouteValue};
|
||||
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
|
||||
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock};
|
||||
use common_meta::peer::Peer;
|
||||
use common_procedure::StringKey;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use store_api::storage::RegionId;
|
||||
use strum::AsRefStr;
|
||||
|
||||
use crate::cluster::MetaPeerClientRef;
|
||||
use crate::error::{self, Result};
|
||||
use crate::lease::lookup_datanode_peer;
|
||||
use crate::service::mailbox::MailboxRef;
|
||||
|
||||
#[derive(Clone)]
/// The context of add/remove region follower procedure.
///
/// Bundles the shared metasrv services a follower procedure needs; a clone
/// of this context is handed to each procedure instance.
pub struct Context {
    /// The table metadata manager.
    pub table_metadata_manager: TableMetadataManagerRef,
    /// The mailbox used to send instructions to datanodes.
    pub mailbox: MailboxRef,
    /// The metasrv's address, used as the "from" field of mailbox messages.
    pub server_addr: String,
    /// The cache invalidator, used to broadcast table-route cache
    /// invalidation after metadata changes.
    pub cache_invalidator: CacheInvalidatorRef,
    /// The meta peer client, used to look up live datanode peers.
    pub meta_peer_client: MetaPeerClientRef,
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
// NOTE: this struct is persisted as JSON by the procedure framework and the
// exact serialized layout is pinned by `tests::test_data_serialization`;
// keep field names stable so in-flight procedures survive a restart.
pub struct AlterRegionFollowerData {
    /// The catalog name.
    pub(crate) catalog: String,
    /// The schema name.
    pub(crate) schema: String,
    /// The region id.
    pub(crate) region_id: RegionId,
    /// The peer id of the datanode to add region follower.
    pub(crate) peer_id: u64,
    /// The peer of the datanode to add region follower.
    /// `None` until resolved by `load_datanode_peer`.
    pub(crate) peer: Option<Peer>,
    /// The datanode table value of the region.
    /// `None` until loaded by `load_datanode_table_value`.
    pub(crate) datanode_table_value: Option<DatanodeTableValue>,
    /// The physical table route of the region (raw stored value plus its
    /// decoded physical route). `None` until loaded by `load_table_route`.
    pub(crate) table_route: Option<(
        DeserializedValueWithBytes<TableRouteValue>,
        PhysicalTableRouteValue,
    )>,
    /// The current state of the procedure's state machine.
    pub(crate) state: AlterRegionFollowerState,
}
|
||||
|
||||
impl AlterRegionFollowerData {
    /// Returns the procedure-level locks to acquire before execution.
    pub fn lock_key(&self) -> Vec<StringKey> {
        let region_id = self.region_id;
        let lock_key = vec![
            CatalogLock::Read(&self.catalog).into(),
            SchemaLock::read(&self.catalog, &self.schema).into(),
            // The optimistic updating of table route is not working very well,
            // so we need to use the write lock here.
            TableLock::Write(region_id.table_id()).into(),
            RegionLock::Write(region_id).into(),
        ];

        lock_key
    }

    /// Returns the resolved datanode peer, if `load_datanode_peer` has run.
    pub(crate) fn datanode_peer(&self) -> Option<&Peer> {
        self.peer.as_ref()
    }

    /// Returns the decoded physical table route, if `load_table_route` has run.
    pub(crate) fn physical_table_route(&self) -> Option<&PhysicalTableRouteValue> {
        self.table_route
            .as_ref()
            .map(|(_, table_route)| table_route)
    }

    /// Returns the region info of the region (cloned from the loaded
    /// datanode table value), if `load_datanode_table_value` has run.
    pub(crate) fn region_info(&self) -> Option<RegionInfo> {
        self.datanode_table_value
            .as_ref()
            .map(|datanode_table_value| datanode_table_value.region_info.clone())
    }

    /// Loads the datanode peer.
    ///
    /// Fails with `PeerUnavailable` when the datanode identified by
    /// `self.peer_id` has no active lease.
    pub(crate) async fn load_datanode_peer(&self, ctx: &Context) -> Result<Option<Peer>> {
        let peer = lookup_datanode_peer(
            self.peer_id,
            &ctx.meta_peer_client,
            distributed_time_constants::DATANODE_LEASE_SECS,
        )
        .await?
        .context(error::PeerUnavailableSnafu {
            peer_id: self.peer_id,
        })?;

        Ok(Some(peer))
    }

    /// Loads the datanode table value of the region.
    ///
    /// Metadata-store lookup failures are wrapped into a retryable
    /// `RetryLaterWithSource` error; a missing entry is terminal
    /// (`DatanodeTableNotFound`).
    pub(crate) async fn load_datanode_table_value(
        &self,
        ctx: &Context,
    ) -> Result<Option<DatanodeTableValue>> {
        let table_id = self.region_id.table_id();
        let datanode_id = self.peer_id;
        let datanode_table_key = DatanodeTableKey {
            datanode_id,
            table_id,
        };

        let datanode_table_value = ctx
            .table_metadata_manager
            .datanode_table_manager()
            .get(&datanode_table_key)
            .await
            .context(error::TableMetadataManagerSnafu)
            .map_err(BoxedError::new)
            .with_context(|_| error::RetryLaterWithSourceSnafu {
                reason: format!("Failed to get DatanodeTable: ({datanode_id},{table_id})"),
            })?
            .context(error::DatanodeTableNotFoundSnafu {
                table_id,
                datanode_id,
            })?;

        Ok(Some(datanode_table_value))
    }

    /// Loads the table route of the region, returning the raw stored route
    /// value together with its decoded physical table route.
    ///
    /// Metadata-store lookup failures are wrapped into a retryable error;
    /// a missing route is terminal (`TableRouteNotFound`).
    pub(crate) async fn load_table_route(
        &self,
        ctx: &Context,
    ) -> Result<
        Option<(
            DeserializedValueWithBytes<TableRouteValue>,
            PhysicalTableRouteValue,
        )>,
    > {
        let table_id = self.region_id.table_id();
        let raw_table_route = ctx
            .table_metadata_manager
            .table_route_manager()
            .table_route_storage()
            .get_with_raw_bytes(table_id)
            .await
            .context(error::TableMetadataManagerSnafu)
            .map_err(BoxedError::new)
            .with_context(|_| error::RetryLaterWithSourceSnafu {
                reason: format!("Failed to get TableRoute: {table_id}"),
            })?
            .context(error::TableRouteNotFoundSnafu { table_id })?;
        let table_route = raw_table_route.clone().into_inner();

        // Followers can only be managed on physical table routes.
        ensure!(
            table_route.is_physical(),
            error::LogicalTableCannotAddFollowerSnafu { table_id }
        );

        Ok(Some((
            raw_table_route,
            table_route.into_physical_table_route(),
        )))
    }
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, AsRefStr)]
/// The state machine of an add/remove region follower procedure.
///
/// Variant names are serialized verbatim (e.g. `"state":"Prepare"`) into
/// the persisted procedure data, so renaming a variant breaks recovery of
/// in-flight procedures.
pub enum AlterRegionFollowerState {
    /// Prepares to alter region follower.
    Prepare,
    /// Sends alter region follower request to Datanode.
    SubmitRequest,
    /// Updates table metadata.
    UpdateMetadata,
    /// Broadcasts the invalidate table route cache message.
    InvalidateTableCache,
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The wire format of `AlterRegionFollowerData` is persisted by the
    /// procedure framework, so the exact JSON layout must stay stable.
    #[test]
    fn test_data_serialization() {
        let data = AlterRegionFollowerData {
            catalog: String::from("test_catalog"),
            schema: String::from("test_schema"),
            region_id: RegionId::new(1, 1),
            peer_id: 1,
            peer: None,
            datanode_table_value: None,
            table_route: None,
            state: AlterRegionFollowerState::Prepare,
        };

        // RegionId(1, 1) encodes to (1 << 32) | 1.
        assert_eq!(4294967297, data.region_id.as_u64());

        let expected = r#"{"catalog":"test_catalog","schema":"test_schema","region_id":4294967297,"peer_id":1,"peer":null,"datanode_table_value":null,"table_route":null,"state":"Prepare"}"#;
        assert_eq!(serde_json::to_string(&data).unwrap(), expected);
    }
}
|
||||
@@ -1,247 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::instruction::CacheIdent;
|
||||
use common_procedure::error::ToJsonSnafu;
|
||||
use common_procedure::{
|
||||
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure,
|
||||
Result as ProcedureResult, Status,
|
||||
};
|
||||
use common_telemetry::info;
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::create::CreateFollower;
|
||||
use super::{AlterRegionFollowerData, AlterRegionFollowerState, Context};
|
||||
use crate::error::{self, Result};
|
||||
use crate::metrics;
|
||||
|
||||
/// The procedure to add a region follower.
///
/// Drives `AlterRegionFollowerData` through its states; see the
/// `Procedure` impl for the per-state dispatch.
pub struct AddRegionFollowerProcedure {
    // Persistable progress of the procedure; `dump` serializes only this.
    pub data: AlterRegionFollowerData,
    // Shared metasrv services; not persisted.
    pub context: Context,
}
|
||||
|
||||
impl AddRegionFollowerProcedure {
|
||||
pub const TYPE_NAME: &'static str = "metasrv-procedure::AddRegionFollower";
|
||||
|
||||
pub fn new(
|
||||
catalog: String,
|
||||
schema: String,
|
||||
region_id: RegionId,
|
||||
peer_id: u64,
|
||||
context: Context,
|
||||
) -> Self {
|
||||
Self {
|
||||
data: AlterRegionFollowerData {
|
||||
catalog,
|
||||
schema,
|
||||
region_id,
|
||||
peer_id,
|
||||
peer: None,
|
||||
datanode_table_value: None,
|
||||
table_route: None,
|
||||
state: AlterRegionFollowerState::Prepare,
|
||||
},
|
||||
context,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str, context: Context) -> ProcedureResult<Self> {
|
||||
let data: AlterRegionFollowerData = serde_json::from_str(json).unwrap();
|
||||
Ok(Self { data, context })
|
||||
}
|
||||
|
||||
pub async fn on_prepare(&mut self) -> Result<Status> {
|
||||
// loads the datanode peer and check peer is alive
|
||||
self.data.peer = self.data.load_datanode_peer(&self.context).await?;
|
||||
|
||||
// loads the datanode table value
|
||||
self.data.datanode_table_value = self.data.load_datanode_table_value(&self.context).await?;
|
||||
|
||||
// loads the table route of the region
|
||||
self.data.table_route = self.data.load_table_route(&self.context).await?;
|
||||
let table_route = self.data.physical_table_route().unwrap();
|
||||
let datanode_peer = self.data.datanode_peer().unwrap();
|
||||
|
||||
// check if the destination peer is already a leader/follower of the region
|
||||
for region_route in &table_route.region_routes {
|
||||
if region_route.region.id != self.data.region_id {
|
||||
continue;
|
||||
}
|
||||
let Some(leader_peer) = ®ion_route.leader_peer else {
|
||||
continue;
|
||||
};
|
||||
|
||||
// check if the destination peer is already a leader of the region
|
||||
if leader_peer.id == datanode_peer.id {
|
||||
return error::RegionFollowerLeaderConflictSnafu {
|
||||
region_id: self.data.region_id,
|
||||
peer_id: datanode_peer.id,
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
|
||||
// check if the destination peer is already a follower of the region
|
||||
if region_route
|
||||
.follower_peers
|
||||
.iter()
|
||||
.any(|peer| peer.id == datanode_peer.id)
|
||||
{
|
||||
return error::MultipleRegionFollowersOnSameNodeSnafu {
|
||||
region_id: self.data.region_id,
|
||||
peer_id: datanode_peer.id,
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Add region({}) follower procedure is preparing, peer: {datanode_peer:?}",
|
||||
self.data.region_id
|
||||
);
|
||||
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
|
||||
pub async fn on_submit_request(&mut self) -> Result<Status> {
|
||||
let region_id = self.data.region_id;
|
||||
// Safety: we have already set the peer in `on_prepare``.
|
||||
let peer = self.data.peer.clone().unwrap();
|
||||
let create_follower = CreateFollower::new(region_id, peer);
|
||||
let instruction = create_follower
|
||||
.build_open_region_instruction(self.data.region_info().unwrap())
|
||||
.await?;
|
||||
create_follower
|
||||
.send_open_region_instruction(&self.context, instruction)
|
||||
.await?;
|
||||
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
|
||||
pub async fn on_update_metadata(&mut self) -> Result<Status> {
|
||||
// Safety: we have already load the table route in `on_prepare``.
|
||||
let (current_table_route_value, phy_table_route) = self.data.table_route.as_ref().unwrap();
|
||||
|
||||
let mut new_region_routes = phy_table_route.region_routes.clone();
|
||||
for region_route in &mut new_region_routes {
|
||||
if region_route.region.id != self.data.region_id {
|
||||
continue;
|
||||
}
|
||||
region_route
|
||||
.follower_peers
|
||||
.push(self.data.peer.clone().unwrap());
|
||||
}
|
||||
|
||||
// Safety: we have already load the region info in `on_prepare`.
|
||||
let region_info = self.data.region_info().unwrap();
|
||||
let new_region_options = region_info.region_options.clone();
|
||||
let new_region_wal_options = region_info.region_wal_options.clone();
|
||||
|
||||
self.context
|
||||
.table_metadata_manager
|
||||
.update_table_route(
|
||||
self.data.region_id.table_id(),
|
||||
region_info,
|
||||
current_table_route_value,
|
||||
new_region_routes,
|
||||
&new_region_options,
|
||||
&new_region_wal_options,
|
||||
)
|
||||
.await
|
||||
.context(error::TableMetadataManagerSnafu)?;
|
||||
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
|
||||
pub async fn on_broadcast(&mut self) -> Result<Status> {
|
||||
let table_id = self.data.region_id.table_id();
|
||||
// ignore the result
|
||||
let ctx = common_meta::cache_invalidator::Context::default();
|
||||
let _ = self
|
||||
.context
|
||||
.cache_invalidator
|
||||
.invalidate(&ctx, &[CacheIdent::TableId(table_id)])
|
||||
.await;
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Procedure for AddRegionFollowerProcedure {
|
||||
fn type_name(&self) -> &str {
|
||||
Self::TYPE_NAME
|
||||
}
|
||||
|
||||
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
|
||||
let state = &self.data.state;
|
||||
|
||||
let _timer = metrics::METRIC_META_ADD_REGION_FOLLOWER_EXECUTE
|
||||
.with_label_values(&[state.as_ref()])
|
||||
.start_timer();
|
||||
|
||||
match state {
|
||||
AlterRegionFollowerState::Prepare => self.on_prepare().await,
|
||||
AlterRegionFollowerState::SubmitRequest => self.on_submit_request().await,
|
||||
AlterRegionFollowerState::UpdateMetadata => self.on_update_metadata().await,
|
||||
AlterRegionFollowerState::InvalidateTableCache => self.on_broadcast().await,
|
||||
}
|
||||
.map_err(|e| {
|
||||
if e.is_retryable() {
|
||||
ProcedureError::retry_later(e)
|
||||
} else {
|
||||
ProcedureError::external(e)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn dump(&self) -> ProcedureResult<String> {
|
||||
serde_json::to_string(&self.data).context(ToJsonSnafu)
|
||||
}
|
||||
|
||||
fn lock_key(&self) -> LockKey {
|
||||
LockKey::new(self.data.lock_key())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock};

    use super::*;
    use crate::procedure::region_follower::test_util::TestingEnv;

    #[tokio::test]
    async fn test_lock_key() {
        let env = TestingEnv::new();
        let context = env.new_context();

        let region_id = RegionId::new(1, 1);
        let procedure = AddRegionFollowerProcedure::new(
            "test_catalog".to_string(),
            "test_schema".to_string(),
            region_id,
            1,
            context,
        );

        // Collect the lock entries the procedure will acquire.
        let lock_key = procedure.lock_key();
        let entries: Vec<_> = lock_key.keys_to_lock().cloned().collect();
        assert_eq!(4, entries.len());

        // Catalog/schema are shared (read) locks; table/region are exclusive.
        assert!(entries.contains(&CatalogLock::Read("test_catalog").into()));
        assert!(entries.contains(&SchemaLock::read("test_catalog", "test_schema").into()));
        assert!(entries.contains(&TableLock::Write(region_id.table_id()).into()));
        assert!(entries.contains(&RegionLock::Write(region_id).into()));
    }
}
|
||||
@@ -1,252 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use api::v1::meta::MailboxMessage;
|
||||
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
|
||||
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::key::datanode_table::RegionInfo;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::RegionIdent;
|
||||
use common_telemetry::info;
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::Context;
|
||||
use crate::error::{self, Result};
|
||||
use crate::handler::HeartbeatMailbox;
|
||||
use crate::service::mailbox::Channel;
|
||||
|
||||
/// Uses lease time of a region as the timeout of opening a follower region.
const OPEN_REGION_FOLLOWER_TIMEOUT: Duration = Duration::from_secs(REGION_LEASE_SECS);

/// One-shot task that opens a follower region on a target datanode via the
/// metasrv mailbox.
pub(crate) struct CreateFollower {
    // The region to open a follower replica for.
    region_id: RegionId,
    // The peer of the datanode to add region follower.
    peer: Peer,
}
|
||||
|
||||
impl CreateFollower {
    /// Creates a task that will open a follower of `region_id` on `peer`.
    pub fn new(region_id: RegionId, peer: Peer) -> Self {
        Self { region_id, peer }
    }

    /// Builds the open region instruction for the region follower.
    ///
    /// The follower reuses the region's storage path and options taken
    /// from `region_info`.
    // NOTE(review): no `.await` inside; presumably kept `async` for
    // signature symmetry with the send path — TODO confirm.
    pub(crate) async fn build_open_region_instruction(
        &self,
        region_info: RegionInfo,
    ) -> Result<Instruction> {
        let datanode_id = self.peer.id;
        let table_id = self.region_id.table_id();
        let region_number = self.region_id.region_number();

        let RegionInfo {
            region_storage_path,
            region_options,
            region_wal_options,
            engine,
        } = region_info;

        let region_ident = RegionIdent {
            datanode_id,
            table_id,
            region_number,
            engine,
        };

        // The trailing `true` is `skip_wal_replay` (see the `OpenRegion`
        // literal in the tests below).
        let open_instruction = Instruction::OpenRegion(OpenRegion::new(
            region_ident,
            &region_storage_path,
            region_options,
            region_wal_options,
            true,
        ));

        Ok(open_instruction)
    }

    /// Sends the open region instruction to the datanode.
    ///
    /// Datanode-side failures and mailbox timeouts are surfaced as
    /// retryable `RetryLater` errors; an unexpected reply type is a
    /// terminal `UnexpectedInstructionReply` error.
    pub(crate) async fn send_open_region_instruction(
        &self,
        ctx: &Context,
        instruction: Instruction,
    ) -> Result<()> {
        // TODO(jeremy): register the opening_region_keeper
        let msg = MailboxMessage::json_message(
            &format!("Open a follower region: {}", self.region_id),
            &format!("Metasrv@{}", ctx.server_addr),
            &format!("Datanode-{}@{}", self.peer.id, self.peer.addr),
            common_time::util::current_time_millis(),
            &instruction,
        )
        .with_context(|_| error::SerializeToJsonSnafu {
            input: instruction.to_string(),
        })?;

        let ch = Channel::Datanode(self.peer.id);
        // Elapsed time is reported in the log/error messages below.
        let now = Instant::now();
        let receiver = ctx
            .mailbox
            .send(&ch, msg, OPEN_REGION_FOLLOWER_TIMEOUT)
            .await?;

        match receiver.await? {
            Ok(msg) => {
                let reply = HeartbeatMailbox::json_reply(&msg)?;
                info!(
                    "Received open region follower reply: {:?}, region: {}, elapsed: {:?}",
                    reply,
                    self.region_id,
                    now.elapsed()
                );
                let InstructionReply::OpenRegion(SimpleReply { result, error }) = reply else {
                    return error::UnexpectedInstructionReplySnafu {
                        mailbox_message: msg.to_string(),
                        reason: "expect open region follower reply",
                    }
                    .fail();
                };

                if result {
                    Ok(())
                } else {
                    // The datanode failed to open the region; retryable.
                    error::RetryLaterSnafu {
                        reason: format!(
                            "Region {} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}",
                            self.region_id,
                            &self.peer,
                            now.elapsed()
                        ),
                    }
                    .fail()
                }
            }
            Err(error::Error::MailboxTimeout { .. }) => {
                // Timeouts are mapped to a retryable error.
                let reason = format!(
                    "Mailbox received timeout for open region follower {} on datanode {:?}, elapsed: {:?}",
                    self.region_id,
                    &self.peer,
                    now.elapsed()
                );
                error::RetryLaterSnafu { reason }.fail()
            }
            // Any other mailbox error is propagated as-is.
            Err(e) => Err(e),
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::assert_matches::assert_matches;

    use common_meta::DatanodeId;

    use super::*;
    use crate::error::Error;
    use crate::procedure::region_follower::test_util::TestingEnv;
    use crate::procedure::test_util::{new_close_region_reply, send_mock_reply};

    // No pusher is registered for the datanode in the testing env, so
    // sending must fail with a non-retryable `PusherNotFound`.
    #[tokio::test]
    async fn test_datanode_is_unreachable() {
        let env = TestingEnv::new();
        let ctx = env.new_context();

        let region_id = RegionId::new(1, 1);
        let peer = Peer::new(1, "127.0.0.1:8080");
        let create_follower = CreateFollower::new(region_id, peer.clone());
        let instruction = mock_open_region_instruction(peer.id, region_id);
        let err = create_follower
            .send_open_region_instruction(&ctx, instruction)
            .await
            .unwrap_err();
        assert_matches!(err, Error::PusherNotFound { .. });
        assert!(!err.is_retryable());
    }

    // A reply of the wrong kind (close-region instead of open-region) must
    // surface as a non-retryable `UnexpectedInstructionReply`.
    #[tokio::test]
    async fn test_unexpected_instruction_reply() {
        let mut env = TestingEnv::new();
        let ctx = env.new_context();
        let mailbox_ctx = env.mailbox_context_mut();
        let mailbox = mailbox_ctx.mailbox().clone();

        let region_id = RegionId::new(1, 1);
        let peer = Peer::new(1, "127.0.0.1:8080");

        let (tx, rx) = tokio::sync::mpsc::channel(1);

        mailbox_ctx
            .insert_heartbeat_response_receiver(Channel::Datanode(peer.id), tx)
            .await;

        // Replies with a close-region reply, which is the wrong kind for an
        // open request.
        send_mock_reply(mailbox, rx, |id| Ok(new_close_region_reply(id)));

        let create_follower = CreateFollower::new(region_id, peer.clone());
        let instruction = mock_open_region_instruction(peer.id, region_id);
        let err = create_follower
            .send_open_region_instruction(&ctx, instruction)
            .await
            .unwrap_err();
        assert_matches!(err, Error::UnexpectedInstructionReply { .. });
        assert!(!err.is_retryable());
    }

    // A mailbox timeout must be converted into a retryable `RetryLater`.
    #[tokio::test]
    async fn test_instruction_exceeded_deadline() {
        let mut env = TestingEnv::new();
        let ctx = env.new_context();
        let mailbox_ctx = env.mailbox_context_mut();
        let mailbox = mailbox_ctx.mailbox().clone();

        let region_id = RegionId::new(1, 1);
        let peer = Peer::new(1, "127.0.0.1:8080");

        let (tx, rx) = tokio::sync::mpsc::channel(1);

        mailbox_ctx
            .insert_heartbeat_response_receiver(Channel::Datanode(peer.id), tx)
            .await;

        // Sends a timeout error.
        send_mock_reply(mailbox, rx, |id| {
            Err(error::MailboxTimeoutSnafu { id }.build())
        });

        let create_follower = CreateFollower::new(region_id, peer.clone());
        let instruction = mock_open_region_instruction(peer.id, region_id);
        let err = create_follower
            .send_open_region_instruction(&ctx, instruction)
            .await
            .unwrap_err();
        assert_matches!(err, Error::RetryLater { .. });
        assert!(err.is_retryable());
    }

    // Builds a minimal open-region instruction addressed to `datanode_id`.
    fn mock_open_region_instruction(datanode_id: DatanodeId, region_id: RegionId) -> Instruction {
        Instruction::OpenRegion(OpenRegion {
            region_ident: RegionIdent {
                datanode_id,
                table_id: region_id.table_id(),
                region_number: region_id.region_number(),
                engine: "mito2".to_string(),
            },
            region_storage_path: "/tmp".to_string(),
            region_options: Default::default(),
            region_wal_options: Default::default(),
            skip_wal_replay: true,
        })
    }
}
|
||||
@@ -1,193 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::rpc::procedure::{AddRegionFollowerRequest, RemoveRegionFollowerRequest};
|
||||
use common_procedure::{watcher, Output, ProcedureId, ProcedureManagerRef, ProcedureWithId};
|
||||
use common_telemetry::info;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::storage::RegionId;
|
||||
use table::table_name::TableName;
|
||||
|
||||
use super::remove_region_follower::RemoveRegionFollowerProcedure;
|
||||
use crate::error::{self, Result};
|
||||
use crate::procedure::region_follower::add_region_follower::AddRegionFollowerProcedure;
|
||||
use crate::procedure::region_follower::Context;
|
||||
|
||||
/// Entry point for submitting add/remove region follower procedures and
/// registering their loaders with the procedure framework.
pub struct RegionFollowerManager {
    // Used to register procedure loaders and submit procedures.
    procedure_manager: ProcedureManagerRef,
    // Template context cloned for every new procedure (see `new_context`).
    default_context: Context,
}
|
||||
|
||||
impl RegionFollowerManager {
|
||||
pub fn new(procedure_manager: ProcedureManagerRef, default_context: Context) -> Self {
|
||||
Self {
|
||||
procedure_manager,
|
||||
default_context,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_context(&self) -> Context {
|
||||
self.default_context.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn try_start(&self) -> Result<()> {
|
||||
// register add region follower procedure
|
||||
let context = self.new_context();
|
||||
let type_name = AddRegionFollowerProcedure::TYPE_NAME;
|
||||
self.procedure_manager
|
||||
.register_loader(
|
||||
type_name,
|
||||
Box::new(move |json| {
|
||||
let context = context.clone();
|
||||
AddRegionFollowerProcedure::from_json(json, context).map(|p| Box::new(p) as _)
|
||||
}),
|
||||
)
|
||||
.context(error::RegisterProcedureLoaderSnafu { type_name })?;
|
||||
|
||||
// register remove region follower procedure
|
||||
let context = self.new_context();
|
||||
let type_name = RemoveRegionFollowerProcedure::TYPE_NAME;
|
||||
self.procedure_manager
|
||||
.register_loader(
|
||||
type_name,
|
||||
Box::new(move |json| {
|
||||
let context = context.clone();
|
||||
RemoveRegionFollowerProcedure::from_json(json, context)
|
||||
.map(|p| Box::new(p) as _)
|
||||
}),
|
||||
)
|
||||
.context(error::RegisterProcedureLoaderSnafu { type_name })?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn submit_add_follower_procedure(
|
||||
&self,
|
||||
req: AddRegionFollowerRequest,
|
||||
) -> Result<(ProcedureId, Option<Output>)> {
|
||||
let AddRegionFollowerRequest { region_id, peer_id } = req;
|
||||
let region_id = RegionId::from_u64(region_id);
|
||||
let table_id = region_id.table_id();
|
||||
let ctx = self.new_context();
|
||||
|
||||
// get the table info
|
||||
let table_info = ctx
|
||||
.table_metadata_manager
|
||||
.table_info_manager()
|
||||
.get(table_id)
|
||||
.await
|
||||
.context(error::TableMetadataManagerSnafu)?
|
||||
.context(error::TableInfoNotFoundSnafu { table_id })?
|
||||
.into_inner();
|
||||
|
||||
let TableName {
|
||||
catalog_name,
|
||||
schema_name,
|
||||
..
|
||||
} = table_info.table_name();
|
||||
|
||||
let procedure =
|
||||
AddRegionFollowerProcedure::new(catalog_name, schema_name, region_id, peer_id, ctx);
|
||||
|
||||
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
|
||||
let procedure_id = procedure_with_id.id;
|
||||
info!("Starting add region follower procedure {procedure_id} for {req:?}");
|
||||
let mut watcher = self
|
||||
.procedure_manager
|
||||
.submit(procedure_with_id)
|
||||
.await
|
||||
.context(error::SubmitProcedureSnafu)?;
|
||||
let output = watcher::wait(&mut watcher)
|
||||
.await
|
||||
.context(error::WaitProcedureSnafu)?;
|
||||
|
||||
Ok((procedure_id, output))
|
||||
}
|
||||
|
||||
pub async fn submit_remove_follower_procedure(
|
||||
&self,
|
||||
req: RemoveRegionFollowerRequest,
|
||||
) -> Result<(ProcedureId, Option<Output>)> {
|
||||
let RemoveRegionFollowerRequest { region_id, peer_id } = req;
|
||||
let region_id = RegionId::from_u64(region_id);
|
||||
let table_id = region_id.table_id();
|
||||
let ctx = self.new_context();
|
||||
|
||||
// get the table info
|
||||
let table_info = ctx
|
||||
.table_metadata_manager
|
||||
.table_info_manager()
|
||||
.get(table_id)
|
||||
.await
|
||||
.context(error::TableMetadataManagerSnafu)?
|
||||
.context(error::TableInfoNotFoundSnafu { table_id })?
|
||||
.into_inner();
|
||||
|
||||
let TableName {
|
||||
catalog_name,
|
||||
schema_name,
|
||||
..
|
||||
} = table_info.table_name();
|
||||
|
||||
let procedure =
|
||||
RemoveRegionFollowerProcedure::new(catalog_name, schema_name, region_id, peer_id, ctx);
|
||||
|
||||
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
|
||||
let procedure_id = procedure_with_id.id;
|
||||
info!("Starting remove region follower procedure {procedure_id} for {req:?}");
|
||||
let mut watcher = self
|
||||
.procedure_manager
|
||||
.submit(procedure_with_id)
|
||||
.await
|
||||
.context(error::SubmitProcedureSnafu)?;
|
||||
let output = watcher::wait(&mut watcher)
|
||||
.await
|
||||
.context(error::WaitProcedureSnafu)?;
|
||||
|
||||
Ok((procedure_id, output))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::assert_matches::assert_matches;

    use super::*;
    use crate::procedure::region_follower::test_util::TestingEnv;

    #[tokio::test]
    async fn test_submit_procedure_table_not_found() {
        let env = TestingEnv::new();
        let ctx = env.new_context();
        let manager = RegionFollowerManager::new(env.procedure_manager(), ctx);

        // The testing env holds no table metadata, so both submissions
        // must fail the table-info lookup.
        let add_err = manager
            .submit_add_follower_procedure(AddRegionFollowerRequest {
                region_id: 1,
                peer_id: 2,
            })
            .await
            .unwrap_err();
        assert_matches!(add_err, error::Error::TableInfoNotFound { .. });

        let remove_err = manager
            .submit_remove_follower_procedure(RemoveRegionFollowerRequest {
                region_id: 1,
                peer_id: 2,
            })
            .await
            .unwrap_err();
        assert_matches!(remove_err, error::Error::TableInfoNotFound { .. });
    }
}
|
||||
@@ -1,234 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use api::v1::meta::MailboxMessage;
|
||||
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
|
||||
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
|
||||
use common_meta::key::datanode_table::RegionInfo;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::RegionIdent;
|
||||
use common_telemetry::info;
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::Context;
|
||||
use crate::error::{self, Result};
|
||||
use crate::handler::HeartbeatMailbox;
|
||||
use crate::service::mailbox::Channel;
|
||||
|
||||
/// Uses lease time of a region as the timeout of closing a follower region.
const CLOSE_REGION_FOLLOWER_TIMEOUT: Duration = Duration::from_secs(REGION_LEASE_SECS);

/// One-shot task that closes a follower region on a target datanode via
/// the metasrv mailbox.
pub(crate) struct RemoveFollower {
    // The region whose follower replica should be closed.
    region_id: RegionId,
    // The peer of the datanode hosting the follower region to close.
    peer: Peer,
}
|
||||
|
||||
impl RemoveFollower {
|
||||
/// Creates a task that will close the follower of `region_id` on `peer`.
pub fn new(region_id: RegionId, peer: Peer) -> Self {
    Self { region_id, peer }
}

/// Builds the close region instruction for the region follower.
///
/// Only the engine name is needed from `region_info`; the instruction
/// addresses the region purely by its `RegionIdent`.
// NOTE(review): no `.await` inside; presumably kept `async` for symmetry
// with the open-path builder — TODO confirm.
pub(crate) async fn build_close_region_instruction(
    &self,
    region_info: RegionInfo,
) -> Result<Instruction> {
    let datanode_id = self.peer.id;
    let table_id = self.region_id.table_id();
    let region_number = self.region_id.region_number();

    let RegionInfo { engine, .. } = region_info;

    let region_ident = RegionIdent {
        datanode_id,
        table_id,
        region_number,
        engine,
    };

    let close_instruction = Instruction::CloseRegion(region_ident);

    Ok(close_instruction)
}
|
||||
|
||||
/// Sends the close region instruction to the datanode.
|
||||
pub(crate) async fn send_close_region_instruction(
|
||||
&self,
|
||||
ctx: &Context,
|
||||
instruction: Instruction,
|
||||
) -> Result<()> {
|
||||
let msg = MailboxMessage::json_message(
|
||||
&format!("Close a follower region: {}", self.region_id),
|
||||
&format!("Metasrv@{}", ctx.server_addr),
|
||||
&format!("Datanode-{}@{}", self.peer.id, self.peer.addr),
|
||||
common_time::util::current_time_millis(),
|
||||
&instruction,
|
||||
)
|
||||
.with_context(|_| error::SerializeToJsonSnafu {
|
||||
input: instruction.to_string(),
|
||||
})?;
|
||||
|
||||
let ch = Channel::Datanode(self.peer.id);
|
||||
let now = Instant::now();
|
||||
let receiver = ctx
|
||||
.mailbox
|
||||
.send(&ch, msg, CLOSE_REGION_FOLLOWER_TIMEOUT)
|
||||
.await?;
|
||||
|
||||
match receiver.await? {
|
||||
Ok(msg) => {
|
||||
let reply = HeartbeatMailbox::json_reply(&msg)?;
|
||||
info!(
|
||||
"Received close region follower reply: {:?}, region: {}, elapsed: {:?}",
|
||||
reply,
|
||||
self.region_id,
|
||||
now.elapsed()
|
||||
);
|
||||
let InstructionReply::CloseRegion(SimpleReply { result, error }) = reply else {
|
||||
return error::UnexpectedInstructionReplySnafu {
|
||||
mailbox_message: msg.to_string(),
|
||||
reason: "expect close region follower reply",
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
|
||||
if result {
|
||||
Ok(())
|
||||
} else {
|
||||
error::RetryLaterSnafu {
|
||||
reason: format!(
|
||||
"Region {} is not closed by datanode {:?}, error: {error:?}, elapsed: {:?}",
|
||||
self.region_id,
|
||||
&self.peer,
|
||||
now.elapsed()
|
||||
),
|
||||
}
|
||||
.fail()
|
||||
}
|
||||
}
|
||||
Err(error::Error::MailboxTimeout { .. }) => {
|
||||
let reason = format!(
|
||||
"Mailbox received timeout for close region follower {} on datanode {:?}, elapsed: {:?}",
|
||||
self.region_id,
|
||||
&self.peer,
|
||||
now.elapsed()
|
||||
);
|
||||
error::RetryLaterSnafu { reason }.fail()
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::assert_matches::assert_matches;
|
||||
|
||||
use common_meta::DatanodeId;
|
||||
|
||||
use super::*;
|
||||
use crate::error::Error;
|
||||
use crate::procedure::region_follower::test_util::TestingEnv;
|
||||
use crate::procedure::test_util::{new_open_region_reply, send_mock_reply};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_datanode_is_unreachable() {
|
||||
let env = TestingEnv::new();
|
||||
let ctx = env.new_context();
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let peer = Peer::new(1, "127.0.0.1:8080");
|
||||
let remove_follower = RemoveFollower::new(region_id, peer.clone());
|
||||
let instruction = mock_close_region_instruction(peer.id, region_id);
|
||||
let err = remove_follower
|
||||
.send_close_region_instruction(&ctx, instruction)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::PusherNotFound { .. });
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_unexpected_instruction_reply() {
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.new_context();
|
||||
let mailbox_ctx = env.mailbox_context_mut();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let peer = Peer::new(1, "127.0.0.1:8080");
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
|
||||
mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(peer.id), tx)
|
||||
.await;
|
||||
|
||||
// Sends an timeout error.
|
||||
send_mock_reply(mailbox, rx, |id| Ok(new_open_region_reply(id, false, None)));
|
||||
|
||||
let remove_follower = RemoveFollower::new(region_id, peer.clone());
|
||||
let instruction = mock_close_region_instruction(peer.id, region_id);
|
||||
let err = remove_follower
|
||||
.send_close_region_instruction(&ctx, instruction)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::UnexpectedInstructionReply { .. });
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_instruction_exceeded_deadline() {
|
||||
let mut env = TestingEnv::new();
|
||||
let ctx = env.new_context();
|
||||
let mailbox_ctx = env.mailbox_context_mut();
|
||||
let mailbox = mailbox_ctx.mailbox().clone();
|
||||
|
||||
let region_id = RegionId::new(1, 1);
|
||||
let peer = Peer::new(1, "127.0.0.1:8080");
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
|
||||
mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(peer.id), tx)
|
||||
.await;
|
||||
|
||||
// Sends an timeout error.
|
||||
send_mock_reply(mailbox, rx, |id| {
|
||||
Err(error::MailboxTimeoutSnafu { id }.build())
|
||||
});
|
||||
|
||||
let remove_follower = RemoveFollower::new(region_id, peer.clone());
|
||||
let instruction = mock_close_region_instruction(peer.id, region_id);
|
||||
let err = remove_follower
|
||||
.send_close_region_instruction(&ctx, instruction)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
}
|
||||
|
||||
fn mock_close_region_instruction(datanode_id: DatanodeId, region_id: RegionId) -> Instruction {
|
||||
Instruction::CloseRegion(RegionIdent {
|
||||
datanode_id,
|
||||
table_id: region_id.table_id(),
|
||||
region_number: region_id.region_number(),
|
||||
engine: "mito2".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,233 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::instruction::CacheIdent;
|
||||
use common_procedure::error::ToJsonSnafu;
|
||||
use common_procedure::{
|
||||
Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure,
|
||||
Result as ProcedureResult, Status,
|
||||
};
|
||||
use common_telemetry::info;
|
||||
use snafu::{ensure, ResultExt};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use super::remove::RemoveFollower;
|
||||
use super::{AlterRegionFollowerData, AlterRegionFollowerState, Context};
|
||||
use crate::error::{self, Result};
|
||||
use crate::metrics;
|
||||
|
||||
/// The procedure to remove a region follower.
|
||||
pub struct RemoveRegionFollowerProcedure {
|
||||
pub data: AlterRegionFollowerData,
|
||||
pub context: Context,
|
||||
}
|
||||
|
||||
impl RemoveRegionFollowerProcedure {
|
||||
pub const TYPE_NAME: &'static str = "metasrv-procedure::RemoveRegionFollower";
|
||||
|
||||
pub fn new(
|
||||
catalog: String,
|
||||
schema: String,
|
||||
region_id: RegionId,
|
||||
peer_id: u64,
|
||||
context: Context,
|
||||
) -> Self {
|
||||
Self {
|
||||
data: AlterRegionFollowerData {
|
||||
catalog,
|
||||
schema,
|
||||
region_id,
|
||||
peer_id,
|
||||
peer: None,
|
||||
datanode_table_value: None,
|
||||
table_route: None,
|
||||
state: AlterRegionFollowerState::Prepare,
|
||||
},
|
||||
context,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str, context: Context) -> ProcedureResult<Self> {
|
||||
let data: AlterRegionFollowerData = serde_json::from_str(json).unwrap();
|
||||
Ok(Self { data, context })
|
||||
}
|
||||
|
||||
pub async fn on_prepare(&mut self) -> Result<Status> {
|
||||
// loads the datanode peer and check peer is alive
|
||||
self.data.peer = self.data.load_datanode_peer(&self.context).await?;
|
||||
|
||||
// loads the datanode table value
|
||||
self.data.datanode_table_value = self.data.load_datanode_table_value(&self.context).await?;
|
||||
|
||||
// loads the table route of the region
|
||||
self.data.table_route = self.data.load_table_route(&self.context).await?;
|
||||
let table_route = self.data.physical_table_route().unwrap();
|
||||
|
||||
// check if the destination peer has this region
|
||||
for region_route in &table_route.region_routes {
|
||||
if region_route.region.id != self.data.region_id {
|
||||
continue;
|
||||
}
|
||||
ensure!(
|
||||
!region_route
|
||||
.follower_peers
|
||||
.iter()
|
||||
.any(|peer| peer.id == self.data.peer_id),
|
||||
error::RegionFollowerNotExistsSnafu {
|
||||
region_id: self.data.region_id,
|
||||
peer_id: self.data.peer_id,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Remove region({}) follower procedure is preparing, peer: {:?}",
|
||||
self.data.region_id,
|
||||
self.data.datanode_peer()
|
||||
);
|
||||
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
|
||||
pub async fn on_submit_request(&mut self) -> Result<Status> {
|
||||
let region_id = self.data.region_id;
|
||||
// Safety: we have already set the peer in `on_prepare``.
|
||||
let peer = self.data.peer.clone().unwrap();
|
||||
let remove_follower = RemoveFollower::new(region_id, peer);
|
||||
let instruction = remove_follower
|
||||
.build_close_region_instruction(self.data.region_info().unwrap())
|
||||
.await?;
|
||||
remove_follower
|
||||
.send_close_region_instruction(&self.context, instruction)
|
||||
.await?;
|
||||
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
|
||||
pub async fn on_update_metadata(&mut self) -> Result<Status> {
|
||||
// Safety: we have already load the table route in `on_prepare``.
|
||||
let (current_table_route_value, phy_table_route) = self.data.table_route.as_ref().unwrap();
|
||||
|
||||
let mut new_region_routes = phy_table_route.region_routes.clone();
|
||||
for region_route in &mut new_region_routes {
|
||||
if region_route.region.id != self.data.region_id {
|
||||
continue;
|
||||
}
|
||||
// remove the follower peer from the region route
|
||||
region_route
|
||||
.follower_peers
|
||||
.retain(|peer| peer.id != self.data.peer_id);
|
||||
}
|
||||
|
||||
// Safety: we have already load the region info in `on_prepare`.
|
||||
let region_info = self.data.region_info().unwrap();
|
||||
let new_region_options = region_info.region_options.clone();
|
||||
let new_region_wal_options = region_info.region_wal_options.clone();
|
||||
|
||||
self.context
|
||||
.table_metadata_manager
|
||||
.update_table_route(
|
||||
self.data.region_id.table_id(),
|
||||
region_info,
|
||||
current_table_route_value,
|
||||
new_region_routes,
|
||||
&new_region_options,
|
||||
&new_region_wal_options,
|
||||
)
|
||||
.await
|
||||
.context(error::TableMetadataManagerSnafu)?;
|
||||
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
|
||||
pub async fn on_broadcast(&mut self) -> Result<Status> {
|
||||
let table_id = self.data.region_id.table_id();
|
||||
// ignore the result
|
||||
let ctx = common_meta::cache_invalidator::Context::default();
|
||||
let _ = self
|
||||
.context
|
||||
.cache_invalidator
|
||||
.invalidate(&ctx, &[CacheIdent::TableId(table_id)])
|
||||
.await;
|
||||
Ok(Status::executing(true))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Procedure for RemoveRegionFollowerProcedure {
|
||||
fn type_name(&self) -> &str {
|
||||
Self::TYPE_NAME
|
||||
}
|
||||
|
||||
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
|
||||
let state = &self.data.state;
|
||||
|
||||
let _timer = metrics::METRIC_META_REMOVE_REGION_FOLLOWER_EXECUTE
|
||||
.with_label_values(&[state.as_ref()])
|
||||
.start_timer();
|
||||
|
||||
match state {
|
||||
AlterRegionFollowerState::Prepare => self.on_prepare().await,
|
||||
AlterRegionFollowerState::SubmitRequest => self.on_submit_request().await,
|
||||
AlterRegionFollowerState::UpdateMetadata => self.on_update_metadata().await,
|
||||
AlterRegionFollowerState::InvalidateTableCache => self.on_broadcast().await,
|
||||
}
|
||||
.map_err(|e| {
|
||||
if e.is_retryable() {
|
||||
ProcedureError::retry_later(e)
|
||||
} else {
|
||||
ProcedureError::external(e)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn dump(&self) -> ProcedureResult<String> {
|
||||
serde_json::to_string(&self.data).context(ToJsonSnafu)
|
||||
}
|
||||
|
||||
fn lock_key(&self) -> LockKey {
|
||||
LockKey::new(self.data.lock_key())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock};
|
||||
|
||||
use super::*;
|
||||
use crate::procedure::region_follower::test_util::TestingEnv;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_lock_key() {
|
||||
let env = TestingEnv::new();
|
||||
let context = env.new_context();
|
||||
|
||||
let procedure = RemoveRegionFollowerProcedure::new(
|
||||
"test_catalog".to_string(),
|
||||
"test_schema".to_string(),
|
||||
RegionId::new(1, 1),
|
||||
1,
|
||||
context,
|
||||
);
|
||||
|
||||
let key = procedure.lock_key();
|
||||
let keys = key.keys_to_lock().cloned().collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(keys.len(), 4);
|
||||
assert!(keys.contains(&CatalogLock::Read("test_catalog").into()));
|
||||
assert!(keys.contains(&SchemaLock::read("test_catalog", "test_schema").into()));
|
||||
assert!(keys.contains(&TableLock::Write(1).into()));
|
||||
assert!(keys.contains(&RegionLock::Write(RegionId::new(1, 1)).into()));
|
||||
}
|
||||
}
|
||||
@@ -1,97 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_meta::key::{TableMetadataManager, TableMetadataManagerRef};
|
||||
use common_meta::kv_backend::memory::MemoryKvBackend;
|
||||
use common_meta::kv_backend::ResettableKvBackendRef;
|
||||
use common_meta::sequence::SequenceBuilder;
|
||||
use common_meta::state_store::KvStateStore;
|
||||
use common_procedure::local::{LocalManager, ManagerConfig};
|
||||
use common_procedure::ProcedureManagerRef;
|
||||
|
||||
use super::Context;
|
||||
use crate::cache_invalidator::MetasrvCacheInvalidator;
|
||||
use crate::cluster::MetaPeerClientBuilder;
|
||||
use crate::metasrv::MetasrvInfo;
|
||||
use crate::procedure::test_util::MailboxContext;
|
||||
|
||||
/// `TestingEnv` provides components during the tests.
|
||||
pub struct TestingEnv {
|
||||
table_metadata_manager: TableMetadataManagerRef,
|
||||
mailbox_ctx: MailboxContext,
|
||||
server_addr: String,
|
||||
procedure_manager: ProcedureManagerRef,
|
||||
in_memory: ResettableKvBackendRef,
|
||||
}
|
||||
|
||||
impl Default for TestingEnv {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl TestingEnv {
|
||||
pub fn new() -> Self {
|
||||
let kv_backend = Arc::new(MemoryKvBackend::new());
|
||||
let in_memory = Arc::new(MemoryKvBackend::new());
|
||||
let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone()));
|
||||
|
||||
let mailbox_sequence =
|
||||
SequenceBuilder::new("test_heartbeat_mailbox", kv_backend.clone()).build();
|
||||
|
||||
let mailbox_ctx = MailboxContext::new(mailbox_sequence);
|
||||
|
||||
let state_store = Arc::new(KvStateStore::new(kv_backend));
|
||||
let procedure_manager = Arc::new(LocalManager::new(ManagerConfig::default(), state_store));
|
||||
|
||||
Self {
|
||||
table_metadata_manager,
|
||||
mailbox_ctx,
|
||||
server_addr: "localhost".to_string(),
|
||||
procedure_manager,
|
||||
in_memory,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_context(&self) -> Context {
|
||||
Context {
|
||||
table_metadata_manager: self.table_metadata_manager.clone(),
|
||||
mailbox: self.mailbox_ctx.mailbox().clone(),
|
||||
server_addr: self.server_addr.clone(),
|
||||
cache_invalidator: Arc::new(MetasrvCacheInvalidator::new(
|
||||
self.mailbox_ctx.mailbox().clone(),
|
||||
MetasrvInfo {
|
||||
server_addr: self.server_addr.to_string(),
|
||||
},
|
||||
)),
|
||||
meta_peer_client: MetaPeerClientBuilder::default()
|
||||
.election(None)
|
||||
.in_memory(self.in_memory.clone())
|
||||
.build()
|
||||
.map(Arc::new)
|
||||
// Safety: all required fields set at initialization
|
||||
.unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn mailbox_context_mut(&mut self) -> &mut MailboxContext {
|
||||
&mut self.mailbox_ctx
|
||||
}
|
||||
|
||||
pub fn procedure_manager(&self) -> ProcedureManagerRef {
|
||||
self.procedure_manager.clone()
|
||||
}
|
||||
}
|
||||
@@ -543,11 +543,11 @@ pub(crate) mod tests {
|
||||
assert!(rx.await.unwrap().is_empty());
|
||||
|
||||
fn generate_heartbeats(datanode_id: u64, region_ids: Vec<u32>) -> Vec<DatanodeHeartbeat> {
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let start = current_time_millis();
|
||||
(0..2000)
|
||||
.map(|i| DatanodeHeartbeat {
|
||||
timestamp: start + i * 1000 + rng.gen_range(0..100),
|
||||
timestamp: start + i * 1000 + rng.random_range(0..100),
|
||||
datanode_id,
|
||||
regions: region_ids
|
||||
.iter()
|
||||
|
||||
@@ -61,7 +61,7 @@ impl Selector for RandomNodeSelector {
|
||||
type Output = Vec<Peer>;
|
||||
|
||||
async fn select(&self, _ctx: &Self::Context, _opts: SelectorOptions) -> Result<Self::Output> {
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let mut nodes = self.nodes.clone();
|
||||
nodes.shuffle(&mut rng);
|
||||
Ok(nodes)
|
||||
|
||||
@@ -12,8 +12,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::rng;
|
||||
use rand::seq::IndexedRandom;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error;
|
||||
@@ -26,7 +26,10 @@ pub trait WeightedChoose<Item>: Send + Sync {
|
||||
|
||||
/// The method will choose multiple items.
|
||||
///
|
||||
/// Returns less than `amount` items if the weight_array is not enough.
|
||||
/// ## Note
|
||||
///
|
||||
/// - Returns less than `amount` items if the weight_array is not enough.
|
||||
/// - The returned items cannot be duplicated.
|
||||
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>>;
|
||||
|
||||
/// Returns the length of the weight_array.
|
||||
@@ -84,7 +87,7 @@ where
|
||||
// unwrap safety: whether weighted_index is none has been checked before.
|
||||
let item = self
|
||||
.items
|
||||
.choose_weighted(&mut thread_rng(), |item| item.weight as f64)
|
||||
.choose_weighted(&mut rng(), |item| item.weight as f64)
|
||||
.context(error::ChooseItemsSnafu)?
|
||||
.item
|
||||
.clone();
|
||||
@@ -92,9 +95,11 @@ where
|
||||
}
|
||||
|
||||
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>> {
|
||||
let amount = amount.min(self.items.iter().filter(|item| item.weight > 0).count());
|
||||
|
||||
Ok(self
|
||||
.items
|
||||
.choose_multiple_weighted(&mut thread_rng(), amount, |item| item.weight as f64)
|
||||
.choose_multiple_weighted(&mut rng(), amount, |item| item.weight as f64)
|
||||
.context(error::ChooseItemsSnafu)?
|
||||
.cloned()
|
||||
.map(|item| item.item)
|
||||
@@ -127,7 +132,7 @@ mod tests {
|
||||
|
||||
for _ in 0..100 {
|
||||
let ret = choose.choose_multiple(3).unwrap();
|
||||
assert_eq!(vec![1, 2], ret);
|
||||
assert_eq!(vec![1], ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ use mito2::region::options::MergeMode;
|
||||
use mito2::row_converter::DensePrimaryKeyCodec;
|
||||
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
|
||||
use rand::rngs::ThreadRng;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::seq::IndexedRandom;
|
||||
use rand::Rng;
|
||||
use store_api::metadata::{
|
||||
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
|
||||
@@ -161,8 +161,8 @@ struct Host {
|
||||
|
||||
impl Host {
|
||||
fn random_with_id(id: usize) -> Host {
|
||||
let mut rng = rand::thread_rng();
|
||||
let region = format!("ap-southeast-{}", rng.gen_range(0..10));
|
||||
let mut rng = rand::rng();
|
||||
let region = format!("ap-southeast-{}", rng.random_range(0..10));
|
||||
let datacenter = format!(
|
||||
"{}{}",
|
||||
region,
|
||||
@@ -172,12 +172,12 @@ impl Host {
|
||||
hostname: format!("host_{id}"),
|
||||
region,
|
||||
datacenter,
|
||||
rack: rng.gen_range(0..100).to_string(),
|
||||
rack: rng.random_range(0..100).to_string(),
|
||||
os: "Ubuntu16.04LTS".to_string(),
|
||||
arch: "x86".to_string(),
|
||||
team: "CHI".to_string(),
|
||||
service: rng.gen_range(0..100).to_string(),
|
||||
service_version: rng.gen_range(0..10).to_string(),
|
||||
service: rng.random_range(0..100).to_string(),
|
||||
service_version: rng.random_range(0..10).to_string(),
|
||||
service_environment: "test".to_string(),
|
||||
}
|
||||
}
|
||||
@@ -254,7 +254,7 @@ impl CpuDataGenerator {
|
||||
.hosts
|
||||
.iter()
|
||||
.map(|host| {
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let mut values = Vec::with_capacity(21);
|
||||
values.push(api::v1::Value {
|
||||
value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
|
||||
@@ -288,12 +288,12 @@ impl CpuDataGenerator {
|
||||
}
|
||||
|
||||
fn random_hostname(&self) -> String {
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
self.hosts.choose(&mut rng).unwrap().hostname.clone()
|
||||
}
|
||||
|
||||
fn random_f64(rng: &mut ThreadRng) -> f64 {
|
||||
let base: u32 = rng.gen_range(30..95);
|
||||
let base: u32 = rng.random_range(30..95);
|
||||
base as f64
|
||||
}
|
||||
|
||||
|
||||
@@ -146,14 +146,14 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn fuzz_index_calculation() {
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let mut data = vec![0u8; 1024 * 1024];
|
||||
rng.fill_bytes(&mut data);
|
||||
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..data.len() as u64);
|
||||
let size = rng.gen_range(0..data.len() as u32 - offset as u32);
|
||||
let page_size: usize = rng.gen_range(1..1024);
|
||||
let offset = rng.random_range(0..data.len() as u64);
|
||||
let size = rng.random_range(0..data.len() as u32 - offset as u32);
|
||||
let page_size: usize = rng.random_range(1..1024);
|
||||
|
||||
let indexes =
|
||||
PageKey::generate_page_keys(offset, size, page_size as u64).collect::<Vec<_>>();
|
||||
|
||||
14
src/mito2/src/cache/index/inverted_index.rs
vendored
14
src/mito2/src/cache/index/inverted_index.rs
vendored
@@ -146,14 +146,14 @@ mod test {
|
||||
#[test]
|
||||
fn fuzz_index_calculation() {
|
||||
// randomly generate a large u8 array
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let mut data = vec![0u8; 1024 * 1024];
|
||||
rng.fill_bytes(&mut data);
|
||||
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..data.len() as u64);
|
||||
let size = rng.gen_range(0..data.len() as u32 - offset as u32);
|
||||
let page_size: usize = rng.gen_range(1..1024);
|
||||
let offset = rng.random_range(0..data.len() as u64);
|
||||
let size = rng.random_range(0..data.len() as u32 - offset as u32);
|
||||
let page_size: usize = rng.random_range(1..1024);
|
||||
|
||||
let indexes =
|
||||
PageKey::generate_page_keys(offset, size, page_size as u64).collect::<Vec<_>>();
|
||||
@@ -357,10 +357,10 @@ mod test {
|
||||
);
|
||||
|
||||
// fuzz test
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
for _ in 0..FUZZ_REPEAT_TIMES {
|
||||
let offset = rng.gen_range(0..file_size);
|
||||
let size = rng.gen_range(0..file_size as u32 - offset as u32);
|
||||
let offset = rng.random_range(0..file_size);
|
||||
let size = rng.random_range(0..file_size as u32 - offset as u32);
|
||||
let expected = cached_reader.range_read(offset, size).await.unwrap();
|
||||
let inner = &cached_reader.inner;
|
||||
let read = cached_reader
|
||||
|
||||
@@ -272,7 +272,7 @@ fn prepare_batch_open_requests(
|
||||
.or_default()
|
||||
.push((region_id, request));
|
||||
}
|
||||
WalOptions::RaftEngine => {
|
||||
WalOptions::RaftEngine | WalOptions::Noop => {
|
||||
remaining_regions.push((region_id, request));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,10 +389,10 @@ mod tests {
|
||||
|
||||
fn prepare_input_keys(num_keys: usize) -> Vec<Vec<u8>> {
|
||||
let prefix = ["a", "b", "c", "d", "e", "f"];
|
||||
let mut rng = rand::thread_rng();
|
||||
let mut rng = rand::rng();
|
||||
let mut keys = Vec::with_capacity(num_keys);
|
||||
for i in 0..num_keys {
|
||||
let prefix_idx = rng.gen_range(0..prefix.len());
|
||||
let prefix_idx = rng.random_range(0..prefix.len());
|
||||
// We don't need to decode the primary key in index's test so we format the string
|
||||
// into the key.
|
||||
let key = format!("{}{}", prefix[prefix_idx], i);
|
||||
|
||||
@@ -666,17 +666,15 @@ impl Series {
|
||||
|
||||
/// Freezes active part to frozen part and compact frozen part to reduce memory fragmentation.
|
||||
/// Returns the frozen and compacted values.
|
||||
fn compact(&mut self, region_metadata: &RegionMetadataRef) -> Result<Values> {
|
||||
fn compact(&mut self, region_metadata: &RegionMetadataRef) -> Result<&Values> {
|
||||
self.freeze(region_metadata);
|
||||
|
||||
let mut frozen = self.frozen.clone();
|
||||
let frozen = &self.frozen;
|
||||
|
||||
// Each series must contain at least one row
|
||||
debug_assert!(!frozen.is_empty());
|
||||
|
||||
let values = if frozen.len() == 1 {
|
||||
frozen.pop().unwrap()
|
||||
} else {
|
||||
if frozen.len() > 1 {
|
||||
// TODO(hl): We should keep track of min/max timestamps for each values and avoid
|
||||
// cloning and sorting when values do not overlap with each other.
|
||||
|
||||
@@ -700,10 +698,9 @@ impl Series {
|
||||
|
||||
debug_assert_eq!(concatenated.len(), column_size);
|
||||
let values = Values::from_columns(&concatenated)?;
|
||||
self.frozen = vec![values.clone()];
|
||||
values
|
||||
self.frozen = vec![values];
|
||||
};
|
||||
Ok(values)
|
||||
Ok(&self.frozen[0])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1008,7 +1005,7 @@ mod tests {
|
||||
vec![ValueRef::Int64(v0), ValueRef::Float64(OrderedFloat(v1))].into_iter()
|
||||
}
|
||||
|
||||
fn check_values(values: Values, expect: &[(i64, u64, u8, i64, f64)]) {
|
||||
fn check_values(values: &Values, expect: &[(i64, u64, u8, i64, f64)]) {
|
||||
let ts = values
|
||||
.timestamp
|
||||
.as_any()
|
||||
|
||||
@@ -56,7 +56,6 @@ use crate::error::{
|
||||
ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, InvalidBatchSnafu, Result,
|
||||
};
|
||||
use crate::memtable::BoxedBatchIterator;
|
||||
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_STAGE_ELAPSED};
|
||||
use crate::read::prune::PruneReader;
|
||||
use crate::row_converter::{CompositeValues, PrimaryKeyCodec};
|
||||
|
||||
@@ -383,8 +382,13 @@ impl Batch {
|
||||
];
|
||||
let rows = converter.convert_columns(&columns).unwrap();
|
||||
let mut to_sort: Vec<_> = rows.iter().enumerate().collect();
|
||||
to_sort.sort_unstable_by(|left, right| left.1.cmp(&right.1));
|
||||
|
||||
let was_sorted = to_sort.is_sorted_by_key(|x| x.1);
|
||||
if !was_sorted {
|
||||
to_sort.sort_unstable_by_key(|x| x.1);
|
||||
}
|
||||
|
||||
let num_rows = to_sort.len();
|
||||
if dedup {
|
||||
// Dedup by timestamps.
|
||||
to_sort.dedup_by(|left, right| {
|
||||
@@ -395,7 +399,11 @@ impl Batch {
|
||||
left_key[..TIMESTAMP_KEY_LEN] == right_key[..TIMESTAMP_KEY_LEN]
|
||||
});
|
||||
}
|
||||
let no_dedup = to_sort.len() == num_rows;
|
||||
|
||||
if was_sorted && no_dedup {
|
||||
return Ok(());
|
||||
}
|
||||
let indices = UInt32Vector::from_iter_values(to_sort.iter().map(|v| v.0 as u32));
|
||||
self.take_in_place(&indices)
|
||||
}
|
||||
@@ -991,13 +999,11 @@ impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for scanners.
|
||||
/// Local metrics for scanners.
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct ScannerMetrics {
|
||||
/// Duration to prepare the scan task.
|
||||
prepare_scan_cost: Duration,
|
||||
/// Duration to build file ranges.
|
||||
build_parts_cost: Duration,
|
||||
/// Duration to build the (merge) reader.
|
||||
build_reader_cost: Duration,
|
||||
/// Duration to scan data.
|
||||
@@ -1006,8 +1012,6 @@ pub(crate) struct ScannerMetrics {
|
||||
convert_cost: Duration,
|
||||
/// Duration while waiting for `yield`.
|
||||
yield_cost: Duration,
|
||||
/// Duration of the scan.
|
||||
total_cost: Duration,
|
||||
/// Number of batches returned.
|
||||
num_batches: usize,
|
||||
/// Number of rows returned.
|
||||
@@ -1018,50 +1022,6 @@ pub(crate) struct ScannerMetrics {
|
||||
num_file_ranges: usize,
|
||||
}
|
||||
|
||||
impl ScannerMetrics {
|
||||
/// Observes metrics.
|
||||
fn observe_metrics(&self) {
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["prepare_scan"])
|
||||
.observe(self.prepare_scan_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["build_parts"])
|
||||
.observe(self.build_parts_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["build_reader"])
|
||||
.observe(self.build_reader_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["convert_rb"])
|
||||
.observe(self.convert_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["scan"])
|
||||
.observe(self.scan_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["yield"])
|
||||
.observe(self.yield_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["total"])
|
||||
.observe(self.total_cost.as_secs_f64());
|
||||
READ_ROWS_RETURN.observe(self.num_rows as f64);
|
||||
READ_BATCHES_RETURN.observe(self.num_batches as f64);
|
||||
}
|
||||
|
||||
/// Merges metrics from another [ScannerMetrics].
|
||||
fn merge_from(&mut self, other: &ScannerMetrics) {
|
||||
self.prepare_scan_cost += other.prepare_scan_cost;
|
||||
self.build_parts_cost += other.build_parts_cost;
|
||||
self.build_reader_cost += other.build_reader_cost;
|
||||
self.scan_cost += other.scan_cost;
|
||||
self.convert_cost += other.convert_cost;
|
||||
self.yield_cost += other.yield_cost;
|
||||
self.total_cost += other.total_cost;
|
||||
self.num_batches += other.num_batches;
|
||||
self.num_rows += other.num_rows;
|
||||
self.num_mem_ranges += other.num_mem_ranges;
|
||||
self.num_file_ranges += other.num_file_ranges;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use store_api::codec::PrimaryKeyEncoding;
|
||||
|
||||
@@ -917,7 +917,7 @@ impl StreamContext {
|
||||
}
|
||||
|
||||
/// Format the context for explain.
|
||||
pub(crate) fn format_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
pub(crate) fn format_for_explain(&self, verbose: bool, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let (mut num_mem_ranges, mut num_file_ranges) = (0, 0);
|
||||
for range_meta in &self.ranges {
|
||||
for idx in &range_meta.row_group_indices {
|
||||
@@ -939,8 +939,77 @@ impl StreamContext {
|
||||
if let Some(selector) = &self.input.series_row_selector {
|
||||
write!(f, ", selector={}", selector)?;
|
||||
}
|
||||
if let Some(distribution) = &self.input.distribution {
|
||||
write!(f, ", distribution={}", distribution)?;
|
||||
}
|
||||
|
||||
if verbose {
|
||||
self.format_verbose_content(f)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn format_verbose_content(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
struct FileWrapper<'a> {
|
||||
file: &'a FileHandle,
|
||||
}
|
||||
|
||||
impl fmt::Debug for FileWrapper<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"[file={}, time_range=({}::{}, {}::{}), rows={}, size={}, index_size={}]",
|
||||
self.file.file_id(),
|
||||
self.file.time_range().0.value(),
|
||||
self.file.time_range().0.unit(),
|
||||
self.file.time_range().1.value(),
|
||||
self.file.time_range().1.unit(),
|
||||
self.file.num_rows(),
|
||||
self.file.size(),
|
||||
self.file.index_size()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
struct InputWrapper<'a> {
|
||||
input: &'a ScanInput,
|
||||
}
|
||||
|
||||
impl fmt::Debug for InputWrapper<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let output_schema = self.input.mapper.output_schema();
|
||||
if !output_schema.is_empty() {
|
||||
write!(f, ", projection=")?;
|
||||
f.debug_list()
|
||||
.entries(output_schema.column_schemas().iter().map(|col| &col.name))
|
||||
.finish()?;
|
||||
}
|
||||
if let Some(predicate) = &self.input.predicate.predicate() {
|
||||
if !predicate.exprs().is_empty() {
|
||||
write!(f, ", filters=[")?;
|
||||
for (i, expr) in predicate.exprs().iter().enumerate() {
|
||||
if i == predicate.exprs().len() - 1 {
|
||||
write!(f, "{}]", expr)?;
|
||||
} else {
|
||||
write!(f, "{}, ", expr)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !self.input.files.is_empty() {
|
||||
write!(f, ", files=")?;
|
||||
f.debug_list()
|
||||
.entries(self.input.files.iter().map(|file| FileWrapper { file }))
|
||||
.finish()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
write!(f, "{:?}", InputWrapper { input: &self.input })
|
||||
}
|
||||
}
|
||||
|
||||
/// Predicates to evaluate.
|
||||
|
||||
@@ -14,22 +14,280 @@
|
||||
|
||||
//! Utilities for scanners.
|
||||
|
||||
use std::fmt;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use async_stream::try_stream;
|
||||
use common_telemetry::debug;
|
||||
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time};
|
||||
use futures::Stream;
|
||||
use prometheus::IntGauge;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::metrics::IN_PROGRESS_SCAN;
|
||||
use crate::metrics::{
|
||||
IN_PROGRESS_SCAN, PRECISE_FILTER_ROWS_TOTAL, READ_BATCHES_RETURN, READ_ROWS_IN_ROW_GROUP_TOTAL,
|
||||
READ_ROWS_RETURN, READ_ROW_GROUPS_TOTAL, READ_STAGE_ELAPSED,
|
||||
};
|
||||
use crate::read::range::{RangeBuilderList, RowGroupIndex};
|
||||
use crate::read::scan_region::StreamContext;
|
||||
use crate::read::{Batch, ScannerMetrics, Source};
|
||||
use crate::sst::file::FileTimeRange;
|
||||
use crate::sst::parquet::reader::ReaderMetrics;
|
||||
use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics};
|
||||
|
||||
/// Verbose scan metrics for a partition.
|
||||
#[derive(Default)]
|
||||
struct ScanMetricsSet {
|
||||
/// Duration to prepare the scan task.
|
||||
prepare_scan_cost: Duration,
|
||||
/// Duration to build the (merge) reader.
|
||||
build_reader_cost: Duration,
|
||||
/// Duration to scan data.
|
||||
scan_cost: Duration,
|
||||
/// Duration to convert batches.
|
||||
convert_cost: Duration,
|
||||
/// Duration while waiting for `yield`.
|
||||
yield_cost: Duration,
|
||||
/// Duration of the scan.
|
||||
total_cost: Duration,
|
||||
/// Number of rows returned.
|
||||
num_rows: usize,
|
||||
/// Number of batches returned.
|
||||
num_batches: usize,
|
||||
/// Number of mem ranges scanned.
|
||||
num_mem_ranges: usize,
|
||||
/// Number of file ranges scanned.
|
||||
num_file_ranges: usize,
|
||||
|
||||
// SST related metrics:
|
||||
/// Duration to build file ranges.
|
||||
build_parts_cost: Duration,
|
||||
/// Number of row groups before filtering.
|
||||
rg_total: usize,
|
||||
/// Number of row groups filtered by fulltext index.
|
||||
rg_fulltext_filtered: usize,
|
||||
/// Number of row groups filtered by inverted index.
|
||||
rg_inverted_filtered: usize,
|
||||
/// Number of row groups filtered by min-max index.
|
||||
rg_minmax_filtered: usize,
|
||||
/// Number of row groups filtered by bloom filter index.
|
||||
rg_bloom_filtered: usize,
|
||||
/// Number of rows in row group before filtering.
|
||||
rows_before_filter: usize,
|
||||
/// Number of rows in row group filtered by fulltext index.
|
||||
rows_fulltext_filtered: usize,
|
||||
/// Number of rows in row group filtered by inverted index.
|
||||
rows_inverted_filtered: usize,
|
||||
/// Number of rows in row group filtered by bloom filter index.
|
||||
rows_bloom_filtered: usize,
|
||||
/// Number of rows filtered by precise filter.
|
||||
rows_precise_filtered: usize,
|
||||
/// Number of record batches read from SST.
|
||||
num_sst_record_batches: usize,
|
||||
/// Number of batches decoded from SST.
|
||||
num_sst_batches: usize,
|
||||
/// Number of rows read from SST.
|
||||
num_sst_rows: usize,
|
||||
|
||||
/// Elapsed time before the first poll operation.
|
||||
first_poll: Duration,
|
||||
}
|
||||
|
||||
impl fmt::Debug for ScanMetricsSet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let ScanMetricsSet {
|
||||
prepare_scan_cost,
|
||||
build_reader_cost,
|
||||
scan_cost,
|
||||
convert_cost,
|
||||
yield_cost,
|
||||
total_cost,
|
||||
num_rows,
|
||||
num_batches,
|
||||
num_mem_ranges,
|
||||
num_file_ranges,
|
||||
build_parts_cost,
|
||||
rg_total,
|
||||
rg_fulltext_filtered,
|
||||
rg_inverted_filtered,
|
||||
rg_minmax_filtered,
|
||||
rg_bloom_filtered,
|
||||
rows_before_filter,
|
||||
rows_fulltext_filtered,
|
||||
rows_inverted_filtered,
|
||||
rows_bloom_filtered,
|
||||
rows_precise_filtered,
|
||||
num_sst_record_batches,
|
||||
num_sst_batches,
|
||||
num_sst_rows,
|
||||
first_poll,
|
||||
} = self;
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{{prepare_scan_cost={prepare_scan_cost:?}, \
|
||||
build_reader_cost={build_reader_cost:?}, \
|
||||
scan_cost={scan_cost:?}, \
|
||||
convert_cost={convert_cost:?}, \
|
||||
yield_cost={yield_cost:?}, \
|
||||
total_cost={total_cost:?}, \
|
||||
num_rows={num_rows}, \
|
||||
num_batches={num_batches}, \
|
||||
num_mem_ranges={num_mem_ranges}, \
|
||||
num_file_ranges={num_file_ranges}, \
|
||||
build_parts_cost={build_parts_cost:?}, \
|
||||
rg_total={rg_total}, \
|
||||
rg_fulltext_filtered={rg_fulltext_filtered}, \
|
||||
rg_inverted_filtered={rg_inverted_filtered}, \
|
||||
rg_minmax_filtered={rg_minmax_filtered}, \
|
||||
rg_bloom_filtered={rg_bloom_filtered}, \
|
||||
rows_before_filter={rows_before_filter}, \
|
||||
rows_fulltext_filtered={rows_fulltext_filtered}, \
|
||||
rows_inverted_filtered={rows_inverted_filtered}, \
|
||||
rows_bloom_filtered={rows_bloom_filtered}, \
|
||||
rows_precise_filtered={rows_precise_filtered}, \
|
||||
num_sst_record_batches={num_sst_record_batches}, \
|
||||
num_sst_batches={num_sst_batches}, \
|
||||
num_sst_rows={num_sst_rows}, \
|
||||
first_poll={first_poll:?}}}"
|
||||
)
|
||||
}
|
||||
}
|
||||
impl ScanMetricsSet {
|
||||
/// Attaches the `prepare_scan_cost` to the metrics set.
|
||||
fn with_prepare_scan_cost(mut self, cost: Duration) -> Self {
|
||||
self.prepare_scan_cost += cost;
|
||||
self
|
||||
}
|
||||
|
||||
/// Merges the local scanner metrics.
|
||||
fn merge_scanner_metrics(&mut self, other: &ScannerMetrics) {
|
||||
let ScannerMetrics {
|
||||
prepare_scan_cost,
|
||||
build_reader_cost,
|
||||
scan_cost,
|
||||
convert_cost,
|
||||
yield_cost,
|
||||
num_batches,
|
||||
num_rows,
|
||||
num_mem_ranges,
|
||||
num_file_ranges,
|
||||
} = other;
|
||||
|
||||
self.prepare_scan_cost += *prepare_scan_cost;
|
||||
self.build_reader_cost += *build_reader_cost;
|
||||
self.scan_cost += *scan_cost;
|
||||
self.convert_cost += *convert_cost;
|
||||
self.yield_cost += *yield_cost;
|
||||
self.num_rows += *num_rows;
|
||||
self.num_batches += *num_batches;
|
||||
self.num_mem_ranges += *num_mem_ranges;
|
||||
self.num_file_ranges += *num_file_ranges;
|
||||
}
|
||||
|
||||
/// Merges the local reader metrics.
|
||||
fn merge_reader_metrics(&mut self, other: &ReaderMetrics) {
|
||||
let ReaderMetrics {
|
||||
build_cost,
|
||||
filter_metrics:
|
||||
ReaderFilterMetrics {
|
||||
rg_total,
|
||||
rg_fulltext_filtered,
|
||||
rg_inverted_filtered,
|
||||
rg_minmax_filtered,
|
||||
rg_bloom_filtered,
|
||||
rows_total,
|
||||
rows_fulltext_filtered,
|
||||
rows_inverted_filtered,
|
||||
rows_bloom_filtered,
|
||||
rows_precise_filtered,
|
||||
},
|
||||
num_record_batches,
|
||||
num_batches,
|
||||
num_rows,
|
||||
scan_cost: _,
|
||||
} = other;
|
||||
|
||||
self.build_parts_cost += *build_cost;
|
||||
|
||||
self.rg_total += *rg_total;
|
||||
self.rg_fulltext_filtered += *rg_fulltext_filtered;
|
||||
self.rg_inverted_filtered += *rg_inverted_filtered;
|
||||
self.rg_minmax_filtered += *rg_minmax_filtered;
|
||||
self.rg_bloom_filtered += *rg_bloom_filtered;
|
||||
|
||||
self.rows_before_filter += *rows_total;
|
||||
self.rows_fulltext_filtered += *rows_fulltext_filtered;
|
||||
self.rows_inverted_filtered += *rows_inverted_filtered;
|
||||
self.rows_bloom_filtered += *rows_bloom_filtered;
|
||||
self.rows_precise_filtered += *rows_precise_filtered;
|
||||
|
||||
self.num_sst_record_batches += *num_record_batches;
|
||||
self.num_sst_batches += *num_batches;
|
||||
self.num_sst_rows += *num_rows;
|
||||
}
|
||||
|
||||
/// Observes metrics.
|
||||
fn observe_metrics(&self) {
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["prepare_scan"])
|
||||
.observe(self.prepare_scan_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["build_reader"])
|
||||
.observe(self.build_reader_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["convert_rb"])
|
||||
.observe(self.convert_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["scan"])
|
||||
.observe(self.scan_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["yield"])
|
||||
.observe(self.yield_cost.as_secs_f64());
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["total"])
|
||||
.observe(self.total_cost.as_secs_f64());
|
||||
READ_ROWS_RETURN.observe(self.num_rows as f64);
|
||||
READ_BATCHES_RETURN.observe(self.num_batches as f64);
|
||||
|
||||
READ_STAGE_ELAPSED
|
||||
.with_label_values(&["build_parts"])
|
||||
.observe(self.build_parts_cost.as_secs_f64());
|
||||
|
||||
READ_ROW_GROUPS_TOTAL
|
||||
.with_label_values(&["before_filtering"])
|
||||
.inc_by(self.rg_total as u64);
|
||||
READ_ROW_GROUPS_TOTAL
|
||||
.with_label_values(&["fulltext_index_filtered"])
|
||||
.inc_by(self.rg_fulltext_filtered as u64);
|
||||
READ_ROW_GROUPS_TOTAL
|
||||
.with_label_values(&["inverted_index_filtered"])
|
||||
.inc_by(self.rg_inverted_filtered as u64);
|
||||
READ_ROW_GROUPS_TOTAL
|
||||
.with_label_values(&["minmax_index_filtered"])
|
||||
.inc_by(self.rg_minmax_filtered as u64);
|
||||
READ_ROW_GROUPS_TOTAL
|
||||
.with_label_values(&["bloom_filter_index_filtered"])
|
||||
.inc_by(self.rg_bloom_filtered as u64);
|
||||
|
||||
PRECISE_FILTER_ROWS_TOTAL
|
||||
.with_label_values(&["parquet"])
|
||||
.inc_by(self.rows_precise_filtered as u64);
|
||||
READ_ROWS_IN_ROW_GROUP_TOTAL
|
||||
.with_label_values(&["before_filtering"])
|
||||
.inc_by(self.rows_before_filter as u64);
|
||||
READ_ROWS_IN_ROW_GROUP_TOTAL
|
||||
.with_label_values(&["fulltext_index_filtered"])
|
||||
.inc_by(self.rows_fulltext_filtered as u64);
|
||||
READ_ROWS_IN_ROW_GROUP_TOTAL
|
||||
.with_label_values(&["inverted_index_filtered"])
|
||||
.inc_by(self.rows_inverted_filtered as u64);
|
||||
READ_ROWS_IN_ROW_GROUP_TOTAL
|
||||
.with_label_values(&["bloom_filter_index_filtered"])
|
||||
.inc_by(self.rows_bloom_filtered as u64);
|
||||
}
|
||||
}
|
||||
|
||||
struct PartitionMetricsInner {
|
||||
region_id: RegionId,
|
||||
@@ -39,38 +297,71 @@ struct PartitionMetricsInner {
|
||||
scanner_type: &'static str,
|
||||
/// Query start time.
|
||||
query_start: Instant,
|
||||
/// Elapsed time before the first poll operation.
|
||||
first_poll: Duration,
|
||||
metrics: ScannerMetrics,
|
||||
reader_metrics: ReaderMetrics,
|
||||
/// Verbose scan metrics that only log to debug logs by default.
|
||||
metrics: Mutex<ScanMetricsSet>,
|
||||
in_progress_scan: IntGauge,
|
||||
|
||||
// Normal metrics that always report to the [ExecutionPlanMetricsSet]:
|
||||
/// Duration to build file ranges.
|
||||
build_parts_cost: Time,
|
||||
/// Duration to build the (merge) reader.
|
||||
build_reader_cost: Time,
|
||||
/// Duration to scan data.
|
||||
scan_cost: Time,
|
||||
/// Duration while waiting for `yield`.
|
||||
yield_cost: Time,
|
||||
}
|
||||
|
||||
impl PartitionMetricsInner {
|
||||
fn on_finish(&mut self) {
|
||||
if self.metrics.total_cost.is_zero() {
|
||||
self.metrics.total_cost = self.query_start.elapsed();
|
||||
fn on_finish(&self) {
|
||||
let mut metrics = self.metrics.lock().unwrap();
|
||||
if metrics.total_cost.is_zero() {
|
||||
metrics.total_cost = self.query_start.elapsed();
|
||||
}
|
||||
self.metrics.build_parts_cost = self.reader_metrics.build_cost;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PartitionMetricsInner {
|
||||
fn drop(&mut self) {
|
||||
self.on_finish();
|
||||
self.metrics.observe_metrics();
|
||||
let metrics = self.metrics.lock().unwrap();
|
||||
metrics.observe_metrics();
|
||||
self.in_progress_scan.dec();
|
||||
|
||||
debug!(
|
||||
"{} finished, region_id: {}, partition: {}, first_poll: {:?}, metrics: {:?}, reader_metrics: {:?}",
|
||||
self.scanner_type, self.region_id, self.partition, self.first_poll, self.metrics, self.reader_metrics
|
||||
"{} finished, region_id: {}, partition: {}, metrics: {:?}",
|
||||
self.scanner_type, self.region_id, self.partition, metrics
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// List of PartitionMetrics.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct PartitionMetricsList(Mutex<Vec<Option<PartitionMetrics>>>);
|
||||
|
||||
impl PartitionMetricsList {
|
||||
/// Sets a new [PartitionMetrics] at the specified partition.
|
||||
pub(crate) fn set(&self, partition: usize, metrics: PartitionMetrics) {
|
||||
let mut list = self.0.lock().unwrap();
|
||||
if list.len() <= partition {
|
||||
list.resize(partition + 1, None);
|
||||
}
|
||||
list[partition] = Some(metrics);
|
||||
}
|
||||
|
||||
/// Format verbose metrics for each partition for explain.
|
||||
pub(crate) fn format_verbose_metrics(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let list = self.0.lock().unwrap();
|
||||
write!(f, ", metrics_per_partition: ")?;
|
||||
f.debug_list()
|
||||
.entries(list.iter().filter_map(|p| p.as_ref()))
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics while reading a partition.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct PartitionMetrics(Arc<Mutex<PartitionMetricsInner>>);
|
||||
pub(crate) struct PartitionMetrics(Arc<PartitionMetricsInner>);
|
||||
|
||||
impl PartitionMetrics {
|
||||
pub(crate) fn new(
|
||||
@@ -78,57 +369,82 @@ impl PartitionMetrics {
|
||||
partition: usize,
|
||||
scanner_type: &'static str,
|
||||
query_start: Instant,
|
||||
metrics: ScannerMetrics,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
) -> Self {
|
||||
let partition_str = partition.to_string();
|
||||
let in_progress_scan = IN_PROGRESS_SCAN.with_label_values(&[scanner_type, &partition_str]);
|
||||
in_progress_scan.inc();
|
||||
let metrics = ScanMetricsSet::default().with_prepare_scan_cost(query_start.elapsed());
|
||||
let inner = PartitionMetricsInner {
|
||||
region_id,
|
||||
partition,
|
||||
scanner_type,
|
||||
query_start,
|
||||
first_poll: Duration::default(),
|
||||
metrics,
|
||||
reader_metrics: ReaderMetrics::default(),
|
||||
metrics: Mutex::new(metrics),
|
||||
in_progress_scan,
|
||||
build_parts_cost: MetricBuilder::new(metrics_set)
|
||||
.subset_time("build_parts_cost", partition),
|
||||
build_reader_cost: MetricBuilder::new(metrics_set)
|
||||
.subset_time("build_reader_cost", partition),
|
||||
scan_cost: MetricBuilder::new(metrics_set).subset_time("scan_cost", partition),
|
||||
yield_cost: MetricBuilder::new(metrics_set).subset_time("yield_cost", partition),
|
||||
};
|
||||
Self(Arc::new(Mutex::new(inner)))
|
||||
Self(Arc::new(inner))
|
||||
}
|
||||
|
||||
pub(crate) fn on_first_poll(&self) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.first_poll = inner.query_start.elapsed();
|
||||
let mut metrics = self.0.metrics.lock().unwrap();
|
||||
metrics.first_poll = self.0.query_start.elapsed();
|
||||
}
|
||||
|
||||
pub(crate) fn inc_num_mem_ranges(&self, num: usize) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.metrics.num_mem_ranges += num;
|
||||
let mut metrics = self.0.metrics.lock().unwrap();
|
||||
metrics.num_mem_ranges += num;
|
||||
}
|
||||
|
||||
pub(crate) fn inc_num_file_ranges(&self, num: usize) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.metrics.num_file_ranges += num;
|
||||
let mut metrics = self.0.metrics.lock().unwrap();
|
||||
metrics.num_file_ranges += num;
|
||||
}
|
||||
|
||||
/// Merges `build_reader_cost`.
|
||||
pub(crate) fn inc_build_reader_cost(&self, cost: Duration) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.metrics.build_reader_cost += cost;
|
||||
self.0.build_reader_cost.add_duration(cost);
|
||||
|
||||
let mut metrics = self.0.metrics.lock().unwrap();
|
||||
metrics.build_reader_cost += cost;
|
||||
}
|
||||
|
||||
/// Merges [ScannerMetrics], `build_reader_cost`, `scan_cost` and `yield_cost`.
|
||||
pub(crate) fn merge_metrics(&self, metrics: &ScannerMetrics) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.metrics.merge_from(metrics);
|
||||
self.0
|
||||
.build_reader_cost
|
||||
.add_duration(metrics.build_reader_cost);
|
||||
self.0.scan_cost.add_duration(metrics.scan_cost);
|
||||
self.0.yield_cost.add_duration(metrics.yield_cost);
|
||||
|
||||
let mut metrics_set = self.0.metrics.lock().unwrap();
|
||||
metrics_set.merge_scanner_metrics(metrics);
|
||||
}
|
||||
|
||||
/// Merges [ReaderMetrics] and `build_reader_cost`.
|
||||
pub(crate) fn merge_reader_metrics(&self, metrics: &ReaderMetrics) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.reader_metrics.merge_from(metrics);
|
||||
self.0.build_parts_cost.add_duration(metrics.build_cost);
|
||||
|
||||
let mut metrics_set = self.0.metrics.lock().unwrap();
|
||||
metrics_set.merge_reader_metrics(metrics);
|
||||
}
|
||||
|
||||
/// Finishes the query.
|
||||
pub(crate) fn on_finish(&self) {
|
||||
let mut inner = self.0.lock().unwrap();
|
||||
inner.on_finish();
|
||||
self.0.on_finish();
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PartitionMetrics {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let metrics = self.0.metrics.lock().unwrap();
|
||||
write!(f, "[partition={}, {:?}]", self.0.partition, metrics)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ use common_recordbatch::error::ExternalSnafu;
|
||||
use common_recordbatch::util::ChainedRecordBatchStream;
|
||||
use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
|
||||
use common_telemetry::tracing;
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
|
||||
use datatypes::schema::SchemaRef;
|
||||
use snafu::ResultExt;
|
||||
@@ -38,7 +39,9 @@ use crate::read::last_row::LastRowReader;
|
||||
use crate::read::merge::MergeReaderBuilder;
|
||||
use crate::read::range::RangeBuilderList;
|
||||
use crate::read::scan_region::{ScanInput, StreamContext};
|
||||
use crate::read::scan_util::{scan_file_ranges, scan_mem_ranges, PartitionMetrics};
|
||||
use crate::read::scan_util::{
|
||||
scan_file_ranges, scan_mem_ranges, PartitionMetrics, PartitionMetricsList,
|
||||
};
|
||||
use crate::read::{BatchReader, BoxedBatchReader, ScannerMetrics, Source};
|
||||
use crate::region::options::MergeMode;
|
||||
|
||||
@@ -53,6 +56,9 @@ pub struct SeqScan {
|
||||
stream_ctx: Arc<StreamContext>,
|
||||
/// The scanner is used for compaction.
|
||||
compaction: bool,
|
||||
/// Metrics for each partition.
|
||||
/// The scanner only sets in query and keeps it empty during compaction.
|
||||
metrics_list: PartitionMetricsList,
|
||||
}
|
||||
|
||||
impl SeqScan {
|
||||
@@ -69,6 +75,7 @@ impl SeqScan {
|
||||
properties,
|
||||
stream_ctx,
|
||||
compaction,
|
||||
metrics_list: PartitionMetricsList::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,8 +84,9 @@ impl SeqScan {
|
||||
/// The returned stream is not partitioned and will contains all the data. If want
|
||||
/// partitioned scan, use [`RegionScanner::scan_partition`].
|
||||
pub fn build_stream(&self) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
let metrics_set = ExecutionPlanMetricsSet::new();
|
||||
let streams = (0..self.properties.partitions.len())
|
||||
.map(|partition: usize| self.scan_partition(partition))
|
||||
.map(|partition: usize| self.scan_partition(&metrics_set, partition))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
let aggr_stream = ChainedRecordBatchStream::new(streams).map_err(BoxedError::new)?;
|
||||
@@ -92,16 +100,8 @@ impl SeqScan {
|
||||
pub async fn build_reader_for_compaction(&self) -> Result<BoxedBatchReader> {
|
||||
assert!(self.compaction);
|
||||
|
||||
let part_metrics = PartitionMetrics::new(
|
||||
self.stream_ctx.input.mapper.metadata().region_id,
|
||||
0,
|
||||
get_scanner_type(self.compaction),
|
||||
self.stream_ctx.query_start,
|
||||
ScannerMetrics {
|
||||
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
|
||||
..Default::default()
|
||||
},
|
||||
);
|
||||
let metrics_set = ExecutionPlanMetricsSet::new();
|
||||
let part_metrics = self.new_partition_metrics(&metrics_set, 0);
|
||||
debug_assert_eq!(1, self.properties.partitions.len());
|
||||
let partition_ranges = &self.properties.partitions[0];
|
||||
|
||||
@@ -194,6 +194,7 @@ impl SeqScan {
|
||||
/// Otherwise the returned stream might not contains any data.
|
||||
fn scan_partition_impl(
|
||||
&self,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
partition: usize,
|
||||
) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
if partition >= self.properties.partitions.len() {
|
||||
@@ -207,7 +208,7 @@ impl SeqScan {
|
||||
}
|
||||
|
||||
if self.stream_ctx.input.distribution == Some(TimeSeriesDistribution::PerSeries) {
|
||||
return self.scan_partition_by_series(partition);
|
||||
return self.scan_partition_by_series(metrics_set, partition);
|
||||
}
|
||||
|
||||
let stream_ctx = self.stream_ctx.clone();
|
||||
@@ -215,7 +216,7 @@ impl SeqScan {
|
||||
let partition_ranges = self.properties.partitions[partition].clone();
|
||||
let compaction = self.compaction;
|
||||
let distinguish_range = self.properties.distinguish_partition_range;
|
||||
let part_metrics = self.new_partition_metrics(partition);
|
||||
let part_metrics = self.new_partition_metrics(metrics_set, partition);
|
||||
|
||||
let stream = try_stream! {
|
||||
part_metrics.on_first_poll();
|
||||
@@ -310,13 +311,14 @@ impl SeqScan {
|
||||
/// Otherwise the returned stream might not contains any data.
|
||||
fn scan_partition_by_series(
|
||||
&self,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
partition: usize,
|
||||
) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
let stream_ctx = self.stream_ctx.clone();
|
||||
let semaphore = self.new_semaphore();
|
||||
let partition_ranges = self.properties.partitions[partition].clone();
|
||||
let distinguish_range = self.properties.distinguish_partition_range;
|
||||
let part_metrics = self.new_partition_metrics(partition);
|
||||
let part_metrics = self.new_partition_metrics(metrics_set, partition);
|
||||
debug_assert!(!self.compaction);
|
||||
|
||||
let stream = try_stream! {
|
||||
@@ -411,17 +413,26 @@ impl SeqScan {
|
||||
}
|
||||
}
|
||||
|
||||
fn new_partition_metrics(&self, partition: usize) -> PartitionMetrics {
|
||||
PartitionMetrics::new(
|
||||
/// Creates a new partition metrics instance.
|
||||
/// Sets the partition metrics for the given partition if it is not for compaction.
|
||||
fn new_partition_metrics(
|
||||
&self,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
partition: usize,
|
||||
) -> PartitionMetrics {
|
||||
let metrics = PartitionMetrics::new(
|
||||
self.stream_ctx.input.mapper.metadata().region_id,
|
||||
partition,
|
||||
get_scanner_type(self.compaction),
|
||||
self.stream_ctx.query_start,
|
||||
ScannerMetrics {
|
||||
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
metrics_set,
|
||||
);
|
||||
|
||||
if !self.compaction {
|
||||
self.metrics_list.set(partition, metrics.clone());
|
||||
}
|
||||
|
||||
metrics
|
||||
}
|
||||
}
|
||||
|
||||
@@ -438,8 +449,12 @@ impl RegionScanner for SeqScan {
|
||||
self.stream_ctx.input.mapper.metadata().clone()
|
||||
}
|
||||
|
||||
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
self.scan_partition_impl(partition)
|
||||
fn scan_partition(
|
||||
&self,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
partition: usize,
|
||||
) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
self.scan_partition_impl(metrics_set, partition)
|
||||
}
|
||||
|
||||
fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
|
||||
@@ -458,13 +473,19 @@ impl RegionScanner for SeqScan {
|
||||
}
|
||||
|
||||
impl DisplayAs for SeqScan {
|
||||
fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"SeqScan: region={}, ",
|
||||
self.stream_ctx.input.mapper.metadata().region_id
|
||||
)?;
|
||||
self.stream_ctx.format_for_explain(f)
|
||||
match t {
|
||||
DisplayFormatType::Default => self.stream_ctx.format_for_explain(false, f),
|
||||
DisplayFormatType::Verbose => {
|
||||
self.stream_ctx.format_for_explain(true, f)?;
|
||||
self.metrics_list.format_verbose_metrics(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ use async_stream::{stream, try_stream};
|
||||
use common_error::ext::BoxedError;
|
||||
use common_recordbatch::error::ExternalSnafu;
|
||||
use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
|
||||
use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
|
||||
use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
|
||||
use datatypes::schema::SchemaRef;
|
||||
use futures::{Stream, StreamExt};
|
||||
@@ -32,7 +33,9 @@ use store_api::region_engine::{PrepareRequest, RegionScanner, ScannerProperties}
|
||||
use crate::error::{PartitionOutOfRangeSnafu, Result};
|
||||
use crate::read::range::RangeBuilderList;
|
||||
use crate::read::scan_region::{ScanInput, StreamContext};
|
||||
use crate::read::scan_util::{scan_file_ranges, scan_mem_ranges, PartitionMetrics};
|
||||
use crate::read::scan_util::{
|
||||
scan_file_ranges, scan_mem_ranges, PartitionMetrics, PartitionMetricsList,
|
||||
};
|
||||
use crate::read::{Batch, ScannerMetrics};
|
||||
|
||||
/// Scans a region without providing any output ordering guarantee.
|
||||
@@ -43,6 +46,8 @@ pub struct UnorderedScan {
|
||||
properties: ScannerProperties,
|
||||
/// Context of streams.
|
||||
stream_ctx: Arc<StreamContext>,
|
||||
/// Metrics for each partition.
|
||||
metrics_list: PartitionMetricsList,
|
||||
}
|
||||
|
||||
impl UnorderedScan {
|
||||
@@ -57,14 +62,16 @@ impl UnorderedScan {
|
||||
Self {
|
||||
properties,
|
||||
stream_ctx,
|
||||
metrics_list: PartitionMetricsList::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans the region and returns a stream.
|
||||
pub(crate) async fn build_stream(&self) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
let metrics_set = ExecutionPlanMetricsSet::new();
|
||||
let part_num = self.properties.num_partitions();
|
||||
let streams = (0..part_num)
|
||||
.map(|i| self.scan_partition(i))
|
||||
.map(|i| self.scan_partition(&metrics_set, i))
|
||||
.collect::<Result<Vec<_>, BoxedError>>()?;
|
||||
let stream = stream! {
|
||||
for mut stream in streams {
|
||||
@@ -119,6 +126,7 @@ impl UnorderedScan {
|
||||
|
||||
fn scan_partition_impl(
|
||||
&self,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
partition: usize,
|
||||
) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
if partition >= self.properties.partitions.len() {
|
||||
@@ -136,11 +144,9 @@ impl UnorderedScan {
|
||||
partition,
|
||||
"UnorderedScan",
|
||||
self.stream_ctx.query_start,
|
||||
ScannerMetrics {
|
||||
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
|
||||
..Default::default()
|
||||
},
|
||||
metrics_set,
|
||||
);
|
||||
self.metrics_list.set(partition, part_metrics.clone());
|
||||
let stream_ctx = self.stream_ctx.clone();
|
||||
let part_ranges = self.properties.partitions[partition].clone();
|
||||
let distinguish_range = self.properties.distinguish_partition_range;
|
||||
@@ -239,8 +245,12 @@ impl RegionScanner for UnorderedScan {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
self.scan_partition_impl(partition)
|
||||
fn scan_partition(
|
||||
&self,
|
||||
metrics_set: &ExecutionPlanMetricsSet,
|
||||
partition: usize,
|
||||
) -> Result<SendableRecordBatchStream, BoxedError> {
|
||||
self.scan_partition_impl(metrics_set, partition)
|
||||
}
|
||||
|
||||
fn has_predicate(&self) -> bool {
|
||||
@@ -254,13 +264,19 @@ impl RegionScanner for UnorderedScan {
|
||||
}
|
||||
|
||||
impl DisplayAs for UnorderedScan {
|
||||
fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"UnorderedScan: region={}, ",
|
||||
self.stream_ctx.input.mapper.metadata().region_id
|
||||
)?;
|
||||
self.stream_ctx.format_for_explain(f)
|
||||
match t {
|
||||
DisplayFormatType::Default => self.stream_ctx.format_for_explain(false, f),
|
||||
DisplayFormatType::Verbose => {
|
||||
self.stream_ctx.format_for_explain(true, f)?;
|
||||
self.metrics_list.format_verbose_metrics(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -331,6 +331,7 @@ impl RegionOpener {
|
||||
);
|
||||
Ok(Provider::kafka_provider(options.topic.to_string()))
|
||||
}
|
||||
WalOptions::Noop => Ok(Provider::noop_provider()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -248,6 +248,10 @@ impl FileHandle {
|
||||
self.inner.meta.file_size
|
||||
}
|
||||
|
||||
pub fn index_size(&self) -> u64 {
|
||||
self.inner.meta.index_file_size
|
||||
}
|
||||
|
||||
pub fn num_rows(&self) -> usize {
|
||||
self.inner.meta.num_rows as usize
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ use index::bloom_filter::applier::BloomFilterApplier;
|
||||
use index::bloom_filter::reader::{BloomFilterReader, BloomFilterReaderImpl};
|
||||
use object_store::ObjectStore;
|
||||
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
|
||||
use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader};
|
||||
use puffin::puffin_manager::{PuffinManager, PuffinReader};
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::{ColumnId, RegionId};
|
||||
|
||||
|
||||
@@ -304,7 +304,12 @@ impl BloomFilterIndexer {
|
||||
let blob_name = format!("{}-{}", INDEX_BLOB_TYPE, col_id);
|
||||
let (index_finish, puffin_add_blob) = futures::join!(
|
||||
creator.finish(tx.compat_write()),
|
||||
puffin_writer.put_blob(&blob_name, rx.compat(), PutOptions::default())
|
||||
puffin_writer.put_blob(
|
||||
&blob_name,
|
||||
rx.compat(),
|
||||
PutOptions::default(),
|
||||
Default::default(),
|
||||
)
|
||||
);
|
||||
|
||||
match (
|
||||
@@ -351,7 +356,7 @@ pub(crate) mod tests {
|
||||
use index::bloom_filter::reader::{BloomFilterReader, BloomFilterReaderImpl};
|
||||
use object_store::services::Memory;
|
||||
use object_store::ObjectStore;
|
||||
use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader};
|
||||
use puffin::puffin_manager::{PuffinManager, PuffinReader};
|
||||
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ use index::inverted_index::search::index_apply::{
|
||||
};
|
||||
use object_store::ObjectStore;
|
||||
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
|
||||
use puffin::puffin_manager::{BlobGuard, PuffinManager, PuffinReader};
|
||||
use puffin::puffin_manager::{PuffinManager, PuffinReader};
|
||||
use snafu::ResultExt;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
@@ -250,7 +250,12 @@ mod tests {
|
||||
);
|
||||
let mut writer = puffin_manager.writer(&file_id).await.unwrap();
|
||||
writer
|
||||
.put_blob(INDEX_BLOB_TYPE, Cursor::new(vec![]), Default::default())
|
||||
.put_blob(
|
||||
INDEX_BLOB_TYPE,
|
||||
Cursor::new(vec![]),
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer.finish().await.unwrap();
|
||||
@@ -298,7 +303,12 @@ mod tests {
|
||||
);
|
||||
let mut writer = puffin_manager.writer(&file_id).await.unwrap();
|
||||
writer
|
||||
.put_blob("invalid_blob_type", Cursor::new(vec![]), Default::default())
|
||||
.put_blob(
|
||||
"invalid_blob_type",
|
||||
Cursor::new(vec![]),
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
writer.finish().await.unwrap();
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user