From b53a0b86fb8b2f3cede177025c8a9ffb4bb40e3a Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 24 Oct 2025 10:16:49 +0800 Subject: [PATCH 001/149] feat: create table with new json datatype (#7128) * feat: create table with new json datatype Signed-off-by: luofucong * resolve PR comments Signed-off-by: luofucong --------- Signed-off-by: luofucong --- Cargo.lock | 1 + src/api/src/v1/column_def.rs | 12 ++++- src/common/test-util/src/recordbatch.rs | 2 +- src/datatypes/src/json.rs | 9 +++- src/datatypes/src/schema.rs | 5 +- src/datatypes/src/schema/column_schema.rs | 17 ++++++ src/datatypes/src/types/json_type.rs | 6 ++- src/query/src/error.rs | 14 +++-- src/query/src/sql/show_create_table.rs | 4 ++ src/sql/Cargo.toml | 1 + src/sql/src/error.rs | 12 ++++- src/sql/src/statements.rs | 16 +++++- src/sql/src/statements/create.rs | 44 ++++++++++++++- src/sql/src/statements/option_map.rs | 24 +++++++++ src/sql/src/util.rs | 17 ++++++ tests-integration/src/tests/instance_test.rs | 56 ++++++++++++++++++++ 16 files changed, 226 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f721f58369..231bd594ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11964,6 +11964,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "datatypes", + "either", "hex", "humantime", "iso8601", diff --git a/src/api/src/v1/column_def.rs b/src/api/src/v1/column_def.rs index 5be3d5c196..912b7ee13e 100644 --- a/src/api/src/v1/column_def.rs +++ b/src/api/src/v1/column_def.rs @@ -16,8 +16,8 @@ use std::collections::HashMap; use datatypes::schema::{ COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer, - FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions, - SkippingIndexType, + FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, JSON_STRUCTURE_SETTINGS_KEY, + SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, }; use greptime_proto::v1::{ Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType, @@ -68,6 +68,9 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result { if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) { metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned()); } + if let Some(settings) = options.options.get(JSON_STRUCTURE_SETTINGS_KEY) { + metadata.insert(JSON_STRUCTURE_SETTINGS_KEY.to_string(), settings.clone()); + } } ColumnSchema::new(&column_def.name, data_type.into(), column_def.is_nullable) @@ -139,6 +142,11 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option unreachable!(), }; let pretty_print = recordbatches.pretty_print().unwrap(); - assert_eq!(pretty_print, expected, "actual: \n{}", pretty_print); + assert_eq!(pretty_print, expected.trim(), "actual: \n{}", pretty_print); } pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) { diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs index 380cc8ce06..902b84a131 100644 --- a/src/datatypes/src/json.rs +++ b/src/datatypes/src/json.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use common_base::bytes::StringBytes; use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; use serde_json::{Map, Value as Json}; use snafu::{ResultExt, ensure}; @@ -45,7 +46,7 @@ use crate::value::{ListValue, StructValue, Value}; /// convert them to fully structured StructValue for user-facing APIs: the UI protocol and the UDF interface. 
/// /// **Important**: This settings only controls the internal form of JSON encoding. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub enum JsonStructureSettings { // TODO(sunng87): provide a limit Structured(Option), @@ -111,6 +112,12 @@ impl JsonStructureSettings { } } +impl Default for JsonStructureSettings { + fn default() -> Self { + Self::Structured(None) + } +} + impl<'a> JsonContext<'a> { /// Create a new context with an updated key path pub fn with_key(&self, key: &str) -> JsonContext<'a> { diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index 6bdf321137..9995072b7c 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -32,8 +32,9 @@ pub use crate::schema::column_schema::{ COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_FULLTEXT_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY, - FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, - SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, + FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, + JSON_STRUCTURE_SETTINGS_KEY, Metadata, SKIPPING_INDEX_KEY, SkippingIndexOptions, + SkippingIndexType, TIME_INDEX_KEY, }; pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::raw::RawSchema; diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs index f176350b8c..627d898810 100644 --- a/src/datatypes/src/schema/column_schema.rs +++ b/src/datatypes/src/schema/column_schema.rs @@ -23,6 +23,7 @@ use sqlparser_derive::{Visit, VisitMut}; use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result}; +use crate::json::JsonStructureSettings; use crate::schema::TYPE_KEY; use crate::schema::constraint::ColumnDefaultConstraint; use crate::value::Value; @@ -41,6 +42,7 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext"; pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index"; /// Key used to store skip options in arrow field's metadata. pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index"; +pub const JSON_STRUCTURE_SETTINGS_KEY: &str = "greptime:json:structure_settings"; /// Keys used in fulltext options pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable"; @@ -391,6 +393,21 @@ impl ColumnSchema { self.metadata.remove(SKIPPING_INDEX_KEY); Ok(()) } + + pub fn json_structure_settings(&self) -> Result> { + self.metadata + .get(JSON_STRUCTURE_SETTINGS_KEY) + .map(|json| serde_json::from_str(json).context(error::DeserializeSnafu { json })) + .transpose() + } + + pub fn with_json_structure_settings(&mut self, settings: &JsonStructureSettings) -> Result<()> { + self.metadata.insert( + JSON_STRUCTURE_SETTINGS_KEY.to_string(), + serde_json::to_string(settings).context(error::SerializeSnafu)?, + ); + Ok(()) + } } /// Column extended type set in column schema's metadata. 
diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 01ec81dd08..99dcf9c571 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -15,6 +15,7 @@ use std::str::FromStr; use arrow::datatypes::DataType as ArrowDataType; +use arrow_schema::Fields; use common_base::bytes::Bytes; use serde::{Deserialize, Serialize}; use snafu::ResultExt; @@ -63,7 +64,10 @@ impl DataType for JsonType { } fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Binary + match self.format { + JsonFormat::Jsonb => ArrowDataType::Binary, + JsonFormat::Native(_) => ArrowDataType::Struct(Fields::empty()), + } } fn create_mutable_vector(&self, capacity: usize) -> Box { diff --git a/src/query/src/error.rs b/src/query/src/error.rs index 8cf64dbffc..4649b7fe49 100644 --- a/src/query/src/error.rs +++ b/src/query/src/error.rs @@ -353,6 +353,13 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(transparent)] + Datatypes { + source: datatypes::error::Error, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { @@ -406,9 +413,10 @@ impl ErrorExt for Error { MissingTableMutationHandler { .. } => StatusCode::Unexpected, GetRegionMetadata { .. } => StatusCode::RegionNotReady, TableReadOnly { .. } => StatusCode::Unsupported, - GetFulltextOptions { source, .. } | GetSkippingIndexOptions { source, .. } => { - source.status_code() - } + + GetFulltextOptions { source, .. } + | GetSkippingIndexOptions { source, .. } + | Datatypes { source, .. } => source.status_code(), } } diff --git a/src/query/src/sql/show_create_table.rs b/src/query/src/sql/show_create_table.rs index 5466bb91e6..3b2d8aaceb 100644 --- a/src/query/src/sql/show_create_table.rs +++ b/src/query/src/sql/show_create_table.rs @@ -159,6 +159,10 @@ fn create_column(column_schema: &ColumnSchema, quote_style: char) -> Result StatusCode::InvalidArguments, - SerializeColumnDefaultConstraint { source, .. } => source.status_code(), + SerializeColumnDefaultConstraint { source, .. } + | SetJsonStructureSettings { source, .. } => source.status_code(), + ConvertToGrpcDataType { source, .. } => source.status_code(), SqlCommon { source, .. } => source.status_code(), ConvertToDfStatement { .. } => StatusCode::Internal, diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index b48e208043..823b123011 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -49,8 +49,8 @@ use crate::ast::{ }; use crate::error::{ self, ConvertToGrpcDataTypeSnafu, ConvertValueSnafu, Result, - SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetSkippingIndexOptionSnafu, - SqlCommonSnafu, + SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetJsonStructureSettingsSnafu, + SetSkippingIndexOptionSnafu, SqlCommonSnafu, }; use crate::statements::create::Column; pub use crate::statements::option_map::OptionMap; @@ -144,6 +144,18 @@ pub fn column_to_schema( column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some()); + if matches!(column.data_type(), SqlDataType::JSON) { + let settings = column + .extensions + .build_json_structure_settings()? 
+ .unwrap_or_default(); + column_schema + .with_json_structure_settings(&settings) + .with_context(|_| SetJsonStructureSettingsSnafu { + value: format!("{settings:?}"), + })?; + } + Ok(column_schema) } diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 9d945e7c8d..3c7f6d1731 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -32,6 +32,7 @@ use crate::error::{ use crate::statements::OptionMap; use crate::statements::statement::Statement; use crate::statements::tql::Tql; +use crate::util::OptionValue; const LINE_SEP: &str = ",\n"; const COMMA_SEP: &str = ", "; @@ -166,7 +167,20 @@ impl Display for Column { return Ok(()); } - write!(f, "{}", self.column_def)?; + write!(f, "{} {}", self.column_def.name, self.column_def.data_type)?; + if let Some(options) = &self.extensions.json_datatype_options { + write!( + f, + "({})", + options + .entries() + .map(|(k, v)| format!("{k} = {v}")) + .join(COMMA_SEP) + )?; + } + for option in &self.column_def.options { + write!(f, " {option}")?; + } if let Some(fulltext_options) = &self.extensions.fulltext_index_options { if !fulltext_options.is_empty() { @@ -251,6 +265,34 @@ impl ColumnExtensions { }) .transpose() } + + pub fn set_json_structure_settings(&mut self, settings: JsonStructureSettings) { + let mut map = OptionMap::default(); + + let format = match settings { + JsonStructureSettings::Structured(_) => JSON_FORMAT_FULL_STRUCTURED, + JsonStructureSettings::PartialUnstructuredByKey { .. } => JSON_FORMAT_PARTIAL, + JsonStructureSettings::UnstructuredRaw => JSON_FORMAT_RAW, + }; + map.insert(JSON_OPT_FORMAT.to_string(), format.to_string()); + + if let JsonStructureSettings::PartialUnstructuredByKey { + fields: _, + unstructured_keys, + } = settings + { + let value = OptionValue::from( + unstructured_keys + .iter() + .map(|x| x.as_str()) + .sorted() + .collect::>(), + ); + map.insert_options(JSON_OPT_UNSTRUCTURED_KEYS, value); + } + + self.json_datatype_options = Some(map); + } } /// Partition on columns or values. 
diff --git a/src/sql/src/statements/option_map.rs b/src/sql/src/statements/option_map.rs index f67b0dc72a..d6bd4d7608 100644 --- a/src/sql/src/statements/option_map.rs +++ b/src/sql/src/statements/option_map.rs @@ -16,6 +16,7 @@ use std::collections::{BTreeMap, HashMap}; use std::ops::ControlFlow; use common_base::secrets::{ExposeSecret, ExposeSecretMut, SecretString}; +use either::Either; use serde::Serialize; use sqlparser::ast::{Visit, VisitMut, Visitor, VisitorMut}; @@ -56,6 +57,17 @@ impl OptionMap { } } + pub fn insert_options(&mut self, key: &str, value: OptionValue) { + if REDACTED_OPTIONS.contains(&key) { + self.secrets.insert( + key.to_string(), + SecretString::new(Box::new(value.to_string())), + ); + } else { + self.options.insert(key.to_string(), value); + } + } + pub fn get(&self, k: &str) -> Option<&str> { if let Some(value) = self.options.get(k) { value.as_string() @@ -130,6 +142,18 @@ impl OptionMap { } result } + + pub fn entries(&self) -> impl Iterator)> { + let options = self + .options + .iter() + .map(|(k, v)| (k.as_str(), Either::Left(v))); + let secrets = self + .secrets + .keys() + .map(|k| (k.as_str(), Either::Right("******"))); + std::iter::chain(options, secrets) + } } impl> From for OptionMap { diff --git a/src/sql/src/util.rs b/src/sql/src/util.rs index f71dfcc8d7..3b221d7642 100644 --- a/src/sql/src/util.rs +++ b/src/sql/src/util.rs @@ -15,6 +15,7 @@ use std::collections::HashSet; use std::fmt::{Display, Formatter}; +use itertools::Itertools; use serde::Serialize; use snafu::ensure; use sqlparser::ast::{ @@ -131,6 +132,22 @@ impl From> for OptionValue { } } +impl Display for OptionValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(s) = self.as_string() { + write!(f, "'{s}'") + } else if let Some(s) = self.as_list() { + write!( + f, + "[{}]", + s.into_iter().map(|x| format!("'{x}'")).join(", ") + ) + } else { + write!(f, "'{}'", self.0) + } + } +} + pub fn parse_option_string(option: SqlOption) -> Result<(String, OptionValue)> { let SqlOption::KeyValue { key, value } = option else { return InvalidSqlSnafu { diff --git a/tests-integration/src/tests/instance_test.rs b/tests-integration/src/tests/instance_test.rs index 95664323ff..a29e468bf6 100644 --- a/tests-integration/src/tests/instance_test.rs +++ b/tests-integration/src/tests/instance_test.rs @@ -2338,3 +2338,59 @@ async fn test_copy_parquet_map_to_binary(instance: Arc) { +----+-----------------------------------------+"#; check_output_stream(output, expected).await; } + +#[apply(both_instances_cases)] +async fn test_create_table_with_json_datatype(instance: Arc) { + let instance = instance.frontend(); + + let sql = r#" +CREATE TABLE a ( + j JSON(format = "partial", unstructured_keys = ["foo", "foo.bar"]), + ts TIMESTAMP TIME INDEX, +)"#; + let output = execute_sql(&instance, sql).await.data; + assert!(matches!(output, OutputData::AffectedRows(0))); + + // "show create table" finds the information from table metadata. + // So if the output is expected, we know the options are really set. 
+ let output = execute_sql(&instance, "SHOW CREATE TABLE a").await.data; + let expected = r#" ++-------+------------------------------------------------------------------------------+ +| Table | Create Table | ++-------+------------------------------------------------------------------------------+ +| a | CREATE TABLE IF NOT EXISTS "a" ( | +| | "j" JSON(format = 'partial', unstructured_keys = ['foo', 'foo.bar']) NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++-------+------------------------------------------------------------------------------+"#; + check_output_stream(output, expected).await; + + // test the default options + let sql = r#" +CREATE TABLE b ( + j JSON, + ts TIMESTAMP TIME INDEX, +)"#; + let output = execute_sql(&instance, sql).await.data; + assert!(matches!(output, OutputData::AffectedRows(0))); + + let output = execute_sql(&instance, "SHOW CREATE TABLE b").await.data; + let expected = r#" ++-------+-----------------------------------------+ +| Table | Create Table | ++-------+-----------------------------------------+ +| b | CREATE TABLE IF NOT EXISTS "b" ( | +| | "j" JSON(format = 'structured') NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++-------+-----------------------------------------+"#; + check_output_stream(output, expected).await; +} From a0e6bcbeb34107ddae16e2724c6572e0f5274a3f Mon Sep 17 00:00:00 2001 From: zyy17 Date: Fri, 24 Oct 2025 11:12:45 +0800 Subject: [PATCH 002/149] feat: add `cpu_usage_millicores` and `memory_usage_bytes` in `information_schema.cluster_info` table. (#7051) * refactor: add `hostname` in cluster_info table Signed-off-by: zyy17 * chore: update information schema result Signed-off-by: zyy17 * feat: enable zstd for bulk memtable encoded parts (#7045) feat: enable zstd in bulk memtable Signed-off-by: evenyag * refactor: add `get_total_cpu_millicores()` / `get_total_cpu_cores()` / `get_total_memory_bytes()` / `get_total_memory_readable()` in common-stat Signed-off-by: zyy17 * feat: add `cpu_usage_millicores` and `memory_usage_bytes` in `information_schema.cluster_info` table Signed-off-by: zyy17 * fix: compile warning and integration test failed Signed-off-by: zyy17 * fix: integration test failed Signed-off-by: zyy17 * refactor: add `ResourceStat` Signed-off-by: zyy17 * refactor: apply code review comments Signed-off-by: zyy17 * chore: update greptime-proto Signed-off-by: zyy17 --------- Signed-off-by: zyy17 Signed-off-by: evenyag Co-authored-by: Yingwen --- Cargo.lock | 10 +- Cargo.toml | 2 +- .../information_schema/cluster_info.rs | 49 +++-- src/cmd/src/flownode.rs | 5 + src/cmd/src/frontend.rs | 5 + src/common/config/Cargo.toml | 1 - src/common/config/src/lib.rs | 1 - src/common/config/src/utils.rs | 34 ---- src/common/meta/src/cluster.rs | 16 +- src/common/stat/Cargo.toml | 3 + src/common/stat/src/cgroups.rs | 23 ++- src/common/stat/src/lib.rs | 63 +----- src/common/stat/src/resource.rs | 187 ++++++++++++++++++ src/datanode/Cargo.toml | 1 + src/datanode/src/datanode.rs | 5 + src/datanode/src/heartbeat.rs | 30 ++- src/flow/src/heartbeat.rs | 49 +++-- src/frontend/Cargo.toml | 1 + src/frontend/src/heartbeat.rs | 54 +++-- src/meta-client/src/client.rs | 31 ++- src/meta-srv/Cargo.toml | 1 + src/meta-srv/src/discovery/lease.rs | 18 +- src/meta-srv/src/election/rds/mysql.rs | 6 +- src/meta-srv/src/election/rds/postgres.rs | 6 +- .../handler/collect_cluster_info_handler.rs | 18 +- src/meta-srv/src/metasrv.rs | 38 
++-- src/meta-srv/src/metasrv/builder.rs | 6 +- src/meta-srv/src/service/cluster.rs | 6 +- src/standalone/src/information_extension.rs | 11 +- tests-integration/Cargo.toml | 1 + tests-integration/src/cluster.rs | 5 + .../information_schema/cluster_info.result | 6 +- .../common/system/information_schema.result | 18 +- .../information_schema/cluster_info.result | 6 +- 34 files changed, 504 insertions(+), 212 deletions(-) delete mode 100644 src/common/config/src/utils.rs create mode 100644 src/common/stat/src/resource.rs diff --git a/Cargo.lock b/Cargo.lock index 231bd594ea..07b1695817 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2025,7 +2025,6 @@ dependencies = [ "common-base", "common-error", "common-macro", - "common-stat", "common-telemetry", "common-test-util", "common-wal", @@ -2546,11 +2545,14 @@ name = "common-stat" version = "0.18.0" dependencies = [ "common-base", + "common-runtime", + "common-telemetry", "lazy_static", "nix 0.30.1", "num_cpus", "prometheus", "sysinfo", + "tokio", ] [[package]] @@ -3907,6 +3909,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-test-util", "common-time", @@ -4904,6 +4907,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-test-util", "common-time", @@ -5319,7 +5323,7 @@ dependencies = [ [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=72a0d22e0f5f716b2ee21bca091f87a88c36e5ca#72a0d22e0f5f716b2ee21bca091f87a88c36e5ca" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=14b9dc40bdc8288742b0cefc7bb024303b7429ef#14b9dc40bdc8288742b0cefc7bb024303b7429ef" dependencies = [ "prost 0.13.5", "prost-types 0.13.5", @@ -7398,6 +7402,7 @@ dependencies = [ "common-procedure", "common-procedure-test", "common-runtime", + "common-stat", "common-telemetry", "common-time", "common-version", @@ -12996,6 +13001,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-test-util", "common-time", diff --git a/Cargo.toml b/Cargo.toml index 8a9d574263..a4ce20bfd1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -147,7 +147,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "72a0d22e0f5f716b2ee21bca091f87a88c36e5ca" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "14b9dc40bdc8288742b0cefc7bb024303b7429ef" } hex = "0.4" http = "1" humantime = "2.1" diff --git a/src/catalog/src/system_schema/information_schema/cluster_info.rs b/src/catalog/src/system_schema/information_schema/cluster_info.rs index f45dc5be06..1ba1a55fb6 100644 --- a/src/catalog/src/system_schema/information_schema/cluster_info.rs +++ b/src/catalog/src/system_schema/information_schema/cluster_info.rs @@ -33,7 +33,6 @@ use datatypes::timestamp::TimestampMillisecond; use datatypes::value::Value; use datatypes::vectors::{ Int64VectorBuilder, StringVectorBuilder, TimestampMillisecondVectorBuilder, - UInt32VectorBuilder, UInt64VectorBuilder, }; use serde::Serialize; use snafu::ResultExt; @@ -53,6 +52,8 @@ const PEER_ADDR: &str = "peer_addr"; const PEER_HOSTNAME: &str = "peer_hostname"; const TOTAL_CPU_MILLICORES: &str = "total_cpu_millicores"; const TOTAL_MEMORY_BYTES: &str = "total_memory_bytes"; +const 
CPU_USAGE_MILLICORES: &str = "cpu_usage_millicores"; +const MEMORY_USAGE_BYTES: &str = "memory_usage_bytes"; const VERSION: &str = "version"; const GIT_COMMIT: &str = "git_commit"; const START_TIME: &str = "start_time"; @@ -67,15 +68,17 @@ const INIT_CAPACITY: usize = 42; /// - `peer_id`: the peer server id. /// - `peer_type`: the peer type, such as `datanode`, `frontend`, `metasrv` etc. /// - `peer_addr`: the peer gRPC address. +/// - `peer_hostname`: the hostname of the peer. /// - `total_cpu_millicores`: the total CPU millicores of the peer. /// - `total_memory_bytes`: the total memory bytes of the peer. +/// - `cpu_usage_millicores`: the CPU usage millicores of the peer. +/// - `memory_usage_bytes`: the memory usage bytes of the peer. /// - `version`: the build package version of the peer. /// - `git_commit`: the build git commit hash of the peer. /// - `start_time`: the starting time of the peer. /// - `uptime`: the uptime of the peer. /// - `active_time`: the time since the last activity of the peer. /// - `node_status`: the status info of the peer. -/// - `peer_hostname`: the hostname of the peer. /// #[derive(Debug)] pub(super) struct InformationSchemaClusterInfo { @@ -99,12 +102,22 @@ impl InformationSchemaClusterInfo { ColumnSchema::new(PEER_HOSTNAME, ConcreteDataType::string_datatype(), true), ColumnSchema::new( TOTAL_CPU_MILLICORES, - ConcreteDataType::uint32_datatype(), + ConcreteDataType::int64_datatype(), false, ), ColumnSchema::new( TOTAL_MEMORY_BYTES, - ConcreteDataType::uint64_datatype(), + ConcreteDataType::int64_datatype(), + false, + ), + ColumnSchema::new( + CPU_USAGE_MILLICORES, + ConcreteDataType::int64_datatype(), + false, + ), + ColumnSchema::new( + MEMORY_USAGE_BYTES, + ConcreteDataType::int64_datatype(), false, ), ColumnSchema::new(VERSION, ConcreteDataType::string_datatype(), false), @@ -167,8 +180,10 @@ struct InformationSchemaClusterInfoBuilder { peer_types: StringVectorBuilder, peer_addrs: StringVectorBuilder, peer_hostnames: StringVectorBuilder, - cpus: UInt32VectorBuilder, - memory_bytes: UInt64VectorBuilder, + total_cpu_millicores: Int64VectorBuilder, + total_memory_bytes: Int64VectorBuilder, + cpu_usage_millicores: Int64VectorBuilder, + memory_usage_bytes: Int64VectorBuilder, versions: StringVectorBuilder, git_commits: StringVectorBuilder, start_times: TimestampMillisecondVectorBuilder, @@ -186,8 +201,10 @@ impl InformationSchemaClusterInfoBuilder { peer_types: StringVectorBuilder::with_capacity(INIT_CAPACITY), peer_addrs: StringVectorBuilder::with_capacity(INIT_CAPACITY), peer_hostnames: StringVectorBuilder::with_capacity(INIT_CAPACITY), - cpus: UInt32VectorBuilder::with_capacity(INIT_CAPACITY), - memory_bytes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY), + total_cpu_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY), + total_memory_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY), + cpu_usage_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY), + memory_usage_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY), versions: StringVectorBuilder::with_capacity(INIT_CAPACITY), git_commits: StringVectorBuilder::with_capacity(INIT_CAPACITY), start_times: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY), @@ -243,8 +260,14 @@ impl InformationSchemaClusterInfoBuilder { self.start_times.push(None); self.uptimes.push(None); } - self.cpus.push(Some(node_info.cpus)); - self.memory_bytes.push(Some(node_info.memory_bytes)); + self.total_cpu_millicores + .push(Some(node_info.total_cpu_millicores)); + 
self.total_memory_bytes + .push(Some(node_info.total_memory_bytes)); + self.cpu_usage_millicores + .push(Some(node_info.cpu_usage_millicores)); + self.memory_usage_bytes + .push(Some(node_info.memory_usage_bytes)); if node_info.last_activity_ts > 0 { self.active_times.push(Some( @@ -269,8 +292,10 @@ impl InformationSchemaClusterInfoBuilder { Arc::new(self.peer_types.finish()), Arc::new(self.peer_addrs.finish()), Arc::new(self.peer_hostnames.finish()), - Arc::new(self.cpus.finish()), - Arc::new(self.memory_bytes.finish()), + Arc::new(self.total_cpu_millicores.finish()), + Arc::new(self.total_memory_bytes.finish()), + Arc::new(self.cpu_usage_millicores.finish()), + Arc::new(self.memory_usage_bytes.finish()), Arc::new(self.versions.finish()), Arc::new(self.git_commits.finish()), Arc::new(self.start_times.finish()), diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index 500e9bfa89..07f3279724 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -30,6 +30,7 @@ use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHand use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; use common_meta::key::TableMetadataManager; use common_meta::key::flow::FlowMetadataManager; +use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; @@ -372,11 +373,15 @@ impl StartCommand { Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())), ]); + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = flow::heartbeat::HeartbeatTask::new( &opts, meta_client.clone(), opts.heartbeat.clone(), Arc::new(executor), + Arc::new(resource_stat), ); let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone())); diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 4c72021a47..fda6d968bf 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -30,6 +30,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; use common_meta::heartbeat::handler::HandlerGroupExecutor; use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler; use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; +use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_time::timezone::set_default_timezone; @@ -421,11 +422,15 @@ impl StartCommand { Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())), ]); + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = HeartbeatTask::new( &opts, meta_client.clone(), opts.heartbeat.clone(), Arc::new(executor), + Arc::new(resource_stat), ); let heartbeat_task = Some(heartbeat_task); diff --git a/src/common/config/Cargo.toml b/src/common/config/Cargo.toml index 1d2b21602f..b45c03a6c3 100644 --- a/src/common/config/Cargo.toml +++ b/src/common/config/Cargo.toml @@ -11,7 +11,6 @@ workspace = true common-base.workspace = true common-error.workspace = true common-macro.workspace = true -common-stat.workspace = true config.workspace = true humantime-serde.workspace = true object-store.workspace = true diff --git a/src/common/config/src/lib.rs b/src/common/config/src/lib.rs index b806924217..cc25ebce16 100644 --- 
a/src/common/config/src/lib.rs +++ b/src/common/config/src/lib.rs @@ -14,7 +14,6 @@ pub mod config; pub mod error; -pub mod utils; use std::time::Duration; diff --git a/src/common/config/src/utils.rs b/src/common/config/src/utils.rs deleted file mode 100644 index 1bc986b77e..0000000000 --- a/src/common/config/src/utils.rs +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_base::readable_size::ReadableSize; -use common_stat::{get_total_cpu_millicores, get_total_memory_readable}; - -/// `ResourceSpec` holds the static resource specifications of a node, -/// such as CPU cores and memory capacity. These values are fixed -/// at startup and do not change dynamically during runtime. -#[derive(Debug, Clone, Copy)] -pub struct ResourceSpec { - pub cpus: i64, - pub memory: Option, -} - -impl Default for ResourceSpec { - fn default() -> Self { - Self { - cpus: get_total_cpu_millicores(), - memory: get_total_memory_readable(), - } - } -} diff --git a/src/common/meta/src/cluster.rs b/src/common/meta/src/cluster.rs index 63001970b6..74485513e9 100644 --- a/src/common/meta/src/cluster.rs +++ b/src/common/meta/src/cluster.rs @@ -120,10 +120,16 @@ pub struct NodeInfo { pub start_time_ms: u64, // The node build cpus #[serde(default)] - pub cpus: u32, + pub total_cpu_millicores: i64, // The node build memory bytes #[serde(default)] - pub memory_bytes: u64, + pub total_memory_bytes: i64, + // The node build cpu usage millicores + #[serde(default)] + pub cpu_usage_millicores: i64, + // The node build memory usage bytes + #[serde(default)] + pub memory_usage_bytes: i64, // The node build hostname #[serde(default)] pub hostname: String, @@ -333,8 +339,10 @@ mod tests { version: "".to_string(), git_commit: "".to_string(), start_time_ms: 1, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; diff --git a/src/common/stat/Cargo.toml b/src/common/stat/Cargo.toml index 3d0198f6a2..d0e8b5448f 100644 --- a/src/common/stat/Cargo.toml +++ b/src/common/stat/Cargo.toml @@ -6,11 +6,14 @@ license.workspace = true [dependencies] common-base.workspace = true +common-runtime.workspace = true +common-telemetry.workspace = true lazy_static.workspace = true nix.workspace = true num_cpus.workspace = true prometheus.workspace = true sysinfo.workspace = true +tokio.workspace = true [lints] workspace = true diff --git a/src/common/stat/src/cgroups.rs b/src/common/stat/src/cgroups.rs index fe26f5ec36..ce8f5ac87a 100644 --- a/src/common/stat/src/cgroups.rs +++ b/src/common/stat/src/cgroups.rs @@ -117,7 +117,10 @@ pub fn get_cpu_limit_from_cgroups() -> Option { None } -fn get_cpu_usage() -> Option { +/// Get the usage of cpu in millicores from cgroups filesystem. +/// +/// - Return `None` if it's not in the cgroups v2 environment or fails to read the cpu usage. 
+pub fn get_cpu_usage_from_cgroups() -> Option { // In certain bare-metal environments, the `/sys/fs/cgroup/cpu.stat` file may be present and reflect system-wide CPU usage rather than container-specific metrics. // To ensure accurate collection of container-level CPU usage, verify the existence of the `/sys/fs/cgroup/memory.current` file. // The presence of this file typically indicates execution within a containerized environment, thereby validating the relevance of the collected CPU usage data. @@ -142,6 +145,22 @@ fn get_cpu_usage() -> Option { fields[1].trim().parse::().ok() } +// Calculate the cpu usage in millicores from cgroups filesystem. +// +// - Return `0` if the current cpu usage is equal to the last cpu usage or the interval is 0. +pub(crate) fn calculate_cpu_usage( + current_cpu_usage_usecs: i64, + last_cpu_usage_usecs: i64, + interval_milliseconds: i64, +) -> i64 { + let diff = current_cpu_usage_usecs - last_cpu_usage_usecs; + if diff > 0 && interval_milliseconds > 0 { + ((diff as f64 / interval_milliseconds as f64).round() as i64).max(1) + } else { + 0 + } +} + // Check whether the cgroup is v2. // - Return `true` if the cgroup is v2, otherwise return `false`. // - Return `None` if the detection fails or not on linux. @@ -230,7 +249,7 @@ impl Collector for CgroupsMetricsCollector { } fn collect(&self) -> Vec { - if let Some(cpu_usage) = get_cpu_usage() { + if let Some(cpu_usage) = get_cpu_usage_from_cgroups() { self.cpu_usage.set(cpu_usage); } diff --git a/src/common/stat/src/lib.rs b/src/common/stat/src/lib.rs index 2c6cbea3f1..544b9439c8 100644 --- a/src/common/stat/src/lib.rs +++ b/src/common/stat/src/lib.rs @@ -13,66 +13,7 @@ // limitations under the License. mod cgroups; +mod resource; pub use cgroups::*; -use common_base::readable_size::ReadableSize; -use sysinfo::System; - -/// Get the total CPU in millicores. -pub fn get_total_cpu_millicores() -> i64 { - // Get CPU limit from cgroups filesystem. - if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() { - cgroup_cpu_limit - } else { - // Get total CPU cores from host system. - num_cpus::get() as i64 * 1000 - } -} - -/// Get the total memory in bytes. -pub fn get_total_memory_bytes() -> i64 { - // Get memory limit from cgroups filesystem. - if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() { - cgroup_memory_limit - } else { - // Get total memory from host system. - if sysinfo::IS_SUPPORTED_SYSTEM { - let mut sys_info = System::new(); - sys_info.refresh_memory(); - sys_info.total_memory() as i64 - } else { - // If the system is not supported, return -1. - -1 - } - } -} - -/// Get the total CPU cores. The result will be rounded to the nearest integer. -/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2. -pub fn get_total_cpu_cores() -> usize { - ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize -} - -/// Get the total memory in readable size. 
-pub fn get_total_memory_readable() -> Option { - if get_total_memory_bytes() > 0 { - Some(ReadableSize(get_total_memory_bytes() as u64)) - } else { - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_get_total_cpu_cores() { - assert!(get_total_cpu_cores() > 0); - } - - #[test] - fn test_get_total_memory_readable() { - assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0)); - } -} +pub use resource::*; diff --git a/src/common/stat/src/resource.rs b/src/common/stat/src/resource.rs new file mode 100644 index 0000000000..babfa54a19 --- /dev/null +++ b/src/common/stat/src/resource.rs @@ -0,0 +1,187 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::time::Duration; + +use common_base::readable_size::ReadableSize; +use common_runtime::JoinHandle; +use common_telemetry::info; +use sysinfo::System; +use tokio::time::sleep; + +use crate::cgroups::calculate_cpu_usage; +use crate::{ + get_cpu_limit_from_cgroups, get_cpu_usage_from_cgroups, get_memory_limit_from_cgroups, + get_memory_usage_from_cgroups, +}; + +/// Get the total CPU in millicores. If the CPU limit is unset, it will return the total CPU cores from host system. +pub fn get_total_cpu_millicores() -> i64 { + // Get CPU limit from cgroups filesystem. + if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() { + cgroup_cpu_limit + } else { + // Get total CPU cores from host system. + num_cpus::get() as i64 * 1000 + } +} + +/// Get the total memory in bytes. If the memory limit is unset, it will return the total memory from host system. +/// If the system is not supported to get the total host memory, it will return 0. +pub fn get_total_memory_bytes() -> i64 { + // Get memory limit from cgroups filesystem. + if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() { + cgroup_memory_limit + } else { + // Get total memory from host system. + if sysinfo::IS_SUPPORTED_SYSTEM { + let mut sys_info = System::new(); + sys_info.refresh_memory(); + sys_info.total_memory() as i64 + } else { + // If the system is not supported, return 0 + 0 + } + } +} + +/// Get the total CPU cores. The result will be rounded to the nearest integer. +/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2. +pub fn get_total_cpu_cores() -> usize { + ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize +} + +/// Get the total memory in readable size. +pub fn get_total_memory_readable() -> Option { + if get_total_memory_bytes() > 0 { + Some(ReadableSize(get_total_memory_bytes() as u64)) + } else { + None + } +} + +/// A reference to a `ResourceStat` implementation. +pub type ResourceStatRef = Arc; + +/// A trait for getting resource statistics. +pub trait ResourceStat { + /// Get the total CPU in millicores. + fn get_total_cpu_millicores(&self) -> i64; + /// Get the total memory in bytes. 
+ fn get_total_memory_bytes(&self) -> i64; + /// Get the CPU usage in millicores. + fn get_cpu_usage_millicores(&self) -> i64; + /// Get the memory usage in bytes. + fn get_memory_usage_bytes(&self) -> i64; +} + +/// A implementation of `ResourceStat` trait. +pub struct ResourceStatImpl { + cpu_usage_millicores: Arc, + last_cpu_usage_usecs: Arc, + calculate_interval: Duration, + handler: Option>, +} + +impl Default for ResourceStatImpl { + fn default() -> Self { + Self { + cpu_usage_millicores: Arc::new(AtomicI64::new(0)), + last_cpu_usage_usecs: Arc::new(AtomicI64::new(0)), + calculate_interval: Duration::from_secs(5), + handler: None, + } + } +} + +impl ResourceStatImpl { + /// Start collecting CPU usage periodically. It will calculate the CPU usage in millicores based on rate of change of CPU usage usage_usec in `/sys/fs/cgroup/cpu.stat`. + /// It ONLY works in cgroup v2 environment. + pub fn start_collect_cpu_usage(&mut self) { + if self.handler.is_some() { + return; + } + + let cpu_usage_millicores = self.cpu_usage_millicores.clone(); + let last_cpu_usage_usecs = self.last_cpu_usage_usecs.clone(); + let calculate_interval = self.calculate_interval; + + let handler = common_runtime::spawn_global(async move { + info!( + "Starting to collect CPU usage periodically for every {} seconds", + calculate_interval.as_secs() + ); + loop { + let current_cpu_usage_usecs = get_cpu_usage_from_cgroups(); + if let Some(current_cpu_usage_usecs) = current_cpu_usage_usecs { + // Skip the first time to collect CPU usage. + if last_cpu_usage_usecs.load(Ordering::Relaxed) == 0 { + last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed); + continue; + } + let cpu_usage = calculate_cpu_usage( + current_cpu_usage_usecs, + last_cpu_usage_usecs.load(Ordering::Relaxed), + calculate_interval.as_millis() as i64, + ); + cpu_usage_millicores.store(cpu_usage, Ordering::Relaxed); + last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed); + } + sleep(calculate_interval).await; + } + }); + + self.handler = Some(handler); + } +} + +impl ResourceStat for ResourceStatImpl { + /// Get the total CPU in millicores. + fn get_total_cpu_millicores(&self) -> i64 { + get_total_cpu_millicores() + } + + /// Get the total memory in bytes. + fn get_total_memory_bytes(&self) -> i64 { + get_total_memory_bytes() + } + + /// Get the CPU usage in millicores. + fn get_cpu_usage_millicores(&self) -> i64 { + self.cpu_usage_millicores.load(Ordering::Relaxed) + } + + /// Get the memory usage in bytes. + /// It ONLY works in cgroup v2 environment. 
+ fn get_memory_usage_bytes(&self) -> i64 { + get_memory_usage_from_cgroups().unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_total_cpu_cores() { + assert!(get_total_cpu_cores() > 0); + } + + #[test] + fn test_get_total_memory_readable() { + assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0)); + } +} diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index 3dcffd0ac9..265ede339e 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -30,6 +30,7 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true common-version.workspace = true diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index ed8b41f0c7..b9b8edcdba 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -27,6 +27,7 @@ use common_meta::key::runtime_switch::RuntimeSwitchManager; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; pub use common_procedure::options::ProcedureConfig; +use common_stat::ResourceStatImpl; use common_telemetry::{error, info, warn}; use common_wal::config::DatanodeWalConfig; use common_wal::config::kafka::DatanodeKafkaConfig; @@ -282,6 +283,9 @@ impl DatanodeBuilder { open_all_regions.await?; } + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = if let Some(meta_client) = meta_client { Some( HeartbeatTask::try_new( @@ -290,6 +294,7 @@ impl DatanodeBuilder { meta_client, cache_registry, self.plugins.clone(), + Arc::new(resource_stat), ) .await?, ) diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 9c059e5698..607e031b43 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -20,7 +20,6 @@ use std::time::Duration; use api::v1::meta::heartbeat_request::NodeWorkloads; use api::v1::meta::{DatanodeWorkloads, HeartbeatRequest, NodeInfo, Peer, RegionRole, RegionStat}; use common_base::Plugins; -use common_config::utils::ResourceSpec; use common_meta::cache_invalidator::CacheInvalidatorRef; use common_meta::datanode::REGION_STATISTIC_KEY; use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS; @@ -31,6 +30,7 @@ use common_meta::heartbeat::handler::{ }; use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef}; use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message; +use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, trace, warn}; use common_workload::DatanodeWorkloadType; use meta_client::MetaClientRef; @@ -63,7 +63,7 @@ pub struct HeartbeatTask { interval: u64, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, region_alive_keeper: Arc, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, } impl Drop for HeartbeatTask { @@ -80,6 +80,7 @@ impl HeartbeatTask { meta_client: MetaClientRef, cache_invalidator: CacheInvalidatorRef, plugins: Plugins, + resource_stat: ResourceStatRef, ) -> Result { let countdown_task_handler_ext = plugins.get::(); let region_alive_keeper = Arc::new(RegionAliveKeeper::new( @@ -109,7 +110,7 @@ impl HeartbeatTask { interval: opts.heartbeat.interval.as_millis() as u64, resp_handler_executor, region_alive_keeper, - resource_spec: Default::default(), + resource_stat, }) } @@ -186,6 +187,7 @@ impl HeartbeatTask { 
.context(error::HandleHeartbeatResponseSnafu) } + #[allow(deprecated)] /// Start heartbeat task, spawn background task. pub async fn start( &self, @@ -237,8 +239,9 @@ impl HeartbeatTask { self.region_alive_keeper.start(Some(event_receiver)).await?; let mut last_sent = Instant::now(); - let cpus = self.resource_spec.cpus as u32; - let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes(); + let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); + let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); + let resource_stat = self.resource_stat.clone(); common_runtime::spawn_hb(async move { let sleep = tokio::time::sleep(Duration::from_millis(0)); @@ -252,8 +255,13 @@ impl HeartbeatTask { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms: node_epoch, - cpus, - memory_bytes, + total_cpu_millicores, + total_memory_bytes, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto. + cpus: total_cpu_millicores as u32, + memory_bytes: total_memory_bytes as u64, hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -297,12 +305,18 @@ impl HeartbeatTask { let topic_stats = region_server_clone.topic_stats(); let now = Instant::now(); let duration_since_epoch = (now - epoch).as_millis() as u64; - let req = HeartbeatRequest { + let mut req = HeartbeatRequest { region_stats, topic_stats, duration_since_epoch, ..heartbeat_request.clone() }; + + if let Some(info) = req.info.as_mut() { + info.cpu_usage_millicores = resource_stat.get_cpu_usage_millicores(); + info.memory_usage_bytes = resource_stat.get_memory_usage_bytes(); + } + sleep.as_mut().reset(now + Duration::from_millis(interval)); Some(req) } diff --git a/src/flow/src/heartbeat.rs b/src/flow/src/heartbeat.rs index cc42668f5a..89b37860c5 100644 --- a/src/flow/src/heartbeat.rs +++ b/src/flow/src/heartbeat.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use api::v1::meta::{HeartbeatRequest, Peer}; -use common_config::utils::ResourceSpec; use common_error::ext::BoxedError; use common_meta::heartbeat::handler::{ HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef, @@ -26,6 +25,7 @@ use common_meta::heartbeat::handler::{ use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage}; use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message; use common_meta::key::flow::flow_state::FlowStat; +use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, warn}; use greptime_proto::v1::meta::NodeInfo; use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient}; @@ -69,7 +69,7 @@ pub struct HeartbeatTask { resp_handler_executor: HeartbeatResponseHandlerExecutorRef, running: Arc, query_stat_size: Option, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, } impl HeartbeatTask { @@ -77,11 +77,13 @@ impl HeartbeatTask { self.query_stat_size = Some(query_stat_size); self } + pub fn new( opts: &FlownodeOptions, meta_client: Arc, heartbeat_opts: HeartbeatOptions, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, + resource_stat: ResourceStatRef, ) -> Self { Self { node_id: opts.node_id.unwrap_or(0), @@ -93,7 +95,7 @@ impl HeartbeatTask { resp_handler_executor, running: Arc::new(AtomicBool::new(false)), query_stat_size: None, - resource_spec: Default::default(), + resource_stat, } } @@ -146,6 +148,8 @@ impl 
HeartbeatTask { heartbeat_request: &HeartbeatRequest, message: Option, latest_report: &Option, + cpu_usage: i64, + memory_usage: i64, ) -> Option { let mailbox_message = match message.map(outgoing_message_to_mailbox_message) { Some(Ok(message)) => Some(message), @@ -170,21 +174,38 @@ impl HeartbeatTask { .collect(), }); - Some(HeartbeatRequest { + let mut heartbeat_request = HeartbeatRequest { mailbox_message, flow_stat, ..heartbeat_request.clone() - }) + }; + + if let Some(info) = heartbeat_request.info.as_mut() { + info.cpu_usage_millicores = cpu_usage; + info.memory_usage_bytes = memory_usage; + } + + Some(heartbeat_request) } - fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option { + #[allow(deprecated)] + fn build_node_info( + start_time_ms: u64, + total_cpu_millicores: i64, + total_memory_bytes: i64, + ) -> Option { let build_info = common_version::build_info(); Some(NodeInfo { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms, - cpus, - memory_bytes, + total_cpu_millicores, + total_memory_bytes, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto. + cpus: total_cpu_millicores as u32, + memory_bytes: total_memory_bytes as u64, hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -203,9 +224,9 @@ impl HeartbeatTask { id: self.node_id, addr: self.peer_addr.clone(), }); - let cpus = self.resource_spec.cpus as u32; - let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes(); - + let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); + let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); + let resource_stat = self.resource_stat.clone(); let query_stat_size = self.query_stat_size.clone(); common_runtime::spawn_hb(async move { @@ -218,7 +239,7 @@ impl HeartbeatTask { let heartbeat_request = HeartbeatRequest { peer: self_peer, node_epoch, - info: Self::build_node_info(node_epoch, cpus, memory_bytes), + info: Self::build_node_info(node_epoch, total_cpu_millicores, total_memory_bytes), ..Default::default() }; @@ -226,7 +247,7 @@ impl HeartbeatTask { let req = tokio::select! 
{ message = outgoing_rx.recv() => { if let Some(message) = message { - Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report) + Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report, 0, 0) } else { warn!("Sender has been dropped, exiting the heartbeat loop"); // Receives None that means Sender was dropped, we need to break the current loop @@ -234,7 +255,7 @@ impl HeartbeatTask { } } _ = interval.tick() => { - Self::new_heartbeat_request(&heartbeat_request, None, &latest_report) + Self::new_heartbeat_request(&heartbeat_request, None, &latest_report, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes()) } }; diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index b90e4f5eb2..24d9c8c5ff 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -37,6 +37,7 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true common-version.workspace = true diff --git a/src/frontend/src/heartbeat.rs b/src/frontend/src/heartbeat.rs index 76fdc3305b..95645ad1ca 100644 --- a/src/frontend/src/heartbeat.rs +++ b/src/frontend/src/heartbeat.rs @@ -18,12 +18,12 @@ mod tests; use std::sync::Arc; use api::v1::meta::{HeartbeatRequest, NodeInfo, Peer}; -use common_config::utils::ResourceSpec; use common_meta::heartbeat::handler::{ HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef, }; use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage}; use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message; +use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, warn}; use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient}; use servers::addrs; @@ -47,7 +47,7 @@ pub struct HeartbeatTask { retry_interval: Duration, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, start_time_ms: u64, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, } impl HeartbeatTask { @@ -56,6 +56,7 @@ impl HeartbeatTask { meta_client: Arc, heartbeat_opts: HeartbeatOptions, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, + resource_stat: ResourceStatRef, ) -> Self { HeartbeatTask { // if internal grpc is configured, use its address as the peer address @@ -71,7 +72,7 @@ impl HeartbeatTask { retry_interval: heartbeat_opts.retry_interval, resp_handler_executor, start_time_ms: common_time::util::current_time_millis() as u64, - resource_spec: Default::default(), + resource_stat, } } @@ -133,6 +134,8 @@ impl HeartbeatTask { fn new_heartbeat_request( heartbeat_request: &HeartbeatRequest, message: Option, + cpu_usage: i64, + memory_usage: i64, ) -> Option { let mailbox_message = match message.map(outgoing_message_to_mailbox_message) { Some(Ok(message)) => Some(message), @@ -143,21 +146,38 @@ impl HeartbeatTask { None => None, }; - Some(HeartbeatRequest { + let mut heartbeat_request = HeartbeatRequest { mailbox_message, ..heartbeat_request.clone() - }) + }; + + if let Some(info) = heartbeat_request.info.as_mut() { + info.memory_usage_bytes = memory_usage; + info.cpu_usage_millicores = cpu_usage; + } + + Some(heartbeat_request) } - fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option { + #[allow(deprecated)] + fn build_node_info( + start_time_ms: u64, + total_cpu_millicores: i64, + total_memory_bytes: i64, + ) -> Option { let build_info = 
common_version::build_info(); Some(NodeInfo { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms, - cpus, - memory_bytes, + total_cpu_millicores, + total_memory_bytes, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto. + cpus: total_cpu_millicores as u32, + memory_bytes: total_memory_bytes as u64, hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -177,16 +197,20 @@ impl HeartbeatTask { id: 0, addr: self.peer_addr.clone(), }); - let cpus = self.resource_spec.cpus as u32; - let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes(); - + let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); + let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); + let resource_stat = self.resource_stat.clone(); common_runtime::spawn_hb(async move { let sleep = tokio::time::sleep(Duration::from_millis(0)); tokio::pin!(sleep); let heartbeat_request = HeartbeatRequest { peer: self_peer, - info: Self::build_node_info(start_time_ms, cpus, memory_bytes), + info: Self::build_node_info( + start_time_ms, + total_cpu_millicores, + total_memory_bytes, + ), ..Default::default() }; @@ -194,7 +218,7 @@ impl HeartbeatTask { let req = tokio::select! { message = outgoing_rx.recv() => { if let Some(message) = message { - Self::new_heartbeat_request(&heartbeat_request, Some(message)) + Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0) } else { warn!("Sender has been dropped, exiting the heartbeat loop"); // Receives None that means Sender was dropped, we need to break the current loop @@ -202,8 +226,8 @@ impl HeartbeatTask { } } _ = &mut sleep => { - sleep.as_mut().reset(Instant::now() + report_interval); - Self::new_heartbeat_request(&heartbeat_request, None) + sleep.as_mut().reset(Instant::now() + report_interval); + Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes()) } }; diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs index 2a66c1570a..d819251597 100644 --- a/src/meta-client/src/client.rs +++ b/src/meta-client/src/client.rs @@ -24,7 +24,9 @@ mod util; use std::fmt::Debug; use std::sync::Arc; -use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role}; +use api::v1::meta::{ + MetasrvNodeInfo, ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role, +}; pub use ask_leader::{AskLeader, LeaderProvider, LeaderProviderRef}; use cluster::Client as ClusterClient; pub use cluster::ClusterKvBackend; @@ -371,7 +373,8 @@ impl ClusterInfo for MetaClient { let mut nodes = if get_metasrv_nodes { let last_activity_ts = -1; // Metasrv does not provide this information. 
- let (leader, followers) = cluster_client.get_metasrv_peers().await?; + let (leader, followers): (Option, Vec) = + cluster_client.get_metasrv_peers().await?; followers .into_iter() .map(|node| { @@ -383,8 +386,10 @@ impl ClusterInfo for MetaClient { version: node_info.version, git_commit: node_info.git_commit, start_time_ms: node_info.start_time_ms, - cpus: node_info.cpus, - memory_bytes: node_info.memory_bytes, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, hostname: node_info.hostname, } } else { @@ -396,8 +401,10 @@ impl ClusterInfo for MetaClient { version: node.version, git_commit: node.git_commit, start_time_ms: node.start_time_ms, - cpus: node.cpus, - memory_bytes: node.memory_bytes, + total_cpu_millicores: node.cpus as i64, + total_memory_bytes: node.memory_bytes as i64, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "".to_string(), } } @@ -411,8 +418,10 @@ impl ClusterInfo for MetaClient { version: node_info.version, git_commit: node_info.git_commit, start_time_ms: node_info.start_time_ms, - cpus: node_info.cpus, - memory_bytes: node_info.memory_bytes, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, hostname: node_info.hostname, } } else { @@ -424,8 +433,10 @@ impl ClusterInfo for MetaClient { version: node.version, git_commit: node.git_commit, start_time_ms: node.start_time_ms, - cpus: node.cpus, - memory_bytes: node.memory_bytes, + total_cpu_millicores: node.cpus as i64, + total_memory_bytes: node.memory_bytes as i64, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "".to_string(), } } diff --git a/src/meta-srv/Cargo.toml b/src/meta-srv/Cargo.toml index 90a4fdc17b..bd2075501c 100644 --- a/src/meta-srv/Cargo.toml +++ b/src/meta-srv/Cargo.toml @@ -39,6 +39,7 @@ common-meta.workspace = true common-options.workspace = true common-procedure.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true common-version.workspace = true diff --git a/src/meta-srv/src/discovery/lease.rs b/src/meta-srv/src/discovery/lease.rs index 46b92c0f1a..9d9e0d6c23 100644 --- a/src/meta-srv/src/discovery/lease.rs +++ b/src/meta-srv/src/discovery/lease.rs @@ -243,8 +243,10 @@ mod tests { version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), start_time_ms: current_time_millis() as u64, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; @@ -269,8 +271,10 @@ mod tests { version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), start_time_ms: current_time_millis() as u64, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; @@ -307,8 +311,10 @@ mod tests { version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), start_time_ms: last_activity_ts as u64, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; diff --git a/src/meta-srv/src/election/rds/mysql.rs 
b/src/meta-srv/src/election/rds/mysql.rs index a0890969f8..014923c7c3 100644 --- a/src/meta-srv/src/election/rds/mysql.rs +++ b/src/meta-srv/src/election/rds/mysql.rs @@ -1161,8 +1161,10 @@ mod tests { version: "test_version".to_string(), git_commit: "test_git_commit".to_string(), start_time_ms: 0, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; mysql_election.register_candidate(&node_info).await.unwrap(); diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/meta-srv/src/election/rds/postgres.rs index 14b2bbb409..beab74dac4 100644 --- a/src/meta-srv/src/election/rds/postgres.rs +++ b/src/meta-srv/src/election/rds/postgres.rs @@ -1000,8 +1000,10 @@ mod tests { version: "test_version".to_string(), git_commit: "test_git_commit".to_string(), start_time_ms: 0, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; pg_election.register_candidate(&node_info).await.unwrap(); diff --git a/src/meta-srv/src/handler/collect_cluster_info_handler.rs b/src/meta-srv/src/handler/collect_cluster_info_handler.rs index f144f3edc5..c96229f9cf 100644 --- a/src/meta-srv/src/handler/collect_cluster_info_handler.rs +++ b/src/meta-srv/src/handler/collect_cluster_info_handler.rs @@ -52,8 +52,10 @@ impl HeartbeatHandler for CollectFrontendClusterInfoHandler { version: info.version, git_commit: info.git_commit, start_time_ms: info.start_time_ms, - cpus: info.cpus, - memory_bytes: info.memory_bytes, + total_cpu_millicores: info.total_cpu_millicores, + total_memory_bytes: info.total_memory_bytes, + cpu_usage_millicores: info.cpu_usage_millicores, + memory_usage_bytes: info.memory_usage_bytes, hostname: info.hostname, }; @@ -88,8 +90,10 @@ impl HeartbeatHandler for CollectFlownodeClusterInfoHandler { version: info.version, git_commit: info.git_commit, start_time_ms: info.start_time_ms, - cpus: info.cpus, - memory_bytes: info.memory_bytes, + total_cpu_millicores: info.total_cpu_millicores, + total_memory_bytes: info.total_memory_bytes, + cpu_usage_millicores: info.cpu_usage_millicores, + memory_usage_bytes: info.memory_usage_bytes, hostname: info.hostname, }; @@ -142,8 +146,10 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler { version: info.version, git_commit: info.git_commit, start_time_ms: info.start_time_ms, - cpus: info.cpus, - memory_bytes: info.memory_bytes, + total_cpu_millicores: info.total_cpu_millicores, + total_memory_bytes: info.total_memory_bytes, + cpu_usage_millicores: info.cpu_usage_millicores, + memory_usage_bytes: info.memory_usage_bytes, hostname: info.hostname, }; diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 4c2c7fcf53..aeaea1337b 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -22,7 +22,6 @@ use std::time::Duration; use clap::ValueEnum; use common_base::Plugins; use common_base::readable_size::ReadableSize; -use common_config::utils::ResourceSpec; use common_config::{Configurable, DEFAULT_DATA_HOME}; use common_event_recorder::EventRecorderOptions; use common_greptimedb_telemetry::GreptimeDBTelemetryTask; @@ -47,6 +46,7 @@ use common_options::datanode::DatanodeClientOptions; use common_options::memory::MemoryOptions; use common_procedure::ProcedureManagerRef; use common_procedure::options::ProcedureConfig; +use common_stat::ResourceStatRef; use 
common_telemetry::logging::{LoggingOptions, TracingOptions};
 use common_telemetry::{error, info, warn};
 use common_wal::config::MetasrvWalConfig;
@@ -372,12 +372,16 @@ pub struct MetasrvNodeInfo {
     pub git_commit: String,
     // The node start timestamp in milliseconds
     pub start_time_ms: u64,
-    // The node cpus
+    // The node total cpu millicores
     #[serde(default)]
-    pub cpus: u32,
-    // The node memory bytes
+    pub total_cpu_millicores: i64,
     #[serde(default)]
-    pub memory_bytes: u64,
+    // The node total memory bytes
+    pub total_memory_bytes: i64,
+    /// The node cpu usage in millicores
+    pub cpu_usage_millicores: i64,
+    /// The node memory usage in bytes
+    pub memory_usage_bytes: i64,
     // The node hostname
     #[serde(default)]
     pub hostname: String,
@@ -397,15 +401,19 @@ impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
             version: node_info.version.clone(),
             git_commit: node_info.git_commit.clone(),
             start_time_ms: node_info.start_time_ms,
-            cpus: node_info.cpus,
-            memory_bytes: node_info.memory_bytes,
+            cpus: node_info.total_cpu_millicores as u32,
+            memory_bytes: node_info.total_memory_bytes as u64,
             // The canonical location for node information.
             info: Some(api::v1::meta::NodeInfo {
                 version: node_info.version,
                 git_commit: node_info.git_commit,
                 start_time_ms: node_info.start_time_ms,
-                cpus: node_info.cpus,
-                memory_bytes: node_info.memory_bytes,
+                total_cpu_millicores: node_info.total_cpu_millicores,
+                total_memory_bytes: node_info.total_memory_bytes,
+                cpu_usage_millicores: node_info.cpu_usage_millicores,
+                memory_usage_bytes: node_info.memory_usage_bytes,
+                cpus: node_info.total_cpu_millicores as u32,
+                memory_bytes: node_info.total_memory_bytes as u64,
                 hostname: node_info.hostname,
             }),
         }
@@ -517,7 +525,7 @@ pub struct Metasrv {
     region_flush_ticker: Option,
     table_id_sequence: SequenceRef,
     reconciliation_manager: ReconciliationManagerRef,
-    resource_spec: ResourceSpec,
+    resource_stat: ResourceStatRef,
     plugins: Plugins,
 }
 
@@ -699,8 +707,8 @@ impl Metasrv {
         self.start_time_ms
     }
 
-    pub fn resource_spec(&self) -> &ResourceSpec {
-        &self.resource_spec
+    pub fn resource_stat(&self) -> &ResourceStatRef {
+        &self.resource_stat
     }
 
     pub fn node_info(&self) -> MetasrvNodeInfo {
@@ -710,8 +718,10 @@ impl Metasrv {
             version: build_info.version.to_string(),
             git_commit: build_info.commit_short.to_string(),
             start_time_ms: self.start_time_ms(),
-            cpus: self.resource_spec().cpus as u32,
-            memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(),
+            total_cpu_millicores: self.resource_stat.get_total_cpu_millicores(),
+            total_memory_bytes: self.resource_stat.get_total_memory_bytes(),
+            cpu_usage_millicores: self.resource_stat.get_cpu_usage_millicores(),
+            memory_usage_bytes: self.resource_stat.get_memory_usage_bytes(),
             hostname: hostname::get()
                 .unwrap_or_default()
                 .to_string_lossy()
diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs
index 9cc0b8cc72..0bcc914e27 100644
--- a/src/meta-srv/src/metasrv/builder.rs
+++ b/src/meta-srv/src/metasrv/builder.rs
@@ -46,6 +46,7 @@ use common_meta::stats::topic::TopicStatsRegistry;
 use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator};
 use common_procedure::ProcedureManagerRef;
 use common_procedure::local::{LocalManager, ManagerConfig};
+use common_stat::ResourceStatImpl;
 use common_telemetry::{info, warn};
 use snafu::{ResultExt, ensure};
 use store_api::storage::MAX_REGION_SEQ;
@@ -517,6 +518,9 @@ impl MetasrvBuilder {
             .try_start()
             .context(error::InitReconciliationManagerSnafu)?;
 
+        let mut 
resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + Ok(Metasrv { state, started: Arc::new(AtomicBool::new(false)), @@ -556,7 +560,7 @@ impl MetasrvBuilder { table_id_sequence, reconciliation_manager, topic_stats_registry, - resource_spec: Default::default(), + resource_stat: Arc::new(resource_stat), }) } } diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs index e39337c374..5c0ae4c71f 100644 --- a/src/meta-srv/src/service/cluster.rs +++ b/src/meta-srv/src/service/cluster.rs @@ -97,8 +97,10 @@ impl Metasrv { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms: self.start_time_ms(), - cpus: self.resource_spec().cpus as u32, - memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(), + total_cpu_millicores: self.resource_stat().get_total_cpu_millicores(), + total_memory_bytes: self.resource_stat().get_total_memory_bytes(), + cpu_usage_millicores: self.resource_stat().get_cpu_usage_millicores(), + memory_usage_bytes: self.resource_stat().get_memory_usage_bytes(), hostname: hostname::get() .unwrap_or_default() .to_string_lossy() diff --git a/src/standalone/src/information_extension.rs b/src/standalone/src/information_extension.rs index b15ab74a98..852da25e65 100644 --- a/src/standalone/src/information_extension.rs +++ b/src/standalone/src/information_extension.rs @@ -24,6 +24,7 @@ use common_meta::key::flow::flow_state::FlowStat; use common_meta::peer::Peer; use common_procedure::{ProcedureInfo, ProcedureManagerRef}; use common_query::request::QueryRequest; +use common_stat::{ResourceStatImpl, ResourceStatRef}; use datanode::region_server::RegionServer; use flow::StreamingEngine; use snafu::ResultExt; @@ -35,15 +36,19 @@ pub struct StandaloneInformationExtension { procedure_manager: ProcedureManagerRef, start_time_ms: u64, flow_streaming_engine: RwLock>>, + resource_stat: ResourceStatRef, } impl StandaloneInformationExtension { pub fn new(region_server: RegionServer, procedure_manager: ProcedureManagerRef) -> Self { + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); Self { region_server, procedure_manager, start_time_ms: common_time::util::current_time_millis() as u64, flow_streaming_engine: RwLock::new(None), + resource_stat: Arc::new(resource_stat), } } @@ -75,8 +80,10 @@ impl InformationExtension for StandaloneInformationExtension { // Use `self.start_time_ms` instead. // It's not precise but enough. 
start_time_ms: self.start_time_ms, - cpus: common_stat::get_total_cpu_millicores() as u32, - memory_bytes: common_stat::get_total_memory_bytes() as u64, + total_cpu_millicores: self.resource_stat.get_total_cpu_millicores(), + total_memory_bytes: self.resource_stat.get_total_memory_bytes(), + cpu_usage_millicores: self.resource_stat.get_cpu_usage_millicores(), + memory_usage_bytes: self.resource_stat.get_memory_usage_bytes(), hostname: hostname::get() .unwrap_or_default() .to_string_lossy() diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index 13e4cc3115..91cb0f5ad2 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -35,6 +35,7 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-test-util.workspace = true common-time.workspace = true diff --git a/tests-integration/src/cluster.rs b/tests-integration/src/cluster.rs index 6be94cbcd4..19c2ce4134 100644 --- a/tests-integration/src/cluster.rs +++ b/tests-integration/src/cluster.rs @@ -44,6 +44,7 @@ use common_meta::kv_backend::memory::MemoryKvBackend; use common_meta::peer::Peer; use common_runtime::Builder as RuntimeBuilder; use common_runtime::runtime::BuilderBuild; +use common_stat::ResourceStatImpl; use common_test_util::temp_dir::create_temp_dir; use common_wal::config::{DatanodeWalConfig, MetasrvWalConfig}; use datanode::config::DatanodeOptions; @@ -411,11 +412,15 @@ impl GreptimeDbClusterBuilder { let fe_opts = self.build_frontend_options(); + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = HeartbeatTask::new( &fe_opts, meta_client.clone(), HeartbeatOptions::default(), Arc::new(handlers_executor), + Arc::new(resource_stat), ); let instance = FrontendBuilder::new( diff --git a/tests/cases/distributed/information_schema/cluster_info.result b/tests/cases/distributed/information_schema/cluster_info.result index 63d02f4355..4ab8f6808d 100644 --- a/tests/cases/distributed/information_schema/cluster_info.result +++ b/tests/cases/distributed/information_schema/cluster_info.result @@ -11,8 +11,10 @@ DESC TABLE CLUSTER_INFO; | peer_type | String | | NO | | FIELD | | peer_addr | String | | YES | | FIELD | | peer_hostname | String | | YES | | FIELD | -| total_cpu_millicores | UInt32 | | NO | | FIELD | -| total_memory_bytes | UInt64 | | NO | | FIELD | +| total_cpu_millicores | Int64 | | NO | | FIELD | +| total_memory_bytes | Int64 | | NO | | FIELD | +| cpu_usage_millicores | Int64 | | NO | | FIELD | +| memory_usage_bytes | Int64 | | NO | | FIELD | | version | String | | NO | | FIELD | | git_commit | String | | NO | | FIELD | | start_time | TimestampMillisecond | | YES | | FIELD | diff --git a/tests/cases/standalone/common/system/information_schema.result b/tests/cases/standalone/common/system/information_schema.result index eef56b91b2..1cb53ccfe3 100644 --- a/tests/cases/standalone/common/system/information_schema.result +++ b/tests/cases/standalone/common/system/information_schema.result @@ -72,18 +72,20 @@ select * from information_schema.columns order by table_schema, table_name, colu | greptime | information_schema | check_constraints | constraint_catalog | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | check_constraints | constraint_name | 3 | 2147483647 | 2147483647 
| | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | check_constraints | constraint_schema | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | -| greptime | information_schema | cluster_info | active_time | 11 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | -| greptime | information_schema | cluster_info | git_commit | 8 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | -| greptime | information_schema | cluster_info | node_status | 12 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | +| greptime | information_schema | cluster_info | active_time | 13 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | +| greptime | information_schema | cluster_info | cpu_usage_millicores | 7 | | | 19 | 0 | | | | | | select,insert | | Int64 | bigint | FIELD | | No | bigint | | | +| greptime | information_schema | cluster_info | git_commit | 10 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | +| greptime | information_schema | cluster_info | memory_usage_bytes | 8 | | | 19 | 0 | | | | | | select,insert | | Int64 | bigint | FIELD | | No | bigint | | | +| greptime | information_schema | cluster_info | node_status | 14 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | | greptime | information_schema | cluster_info | peer_addr | 3 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | | greptime | information_schema | cluster_info | peer_hostname | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | | greptime | information_schema | cluster_info | peer_id | 1 | | | 19 | 0 | | | | | | select,insert | | Int64 | bigint | FIELD | | No | bigint | | | | greptime | information_schema | cluster_info | peer_type | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | -| greptime | information_schema | cluster_info | start_time | 9 | | | | | 3 | | | | | select,insert | | TimestampMillisecond | timestamp(3) | FIELD | | Yes | timestamp(3) | | | -| greptime | information_schema | cluster_info | total_cpu_millicores | 5 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | | -| greptime | information_schema | cluster_info | total_memory_bytes | 6 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | -| greptime | information_schema | cluster_info | uptime | 10 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | -| greptime | information_schema | cluster_info | version | 7 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | +| greptime | information_schema | cluster_info | start_time | 11 | | | | | 3 | | | | | select,insert | | TimestampMillisecond | timestamp(3) | FIELD | | Yes | timestamp(3) | | | +| greptime | information_schema | 
cluster_info | total_cpu_millicores | 5 | | | 19 | 0 | | | | | | select,insert | | Int64 | bigint | FIELD | | No | bigint | | | +| greptime | information_schema | cluster_info | total_memory_bytes | 6 | | | 19 | 0 | | | | | | select,insert | | Int64 | bigint | FIELD | | No | bigint | | | +| greptime | information_schema | cluster_info | uptime | 12 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | +| greptime | information_schema | cluster_info | version | 9 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | collation_character_set_applicability | character_set_name | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | collation_character_set_applicability | collation_name | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | collations | character_set_name | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | diff --git a/tests/cases/standalone/information_schema/cluster_info.result b/tests/cases/standalone/information_schema/cluster_info.result index 8542984028..bc9520ba6a 100644 --- a/tests/cases/standalone/information_schema/cluster_info.result +++ b/tests/cases/standalone/information_schema/cluster_info.result @@ -11,8 +11,10 @@ DESC TABLE CLUSTER_INFO; | peer_type | String | | NO | | FIELD | | peer_addr | String | | YES | | FIELD | | peer_hostname | String | | YES | | FIELD | -| total_cpu_millicores | UInt32 | | NO | | FIELD | -| total_memory_bytes | UInt64 | | NO | | FIELD | +| total_cpu_millicores | Int64 | | NO | | FIELD | +| total_memory_bytes | Int64 | | NO | | FIELD | +| cpu_usage_millicores | Int64 | | NO | | FIELD | +| memory_usage_bytes | Int64 | | NO | | FIELD | | version | String | | NO | | FIELD | | git_commit | String | | NO | | FIELD | | start_time | TimestampMillisecond | | YES | | FIELD | From 03a29c6591e97de34196069d5c196e501016d3e1 Mon Sep 17 00:00:00 2001 From: Sicong Hu Date: Fri, 24 Oct 2025 11:24:13 +0800 Subject: [PATCH 003/149] fix: correct test_index_build_type_compact (#7137) Signed-off-by: SNC123 --- src/mito2/src/engine/index_build_test.rs | 17 +++--- src/mito2/src/engine/listener.rs | 63 +++++++++++++++----- src/mito2/src/sst/index.rs | 25 ++++++++ src/mito2/src/worker.rs | 11 +++- src/mito2/src/worker/handle_rebuild_index.rs | 6 +- 5 files changed, 91 insertions(+), 31 deletions(-) diff --git a/src/mito2/src/engine/index_build_test.rs b/src/mito2/src/engine/index_build_test.rs index 6fe27929e5..404aa3ad01 100644 --- a/src/mito2/src/engine/index_build_test.rs +++ b/src/mito2/src/engine/index_build_test.rs @@ -32,11 +32,6 @@ use crate::test_util::{ CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, reopen_region, rows_schema, }; -// wait listener receives enough success count. 
-async fn wait_finish(listener: &IndexBuildListener, times: usize) { - listener.wait_finish(times).await; -} - fn async_build_mode_config(is_create_on_flush: bool) -> MitoConfig { let mut config = MitoConfig::default(); config.index.build_mode = IndexBuildMode::Async; @@ -84,7 +79,7 @@ fn assert_listener_counts( expected_success_count: usize, ) { assert_eq!(listener.begin_count(), expected_begin_count); - assert_eq!(listener.success_count(), expected_success_count); + assert_eq!(listener.finish_count(), expected_success_count); } #[tokio::test] @@ -155,7 +150,7 @@ async fn test_index_build_type_flush() { flush_region(&engine, region_id, None).await; // After 2 index build task are finished, 2 index files should exist. - wait_finish(&listener, 2).await; + listener.wait_finish(2).await; let scanner = engine .scanner(region_id, ScanRequest::default()) .await @@ -204,6 +199,8 @@ async fn test_index_build_type_compact() { put_and_flush(&engine, region_id, &column_schemas, 15..25).await; put_and_flush(&engine, region_id, &column_schemas, 40..50).await; + // all index build tasks begin means flush tasks are all finished. + listener.wait_begin(4).await; // Before compaction is triggered, files should be 4, and not all index files are built. let scanner = engine .scanner(region_id, ScanRequest::default()) @@ -216,8 +213,8 @@ async fn test_index_build_type_compact() { // This explicit compaction call serves to make the process deterministic for the test. compact(&engine, region_id).await; + listener.wait_begin(5).await; // 4 flush + 1 compaction begin // Before compaction is triggered, files should be 2, and not all index files are built. - listener.clear_success_count(); let scanner = engine .scanner(region_id, ScanRequest::default()) .await @@ -226,7 +223,7 @@ async fn test_index_build_type_compact() { assert!(num_of_index_files(&engine, &scanner, region_id).await < 2); // Wait a while to make sure index build tasks are finished. - wait_finish(&listener, 2).await; + listener.wait_stop(5).await; // 4 flush + 1 compaction = some abort + some finish let scanner = engine .scanner(region_id, ScanRequest::default()) .await @@ -292,7 +289,7 @@ async fn test_index_build_type_schema_change() { .handle_request(region_id, RegionRequest::Alter(set_index_request)) .await .unwrap(); - wait_finish(&listener, 1).await; + listener.wait_finish(1).await; let scanner = engine .scanner(region_id, ScanRequest::default()) .await diff --git a/src/mito2/src/engine/listener.rs b/src/mito2/src/engine/listener.rs index 317c3cdfd0..ebc20ac280 100644 --- a/src/mito2/src/engine/listener.rs +++ b/src/mito2/src/engine/listener.rs @@ -75,10 +75,13 @@ pub trait EventListener: Send + Sync { async fn on_notify_region_change_result_begin(&self, _region_id: RegionId) {} /// Notifies the listener that the index build task is executed successfully. - async fn on_index_build_success(&self, _region_file_id: RegionFileId) {} + async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {} /// Notifies the listener that the index build task is started. async fn on_index_build_begin(&self, _region_file_id: RegionFileId) {} + + /// Notifies the listener that the index build task is aborted. 
+ async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {} } pub type EventListenerRef = Arc; @@ -309,45 +312,75 @@ impl EventListener for NotifyRegionChangeResultListener { #[derive(Default)] pub struct IndexBuildListener { - notify: Notify, - success_count: AtomicUsize, - start_count: AtomicUsize, + begin_count: AtomicUsize, + begin_notify: Notify, + finish_count: AtomicUsize, + finish_notify: Notify, + abort_count: AtomicUsize, + abort_notify: Notify, + // stop means finished or aborted + stop_notify: Notify, } impl IndexBuildListener { /// Wait until index build is done for `times` times. pub async fn wait_finish(&self, times: usize) { - while self.success_count.load(Ordering::Relaxed) < times { - self.notify.notified().await; + while self.finish_count.load(Ordering::Relaxed) < times { + self.finish_notify.notified().await; + } + } + + /// Wait until index build is stopped for `times` times. + pub async fn wait_stop(&self, times: usize) { + while self.finish_count.load(Ordering::Relaxed) + self.abort_count.load(Ordering::Relaxed) + < times + { + self.stop_notify.notified().await; + } + } + + /// Wait until index build is begun for `times` times. + pub async fn wait_begin(&self, times: usize) { + while self.begin_count.load(Ordering::Relaxed) < times { + self.begin_notify.notified().await; } } /// Clears the success count. - pub fn clear_success_count(&self) { - self.success_count.store(0, Ordering::Relaxed); + pub fn clear_finish_count(&self) { + self.finish_count.store(0, Ordering::Relaxed); } /// Returns the success count. - pub fn success_count(&self) -> usize { - self.success_count.load(Ordering::Relaxed) + pub fn finish_count(&self) -> usize { + self.finish_count.load(Ordering::Relaxed) } /// Returns the start count. pub fn begin_count(&self) -> usize { - self.start_count.load(Ordering::Relaxed) + self.begin_count.load(Ordering::Relaxed) } } #[async_trait] impl EventListener for IndexBuildListener { - async fn on_index_build_success(&self, region_file_id: RegionFileId) { + async fn on_index_build_finish(&self, region_file_id: RegionFileId) { info!("Region {} index build successfully", region_file_id); - self.success_count.fetch_add(1, Ordering::Relaxed); - self.notify.notify_one(); + self.finish_count.fetch_add(1, Ordering::Relaxed); + self.finish_notify.notify_one(); + self.stop_notify.notify_one(); } async fn on_index_build_begin(&self, region_file_id: RegionFileId) { info!("Region {} index build begin", region_file_id); - self.start_count.fetch_add(1, Ordering::Relaxed); + self.begin_count.fetch_add(1, Ordering::Relaxed); + self.begin_notify.notify_one(); + } + + async fn on_index_build_abort(&self, region_file_id: RegionFileId) { + info!("Region {} index build aborted", region_file_id); + self.abort_count.fetch_add(1, Ordering::Relaxed); + self.abort_notify.notify_one(); + self.stop_notify.notify_one(); } } diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 8ad7f6ef01..cc8469332a 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -62,6 +62,7 @@ use crate::sst::index::inverted_index::creator::InvertedIndexer; use crate::sst::parquet::SstInfo; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::PrimaryKeyArray; +use crate::worker::WorkerListener; pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index"; pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index"; @@ -451,6 +452,7 @@ pub struct IndexBuildTask { pub file_meta: FileMeta, pub reason: IndexBuildType, pub 
access_layer: AccessLayerRef, + pub(crate) listener: WorkerListener, pub(crate) manifest_ctx: ManifestContextRef, pub write_cache: Option, pub file_purger: FilePurgerRef, @@ -486,6 +488,12 @@ impl IndexBuildTask { } async fn do_index_build(&mut self, version_control: VersionControlRef) { + self.listener + .on_index_build_begin(RegionFileId::new( + self.file_meta.region_id, + self.file_meta.file_id, + )) + .await; match self.index_build(version_control).await { Ok(outcome) => self.on_success(outcome).await, Err(e) => { @@ -540,6 +548,12 @@ impl IndexBuildTask { if !self.check_sst_file_exists(&version_control).await { // Calls abort to clean up index files. indexer.abort().await; + self.listener + .on_index_build_abort(RegionFileId::new( + self.file_meta.region_id, + self.file_meta.file_id, + )) + .await; return Ok(IndexBuildOutcome::Aborted(format!( "SST file not found during index build, region: {}, file_id: {}", self.file_meta.region_id, self.file_meta.file_id @@ -575,6 +589,12 @@ impl IndexBuildTask { if !self.check_sst_file_exists(&version_control).await { // Calls abort to clean up index files. indexer.abort().await; + self.listener + .on_index_build_abort(RegionFileId::new( + self.file_meta.region_id, + self.file_meta.file_id, + )) + .await; return Ok(IndexBuildOutcome::Aborted(format!( "SST file not found during index build, region: {}, file_id: {}", self.file_meta.region_id, self.file_meta.file_id @@ -1192,6 +1212,7 @@ mod tests { }, reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1242,6 +1263,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1309,6 +1331,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1405,6 +1428,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1485,6 +1509,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: Some(write_cache.clone()), file_purger, diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 87c25cd964..322141fd1b 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -1220,10 +1220,10 @@ impl WorkerListener { } } - pub(crate) async fn on_index_build_success(&self, _region_file_id: RegionFileId) { + pub(crate) async fn on_index_build_finish(&self, _region_file_id: RegionFileId) { #[cfg(any(test, feature = "test"))] if let Some(listener) = &self.listener { - listener.on_index_build_success(_region_file_id).await; + listener.on_index_build_finish(_region_file_id).await; } } @@ -1233,6 +1233,13 @@ impl WorkerListener { listener.on_index_build_begin(_region_file_id).await; } } + + pub(crate) async fn on_index_build_abort(&self, _region_file_id: RegionFileId) { + #[cfg(any(test, feature = "test"))] + if let Some(listener) = &self.listener { + listener.on_index_build_abort(_region_file_id).await; + } + } } #[cfg(test)] diff --git a/src/mito2/src/worker/handle_rebuild_index.rs b/src/mito2/src/worker/handle_rebuild_index.rs index 
71f9bc206f..38ca07f1a9 100644 --- a/src/mito2/src/worker/handle_rebuild_index.rs +++ b/src/mito2/src/worker/handle_rebuild_index.rs @@ -71,6 +71,7 @@ impl RegionWorkerLoop { file_meta: file.meta_ref().clone(), reason: build_type, access_layer: access_layer.clone(), + listener: self.listener.clone(), manifest_ctx: region.manifest_ctx.clone(), write_cache: self.cache_manager.write_cache().cloned(), file_purger: file.file_purger(), @@ -172,9 +173,6 @@ impl RegionWorkerLoop { let _ = self .index_build_scheduler .schedule_build(®ion.version_control, task); - self.listener - .on_index_build_begin(RegionFileId::new(region_id, file_handle.meta_ref().file_id)) - .await; } // Wait for all index build tasks to finish and notify the caller. common_runtime::spawn_global(async move { @@ -212,7 +210,7 @@ impl RegionWorkerLoop { ); for file_meta in &request.edit.files_to_add { self.listener - .on_index_build_success(RegionFileId::new(region_id, file_meta.file_id)) + .on_index_build_finish(RegionFileId::new(region_id, file_meta.file_id)) .await; } } From 6ad23bc9b41a368d7396b46838c6f5570b5539a3 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:28:04 +0800 Subject: [PATCH 004/149] refactor: convert to postgres values directly from arrow (#7131) * refactor: convert to pg values directly from arrow Signed-off-by: luofucong * resolve PR comments Signed-off-by: luofucong --------- Signed-off-by: luofucong --- src/servers/src/postgres/handler.rs | 25 +- src/servers/src/postgres/types.rs | 1135 +++++++++++++++++---------- 2 files changed, 714 insertions(+), 446 deletions(-) diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 9561b9605e..daccf9dc26 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -28,7 +28,7 @@ use futures::{Sink, SinkExt, Stream, StreamExt, future, stream}; use pgwire::api::portal::{Format, Portal}; use pgwire::api::query::{ExtendedQueryHandler, SimpleQueryHandler}; use pgwire::api::results::{ - DataRowEncoder, DescribePortalResponse, DescribeStatementResponse, QueryResponse, Response, Tag, + DescribePortalResponse, DescribeStatementResponse, QueryResponse, Response, Tag, }; use pgwire::api::stmt::{QueryParser, StoredStatement}; use pgwire::api::{ClientInfo, ErrorHandler, Type}; @@ -160,25 +160,16 @@ where let pg_schema = Arc::new(schema_to_pg(schema.as_ref(), field_format).map_err(convert_err)?); let pg_schema_ref = pg_schema.clone(); let data_row_stream = recordbatches_stream - .map(|record_batch_result| match record_batch_result { - Ok(rb) => stream::iter( - // collect rows from a single recordbatch into vector to avoid - // borrowing it - rb.rows().map(Ok).collect::>(), - ) + .map(move |result| match result { + Ok(record_batch) => stream::iter(RecordBatchRowIterator::new( + query_ctx.clone(), + pg_schema_ref.clone(), + record_batch, + )) .boxed(), Err(e) => stream::once(future::err(convert_err(e))).boxed(), }) - .flatten() // flatten into stream> - .map(move |row| { - row.and_then(|row| { - let mut encoder = DataRowEncoder::new(pg_schema_ref.clone()); - for (value, column) in row.into_iter().zip(schema.column_schemas()) { - encode_value(&query_ctx, value, &mut encoder, &column.data_type)?; - } - encoder.finish() - }) - }); + .flatten(); Ok(Response::Query(QueryResponse::new( pg_schema, diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs index 9c32ee2fdd..b8251a5d95 100644 --- a/src/servers/src/postgres/types.rs +++ 
b/src/servers/src/postgres/types.rs @@ -18,23 +18,38 @@ mod error; mod interval; use std::collections::HashMap; -use std::ops::Deref; use std::sync::Arc; +use arrow::array::{Array, ArrayRef, AsArray}; +use arrow::datatypes::{ + Date32Type, Date64Type, Decimal128Type, DurationMicrosecondType, DurationMillisecondType, + DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, + Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, + Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, +}; +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime}; -use common_time::{IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth}; +use common_decimal::Decimal128; +use common_recordbatch::RecordBatch; +use common_time::time::Time; +use common_time::{ + Date, Duration, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp, +}; use datafusion_common::ScalarValue; use datafusion_expr::LogicalPlan; use datatypes::arrow::datatypes::DataType as ArrowDataType; use datatypes::json::JsonStructureSettings; use datatypes::prelude::{ConcreteDataType, Value}; -use datatypes::schema::Schema; -use datatypes::types::{IntervalType, JsonFormat, TimestampType, jsonb_to_string}; -use datatypes::value::{ListValue, StructValue}; +use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; +use datatypes::types::{IntervalType, TimestampType, jsonb_to_string}; +use datatypes::value::StructValue; use pgwire::api::Type; use pgwire::api::portal::{Format, Portal}; use pgwire::api::results::{DataRowEncoder, FieldInfo}; use pgwire::error::{PgWireError, PgWireResult}; +use pgwire::messages::data::DataRow; use session::context::QueryContextRef; use session::session_config::PGByteaOutputValue; use snafu::ResultExt; @@ -88,182 +103,158 @@ fn encode_struct( fn encode_array( query_ctx: &QueryContextRef, - value_list: ListValue, + array: ArrayRef, builder: &mut DataRowEncoder, ) -> PgWireResult<()> { - match value_list.datatype().as_ref() { - ConcreteDataType::Boolean(_) => { - let array = value_list - .items() + macro_rules! 
encode_primitive_array { + ($array: ident, $data_type: ty, $lower_type: ty, $upper_type: ty) => {{ + let array = $array.iter().collect::>>(); + if array .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Boolean(v) => Ok(Some(*v)), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected bool",), - })), - }) - .collect::>>>()?; + .all(|x| x.is_none_or(|i| i <= <$lower_type>::MAX as $data_type)) + { + builder.encode_field( + &array + .into_iter() + .map(|x| x.map(|i| i as $lower_type)) + .collect::>>(), + ) + } else { + builder.encode_field( + &array + .into_iter() + .map(|x| x.map(|i| i as $upper_type)) + .collect::>>(), + ) + } + }}; + } + + match array.data_type() { + DataType::Boolean => { + let array = array.as_boolean(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Int8(v) => Ok(Some(*v)), - Value::UInt8(v) => Ok(Some(*v as i8)), - _ => Err(convert_err(Error::Internal { - err_msg: format!( - "Invalid list item type, find {v:?}, expected int8 or uint8", - ), - })), - }) - .collect::>>>()?; + DataType::Int8 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Int16(v) => Ok(Some(*v)), - Value::UInt16(v) => Ok(Some(*v as i16)), - _ => Err(convert_err(Error::Internal { - err_msg: format!( - "Invalid list item type, find {v:?}, expected int16 or uint16", - ), - })), - }) - .collect::>>>()?; + DataType::Int16 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Int32(_) | ConcreteDataType::UInt32(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Int32(v) => Ok(Some(*v)), - Value::UInt32(v) => Ok(Some(*v as i32)), - _ => Err(convert_err(Error::Internal { - err_msg: format!( - "Invalid list item type, find {v:?}, expected int32 or uint32", - ), - })), - }) - .collect::>>>()?; + DataType::Int32 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Int64(v) => Ok(Some(*v)), - Value::UInt64(v) => Ok(Some(*v as i64)), - _ => Err(convert_err(Error::Internal { - err_msg: format!( - "Invalid list item type, find {v:?}, expected int64 or uint64", - ), - })), - }) - .collect::>>>()?; + DataType::Int64 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Float32(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Float32(v) => Ok(Some(v.0)), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected float32",), - })), - }) - .collect::>>>()?; + DataType::UInt8 => { + let array = array.as_primitive::(); + encode_primitive_array!(array, u8, i8, i16) + } + DataType::UInt16 => { + let array = array.as_primitive::(); + encode_primitive_array!(array, u16, i16, i32) + } + 
DataType::UInt32 => { + let array = array.as_primitive::(); + encode_primitive_array!(array, u32, i32, i64) + } + DataType::UInt64 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); + if array.iter().all(|x| x.is_none_or(|i| i <= i64::MAX as u64)) { + builder.encode_field( + &array + .into_iter() + .map(|x| x.map(|i| i as i64)) + .collect::>>(), + ) + } else { + builder.encode_field( + &array + .into_iter() + .map(|x| x.map(|i| i.to_string())) + .collect::>(), + ) + } + } + DataType::Float32 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Float64(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Float64(v) => Ok(Some(v.0)), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected float64",), - })), - }) - .collect::>>>()?; + DataType::Float64 => { + let array = array.as_primitive::(); + let array = array.iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Binary(_) | ConcreteDataType::Vector(_) => { + DataType::Binary => { let bytea_output = query_ctx.configuration_parameter().postgres_bytea_output(); + let array = array.as_binary::(); match *bytea_output { PGByteaOutputValue::ESCAPE => { - let array = value_list - .items() + let array = array .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Binary(v) => Ok(Some(EscapeOutputBytea(v.deref()))), - - _ => Err(convert_err(Error::Internal { - err_msg: format!( - "Invalid list item type, find {v:?}, expected binary", - ), - })), - }) - .collect::>>>()?; + .map(|v| v.map(EscapeOutputBytea)) + .collect::>(); builder.encode_field(&array) } PGByteaOutputValue::HEX => { - let array = value_list - .items() + let array = array .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Binary(v) => Ok(Some(HexOutputBytea(v.deref()))), - - _ => Err(convert_err(Error::Internal { - err_msg: format!( - "Invalid list item type, find {v:?}, expected binary", - ), - })), - }) - .collect::>>>()?; + .map(|v| v.map(HexOutputBytea)) + .collect::>(); builder.encode_field(&array) } } } - &ConcreteDataType::String(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::String(v) => Ok(Some(v.as_utf8())), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected string",), - })), - }) - .collect::>>>()?; + DataType::Utf8 => { + let array = array.as_string::(); + let array = array.into_iter().collect::>(); builder.encode_field(&array) } - ConcreteDataType::Date(_) => { - let array = value_list - .items() - .iter() + DataType::LargeUtf8 => { + let array = array.as_string::(); + let array = array.into_iter().collect::>(); + builder.encode_field(&array) + } + DataType::Utf8View => { + let array = array.as_string_view(); + let array = array.into_iter().collect::>(); + builder.encode_field(&array) + } + DataType::Date32 | DataType::Date64 => { + let iter: Box>> = + if matches!(array.data_type(), DataType::Date32) { + let array = array.as_primitive::(); + Box::new(array.into_iter()) + } else { + let array = array.as_primitive::(); + // `Date64` values are milliseconds representation of `Date32` values, according + // to its specification. So we convert them to `Date32` values to process the + // `Date64` array unified with `Date32` array. 
+ Box::new( + array + .into_iter() + .map(|x| x.map(|i| (i / 86_400_000) as i32)), + ) + }; + let array = iter + .into_iter() .map(|v| match v { - Value::Null => Ok(None), - Value::Date(v) => { - if let Some(date) = v.to_chrono_date() { + None => Ok(None), + Some(v) => { + if let Some(date) = Date::new(v).to_chrono_date() { let (style, order) = *query_ctx.configuration_parameter().pg_datetime_style(); Ok(Some(StylingDate(date, style, order))) @@ -273,20 +264,36 @@ fn encode_array( })) } } - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected date",), - })), }) .collect::>>>()?; builder.encode_field(&array) } - ConcreteDataType::Timestamp(_) => { - let array = value_list - .items() - .iter() + DataType::Timestamp(time_unit, _) => { + let array = match time_unit { + TimeUnit::Second => { + let array = array.as_primitive::(); + array.into_iter().collect::>() + } + TimeUnit::Millisecond => { + let array = array.as_primitive::(); + array.into_iter().collect::>() + } + TimeUnit::Microsecond => { + let array = array.as_primitive::(); + array.into_iter().collect::>() + } + TimeUnit::Nanosecond => { + let array = array.as_primitive::(); + array.into_iter().collect::>() + } + }; + let time_unit = time_unit.into(); + let array = array + .into_iter() .map(|v| match v { - Value::Null => Ok(None), - Value::Timestamp(v) => { + None => Ok(None), + Some(v) => { + let v = Timestamp::new(v, time_unit); if let Some(datetime) = v.to_chrono_datetime_with_timezone(Some(&query_ctx.timezone())) { @@ -299,183 +306,404 @@ fn encode_array( })) } } - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected timestamp",), - })), }) .collect::>>>()?; builder.encode_field(&array) } - ConcreteDataType::Time(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Time(v) => Ok(v.to_chrono_time()), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected time",), - })), - }) - .collect::>>>()?; + DataType::Time32(time_unit) | DataType::Time64(time_unit) => { + let iter: Box>> = match time_unit { + TimeUnit::Second => { + let array = array.as_primitive::(); + Box::new( + array + .into_iter() + .map(|v| v.map(|i| Time::new_second(i as i64))), + ) + } + TimeUnit::Millisecond => { + let array = array.as_primitive::(); + Box::new( + array + .into_iter() + .map(|v| v.map(|i| Time::new_millisecond(i as i64))), + ) + } + TimeUnit::Microsecond => { + let array = array.as_primitive::(); + Box::new(array.into_iter().map(|v| v.map(Time::new_microsecond))) + } + TimeUnit::Nanosecond => { + let array = array.as_primitive::(); + Box::new(array.into_iter().map(|v| v.map(Time::new_nanosecond))) + } + }; + let array = iter + .into_iter() + .map(|v| v.and_then(|v| v.to_chrono_time())) + .collect::>>(); builder.encode_field(&array) } - ConcreteDataType::Interval(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::IntervalYearMonth(v) => Ok(Some(PgInterval::from(*v))), - Value::IntervalDayTime(v) => Ok(Some(PgInterval::from(*v))), - Value::IntervalMonthDayNano(v) => Ok(Some(PgInterval::from(*v))), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected interval",), - })), - }) - .collect::>>>()?; + DataType::Interval(interval_unit) => { + let array = match interval_unit { + IntervalUnit::YearMonth => { + let array = array.as_primitive::(); 
+ array + .into_iter() + .map(|v| v.map(|i| PgInterval::from(IntervalYearMonth::from(i)))) + .collect::>() + } + IntervalUnit::DayTime => { + let array = array.as_primitive::(); + array + .into_iter() + .map(|v| v.map(|i| PgInterval::from(IntervalDayTime::from(i)))) + .collect::>() + } + IntervalUnit::MonthDayNano => { + let array = array.as_primitive::(); + array + .into_iter() + .map(|v| v.map(|i| PgInterval::from(IntervalMonthDayNano::from(i)))) + .collect::>() + } + }; builder.encode_field(&array) } - ConcreteDataType::Decimal128(_) => { - let array = value_list - .items() - .iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Decimal128(v) => Ok(Some(v.to_string())), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected decimal",), - })), - }) - .collect::>>>()?; + DataType::Decimal128(precision, scale) => { + let array = array.as_primitive::(); + let array = array + .into_iter() + .map(|v| v.map(|i| Decimal128::new(i, *precision, *scale).to_string())) + .collect::>(); builder.encode_field(&array) } - ConcreteDataType::Json(j) => match &j.format { - JsonFormat::Jsonb => { - let array = value_list - .take_items() - .into_iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Binary(v) => { - let s = jsonb_to_string(&v).map_err(convert_err)?; - Ok(Some(s)) - } - - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected json",), - })), - }) - .collect::>>>()?; - builder.encode_field(&array) - } - JsonFormat::Native(_) => { - let array = value_list - .take_items() - .into_iter() - .map(|v| match v { - Value::Null => Ok(None), - Value::Json(inner) => serde_json::Value::try_from(*inner) - .map(Some) - .map_err(|e| PgWireError::ApiError(Box::new(e))), - _ => Err(convert_err(Error::Internal { - err_msg: format!("Invalid list item type, find {v:?}, expected json",), - })), - }) - .collect::>>>()?; - builder.encode_field(&array) - } - }, _ => Err(convert_err(Error::Internal { err_msg: format!( "cannot write array type {:?} in postgres protocol: unimplemented", - value_list.datatype() + array.data_type() ), })), } } -pub(super) fn encode_value( - query_ctx: &QueryContextRef, - value: Value, - builder: &mut DataRowEncoder, - datatype: &ConcreteDataType, -) -> PgWireResult<()> { - match value { - Value::Null => builder.encode_field(&None::<&i8>), - Value::Boolean(v) => builder.encode_field(&v), - Value::UInt8(v) => builder.encode_field(&(v as i8)), - Value::UInt16(v) => builder.encode_field(&(v as i16)), - Value::UInt32(v) => builder.encode_field(&v), - Value::UInt64(v) => builder.encode_field(&(v as i64)), - Value::Int8(v) => builder.encode_field(&v), - Value::Int16(v) => builder.encode_field(&v), - Value::Int32(v) => builder.encode_field(&v), - Value::Int64(v) => builder.encode_field(&v), - Value::Float32(v) => builder.encode_field(&v.0), - Value::Float64(v) => builder.encode_field(&v.0), - Value::String(v) => builder.encode_field(&v.as_utf8()), - Value::Binary(v) => match datatype { - ConcreteDataType::Json(_j) => { - let s = jsonb_to_string(v.as_ref()).map_err(convert_err)?; - builder.encode_field(&s) +pub(crate) struct RecordBatchRowIterator { + query_ctx: QueryContextRef, + pg_schema: Arc>, + schema: SchemaRef, + record_batch: arrow::record_batch::RecordBatch, + i: usize, +} + +impl Iterator for RecordBatchRowIterator { + type Item = PgWireResult; + + fn next(&mut self) -> Option { + if self.i < self.record_batch.num_rows() { + let mut encoder = 
DataRowEncoder::new(self.pg_schema.clone()); + if let Err(e) = self.encode_row(self.i, &mut encoder) { + return Some(Err(e)); } - _ => { - let bytea_output = query_ctx.configuration_parameter().postgres_bytea_output(); - match *bytea_output { - PGByteaOutputValue::ESCAPE => { - builder.encode_field(&EscapeOutputBytea(v.deref())) + self.i += 1; + Some(encoder.finish()) + } else { + None + } + } +} + +impl RecordBatchRowIterator { + pub(crate) fn new( + query_ctx: QueryContextRef, + pg_schema: Arc>, + record_batch: RecordBatch, + ) -> Self { + let schema = record_batch.schema.clone(); + let record_batch = record_batch.into_df_record_batch(); + Self { + query_ctx, + pg_schema, + schema, + record_batch, + i: 0, + } + } + + fn encode_row(&mut self, i: usize, encoder: &mut DataRowEncoder) -> PgWireResult<()> { + for (j, column) in self.record_batch.columns().iter().enumerate() { + if column.is_null(i) { + encoder.encode_field(&None::<&i8>)?; + continue; + } + + match column.data_type() { + DataType::Null => { + encoder.encode_field(&None::<&i8>)?; + } + DataType::Boolean => { + let array = column.as_boolean(); + encoder.encode_field(&array.value(i))?; + } + DataType::UInt8 => { + let array = column.as_primitive::(); + let value = array.value(i); + if value <= i8::MAX as u8 { + encoder.encode_field(&(value as i8))?; + } else { + encoder.encode_field(&(value as i16))?; } - PGByteaOutputValue::HEX => builder.encode_field(&HexOutputBytea(v.deref())), + } + DataType::UInt16 => { + let array = column.as_primitive::(); + let value = array.value(i); + if value <= i16::MAX as u16 { + encoder.encode_field(&(value as i16))?; + } else { + encoder.encode_field(&(value as i32))?; + } + } + DataType::UInt32 => { + let array = column.as_primitive::(); + let value = array.value(i); + if value <= i32::MAX as u32 { + encoder.encode_field(&(value as i32))?; + } else { + encoder.encode_field(&(value as i64))?; + } + } + DataType::UInt64 => { + let array = column.as_primitive::(); + let value = array.value(i); + if value <= i64::MAX as u64 { + encoder.encode_field(&(value as i64))?; + } else { + encoder.encode_field(&value.to_string())?; + } + } + DataType::Int8 => { + let array = column.as_primitive::(); + encoder.encode_field(&array.value(i))?; + } + DataType::Int16 => { + let array = column.as_primitive::(); + encoder.encode_field(&array.value(i))?; + } + DataType::Int32 => { + let array = column.as_primitive::(); + encoder.encode_field(&array.value(i))?; + } + DataType::Int64 => { + let array = column.as_primitive::(); + encoder.encode_field(&array.value(i))?; + } + DataType::Float32 => { + let array = column.as_primitive::(); + encoder.encode_field(&array.value(i))?; + } + DataType::Float64 => { + let array = column.as_primitive::(); + encoder.encode_field(&array.value(i))?; + } + DataType::Utf8 => { + let array = column.as_string::(); + let value = array.value(i); + encoder.encode_field(&value)?; + } + DataType::Utf8View => { + let array = column.as_string_view(); + let value = array.value(i); + encoder.encode_field(&value)?; + } + DataType::LargeUtf8 => { + let array = column.as_string::(); + let value = array.value(i); + encoder.encode_field(&value)?; + } + DataType::Binary => { + let array = column.as_binary::(); + let v = array.value(i); + encode_bytes( + &self.schema.column_schemas()[j], + v, + encoder, + &self.query_ctx, + )?; + } + DataType::BinaryView => { + let array = column.as_binary_view(); + let v = array.value(i); + encode_bytes( + &self.schema.column_schemas()[j], + v, + encoder, + 
&self.query_ctx, + )?; + } + DataType::LargeBinary => { + let array = column.as_binary::(); + let v = array.value(i); + encode_bytes( + &self.schema.column_schemas()[j], + v, + encoder, + &self.query_ctx, + )?; + } + DataType::Date32 | DataType::Date64 => { + let v = if matches!(column.data_type(), DataType::Date32) { + let array = column.as_primitive::(); + array.value(i) + } else { + let array = column.as_primitive::(); + // `Date64` values are milliseconds representation of `Date32` values, + // according to its specification. So we convert the `Date64` value here to + // the `Date32` value to process them unified. + (array.value(i) / 86_400_000) as i32 + }; + let v = Date::new(v); + let date = v.to_chrono_date().map(|v| { + let (style, order) = + *self.query_ctx.configuration_parameter().pg_datetime_style(); + StylingDate(v, style, order) + }); + encoder.encode_field(&date)?; + } + DataType::Timestamp(time_unit, _) => { + let v = match time_unit { + TimeUnit::Second => { + let array = column.as_primitive::(); + array.value(i) + } + TimeUnit::Millisecond => { + let array = column.as_primitive::(); + array.value(i) + } + TimeUnit::Microsecond => { + let array = column.as_primitive::(); + array.value(i) + } + TimeUnit::Nanosecond => { + let array = column.as_primitive::(); + array.value(i) + } + }; + let v = Timestamp::new(v, time_unit.into()); + let datetime = v + .to_chrono_datetime_with_timezone(Some(&self.query_ctx.timezone())) + .map(|v| { + let (style, order) = + *self.query_ctx.configuration_parameter().pg_datetime_style(); + StylingDateTime(v, style, order) + }); + encoder.encode_field(&datetime)?; + } + DataType::Interval(interval_unit) => match interval_unit { + IntervalUnit::YearMonth => { + let array = column.as_primitive::(); + let v: IntervalYearMonth = array.value(i).into(); + encoder.encode_field(&PgInterval::from(v))?; + } + IntervalUnit::DayTime => { + let array = column.as_primitive::(); + let v: IntervalDayTime = array.value(i).into(); + encoder.encode_field(&PgInterval::from(v))?; + } + IntervalUnit::MonthDayNano => { + let array = column.as_primitive::(); + let v: IntervalMonthDayNano = array.value(i).into(); + encoder.encode_field(&PgInterval::from(v))?; + } + }, + DataType::Duration(time_unit) => { + let v = match time_unit { + TimeUnit::Second => { + let array = column.as_primitive::(); + array.value(i) + } + TimeUnit::Millisecond => { + let array = column.as_primitive::(); + array.value(i) + } + TimeUnit::Microsecond => { + let array = column.as_primitive::(); + array.value(i) + } + TimeUnit::Nanosecond => { + let array = column.as_primitive::(); + array.value(i) + } + }; + let d = Duration::new(v, time_unit.into()); + match PgInterval::try_from(d) { + Ok(i) => encoder.encode_field(&i)?, + Err(e) => { + return Err(convert_err(Error::Internal { + err_msg: e.to_string(), + })); + } + } + } + DataType::List(_) => { + let array = column.as_list::(); + let items = array.value(i); + encode_array(&self.query_ctx, items, encoder)?; + } + DataType::Struct(_) => { + encode_struct(&self.query_ctx, Default::default(), encoder)?; + } + DataType::Time32(time_unit) | DataType::Time64(time_unit) => { + let v = match time_unit { + TimeUnit::Second => { + let array = column.as_primitive::(); + Time::new_second(array.value(i) as i64) + } + TimeUnit::Millisecond => { + let array = column.as_primitive::(); + Time::new_millisecond(array.value(i) as i64) + } + TimeUnit::Microsecond => { + let array = column.as_primitive::(); + Time::new_microsecond(array.value(i)) + } + 
TimeUnit::Nanosecond => { + let array = column.as_primitive::(); + Time::new_nanosecond(array.value(i)) + } + }; + encoder.encode_field(&v.to_chrono_time())?; + } + DataType::Decimal128(precision, scale) => { + let array = column.as_primitive::(); + let v = Decimal128::new(array.value(i), *precision, *scale); + encoder.encode_field(&v.to_string())?; + } + _ => { + return Err(convert_err(Error::Internal { + err_msg: format!( + "cannot convert datatype {} to postgres", + column.data_type() + ), + })); } } - }, - Value::Date(v) => { - if let Some(date) = v.to_chrono_date() { - let (style, order) = *query_ctx.configuration_parameter().pg_datetime_style(); - builder.encode_field(&StylingDate(date, style, order)) - } else { - Err(convert_err(Error::Internal { - err_msg: format!("Failed to convert date to postgres type {v:?}",), - })) - } } - Value::Timestamp(v) => { - if let Some(datetime) = v.to_chrono_datetime_with_timezone(Some(&query_ctx.timezone())) - { - let (style, order) = *query_ctx.configuration_parameter().pg_datetime_style(); - builder.encode_field(&StylingDateTime(datetime, style, order)) - } else { - Err(convert_err(Error::Internal { - err_msg: format!("Failed to convert date to postgres type {v:?}",), - })) - } - } - Value::Time(v) => { - if let Some(time) = v.to_chrono_time() { - builder.encode_field(&time) - } else { - Err(convert_err(Error::Internal { - err_msg: format!("Failed to convert time to postgres type {v:?}",), - })) - } - } - Value::IntervalYearMonth(v) => builder.encode_field(&PgInterval::from(v)), - Value::IntervalDayTime(v) => builder.encode_field(&PgInterval::from(v)), - Value::IntervalMonthDayNano(v) => builder.encode_field(&PgInterval::from(v)), - Value::Decimal128(v) => builder.encode_field(&v.to_string()), - Value::Duration(d) => match PgInterval::try_from(d) { - Ok(i) => builder.encode_field(&i), - Err(e) => Err(convert_err(Error::Internal { - err_msg: e.to_string(), - })), - }, - Value::List(values) => encode_array(query_ctx, values, builder), - Value::Struct(values) => encode_struct(query_ctx, values, builder), - Value::Json(inner) => { - let json_value = serde_json::Value::try_from(*inner) - .map_err(|e| PgWireError::ApiError(Box::new(e)))?; - builder.encode_field(&json_value) + Ok(()) + } +} + +fn encode_bytes( + schema: &ColumnSchema, + v: &[u8], + encoder: &mut DataRowEncoder, + query_ctx: &QueryContextRef, +) -> PgWireResult<()> { + if let ConcreteDataType::Json(_) = &schema.data_type { + let s = jsonb_to_string(v).map_err(convert_err)?; + encoder.encode_field(&s) + } else { + let bytea_output = query_ctx.configuration_parameter().postgres_bytea_output(); + match *bytea_output { + PGByteaOutputValue::ESCAPE => encoder.encode_field(&EscapeOutputBytea(v)), + PGByteaOutputValue::HEX => encoder.encode_field(&HexOutputBytea(v)), } } } @@ -1082,11 +1310,17 @@ pub(super) fn param_types_to_pg_types( mod test { use std::sync::Arc; - use common_time::Timestamp; - use common_time::interval::IntervalUnit; - use common_time::timestamp::TimeUnit; + use arrow::array::{ + Float64Builder, Int64Builder, ListBuilder, StringBuilder, TimestampSecondBuilder, + }; + use arrow_schema::Field; use datatypes::schema::{ColumnSchema, Schema}; - use datatypes::value::ListValue; + use datatypes::vectors::{ + BinaryVector, BooleanVector, DateVector, Float32Vector, Float64Vector, Int8Vector, + Int16Vector, Int32Vector, Int64Vector, IntervalDayTimeVector, IntervalMonthDayNanoVector, + IntervalYearMonthVector, ListVector, NullVector, StringVector, TimeSecondVector, + 
TimestampSecondVector, UInt8Vector, UInt16Vector, UInt32Vector, UInt64Vector, VectorRef, + }; use pgwire::api::Type; use pgwire::api::results::{FieldFormat, FieldInfo}; use session::context::QueryContextBuilder; @@ -1194,12 +1428,8 @@ mod test { FieldInfo::new("uint32s".into(), None, None, Type::INT4, FieldFormat::Text), FieldInfo::new("uint64s".into(), None, None, Type::INT8, FieldFormat::Text), FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text), - FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text), - FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text), - FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text), - FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text), FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text), FieldInfo::new( "float32s".into(), @@ -1208,34 +1438,6 @@ mod test { Type::FLOAT4, FieldFormat::Text, ), - FieldInfo::new( - "float32s".into(), - None, - None, - Type::FLOAT4, - FieldFormat::Text, - ), - FieldInfo::new( - "float32s".into(), - None, - None, - Type::FLOAT4, - FieldFormat::Text, - ), - FieldInfo::new( - "float64s".into(), - None, - None, - Type::FLOAT8, - FieldFormat::Text, - ), - FieldInfo::new( - "float64s".into(), - None, - None, - Type::FLOAT8, - FieldFormat::Text, - ), FieldInfo::new( "float64s".into(), None, @@ -1317,95 +1519,170 @@ mod test { ), ]; - let datatypes = vec![ - ConcreteDataType::null_datatype(), - ConcreteDataType::boolean_datatype(), - ConcreteDataType::uint8_datatype(), - ConcreteDataType::uint16_datatype(), - ConcreteDataType::uint32_datatype(), - ConcreteDataType::uint64_datatype(), - ConcreteDataType::int8_datatype(), - ConcreteDataType::int8_datatype(), - ConcreteDataType::int16_datatype(), - ConcreteDataType::int16_datatype(), - ConcreteDataType::int32_datatype(), - ConcreteDataType::int32_datatype(), - ConcreteDataType::int64_datatype(), - ConcreteDataType::int64_datatype(), - ConcreteDataType::float32_datatype(), - ConcreteDataType::float32_datatype(), - ConcreteDataType::float32_datatype(), - ConcreteDataType::float64_datatype(), - ConcreteDataType::float64_datatype(), - ConcreteDataType::float64_datatype(), - ConcreteDataType::string_datatype(), - ConcreteDataType::binary_datatype(), - ConcreteDataType::date_datatype(), - ConcreteDataType::time_datatype(TimeUnit::Second), - ConcreteDataType::timestamp_datatype(TimeUnit::Second), - ConcreteDataType::interval_datatype(IntervalUnit::YearMonth), - ConcreteDataType::interval_datatype(IntervalUnit::DayTime), - ConcreteDataType::interval_datatype(IntervalUnit::MonthDayNano), - ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::int64_datatype())), - ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::float64_datatype())), - ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::string_datatype())), - ConcreteDataType::list_datatype( - Arc::new(ConcreteDataType::timestamp_second_datatype()), + let arrow_schema = arrow_schema::Schema::new(vec![ + Field::new("x", DataType::Null, true), + Field::new("x", DataType::Boolean, true), + Field::new("x", DataType::UInt8, true), + Field::new("x", DataType::UInt16, true), + Field::new("x", DataType::UInt32, true), + Field::new("x", DataType::UInt64, true), + Field::new("x", DataType::Int8, true), + Field::new("x", DataType::Int16, true), + 
Field::new("x", DataType::Int32, true), + Field::new("x", DataType::Int64, true), + Field::new("x", DataType::Float32, true), + Field::new("x", DataType::Float64, true), + Field::new("x", DataType::Utf8, true), + Field::new("x", DataType::Binary, true), + Field::new("x", DataType::Date32, true), + Field::new("x", DataType::Time32(TimeUnit::Second), true), + Field::new("x", DataType::Timestamp(TimeUnit::Second, None), true), + Field::new("x", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("x", DataType::Interval(IntervalUnit::DayTime), true), + Field::new("x", DataType::Interval(IntervalUnit::MonthDayNano), true), + Field::new( + "x", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, ), - ]; + Field::new( + "x", + DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), + true, + ), + Field::new( + "x", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + Field::new( + "x", + DataType::List(Arc::new(Field::new( + "item", + DataType::Timestamp(TimeUnit::Second, None), + true, + ))), + true, + ), + ]); + + let mut builder = ListBuilder::new(Int64Builder::new()); + builder.append_value([Some(1i64), None, Some(2)]); + builder.append_null(); + builder.append_value([Some(-1i64), None, Some(-2)]); + let i64_list_array = builder.finish(); + + let mut builder = ListBuilder::new(Float64Builder::new()); + builder.append_value([Some(1.0f64), None, Some(2.0)]); + builder.append_null(); + builder.append_value([Some(-1.0f64), None, Some(-2.0)]); + let f64_list_array = builder.finish(); + + let mut builder = ListBuilder::new(StringBuilder::new()); + builder.append_value([Some("a"), None, Some("b")]); + builder.append_null(); + builder.append_value([Some("c"), None, Some("d")]); + let string_list_array = builder.finish(); + + let mut builder = ListBuilder::new(TimestampSecondBuilder::new()); + builder.append_value([Some(1i64), None, Some(2)]); + builder.append_null(); + builder.append_value([Some(3i64), None, Some(4)]); + let timestamp_list_array = builder.finish(); + let values = vec![ - Value::Null, - Value::Boolean(true), - Value::UInt8(u8::MAX), - Value::UInt16(u16::MAX), - Value::UInt32(u32::MAX), - Value::UInt64(u64::MAX), - Value::Int8(i8::MAX), - Value::Int8(i8::MIN), - Value::Int16(i16::MAX), - Value::Int16(i16::MIN), - Value::Int32(i32::MAX), - Value::Int32(i32::MIN), - Value::Int64(i64::MAX), - Value::Int64(i64::MIN), - Value::Float32(f32::MAX.into()), - Value::Float32(f32::MIN.into()), - Value::Float32(0f32.into()), - Value::Float64(f64::MAX.into()), - Value::Float64(f64::MIN.into()), - Value::Float64(0f64.into()), - Value::String("greptime".into()), - Value::Binary("greptime".as_bytes().into()), - Value::Date(1001i32.into()), - Value::Time(1001i64.into()), - Value::Timestamp(1000001i64.into()), - Value::IntervalYearMonth(IntervalYearMonth::new(1)), - Value::IntervalDayTime(IntervalDayTime::new(1, 10)), - Value::IntervalMonthDayNano(IntervalMonthDayNano::new(1, 1, 10)), - Value::List(ListValue::new( - vec![Value::Int64(1i64)], - Arc::new(ConcreteDataType::int64_datatype()), - )), - Value::List(ListValue::new( - vec![Value::Float64(1.0f64.into())], - Arc::new(ConcreteDataType::float64_datatype()), - )), - Value::List(ListValue::new( - vec![Value::String("tom".into())], - Arc::new(ConcreteDataType::string_datatype()), - )), - Value::List(ListValue::new( - vec![Value::Timestamp(Timestamp::new(1i64, TimeUnit::Second))], - Arc::new(ConcreteDataType::timestamp_second_datatype()), - )), + 
Arc::new(NullVector::new(3)) as VectorRef, + Arc::new(BooleanVector::from(vec![Some(true), Some(false), None])), + Arc::new(UInt8Vector::from(vec![Some(u8::MAX), Some(u8::MIN), None])), + Arc::new(UInt16Vector::from(vec![ + Some(u16::MAX), + Some(u16::MIN), + None, + ])), + Arc::new(UInt32Vector::from(vec![ + Some(u32::MAX), + Some(u32::MIN), + None, + ])), + Arc::new(UInt64Vector::from(vec![ + Some(u64::MAX), + Some(u64::MIN), + None, + ])), + Arc::new(Int8Vector::from(vec![Some(i8::MAX), Some(i8::MIN), None])), + Arc::new(Int16Vector::from(vec![ + Some(i16::MAX), + Some(i16::MIN), + None, + ])), + Arc::new(Int32Vector::from(vec![ + Some(i32::MAX), + Some(i32::MIN), + None, + ])), + Arc::new(Int64Vector::from(vec![ + Some(i64::MAX), + Some(i64::MIN), + None, + ])), + Arc::new(Float32Vector::from(vec![ + None, + Some(f32::MAX), + Some(f32::MIN), + ])), + Arc::new(Float64Vector::from(vec![ + None, + Some(f64::MAX), + Some(f64::MIN), + ])), + Arc::new(StringVector::from(vec![ + None, + Some("hello"), + Some("greptime"), + ])), + Arc::new(BinaryVector::from(vec![ + None, + Some("hello".as_bytes().to_vec()), + Some("world".as_bytes().to_vec()), + ])), + Arc::new(DateVector::from(vec![Some(1001), None, Some(1)])), + Arc::new(TimeSecondVector::from(vec![Some(1001), None, Some(1)])), + Arc::new(TimestampSecondVector::from(vec![ + Some(1000001), + None, + Some(1), + ])), + Arc::new(IntervalYearMonthVector::from(vec![Some(1), None, Some(2)])), + Arc::new(IntervalDayTimeVector::from(vec![ + Some(arrow::datatypes::IntervalDayTime::new(1, 1)), + None, + Some(arrow::datatypes::IntervalDayTime::new(2, 2)), + ])), + Arc::new(IntervalMonthDayNanoVector::from(vec![ + Some(arrow::datatypes::IntervalMonthDayNano::new(1, 1, 10)), + None, + Some(arrow::datatypes::IntervalMonthDayNano::new(2, 2, 20)), + ])), + Arc::new(ListVector::from(i64_list_array)), + Arc::new(ListVector::from(f64_list_array)), + Arc::new(ListVector::from(string_list_array)), + Arc::new(ListVector::from(timestamp_list_array)), ]; + let record_batch = + RecordBatch::new(Arc::new(arrow_schema.try_into().unwrap()), values).unwrap(); + let query_context = QueryContextBuilder::default() .configuration_parameter(Default::default()) .build() .into(); - let mut builder = DataRowEncoder::new(Arc::new(schema)); - for (value, datatype) in values.into_iter().zip(datatypes) { - encode_value(&query_context, value, &mut builder, &datatype).unwrap(); + let schema = Arc::new(schema); + + let rows = RecordBatchRowIterator::new(query_context, schema.clone(), record_batch) + .filter_map(|x| x.ok()) + .collect::>(); + assert_eq!(rows.len(), 3); + for row in rows { + assert_eq!(row.field_count, schema.len() as i16); } } From b78ee1743cf7121e4144864c679475e81b8bc996 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 24 Oct 2025 11:36:07 +0800 Subject: [PATCH 005/149] feat: add a missing pg_catalog function current_database (#7138) feat: add a missing function current_database --- src/common/function/src/system/pg_catalog.rs | 27 ++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/common/function/src/system/pg_catalog.rs b/src/common/function/src/system/pg_catalog.rs index c768aae248..b66e208ea9 100644 --- a/src/common/function/src/system/pg_catalog.rs +++ b/src/common/function/src/system/pg_catalog.rs @@ -32,10 +32,36 @@ use crate::system::define_nullary_udf; const CURRENT_SCHEMA_FUNCTION_NAME: &str = "current_schema"; const CURRENT_SCHEMAS_FUNCTION_NAME: &str = "current_schemas"; const SESSION_USER_FUNCTION_NAME: &str = "session_user"; +const 
CURRENT_DATABASE_FUNCTION_NAME: &str = "current_database"; define_nullary_udf!(CurrentSchemaFunction); define_nullary_udf!(CurrentSchemasFunction); define_nullary_udf!(SessionUserFunction); +define_nullary_udf!(CurrentDatabaseFunction); + +impl Function for CurrentDatabaseFunction { + fn name(&self) -> &str { + CURRENT_DATABASE_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8View) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let func_ctx = find_function_context(&args)?; + let db = func_ctx.query_ctx.current_catalog().to_string(); + + Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(db)))) + } +} // Though "current_schema" can be aliased to "database", to not cause any breaking changes, // we are not doing it: not until https://github.com/apache/datafusion/issues/17469 is resolved. @@ -141,6 +167,7 @@ impl PGCatalogFunction { registry.register_scalar(CurrentSchemaFunction::default()); registry.register_scalar(CurrentSchemasFunction::default()); registry.register_scalar(SessionUserFunction::default()); + registry.register_scalar(CurrentDatabaseFunction::default()); registry.register(pg_catalog::format_type::create_format_type_udf()); registry.register(pg_catalog::create_pg_get_partkeydef_udf()); registry.register(pg_catalog::has_privilege_udf::create_has_privilege_udf( From 4c70b4c31d5abd4ffc47c6acc27d1e7546c4a6cd Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 24 Oct 2025 13:53:48 +0800 Subject: [PATCH 006/149] feat: store estimated series num in file meta (#7126) * feat: add num_series to FileMeta Signed-off-by: evenyag * feat: add SeriesEstimator to collect num_series Signed-off-by: evenyag * fix: set num_series in compactor Signed-off-by: evenyag * chore: print num_series in Debug for FileMeta Signed-off-by: evenyag * style: fmt code Signed-off-by: evenyag * style: fix clippy Signed-off-by: evenyag * fix: increase series count when next ts <= last Signed-off-by: evenyag * test: add tests for SeriesEstimator Signed-off-by: evenyag * feat: add num_series to ssts_manifest table Signed-off-by: evenyag * test: update sqlness tests Signed-off-by: evenyag * test: fix metric engine list entry test Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/metric-engine/src/engine/flush.rs | 12 +- src/mito2/src/compaction/compactor.rs | 1 + src/mito2/src/compaction/test_util.rs | 1 + src/mito2/src/engine/basic_test.rs | 12 +- src/mito2/src/flush.rs | 1 + src/mito2/src/manifest/tests/checkpoint.rs | 2 + src/mito2/src/memtable/bulk/part.rs | 10 +- src/mito2/src/region.rs | 1 + src/mito2/src/remap_manifest.rs | 1 + src/mito2/src/sst.rs | 428 ++++++++++++++++++ src/mito2/src/sst/file.rs | 7 + src/mito2/src/sst/file_purger.rs | 2 + src/mito2/src/sst/file_ref.rs | 1 + src/mito2/src/sst/parquet.rs | 3 + src/mito2/src/sst/parquet/writer.rs | 12 +- src/mito2/src/test_util/sst_util.rs | 1 + src/mito2/src/test_util/version_util.rs | 2 + src/store-api/src/sst_entry.rs | 27 +- .../common/information_schema/ssts.result | 33 +- .../common/system/information_schema.result | 13 +- 20 files changed, 527 insertions(+), 43 deletions(-) diff --git a/src/metric-engine/src/engine/flush.rs b/src/metric-engine/src/engine/flush.rs index 23899cbb05..c82862583d 100644 --- a/src/metric-engine/src/engine/flush.rs +++ b/src/metric-engine/src/engine/flush.rs @@ -127,12 +127,12 @@ mod tests { assert_eq!( debug_format, r#" -ManifestSstEntry 
{ table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"# +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), 
origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"# ); // list from storage let storage_entries = mito diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index ba267f4a48..2b871947c0 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -433,6 +433,7 @@ impl Compactor for DefaultCompactor { num_row_groups: sst_info.num_row_groups, sequence: max_sequence, partition_expr: partition_expr.clone(), + num_series: sst_info.num_series, }) .collect::>(); let output_file_names = diff --git a/src/mito2/src/compaction/test_util.rs b/src/mito2/src/compaction/test_util.rs index b785d36bcb..3dc212ff4d 100644 --- a/src/mito2/src/compaction/test_util.rs +++ b/src/mito2/src/compaction/test_util.rs @@ -78,6 +78,7 @@ pub fn new_file_handle_with_size_and_sequence( index_file_size: 0, num_rows: 0, 
num_row_groups: 0, + num_series: 0, sequence: NonZeroU64::new(sequence), partition_expr: None, }, diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index 39f2366659..ca62f384c7 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -859,9 +859,9 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) { #[tokio::test] async fn test_list_ssts() { test_list_ssts_with_format(false, r#" -ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#" +ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#" StorageSstEntry { file_path: "test/11_0000000001/.parquet", file_size: None, last_modified_ms: None, 
node_id: None } StorageSstEntry { file_path: "test/11_0000000001/index/.puffin", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000002/.parquet", file_size: None, last_modified_ms: None, node_id: None } @@ -869,9 +869,9 @@ StorageSstEntry { file_path: "test/11_0000000002/index/.puffin", file_s StorageSstEntry { file_path: "test/22_0000000042/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/22_0000000042/index/.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await; test_list_ssts_with_format(true, r#" -ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#" +ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 
94489280554(22, 42), node_id: None, visible: true }"#, r#" StorageSstEntry { file_path: "test/11_0000000001/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000001/index/.puffin", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000002/.parquet", file_size: None, last_modified_ms: None, node_id: None } diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index eb5e605ce1..ddad947f8a 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -641,6 +641,7 @@ impl RegionFlushTask { num_row_groups: sst_info.num_row_groups, sequence: NonZeroU64::new(max_sequence), partition_expr, + num_series: sst_info.num_series, } } diff --git a/src/mito2/src/manifest/tests/checkpoint.rs b/src/mito2/src/manifest/tests/checkpoint.rs index e10d3aad46..a99a7878ad 100644 --- a/src/mito2/src/manifest/tests/checkpoint.rs +++ b/src/mito2/src/manifest/tests/checkpoint.rs @@ -269,6 +269,7 @@ async fn checkpoint_with_different_compression_types() { num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, }; let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit { files_to_add: vec![file_meta], @@ -334,6 +335,7 @@ fn generate_action_lists(num: usize) -> (Vec, Vec) num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, }; let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit { files_to_add: vec![file_meta], diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs index 4eb2655755..21ac141cff 100644 --- a/src/mito2/src/memtable/bulk/part.rs +++ b/src/mito2/src/memtable/bulk/part.rs @@ -69,7 +69,7 @@ use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder, ReadFormat}; use crate::sst::parquet::helper::parse_parquet_metadata; use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo}; -use crate::sst::to_sst_arrow_schema; +use crate::sst::{SeriesEstimator, to_sst_arrow_schema}; const INIT_DICT_VALUE_CAPACITY: usize = 8; @@ -563,6 +563,7 @@ impl EncodedBulkPart { num_row_groups: self.metadata.parquet_metadata.num_row_groups() as u64, file_metadata: Some(self.metadata.parquet_metadata.clone()), index_metadata: IndexOutput::default(), + num_series: self.metadata.num_series, } } @@ -602,6 +603,8 @@ pub struct BulkPartMeta { pub parquet_metadata: Arc, /// Part region schema. pub region_metadata: RegionMetadataRef, + /// Number of series. + pub num_series: u64, } /// Metrics for encoding a part. 
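The hunks that follow wire a SeriesEstimator into BulkPartEncoder: each flat record batch is fed to the estimator and the resulting count is stored as BulkPartMeta::num_series. The estimator itself (defined later in this patch, in src/mito2/src/sst.rs) counts series by watching for timestamps that fail to strictly increase in a time-ordered stream. A minimal standalone sketch of that counting rule, with illustrative names only and without the cross-batch state the real type keeps:

// Sketch of the per-batch counting rule; the engine's SeriesEstimator applies the
// same comparison to Arrow timestamp arrays and also carries the last timestamp
// across batches so boundaries between batches are counted consistently.
fn estimate_series(timestamps: &[i64]) -> u64 {
    if timestamps.is_empty() {
        return 0;
    }
    let mut count = 1;
    for pair in timestamps.windows(2) {
        // A non-increasing step (including an equal timestamp) starts a new series,
        // matching the `values[i] >= values[i + 1]` check added later in this patch.
        if pair[0] >= pair[1] {
            count += 1;
        }
    }
    count
}

fn main() {
    // Mirrors test_series_estimator_flat_multiple_boundaries_within_batch: 5>=4 and 6>=3.
    assert_eq!(estimate_series(&[1, 2, 5, 4, 6, 3, 7]), 3);
    // Mirrors test_series_estimator_flat_equal_timestamps.
    assert_eq!(estimate_series(&[1, 2, 2, 3, 3, 3, 4]), 4);
}

The real estimator keeps the last timestamp seen across batches, so a batch whose first timestamp does not exceed the previous batch's last timestamp is counted as a new series in the same way.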
@@ -669,6 +672,7 @@ impl BulkPartEncoder { let mut writer = ArrowWriter::try_new(&mut buf, arrow_schema, self.writer_props.clone()) .context(EncodeMemtableSnafu)?; let mut total_rows = 0; + let mut series_estimator = SeriesEstimator::default(); // Process each batch from the iterator let mut iter_start = Instant::now(); @@ -679,6 +683,7 @@ impl BulkPartEncoder { continue; } + series_estimator.update_flat(&batch); metrics.raw_size += record_batch_estimated_size(&batch); let write_start = Instant::now(); writer.write(&batch).context(EncodeMemtableSnafu)?; @@ -701,6 +706,7 @@ impl BulkPartEncoder { let buf = Bytes::from(buf); let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?); + let num_series = series_estimator.finish(); Ok(Some(EncodedBulkPart { data: buf, @@ -710,6 +716,7 @@ impl BulkPartEncoder { min_timestamp, parquet_metadata, region_metadata: self.metadata.clone(), + num_series, }, })) } @@ -742,6 +749,7 @@ impl BulkPartEncoder { min_timestamp: part.min_timestamp, parquet_metadata, region_metadata: self.metadata.clone(), + num_series: part.estimated_series_count() as u64, }, })) } diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index aac7090174..ee49da763e 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -608,6 +608,7 @@ impl MitoRegion { index_file_size, num_rows: meta.num_rows, num_row_groups: meta.num_row_groups, + num_series: Some(meta.num_series), min_ts: meta.time_range.0, max_ts: meta.time_range.1, sequence: meta.sequence.map(|s| s.get()), diff --git a/src/mito2/src/remap_manifest.rs b/src/mito2/src/remap_manifest.rs index 6800a4bf4d..a10159401b 100644 --- a/src/mito2/src/remap_manifest.rs +++ b/src/mito2/src/remap_manifest.rs @@ -431,6 +431,7 @@ mod tests { num_row_groups: 1, sequence: NonZeroU64::new(1), partition_expr, + num_series: 1, } } diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index 1d94e74eaa..f3f51bdc08 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -21,7 +21,9 @@ use common_base::readable_size::ReadableSize; use datatypes::arrow::datatypes::{ DataType as ArrowDataType, Field, FieldRef, Fields, Schema, SchemaRef, }; +use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::ConcreteDataType; +use datatypes::timestamp::timestamp_array_to_primitive; use serde::{Deserialize, Serialize}; use store_api::codec::PrimaryKeyEncoding; use store_api::metadata::RegionMetadata; @@ -29,6 +31,9 @@ use store_api::storage::consts::{ OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME, }; +use crate::read::Batch; +use crate::sst::parquet::flat_format::time_index_column_index; + pub mod file; pub mod file_purger; pub mod file_ref; @@ -241,3 +246,426 @@ fn plain_internal_fields() -> [FieldRef; 2] { Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)), ] } + +/// Gets the estimated number of series from record batches. +/// +/// This struct tracks the last timestamp value to detect series boundaries +/// by observing when timestamps decrease (indicating a new series). +#[derive(Default)] +pub(crate) struct SeriesEstimator { + /// The last timestamp value seen + last_timestamp: Option, + /// The estimated number of series + series_count: u64, +} + +impl SeriesEstimator { + /// Updates the estimator with a new Batch. + /// + /// Since each Batch contains only one series, this increments the series count + /// and updates the last timestamp. 
+ pub(crate) fn update(&mut self, batch: &Batch) { + let Some(last_ts) = batch.last_timestamp() else { + return; + }; + + // Checks if there's a boundary between the last batch and this batch + if let Some(prev_last_ts) = self.last_timestamp { + // If the first timestamp of this batch is less than the last timestamp + // we've seen, it indicates a new series + if let Some(first_ts) = batch.first_timestamp() + && first_ts.value() <= prev_last_ts + { + self.series_count += 1; + } + } else { + // First batch, counts as first series + self.series_count = 1; + } + + // Updates the last timestamp + self.last_timestamp = Some(last_ts.value()); + } + + /// Updates the estimator with a new record batch in flat format. + /// + /// This method examines the time index column to detect series boundaries. + pub(crate) fn update_flat(&mut self, record_batch: &RecordBatch) { + let batch_rows = record_batch.num_rows(); + if batch_rows == 0 { + return; + } + + let time_index_pos = time_index_column_index(record_batch.num_columns()); + let timestamps = record_batch.column(time_index_pos); + let Some((ts_values, _unit)) = timestamp_array_to_primitive(timestamps) else { + return; + }; + let values = ts_values.values(); + + // Checks if there's a boundary between the last batch and this batch + if let Some(last_ts) = self.last_timestamp { + if values[0] <= last_ts { + self.series_count += 1; + } + } else { + // First batch, counts as first series + self.series_count = 1; + } + + // Counts series boundaries within this batch. + for i in 0..batch_rows - 1 { + // We assumes the same timestamp as a new series, which is different from + // how we split batches. + if values[i] >= values[i + 1] { + self.series_count += 1; + } + } + + // Updates the last timestamp + self.last_timestamp = Some(values[batch_rows - 1]); + } + + /// Returns the estimated number of series. 
+ pub(crate) fn finish(&mut self) -> u64 { + self.last_timestamp = None; + let count = self.series_count; + self.series_count = 0; + + count + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use api::v1::OpType; + use datatypes::arrow::array::{ + BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder, + UInt32Array, UInt64Array, + }; + use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; + use datatypes::arrow::record_batch::RecordBatch; + + use super::*; + use crate::read::{Batch, BatchBuilder}; + + fn new_batch( + primary_key: &[u8], + timestamps: &[i64], + sequences: &[u64], + op_types: &[OpType], + ) -> Batch { + let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())); + let sequences = Arc::new(UInt64Array::from(sequences.to_vec())); + let mut op_type_builder = UInt8Builder::with_capacity(op_types.len()); + for op_type in op_types { + op_type_builder.append_value(*op_type as u8); + } + let op_types = Arc::new(UInt8Array::from( + op_types.iter().map(|op| *op as u8).collect::>(), + )); + + let mut builder = BatchBuilder::new(primary_key.to_vec()); + builder + .timestamps_array(timestamps) + .unwrap() + .sequences_array(sequences) + .unwrap() + .op_types_array(op_types) + .unwrap(); + builder.build().unwrap() + } + + fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch { + // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type] + let num_cols = 4; // time_index + 3 internal columns + let time_index_pos = time_index_column_index(num_cols); + assert_eq!(time_index_pos, 0); // For 4 columns, time index should be at position 0 + + let time_array = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())); + let pk_array = Arc::new(DictionaryArray::new( + UInt32Array::from(vec![0; timestamps.len()]), + Arc::new(BinaryArray::from(vec![b"test".as_slice()])), + )); + let seq_array = Arc::new(UInt64Array::from(vec![1; timestamps.len()])); + let op_array = Arc::new(UInt8Array::from(vec![1; timestamps.len()])); + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "time", + ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new_dictionary( + "__primary_key", + ArrowDataType::UInt32, + ArrowDataType::Binary, + false, + ), + Field::new("__sequence", ArrowDataType::UInt64, false), + Field::new("__op_type", ArrowDataType::UInt8, false), + ])); + + RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap() + } + + #[test] + fn test_series_estimator_empty_batch() { + let mut estimator = SeriesEstimator::default(); + let batch = new_batch(b"test", &[], &[], &[]); + estimator.update(&batch); + assert_eq!(0, estimator.finish()); + } + + #[test] + fn test_series_estimator_single_batch() { + let mut estimator = SeriesEstimator::default(); + let batch = new_batch( + b"test", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch); + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_multiple_batches_same_series() { + let mut estimator = SeriesEstimator::default(); + + // First batch with timestamps 1, 2, 3 + let batch1 = new_batch( + b"test", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + // Second batch with timestamps 4, 5, 6 (continuation) + let batch2 = new_batch( + b"test", + &[4, 5, 6], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); 
+ + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_new_series_detected() { + let mut estimator = SeriesEstimator::default(); + + // First batch with timestamps 1, 2, 3 + let batch1 = new_batch( + b"pk0", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + // Second batch with timestamps 2, 3, 4 (timestamp goes back, new series) + let batch2 = new_batch( + b"pk1", + &[2, 3, 4], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_equal_timestamp_boundary() { + let mut estimator = SeriesEstimator::default(); + + // First batch ending at timestamp 5 + let batch1 = new_batch( + b"test", + &[1, 2, 5], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + // Second batch starting at timestamp 5 (equal, indicates new series) + let batch2 = new_batch( + b"test", + &[5, 6, 7], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_finish_resets_state() { + let mut estimator = SeriesEstimator::default(); + + let batch1 = new_batch( + b"test", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + assert_eq!(1, estimator.finish()); + + // After finish, state should be reset + let batch2 = new_batch( + b"test", + &[4, 5, 6], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_empty_batch() { + let mut estimator = SeriesEstimator::default(); + let record_batch = new_flat_record_batch(&[]); + estimator.update_flat(&record_batch); + assert_eq!(0, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_single_batch() { + let mut estimator = SeriesEstimator::default(); + let record_batch = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&record_batch); + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_series_boundary_within_batch() { + let mut estimator = SeriesEstimator::default(); + // Timestamps decrease from 3 to 2, indicating a series boundary + let record_batch = new_flat_record_batch(&[1, 2, 3, 2, 4, 5]); + estimator.update_flat(&record_batch); + // Should detect boundary at position 3 (3 >= 2) + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_multiple_boundaries_within_batch() { + let mut estimator = SeriesEstimator::default(); + // Multiple series boundaries: 5>=4, 6>=3 + let record_batch = new_flat_record_batch(&[1, 2, 5, 4, 6, 3, 7]); + estimator.update_flat(&record_batch); + assert_eq!(3, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_equal_timestamps() { + let mut estimator = SeriesEstimator::default(); + // Equal timestamps are considered as new series + let record_batch = new_flat_record_batch(&[1, 2, 2, 3, 3, 3, 4]); + estimator.update_flat(&record_batch); + // Boundaries at: 2>=2, 3>=3, 3>=3 + assert_eq!(4, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_multiple_batches_continuation() { + let mut estimator = SeriesEstimator::default(); + + // First batch: timestamps 1, 2, 3 + let batch1 = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&batch1); + + // Second batch: timestamps 4, 5, 6 (continuation) + 
let batch2 = new_flat_record_batch(&[4, 5, 6]); + estimator.update_flat(&batch2); + + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_multiple_batches_new_series() { + let mut estimator = SeriesEstimator::default(); + + // First batch: timestamps 1, 2, 3 + let batch1 = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&batch1); + + // Second batch: timestamps 2, 3, 4 (goes back to 2, new series) + let batch2 = new_flat_record_batch(&[2, 3, 4]); + estimator.update_flat(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_boundary_at_batch_edge_equal() { + let mut estimator = SeriesEstimator::default(); + + // First batch ending at 5 + let batch1 = new_flat_record_batch(&[1, 2, 5]); + estimator.update_flat(&batch1); + + // Second batch starting at 5 (equal timestamp, new series) + let batch2 = new_flat_record_batch(&[5, 6, 7]); + estimator.update_flat(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_mixed_batches() { + let mut estimator = SeriesEstimator::default(); + + // Batch 1: single series [10, 20, 30] + let batch1 = new_flat_record_batch(&[10, 20, 30]); + estimator.update_flat(&batch1); + + // Batch 2: starts new series [5, 15], boundary within batch [15, 10, 25] + let batch2 = new_flat_record_batch(&[5, 15, 10, 25]); + estimator.update_flat(&batch2); + + // Batch 3: continues from 25 to [30, 35] + let batch3 = new_flat_record_batch(&[30, 35]); + estimator.update_flat(&batch3); + + // Expected: 1 (batch1) + 1 (batch2 start) + 1 (within batch2) = 3 + assert_eq!(3, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_descending_timestamps() { + let mut estimator = SeriesEstimator::default(); + // Strictly descending timestamps - each pair creates a boundary + let record_batch = new_flat_record_batch(&[10, 9, 8, 7, 6]); + estimator.update_flat(&record_batch); + // Boundaries: 10>=9, 9>=8, 8>=7, 7>=6 = 4 boundaries + 1 initial = 5 series + assert_eq!(5, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_finish_resets_state() { + let mut estimator = SeriesEstimator::default(); + + let batch1 = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&batch1); + + assert_eq!(1, estimator.finish()); + + // After finish, state should be reset + let batch2 = new_flat_record_batch(&[4, 5, 6]); + estimator.update_flat(&batch2); + + assert_eq!(1, estimator.finish()); + } +} diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index 4ddde55746..ae255e9407 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -175,6 +175,10 @@ pub struct FileMeta { deserialize_with = "deserialize_partition_expr" )] pub partition_expr: Option, + /// Number of series in the file. + /// + /// The number is 0 if the series number is not available. 
+ pub num_series: u64, } impl Debug for FileMeta { @@ -210,6 +214,7 @@ impl Debug for FileMeta { } }) .field("partition_expr", &self.partition_expr) + .field("num_series", &self.num_series) .finish() } } @@ -458,6 +463,7 @@ mod tests { num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, } } @@ -503,6 +509,7 @@ mod tests { num_row_groups: 0, sequence: None, partition_expr: Some(partition_expr.clone()), + num_series: 0, }; // Test serialization/deserialization diff --git a/src/mito2/src/sst/file_purger.rs b/src/mito2/src/sst/file_purger.rs index 7bd0e6b515..c5197ea2fb 100644 --- a/src/mito2/src/sst/file_purger.rs +++ b/src/mito2/src/sst/file_purger.rs @@ -236,6 +236,7 @@ mod tests { num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, }, file_purger, ); @@ -302,6 +303,7 @@ mod tests { num_row_groups: 1, sequence: NonZeroU64::new(4096), partition_expr: None, + num_series: 0, }, file_purger, ); diff --git a/src/mito2/src/sst/file_ref.rs b/src/mito2/src/sst/file_ref.rs index c8b86ed0fd..de071b3f04 100644 --- a/src/mito2/src/sst/file_ref.rs +++ b/src/mito2/src/sst/file_ref.rs @@ -259,6 +259,7 @@ mod tests { num_row_groups: 1, sequence: NonZeroU64::new(4096), partition_expr: None, + num_series: 0, }; file_ref_mgr.add_file(&file_meta); diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 9b56ffd4ae..83cd17acc8 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -84,6 +84,8 @@ pub struct SstInfo { pub file_metadata: Option>, /// Index Meta Data pub index_metadata: IndexOutput, + /// Number of series + pub num_series: u64, } #[cfg(test)] @@ -766,6 +768,7 @@ mod tests { .expect("partition expression should be valid JSON"), None => None, }, + num_series: 0, }, Arc::new(NoopFilePurger), ); diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index 01e1e95a9c..d52615690f 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -57,7 +57,9 @@ use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index} use crate::sst::parquet::format::PrimaryKeyWriteFormat; use crate::sst::parquet::helper::parse_parquet_metadata; use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo, WriteOptions}; -use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions}; +use crate::sst::{ + DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator, +}; /// Parquet SST writer. pub struct ParquetWriter { @@ -176,7 +178,7 @@ where ) -> Result<()> { // maybe_init_writer will re-create a new file. if let Some(mut current_writer) = mem::take(&mut self.writer) { - let stats = mem::take(stats); + let mut stats = mem::take(stats); // At least one row has been written. assert!(stats.num_rows > 0); @@ -211,6 +213,7 @@ where // convert FileMetaData to ParquetMetaData let parquet_metadata = parse_parquet_metadata(file_meta)?; + let num_series = stats.series_estimator.finish(); ssts.push(SstInfo { file_id: self.current_file, time_range, @@ -219,6 +222,7 @@ where num_row_groups: parquet_metadata.num_row_groups() as u64, file_metadata: Some(Arc::new(parquet_metadata)), index_metadata: index_output, + num_series, }); self.current_file = FileId::random(); self.bytes_written.store(0, Ordering::Relaxed) @@ -496,6 +500,8 @@ struct SourceStats { num_rows: usize, /// Time range of fetched batches. time_range: Option<(Timestamp, Timestamp)>, + /// Series estimator for computing num_series. 
+ series_estimator: SeriesEstimator, } impl SourceStats { @@ -505,6 +511,7 @@ impl SourceStats { } self.num_rows += batch.num_rows(); + self.series_estimator.update(batch); // Safety: batch is not empty. let (min_in_batch, max_in_batch) = ( batch.first_timestamp().unwrap(), @@ -524,6 +531,7 @@ impl SourceStats { } self.num_rows += record_batch.num_rows(); + self.series_estimator.update_flat(record_batch); // Get the timestamp column by index let time_index_col_idx = time_index_column_index(record_batch.num_columns()); diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index 5eacf06bd5..fc29ca0826 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs @@ -127,6 +127,7 @@ pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64) index_file_size: 0, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: None, partition_expr: None, }, diff --git a/src/mito2/src/test_util/version_util.rs b/src/mito2/src/test_util/version_util.rs index 86cc11eaf5..30da6677e3 100644 --- a/src/mito2/src/test_util/version_util.rs +++ b/src/mito2/src/test_util/version_util.rs @@ -105,6 +105,7 @@ impl VersionControlBuilder { index_file_size: 0, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: NonZeroU64::new(start_ms as u64), partition_expr: match &self.metadata.partition_expr { Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str) @@ -193,6 +194,7 @@ pub(crate) fn apply_edit( index_file_size: 0, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: NonZeroU64::new(*start_ms as u64), partition_expr: match &version_control.current().version.metadata.partition_expr { Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str) diff --git a/src/store-api/src/sst_entry.rs b/src/store-api/src/sst_entry.rs index 8330af7b2e..52295bdb59 100644 --- a/src/store-api/src/sst_entry.rs +++ b/src/store-api/src/sst_entry.rs @@ -61,6 +61,8 @@ pub struct ManifestSstEntry { pub num_rows: u64, /// Number of row groups in the SST. pub num_row_groups: u64, + /// Number of series in the SST. + pub num_series: Option, /// Min timestamp. pub min_ts: Timestamp, /// Max timestamp. 
@@ -94,6 +96,7 @@ impl ManifestSstEntry { ColumnSchema::new("index_file_size", Ty::uint64_datatype(), true), ColumnSchema::new("num_rows", Ty::uint64_datatype(), false), ColumnSchema::new("num_row_groups", Ty::uint64_datatype(), false), + ColumnSchema::new("num_series", Ty::uint64_datatype(), true), ColumnSchema::new("min_ts", Ty::timestamp_nanosecond_datatype(), true), ColumnSchema::new("max_ts", Ty::timestamp_nanosecond_datatype(), true), ColumnSchema::new("sequence", Ty::uint64_datatype(), true), @@ -120,6 +123,7 @@ impl ManifestSstEntry { let index_file_sizes = entries.iter().map(|e| e.index_file_size); let num_rows = entries.iter().map(|e| e.num_rows); let num_row_groups = entries.iter().map(|e| e.num_row_groups); + let num_series = entries.iter().map(|e| e.num_series); let min_ts = entries.iter().map(|e| { e.min_ts .convert_to(TimeUnit::Nanosecond) @@ -150,6 +154,7 @@ impl ManifestSstEntry { Arc::new(UInt64Array::from_iter(index_file_sizes)), Arc::new(UInt64Array::from_iter_values(num_rows)), Arc::new(UInt64Array::from_iter_values(num_row_groups)), + Arc::new(UInt64Array::from_iter(num_series)), Arc::new(TimestampNanosecondArray::from_iter(min_ts)), Arc::new(TimestampNanosecondArray::from_iter(max_ts)), Arc::new(UInt64Array::from_iter(sequences)), @@ -434,6 +439,7 @@ mod tests { index_file_size: None, num_rows: 10, num_row_groups: 2, + num_series: Some(5), min_ts: Timestamp::new_millisecond(1000), // 1s -> 1_000_000_000ns max_ts: Timestamp::new_second(2), // 2s -> 2_000_000_000ns sequence: None, @@ -456,6 +462,7 @@ mod tests { index_file_size: Some(11), num_rows: 20, num_row_groups: 4, + num_series: None, min_ts: Timestamp::new_nanosecond(5), // 5ns max_ts: Timestamp::new_microsecond(2000), // 2ms -> 2_000_000ns sequence: Some(9), @@ -590,16 +597,24 @@ mod tests { assert_eq!(2, num_row_groups.value(0)); assert_eq!(4, num_row_groups.value(1)); - let min_ts = batch + let num_series = batch .column(14) .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(5, num_series.value(0)); + assert!(num_series.is_null(1)); + + let min_ts = batch + .column(15) + .as_any() .downcast_ref::() .unwrap(); assert_eq!(1_000_000_000, min_ts.value(0)); assert_eq!(5, min_ts.value(1)); let max_ts = batch - .column(15) + .column(16) .as_any() .downcast_ref::() .unwrap(); @@ -607,7 +622,7 @@ mod tests { assert_eq!(2_000_000, max_ts.value(1)); let sequences = batch - .column(16) + .column(17) .as_any() .downcast_ref::() .unwrap(); @@ -615,7 +630,7 @@ mod tests { assert_eq!(9, sequences.value(1)); let origin_region_ids = batch - .column(17) + .column(18) .as_any() .downcast_ref::() .unwrap(); @@ -623,7 +638,7 @@ mod tests { assert_eq!(region_id2.as_u64(), origin_region_ids.value(1)); let node_ids = batch - .column(18) + .column(19) .as_any() .downcast_ref::() .unwrap(); @@ -631,7 +646,7 @@ mod tests { assert!(node_ids.is_null(1)); let visible = batch - .column(19) + .column(20) .as_any() .downcast_ref::() .unwrap(); diff --git a/tests/cases/standalone/common/information_schema/ssts.result b/tests/cases/standalone/common/information_schema/ssts.result index 2c28e6e63c..d546efbdfb 100644 --- a/tests/cases/standalone/common/information_schema/ssts.result +++ b/tests/cases/standalone/common/information_schema/ssts.result @@ -17,6 +17,7 @@ DESC TABLE information_schema.ssts_manifest; | index_file_size | UInt64 | | YES | | FIELD | | num_rows | UInt64 | | NO | | FIELD | | num_row_groups | UInt64 | | NO | | FIELD | +| num_series | UInt64 | | YES | | FIELD | | min_ts | TimestampNanosecond | | YES | | FIELD | | max_ts | 
TimestampNanosecond | | YES | | FIELD | | sequence | UInt64 | | YES | | FIELD | @@ -95,13 +96,13 @@ ADMIN FLUSH_TABLE('sst_case'); -- SQLNESS REPLACE (/public/\d+) /public/ SELECT * FROM information_schema.ssts_manifest order by file_path; -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | min_ts | max_ts | sequence | origin_region_id | node_id | visible | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+-------------------------+-------------------------+----------+------------------+---------+---------+ ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | num_series | min_ts | max_ts | sequence | origin_region_id | node_id | visible | 
++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -- SQLNESS REPLACE (\s+\d+\s+) -- SQLNESS REPLACE ([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}) @@ -163,15 +164,15 @@ ADMIN FLUSH_TABLE('sst_case'); -- SQLNESS REPLACE (/public/\d+) /public/ SELECT * FROM information_schema.ssts_manifest order by file_path; -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | min_ts | max_ts | sequence | origin_region_id | node_id | visible | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin |||| | |||| true | -| data/greptime/public// |||||| || data/greptime/public//_/.parquet || 
data/greptime/public//_/index/.puffin |||| | |||| true | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+-------------------------+-------------------------+----------+------------------+---------+---------+ ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | num_series | min_ts | max_ts | sequence | origin_region_id | node_id | visible | ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -- SQLNESS REPLACE (\s+\d+\s+) -- SQLNESS REPLACE ([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}) diff --git a/tests/cases/standalone/common/system/information_schema.result b/tests/cases/standalone/common/system/information_schema.result index 1cb53ccfe3..d211938c2a 100644 --- a/tests/cases/standalone/common/system/information_schema.result +++ b/tests/cases/standalone/common/system/information_schema.result @@ -411,20 +411,21 @@ select * from information_schema.columns order by table_schema, table_name, 
colu | greptime | information_schema | ssts_manifest | index_file_path | 11 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | | greptime | information_schema | ssts_manifest | index_file_size | 12 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | | greptime | information_schema | ssts_manifest | level | 8 | | | 3 | 0 | | | | | | select,insert | | UInt8 | tinyint unsigned | FIELD | | No | tinyint unsigned | | | -| greptime | information_schema | ssts_manifest | max_ts | 16 | | | | | 9 | | | | | select,insert | | TimestampNanosecond | timestamp(9) | FIELD | | Yes | timestamp(9) | | | -| greptime | information_schema | ssts_manifest | min_ts | 15 | | | | | 9 | | | | | select,insert | | TimestampNanosecond | timestamp(9) | FIELD | | Yes | timestamp(9) | | | -| greptime | information_schema | ssts_manifest | node_id | 19 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | +| greptime | information_schema | ssts_manifest | max_ts | 17 | | | | | 9 | | | | | select,insert | | TimestampNanosecond | timestamp(9) | FIELD | | Yes | timestamp(9) | | | +| greptime | information_schema | ssts_manifest | min_ts | 16 | | | | | 9 | | | | | select,insert | | TimestampNanosecond | timestamp(9) | FIELD | | Yes | timestamp(9) | | | +| greptime | information_schema | ssts_manifest | node_id | 20 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | | greptime | information_schema | ssts_manifest | num_row_groups | 14 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | | greptime | information_schema | ssts_manifest | num_rows | 13 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | -| greptime | information_schema | ssts_manifest | origin_region_id | 18 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | +| greptime | information_schema | ssts_manifest | num_series | 15 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | +| greptime | information_schema | ssts_manifest | origin_region_id | 19 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | | greptime | information_schema | ssts_manifest | region_group | 5 | | | 3 | 0 | | | | | | select,insert | | UInt8 | tinyint unsigned | FIELD | | No | tinyint unsigned | | | | greptime | information_schema | ssts_manifest | region_id | 2 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | | greptime | information_schema | ssts_manifest | region_number | 4 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | | | greptime | information_schema | ssts_manifest | region_sequence | 6 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | | -| greptime | information_schema | ssts_manifest | sequence | 17 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | +| greptime | information_schema | ssts_manifest | sequence | 18 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | | greptime | information_schema | ssts_manifest | 
table_dir | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | ssts_manifest | table_id | 3 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | | -| greptime | information_schema | ssts_manifest | visible | 20 | | | | | | | | | | select,insert | | Boolean | boolean | FIELD | | No | boolean | | | +| greptime | information_schema | ssts_manifest | visible | 21 | | | | | | | | | | select,insert | | Boolean | boolean | FIELD | | No | boolean | | | | greptime | information_schema | ssts_storage | file_path | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | ssts_storage | file_size | 2 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | | greptime | information_schema | ssts_storage | last_modified_ms | 3 | | | | | 3 | | | | | select,insert | | TimestampMillisecond | timestamp(3) | FIELD | | Yes | timestamp(3) | | | From 7da2f5ed12c278601a955620715f4dab4aed5271 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Fri, 24 Oct 2025 17:11:42 +0800 Subject: [PATCH 007/149] refactor: refactor instruction handler and adds support for batch region downgrade operations (#7130) * refactor: refactor instruction handler Signed-off-by: WenyXu * refactor: support batch downgrade region instructions Signed-off-by: WenyXu * fix compat Signed-off-by: WenyXu * fix clippy Signed-off-by: WenyXu * add tests Signed-off-by: WenyXu * chore: add comments Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- src/common/meta/src/instruction.rs | 159 ++++- src/datanode/src/heartbeat/handler.rs | 111 ++-- .../src/heartbeat/handler/close_region.rs | 86 +-- .../src/heartbeat/handler/downgrade_region.rs | 617 ++++++++++-------- .../src/heartbeat/handler/flush_region.rs | 161 ++--- .../src/heartbeat/handler/open_region.rs | 94 +-- .../src/heartbeat/handler/upgrade_region.rs | 385 +++++------ .../downgrade_leader_region.rs | 26 +- src/meta-srv/src/procedure/test_util.rs | 18 +- 9 files changed, 953 insertions(+), 704 deletions(-) diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs index 9a9d955f58..c7bd82d675 100644 --- a/src/common/meta/src/instruction.rs +++ b/src/common/meta/src/instruction.rs @@ -55,6 +55,10 @@ impl Display for RegionIdent { /// The result of downgrade leader region. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] pub struct DowngradeRegionReply { + /// The [RegionId]. + /// For compatibility, it is defaulted to [RegionId::new(0, 0)]. + #[serde(default)] + pub region_id: RegionId, /// Returns the `last_entry_id` if available. pub last_entry_id: Option, /// Returns the `metadata_last_entry_id` if available (Only available for metric engine). @@ -423,14 +427,60 @@ pub enum Instruction { CloseRegions(Vec), /// Upgrades a region. UpgradeRegion(UpgradeRegion), + #[serde( + deserialize_with = "single_or_multiple_from", + alias = "DowngradeRegion" + )] /// Downgrades a region. - DowngradeRegion(DowngradeRegion), + DowngradeRegions(Vec), /// Invalidates batch cache. InvalidateCaches(Vec), /// Flushes regions. FlushRegions(FlushRegions), } +impl Instruction { + /// Converts the instruction into a vector of [OpenRegion]. 
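+    ///
+    /// Returns `None` for any other instruction variant; the other `into_*` accessors
+    /// below follow the same convention.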
+ pub fn into_open_regions(self) -> Option> { + match self { + Self::OpenRegions(open_regions) => Some(open_regions), + _ => None, + } + } + + /// Converts the instruction into a vector of [RegionIdent]. + pub fn into_close_regions(self) -> Option> { + match self { + Self::CloseRegions(close_regions) => Some(close_regions), + _ => None, + } + } + + /// Converts the instruction into a [FlushRegions]. + pub fn into_flush_regions(self) -> Option { + match self { + Self::FlushRegions(flush_regions) => Some(flush_regions), + _ => None, + } + } + + /// Converts the instruction into a [DowngradeRegion]. + pub fn into_downgrade_regions(self) -> Option> { + match self { + Self::DowngradeRegions(downgrade_region) => Some(downgrade_region), + _ => None, + } + } + + /// Converts the instruction into a [UpgradeRegion]. + pub fn into_upgrade_regions(self) -> Option { + match self { + Self::UpgradeRegion(upgrade_region) => Some(upgrade_region), + _ => None, + } + } +} + /// The reply of [UpgradeRegion]. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] pub struct UpgradeRegionReply { @@ -452,6 +502,39 @@ impl Display for UpgradeRegionReply { } } +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct DowngradeRegionsReply { + pub replies: Vec, +} + +impl DowngradeRegionsReply { + pub fn new(replies: Vec) -> Self { + Self { replies } + } + + pub fn single(reply: DowngradeRegionReply) -> Self { + Self::new(vec![reply]) + } +} + +#[derive(Deserialize)] +#[serde(untagged)] +enum DowngradeRegionsCompat { + Single(DowngradeRegionReply), + Multiple(DowngradeRegionsReply), +} + +fn downgrade_regions_compat_from<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let helper = DowngradeRegionsCompat::deserialize(deserializer)?; + Ok(match helper { + DowngradeRegionsCompat::Single(x) => DowngradeRegionsReply::new(vec![x]), + DowngradeRegionsCompat::Multiple(reply) => reply, + }) +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[serde(tag = "type", rename_all = "snake_case")] pub enum InstructionReply { @@ -460,7 +543,11 @@ pub enum InstructionReply { #[serde(alias = "close_region")] CloseRegions(SimpleReply), UpgradeRegion(UpgradeRegionReply), - DowngradeRegion(DowngradeRegionReply), + #[serde( + alias = "downgrade_region", + deserialize_with = "downgrade_regions_compat_from" + )] + DowngradeRegions(DowngradeRegionsReply), FlushRegions(FlushRegionReply), } @@ -470,8 +557,8 @@ impl Display for InstructionReply { Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply), Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply), Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply), - Self::DowngradeRegion(reply) => { - write!(f, "InstructionReply::DowngradeRegion({})", reply) + Self::DowngradeRegions(reply) => { + write!(f, "InstructionReply::DowngradeRegions({:?})", reply) } Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply), } @@ -493,6 +580,27 @@ impl InstructionReply { _ => panic!("Expected OpenRegions reply"), } } + + pub fn expect_upgrade_region_reply(self) -> UpgradeRegionReply { + match self { + Self::UpgradeRegion(reply) => reply, + _ => panic!("Expected UpgradeRegion reply"), + } + } + + pub fn expect_downgrade_regions_reply(self) -> Vec { + match self { + Self::DowngradeRegions(reply) => reply.replies, + _ => panic!("Expected DowngradeRegion reply"), + } + } + + pub fn expect_flush_regions_reply(self) -> FlushRegionReply 
{ + match self { + Self::FlushRegions(reply) => reply, + _ => panic!("Expected FlushRegions reply"), + } + } } #[cfg(test)] @@ -532,11 +640,27 @@ mod tests { r#"{"CloseRegions":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#, serialized ); + + let downgrade_region = InstructionReply::DowngradeRegions(DowngradeRegionsReply::single( + DowngradeRegionReply { + region_id: RegionId::new(1024, 1), + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: None, + }, + )); + + let serialized = serde_json::to_string(&downgrade_region).unwrap(); + assert_eq!( + r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#, + serialized + ) } #[test] fn test_deserialize_instruction() { - let open_region_instruction = r#"{"OpenRegion":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#; + let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#; let open_region_instruction: Instruction = serde_json::from_str(open_region_instruction).unwrap(); let open_region = Instruction::OpenRegions(vec![OpenRegion::new( @@ -553,7 +677,7 @@ mod tests { )]); assert_eq!(open_region_instruction, open_region); - let close_region_instruction = r#"{"CloseRegion":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#; + let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#; let close_region_instruction: Instruction = serde_json::from_str(close_region_instruction).unwrap(); let close_region = Instruction::CloseRegions(vec![RegionIdent { @@ -564,6 +688,15 @@ mod tests { }]); assert_eq!(close_region_instruction, close_region); + let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#; + let downgrade_region_instruction: Instruction = + serde_json::from_str(downgrade_region_instruction).unwrap(); + let downgrade_region = Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id: RegionId::new(1024, 1), + flush_timeout: Some(Duration::from_millis(1000)), + }]); + assert_eq!(downgrade_region_instruction, downgrade_region); + let close_region_instruction_reply = r#"{"result":true,"error":null,"type":"close_region"}"#; let close_region_instruction_reply: InstructionReply = @@ -582,6 +715,20 @@ mod tests { error: None, }); assert_eq!(open_region_instruction_reply, open_region_reply); + + let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#; + let downgrade_region_instruction_reply: InstructionReply = + serde_json::from_str(downgrade_region_instruction_reply).unwrap(); + let downgrade_region_reply = InstructionReply::DowngradeRegions( + DowngradeRegionsReply::single(DowngradeRegionReply { + region_id: RegionId::new(1024, 1), + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: None, + }), + ); + assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply); } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/src/datanode/src/heartbeat/handler.rs 
b/src/datanode/src/heartbeat/handler.rs index 14a671a14b..71b3181a04 100644 --- a/src/datanode/src/heartbeat/handler.rs +++ b/src/datanode/src/heartbeat/handler.rs @@ -13,16 +13,13 @@ // limitations under the License. use async_trait::async_trait; -use common_meta::RegionIdent; use common_meta::error::{InvalidHeartbeatResponseSnafu, Result as MetaResult}; use common_meta::heartbeat::handler::{ HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext, }; use common_meta::instruction::{Instruction, InstructionReply}; use common_telemetry::error; -use futures::future::BoxFuture; use snafu::OptionExt; -use store_api::storage::RegionId; mod close_region; mod downgrade_region; @@ -30,10 +27,15 @@ mod flush_region; mod open_region; mod upgrade_region; +use crate::heartbeat::handler::close_region::CloseRegionsHandler; +use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler; +use crate::heartbeat::handler::flush_region::FlushRegionsHandler; +use crate::heartbeat::handler::open_region::OpenRegionsHandler; +use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; use crate::heartbeat::task_tracker::TaskTracker; use crate::region_server::RegionServer; -/// Handler for [Instruction::OpenRegion] and [Instruction::CloseRegion]. +/// The handler for [`Instruction`]s. #[derive(Clone)] pub struct RegionHeartbeatResponseHandler { region_server: RegionServer, @@ -43,9 +45,14 @@ pub struct RegionHeartbeatResponseHandler { open_region_parallelism: usize, } -/// Handler of the instruction. -pub type InstructionHandler = - Box BoxFuture<'static, Option> + Send>; +#[async_trait::async_trait] +pub trait InstructionHandler: Send + Sync { + async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option; +} #[derive(Clone)] pub struct HandlerContext { @@ -56,10 +63,6 @@ pub struct HandlerContext { } impl HandlerContext { - fn region_ident_to_region_id(region_ident: &RegionIdent) -> RegionId { - RegionId::new(region_ident.table_id, region_ident.region_number) - } - #[cfg(test)] pub fn new_for_test(region_server: RegionServer) -> Self { Self { @@ -90,31 +93,16 @@ impl RegionHeartbeatResponseHandler { self } - /// Builds the [InstructionHandler]. 
- fn build_handler(&self, instruction: Instruction) -> MetaResult { + fn build_handler(&self, instruction: &Instruction) -> MetaResult> { match instruction { - Instruction::OpenRegions(open_regions) => { - let open_region_parallelism = self.open_region_parallelism; - Ok(Box::new(move |handler_context| { - handler_context - .handle_open_regions_instruction(open_regions, open_region_parallelism) - })) - } - Instruction::CloseRegions(close_regions) => Ok(Box::new(move |handler_context| { - handler_context.handle_close_regions_instruction(close_regions) - })), - Instruction::DowngradeRegion(downgrade_region) => { - Ok(Box::new(move |handler_context| { - handler_context.handle_downgrade_region_instruction(downgrade_region) - })) - } - Instruction::UpgradeRegion(upgrade_region) => Ok(Box::new(move |handler_context| { - handler_context.handle_upgrade_region_instruction(upgrade_region) + Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler)), + Instruction::OpenRegions(_) => Ok(Box::new(OpenRegionsHandler { + open_region_parallelism: self.open_region_parallelism, })), + Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler)), + Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler)), + Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler)), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), - Instruction::FlushRegions(flush_regions) => Ok(Box::new(move |handler_context| { - handler_context.handle_flush_regions_instruction(flush_regions) - })), } } } @@ -124,7 +112,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { fn is_acceptable(&self, ctx: &HeartbeatResponseHandlerContext) -> bool { matches!(ctx.incoming_message.as_ref(), |Some(( _, - Instruction::DowngradeRegion { .. }, + Instruction::DowngradeRegions { .. }, ))| Some(( _, Instruction::UpgradeRegion { .. } @@ -151,15 +139,19 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { let catchup_tasks = self.catchup_tasks.clone(); let downgrade_tasks = self.downgrade_tasks.clone(); let flush_tasks = self.flush_tasks.clone(); - let handler = self.build_handler(instruction)?; + let handler = self.build_handler(&instruction)?; let _handle = common_runtime::spawn_global(async move { - let reply = handler(HandlerContext { - region_server, - catchup_tasks, - downgrade_tasks, - flush_tasks, - }) - .await; + let reply = handler + .handle( + &HandlerContext { + region_server, + catchup_tasks, + downgrade_tasks, + flush_tasks, + }, + instruction, + ) + .await; if let Some(reply) = reply && let Err(e) = mailbox.send((meta, reply)).await @@ -179,6 +171,7 @@ mod tests { use std::sync::Arc; use std::time::Duration; + use common_meta::RegionIdent; use common_meta::heartbeat::mailbox::{ HeartbeatMailbox, IncomingMessage, MailboxRef, MessageMeta, }; @@ -249,10 +242,10 @@ mod tests { ); // Downgrade region - let instruction = Instruction::DowngradeRegion(DowngradeRegion { + let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion { region_id: RegionId::new(2048, 1), flush_timeout: Some(Duration::from_secs(1)), - }); + }]); assert!( heartbeat_handler .is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction))) @@ -447,10 +440,10 @@ mod tests { // Should be ok, if we try to downgrade it twice. 
for _ in 0..2 { let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0"); - let instruction = Instruction::DowngradeRegion(DowngradeRegion { + let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion { region_id, flush_timeout: Some(Duration::from_secs(1)), - }); + }]); let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction)); let control = heartbeat_handler.handle(&mut ctx).await.unwrap(); @@ -458,33 +451,27 @@ mod tests { let (_, reply) = heartbeat_env.receiver.recv().await.unwrap(); - if let InstructionReply::DowngradeRegion(reply) = reply { - assert!(reply.exists); - assert!(reply.error.is_none()); - assert_eq!(reply.last_entry_id.unwrap(), 0); - } else { - unreachable!() - } + let reply = &reply.expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.is_none()); + assert_eq!(reply.last_entry_id.unwrap(), 0); } // Downgrades a not exists region. let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0"); - let instruction = Instruction::DowngradeRegion(DowngradeRegion { + let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion { region_id: RegionId::new(2048, 1), flush_timeout: Some(Duration::from_secs(1)), - }); + }]); let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction)); let control = heartbeat_handler.handle(&mut ctx).await.unwrap(); assert_matches!(control, HandleControl::Continue); let (_, reply) = heartbeat_env.receiver.recv().await.unwrap(); - if let InstructionReply::DowngradeRegion(reply) = reply { - assert!(!reply.exists); - assert!(reply.error.is_none()); - assert!(reply.last_entry_id.is_none()); - } else { - unreachable!() - } + let reply = reply.expect_downgrade_regions_reply(); + assert!(!reply[0].exists); + assert!(reply[0].error.is_none()); + assert!(reply[0].last_entry_id.is_none()); } } diff --git a/src/datanode/src/heartbeat/handler/close_region.rs b/src/datanode/src/heartbeat/handler/close_region.rs index c942642731..88ed043fab 100644 --- a/src/datanode/src/heartbeat/handler/close_region.rs +++ b/src/datanode/src/heartbeat/handler/close_region.rs @@ -12,60 +12,64 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_meta::RegionIdent; -use common_meta::instruction::{InstructionReply, SimpleReply}; +use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; use common_telemetry::warn; use futures::future::join_all; -use futures_util::future::BoxFuture; use store_api::region_request::{RegionCloseRequest, RegionRequest}; +use store_api::storage::RegionId; use crate::error; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; -impl HandlerContext { - pub(crate) fn handle_close_regions_instruction( - self, - region_idents: Vec, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let region_ids = region_idents - .into_iter() - .map(|region_ident| Self::region_ident_to_region_id(®ion_ident)) - .collect::>(); +#[derive(Debug, Clone, Copy, Default)] +pub struct CloseRegionsHandler; - let futs = region_ids.iter().map(|region_id| { - self.region_server - .handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {})) - }); +#[async_trait::async_trait] +impl InstructionHandler for CloseRegionsHandler { + async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option { + // Safety: must be `Instruction::CloseRegions` instruction. 
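+        // `build_handler` dispatches only `CloseRegions` to this handler, so the
+        // `unwrap` below cannot panic.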
+ let region_idents = instruction.into_close_regions().unwrap(); + let region_ids = region_idents + .into_iter() + .map(|region_ident| RegionId::new(region_ident.table_id, region_ident.region_number)) + .collect::>(); - let results = join_all(futs).await; + let futs = region_ids.iter().map(|region_id| { + ctx.region_server + .handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {})) + }); - let mut errors = vec![]; - for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) { - match result { - Ok(_) => (), - Err(error::Error::RegionNotFound { .. }) => { - warn!( - "Received a close regions instruction from meta, but target region:{} is not found.", - region_id - ); - } - Err(err) => errors.push(format!("region:{region_id}: {err:?}")), + let results = join_all(futs).await; + + let mut errors = vec![]; + for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) { + match result { + Ok(_) => (), + Err(error::Error::RegionNotFound { .. }) => { + warn!( + "Received a close regions instruction from meta, but target region:{} is not found.", + region_id + ); } + Err(err) => errors.push(format!("region:{region_id}: {err:?}")), } + } - if errors.is_empty() { - return Some(InstructionReply::CloseRegions(SimpleReply { - result: true, - error: None, - })); - } + if errors.is_empty() { + return Some(InstructionReply::CloseRegions(SimpleReply { + result: true, + error: None, + })); + } - Some(InstructionReply::CloseRegions(SimpleReply { - result: false, - error: Some(errors.join("; ")), - })) - }) + Some(InstructionReply::CloseRegions(SimpleReply { + result: false, + error: Some(errors.join("; ")), + })) } } diff --git a/src/datanode/src/heartbeat/handler/downgrade_region.rs b/src/datanode/src/heartbeat/handler/downgrade_region.rs index 06d3ab046e..91ceddb91a 100644 --- a/src/datanode/src/heartbeat/handler/downgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/downgrade_region.rs @@ -12,209 +12,242 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_meta::instruction::{DowngradeRegion, DowngradeRegionReply, InstructionReply}; +use common_meta::instruction::{ + DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply, +}; use common_telemetry::tracing::info; use common_telemetry::{error, warn}; -use futures_util::future::BoxFuture; +use futures::future::join_all; use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState}; use store_api::region_request::{RegionFlushRequest, RegionRequest}; use store_api::storage::RegionId; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::heartbeat::task_tracker::WaitResult; -impl HandlerContext { - async fn downgrade_to_follower_gracefully( +#[derive(Debug, Clone, Copy, Default)] +pub struct DowngradeRegionsHandler; + +impl DowngradeRegionsHandler { + async fn handle_downgrade_region( + ctx: &HandlerContext, + DowngradeRegion { + region_id, + flush_timeout, + }: DowngradeRegion, + ) -> DowngradeRegionReply { + let Some(writable) = ctx.region_server.is_region_leader(region_id) else { + warn!("Region: {region_id} is not found"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: false, + error: None, + }; + }; + + let region_server_moved = ctx.region_server.clone(); + + // Ignores flush request + if !writable { + warn!( + "Region: {region_id} is not writable, flush_timeout: {:?}", + flush_timeout + ); + return ctx.downgrade_to_follower_gracefully(region_id).await; + } + + // If flush_timeout is not set, directly convert region to follower. + let Some(flush_timeout) = flush_timeout else { + return ctx.downgrade_to_follower_gracefully(region_id).await; + }; + + // Sets region to downgrading, + // the downgrading region will reject all write requests. + // However, the downgrading region will still accept read, flush requests. + match ctx + .region_server + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::DowngradingLeader) + .await + { + Ok(SetRegionRoleStateResponse::Success { .. 
}) => {} + Ok(SetRegionRoleStateResponse::NotFound) => { + warn!("Region: {region_id} is not found"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: false, + error: None, + }; + } + Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => { + error!(err; "Failed to convert region to downgrading leader - invalid transition"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!("{err:?}")), + }; + } + Err(err) => { + error!(err; "Failed to convert region to downgrading leader"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!("{err:?}")), + }; + } + } + + let register_result = ctx + .downgrade_tasks + .try_register( + region_id, + Box::pin(async move { + info!("Flush region: {region_id} before converting region to follower"); + region_server_moved + .handle_request( + region_id, + RegionRequest::Flush(RegionFlushRequest { + row_group_size: None, + }), + ) + .await?; + + Ok(()) + }), + ) + .await; + + if register_result.is_busy() { + warn!("Another flush task is running for the region: {region_id}"); + } + + let mut watcher = register_result.into_watcher(); + let result = ctx.downgrade_tasks.wait(&mut watcher, flush_timeout).await; + + match result { + WaitResult::Timeout => DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!( + "Flush region timeout, region: {region_id}, timeout: {:?}", + flush_timeout + )), + }, + WaitResult::Finish(Ok(_)) => ctx.downgrade_to_follower_gracefully(region_id).await, + WaitResult::Finish(Err(err)) => DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!("{err:?}")), + }, + } + } +} + +#[async_trait::async_trait] +impl InstructionHandler for DowngradeRegionsHandler { + async fn handle( &self, - region_id: RegionId, + ctx: &HandlerContext, + instruction: Instruction, ) -> Option { + // Safety: must be `Instruction::DowngradeRegion` instruction. + let downgrade_regions = instruction.into_downgrade_regions().unwrap(); + let futures = downgrade_regions + .into_iter() + .map(|downgrade_region| Self::handle_downgrade_region(ctx, downgrade_region)); + // Join all futures; parallelism is governed by the underlying flush scheduler. 
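+        // `join_all` preserves input order, so each reply lines up with the
+        // corresponding `DowngradeRegion` in the instruction (each reply also
+        // carries its `region_id`).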
+ let results = join_all(futures).await; + + Some(InstructionReply::DowngradeRegions( + DowngradeRegionsReply::new(results), + )) + } +} + +impl HandlerContext { + async fn downgrade_to_follower_gracefully(&self, region_id: RegionId) -> DowngradeRegionReply { match self .region_server .set_region_role_state_gracefully(region_id, SettableRegionRoleState::Follower) .await { - Ok(SetRegionRoleStateResponse::Success(success)) => { - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: success.last_entry_id(), - metadata_last_entry_id: success.metadata_last_entry_id(), - exists: true, - error: None, - })) - } + Ok(SetRegionRoleStateResponse::Success(success)) => DowngradeRegionReply { + region_id, + last_entry_id: success.last_entry_id(), + metadata_last_entry_id: success.metadata_last_entry_id(), + exists: true, + error: None, + }, Ok(SetRegionRoleStateResponse::NotFound) => { warn!("Region: {region_id} is not found"); - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { + DowngradeRegionReply { + region_id, last_entry_id: None, metadata_last_entry_id: None, exists: false, error: None, - })) + } } Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => { error!(err; "Failed to convert region to follower - invalid transition"); - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { + DowngradeRegionReply { + region_id, last_entry_id: None, metadata_last_entry_id: None, exists: true, error: Some(format!("{err:?}")), - })) + } } Err(err) => { error!(err; "Failed to convert region to {}", SettableRegionRoleState::Follower); - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { + DowngradeRegionReply { + region_id, last_entry_id: None, metadata_last_entry_id: None, exists: true, error: Some(format!("{err:?}")), - })) + } } } } - - pub(crate) fn handle_downgrade_region_instruction( - self, - DowngradeRegion { - region_id, - flush_timeout, - }: DowngradeRegion, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let Some(writable) = self.region_server.is_region_leader(region_id) else { - warn!("Region: {region_id} is not found"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: false, - error: None, - })); - }; - - let region_server_moved = self.region_server.clone(); - - // Ignores flush request - if !writable { - warn!( - "Region: {region_id} is not writable, flush_timeout: {:?}", - flush_timeout - ); - return self.downgrade_to_follower_gracefully(region_id).await; - } - - // If flush_timeout is not set, directly convert region to follower. - let Some(flush_timeout) = flush_timeout else { - return self.downgrade_to_follower_gracefully(region_id).await; - }; - - // Sets region to downgrading, - // the downgrading region will reject all write requests. - // However, the downgrading region will still accept read, flush requests. - match self - .region_server - .set_region_role_state_gracefully( - region_id, - SettableRegionRoleState::DowngradingLeader, - ) - .await - { - Ok(SetRegionRoleStateResponse::Success { .. 
}) => {} - Ok(SetRegionRoleStateResponse::NotFound) => { - warn!("Region: {region_id} is not found"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: false, - error: None, - })); - } - Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => { - error!(err; "Failed to convert region to downgrading leader - invalid transition"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!("{err:?}")), - })); - } - Err(err) => { - error!(err; "Failed to convert region to downgrading leader"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!("{err:?}")), - })); - } - } - - let register_result = self - .downgrade_tasks - .try_register( - region_id, - Box::pin(async move { - info!("Flush region: {region_id} before converting region to follower"); - region_server_moved - .handle_request( - region_id, - RegionRequest::Flush(RegionFlushRequest { - row_group_size: None, - }), - ) - .await?; - - Ok(()) - }), - ) - .await; - - if register_result.is_busy() { - warn!("Another flush task is running for the region: {region_id}"); - } - - let mut watcher = register_result.into_watcher(); - let result = self.downgrade_tasks.wait(&mut watcher, flush_timeout).await; - - match result { - WaitResult::Timeout => { - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!( - "Flush region timeout, region: {region_id}, timeout: {:?}", - flush_timeout - )), - })) - } - WaitResult::Finish(Ok(_)) => self.downgrade_to_follower_gracefully(region_id).await, - WaitResult::Finish(Err(err)) => { - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!("{err:?}")), - })) - } - } - }) - } } #[cfg(test)] mod tests { use std::assert_matches::assert_matches; + use std::sync::Arc; use std::time::Duration; - use common_meta::instruction::{DowngradeRegion, InstructionReply}; + use common_meta::heartbeat::handler::{HandleControl, HeartbeatResponseHandler}; + use common_meta::heartbeat::mailbox::MessageMeta; + use common_meta::instruction::{DowngradeRegion, Instruction}; + use mito2::config::MitoConfig; use mito2::engine::MITO_ENGINE_NAME; + use mito2::test_util::{CreateRequestBuilder, TestEnv}; use store_api::region_engine::{ - RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, + RegionEngine, RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, }; use store_api::region_request::RegionRequest; use store_api::storage::RegionId; use tokio::time::Instant; use crate::error; - use crate::heartbeat::handler::HandlerContext; + use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler; + use crate::heartbeat::handler::tests::HeartbeatResponseTestEnv; + use crate::heartbeat::handler::{ + HandlerContext, InstructionHandler, RegionHeartbeatResponseHandler, + }; use crate::tests::{MockRegionEngine, mock_region_server}; #[tokio::test] @@ -227,20 +260,20 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = 
DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout, + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(!reply.exists); - assert!(reply.error.is_none()); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(!reply.exists); + assert!(reply.error.is_none()); + assert!(reply.last_entry_id.is_none()); } } @@ -270,20 +303,20 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout, + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.is_none()); - assert_eq!(reply.last_entry_id.unwrap(), 1024); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.is_none()); + assert_eq!(reply.last_entry_id.unwrap(), 1024); } } @@ -305,20 +338,20 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); let flush_timeout = Duration::from_millis(100); - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: Some(flush_timeout), - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout: Some(flush_timeout), + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.unwrap().contains("timeout")); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.as_ref().unwrap().contains("timeout")); + assert!(reply.last_entry_id.is_none()); } #[tokio::test] @@ -344,36 +377,38 @@ mod tests { ]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout, + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.unwrap().contains("timeout")); - assert!(reply.last_entry_id.is_none()); - } + + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.as_ref().unwrap().contains("timeout")); + assert!(reply.last_entry_id.is_none()); } let timer = Instant::now(); - let reply = handler_context - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: Some(Duration::from_millis(500)), - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + 
Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout: Some(Duration::from_millis(500)), + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); // Must less than 300 ms. assert!(timer.elapsed().as_millis() < 300); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.is_none()); - assert_eq!(reply.last_entry_id.unwrap(), 1024); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.is_none()); + assert_eq!(reply.last_entry_id.unwrap(), 1024); } #[tokio::test] @@ -405,36 +440,36 @@ mod tests { ]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout, + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.unwrap().contains("timeout")); - assert!(reply.last_entry_id.is_none()); - } - } - let timer = Instant::now(); - let reply = handler_context - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: Some(Duration::from_millis(500)), - }) - .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - // Must less than 300 ms. - assert!(timer.elapsed().as_millis() < 300); - - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; assert!(reply.exists); - assert!(reply.error.unwrap().contains("flush failed")); + assert!(reply.error.as_ref().unwrap().contains("timeout")); assert!(reply.last_entry_id.is_none()); } + let timer = Instant::now(); + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout: Some(Duration::from_millis(500)), + }]), + ) + .await; + // Must less than 300 ms. 
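+        // The flush task registered by the earlier attempts is still tracked by
+        // `downgrade_tasks`, so this wait completes as soon as that task finishes,
+        // well before the 500 ms timeout.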
+ assert!(timer.elapsed().as_millis() < 300); + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.as_ref().unwrap().contains("flush failed")); + assert!(reply.last_entry_id.is_none()); } #[tokio::test] @@ -449,19 +484,19 @@ mod tests { }); mock_region_server.register_test_region(region_id, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: None, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout: None, + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(!reply.exists); - assert!(reply.error.is_none()); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(!reply.exists); + assert!(reply.error.is_none()); + assert!(reply.last_entry_id.is_none()); } #[tokio::test] @@ -480,23 +515,77 @@ mod tests { }); mock_region_server.register_test_region(region_id, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: None, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id, + flush_timeout: None, + }]), + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!( - reply - .error - .unwrap() - .contains("Failed to set region to readonly") - ); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!( + reply + .error + .as_ref() + .unwrap() + .contains("Failed to set region to readonly") + ); + assert!(reply.last_entry_id.is_none()); + } + + #[tokio::test] + async fn test_downgrade_regions() { + common_telemetry::init_default_ut_logging(); + + let mut region_server = mock_region_server(); + let heartbeat_handler = RegionHeartbeatResponseHandler::new(region_server.clone()); + let mut engine_env = TestEnv::with_prefix("downgrade-regions").await; + let engine = engine_env.create_engine(MitoConfig::default()).await; + region_server.register_engine(Arc::new(engine.clone())); + let region_id = RegionId::new(1024, 1); + let region_id1 = RegionId::new(1024, 2); + let builder = CreateRequestBuilder::new(); + let create_req = builder.build(); + region_server + .handle_request(region_id, RegionRequest::Create(create_req)) + .await + .unwrap(); + let create_req1 = builder.build(); + region_server + .handle_request(region_id1, RegionRequest::Create(create_req1)) + .await + .unwrap(); + let meta = MessageMeta::new_test(1, "test", "dn-1", "meta-0"); + let instruction = Instruction::DowngradeRegions(vec![ + DowngradeRegion { + region_id, + flush_timeout: Some(Duration::from_secs(1)), + }, + DowngradeRegion { + region_id: region_id1, + flush_timeout: Some(Duration::from_secs(1)), + }, + ]); + let mut heartbeat_env = HeartbeatResponseTestEnv::new(); + let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction)); + let control = 
heartbeat_handler.handle(&mut ctx).await.unwrap(); + assert_matches!(control, HandleControl::Continue); + + let (_, reply) = heartbeat_env.receiver.recv().await.unwrap(); + let reply = reply.expect_downgrade_regions_reply(); + assert_eq!(reply[0].region_id, region_id); + assert!(reply[0].exists); + assert!(reply[0].error.is_none()); + assert_eq!(reply[0].last_entry_id, Some(0)); + assert_eq!(reply[1].region_id, region_id1); + assert!(reply[1].exists); + assert!(reply[1].error.is_none()); + assert_eq!(reply[1].last_entry_id, Some(0)); + + assert_eq!(engine.role(region_id).unwrap(), RegionRole::Follower); + assert_eq!(engine.role(region_id1).unwrap(), RegionRole::Follower); } } diff --git a/src/datanode/src/heartbeat/handler/flush_region.rs b/src/datanode/src/heartbeat/handler/flush_region.rs index 963d3bf488..56b841bf00 100644 --- a/src/datanode/src/heartbeat/handler/flush_region.rs +++ b/src/datanode/src/heartbeat/handler/flush_region.rs @@ -15,19 +15,53 @@ use std::time::Instant; use common_meta::instruction::{ - FlushErrorStrategy, FlushRegionReply, FlushRegions, FlushStrategy, InstructionReply, + FlushErrorStrategy, FlushRegionReply, FlushStrategy, Instruction, InstructionReply, }; use common_telemetry::{debug, warn}; -use futures_util::future::BoxFuture; use store_api::region_request::{RegionFlushRequest, RegionRequest}; use store_api::storage::RegionId; -use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, UnexpectedSnafu}; -use crate::heartbeat::handler::HandlerContext; +use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, Result, UnexpectedSnafu}; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; + +pub struct FlushRegionsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for FlushRegionsHandler { + async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option { + let start_time = Instant::now(); + let flush_regions = instruction.into_flush_regions().unwrap(); + let strategy = flush_regions.strategy; + let region_ids = flush_regions.region_ids; + let error_strategy = flush_regions.error_strategy; + + let reply = if matches!(strategy, FlushStrategy::Async) { + // Asynchronous hint mode: fire-and-forget, no reply expected + ctx.handle_flush_hint(region_ids).await; + None + } else { + // Synchronous mode: return reply with results + let reply = ctx.handle_flush_sync(region_ids, error_strategy).await; + Some(InstructionReply::FlushRegions(reply)) + }; + + let elapsed = start_time.elapsed(); + debug!( + "FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}", + strategy, elapsed, reply + ); + + reply + } +} impl HandlerContext { /// Performs the actual region flush operation. - async fn perform_region_flush(&self, region_id: RegionId) -> Result<(), error::Error> { + async fn perform_region_flush(&self, region_id: RegionId) -> Result<()> { let request = RegionRequest::Flush(RegionFlushRequest { row_group_size: None, }); @@ -92,7 +126,7 @@ impl HandlerContext { } /// Flushes a single region synchronously with proper error handling. - async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<(), error::Error> { + async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<()> { // Check if region is leader and writable let Some(writable) = self.region_server.is_region_leader(region_id) else { return Err(RegionNotFoundSnafu { region_id }.build()); @@ -135,37 +169,6 @@ impl HandlerContext { .build()), } } - - /// Unified handler for FlushRegions with all flush semantics. 
- pub(crate) fn handle_flush_regions_instruction( - self, - flush_regions: FlushRegions, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let start_time = Instant::now(); - let strategy = flush_regions.strategy; - let region_ids = flush_regions.region_ids; - let error_strategy = flush_regions.error_strategy; - - let reply = if matches!(strategy, FlushStrategy::Async) { - // Asynchronous hint mode: fire-and-forget, no reply expected - self.handle_flush_hint(region_ids).await; - None - } else { - // Synchronous mode: return reply with results - let reply = self.handle_flush_sync(region_ids, error_strategy).await; - Some(InstructionReply::FlushRegions(reply)) - }; - - let elapsed = start_time.elapsed(); - debug!( - "FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}", - strategy, elapsed, reply - ); - - reply - }) - } } #[cfg(test)] @@ -201,9 +204,11 @@ mod tests { // Async hint mode let flush_instruction = FlushRegions::async_batch(region_ids.clone()); - let reply = handler_context - .clone() - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle( + &handler_context, + Instruction::FlushRegions(flush_instruction), + ) .await; assert!(reply.is_none()); // Hint mode returns no reply assert_eq!(*flushed_region_ids.read().unwrap(), region_ids); @@ -212,8 +217,11 @@ mod tests { flushed_region_ids.write().unwrap().clear(); let not_found_region_ids = (0..2).map(|i| RegionId::new(2048, i)).collect::>(); let flush_instruction = FlushRegions::async_batch(not_found_region_ids); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle( + &handler_context, + Instruction::FlushRegions(flush_instruction), + ) .await; assert!(reply.is_none()); assert!(flushed_region_ids.read().unwrap().is_empty()); @@ -238,20 +246,17 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); let flush_instruction = FlushRegions::sync_single(region_id); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle( + &handler_context, + Instruction::FlushRegions(flush_instruction), + ) .await; - - assert!(reply.is_some()); - if let Some(InstructionReply::FlushRegions(flush_reply)) = reply { - assert!(flush_reply.overall_success); - assert_eq!(flush_reply.results.len(), 1); - assert_eq!(flush_reply.results[0].0, region_id); - assert!(flush_reply.results[0].1.is_ok()); - } else { - panic!("Expected FlushRegions reply"); - } - + let flush_reply = reply.unwrap().expect_flush_regions_reply(); + assert!(flush_reply.overall_success); + assert_eq!(flush_reply.results.len(), 1); + assert_eq!(flush_reply.results[0].0, region_id); + assert!(flush_reply.results[0].1.is_ok()); assert_eq!(*flushed_region_ids.read().unwrap(), vec![region_id]); } @@ -281,18 +286,16 @@ mod tests { // Sync batch with fail-fast strategy let flush_instruction = FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::FailFast); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle( + &handler_context, + Instruction::FlushRegions(flush_instruction), + ) .await; - - assert!(reply.is_some()); - if let Some(InstructionReply::FlushRegions(flush_reply)) = reply { - assert!(!flush_reply.overall_success); // Should fail due to non-existent regions - // With fail-fast, only process regions until first failure - assert!(flush_reply.results.len() <= 
region_ids.len()); - } else { - panic!("Expected FlushRegions reply"); - } + let flush_reply = reply.unwrap().expect_flush_regions_reply(); + assert!(!flush_reply.overall_success); // Should fail due to non-existent regions + // With fail-fast, only process regions until first failure + assert!(flush_reply.results.len() <= region_ids.len()); } #[tokio::test] @@ -317,20 +320,18 @@ mod tests { // Sync batch with try-all strategy let flush_instruction = FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::TryAll); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle( + &handler_context, + Instruction::FlushRegions(flush_instruction), + ) .await; - - assert!(reply.is_some()); - if let Some(InstructionReply::FlushRegions(flush_reply)) = reply { - assert!(!flush_reply.overall_success); // Should fail due to one non-existent region - // With try-all, should process all regions - assert_eq!(flush_reply.results.len(), region_ids.len()); - // First should succeed, second should fail - assert!(flush_reply.results[0].1.is_ok()); - assert!(flush_reply.results[1].1.is_err()); - } else { - panic!("Expected FlushRegions reply"); - } + let flush_reply = reply.unwrap().expect_flush_regions_reply(); + assert!(!flush_reply.overall_success); // Should fail due to one non-existent region + // With try-all, should process all regions + assert_eq!(flush_reply.results.len(), region_ids.len()); + // First should succeed, second should fail + assert!(flush_reply.results[0].1.is_ok()); + assert!(flush_reply.results[1].1.is_err()); } } diff --git a/src/datanode/src/heartbeat/handler/open_region.rs b/src/datanode/src/heartbeat/handler/open_region.rs index e6ea973eec..77cd4fe6a0 100644 --- a/src/datanode/src/heartbeat/handler/open_region.rs +++ b/src/datanode/src/heartbeat/handler/open_region.rs @@ -12,56 +12,62 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply}; +use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply}; use common_meta::wal_options_allocator::prepare_wal_options; -use futures_util::future::BoxFuture; use store_api::path_utils::table_dir; use store_api::region_request::{PathType, RegionOpenRequest}; +use store_api::storage::RegionId; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; -impl HandlerContext { - pub(crate) fn handle_open_regions_instruction( - self, - open_regions: Vec, - open_region_parallelism: usize, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let requests = open_regions - .into_iter() - .map(|open_region| { - let OpenRegion { - region_ident, - region_storage_path, - mut region_options, - region_wal_options, - skip_wal_replay, - } = open_region; - let region_id = Self::region_ident_to_region_id(®ion_ident); - prepare_wal_options(&mut region_options, region_id, ®ion_wal_options); - let request = RegionOpenRequest { - engine: region_ident.engine, - table_dir: table_dir(®ion_storage_path, region_id.table_id()), - path_type: PathType::Bare, - options: region_options, - skip_wal_replay, - checkpoint: None, - }; - (region_id, request) - }) - .collect::>(); +pub struct OpenRegionsHandler { + pub open_region_parallelism: usize, +} - let result = self - .region_server - .handle_batch_open_requests(open_region_parallelism, requests, false) - .await; - let success = result.is_ok(); - let error = result.as_ref().map_err(|e| format!("{e:?}")).err(); - Some(InstructionReply::OpenRegions(SimpleReply { - result: success, - error, - })) - }) +#[async_trait::async_trait] +impl InstructionHandler for OpenRegionsHandler { + async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option { + let open_regions = instruction.into_open_regions().unwrap(); + + let requests = open_regions + .into_iter() + .map(|open_region| { + let OpenRegion { + region_ident, + region_storage_path, + mut region_options, + region_wal_options, + skip_wal_replay, + } = open_region; + let region_id = RegionId::new(region_ident.table_id, region_ident.region_number); + prepare_wal_options(&mut region_options, region_id, ®ion_wal_options); + let request = RegionOpenRequest { + engine: region_ident.engine, + table_dir: table_dir(®ion_storage_path, region_id.table_id()), + path_type: PathType::Bare, + options: region_options, + skip_wal_replay, + checkpoint: None, + }; + (region_id, request) + }) + .collect::>(); + + let result = ctx + .region_server + .handle_batch_open_requests(self.open_region_parallelism, requests, false) + .await; + let success = result.is_ok(); + let error = result.as_ref().map_err(|e| format!("{e:?}")).err(); + + Some(InstructionReply::OpenRegions(SimpleReply { + result: success, + error, + })) } } diff --git a/src/datanode/src/heartbeat/handler/upgrade_region.rs b/src/datanode/src/heartbeat/handler/upgrade_region.rs index c1f238e059..239eaf1e4c 100644 --- a/src/datanode/src/heartbeat/handler/upgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/upgrade_region.rs @@ -12,18 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply}; +use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply}; use common_telemetry::{info, warn}; -use futures_util::future::BoxFuture; use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint}; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::heartbeat::task_tracker::WaitResult; -impl HandlerContext { - pub(crate) fn handle_upgrade_region_instruction( - self, - UpgradeRegion { +#[derive(Debug, Clone, Copy, Default)] +pub struct UpgradeRegionsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for UpgradeRegionsHandler { + async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option { + let UpgradeRegion { region_id, last_entry_id, metadata_last_entry_id, @@ -31,116 +37,116 @@ impl HandlerContext { location_id, replay_entry_id, metadata_replay_entry_id, - }: UpgradeRegion, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let Some(writable) = self.region_server.is_region_leader(region_id) else { - return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: false, - error: None, - })); - }; + } = instruction.into_upgrade_regions().unwrap(); - if writable { - return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { + let Some(writable) = ctx.region_server.is_region_leader(region_id) else { + return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { + ready: false, + exists: false, + error: None, + })); + }; + + if writable { + return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { + ready: true, + exists: true, + error: None, + })); + } + + let region_server_moved = ctx.region_server.clone(); + + let checkpoint = match (replay_entry_id, metadata_replay_entry_id) { + (Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint { + entry_id, + metadata_entry_id, + }), + _ => None, + }; + + // The catchup task is almost zero cost if the inside region is writable. + // Therefore, it always registers a new catchup task. + let register_result = ctx + .catchup_tasks + .try_register( + region_id, + Box::pin(async move { + info!( + "Executing region: {region_id} catchup to: last entry id {last_entry_id:?}" + ); + region_server_moved + .handle_request( + region_id, + RegionRequest::Catchup(RegionCatchupRequest { + set_writable: true, + entry_id: last_entry_id, + metadata_entry_id: metadata_last_entry_id, + location_id, + checkpoint, + }), + ) + .await?; + + Ok(()) + }), + ) + .await; + + if register_result.is_busy() { + warn!("Another catchup task is running for the region: {region_id}"); + } + + // Returns immediately + let Some(replay_timeout) = replay_timeout else { + return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { + ready: false, + exists: true, + error: None, + })); + }; + + // We don't care that it returns a newly registered or running task. 
+ let mut watcher = register_result.into_watcher(); + let result = ctx.catchup_tasks.wait(&mut watcher, replay_timeout).await; + + match result { + WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { + ready: false, + exists: true, + error: None, + })), + WaitResult::Finish(Ok(_)) => { + Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { ready: true, exists: true, error: None, - })); + })) } - - let region_server_moved = self.region_server.clone(); - - let checkpoint = match (replay_entry_id, metadata_replay_entry_id) { - (Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint { - entry_id, - metadata_entry_id, - }), - _ => None, - }; - - // The catchup task is almost zero cost if the inside region is writable. - // Therefore, it always registers a new catchup task. - let register_result = self - .catchup_tasks - .try_register( - region_id, - Box::pin(async move { - info!("Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"); - region_server_moved - .handle_request( - region_id, - RegionRequest::Catchup(RegionCatchupRequest { - set_writable: true, - entry_id: last_entry_id, - metadata_entry_id: metadata_last_entry_id, - location_id, - checkpoint, - }), - ) - .await?; - - Ok(()) - }), - ) - .await; - - if register_result.is_busy() { - warn!("Another catchup task is running for the region: {region_id}"); - } - - // Returns immediately - let Some(replay_timeout) = replay_timeout else { - return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { + WaitResult::Finish(Err(err)) => { + Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { ready: false, exists: true, - error: None, - })); - }; - - // We don't care that it returns a newly registered or running task. - let mut watcher = register_result.into_watcher(); - let result = self.catchup_tasks.wait(&mut watcher, replay_timeout).await; - - match result { - WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: true, - error: None, - })), - WaitResult::Finish(Ok(_)) => { - Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: true, - exists: true, - error: None, - })) - } - WaitResult::Finish(Err(err)) => { - Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: true, - error: Some(format!("{err:?}")), - })) - } + error: Some(format!("{err:?}")), + })) } - }) + } } } #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; use std::time::Duration; - use common_meta::instruction::{InstructionReply, UpgradeRegion}; + use common_meta::instruction::{Instruction, UpgradeRegion}; use mito2::engine::MITO_ENGINE_NAME; use store_api::region_engine::RegionRole; use store_api::storage::RegionId; use tokio::time::Instant; use crate::error; - use crate::heartbeat::handler::HandlerContext; + use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; + use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::tests::{MockRegionEngine, mock_region_server}; #[tokio::test] @@ -155,20 +161,20 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, - ..Default::default() - }) + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }), + ) .await; - assert_matches!(reply, 
Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.exists); - assert!(reply.error.is_none()); - } + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(!reply.exists); + assert!(reply.error.is_none()); } } @@ -192,21 +198,21 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, - ..Default::default() - }) + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }), + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none()); } } @@ -230,21 +236,21 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, - ..Default::default() - }) + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }), + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(!reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none()); } } @@ -271,40 +277,41 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, - ..Default::default() - }) + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }), + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } - } - - let timer = Instant::now(); - let reply = handler_context - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout: Some(Duration::from_millis(500)), - ..Default::default() - }) - .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - // Must less than 300 ms. - assert!(timer.elapsed().as_millis() < 300); - - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(reply.ready); + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(!reply.ready); assert!(reply.exists); assert!(reply.error.is_none()); } + + let timer = Instant::now(); + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + replay_timeout: Some(Duration::from_millis(500)), + ..Default::default() + }), + ) + .await; + // Must less than 300 ms. 
+ assert!(timer.elapsed().as_millis() < 300); + + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none()); } #[tokio::test] @@ -329,37 +336,37 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - ..Default::default() - }) + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + ..Default::default() + }), + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); // It didn't wait for handle returns; it had no idea about the error. - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(!reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none()); - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout: Some(Duration::from_millis(200)), - ..Default::default() - }) + let reply = UpgradeRegionsHandler + .handle( + &handler_context, + Instruction::UpgradeRegion(UpgradeRegion { + region_id, + replay_timeout: Some(Duration::from_millis(200)), + ..Default::default() + }), + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_some()); - assert!(reply.error.unwrap().contains("mock_error")); - } + let reply = reply.unwrap().expect_upgrade_region_reply(); + assert!(!reply.ready); + assert!(reply.exists); + assert!(reply.error.is_some()); + assert!(reply.error.unwrap().contains("mock_error")); } } diff --git a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs index ad805ae680..fb4065748c 100644 --- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs @@ -19,7 +19,7 @@ use api::v1::meta::MailboxMessage; use common_error::ext::BoxedError; use common_meta::distributed_time_constants::REGION_LEASE_SECS; use common_meta::instruction::{ - DowngradeRegion, DowngradeRegionReply, Instruction, InstructionReply, + DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply, }; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::{error, info, warn}; @@ -120,10 +120,10 @@ impl DowngradeLeaderRegion { ) -> Instruction { let pc = &ctx.persistent_ctx; let region_id = pc.region_id; - Instruction::DowngradeRegion(DowngradeRegion { + Instruction::DowngradeRegions(vec![DowngradeRegion { region_id, flush_timeout: Some(flush_timeout), - }) + }]) } /// Tries to downgrade a leader region. 
@@ -173,12 +173,7 @@ impl DowngradeLeaderRegion { region_id, now.elapsed() ); - let InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id, - metadata_last_entry_id, - exists, - error, - }) = reply + let InstructionReply::DowngradeRegions(DowngradeRegionsReply { replies }) = reply else { return error::UnexpectedInstructionReplySnafu { mailbox_message: msg.to_string(), @@ -187,6 +182,15 @@ impl DowngradeLeaderRegion { .fail(); }; + // TODO(weny): handle multiple replies. + let DowngradeRegionReply { + region_id, + last_entry_id, + metadata_last_entry_id, + exists, + error, + } = &replies[0]; + if error.is_some() { return error::RetryLaterSnafu { reason: format!( @@ -216,12 +220,12 @@ impl DowngradeLeaderRegion { } if let Some(last_entry_id) = last_entry_id { - ctx.volatile_ctx.set_last_entry_id(last_entry_id); + ctx.volatile_ctx.set_last_entry_id(*last_entry_id); } if let Some(metadata_last_entry_id) = metadata_last_entry_id { ctx.volatile_ctx - .set_metadata_last_entry_id(metadata_last_entry_id); + .set_metadata_last_entry_id(*metadata_last_entry_id); } Ok(()) diff --git a/src/meta-srv/src/procedure/test_util.rs b/src/meta-srv/src/procedure/test_util.rs index 8197087351..247f112514 100644 --- a/src/meta-srv/src/procedure/test_util.rs +++ b/src/meta-srv/src/procedure/test_util.rs @@ -17,7 +17,8 @@ use std::collections::HashMap; use api::v1::meta::mailbox_message::Payload; use api::v1::meta::{HeartbeatResponse, MailboxMessage}; use common_meta::instruction::{ - DowngradeRegionReply, FlushRegionReply, InstructionReply, SimpleReply, UpgradeRegionReply, + DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply, + UpgradeRegionReply, }; use common_meta::key::TableMetadataManagerRef; use common_meta::key::table_route::TableRouteValue; @@ -183,12 +184,15 @@ pub fn new_downgrade_region_reply( to: "meta".to_string(), timestamp_millis: current_time_millis(), payload: Some(Payload::Json( - serde_json::to_string(&InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id, - metadata_last_entry_id: None, - exists: exist, - error, - })) + serde_json::to_string(&InstructionReply::DowngradeRegions( + DowngradeRegionsReply::new(vec![DowngradeRegionReply { + region_id: RegionId::new(0, 0), + last_entry_id, + metadata_last_entry_id: None, + exists: exist, + error, + }]), + )) .unwrap(), )), } From d8563ba56d91dde0375457f32b50203357887876 Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Sat, 25 Oct 2025 16:41:49 +0800 Subject: [PATCH 008/149] feat: adds regex_extract function and more type tests (#7107) * feat: adds format, regex_extract function and more type tests Signed-off-by: Dennis Zhuang * fix: forgot functions Signed-off-by: Dennis Zhuang * chore: forgot null type Signed-off-by: Dennis Zhuang * test: forgot date type Signed-off-by: Dennis Zhuang * feat: remove format function Signed-off-by: Dennis Zhuang * test: update results after upgrading datafusion Signed-off-by: Dennis Zhuang --------- Signed-off-by: Dennis Zhuang --- Cargo.lock | 19 +- Cargo.toml | 2 +- src/common/datasource/Cargo.toml | 2 +- src/common/function/Cargo.toml | 1 + src/common/function/src/function_registry.rs | 4 + src/common/function/src/scalars.rs | 1 + .../function/src/scalars/date/date_format.rs | 78 +++- src/common/function/src/scalars/string.rs | 26 ++ .../src/scalars/string/regexp_extract.rs | 339 ++++++++++++++++++ src/mito2/Cargo.toml | 2 +- .../common/function/string/concat.result | 211 +++++++++++ .../common/function/string/concat.sql | 63 ++++ 
.../common/function/string/length.result | 183 ++++++++++ .../common/function/string/length.sql | 58 +++ .../function/string/like_pattern.result | 280 +++++++++++++++ .../common/function/string/like_pattern.sql | 97 +++++ .../common/function/string/position.result | 278 ++++++++++++++ .../common/function/string/position.sql | 84 +++++ .../common/function/string/regex.result | 143 ++++++++ .../common/function/string/regex.sql | 44 +++ .../common/function/string/repeat.result | 217 +++++++++++ .../common/function/string/repeat.sql | 68 ++++ .../common/function/string/replace.result | 180 ++++++++++ .../common/function/string/replace.sql | 57 +++ .../common/function/string/reverse.result | 200 +++++++++++ .../common/function/string/reverse.sql | 63 ++++ .../function/string/string_split.result | 213 +++++++++++ .../common/function/string/string_split.sql | 75 ++++ .../common/function/string/substring.result | 173 +++++++++ .../common/function/string/substring.sql | 53 +++ .../common/function/string/trim_pad.result | 274 ++++++++++++++ .../common/function/string/trim_pad.sql | 88 +++++ .../common/function/string/upper_lower.result | 291 +++++++++++++++ .../common/function/string/upper_lower.sql | 93 +++++ .../common/order/nulls_first_last.result | 141 ++++++++ .../common/order/nulls_first_last.sql | 46 +++ .../common/order/order_by_basic.result | 134 +++++++ .../common/order/order_by_basic.sql | 39 ++ .../common/order/order_by_expressions.result | 137 +++++++ .../common/order/order_by_expressions.sql | 54 +++ .../common/sample/basic_sample.result | 93 +++++ .../standalone/common/sample/basic_sample.sql | 35 ++ .../common/types/date/test_date.result | 135 +++++++ .../common/types/date/test_date.sql | 50 +++ .../types/float/ieee_floating_points.result | 144 ++++++++ .../types/float/ieee_floating_points.sql | 51 +++ .../common/types/float/infinity_nan.result | 184 ++++++++++ .../common/types/float/infinity_nan.sql | 61 ++++ .../float/nan_arithmetic_extended.result | 317 ++++++++++++++++ .../types/float/nan_arithmetic_extended.sql | 91 +++++ .../types/float/nan_cast_extended.result | 252 +++++++++++++ .../common/types/float/nan_cast_extended.sql | 76 ++++ .../common/types/null/null_handling.result | 171 +++++++++ .../common/types/null/null_handling.sql | 49 +++ .../common/types/string/big_strings.result | 116 ++++++ .../common/types/string/big_strings.sql | 43 +++ .../types/string/unicode_extended.result | 103 ++++++ .../common/types/string/unicode_extended.sql | 35 ++ 58 files changed, 6502 insertions(+), 15 deletions(-) create mode 100644 src/common/function/src/scalars/string.rs create mode 100644 src/common/function/src/scalars/string/regexp_extract.rs create mode 100644 tests/cases/standalone/common/function/string/concat.result create mode 100644 tests/cases/standalone/common/function/string/concat.sql create mode 100644 tests/cases/standalone/common/function/string/length.result create mode 100644 tests/cases/standalone/common/function/string/length.sql create mode 100644 tests/cases/standalone/common/function/string/like_pattern.result create mode 100644 tests/cases/standalone/common/function/string/like_pattern.sql create mode 100644 tests/cases/standalone/common/function/string/position.result create mode 100644 tests/cases/standalone/common/function/string/position.sql create mode 100644 tests/cases/standalone/common/function/string/regex.result create mode 100644 tests/cases/standalone/common/function/string/regex.sql create mode 100644 
tests/cases/standalone/common/function/string/repeat.result create mode 100644 tests/cases/standalone/common/function/string/repeat.sql create mode 100644 tests/cases/standalone/common/function/string/replace.result create mode 100644 tests/cases/standalone/common/function/string/replace.sql create mode 100644 tests/cases/standalone/common/function/string/reverse.result create mode 100644 tests/cases/standalone/common/function/string/reverse.sql create mode 100644 tests/cases/standalone/common/function/string/string_split.result create mode 100644 tests/cases/standalone/common/function/string/string_split.sql create mode 100644 tests/cases/standalone/common/function/string/substring.result create mode 100644 tests/cases/standalone/common/function/string/substring.sql create mode 100644 tests/cases/standalone/common/function/string/trim_pad.result create mode 100644 tests/cases/standalone/common/function/string/trim_pad.sql create mode 100644 tests/cases/standalone/common/function/string/upper_lower.result create mode 100644 tests/cases/standalone/common/function/string/upper_lower.sql create mode 100644 tests/cases/standalone/common/order/nulls_first_last.result create mode 100644 tests/cases/standalone/common/order/nulls_first_last.sql create mode 100644 tests/cases/standalone/common/order/order_by_basic.result create mode 100644 tests/cases/standalone/common/order/order_by_basic.sql create mode 100644 tests/cases/standalone/common/order/order_by_expressions.result create mode 100644 tests/cases/standalone/common/order/order_by_expressions.sql create mode 100644 tests/cases/standalone/common/sample/basic_sample.result create mode 100644 tests/cases/standalone/common/sample/basic_sample.sql create mode 100644 tests/cases/standalone/common/types/date/test_date.result create mode 100644 tests/cases/standalone/common/types/date/test_date.sql create mode 100644 tests/cases/standalone/common/types/float/ieee_floating_points.result create mode 100644 tests/cases/standalone/common/types/float/ieee_floating_points.sql create mode 100644 tests/cases/standalone/common/types/float/infinity_nan.result create mode 100644 tests/cases/standalone/common/types/float/infinity_nan.sql create mode 100644 tests/cases/standalone/common/types/float/nan_arithmetic_extended.result create mode 100644 tests/cases/standalone/common/types/float/nan_arithmetic_extended.sql create mode 100644 tests/cases/standalone/common/types/float/nan_cast_extended.result create mode 100644 tests/cases/standalone/common/types/float/nan_cast_extended.sql create mode 100644 tests/cases/standalone/common/types/null/null_handling.result create mode 100644 tests/cases/standalone/common/types/null/null_handling.sql create mode 100644 tests/cases/standalone/common/types/string/big_strings.result create mode 100644 tests/cases/standalone/common/types/string/big_strings.sql create mode 100644 tests/cases/standalone/common/types/string/unicode_extended.result create mode 100644 tests/cases/standalone/common/types/string/unicode_extended.sql diff --git a/Cargo.lock b/Cargo.lock index 07b1695817..c2bad3d971 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1264,7 +1264,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" dependencies = [ "memchr", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "serde", ] @@ -2190,6 +2190,7 @@ dependencies = [ "num-traits", "paste", "pretty_assertions", + "regex", "s2", "serde", "serde_json", @@ -4588,7 +4589,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" dependencies = [ "bit-set", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "regex-syntax 0.8.7", ] @@ -6118,7 +6119,7 @@ dependencies = [ "rand 0.9.1", "rand_chacha 0.9.0", "regex", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "roaring", "serde", "serde_json", @@ -6735,7 +6736,7 @@ version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5baa5e9ff84f1aefd264e6869907646538a52147a755d494517a8007fb48733" dependencies = [ - "regex-automata 0.4.9", + "regex-automata 0.4.13", "rustversion", ] @@ -10469,13 +10470,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.1" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "regex-syntax 0.8.7", ] @@ -10490,9 +10491,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", diff --git a/Cargo.toml b/Cargo.toml index a4ce20bfd1..ebafce51ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -191,7 +191,7 @@ prost-types = "0.13" raft-engine = { version = "0.4.1", default-features = false } rand = "0.9" ratelimit = "0.10" -regex = "1.8" +regex = "1.12" regex-automata = "0.4" reqwest = { version = "0.12", default-features = false, features = [ "json", diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index 303d05ceb1..964f41736c 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -36,7 +36,7 @@ object_store_opendal.workspace = true orc-rust = { version = "0.6.3", default-features = false, features = ["async"] } parquet.workspace = true paste.workspace = true -regex = "1.7" +regex.workspace = true serde.workspace = true snafu.workspace = true strum.workspace = true diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index d5b928e2a1..1d272f5d04 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -51,6 +51,7 @@ nalgebra.workspace = true num = "0.4" num-traits = "0.2" paste.workspace = true +regex.workspace = true s2 = { version = "0.0.12", optional = true } serde.workspace = true serde_json.workspace = true diff --git a/src/common/function/src/function_registry.rs b/src/common/function/src/function_registry.rs index 75bb71c63a..e51dcf4cb8 100644 --- a/src/common/function/src/function_registry.rs +++ b/src/common/function/src/function_registry.rs @@ -34,6 +34,7 @@ use crate::scalars::json::JsonFunction; use crate::scalars::matches::MatchesFunction; use crate::scalars::matches_term::MatchesTermFunction; use crate::scalars::math::MathFunction; +use crate::scalars::string::register_string_functions; use crate::scalars::timestamp::TimestampFunction; use crate::scalars::uddsketch_calc::UddSketchCalcFunction; use crate::scalars::vector::VectorFunction as VectorScalarFunction; @@ -154,6 +155,9 @@ pub static FUNCTION_REGISTRY: LazyLock> = LazyLock::new(|| // Json 
related functions JsonFunction::register(&function_registry); + // String related functions + register_string_functions(&function_registry); + // Vector related functions VectorScalarFunction::register(&function_registry); VectorAggrFunction::register(&function_registry); diff --git a/src/common/function/src/scalars.rs b/src/common/function/src/scalars.rs index 6f93f2741d..9a8c9cc3a0 100644 --- a/src/common/function/src/scalars.rs +++ b/src/common/function/src/scalars.rs @@ -20,6 +20,7 @@ pub mod json; pub mod matches; pub mod matches_term; pub mod math; +pub(crate) mod string; pub mod vector; pub(crate) mod hll_count; diff --git a/src/common/function/src/scalars/date/date_format.rs b/src/common/function/src/scalars/date/date_format.rs index 0e321c957e..dfa5a444ca 100644 --- a/src/common/function/src/scalars/date/date_format.rs +++ b/src/common/function/src/scalars/date/date_format.rs @@ -20,7 +20,9 @@ use common_query::error; use common_time::{Date, Timestamp}; use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder}; -use datafusion_common::arrow::datatypes::{ArrowTimestampType, DataType, Date32Type, TimeUnit}; +use datafusion_common::arrow::datatypes::{ + ArrowTimestampType, DataType, Date32Type, Date64Type, TimeUnit, +}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature}; use snafu::ResultExt; @@ -40,6 +42,7 @@ impl Default for DateFormatFunction { signature: helper::one_of_sigs2( vec![ DataType::Date32, + DataType::Date64, DataType::Timestamp(TimeUnit::Second, None), DataType::Timestamp(TimeUnit::Millisecond, None), DataType::Timestamp(TimeUnit::Microsecond, None), @@ -115,6 +118,29 @@ impl Function for DateFormatFunction { builder.append_option(result.as_deref()); } } + DataType::Date64 => { + let left = left.as_primitive::(); + for i in 0..size { + let date = left.is_valid(i).then(|| { + let ms = left.value(i); + Timestamp::new_millisecond(ms) + }); + let format = formats.is_valid(i).then(|| formats.value(i)); + + let result = match (date, format) { + (Some(ts), Some(fmt)) => { + Some(ts.as_formatted_string(fmt, Some(timezone)).map_err(|e| { + DataFusionError::Execution(format!( + "cannot format {ts:?} as '{fmt}': {e}" + )) + })?) 
+ } + _ => None, + }; + + builder.append_option(result.as_deref()); + } + } x => { return Err(DataFusionError::Execution(format!( "unsupported input data type {x}" @@ -137,7 +163,9 @@ mod tests { use std::sync::Arc; use arrow_schema::Field; - use datafusion_common::arrow::array::{Date32Array, StringArray, TimestampSecondArray}; + use datafusion_common::arrow::array::{ + Date32Array, Date64Array, StringArray, TimestampSecondArray, + }; use datafusion_common::config::ConfigOptions; use datafusion_expr::{TypeSignature, Volatility}; @@ -166,7 +194,7 @@ mod tests { Signature { type_signature: TypeSignature::OneOf(sigs), volatility: Volatility::Immutable - } if sigs.len() == 5)); + } if sigs.len() == 6)); } #[test] @@ -213,6 +241,50 @@ mod tests { } } + #[test] + fn test_date64_date_format() { + let f = DateFormatFunction::default(); + + let dates = vec![Some(123000), None, Some(42000), None]; + let formats = vec![ + "%Y-%m-%d %T.%3f", + "%Y-%m-%d %T.%3f", + "%Y-%m-%d %T.%3f", + "%Y-%m-%d %T.%3f", + ]; + let results = [ + Some("1970-01-01 00:02:03.000"), + None, + Some("1970-01-01 00:00:42.000"), + None, + ]; + + let mut config_options = ConfigOptions::default(); + config_options.extensions.insert(FunctionContext::default()); + let config_options = Arc::new(config_options); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(Date64Array::from(dates))), + ColumnarValue::Array(Arc::new(StringArray::from_iter_values(formats))), + ], + arg_fields: vec![], + number_rows: 4, + return_field: Arc::new(Field::new("x", DataType::Utf8View, false)), + config_options, + }; + let result = f + .invoke_with_args(args) + .and_then(|x| x.to_array(4)) + .unwrap(); + let vector = result.as_string_view(); + + assert_eq!(4, vector.len()); + for (actual, expect) in vector.iter().zip(results) { + assert_eq!(actual, expect); + } + } + #[test] fn test_date_date_format() { let f = DateFormatFunction::default(); diff --git a/src/common/function/src/scalars/string.rs b/src/common/function/src/scalars/string.rs new file mode 100644 index 0000000000..95c6201ee2 --- /dev/null +++ b/src/common/function/src/scalars/string.rs @@ -0,0 +1,26 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! String scalar functions + +mod regexp_extract; + +pub(crate) use regexp_extract::RegexpExtractFunction; + +use crate::function_registry::FunctionRegistry; + +/// Register all string functions +pub fn register_string_functions(registry: &FunctionRegistry) { + RegexpExtractFunction::register(registry); +} diff --git a/src/common/function/src/scalars/string/regexp_extract.rs b/src/common/function/src/scalars/string/regexp_extract.rs new file mode 100644 index 0000000000..bc78c4df74 --- /dev/null +++ b/src/common/function/src/scalars/string/regexp_extract.rs @@ -0,0 +1,339 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Implementation of REGEXP_EXTRACT function +use std::fmt; +use std::sync::Arc; + +use datafusion_common::DataFusionError; +use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder}; +use datafusion_common::arrow::compute::cast; +use datafusion_common::arrow::datatypes::DataType; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility}; +use regex::{Regex, RegexBuilder}; + +use crate::function::Function; +use crate::function_registry::FunctionRegistry; + +const NAME: &str = "regexp_extract"; + +// Safety limits +const MAX_REGEX_SIZE: usize = 1024 * 1024; // compiled regex heap cap +const MAX_DFA_SIZE: usize = 2 * 1024 * 1024; // lazy DFA cap +const MAX_TOTAL_RESULT_SIZE: usize = 64 * 1024 * 1024; // total batch cap +const MAX_SINGLE_MATCH: usize = 1024 * 1024; // per-row cap +const MAX_PATTERN_LEN: usize = 10_000; // pattern text length cap + +/// REGEXP_EXTRACT function implementation +/// Extracts the first substring matching the given regular expression pattern. +/// If no match is found, returns NULL. +/// +#[derive(Debug)] +pub struct RegexpExtractFunction { + signature: Signature, +} + +impl RegexpExtractFunction { + pub fn register(registry: &FunctionRegistry) { + registry.register_scalar(RegexpExtractFunction::default()); + } +} + +impl Default for RegexpExtractFunction { + fn default() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8View]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8View]), + TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8View]), + TypeSignature::Exact(vec![DataType::Utf8View, DataType::LargeUtf8]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::LargeUtf8]), + TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl fmt::Display for RegexpExtractFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", NAME.to_ascii_uppercase()) + } +} + +impl Function for RegexpExtractFunction { + fn name(&self) -> &str { + NAME + } + + // Always return LargeUtf8 for simplicity and safety + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::LargeUtf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + if args.args.len() != 2 { + return Err(DataFusionError::Execution( + "REGEXP_EXTRACT requires exactly two arguments (text, pattern)".to_string(), + )); + } + + // Keep original ColumnarValue variants for scalar-pattern fast path + let pattern_is_scalar = matches!(args.args[1], ColumnarValue::Scalar(_)); + + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let text_array = &arrays[0]; + let pattern_array = &arrays[1]; + + // Cast both to LargeUtf8 for uniform 
access (supports Utf8/Utf8View/Dictionary) + let text_large = cast(text_array.as_ref(), &DataType::LargeUtf8).map_err(|e| { + DataFusionError::Execution(format!("REGEXP_EXTRACT: text cast failed: {e}")) + })?; + let pattern_large = cast(pattern_array.as_ref(), &DataType::LargeUtf8).map_err(|e| { + DataFusionError::Execution(format!("REGEXP_EXTRACT: pattern cast failed: {e}")) + })?; + + let text = text_large.as_string::(); + let pattern = pattern_large.as_string::(); + let len = text.len(); + + // Pre-size result builder with conservative estimate + let mut estimated_total = 0usize; + for i in 0..len { + if !text.is_null(i) { + estimated_total = estimated_total.saturating_add(text.value_length(i) as usize); + if estimated_total > MAX_TOTAL_RESULT_SIZE { + return Err(DataFusionError::ResourcesExhausted(format!( + "REGEXP_EXTRACT total output exceeds {} bytes", + MAX_TOTAL_RESULT_SIZE + ))); + } + } + } + let mut builder = LargeStringBuilder::with_capacity(len, estimated_total); + + // Fast path: if pattern is scalar, compile once + let compiled_scalar: Option = if pattern_is_scalar && len > 0 && !pattern.is_null(0) + { + Some(compile_regex_checked(pattern.value(0))?) + } else { + None + }; + + for i in 0..len { + if text.is_null(i) || pattern.is_null(i) { + builder.append_null(); + continue; + } + + let s = text.value(i); + let pat = pattern.value(i); + + // Compile or reuse regex + let re = if let Some(ref compiled) = compiled_scalar { + compiled + } else { + // TODO: For performance-critical applications with repeating patterns, + // consider adding a small LRU cache here + &compile_regex_checked(pat)? + }; + + // First match only + if let Some(m) = re.find(s) { + let m_str = m.as_str(); + if m_str.len() > MAX_SINGLE_MATCH { + return Err(DataFusionError::Execution( + "REGEXP_EXTRACT match exceeds per-row limit (1MB)".to_string(), + )); + } + builder.append_value(m_str); + } else { + builder.append_null(); + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +// Compile a regex with safety checks +fn compile_regex_checked(pattern: &str) -> datafusion_common::Result { + if pattern.len() > MAX_PATTERN_LEN { + return Err(DataFusionError::Execution(format!( + "REGEXP_EXTRACT pattern too long (> {} chars)", + MAX_PATTERN_LEN + ))); + } + RegexBuilder::new(pattern) + .size_limit(MAX_REGEX_SIZE) + .dfa_size_limit(MAX_DFA_SIZE) + .build() + .map_err(|e| { + DataFusionError::Execution(format!("REGEXP_EXTRACT invalid pattern '{}': {e}", pattern)) + }) +} + +#[cfg(test)] +mod tests { + use datafusion_common::arrow::array::StringArray; + use datafusion_common::arrow::datatypes::Field; + use datafusion_expr::ScalarFunctionArgs; + + use super::*; + + #[test] + fn test_regexp_extract_function_basic() { + let text_array = Arc::new(StringArray::from(vec!["version 1.2.3", "no match here"])); + let pattern_array = Arc::new(StringArray::from(vec!["\\d+\\.\\d+\\.\\d+", "\\d+"])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, false)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let 
string_array = array.as_string::(); + assert_eq!(string_array.value(0), "1.2.3"); + assert!(string_array.is_null(1)); // no match should return NULL + } else { + panic!("Expected array result"); + } + } + + #[test] + fn test_regexp_extract_phone_number() { + let text_array = Arc::new(StringArray::from(vec!["Phone: 123-456-7890", "No phone"])); + let pattern_array = Arc::new(StringArray::from(vec![ + "\\d{3}-\\d{3}-\\d{4}", + "\\d{3}-\\d{3}-\\d{4}", + ])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, false)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "123-456-7890"); + assert!(string_array.is_null(1)); // no match should return NULL + } else { + panic!("Expected array result"); + } + } + + #[test] + fn test_regexp_extract_email() { + let text_array = Arc::new(StringArray::from(vec![ + "Email: user@domain.com", + "Invalid email", + ])); + let pattern_array = Arc::new(StringArray::from(vec![ + "[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+", + "[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+", + ])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, false)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "user@domain.com"); + assert!(string_array.is_null(1)); // no match should return NULL + } else { + panic!("Expected array result"); + } + } + + #[test] + fn test_regexp_extract_with_nulls() { + let text_array = Arc::new(StringArray::from(vec![Some("test 123"), None])); + let pattern_array = Arc::new(StringArray::from(vec![Some("\\d+"), Some("\\d+")])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, true)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "123"); + assert!(string_array.is_null(1)); // NULL input should return NULL + } else { + panic!("Expected array result"); + } + } +} diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 4cc1efb8bc..7926ae198a 100644 --- 
a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -65,7 +65,7 @@ partition.workspace = true puffin.workspace = true rand.workspace = true rayon = "1.10" -regex = "1.5" +regex.workspace = true rskafka = { workspace = true, optional = true } rstest = { workspace = true, optional = true } rstest_reuse = { workspace = true, optional = true } diff --git a/tests/cases/standalone/common/function/string/concat.result b/tests/cases/standalone/common/function/string/concat.result new file mode 100644 index 0000000000..5c0907d5cb --- /dev/null +++ b/tests/cases/standalone/common/function/string/concat.result @@ -0,0 +1,211 @@ +-- String concatenation function tests +-- Test CONCAT function +-- Basic concatenation +SELECT CONCAT('hello', 'world'); + ++-------------------------------------+ +| concat(Utf8("hello"),Utf8("world")) | ++-------------------------------------+ +| helloworld | ++-------------------------------------+ + +SELECT CONCAT('hello', ' ', 'world'); + ++-----------------------------------------------+ +| concat(Utf8("hello"),Utf8(" "),Utf8("world")) | ++-----------------------------------------------+ +| hello world | ++-----------------------------------------------+ + +SELECT CONCAT('a', 'b', 'c', 'd'); + ++-------------------------------------------------+ +| concat(Utf8("a"),Utf8("b"),Utf8("c"),Utf8("d")) | ++-------------------------------------------------+ +| abcd | ++-------------------------------------------------+ + +-- Concatenation with NULL values +SELECT CONCAT('hello', NULL); + ++----------------------------+ +| concat(Utf8("hello"),NULL) | ++----------------------------+ +| hello | ++----------------------------+ + +SELECT CONCAT(NULL, 'world'); + ++----------------------------+ +| concat(NULL,Utf8("world")) | ++----------------------------+ +| world | ++----------------------------+ + +SELECT CONCAT(NULL, NULL); + ++-------------------+ +| concat(NULL,NULL) | ++-------------------+ +| | ++-------------------+ + +-- Concatenation with numbers (should convert to string) +SELECT CONCAT('value: ', 42); + ++-----------------------------------+ +| concat(Utf8("value: "),Int64(42)) | ++-----------------------------------+ +| value: 42 | ++-----------------------------------+ + +SELECT CONCAT(1, 2, 3); + ++------------------------------------+ +| concat(Int64(1),Int64(2),Int64(3)) | ++------------------------------------+ +| 123 | ++------------------------------------+ + +-- Test with table data +CREATE TABLE concat_test(first_name VARCHAR, last_name VARCHAR, age INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO concat_test VALUES + ('John', 'Doe', 30, 1000), + ('Jane', 'Smith', 25, 2000), + ('Bob', NULL, 35, 3000), + (NULL, 'Wilson', 40, 4000); + +Affected Rows: 4 + +-- Concatenate table columns +SELECT CONCAT(first_name, ' ', last_name) as full_name FROM concat_test ORDER BY ts; + ++------------+ +| full_name | ++------------+ +| John Doe | +| Jane Smith | +| Bob | +| Wilson | ++------------+ + +SELECT CONCAT(first_name, ' is ', age, ' years old') FROM concat_test ORDER BY ts; + ++--------------------------------------------------------------------------------+ +| concat(concat_test.first_name,Utf8(" is "),concat_test.age,Utf8(" years old")) | ++--------------------------------------------------------------------------------+ +| John is 30 years old | +| Jane is 25 years old | +| Bob is 35 years old | +| is 40 years old | ++--------------------------------------------------------------------------------+ + +-- Test CONCAT_WS (concat with separator) 
+SELECT CONCAT_WS(' ', first_name, last_name) as full_name FROM concat_test ORDER BY ts; + ++------------+ +| full_name | ++------------+ +| John Doe | +| Jane Smith | +| Bob | +| Wilson | ++------------+ + +SELECT CONCAT_WS('-', first_name, last_name, age) FROM concat_test ORDER BY ts; + ++-----------------------------------------------------------------------------------+ +| concat_ws(Utf8("-"),concat_test.first_name,concat_test.last_name,concat_test.age) | ++-----------------------------------------------------------------------------------+ +| John-Doe-30 | +| Jane-Smith-25 | +| Bob-35 | +| Wilson-40 | ++-----------------------------------------------------------------------------------+ + +SELECT CONCAT_WS(',', 'a', 'b', 'c', 'd'); + ++--------------------------------------------------------------+ +| concat_ws(Utf8(","),Utf8("a"),Utf8("b"),Utf8("c"),Utf8("d")) | ++--------------------------------------------------------------+ +| a,b,c,d | ++--------------------------------------------------------------+ + +-- CONCAT_WS with NULL values (should skip NULLs) +SELECT CONCAT_WS(' ', 'hello', NULL, 'world'); + ++-------------------------------------------------------+ +| concat_ws(Utf8(" "),Utf8("hello"),NULL,Utf8("world")) | ++-------------------------------------------------------+ +| hello world | ++-------------------------------------------------------+ + +SELECT CONCAT_WS('|', first_name, last_name) FROM concat_test ORDER BY ts; + ++-------------------------------------------------------------------+ +| concat_ws(Utf8("|"),concat_test.first_name,concat_test.last_name) | ++-------------------------------------------------------------------+ +| John|Doe | +| Jane|Smith | +| Bob | +| Wilson | ++-------------------------------------------------------------------+ + +-- Test pipe operator || +SELECT 'hello' || 'world'; + ++--------------------------------+ +| Utf8("hello") || Utf8("world") | ++--------------------------------+ +| helloworld | ++--------------------------------+ + +SELECT 'hello' || ' ' || 'world'; + ++---------------------------------------------+ +| Utf8("hello") || Utf8(" ") || Utf8("world") | ++---------------------------------------------+ +| hello world | ++---------------------------------------------+ + +SELECT first_name || ' ' || last_name FROM concat_test WHERE first_name IS NOT NULL AND last_name IS NOT NULL ORDER BY ts; + ++--------------------------------------------------------------+ +| concat_test.first_name || Utf8(" ") || concat_test.last_name | ++--------------------------------------------------------------+ +| John Doe | +| Jane Smith | ++--------------------------------------------------------------+ + +-- Unicode concatenation +SELECT CONCAT('Hello ', '世界'); + ++-------------------------------------+ +| concat(Utf8("Hello "),Utf8("世界")) | ++-------------------------------------+ +| Hello 世界 | ++-------------------------------------+ + +SELECT CONCAT('🚀', ' ', '🌟'); + ++-----------------------------------------+ +| concat(Utf8("🚀"),Utf8(" "),Utf8("🌟")) | ++-----------------------------------------+ +| 🚀 🌟 | ++-----------------------------------------+ + +SELECT '中文' || '🐄'; + ++----------------------------+ +| Utf8("中文") || Utf8("🐄") | ++----------------------------+ +| 中文🐄 | ++----------------------------+ + +DROP TABLE concat_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/concat.sql b/tests/cases/standalone/common/function/string/concat.sql new file mode 100644 index 0000000000..4f73eed62e --- /dev/null +++ 
b/tests/cases/standalone/common/function/string/concat.sql @@ -0,0 +1,63 @@ +-- String concatenation function tests +-- Test CONCAT function + +-- Basic concatenation +SELECT CONCAT('hello', 'world'); + +SELECT CONCAT('hello', ' ', 'world'); + +SELECT CONCAT('a', 'b', 'c', 'd'); + +-- Concatenation with NULL values +SELECT CONCAT('hello', NULL); + +SELECT CONCAT(NULL, 'world'); + +SELECT CONCAT(NULL, NULL); + +-- Concatenation with numbers (should convert to string) +SELECT CONCAT('value: ', 42); + +SELECT CONCAT(1, 2, 3); + +-- Test with table data +CREATE TABLE concat_test(first_name VARCHAR, last_name VARCHAR, age INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO concat_test VALUES + ('John', 'Doe', 30, 1000), + ('Jane', 'Smith', 25, 2000), + ('Bob', NULL, 35, 3000), + (NULL, 'Wilson', 40, 4000); + +-- Concatenate table columns +SELECT CONCAT(first_name, ' ', last_name) as full_name FROM concat_test ORDER BY ts; + +SELECT CONCAT(first_name, ' is ', age, ' years old') FROM concat_test ORDER BY ts; + +-- Test CONCAT_WS (concat with separator) +SELECT CONCAT_WS(' ', first_name, last_name) as full_name FROM concat_test ORDER BY ts; + +SELECT CONCAT_WS('-', first_name, last_name, age) FROM concat_test ORDER BY ts; + +SELECT CONCAT_WS(',', 'a', 'b', 'c', 'd'); + +-- CONCAT_WS with NULL values (should skip NULLs) +SELECT CONCAT_WS(' ', 'hello', NULL, 'world'); + +SELECT CONCAT_WS('|', first_name, last_name) FROM concat_test ORDER BY ts; + +-- Test pipe operator || +SELECT 'hello' || 'world'; + +SELECT 'hello' || ' ' || 'world'; + +SELECT first_name || ' ' || last_name FROM concat_test WHERE first_name IS NOT NULL AND last_name IS NOT NULL ORDER BY ts; + +-- Unicode concatenation +SELECT CONCAT('Hello ', '世界'); + +SELECT CONCAT('🚀', ' ', '🌟'); + +SELECT '中文' || '🐄'; + +DROP TABLE concat_test; diff --git a/tests/cases/standalone/common/function/string/length.result b/tests/cases/standalone/common/function/string/length.result new file mode 100644 index 0000000000..e508750626 --- /dev/null +++ b/tests/cases/standalone/common/function/string/length.result @@ -0,0 +1,183 @@ +-- String length function tests +-- LENGTH function +SELECT LENGTH('hello'); + ++-----------------------+ +| length(Utf8("hello")) | ++-----------------------+ +| 5 | ++-----------------------+ + +SELECT LENGTH(''); + ++------------------+ +| length(Utf8("")) | ++------------------+ +| 0 | ++------------------+ + +SELECT LENGTH(NULL); + ++--------------+ +| length(NULL) | ++--------------+ +| | ++--------------+ + +SELECT LENGTH('hello world'); + ++-----------------------------+ +| length(Utf8("hello world")) | ++-----------------------------+ +| 11 | ++-----------------------------+ + +-- CHAR_LENGTH (character length) +SELECT CHAR_LENGTH('hello'); + ++----------------------------+ +| char_length(Utf8("hello")) | ++----------------------------+ +| 5 | ++----------------------------+ + +SELECT CHAR_LENGTH(''); + ++-----------------------+ +| char_length(Utf8("")) | ++-----------------------+ +| 0 | ++-----------------------+ + +SELECT CHAR_LENGTH(NULL); + ++-------------------+ +| char_length(NULL) | ++-------------------+ +| | ++-------------------+ + +-- CHARACTER_LENGTH (alias for CHAR_LENGTH) +SELECT CHARACTER_LENGTH('hello world'); + ++---------------------------------------+ +| character_length(Utf8("hello world")) | ++---------------------------------------+ +| 11 | ++---------------------------------------+ + +-- Unicode character length +SELECT LENGTH('世界') AS a, CHAR_LENGTH('世界') AS b; + ++---+---+ +| a | b | 
++---+---+ +| 2 | 2 | ++---+---+ + +SELECT LENGTH('🚀🌟') AS a, CHAR_LENGTH('🚀🌟') AS b; + ++---+---+ +| a | b | ++---+---+ +| 2 | 2 | ++---+---+ + +SELECT LENGTH('café') AS a, CHAR_LENGTH('café') AS b; + ++---+---+ +| a | b | ++---+---+ +| 4 | 4 | ++---+---+ + +-- Test with table data +CREATE TABLE length_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO length_test VALUES + ('hello', 1000), + ('world!', 2000), + ('', 3000), + ('中文测试', 4000), + ('🚀🎉🌟', 5000), + (NULL, 6000); + +Affected Rows: 6 + +SELECT s, LENGTH(s) AS a, CHAR_LENGTH(s) AS b FROM length_test ORDER BY ts; + ++----------+---+---+ +| s | a | b | ++----------+---+---+ +| hello | 5 | 5 | +| world! | 6 | 6 | +| | 0 | 0 | +| 中文测试 | 4 | 4 | +| 🚀🎉🌟 | 3 | 3 | +| | | | ++----------+---+---+ + +-- BIT_LENGTH (length in bits) +SELECT BIT_LENGTH('hello'); + ++---------------------------+ +| bit_length(Utf8("hello")) | ++---------------------------+ +| 40 | ++---------------------------+ + +SELECT BIT_LENGTH(''); + ++----------------------+ +| bit_length(Utf8("")) | ++----------------------+ +| 0 | ++----------------------+ + +SELECT BIT_LENGTH('世界'); + ++--------------------------+ +| bit_length(Utf8("世界")) | ++--------------------------+ +| 48 | ++--------------------------+ + +-- OCTET_LENGTH (length in bytes) +SELECT OCTET_LENGTH('hello'); + ++-----------------------------+ +| octet_length(Utf8("hello")) | ++-----------------------------+ +| 5 | ++-----------------------------+ + +SELECT OCTET_LENGTH(''); + ++------------------------+ +| octet_length(Utf8("")) | ++------------------------+ +| 0 | ++------------------------+ + +SELECT OCTET_LENGTH('世界'); + ++----------------------------+ +| octet_length(Utf8("世界")) | ++----------------------------+ +| 6 | ++----------------------------+ + +SELECT OCTET_LENGTH('🚀'); + ++--------------------------+ +| octet_length(Utf8("🚀")) | ++--------------------------+ +| 4 | ++--------------------------+ + +DROP TABLE length_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/length.sql b/tests/cases/standalone/common/function/string/length.sql new file mode 100644 index 0000000000..26f683858d --- /dev/null +++ b/tests/cases/standalone/common/function/string/length.sql @@ -0,0 +1,58 @@ +-- String length function tests + +-- LENGTH function +SELECT LENGTH('hello'); + +SELECT LENGTH(''); + +SELECT LENGTH(NULL); + +SELECT LENGTH('hello world'); + +-- CHAR_LENGTH (character length) +SELECT CHAR_LENGTH('hello'); + +SELECT CHAR_LENGTH(''); + +SELECT CHAR_LENGTH(NULL); + +-- CHARACTER_LENGTH (alias for CHAR_LENGTH) +SELECT CHARACTER_LENGTH('hello world'); + +-- Unicode character length +SELECT LENGTH('世界') AS a, CHAR_LENGTH('世界') AS b; + +SELECT LENGTH('🚀🌟') AS a, CHAR_LENGTH('🚀🌟') AS b; + +SELECT LENGTH('café') AS a, CHAR_LENGTH('café') AS b; + +-- Test with table data +CREATE TABLE length_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO length_test VALUES + ('hello', 1000), + ('world!', 2000), + ('', 3000), + ('中文测试', 4000), + ('🚀🎉🌟', 5000), + (NULL, 6000); + +SELECT s, LENGTH(s) AS a, CHAR_LENGTH(s) AS b FROM length_test ORDER BY ts; + +-- BIT_LENGTH (length in bits) +SELECT BIT_LENGTH('hello'); + +SELECT BIT_LENGTH(''); + +SELECT BIT_LENGTH('世界'); + +-- OCTET_LENGTH (length in bytes) +SELECT OCTET_LENGTH('hello'); + +SELECT OCTET_LENGTH(''); + +SELECT OCTET_LENGTH('世界'); + +SELECT OCTET_LENGTH('🚀'); + +DROP TABLE length_test; diff --git a/tests/cases/standalone/common/function/string/like_pattern.result 
b/tests/cases/standalone/common/function/string/like_pattern.result new file mode 100644 index 0000000000..515582a1fc --- /dev/null +++ b/tests/cases/standalone/common/function/string/like_pattern.result @@ -0,0 +1,280 @@ +-- String LIKE pattern matching tests +-- Basic LIKE patterns +SELECT 'hello world' LIKE 'hello%'; + ++-----------------------------------------+ +| Utf8("hello world") LIKE Utf8("hello%") | ++-----------------------------------------+ +| true | ++-----------------------------------------+ + +SELECT 'hello world' LIKE '%world'; + ++-----------------------------------------+ +| Utf8("hello world") LIKE Utf8("%world") | ++-----------------------------------------+ +| true | ++-----------------------------------------+ + +SELECT 'hello world' LIKE '%llo%'; + ++----------------------------------------+ +| Utf8("hello world") LIKE Utf8("%llo%") | ++----------------------------------------+ +| true | ++----------------------------------------+ + +SELECT 'hello world' LIKE 'hello_world'; + ++----------------------------------------------+ +| Utf8("hello world") LIKE Utf8("hello_world") | ++----------------------------------------------+ +| true | ++----------------------------------------------+ + +SELECT 'hello world' LIKE 'hello world'; + ++----------------------------------------------+ +| Utf8("hello world") LIKE Utf8("hello world") | ++----------------------------------------------+ +| true | ++----------------------------------------------+ + +-- LIKE with NOT +SELECT 'hello world' NOT LIKE 'goodbye%'; + ++-----------------------------------------------+ +| Utf8("hello world") NOT LIKE Utf8("goodbye%") | ++-----------------------------------------------+ +| true | ++-----------------------------------------------+ + +SELECT 'hello world' NOT LIKE 'hello%'; + ++---------------------------------------------+ +| Utf8("hello world") NOT LIKE Utf8("hello%") | ++---------------------------------------------+ +| false | ++---------------------------------------------+ + +-- Case sensitivity +SELECT 'Hello World' LIKE 'hello%'; + ++-----------------------------------------+ +| Utf8("Hello World") LIKE Utf8("hello%") | ++-----------------------------------------+ +| false | ++-----------------------------------------+ + +SELECT 'Hello World' ILIKE 'hello%'; + ++------------------------------------------+ +| Utf8("Hello World") ILIKE Utf8("hello%") | ++------------------------------------------+ +| true | ++------------------------------------------+ + +SELECT 'Hello World' ILIKE 'HELLO%'; + ++------------------------------------------+ +| Utf8("Hello World") ILIKE Utf8("HELLO%") | ++------------------------------------------+ +| true | ++------------------------------------------+ + +-- Test with table data +CREATE TABLE like_test("name" VARCHAR, email VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO like_test VALUES + ('John Doe', 'john@example.com', 1000), + ('Jane Smith', 'jane@gmail.com', 2000), + ('Bob Wilson', 'bob@yahoo.com', 3000), + ('Alice Johnson', 'alice@company.org', 4000), + ('Charlie Brown', 'charlie@test.net', 5000); + +Affected Rows: 5 + +-- Pattern matching on names +SELECT "name" FROM like_test WHERE "name" LIKE 'J%' ORDER BY ts; + ++------------+ +| name | ++------------+ +| John Doe | +| Jane Smith | ++------------+ + +SELECT "name" FROM like_test WHERE "name" LIKE '%son' ORDER BY ts; + ++---------------+ +| name | ++---------------+ +| Bob Wilson | +| Alice Johnson | ++---------------+ + +-- Contains space +SELECT "name" FROM like_test 
WHERE "name" LIKE '% %' ORDER BY ts; + ++---------------+ +| name | ++---------------+ +| John Doe | +| Jane Smith | +| Bob Wilson | +| Alice Johnson | +| Charlie Brown | ++---------------+ + +-- Pattern matching on emails +SELECT "name", email FROM like_test WHERE email LIKE '%@gmail.com' ORDER BY ts; + ++------------+----------------+ +| name | email | ++------------+----------------+ +| Jane Smith | jane@gmail.com | ++------------+----------------+ + +SELECT "name", email FROM like_test WHERE email LIKE '%.com' ORDER BY ts; + ++------------+------------------+ +| name | email | ++------------+------------------+ +| John Doe | john@example.com | +| Jane Smith | jane@gmail.com | +| Bob Wilson | bob@yahoo.com | ++------------+------------------+ + +SELECT "name", email FROM like_test WHERE email LIKE '%@%.org' ORDER BY ts; + ++---------------+-------------------+ +| name | email | ++---------------+-------------------+ +| Alice Johnson | alice@company.org | ++---------------+-------------------+ + +-- Underscore wildcard +SELECT "name" FROM like_test WHERE "name" LIKE 'Jo__ ___' ORDER BY ts; + ++----------+ +| name | ++----------+ +| John Doe | ++----------+ + +SELECT email FROM like_test WHERE email LIKE '____@%' ORDER BY ts; + ++------------------+ +| email | ++------------------+ +| john@example.com | +| jane@gmail.com | ++------------------+ + +-- Multiple wildcards +-- Contains 'o' +SELECT "name" FROM like_test WHERE "name" LIKE '%o%' ORDER BY ts; + ++---------------+ +| name | ++---------------+ +| John Doe | +| Bob Wilson | +| Alice Johnson | +| Charlie Brown | ++---------------+ + +-- 'a' before and after @ +SELECT email FROM like_test WHERE email LIKE '%a%@%a%' ORDER BY ts; + ++-------------------+ +| email | ++-------------------+ +| jane@gmail.com | +| alice@company.org | ++-------------------+ + +-- Escaping special characters +CREATE TABLE escape_test("text" VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO escape_test VALUES + ('100% complete', 1000), + ('test_file.txt', 2000), + ('50% done', 3000), + ('backup_2023.sql', 4000); + +Affected Rows: 4 + +-- Need to escape % and _ +-- Contains % +SELECT "text" FROM escape_test WHERE "text" LIKE '%\%%' ORDER BY ts; + ++---------------+ +| text | ++---------------+ +| 100% complete | +| 50% done | ++---------------+ + +-- Contains _ +SELECT "text" FROM escape_test WHERE "text" LIKE '%\_%' ORDER BY ts; + ++-----------------+ +| text | ++-----------------+ +| test_file.txt | +| backup_2023.sql | ++-----------------+ + +-- Unicode pattern matching +CREATE TABLE unicode_like(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO unicode_like VALUES + ('Hello 世界', 1000), + ('🚀 rocket', 2000), + ('café shop', 3000); + +Affected Rows: 3 + +SELECT s FROM unicode_like WHERE s LIKE '%世界' ORDER BY ts; + ++------------+ +| s | ++------------+ +| Hello 世界 | ++------------+ + +SELECT s FROM unicode_like WHERE s LIKE '🚀%' ORDER BY ts; + ++-----------+ +| s | ++-----------+ +| 🚀 rocket | ++-----------+ + +SELECT s FROM unicode_like WHERE s LIKE '%é%' ORDER BY ts; + ++-----------+ +| s | ++-----------+ +| café shop | ++-----------+ + +DROP TABLE like_test; + +Affected Rows: 0 + +DROP TABLE escape_test; + +Affected Rows: 0 + +DROP TABLE unicode_like; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/like_pattern.sql b/tests/cases/standalone/common/function/string/like_pattern.sql new file mode 100644 index 0000000000..460fc42e33 --- /dev/null +++ 
b/tests/cases/standalone/common/function/string/like_pattern.sql @@ -0,0 +1,97 @@ +-- String LIKE pattern matching tests + +-- Basic LIKE patterns +SELECT 'hello world' LIKE 'hello%'; + +SELECT 'hello world' LIKE '%world'; + +SELECT 'hello world' LIKE '%llo%'; + +SELECT 'hello world' LIKE 'hello_world'; + +SELECT 'hello world' LIKE 'hello world'; + +-- LIKE with NOT +SELECT 'hello world' NOT LIKE 'goodbye%'; + +SELECT 'hello world' NOT LIKE 'hello%'; + +-- Case sensitivity +SELECT 'Hello World' LIKE 'hello%'; + +SELECT 'Hello World' ILIKE 'hello%'; + +SELECT 'Hello World' ILIKE 'HELLO%'; + +-- Test with table data +CREATE TABLE like_test("name" VARCHAR, email VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO like_test VALUES + ('John Doe', 'john@example.com', 1000), + ('Jane Smith', 'jane@gmail.com', 2000), + ('Bob Wilson', 'bob@yahoo.com', 3000), + ('Alice Johnson', 'alice@company.org', 4000), + ('Charlie Brown', 'charlie@test.net', 5000); + +-- Pattern matching on names +SELECT "name" FROM like_test WHERE "name" LIKE 'J%' ORDER BY ts; + +SELECT "name" FROM like_test WHERE "name" LIKE '%son' ORDER BY ts; + +-- Contains space +SELECT "name" FROM like_test WHERE "name" LIKE '% %' ORDER BY ts; + +-- Pattern matching on emails +SELECT "name", email FROM like_test WHERE email LIKE '%@gmail.com' ORDER BY ts; + +SELECT "name", email FROM like_test WHERE email LIKE '%.com' ORDER BY ts; + +SELECT "name", email FROM like_test WHERE email LIKE '%@%.org' ORDER BY ts; + +-- Underscore wildcard +SELECT "name" FROM like_test WHERE "name" LIKE 'Jo__ ___' ORDER BY ts; + +SELECT email FROM like_test WHERE email LIKE '____@%' ORDER BY ts; + +-- Multiple wildcards +-- Contains 'o' +SELECT "name" FROM like_test WHERE "name" LIKE '%o%' ORDER BY ts; + +-- 'a' before and after @ +SELECT email FROM like_test WHERE email LIKE '%a%@%a%' ORDER BY ts; + +-- Escaping special characters +CREATE TABLE escape_test("text" VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO escape_test VALUES + ('100% complete', 1000), + ('test_file.txt', 2000), + ('50% done', 3000), + ('backup_2023.sql', 4000); + +-- Need to escape % and _ +-- Contains % +SELECT "text" FROM escape_test WHERE "text" LIKE '%\%%' ORDER BY ts; + +-- Contains _ +SELECT "text" FROM escape_test WHERE "text" LIKE '%\_%' ORDER BY ts; + +-- Unicode pattern matching +CREATE TABLE unicode_like(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO unicode_like VALUES + ('Hello 世界', 1000), + ('🚀 rocket', 2000), + ('café shop', 3000); + +SELECT s FROM unicode_like WHERE s LIKE '%世界' ORDER BY ts; + +SELECT s FROM unicode_like WHERE s LIKE '🚀%' ORDER BY ts; + +SELECT s FROM unicode_like WHERE s LIKE '%é%' ORDER BY ts; + +DROP TABLE like_test; + +DROP TABLE escape_test; + +DROP TABLE unicode_like; diff --git a/tests/cases/standalone/common/function/string/position.result b/tests/cases/standalone/common/function/string/position.result new file mode 100644 index 0000000000..1b65fb6fba --- /dev/null +++ b/tests/cases/standalone/common/function/string/position.result @@ -0,0 +1,278 @@ +-- String position/search function tests +-- POSITION function +SELECT POSITION('world' IN 'hello world'); + ++-------------------------------------------+ +| strpos(Utf8("hello world"),Utf8("world")) | ++-------------------------------------------+ +| 7 | ++-------------------------------------------+ + +SELECT POSITION('xyz' IN 'hello world'); + ++-----------------------------------------+ +| strpos(Utf8("hello world"),Utf8("xyz")) | ++-----------------------------------------+ +| 0 | 
++-----------------------------------------+ + +SELECT POSITION('' IN 'hello world'); + ++--------------------------------------+ +| strpos(Utf8("hello world"),Utf8("")) | ++--------------------------------------+ +| 1 | ++--------------------------------------+ + +SELECT POSITION('world' IN ''); + ++--------------------------------+ +| strpos(Utf8(""),Utf8("world")) | ++--------------------------------+ +| 0 | ++--------------------------------+ + +-- STRPOS function (same as POSITION) +SELECT STRPOS('hello world', 'world'); + ++-------------------------------------------+ +| strpos(Utf8("hello world"),Utf8("world")) | ++-------------------------------------------+ +| 7 | ++-------------------------------------------+ + +SELECT STRPOS('hello world', 'xyz'); + ++-----------------------------------------+ +| strpos(Utf8("hello world"),Utf8("xyz")) | ++-----------------------------------------+ +| 0 | ++-----------------------------------------+ + +SELECT STRPOS('hello world', 'hello'); + ++-------------------------------------------+ +| strpos(Utf8("hello world"),Utf8("hello")) | ++-------------------------------------------+ +| 1 | ++-------------------------------------------+ + +SELECT STRPOS('hello world', 'o'); + ++---------------------------------------+ +| strpos(Utf8("hello world"),Utf8("o")) | ++---------------------------------------+ +| 5 | ++---------------------------------------+ + +-- INSTR function +SELECT INSTR('hello world', 'world'); + ++------------------------------------------+ +| instr(Utf8("hello world"),Utf8("world")) | ++------------------------------------------+ +| 7 | ++------------------------------------------+ + +SELECT INSTR('hello world', 'o'); + ++--------------------------------------+ +| instr(Utf8("hello world"),Utf8("o")) | ++--------------------------------------+ +| 5 | ++--------------------------------------+ + +SELECT INSTR('hello world', 'xyz'); + ++----------------------------------------+ +| instr(Utf8("hello world"),Utf8("xyz")) | ++----------------------------------------+ +| 0 | ++----------------------------------------+ + +-- Case sensitive search +SELECT POSITION('WORLD' IN 'hello world'); + ++-------------------------------------------+ +| strpos(Utf8("hello world"),Utf8("WORLD")) | ++-------------------------------------------+ +| 0 | ++-------------------------------------------+ + +SELECT POSITION('World' IN 'hello world'); + ++-------------------------------------------+ +| strpos(Utf8("hello world"),Utf8("World")) | ++-------------------------------------------+ +| 0 | ++-------------------------------------------+ + +-- LEFT and RIGHT functions +SELECT LEFT('hello world', 5); + ++------------------------------------+ +| left(Utf8("hello world"),Int64(5)) | ++------------------------------------+ +| hello | ++------------------------------------+ + +SELECT RIGHT('hello world', 5); + ++-------------------------------------+ +| right(Utf8("hello world"),Int64(5)) | ++-------------------------------------+ +| world | ++-------------------------------------+ + +-- More than string length +SELECT LEFT('hello', 10); + ++-------------------------------+ +| left(Utf8("hello"),Int64(10)) | ++-------------------------------+ +| hello | ++-------------------------------+ + +-- More than string length +SELECT RIGHT('hello', 10); + ++--------------------------------+ +| right(Utf8("hello"),Int64(10)) | ++--------------------------------+ +| hello | ++--------------------------------+ + +-- Test with NULL values +SELECT POSITION('world' IN NULL); 
+ ++----------------------------+ +| strpos(NULL,Utf8("world")) | ++----------------------------+ +| | ++----------------------------+ + +SELECT POSITION(NULL IN 'hello world'); + ++----------------------------------+ +| strpos(Utf8("hello world"),NULL) | ++----------------------------------+ +| | ++----------------------------------+ + +SELECT LEFT(NULL, 5); + ++---------------------+ +| left(NULL,Int64(5)) | ++---------------------+ +| | ++---------------------+ + +SELECT RIGHT('hello', NULL); + ++---------------------------+ +| right(Utf8("hello"),NULL) | ++---------------------------+ +| | ++---------------------------+ + +-- Test with table data +CREATE TABLE position_test(s VARCHAR, "search" VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO position_test VALUES + ('hello world', 'world', 1000), + ('hello world', 'hello', 2000), + ('hello world', 'xyz', 3000), + ('programming', 'gram', 4000), + ('database', 'base', 5000); + +Affected Rows: 5 + +SELECT s, "search", POSITION("search" IN s) AS a, STRPOS(s, "search") AS b FROM position_test ORDER BY ts; + ++-------------+--------+---+---+ +| s | search | a | b | ++-------------+--------+---+---+ +| hello world | world | 7 | 7 | +| hello world | hello | 1 | 1 | +| hello world | xyz | 0 | 0 | +| programming | gram | 4 | 4 | +| database | base | 5 | 5 | ++-------------+--------+---+---+ + +-- Test LEFT and RIGHT with table data +SELECT s, LEFT(s, 5), RIGHT(s, 5) FROM position_test ORDER BY ts; + ++-------------+--------------------------------+---------------------------------+ +| s | left(position_test.s,Int64(5)) | right(position_test.s,Int64(5)) | ++-------------+--------------------------------+---------------------------------+ +| hello world | hello | world | +| hello world | hello | world | +| hello world | hello | world | +| programming | progr | mming | +| database | datab | abase | ++-------------+--------------------------------+---------------------------------+ + +-- Unicode position tests +SELECT POSITION('世' IN 'hello世界'); + ++--------------------------------------+ +| strpos(Utf8("hello世界"),Utf8("世")) | ++--------------------------------------+ +| 6 | ++--------------------------------------+ + +SELECT POSITION('界' IN 'hello世界'); + ++--------------------------------------+ +| strpos(Utf8("hello世界"),Utf8("界")) | ++--------------------------------------+ +| 7 | ++--------------------------------------+ + +SELECT STRPOS('café shop', 'é'); + ++-------------------------------------+ +| strpos(Utf8("café shop"),Utf8("é")) | ++-------------------------------------+ +| 4 | ++-------------------------------------+ + +SELECT LEFT('中文测试', 2); + ++---------------------------------+ +| left(Utf8("中文测试"),Int64(2)) | ++---------------------------------+ +| 中文 | ++---------------------------------+ + +SELECT RIGHT('中文测试', 2); + ++----------------------------------+ +| right(Utf8("中文测试"),Int64(2)) | ++----------------------------------+ +| 测试 | ++----------------------------------+ + +-- Multiple occurrences (finds first one) +SELECT POSITION('o' IN 'hello world'); + ++---------------------------------------+ +| strpos(Utf8("hello world"),Utf8("o")) | ++---------------------------------------+ +| 5 | ++---------------------------------------+ + +SELECT STRPOS('hello world', 'l'); + ++---------------------------------------+ +| strpos(Utf8("hello world"),Utf8("l")) | ++---------------------------------------+ +| 3 | ++---------------------------------------+ + +DROP TABLE position_test; + +Affected Rows: 0 + diff --git 
a/tests/cases/standalone/common/function/string/position.sql b/tests/cases/standalone/common/function/string/position.sql new file mode 100644 index 0000000000..519a9146d3 --- /dev/null +++ b/tests/cases/standalone/common/function/string/position.sql @@ -0,0 +1,84 @@ +-- String position/search function tests + +-- POSITION function +SELECT POSITION('world' IN 'hello world'); + +SELECT POSITION('xyz' IN 'hello world'); + +SELECT POSITION('' IN 'hello world'); + +SELECT POSITION('world' IN ''); + +-- STRPOS function (same as POSITION) +SELECT STRPOS('hello world', 'world'); + +SELECT STRPOS('hello world', 'xyz'); + +SELECT STRPOS('hello world', 'hello'); + +SELECT STRPOS('hello world', 'o'); + +-- INSTR function +SELECT INSTR('hello world', 'world'); + +SELECT INSTR('hello world', 'o'); + +SELECT INSTR('hello world', 'xyz'); + +-- Case sensitive search +SELECT POSITION('WORLD' IN 'hello world'); + +SELECT POSITION('World' IN 'hello world'); + +-- LEFT and RIGHT functions +SELECT LEFT('hello world', 5); + +SELECT RIGHT('hello world', 5); + +-- More than string length +SELECT LEFT('hello', 10); + +-- More than string length +SELECT RIGHT('hello', 10); + +-- Test with NULL values +SELECT POSITION('world' IN NULL); + +SELECT POSITION(NULL IN 'hello world'); + +SELECT LEFT(NULL, 5); + +SELECT RIGHT('hello', NULL); + +-- Test with table data +CREATE TABLE position_test(s VARCHAR, "search" VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO position_test VALUES + ('hello world', 'world', 1000), + ('hello world', 'hello', 2000), + ('hello world', 'xyz', 3000), + ('programming', 'gram', 4000), + ('database', 'base', 5000); + +SELECT s, "search", POSITION("search" IN s) AS a, STRPOS(s, "search") AS b FROM position_test ORDER BY ts; + +-- Test LEFT and RIGHT with table data +SELECT s, LEFT(s, 5), RIGHT(s, 5) FROM position_test ORDER BY ts; + +-- Unicode position tests +SELECT POSITION('世' IN 'hello世界'); + +SELECT POSITION('界' IN 'hello世界'); + +SELECT STRPOS('café shop', 'é'); + +SELECT LEFT('中文测试', 2); + +SELECT RIGHT('中文测试', 2); + +-- Multiple occurrences (finds first one) +SELECT POSITION('o' IN 'hello world'); + +SELECT STRPOS('hello world', 'l'); + +DROP TABLE position_test; diff --git a/tests/cases/standalone/common/function/string/regex.result b/tests/cases/standalone/common/function/string/regex.result new file mode 100644 index 0000000000..b7030f4346 --- /dev/null +++ b/tests/cases/standalone/common/function/string/regex.result @@ -0,0 +1,143 @@ +-- Regular expression function tests +-- REGEXP_MATCHES function +SELECT regexp_like('hello123world', '\d+'); + ++------------------------------------------------+ +| regexp_like(Utf8("hello123world"),Utf8("\d+")) | ++------------------------------------------------+ +| true | ++------------------------------------------------+ + +SELECT regexp_like('no numbers here', '\d+'); + ++--------------------------------------------------+ +| regexp_like(Utf8("no numbers here"),Utf8("\d+")) | ++--------------------------------------------------+ +| false | ++--------------------------------------------------+ + +SELECT regexp_like('email@example.com', '[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+'); + ++-------------------------------------------------------------------------------------+ +| regexp_like(Utf8("email@example.com"),Utf8("[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+")) | ++-------------------------------------------------------------------------------------+ +| true | ++-------------------------------------------------------------------------------------+ + +-- 
REGEXP_REPLACE function +SELECT REGEXP_REPLACE('hello123world', '\d+', 'XXX'); + ++---------------------------------------------------------------+ +| regexp_replace(Utf8("hello123world"),Utf8("\d+"),Utf8("XXX")) | ++---------------------------------------------------------------+ +| helloXXXworld | ++---------------------------------------------------------------+ + +SELECT REGEXP_REPLACE('phone: 123-456-7890', '\d{3}-\d{3}-\d{4}', 'XXX-XXX-XXXX'); + ++--------------------------------------------------------------------------------------------+ +| regexp_replace(Utf8("phone: 123-456-7890"),Utf8("\d{3}-\d{3}-\d{4}"),Utf8("XXX-XXX-XXXX")) | ++--------------------------------------------------------------------------------------------+ +| phone: XXX-XXX-XXXX | ++--------------------------------------------------------------------------------------------+ + +SELECT REGEXP_REPLACE(' extra spaces ', '\s+', ' '); + ++------------------------------------------------------------------+ +| regexp_replace(Utf8(" extra spaces "),Utf8("\s+"),Utf8(" ")) | ++------------------------------------------------------------------+ +| extra spaces | ++------------------------------------------------------------------+ + +-- REGEXP_EXTRACT function +SELECT REGEXP_EXTRACT('version 1.2.3', '\d+\.\d+\.\d+'); + ++-------------------------------------------------------------+ +| regexp_extract(Utf8("version 1.2.3"),Utf8("\d+\.\d+\.\d+")) | ++-------------------------------------------------------------+ +| 1.2.3 | ++-------------------------------------------------------------+ + +SELECT REGEXP_EXTRACT('no match here', '\d+\.\d+\.\d+'); + ++-------------------------------------------------------------+ +| regexp_extract(Utf8("no match here"),Utf8("\d+\.\d+\.\d+")) | ++-------------------------------------------------------------+ +| | ++-------------------------------------------------------------+ + +-- Test with ~ operator (regex match) +SELECT 'hello123' ~ '\d+'; + ++--------------------------------+ +| Utf8("hello123") ~ Utf8("\d+") | ++--------------------------------+ +| true | ++--------------------------------+ + +SELECT 'hello world' ~ '\d+'; + ++-----------------------------------+ +| Utf8("hello world") ~ Utf8("\d+") | ++-----------------------------------+ +| false | ++-----------------------------------+ + +SELECT 'email@example.com' ~ '[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+'; + ++--------------------------------------------------------------------------+ +| Utf8("email@example.com") ~ Utf8("[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+") | ++--------------------------------------------------------------------------+ +| true | ++--------------------------------------------------------------------------+ + +-- Test with table data +CREATE TABLE regex_test("text" VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO regex_test VALUES + ('Phone: 123-456-7890', 1000), + ('Email: user@domain.com', 2000), + ('Version 2.1.0', 3000), + ('No pattern here', 4000); + +Affected Rows: 4 + +SELECT "text", REGEXP_EXTRACT("text", '\d{3}-\d{3}-\d{4}') as phone FROM regex_test ORDER BY ts; + ++------------------------+--------------+ +| text | phone | ++------------------------+--------------+ +| Phone: 123-456-7890 | 123-456-7890 | +| Email: user@domain.com | | +| Version 2.1.0 | | +| No pattern here | | ++------------------------+--------------+ + +SELECT "text", REGEXP_EXTRACT("text", '[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+') as email FROM regex_test ORDER BY ts; + ++------------------------+-----------------+ +| 
text | email | ++------------------------+-----------------+ +| Phone: 123-456-7890 | | +| Email: user@domain.com | user@domain.com | +| Version 2.1.0 | | +| No pattern here | | ++------------------------+-----------------+ + +SELECT "text", REGEXP_EXTRACT("text", '\d+\.\d+\.\d+') as version FROM regex_test ORDER BY ts; + ++------------------------+---------+ +| text | version | ++------------------------+---------+ +| Phone: 123-456-7890 | | +| Email: user@domain.com | | +| Version 2.1.0 | 2.1.0 | +| No pattern here | | ++------------------------+---------+ + +DROP TABLE regex_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/regex.sql b/tests/cases/standalone/common/function/string/regex.sql new file mode 100644 index 0000000000..10e0cbcc4d --- /dev/null +++ b/tests/cases/standalone/common/function/string/regex.sql @@ -0,0 +1,44 @@ +-- Regular expression function tests + +-- REGEXP_MATCHES function +SELECT regexp_like('hello123world', '\d+'); + +SELECT regexp_like('no numbers here', '\d+'); + +SELECT regexp_like('email@example.com', '[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+'); + +-- REGEXP_REPLACE function +SELECT REGEXP_REPLACE('hello123world', '\d+', 'XXX'); + +SELECT REGEXP_REPLACE('phone: 123-456-7890', '\d{3}-\d{3}-\d{4}', 'XXX-XXX-XXXX'); + +SELECT REGEXP_REPLACE(' extra spaces ', '\s+', ' '); + +-- REGEXP_EXTRACT function +SELECT REGEXP_EXTRACT('version 1.2.3', '\d+\.\d+\.\d+'); + +SELECT REGEXP_EXTRACT('no match here', '\d+\.\d+\.\d+'); + +-- Test with ~ operator (regex match) +SELECT 'hello123' ~ '\d+'; + +SELECT 'hello world' ~ '\d+'; + +SELECT 'email@example.com' ~ '[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+'; + +-- Test with table data +CREATE TABLE regex_test("text" VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO regex_test VALUES + ('Phone: 123-456-7890', 1000), + ('Email: user@domain.com', 2000), + ('Version 2.1.0', 3000), + ('No pattern here', 4000); + +SELECT "text", REGEXP_EXTRACT("text", '\d{3}-\d{3}-\d{4}') as phone FROM regex_test ORDER BY ts; + +SELECT "text", REGEXP_EXTRACT("text", '[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+') as email FROM regex_test ORDER BY ts; + +SELECT "text", REGEXP_EXTRACT("text", '\d+\.\d+\.\d+') as version FROM regex_test ORDER BY ts; + +DROP TABLE regex_test; diff --git a/tests/cases/standalone/common/function/string/repeat.result b/tests/cases/standalone/common/function/string/repeat.result new file mode 100644 index 0000000000..32ecc614d1 --- /dev/null +++ b/tests/cases/standalone/common/function/string/repeat.result @@ -0,0 +1,217 @@ +-- String REPEAT function tests +-- Basic REPEAT function +SELECT REPEAT('hello', 3); + ++--------------------------------+ +| repeat(Utf8("hello"),Int64(3)) | ++--------------------------------+ +| hellohellohello | ++--------------------------------+ + +SELECT REPEAT('a', 5); + ++----------------------------+ +| repeat(Utf8("a"),Int64(5)) | ++----------------------------+ +| aaaaa | ++----------------------------+ + +SELECT REPEAT('', 3); + ++---------------------------+ +| repeat(Utf8(""),Int64(3)) | ++---------------------------+ +| | ++---------------------------+ + +SELECT REPEAT('test', 0); + ++-------------------------------+ +| repeat(Utf8("test"),Int64(0)) | ++-------------------------------+ +| | ++-------------------------------+ + +SELECT REPEAT('test', 1); + ++-------------------------------+ +| repeat(Utf8("test"),Int64(1)) | ++-------------------------------+ +| test | ++-------------------------------+ + +-- REPEAT with NULL values +SELECT REPEAT(NULL, 3); + 
++-----------------------+ +| repeat(NULL,Int64(3)) | ++-----------------------+ +| | ++-----------------------+ + +SELECT REPEAT('hello', NULL); + ++----------------------------+ +| repeat(Utf8("hello"),NULL) | ++----------------------------+ +| | ++----------------------------+ + +-- REPEAT with negative numbers +SELECT REPEAT('hello', -1); + ++---------------------------------+ +| repeat(Utf8("hello"),Int64(-1)) | ++---------------------------------+ +| | ++---------------------------------+ + +-- REPEAT with special characters +SELECT REPEAT('*', 10); + ++-----------------------------+ +| repeat(Utf8("*"),Int64(10)) | ++-----------------------------+ +| ********** | ++-----------------------------+ + +SELECT REPEAT('-=', 5); + ++-----------------------------+ +| repeat(Utf8("-="),Int64(5)) | ++-----------------------------+ +| -=-=-=-=-= | ++-----------------------------+ + +SELECT REPEAT('!@#', 3); + ++------------------------------+ +| repeat(Utf8("!@#"),Int64(3)) | ++------------------------------+ +| !@#!@#!@# | ++------------------------------+ + +-- Test with table data +CREATE TABLE repeat_test(s VARCHAR, n INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO repeat_test VALUES + ('hello', 2, 1000), + ('*', 5, 2000), + ('test', 0, 3000), + ('a', 10, 4000), + (NULL, 3, 5000), + ('hi', NULL, 6000); + +Affected Rows: 6 + +SELECT s, n, REPEAT(s, n) FROM repeat_test ORDER BY ts; + ++-------+----+-------------------------------------+ +| s | n | repeat(repeat_test.s,repeat_test.n) | ++-------+----+-------------------------------------+ +| hello | 2 | hellohello | +| * | 5 | ***** | +| test | 0 | | +| a | 10 | aaaaaaaaaa | +| | 3 | | +| hi | | | ++-------+----+-------------------------------------+ + +-- Unicode REPEAT +SELECT REPEAT('世', 3); + ++-----------------------------+ +| repeat(Utf8("世"),Int64(3)) | ++-----------------------------+ +| 世世世 | ++-----------------------------+ + +SELECT REPEAT('🚀', 5); + ++-----------------------------+ +| repeat(Utf8("🚀"),Int64(5)) | ++-----------------------------+ +| 🚀🚀🚀🚀🚀 | ++-----------------------------+ + +SELECT REPEAT('café', 2); + ++-------------------------------+ +| repeat(Utf8("café"),Int64(2)) | ++-------------------------------+ +| cafécafé | ++-------------------------------+ + +-- REPEAT with spaces and formatting +SELECT REPEAT(' ', 10); + ++-----------------------------+ +| repeat(Utf8(" "),Int64(10)) | ++-----------------------------+ +| | ++-----------------------------+ + +SELECT REPEAT('\t', 3); + ++-----------------------------+ +| repeat(Utf8("\t"),Int64(3)) | ++-----------------------------+ +| \t\t\t | ++-----------------------------+ + +SELECT CONCAT('Start', REPEAT('-', 10), 'End'); + ++---------------------------------------------------------------+ +| concat(Utf8("Start"),repeat(Utf8("-"),Int64(10)),Utf8("End")) | ++---------------------------------------------------------------+ +| Start----------End | ++---------------------------------------------------------------+ + +-- Large REPEAT operations +SELECT LENGTH(REPEAT('a', 100)); + ++---------------------------------------+ +| length(repeat(Utf8("a"), Int64(100))) | ++---------------------------------------+ +| 100 | ++---------------------------------------+ + +SELECT LENGTH(REPEAT('ab', 50)); + ++---------------------------------------+ +| length(repeat(Utf8("ab"), Int64(50))) | ++---------------------------------------+ +| 100 | ++---------------------------------------+ + +-- Combining REPEAT with other functions +SELECT UPPER(REPEAT('hello', 
3)); + ++---------------------------------------+ +| upper(repeat(Utf8("hello"),Int64(3))) | ++---------------------------------------+ +| HELLOHELLOHELLO | ++---------------------------------------+ + +SELECT REPEAT(UPPER('hello'), 2); + ++---------------------------------------+ +| repeat(upper(Utf8("hello")),Int64(2)) | ++---------------------------------------+ +| HELLOHELLO | ++---------------------------------------+ + +SELECT REVERSE(REPEAT('abc', 3)); + ++---------------------------------------+ +| reverse(repeat(Utf8("abc"),Int64(3))) | ++---------------------------------------+ +| cbacbacba | ++---------------------------------------+ + +DROP TABLE repeat_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/repeat.sql b/tests/cases/standalone/common/function/string/repeat.sql new file mode 100644 index 0000000000..6e75de8113 --- /dev/null +++ b/tests/cases/standalone/common/function/string/repeat.sql @@ -0,0 +1,68 @@ +-- String REPEAT function tests + +-- Basic REPEAT function +SELECT REPEAT('hello', 3); + +SELECT REPEAT('a', 5); + +SELECT REPEAT('', 3); + +SELECT REPEAT('test', 0); + +SELECT REPEAT('test', 1); + +-- REPEAT with NULL values +SELECT REPEAT(NULL, 3); + +SELECT REPEAT('hello', NULL); + +-- REPEAT with negative numbers +SELECT REPEAT('hello', -1); + +-- REPEAT with special characters +SELECT REPEAT('*', 10); + +SELECT REPEAT('-=', 5); + +SELECT REPEAT('!@#', 3); + +-- Test with table data +CREATE TABLE repeat_test(s VARCHAR, n INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO repeat_test VALUES + ('hello', 2, 1000), + ('*', 5, 2000), + ('test', 0, 3000), + ('a', 10, 4000), + (NULL, 3, 5000), + ('hi', NULL, 6000); + +SELECT s, n, REPEAT(s, n) FROM repeat_test ORDER BY ts; + +-- Unicode REPEAT +SELECT REPEAT('世', 3); + +SELECT REPEAT('🚀', 5); + +SELECT REPEAT('café', 2); + +-- REPEAT with spaces and formatting +SELECT REPEAT(' ', 10); + +SELECT REPEAT('\t', 3); + +SELECT CONCAT('Start', REPEAT('-', 10), 'End'); + +-- Large REPEAT operations +SELECT LENGTH(REPEAT('a', 100)); + +SELECT LENGTH(REPEAT('ab', 50)); + +-- Combining REPEAT with other functions +SELECT UPPER(REPEAT('hello', 3)); + +SELECT REPEAT(UPPER('hello'), 2); + +SELECT REVERSE(REPEAT('abc', 3)); + +DROP TABLE repeat_test; diff --git a/tests/cases/standalone/common/function/string/replace.result b/tests/cases/standalone/common/function/string/replace.result new file mode 100644 index 0000000000..a4e1790d34 --- /dev/null +++ b/tests/cases/standalone/common/function/string/replace.result @@ -0,0 +1,180 @@ +-- String REPLACE function tests +-- Basic REPLACE function +SELECT REPLACE('hello world', 'world', 'universe'); + ++-------------------------------------------------------------+ +| replace(Utf8("hello world"),Utf8("world"),Utf8("universe")) | ++-------------------------------------------------------------+ +| hello universe | ++-------------------------------------------------------------+ + +SELECT REPLACE('hello world', 'xyz', 'abc'); + ++------------------------------------------------------+ +| replace(Utf8("hello world"),Utf8("xyz"),Utf8("abc")) | ++------------------------------------------------------+ +| hello world | ++------------------------------------------------------+ + +SELECT REPLACE('hello hello hello', 'hello', 'hi'); + ++-------------------------------------------------------------+ +| replace(Utf8("hello hello hello"),Utf8("hello"),Utf8("hi")) | ++-------------------------------------------------------------+ +| hi hi hi | 
++-------------------------------------------------------------+ + +-- REPLACE with empty strings +SELECT REPLACE('hello world', 'world', ''); + ++-----------------------------------------------------+ +| replace(Utf8("hello world"),Utf8("world"),Utf8("")) | ++-----------------------------------------------------+ +| hello | ++-----------------------------------------------------+ + +SELECT REPLACE('hello world', '', 'xyz'); + ++---------------------------------------------------+ +| replace(Utf8("hello world"),Utf8(""),Utf8("xyz")) | ++---------------------------------------------------+ +| xyzhxyzexyzlxyzlxyzoxyz xyzwxyzoxyzrxyzlxyzdxyz | ++---------------------------------------------------+ + +SELECT REPLACE('', 'xyz', 'abc'); + ++-------------------------------------------+ +| replace(Utf8(""),Utf8("xyz"),Utf8("abc")) | ++-------------------------------------------+ +| | ++-------------------------------------------+ + +-- Case sensitive replacement +SELECT REPLACE('Hello World', 'hello', 'hi'); + ++-------------------------------------------------------+ +| replace(Utf8("Hello World"),Utf8("hello"),Utf8("hi")) | ++-------------------------------------------------------+ +| Hello World | ++-------------------------------------------------------+ + +SELECT REPLACE('Hello World', 'Hello', 'Hi'); + ++-------------------------------------------------------+ +| replace(Utf8("Hello World"),Utf8("Hello"),Utf8("Hi")) | ++-------------------------------------------------------+ +| Hi World | ++-------------------------------------------------------+ + +-- NULL handling +SELECT REPLACE(NULL, 'world', 'universe'); + ++----------------------------------------------+ +| replace(NULL,Utf8("world"),Utf8("universe")) | ++----------------------------------------------+ +| | ++----------------------------------------------+ + +SELECT REPLACE('hello world', NULL, 'universe'); + ++----------------------------------------------------+ +| replace(Utf8("hello world"),NULL,Utf8("universe")) | ++----------------------------------------------------+ +| | ++----------------------------------------------------+ + +SELECT REPLACE('hello world', 'world', NULL); + ++-------------------------------------------------+ +| replace(Utf8("hello world"),Utf8("world"),NULL) | ++-------------------------------------------------+ +| | ++-------------------------------------------------+ + +-- Test with table data +CREATE TABLE replace_test(s VARCHAR, old_str VARCHAR, new_str VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO replace_test VALUES + ('hello world', 'world', 'universe', 1000), + ('programming language', 'language', 'paradigm', 2000), + ('test test test', 'test', 'exam', 3000), + ('no match here', 'xyz', 'abc', 4000); + +Affected Rows: 4 + +SELECT s, old_str, new_str, REPLACE(s, old_str, new_str) FROM replace_test ORDER BY ts; + ++----------------------+----------+----------+-------------------------------------------------------------------+ +| s | old_str | new_str | replace(replace_test.s,replace_test.old_str,replace_test.new_str) | ++----------------------+----------+----------+-------------------------------------------------------------------+ +| hello world | world | universe | hello universe | +| programming language | language | paradigm | programming paradigm | +| test test test | test | exam | exam exam exam | +| no match here | xyz | abc | no match here | ++----------------------+----------+----------+-------------------------------------------------------------------+ + +-- Unicode 
replacement +SELECT REPLACE('hello 世界', '世界', 'world'); + ++--------------------------------------------------------+ +| replace(Utf8("hello 世界"),Utf8("世界"),Utf8("world")) | ++--------------------------------------------------------+ +| hello world | ++--------------------------------------------------------+ + +SELECT REPLACE('café shop', 'é', 'e'); + ++------------------------------------------------+ +| replace(Utf8("café shop"),Utf8("é"),Utf8("e")) | ++------------------------------------------------+ +| cafe shop | ++------------------------------------------------+ + +SELECT REPLACE('🚀 rocket 🚀', '🚀', '✈️'); + ++-----------------------------------------------------+ +| replace(Utf8("🚀 rocket 🚀"),Utf8("🚀"),Utf8("✈️")) | ++-----------------------------------------------------+ +| ✈️ rocket ✈️ | ++-----------------------------------------------------+ + +-- Multiple character replacement +SELECT REPLACE('hello-world-test', '-', '_'); + ++-------------------------------------------------------+ +| replace(Utf8("hello-world-test"),Utf8("-"),Utf8("_")) | ++-------------------------------------------------------+ +| hello_world_test | ++-------------------------------------------------------+ + +SELECT REPLACE('abc::def::ghi', '::', '-->'); + ++-------------------------------------------------------+ +| replace(Utf8("abc::def::ghi"),Utf8("::"),Utf8("-->")) | ++-------------------------------------------------------+ +| abc-->def-->ghi | ++-------------------------------------------------------+ + +-- Overlapping patterns +SELECT REPLACE('ababab', 'ab', 'xy'); + ++-----------------------------------------------+ +| replace(Utf8("ababab"),Utf8("ab"),Utf8("xy")) | ++-----------------------------------------------+ +| xyxyxy | ++-----------------------------------------------+ + +SELECT REPLACE('aaa', 'aa', 'b'); + ++-------------------------------------------+ +| replace(Utf8("aaa"),Utf8("aa"),Utf8("b")) | ++-------------------------------------------+ +| ba | ++-------------------------------------------+ + +DROP TABLE replace_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/replace.sql b/tests/cases/standalone/common/function/string/replace.sql new file mode 100644 index 0000000000..20006ae7c8 --- /dev/null +++ b/tests/cases/standalone/common/function/string/replace.sql @@ -0,0 +1,57 @@ +-- String REPLACE function tests + +-- Basic REPLACE function +SELECT REPLACE('hello world', 'world', 'universe'); + +SELECT REPLACE('hello world', 'xyz', 'abc'); + +SELECT REPLACE('hello hello hello', 'hello', 'hi'); + +-- REPLACE with empty strings +SELECT REPLACE('hello world', 'world', ''); + +SELECT REPLACE('hello world', '', 'xyz'); + +SELECT REPLACE('', 'xyz', 'abc'); + +-- Case sensitive replacement +SELECT REPLACE('Hello World', 'hello', 'hi'); + +SELECT REPLACE('Hello World', 'Hello', 'Hi'); + +-- NULL handling +SELECT REPLACE(NULL, 'world', 'universe'); + +SELECT REPLACE('hello world', NULL, 'universe'); + +SELECT REPLACE('hello world', 'world', NULL); + +-- Test with table data +CREATE TABLE replace_test(s VARCHAR, old_str VARCHAR, new_str VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO replace_test VALUES + ('hello world', 'world', 'universe', 1000), + ('programming language', 'language', 'paradigm', 2000), + ('test test test', 'test', 'exam', 3000), + ('no match here', 'xyz', 'abc', 4000); + +SELECT s, old_str, new_str, REPLACE(s, old_str, new_str) FROM replace_test ORDER BY ts; + +-- Unicode replacement +SELECT REPLACE('hello 世界', '世界', 'world'); + 
+SELECT REPLACE('café shop', 'é', 'e'); + +SELECT REPLACE('🚀 rocket 🚀', '🚀', '✈️'); + +-- Multiple character replacement +SELECT REPLACE('hello-world-test', '-', '_'); + +SELECT REPLACE('abc::def::ghi', '::', '-->'); + +-- Overlapping patterns +SELECT REPLACE('ababab', 'ab', 'xy'); + +SELECT REPLACE('aaa', 'aa', 'b'); + +DROP TABLE replace_test; diff --git a/tests/cases/standalone/common/function/string/reverse.result b/tests/cases/standalone/common/function/string/reverse.result new file mode 100644 index 0000000000..00bd73f49b --- /dev/null +++ b/tests/cases/standalone/common/function/string/reverse.result @@ -0,0 +1,200 @@ +-- String REVERSE function tests +-- Basic REVERSE function +SELECT REVERSE('hello'); + ++------------------------+ +| reverse(Utf8("hello")) | ++------------------------+ +| olleh | ++------------------------+ + +SELECT REVERSE('world'); + ++------------------------+ +| reverse(Utf8("world")) | ++------------------------+ +| dlrow | ++------------------------+ + +SELECT REVERSE(''); + ++-------------------+ +| reverse(Utf8("")) | ++-------------------+ +| | ++-------------------+ + +SELECT REVERSE(NULL); + ++---------------+ +| reverse(NULL) | ++---------------+ +| | ++---------------+ + +-- REVERSE with numbers and special characters +SELECT REVERSE('12345'); + ++------------------------+ +| reverse(Utf8("12345")) | ++------------------------+ +| 54321 | ++------------------------+ + +SELECT REVERSE('hello!'); + ++-------------------------+ +| reverse(Utf8("hello!")) | ++-------------------------+ +| !olleh | ++-------------------------+ + +SELECT REVERSE('a!@#$%b'); + ++--------------------------+ +| reverse(Utf8("a!@#$%b")) | ++--------------------------+ +| b%$#@!a | ++--------------------------+ + +-- REVERSE with palindromes +SELECT REVERSE('radar'); + ++------------------------+ +| reverse(Utf8("radar")) | ++------------------------+ +| radar | ++------------------------+ + +SELECT REVERSE('madam'); + ++------------------------+ +| reverse(Utf8("madam")) | ++------------------------+ +| madam | ++------------------------+ + +SELECT REVERSE('racecar'); + ++--------------------------+ +| reverse(Utf8("racecar")) | ++--------------------------+ +| racecar | ++--------------------------+ + +-- Test with table data +CREATE TABLE reverse_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO reverse_test VALUES + ('hello', 1000), + ('world', 2000), + ('12345', 3000), + ('radar', 4000), + ('', 5000), + (NULL, 6000); + +Affected Rows: 6 + +SELECT s, REVERSE(s) FROM reverse_test ORDER BY ts; + ++-------+-------------------------+ +| s | reverse(reverse_test.s) | ++-------+-------------------------+ +| hello | olleh | +| world | dlrow | +| 12345 | 54321 | +| radar | radar | +| | | +| | | ++-------+-------------------------+ + +-- Unicode REVERSE +SELECT REVERSE('世界'); + ++-----------------------+ +| reverse(Utf8("世界")) | ++-----------------------+ +| 界世 | ++-----------------------+ + +SELECT REVERSE('café'); + ++-----------------------+ +| reverse(Utf8("café")) | ++-----------------------+ +| éfac | ++-----------------------+ + +SELECT REVERSE('🚀🌟'); + ++-----------------------+ +| reverse(Utf8("🚀🌟")) | ++-----------------------+ +| 🌟🚀 | ++-----------------------+ + +-- REVERSE with spaces +SELECT REVERSE('hello world'); + ++------------------------------+ +| reverse(Utf8("hello world")) | ++------------------------------+ +| dlrow olleh | ++------------------------------+ + +SELECT REVERSE(' spaces '); + ++-----------------------------+ +| 
reverse(Utf8(" spaces ")) | ++-----------------------------+ +| secaps | ++-----------------------------+ + +-- Combining REVERSE with other functions +SELECT UPPER(REVERSE('hello')); + ++-------------------------------+ +| upper(reverse(Utf8("hello"))) | ++-------------------------------+ +| OLLEH | ++-------------------------------+ + +SELECT REVERSE(UPPER('hello')); + ++-------------------------------+ +| reverse(upper(Utf8("hello"))) | ++-------------------------------+ +| OLLEH | ++-------------------------------+ + +SELECT LENGTH(REVERSE('hello world')); + ++--------------------------------------+ +| length(reverse(Utf8("hello world"))) | ++--------------------------------------+ +| 11 | ++--------------------------------------+ + +-- Double REVERSE (should return original) +SELECT REVERSE(REVERSE('hello world')); + ++---------------------------------------+ +| reverse(reverse(Utf8("hello world"))) | ++---------------------------------------+ +| hello world | ++---------------------------------------+ + +SELECT REVERSE(REVERSE('中文测试')); + ++------------------------------------+ +| reverse(reverse(Utf8("中文测试"))) | ++------------------------------------+ +| 中文测试 | ++------------------------------------+ + +DROP TABLE reverse_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/reverse.sql b/tests/cases/standalone/common/function/string/reverse.sql new file mode 100644 index 0000000000..f33f359f73 --- /dev/null +++ b/tests/cases/standalone/common/function/string/reverse.sql @@ -0,0 +1,63 @@ +-- String REVERSE function tests + +-- Basic REVERSE function +SELECT REVERSE('hello'); + +SELECT REVERSE('world'); + +SELECT REVERSE(''); + +SELECT REVERSE(NULL); + +-- REVERSE with numbers and special characters +SELECT REVERSE('12345'); + +SELECT REVERSE('hello!'); + +SELECT REVERSE('a!@#$%b'); + +-- REVERSE with palindromes +SELECT REVERSE('radar'); + +SELECT REVERSE('madam'); + +SELECT REVERSE('racecar'); + +-- Test with table data +CREATE TABLE reverse_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO reverse_test VALUES + ('hello', 1000), + ('world', 2000), + ('12345', 3000), + ('radar', 4000), + ('', 5000), + (NULL, 6000); + +SELECT s, REVERSE(s) FROM reverse_test ORDER BY ts; + +-- Unicode REVERSE +SELECT REVERSE('世界'); + +SELECT REVERSE('café'); + +SELECT REVERSE('🚀🌟'); + +-- REVERSE with spaces +SELECT REVERSE('hello world'); + +SELECT REVERSE(' spaces '); + +-- Combining REVERSE with other functions +SELECT UPPER(REVERSE('hello')); + +SELECT REVERSE(UPPER('hello')); + +SELECT LENGTH(REVERSE('hello world')); + +-- Double REVERSE (should return original) +SELECT REVERSE(REVERSE('hello world')); + +SELECT REVERSE(REVERSE('中文测试')); + +DROP TABLE reverse_test; diff --git a/tests/cases/standalone/common/function/string/string_split.result b/tests/cases/standalone/common/function/string/string_split.result new file mode 100644 index 0000000000..d67adc0a0a --- /dev/null +++ b/tests/cases/standalone/common/function/string/string_split.result @@ -0,0 +1,213 @@ +-- Migrated from DuckDB test: test/sql/function/string/test_string_split.test +-- String split function tests +-- Test basic string_split functionality +SELECT string_to_array(NULL, NULL); + ++----------------------------+ +| string_to_array(NULL,NULL) | ++----------------------------+ +| | ++----------------------------+ + +SELECT string_to_array('hello world', ' '); + ++------------------------------------------------+ +| string_to_array(Utf8("hello world"),Utf8(" ")) | 
++------------------------------------------------+ +| [hello, world] | ++------------------------------------------------+ + +SELECT string_to_array(NULL, ' '); + ++---------------------------------+ +| string_to_array(NULL,Utf8(" ")) | ++---------------------------------+ +| | ++---------------------------------+ + +SELECT string_to_array('a b c', NULL); + ++-------------------------------------+ +| string_to_array(Utf8("a b c"),NULL) | ++-------------------------------------+ +| [a, , b, , c] | ++-------------------------------------+ + +SELECT string_to_array('a b c', ' '); + ++------------------------------------------+ +| string_to_array(Utf8("a b c"),Utf8(" ")) | ++------------------------------------------+ +| [a, b, c] | ++------------------------------------------+ + +-- Test with table data +CREATE TABLE split_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO split_test VALUES + ('hello,world,test', 1000), + ('a|b|c|d', 2000), + ('no-separator', 3000), + ('', 4000), + (NULL, 5000); + +Affected Rows: 5 + +-- Test splitting with different separators +SELECT s, string_to_array(s, ',') FROM split_test ORDER BY ts; + ++------------------+-----------------------------------------+ +| s | string_to_array(split_test.s,Utf8(",")) | ++------------------+-----------------------------------------+ +| hello,world,test | [hello, world, test] | +| a|b|c|d | [a|b|c|d] | +| no-separator | [no-separator] | +| | [] | +| | | ++------------------+-----------------------------------------+ + +SELECT s, string_to_array(s, '|') FROM split_test ORDER BY ts; + ++------------------+-----------------------------------------+ +| s | string_to_array(split_test.s,Utf8("|")) | ++------------------+-----------------------------------------+ +| hello,world,test | [hello,world,test] | +| a|b|c|d | [a, b, c, d] | +| no-separator | [no-separator] | +| | [] | +| | | ++------------------+-----------------------------------------+ + +SELECT s, string_to_array(s, '-') FROM split_test ORDER BY ts; + ++------------------+-----------------------------------------+ +| s | string_to_array(split_test.s,Utf8("-")) | ++------------------+-----------------------------------------+ +| hello,world,test | [hello,world,test] | +| a|b|c|d | [a|b|c|d] | +| no-separator | [no, separator] | +| | [] | +| | | ++------------------+-----------------------------------------+ + +-- Test splitting with multi-character separator +CREATE TABLE multi_sep_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO multi_sep_test VALUES + ('hello::world::test', 1000), + ('a---b---c', 2000), + ('single', 3000); + +Affected Rows: 3 + +SELECT s, string_to_array(s, '::') FROM multi_sep_test ORDER BY ts; + ++--------------------+----------------------------------------------+ +| s | string_to_array(multi_sep_test.s,Utf8("::")) | ++--------------------+----------------------------------------------+ +| hello::world::test | [hello, world, test] | +| a---b---c | [a---b---c] | +| single | [single] | ++--------------------+----------------------------------------------+ + +SELECT s, string_to_array(s, '---') FROM multi_sep_test ORDER BY ts; + ++--------------------+-----------------------------------------------+ +| s | string_to_array(multi_sep_test.s,Utf8("---")) | ++--------------------+-----------------------------------------------+ +| hello::world::test | [hello::world::test] | +| a---b---c | [a, b, c] | +| single | [single] | ++--------------------+-----------------------------------------------+ + +-- Test 
with Unicode separators +CREATE TABLE unicode_split_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO unicode_split_test VALUES + ('hello世world世test', 1000), + ('a🦆b🦆c', 2000); + +Affected Rows: 2 + +SELECT s, string_to_array(s, '世') FROM unicode_split_test ORDER BY ts; + ++--------------------+--------------------------------------------------+ +| s | string_to_array(unicode_split_test.s,Utf8("世")) | ++--------------------+--------------------------------------------------+ +| hello世world世test | [hello, world, test] | +| a🦆b🦆c | [a🦆b🦆c] | ++--------------------+--------------------------------------------------+ + +SELECT s, string_to_array(s, '🦆') FROM unicode_split_test ORDER BY ts; + ++--------------------+--------------------------------------------------+ +| s | string_to_array(unicode_split_test.s,Utf8("🦆")) | ++--------------------+--------------------------------------------------+ +| hello世world世test | [hello世world世test] | +| a🦆b🦆c | [a, b, c] | ++--------------------+--------------------------------------------------+ + +-- Test edge cases +-- Empty string +SELECT string_to_array('', ','); + ++-------------------------------------+ +| string_to_array(Utf8(""),Utf8(",")) | ++-------------------------------------+ +| [] | ++-------------------------------------+ + +-- Empty separator +SELECT string_to_array('hello', ''); + ++-----------------------------------------+ +| string_to_array(Utf8("hello"),Utf8("")) | ++-----------------------------------------+ +| [hello] | ++-----------------------------------------+ + +-- Multiple consecutive separators +SELECT string_to_array(',,hello,,world,,', ','); + ++-----------------------------------------------------+ +| string_to_array(Utf8(",,hello,,world,,"),Utf8(",")) | ++-----------------------------------------------------+ +| [, , hello, , world, , ] | ++-----------------------------------------------------+ + +-- Trailing separator +SELECT string_to_array('hello,', ','); + ++-------------------------------------------+ +| string_to_array(Utf8("hello,"),Utf8(",")) | ++-------------------------------------------+ +| [hello, ] | ++-------------------------------------------+ + +-- Leading separator +SELECT string_to_array(',hello', ','); + ++-------------------------------------------+ +| string_to_array(Utf8(",hello"),Utf8(",")) | ++-------------------------------------------+ +| [, hello] | ++-------------------------------------------+ + +DROP TABLE split_test; + +Affected Rows: 0 + +DROP TABLE multi_sep_test; + +Affected Rows: 0 + +DROP TABLE unicode_split_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/string_split.sql b/tests/cases/standalone/common/function/string/string_split.sql new file mode 100644 index 0000000000..ef0be5fff5 --- /dev/null +++ b/tests/cases/standalone/common/function/string/string_split.sql @@ -0,0 +1,75 @@ +-- Migrated from DuckDB test: test/sql/function/string/test_string_split.test +-- String split function tests + +-- Test basic string_split functionality +SELECT string_to_array(NULL, NULL); + +SELECT string_to_array('hello world', ' '); + +SELECT string_to_array(NULL, ' '); + +SELECT string_to_array('a b c', NULL); + +SELECT string_to_array('a b c', ' '); + +-- Test with table data +CREATE TABLE split_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO split_test VALUES + ('hello,world,test', 1000), + ('a|b|c|d', 2000), + ('no-separator', 3000), + ('', 4000), + (NULL, 5000); + +-- Test splitting with different separators +SELECT s, 
string_to_array(s, ',') FROM split_test ORDER BY ts; + +SELECT s, string_to_array(s, '|') FROM split_test ORDER BY ts; + +SELECT s, string_to_array(s, '-') FROM split_test ORDER BY ts; + +-- Test splitting with multi-character separator +CREATE TABLE multi_sep_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO multi_sep_test VALUES + ('hello::world::test', 1000), + ('a---b---c', 2000), + ('single', 3000); + +SELECT s, string_to_array(s, '::') FROM multi_sep_test ORDER BY ts; + +SELECT s, string_to_array(s, '---') FROM multi_sep_test ORDER BY ts; + +-- Test with Unicode separators +CREATE TABLE unicode_split_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO unicode_split_test VALUES + ('hello世world世test', 1000), + ('a🦆b🦆c', 2000); + +SELECT s, string_to_array(s, '世') FROM unicode_split_test ORDER BY ts; + +SELECT s, string_to_array(s, '🦆') FROM unicode_split_test ORDER BY ts; + +-- Test edge cases +-- Empty string +SELECT string_to_array('', ','); + +-- Empty separator +SELECT string_to_array('hello', ''); + +-- Multiple consecutive separators +SELECT string_to_array(',,hello,,world,,', ','); + +-- Trailing separator +SELECT string_to_array('hello,', ','); + +-- Leading separator +SELECT string_to_array(',hello', ','); + +DROP TABLE split_test; + +DROP TABLE multi_sep_test; + +DROP TABLE unicode_split_test; diff --git a/tests/cases/standalone/common/function/string/substring.result b/tests/cases/standalone/common/function/string/substring.result new file mode 100644 index 0000000000..642571084a --- /dev/null +++ b/tests/cases/standalone/common/function/string/substring.result @@ -0,0 +1,173 @@ +-- Migrated from DuckDB test: test/sql/function/string/test_substring.test +-- Substring function tests +CREATE TABLE strings(s VARCHAR, "off" INTEGER, length INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO strings VALUES + ('hello', 1, 2, 1000), + ('world', 2, 3, 2000), + ('b', 1, 1, 3000), + (NULL, 2, 2, 4000); + +Affected Rows: 4 + +-- Test zero length +SELECT SUBSTRING('🦆ab', 1, 0), SUBSTRING('abc', 1, 0); + ++----------------------------------------+---------------------------------------+ +| substr(Utf8("🦆ab"),Int64(1),Int64(0)) | substr(Utf8("abc"),Int64(1),Int64(0)) | ++----------------------------------------+---------------------------------------+ +| | | ++----------------------------------------+---------------------------------------+ + +-- Normal substring with constant offset/length +SELECT SUBSTRING(s, 1, 2) FROM strings ORDER BY ts; + ++-------------------------------------+ +| substr(strings.s,Int64(1),Int64(2)) | ++-------------------------------------+ +| he | +| wo | +| b | +| | ++-------------------------------------+ + +-- Substring out of range +SELECT SUBSTRING(s, 2, 2) FROM strings ORDER BY ts; + ++-------------------------------------+ +| substr(strings.s,Int64(2),Int64(2)) | ++-------------------------------------+ +| el | +| or | +| | +| | ++-------------------------------------+ + +-- Variable length offset/length +SELECT SUBSTRING(s, "off", "length") FROM strings ORDER BY ts; + ++----------------------------------------------+ +| substr(strings.s,strings.off,strings.length) | ++----------------------------------------------+ +| he | +| orl | +| b | +| | ++----------------------------------------------+ + +SELECT SUBSTRING(s, "off", 2) FROM strings ORDER BY ts; + ++----------------------------------------+ +| substr(strings.s,strings.off,Int64(2)) | ++----------------------------------------+ +| he | +| or | +| b | +| | 
++----------------------------------------+ + +SELECT SUBSTRING(s, 1, length) FROM strings ORDER BY ts; + ++-------------------------------------------+ +| substr(strings.s,Int64(1),strings.length) | ++-------------------------------------------+ +| he | +| wor | +| b | +| | ++-------------------------------------------+ + +SELECT SUBSTRING('hello', "off", length) FROM strings ORDER BY ts; + ++--------------------------------------------------+ +| substr(Utf8("hello"),strings.off,strings.length) | ++--------------------------------------------------+ +| he | +| ell | +| h | +| el | ++--------------------------------------------------+ + +-- Test with NULL values +SELECT SUBSTRING(NULL, "off", length) FROM strings ORDER BY ts; + ++-----------------------------------------+ +| substr(NULL,strings.off,strings.length) | ++-----------------------------------------+ +| | +| | +| | +| | ++-----------------------------------------+ + +SELECT SUBSTRING(s, NULL, length) FROM strings ORDER BY ts; + ++---------------------------------------+ +| substr(strings.s,NULL,strings.length) | ++---------------------------------------+ +| | +| | +| | +| | ++---------------------------------------+ + +SELECT SUBSTRING(s, "off", NULL) FROM strings ORDER BY ts; + ++------------------------------------+ +| substr(strings.s,strings.off,NULL) | ++------------------------------------+ +| | +| | +| | +| | ++------------------------------------+ + +-- Test negative offsets +SELECT SUBSTRING('hello', -1, 3); + ++------------------------------------------+ +| substr(Utf8("hello"),Int64(-1),Int64(3)) | ++------------------------------------------+ +| h | ++------------------------------------------+ + +SELECT SUBSTRING('hello', 0, 3); + ++-----------------------------------------+ +| substr(Utf8("hello"),Int64(0),Int64(3)) | ++-----------------------------------------+ +| he | ++-----------------------------------------+ + +-- Test with Unicode characters +CREATE TABLE unicode_strings(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO unicode_strings VALUES + ('Hello 世界', 1000), + ('🦆🦀🐧', 2000), + ('café', 3000); + +Affected Rows: 3 + +SELECT s, SUBSTRING(s, 1, 5), SUBSTRING(s, 7, 2) FROM unicode_strings ORDER BY ts; + ++------------+---------------------------------------------+---------------------------------------------+ +| s | substr(unicode_strings.s,Int64(1),Int64(5)) | substr(unicode_strings.s,Int64(7),Int64(2)) | ++------------+---------------------------------------------+---------------------------------------------+ +| Hello 世界 | Hello | 世界 | +| 🦆🦀🐧 | 🦆🦀🐧 | | +| café | café | | ++------------+---------------------------------------------+---------------------------------------------+ + +DROP TABLE strings; + +Affected Rows: 0 + +DROP TABLE unicode_strings; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/substring.sql b/tests/cases/standalone/common/function/string/substring.sql new file mode 100644 index 0000000000..6c00bbe5f6 --- /dev/null +++ b/tests/cases/standalone/common/function/string/substring.sql @@ -0,0 +1,53 @@ +-- Migrated from DuckDB test: test/sql/function/string/test_substring.test +-- Substring function tests + +CREATE TABLE strings(s VARCHAR, "off" INTEGER, length INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO strings VALUES + ('hello', 1, 2, 1000), + ('world', 2, 3, 2000), + ('b', 1, 1, 3000), + (NULL, 2, 2, 4000); + +-- Test zero length +SELECT SUBSTRING('🦆ab', 1, 0), SUBSTRING('abc', 1, 0); + +-- Normal substring with constant 
offset/length +SELECT SUBSTRING(s, 1, 2) FROM strings ORDER BY ts; + +-- Substring out of range +SELECT SUBSTRING(s, 2, 2) FROM strings ORDER BY ts; + +-- Variable length offset/length +SELECT SUBSTRING(s, "off", "length") FROM strings ORDER BY ts; + +SELECT SUBSTRING(s, "off", 2) FROM strings ORDER BY ts; + +SELECT SUBSTRING(s, 1, length) FROM strings ORDER BY ts; + +SELECT SUBSTRING('hello', "off", length) FROM strings ORDER BY ts; + +-- Test with NULL values +SELECT SUBSTRING(NULL, "off", length) FROM strings ORDER BY ts; + +SELECT SUBSTRING(s, NULL, length) FROM strings ORDER BY ts; + +SELECT SUBSTRING(s, "off", NULL) FROM strings ORDER BY ts; + +-- Test negative offsets +SELECT SUBSTRING('hello', -1, 3); +SELECT SUBSTRING('hello', 0, 3); + +-- Test with Unicode characters +CREATE TABLE unicode_strings(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO unicode_strings VALUES + ('Hello 世界', 1000), + ('🦆🦀🐧', 2000), + ('café', 3000); + +SELECT s, SUBSTRING(s, 1, 5), SUBSTRING(s, 7, 2) FROM unicode_strings ORDER BY ts; + +DROP TABLE strings; + +DROP TABLE unicode_strings; diff --git a/tests/cases/standalone/common/function/string/trim_pad.result b/tests/cases/standalone/common/function/string/trim_pad.result new file mode 100644 index 0000000000..c29b430180 --- /dev/null +++ b/tests/cases/standalone/common/function/string/trim_pad.result @@ -0,0 +1,274 @@ +-- String TRIM and PAD function tests +-- TRIM functions +SELECT TRIM(' hello world '); + ++--------------------------------+ +| btrim(Utf8(" hello world ")) | ++--------------------------------+ +| hello world | ++--------------------------------+ + +SELECT LTRIM(' hello world '); + ++--------------------------------+ +| ltrim(Utf8(" hello world ")) | ++--------------------------------+ +| hello world | ++--------------------------------+ + +SELECT RTRIM(' hello world '); + ++--------------------------------+ +| rtrim(Utf8(" hello world ")) | ++--------------------------------+ +| hello world | ++--------------------------------+ + +-- TRIM with specific characters +SELECT TRIM('x' FROM 'xxxhello worldxxx'); + ++--------------------------------------------+ +| btrim(Utf8("xxxhello worldxxx"),Utf8("x")) | ++--------------------------------------------+ +| hello world | ++--------------------------------------------+ + +SELECT LTRIM('hello world', 'hel'); + ++----------------------------------------+ +| ltrim(Utf8("hello world"),Utf8("hel")) | ++----------------------------------------+ +| o world | ++----------------------------------------+ + +SELECT RTRIM('hello world', 'dlr'); + ++----------------------------------------+ +| rtrim(Utf8("hello world"),Utf8("dlr")) | ++----------------------------------------+ +| hello wo | ++----------------------------------------+ + +-- PAD functions +SELECT LPAD('hello', 10, '*'); + ++-----------------------------------------+ +| lpad(Utf8("hello"),Int64(10),Utf8("*")) | ++-----------------------------------------+ +| *****hello | ++-----------------------------------------+ + +SELECT RPAD('hello', 10, '*'); + ++-----------------------------------------+ +| rpad(Utf8("hello"),Int64(10),Utf8("*")) | ++-----------------------------------------+ +| hello***** | ++-----------------------------------------+ + +-- Truncate +SELECT LPAD('hello', 3, '*'); + ++----------------------------------------+ +| lpad(Utf8("hello"),Int64(3),Utf8("*")) | ++----------------------------------------+ +| hel | ++----------------------------------------+ + +-- Truncate +SELECT RPAD('hello', 3, '*'); + 
++----------------------------------------+ +| rpad(Utf8("hello"),Int64(3),Utf8("*")) | ++----------------------------------------+ +| hel | ++----------------------------------------+ + +-- PAD with multi-character padding +SELECT LPAD('test', 10, 'ab'); + ++-----------------------------------------+ +| lpad(Utf8("test"),Int64(10),Utf8("ab")) | ++-----------------------------------------+ +| abababtest | ++-----------------------------------------+ + +SELECT RPAD('test', 10, 'xy'); + ++-----------------------------------------+ +| rpad(Utf8("test"),Int64(10),Utf8("xy")) | ++-----------------------------------------+ +| testxyxyxy | ++-----------------------------------------+ + +-- Test with table data +CREATE TABLE trim_pad_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO trim_pad_test VALUES + (' hello ', 1000), + ('world ', 2000), + (' test', 3000), + ('no-spaces', 4000), + ('', 5000), + (NULL, 6000); + +Affected Rows: 6 + +-- Apply TRIM functions to table data +SELECT s, TRIM(s), LTRIM(s), RTRIM(s) FROM trim_pad_test ORDER BY ts; + ++-----------+------------------------+------------------------+------------------------+ +| s | btrim(trim_pad_test.s) | ltrim(trim_pad_test.s) | rtrim(trim_pad_test.s) | ++-----------+------------------------+------------------------+------------------------+ +| hello | hello | hello | hello | +| world | world | world | world | +| test | test | test | test | +| no-spaces | no-spaces | no-spaces | no-spaces | +| | | | | +| | | | | ++-----------+------------------------+------------------------+------------------------+ + +-- Apply PAD functions +SELECT s, LPAD(TRIM(s), 15, '-'), RPAD(TRIM(s), 15, '+') FROM trim_pad_test WHERE s IS NOT NULL ORDER BY ts; + ++-----------+--------------------------------------------------+--------------------------------------------------+ +| s | lpad(btrim(trim_pad_test.s),Int64(15),Utf8("-")) | rpad(btrim(trim_pad_test.s),Int64(15),Utf8("+")) | ++-----------+--------------------------------------------------+--------------------------------------------------+ +| hello | ----------hello | hello++++++++++ | +| world | ----------world | world++++++++++ | +| test | -----------test | test+++++++++++ | +| no-spaces | ------no-spaces | no-spaces++++++ | +| | --------------- | +++++++++++++++ | ++-----------+--------------------------------------------------+--------------------------------------------------+ + +-- Test with Unicode characters +SELECT TRIM(' 中文测试 '); + ++-----------------------------+ +| btrim(Utf8(" 中文测试 ")) | ++-----------------------------+ +| 中文测试 | ++-----------------------------+ + +SELECT LPAD('🚀', 10, '★'); + ++--------------------------------------+ +| lpad(Utf8("🚀"),Int64(10),Utf8("★")) | ++--------------------------------------+ +| ★★★★★★★★★🚀 | ++--------------------------------------+ + +SELECT RPAD('café', 8, '•'); + ++---------------------------------------+ +| rpad(Utf8("café"),Int64(8),Utf8("•")) | ++---------------------------------------+ +| café•••• | ++---------------------------------------+ + +-- Edge cases +SELECT TRIM(''); + ++-----------------+ +| btrim(Utf8("")) | ++-----------------+ +| | ++-----------------+ + +SELECT TRIM(NULL); + ++-------------+ +| btrim(NULL) | ++-------------+ +| | ++-------------+ + +SELECT LPAD('', 5, '*'); + ++-----------------------------------+ +| lpad(Utf8(""),Int64(5),Utf8("*")) | ++-----------------------------------+ +| ***** | ++-----------------------------------+ + +SELECT RPAD('', 5, '*'); + 
++-----------------------------------+ +| rpad(Utf8(""),Int64(5),Utf8("*")) | ++-----------------------------------+ +| ***** | ++-----------------------------------+ + +SELECT LPAD('test', 0, '*'); + ++---------------------------------------+ +| lpad(Utf8("test"),Int64(0),Utf8("*")) | ++---------------------------------------+ +| | ++---------------------------------------+ + +SELECT RPAD('test', 0, '*'); + ++---------------------------------------+ +| rpad(Utf8("test"),Int64(0),Utf8("*")) | ++---------------------------------------+ +| | ++---------------------------------------+ + +-- TRIM with various whitespace characters +SELECT TRIM('\t\nhello\r\n\t'); + ++--------------------------------+ +| btrim(Utf8("\t\nhello\r\n\t")) | ++--------------------------------+ +| \t\nhello\r\n\t | ++--------------------------------+ + +SELECT LTRIM('\t\nhello world'); + ++--------------------------------+ +| ltrim(Utf8("\t\nhello world")) | ++--------------------------------+ +| \t\nhello world | ++--------------------------------+ + +SELECT RTRIM('hello world\r\n'); + ++--------------------------------+ +| rtrim(Utf8("hello world\r\n")) | ++--------------------------------+ +| hello world\r\n | ++--------------------------------+ + +-- Custom TRIM characters +CREATE TABLE custom_trim(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO custom_trim VALUES + ('***hello***', 1000), + ('---world---', 2000), + ('abcTESTabc', 3000); + +Affected Rows: 3 + +SELECT s, TRIM('*' FROM s), TRIM('-' FROM s), TRIM('abc' FROM s) FROM custom_trim ORDER BY ts; + ++-------------+--------------------------------+--------------------------------+----------------------------------+ +| s | btrim(custom_trim.s,Utf8("*")) | btrim(custom_trim.s,Utf8("-")) | btrim(custom_trim.s,Utf8("abc")) | ++-------------+--------------------------------+--------------------------------+----------------------------------+ +| ***hello*** | hello | ***hello*** | ***hello*** | +| ---world--- | ---world--- | world | ---world--- | +| abcTESTabc | abcTESTabc | abcTESTabc | TEST | ++-------------+--------------------------------+--------------------------------+----------------------------------+ + +DROP TABLE trim_pad_test; + +Affected Rows: 0 + +DROP TABLE custom_trim; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/trim_pad.sql b/tests/cases/standalone/common/function/string/trim_pad.sql new file mode 100644 index 0000000000..6c6ba92c65 --- /dev/null +++ b/tests/cases/standalone/common/function/string/trim_pad.sql @@ -0,0 +1,88 @@ +-- String TRIM and PAD function tests + +-- TRIM functions +SELECT TRIM(' hello world '); + +SELECT LTRIM(' hello world '); + +SELECT RTRIM(' hello world '); + +-- TRIM with specific characters +SELECT TRIM('x' FROM 'xxxhello worldxxx'); + +SELECT LTRIM('hello world', 'hel'); + +SELECT RTRIM('hello world', 'dlr'); + +-- PAD functions +SELECT LPAD('hello', 10, '*'); + +SELECT RPAD('hello', 10, '*'); + +-- Truncate +SELECT LPAD('hello', 3, '*'); + +-- Truncate +SELECT RPAD('hello', 3, '*'); + +-- PAD with multi-character padding +SELECT LPAD('test', 10, 'ab'); + +SELECT RPAD('test', 10, 'xy'); + +-- Test with table data +CREATE TABLE trim_pad_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO trim_pad_test VALUES + (' hello ', 1000), + ('world ', 2000), + (' test', 3000), + ('no-spaces', 4000), + ('', 5000), + (NULL, 6000); + +-- Apply TRIM functions to table data +SELECT s, TRIM(s), LTRIM(s), RTRIM(s) FROM trim_pad_test ORDER BY ts; + +-- Apply PAD functions 
+SELECT s, LPAD(TRIM(s), 15, '-'), RPAD(TRIM(s), 15, '+') FROM trim_pad_test WHERE s IS NOT NULL ORDER BY ts; + +-- Test with Unicode characters +SELECT TRIM(' 中文测试 '); + +SELECT LPAD('🚀', 10, '★'); + +SELECT RPAD('café', 8, '•'); + +-- Edge cases +SELECT TRIM(''); + +SELECT TRIM(NULL); +SELECT LPAD('', 5, '*'); + +SELECT RPAD('', 5, '*'); + +SELECT LPAD('test', 0, '*'); + +SELECT RPAD('test', 0, '*'); + +-- TRIM with various whitespace characters +SELECT TRIM('\t\nhello\r\n\t'); + +SELECT LTRIM('\t\nhello world'); + +SELECT RTRIM('hello world\r\n'); + +-- Custom TRIM characters +CREATE TABLE custom_trim(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO custom_trim VALUES + ('***hello***', 1000), + ('---world---', 2000), + ('abcTESTabc', 3000); + +SELECT s, TRIM('*' FROM s), TRIM('-' FROM s), TRIM('abc' FROM s) FROM custom_trim ORDER BY ts; + +DROP TABLE trim_pad_test; + +DROP TABLE custom_trim; diff --git a/tests/cases/standalone/common/function/string/upper_lower.result b/tests/cases/standalone/common/function/string/upper_lower.result new file mode 100644 index 0000000000..4f283530ef --- /dev/null +++ b/tests/cases/standalone/common/function/string/upper_lower.result @@ -0,0 +1,291 @@ +-- String case conversion function tests +-- Basic UPPER and LOWER functions +SELECT UPPER('hello world'); + ++----------------------------+ +| upper(Utf8("hello world")) | ++----------------------------+ +| HELLO WORLD | ++----------------------------+ + +SELECT LOWER('HELLO WORLD'); + ++----------------------------+ +| lower(Utf8("HELLO WORLD")) | ++----------------------------+ +| hello world | ++----------------------------+ + +SELECT UPPER('MiXeD cAsE'); + ++---------------------------+ +| upper(Utf8("MiXeD cAsE")) | ++---------------------------+ +| MIXED CASE | ++---------------------------+ + +SELECT LOWER('MiXeD cAsE'); + ++---------------------------+ +| lower(Utf8("MiXeD cAsE")) | ++---------------------------+ +| mixed case | ++---------------------------+ + +-- INITCAP (capitalize first letter of each word) +SELECT INITCAP('hello world'); + ++------------------------------+ +| initcap(Utf8("hello world")) | ++------------------------------+ +| Hello World | ++------------------------------+ + +SELECT INITCAP('HELLO WORLD'); + ++------------------------------+ +| initcap(Utf8("HELLO WORLD")) | ++------------------------------+ +| Hello World | ++------------------------------+ + +SELECT INITCAP('mIxEd CaSe TeSt'); + ++----------------------------------+ +| initcap(Utf8("mIxEd CaSe TeSt")) | ++----------------------------------+ +| Mixed Case Test | ++----------------------------------+ + +-- Test with NULL +SELECT UPPER(NULL); + ++-------------+ +| upper(NULL) | ++-------------+ +| | ++-------------+ + +SELECT LOWER(NULL); + ++-------------+ +| lower(NULL) | ++-------------+ +| | ++-------------+ + +SELECT INITCAP(NULL); + ++---------------+ +| initcap(NULL) | ++---------------+ +| | ++---------------+ + +-- Test with numbers and special characters +SELECT UPPER('hello123!@#'); + ++----------------------------+ +| upper(Utf8("hello123!@#")) | ++----------------------------+ +| HELLO123!@# | ++----------------------------+ + +SELECT LOWER('HELLO123!@#'); + ++----------------------------+ +| lower(Utf8("HELLO123!@#")) | ++----------------------------+ +| hello123!@# | ++----------------------------+ + +SELECT INITCAP('hello-world_test'); + ++-----------------------------------+ +| initcap(Utf8("hello-world_test")) | ++-----------------------------------+ +| Hello-World_Test | 
++-----------------------------------+ + +-- Test with table data +CREATE TABLE case_test("name" VARCHAR, city VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO case_test VALUES + ('john doe', 'new york', 1000), + ('JANE SMITH', 'LOS ANGELES', 2000), + ('Bob Wilson', 'Chicago', 3000), + ('alice johnson', 'BOSTON', 4000); + +Affected Rows: 4 + +-- Apply case functions to table data +SELECT "name", UPPER("name"), LOWER("name"), INITCAP("name") FROM case_test ORDER BY ts; + ++---------------+-----------------------+-----------------------+-------------------------+ +| name | upper(case_test.name) | lower(case_test.name) | initcap(case_test.name) | ++---------------+-----------------------+-----------------------+-------------------------+ +| john doe | JOHN DOE | john doe | John Doe | +| JANE SMITH | JANE SMITH | jane smith | Jane Smith | +| Bob Wilson | BOB WILSON | bob wilson | Bob Wilson | +| alice johnson | ALICE JOHNSON | alice johnson | Alice Johnson | ++---------------+-----------------------+-----------------------+-------------------------+ + +SELECT city, UPPER(city), LOWER(city), INITCAP(city) FROM case_test ORDER BY ts; + ++-------------+-----------------------+-----------------------+-------------------------+ +| city | upper(case_test.city) | lower(case_test.city) | initcap(case_test.city) | ++-------------+-----------------------+-----------------------+-------------------------+ +| new york | NEW YORK | new york | New York | +| LOS ANGELES | LOS ANGELES | los angeles | Los Angeles | +| Chicago | CHICAGO | chicago | Chicago | +| BOSTON | BOSTON | boston | Boston | ++-------------+-----------------------+-----------------------+-------------------------+ + +-- Combined case operations +SELECT INITCAP(LOWER("name")) as formatted_name FROM case_test ORDER BY ts; + ++----------------+ +| formatted_name | ++----------------+ +| John Doe | +| Jane Smith | +| Bob Wilson | +| Alice Johnson | ++----------------+ + +-- Unicode case conversion +SELECT UPPER('café'); + ++---------------------+ +| upper(Utf8("café")) | ++---------------------+ +| CAFÉ | ++---------------------+ + +SELECT LOWER('CAFÉ'); + ++---------------------+ +| lower(Utf8("CAFÉ")) | ++---------------------+ +| café | ++---------------------+ + +-- German characters +SELECT UPPER('äöüß'); + ++---------------------+ +| upper(Utf8("äöüß")) | ++---------------------+ +| ÄÖÜSS | ++---------------------+ + +-- German uppercase +SELECT LOWER('ÄÖÜ'); + ++--------------------+ +| lower(Utf8("ÄÖÜ")) | ++--------------------+ +| äöü | ++--------------------+ + +-- Greek letters +SELECT UPPER('αβγ'); + ++--------------------+ +| upper(Utf8("αβγ")) | ++--------------------+ +| ΑΒΓ | ++--------------------+ + +SELECT LOWER('ΑΒΓ'); + ++--------------------+ +| lower(Utf8("ΑΒΓ")) | ++--------------------+ +| αβγ | ++--------------------+ + +-- Test with empty string +SELECT UPPER(''); + ++-----------------+ +| upper(Utf8("")) | ++-----------------+ +| | ++-----------------+ + +SELECT LOWER(''); + ++-----------------+ +| lower(Utf8("")) | ++-----------------+ +| | ++-----------------+ + +SELECT INITCAP(''); + ++-------------------+ +| initcap(Utf8("")) | ++-------------------+ +| | ++-------------------+ + +-- Test with single characters +SELECT UPPER('a'), UPPER('A'), UPPER('1'), UPPER(' '); + ++------------------+------------------+------------------+------------------+ +| upper(Utf8("a")) | upper(Utf8("A")) | upper(Utf8("1")) | upper(Utf8(" ")) | 
++------------------+------------------+------------------+------------------+ +| A | A | 1 | | ++------------------+------------------+------------------+------------------+ + +SELECT LOWER('a'), LOWER('A'), LOWER('1'), LOWER(' '); + ++------------------+------------------+------------------+------------------+ +| lower(Utf8("a")) | lower(Utf8("A")) | lower(Utf8("1")) | lower(Utf8(" ")) | ++------------------+------------------+------------------+------------------+ +| a | a | 1 | | ++------------------+------------------+------------------+------------------+ + +SELECT INITCAP('a'), INITCAP('A'), INITCAP('1'); + ++--------------------+--------------------+--------------------+ +| initcap(Utf8("a")) | initcap(Utf8("A")) | initcap(Utf8("1")) | ++--------------------+--------------------+--------------------+ +| A | A | 1 | ++--------------------+--------------------+--------------------+ + +-- Complex Unicode examples +CREATE TABLE unicode_case(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO unicode_case VALUES + ('hello 世界', 1000), + ('HELLO 世界', 2000), + ('café à paris', 3000), + ('CAFÉ À PARIS', 4000); + +Affected Rows: 4 + +SELECT s, UPPER(s), LOWER(s), INITCAP(s) FROM unicode_case ORDER BY ts; + ++--------------+-----------------------+-----------------------+-------------------------+ +| s | upper(unicode_case.s) | lower(unicode_case.s) | initcap(unicode_case.s) | ++--------------+-----------------------+-----------------------+-------------------------+ +| hello 世界 | HELLO 世界 | hello 世界 | Hello 世界 | +| HELLO 世界 | HELLO 世界 | hello 世界 | Hello 世界 | +| café à paris | CAFÉ À PARIS | café à paris | Café À Paris | +| CAFÉ À PARIS | CAFÉ À PARIS | café à paris | Café À Paris | ++--------------+-----------------------+-----------------------+-------------------------+ + +DROP TABLE case_test; + +Affected Rows: 0 + +DROP TABLE unicode_case; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/string/upper_lower.sql b/tests/cases/standalone/common/function/string/upper_lower.sql new file mode 100644 index 0000000000..d89f78cfe8 --- /dev/null +++ b/tests/cases/standalone/common/function/string/upper_lower.sql @@ -0,0 +1,93 @@ +-- String case conversion function tests + +-- Basic UPPER and LOWER functions +SELECT UPPER('hello world'); + +SELECT LOWER('HELLO WORLD'); + +SELECT UPPER('MiXeD cAsE'); + +SELECT LOWER('MiXeD cAsE'); + +-- INITCAP (capitalize first letter of each word) +SELECT INITCAP('hello world'); + +SELECT INITCAP('HELLO WORLD'); + +SELECT INITCAP('mIxEd CaSe TeSt'); + +-- Test with NULL +SELECT UPPER(NULL); + +SELECT LOWER(NULL); + +SELECT INITCAP(NULL); + +-- Test with numbers and special characters +SELECT UPPER('hello123!@#'); + +SELECT LOWER('HELLO123!@#'); + +SELECT INITCAP('hello-world_test'); + +-- Test with table data +CREATE TABLE case_test("name" VARCHAR, city VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO case_test VALUES + ('john doe', 'new york', 1000), + ('JANE SMITH', 'LOS ANGELES', 2000), + ('Bob Wilson', 'Chicago', 3000), + ('alice johnson', 'BOSTON', 4000); + +-- Apply case functions to table data +SELECT "name", UPPER("name"), LOWER("name"), INITCAP("name") FROM case_test ORDER BY ts; + +SELECT city, UPPER(city), LOWER(city), INITCAP(city) FROM case_test ORDER BY ts; + +-- Combined case operations +SELECT INITCAP(LOWER("name")) as formatted_name FROM case_test ORDER BY ts; + +-- Unicode case conversion +SELECT UPPER('café'); + +SELECT LOWER('CAFÉ'); + +-- German characters +SELECT UPPER('äöüß'); + +-- German 
uppercase +SELECT LOWER('ÄÖÜ'); + +-- Greek letters +SELECT UPPER('αβγ'); + +SELECT LOWER('ΑΒΓ'); + +-- Test with empty string +SELECT UPPER(''); + +SELECT LOWER(''); + +SELECT INITCAP(''); + +-- Test with single characters +SELECT UPPER('a'), UPPER('A'), UPPER('1'), UPPER(' '); + +SELECT LOWER('a'), LOWER('A'), LOWER('1'), LOWER(' '); + +SELECT INITCAP('a'), INITCAP('A'), INITCAP('1'); + +-- Complex Unicode examples +CREATE TABLE unicode_case(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO unicode_case VALUES + ('hello 世界', 1000), + ('HELLO 世界', 2000), + ('café à paris', 3000), + ('CAFÉ À PARIS', 4000); + +SELECT s, UPPER(s), LOWER(s), INITCAP(s) FROM unicode_case ORDER BY ts; + +DROP TABLE case_test; + +DROP TABLE unicode_case; diff --git a/tests/cases/standalone/common/order/nulls_first_last.result b/tests/cases/standalone/common/order/nulls_first_last.result new file mode 100644 index 0000000000..3bf9570729 --- /dev/null +++ b/tests/cases/standalone/common/order/nulls_first_last.result @@ -0,0 +1,141 @@ +-- Migrated from DuckDB test: test/sql/order/test_nulls_first.test +-- Test NULLS FIRST/NULLS LAST +CREATE TABLE integers(i INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO integers VALUES (1, 1000), (NULL, 2000); + +Affected Rows: 2 + +-- Default NULL ordering (usually NULLS LAST in most systems) +SELECT i FROM integers ORDER BY i; + ++---+ +| i | ++---+ +| 1 | +| | ++---+ + +-- Explicit NULLS FIRST +SELECT i FROM integers ORDER BY i NULLS FIRST; + ++---+ +| i | ++---+ +| | +| 1 | ++---+ + +-- Explicit NULLS LAST +SELECT i FROM integers ORDER BY i NULLS LAST; + ++---+ +| i | ++---+ +| 1 | +| | ++---+ + +-- Multiple columns with mixed NULL handling +CREATE TABLE test(i INTEGER, j INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO test VALUES (1, 1, 1000), (NULL, 1, 2000), (1, NULL, 3000); + +Affected Rows: 3 + +SELECT i, j FROM test ORDER BY i NULLS FIRST, j NULLS LAST; + ++---+---+ +| i | j | ++---+---+ +| | 1 | +| 1 | 1 | +| 1 | | ++---+---+ + +SELECT i, j FROM test ORDER BY i NULLS FIRST, j NULLS FIRST; + ++---+---+ +| i | j | ++---+---+ +| | 1 | +| 1 | | +| 1 | 1 | ++---+---+ + +SELECT i, j FROM test ORDER BY i NULLS LAST, j NULLS FIRST; + ++---+---+ +| i | j | ++---+---+ +| 1 | | +| 1 | 1 | +| | 1 | ++---+---+ + +-- Test with DESC ordering +SELECT i, j FROM test ORDER BY i DESC NULLS FIRST, j DESC NULLS LAST; + ++---+---+ +| i | j | ++---+---+ +| | 1 | +| 1 | 1 | +| 1 | | ++---+---+ + +SELECT i, j FROM test ORDER BY i DESC NULLS LAST, j DESC NULLS FIRST; + ++---+---+ +| i | j | ++---+---+ +| 1 | | +| 1 | 1 | +| | 1 | ++---+---+ + +-- Test with strings +CREATE TABLE strings(s VARCHAR, i INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO strings VALUES ('apple', 1, 1000), (NULL, 2, 2000), ('banana', NULL, 3000); + +Affected Rows: 3 + +SELECT s, i FROM strings ORDER BY s NULLS FIRST, i NULLS LAST; + ++--------+---+ +| s | i | ++--------+---+ +| | 2 | +| apple | 1 | +| banana | | ++--------+---+ + +SELECT s, i FROM strings ORDER BY s NULLS LAST, i NULLS FIRST; + ++--------+---+ +| s | i | ++--------+---+ +| apple | 1 | +| banana | | +| | 2 | ++--------+---+ + +DROP TABLE integers; + +Affected Rows: 0 + +DROP TABLE test; + +Affected Rows: 0 + +DROP TABLE strings; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/order/nulls_first_last.sql b/tests/cases/standalone/common/order/nulls_first_last.sql new file mode 100644 index 0000000000..dca46b3d21 --- /dev/null +++ 
b/tests/cases/standalone/common/order/nulls_first_last.sql @@ -0,0 +1,46 @@ +-- Migrated from DuckDB test: test/sql/order/test_nulls_first.test +-- Test NULLS FIRST/NULLS LAST + +CREATE TABLE integers(i INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO integers VALUES (1, 1000), (NULL, 2000); + +-- Default NULL ordering (usually NULLS LAST in most systems) +SELECT i FROM integers ORDER BY i; + +-- Explicit NULLS FIRST +SELECT i FROM integers ORDER BY i NULLS FIRST; + +-- Explicit NULLS LAST +SELECT i FROM integers ORDER BY i NULLS LAST; + +-- Multiple columns with mixed NULL handling +CREATE TABLE test(i INTEGER, j INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO test VALUES (1, 1, 1000), (NULL, 1, 2000), (1, NULL, 3000); + +SELECT i, j FROM test ORDER BY i NULLS FIRST, j NULLS LAST; + +SELECT i, j FROM test ORDER BY i NULLS FIRST, j NULLS FIRST; + +SELECT i, j FROM test ORDER BY i NULLS LAST, j NULLS FIRST; + +-- Test with DESC ordering +SELECT i, j FROM test ORDER BY i DESC NULLS FIRST, j DESC NULLS LAST; + +SELECT i, j FROM test ORDER BY i DESC NULLS LAST, j DESC NULLS FIRST; + +-- Test with strings +CREATE TABLE strings(s VARCHAR, i INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO strings VALUES ('apple', 1, 1000), (NULL, 2, 2000), ('banana', NULL, 3000); + +SELECT s, i FROM strings ORDER BY s NULLS FIRST, i NULLS LAST; + +SELECT s, i FROM strings ORDER BY s NULLS LAST, i NULLS FIRST; + +DROP TABLE integers; + +DROP TABLE test; + +DROP TABLE strings; diff --git a/tests/cases/standalone/common/order/order_by_basic.result b/tests/cases/standalone/common/order/order_by_basic.result new file mode 100644 index 0000000000..747507f7a1 --- /dev/null +++ b/tests/cases/standalone/common/order/order_by_basic.result @@ -0,0 +1,134 @@ +-- Migrated from DuckDB test: test/sql/order/test_order_by.test +-- Test ORDER BY keyword +CREATE TABLE test(a INTEGER, b INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO test VALUES (11, 22, 1000), (12, 21, 2000), (13, 22, 3000); + +Affected Rows: 3 + +-- Simple ORDER BY +SELECT b FROM test ORDER BY a DESC; + ++----+ +| b | ++----+ +| 22 | +| 21 | +| 22 | ++----+ + +SELECT a, b FROM test ORDER BY a; + ++----+----+ +| a | b | ++----+----+ +| 11 | 22 | +| 12 | 21 | +| 13 | 22 | ++----+----+ + +SELECT a, b FROM test ORDER BY a DESC; + ++----+----+ +| a | b | ++----+----+ +| 13 | 22 | +| 12 | 21 | +| 11 | 22 | ++----+----+ + +-- ORDER BY on multiple columns +SELECT a, b FROM test ORDER BY b, a; + ++----+----+ +| a | b | ++----+----+ +| 12 | 21 | +| 11 | 22 | +| 13 | 22 | ++----+----+ + +-- ORDER BY using select indices +SELECT a, b FROM test ORDER BY 2, 1; + ++----+----+ +| a | b | ++----+----+ +| 12 | 21 | +| 11 | 22 | +| 13 | 22 | ++----+----+ + +SELECT a, b FROM test ORDER BY b DESC, a; + ++----+----+ +| a | b | ++----+----+ +| 11 | 22 | +| 13 | 22 | +| 12 | 21 | ++----+----+ + +SELECT a, b FROM test ORDER BY b, a DESC; + ++----+----+ +| a | b | ++----+----+ +| 12 | 21 | +| 13 | 22 | +| 11 | 22 | ++----+----+ + +-- TOP N queries with LIMIT +SELECT a, b FROM test ORDER BY b, a DESC LIMIT 1; + ++----+----+ +| a | b | ++----+----+ +| 12 | 21 | ++----+----+ + +-- OFFSET +SELECT a, b FROM test ORDER BY b, a DESC LIMIT 1 OFFSET 1; + ++----+----+ +| a | b | ++----+----+ +| 13 | 22 | ++----+----+ + +-- OFFSET without limit +SELECT a, b FROM test ORDER BY b, a DESC OFFSET 1; + ++----+----+ +| a | b | ++----+----+ +| 13 | 22 | +| 11 | 22 | ++----+----+ + +-- ORDER BY with WHERE +SELECT a, b FROM test WHERE a < 13 ORDER BY b; + ++----+----+ +| a | b | 
++----+----+ +| 12 | 21 | +| 11 | 22 | ++----+----+ + +SELECT a, b FROM test WHERE a < 13 ORDER BY 2; + ++----+----+ +| a | b | ++----+----+ +| 12 | 21 | +| 11 | 22 | ++----+----+ + +DROP TABLE test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/order/order_by_basic.sql b/tests/cases/standalone/common/order/order_by_basic.sql new file mode 100644 index 0000000000..68cba60911 --- /dev/null +++ b/tests/cases/standalone/common/order/order_by_basic.sql @@ -0,0 +1,39 @@ +-- Migrated from DuckDB test: test/sql/order/test_order_by.test +-- Test ORDER BY keyword + +CREATE TABLE test(a INTEGER, b INTEGER, ts TIMESTAMP TIME INDEX); + +INSERT INTO test VALUES (11, 22, 1000), (12, 21, 2000), (13, 22, 3000); + +-- Simple ORDER BY +SELECT b FROM test ORDER BY a DESC; + +SELECT a, b FROM test ORDER BY a; + +SELECT a, b FROM test ORDER BY a DESC; + +-- ORDER BY on multiple columns +SELECT a, b FROM test ORDER BY b, a; + +-- ORDER BY using select indices +SELECT a, b FROM test ORDER BY 2, 1; + +SELECT a, b FROM test ORDER BY b DESC, a; + +SELECT a, b FROM test ORDER BY b, a DESC; + +-- TOP N queries with LIMIT +SELECT a, b FROM test ORDER BY b, a DESC LIMIT 1; + +-- OFFSET +SELECT a, b FROM test ORDER BY b, a DESC LIMIT 1 OFFSET 1; + +-- OFFSET without limit +SELECT a, b FROM test ORDER BY b, a DESC OFFSET 1; + +-- ORDER BY with WHERE +SELECT a, b FROM test WHERE a < 13 ORDER BY b; + +SELECT a, b FROM test WHERE a < 13 ORDER BY 2; + +DROP TABLE test; diff --git a/tests/cases/standalone/common/order/order_by_expressions.result b/tests/cases/standalone/common/order/order_by_expressions.result new file mode 100644 index 0000000000..f121fac188 --- /dev/null +++ b/tests/cases/standalone/common/order/order_by_expressions.result @@ -0,0 +1,137 @@ +-- Migrated from DuckDB test: test/sql/order/test_order_by_expressions.test +-- Test ORDER BY with expressions +CREATE TABLE test(a INTEGER, b INTEGER, s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO test VALUES + (1, 10, 'apple', 1000), + (2, 20, 'banana', 2000), + (3, 15, 'cherry', 3000), + (4, 25, 'date', 4000); + +Affected Rows: 4 + +-- ORDER BY with arithmetic expressions +SELECT a, b, a + b as sum FROM test ORDER BY a + b; + ++---+----+-----+ +| a | b | sum | ++---+----+-----+ +| 1 | 10 | 11 | +| 3 | 15 | 18 | +| 2 | 20 | 22 | +| 4 | 25 | 29 | ++---+----+-----+ + +SELECT a, b, a * b as product FROM test ORDER BY a * b DESC; + ++---+----+---------+ +| a | b | product | ++---+----+---------+ +| 4 | 25 | 100 | +| 3 | 15 | 45 | +| 2 | 20 | 40 | +| 1 | 10 | 10 | ++---+----+---------+ + +-- ORDER BY with string functions +SELECT s, LENGTH(s) as len FROM test ORDER BY LENGTH(s); + ++--------+-----+ +| s | len | ++--------+-----+ +| date | 4 | +| apple | 5 | +| banana | 6 | +| cherry | 6 | ++--------+-----+ + +SELECT s, UPPER(s) as upper_s FROM test ORDER BY UPPER(s); + ++--------+---------+ +| s | upper_s | ++--------+---------+ +| apple | APPLE | +| banana | BANANA | +| cherry | CHERRY | +| date | DATE | ++--------+---------+ + +-- ORDER BY with CASE expressions +SELECT a, b, + CASE + WHEN a % 2 = 0 THEN 'even' + ELSE 'odd' + END as parity +FROM test +ORDER BY + CASE + WHEN a % 2 = 0 THEN 1 + ELSE 2 + END, a; + ++---+----+--------+ +| a | b | parity | ++---+----+--------+ +| 2 | 20 | even | +| 4 | 25 | even | +| 1 | 10 | odd | +| 3 | 15 | odd | ++---+----+--------+ + +-- ORDER BY with conditional expressions +SELECT a, b FROM test ORDER BY GREATEST(a, b) DESC; + ++---+----+ +| a | b | ++---+----+ +| 4 | 25 | +| 2 | 20 | +| 3 | 
15 | +| 1 | 10 | ++---+----+ + +SELECT a, b FROM test ORDER BY LEAST(a, b); + ++---+----+ +| a | b | ++---+----+ +| 1 | 10 | +| 2 | 20 | +| 3 | 15 | +| 4 | 25 | ++---+----+ + +-- ORDER BY with NULL-related expressions +INSERT INTO test VALUES (NULL, NULL, NULL, 5000); + +Affected Rows: 1 + +SELECT a, b, COALESCE(a, 999) as a_or_999 +FROM test +ORDER BY COALESCE(a, 999); + ++---+----+----------+ +| a | b | a_or_999 | ++---+----+----------+ +| 1 | 10 | 1 | +| 2 | 20 | 2 | +| 3 | 15 | 3 | +| 4 | 25 | 4 | +| | | 999 | ++---+----+----------+ + +-- ORDER BY with subqueries in expressions +SELECT a, b, + a - (SELECT MIN(a) FROM test WHERE a IS NOT NULL) as diff_from_min +FROM test +WHERE a IS NOT NULL +ORDER BY a - (SELECT MIN(a) FROM test WHERE a IS NOT NULL); + +Error: 1001(Unsupported), This feature is not implemented: Physical plan does not support logical expression ScalarSubquery() + +DROP TABLE test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/order/order_by_expressions.sql b/tests/cases/standalone/common/order/order_by_expressions.sql new file mode 100644 index 0000000000..d4467c9528 --- /dev/null +++ b/tests/cases/standalone/common/order/order_by_expressions.sql @@ -0,0 +1,54 @@ +-- Migrated from DuckDB test: test/sql/order/test_order_by_expressions.test +-- Test ORDER BY with expressions + +CREATE TABLE test(a INTEGER, b INTEGER, s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO test VALUES + (1, 10, 'apple', 1000), + (2, 20, 'banana', 2000), + (3, 15, 'cherry', 3000), + (4, 25, 'date', 4000); + +-- ORDER BY with arithmetic expressions +SELECT a, b, a + b as sum FROM test ORDER BY a + b; + +SELECT a, b, a * b as product FROM test ORDER BY a * b DESC; + +-- ORDER BY with string functions +SELECT s, LENGTH(s) as len FROM test ORDER BY LENGTH(s); + +SELECT s, UPPER(s) as upper_s FROM test ORDER BY UPPER(s); + +-- ORDER BY with CASE expressions +SELECT a, b, + CASE + WHEN a % 2 = 0 THEN 'even' + ELSE 'odd' + END as parity +FROM test +ORDER BY + CASE + WHEN a % 2 = 0 THEN 1 + ELSE 2 + END, a; + +-- ORDER BY with conditional expressions +SELECT a, b FROM test ORDER BY GREATEST(a, b) DESC; + +SELECT a, b FROM test ORDER BY LEAST(a, b); + +-- ORDER BY with NULL-related expressions +INSERT INTO test VALUES (NULL, NULL, NULL, 5000); + +SELECT a, b, COALESCE(a, 999) as a_or_999 +FROM test +ORDER BY COALESCE(a, 999); + +-- ORDER BY with subqueries in expressions +SELECT a, b, + a - (SELECT MIN(a) FROM test WHERE a IS NOT NULL) as diff_from_min +FROM test +WHERE a IS NOT NULL +ORDER BY a - (SELECT MIN(a) FROM test WHERE a IS NOT NULL); + +DROP TABLE test; diff --git a/tests/cases/standalone/common/sample/basic_sample.result b/tests/cases/standalone/common/sample/basic_sample.result new file mode 100644 index 0000000000..1691337cd4 --- /dev/null +++ b/tests/cases/standalone/common/sample/basic_sample.result @@ -0,0 +1,93 @@ +-- Migrated from DuckDB test: test/sql/sample/same_seed_same_sample.test +-- FIXME: the results are wrong in this test, waits for https://github.com/apache/datafusion/pull/16325 +-- Test basic SAMPLE functionality +-- Create test table +CREATE TABLE test(x INTEGER, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +-- Insert test data +INSERT INTO test SELECT number, number * 1000 FROM numbers limit 10000; + +Affected Rows: 10000 + +-- Test TABLESAMPLE with percentage +SELECT COUNT(*) > 0 FROM test TABLESAMPLE (10 PERCENT); + ++---------------------+ +| count(*) > Int64(0) | ++---------------------+ +| true | ++---------------------+ + +-- Test TABLESAMPLE 
with row count +SELECT COUNT(*) FROM test TABLESAMPLE (100 ROWS); + ++----------+ +| count(*) | ++----------+ +| 10000 | ++----------+ + +-- Test TABLESAMPLE SYSTEM +SELECT COUNT(*) > 0 FROM test TABLESAMPLE SYSTEM (25 PERCENT); + ++---------------------+ +| count(*) > Int64(0) | ++---------------------+ +| true | ++---------------------+ + +-- Test TABLESAMPLE BERNOULLI +SELECT COUNT(*) > 0 FROM test TABLESAMPLE BERNOULLI (25 PERCENT); + ++---------------------+ +| count(*) > Int64(0) | ++---------------------+ +| true | ++---------------------+ + +-- Test with REPEATABLE for consistent results +SELECT COUNT(*) AS cnt1 FROM test TABLESAMPLE SYSTEM (10 PERCENT) REPEATABLE (42); + ++-------+ +| cnt1 | ++-------+ +| 10000 | ++-------+ + +SELECT COUNT(*) AS cnt2 FROM test TABLESAMPLE SYSTEM (10 PERCENT) REPEATABLE (42); + ++-------+ +| cnt2 | ++-------+ +| 10000 | ++-------+ + +-- Test sampling with WHERE clause +SELECT COUNT(*) FROM test TABLESAMPLE (10 PERCENT) WHERE x > 5000; + ++----------+ +| count(*) | ++----------+ +| 4999 | ++----------+ + +-- Test sampling with ORDER BY +SELECT x FROM test TABLESAMPLE (5 ROWS) ORDER BY x LIMIT 5; + ++---+ +| x | ++---+ +| 0 | +| 1 | +| 2 | +| 3 | +| 4 | ++---+ + +-- cleanup +DROP TABLE test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/sample/basic_sample.sql b/tests/cases/standalone/common/sample/basic_sample.sql new file mode 100644 index 0000000000..1e00de81af --- /dev/null +++ b/tests/cases/standalone/common/sample/basic_sample.sql @@ -0,0 +1,35 @@ +-- Migrated from DuckDB test: test/sql/sample/same_seed_same_sample.test +-- FIXME: the results are wrong in this test, waits for https://github.com/apache/datafusion/pull/16325 +-- Test basic SAMPLE functionality + +-- Create test table +CREATE TABLE test(x INTEGER, ts TIMESTAMP TIME INDEX); + +-- Insert test data +INSERT INTO test SELECT number, number * 1000 FROM numbers limit 10000; + +-- Test TABLESAMPLE with percentage +SELECT COUNT(*) > 0 FROM test TABLESAMPLE (10 PERCENT); + +-- Test TABLESAMPLE with row count +SELECT COUNT(*) FROM test TABLESAMPLE (100 ROWS); + +-- Test TABLESAMPLE SYSTEM +SELECT COUNT(*) > 0 FROM test TABLESAMPLE SYSTEM (25 PERCENT); + +-- Test TABLESAMPLE BERNOULLI +SELECT COUNT(*) > 0 FROM test TABLESAMPLE BERNOULLI (25 PERCENT); + +-- Test with REPEATABLE for consistent results +SELECT COUNT(*) AS cnt1 FROM test TABLESAMPLE SYSTEM (10 PERCENT) REPEATABLE (42); + +SELECT COUNT(*) AS cnt2 FROM test TABLESAMPLE SYSTEM (10 PERCENT) REPEATABLE (42); + +-- Test sampling with WHERE clause +SELECT COUNT(*) FROM test TABLESAMPLE (10 PERCENT) WHERE x > 5000; + +-- Test sampling with ORDER BY +SELECT x FROM test TABLESAMPLE (5 ROWS) ORDER BY x LIMIT 5; + +-- cleanup +DROP TABLE test; \ No newline at end of file diff --git a/tests/cases/standalone/common/types/date/test_date.result b/tests/cases/standalone/common/types/date/test_date.result new file mode 100644 index 0000000000..ed7f213742 --- /dev/null +++ b/tests/cases/standalone/common/types/date/test_date.result @@ -0,0 +1,135 @@ +-- Migrated from DuckDB test: test/sql/types/date/test_date.test +-- Test basic DATE functionality +-- Create and insert into table +CREATE TABLE dates(i DATE, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO dates VALUES ('1993-08-14', 1000), (NULL, 2000); + +Affected Rows: 2 + +-- Check that we can select dates +SELECT * FROM dates ORDER BY ts; + ++------------+---------------------+ +| i | ts | ++------------+---------------------+ +| 1993-08-14 | 1970-01-01T00:00:01 | 
+| | 1970-01-01T00:00:02 | ++------------+---------------------+ + +-- extract function +SELECT extract(year FROM i) FROM dates ORDER BY ts; + ++---------------------------------+ +| date_part(Utf8("YEAR"),dates.i) | ++---------------------------------+ +| 1993 | +| | ++---------------------------------+ + +-- Check that we can convert dates to string +SELECT CAST(i AS VARCHAR) FROM dates ORDER BY ts; + ++------------+ +| dates.i | ++------------+ +| 1993-08-14 | +| | ++------------+ + +-- Check that we can add days to a date +SELECT i + INTERVAL '5 days' FROM dates ORDER BY ts; + ++-----------------------------------------------------------------------------------------------+ +| dates.i + IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }") | ++-----------------------------------------------------------------------------------------------+ +| 1993-08-19 | +| | ++-----------------------------------------------------------------------------------------------+ + +-- Check that we can subtract days from a date +SELECT i - INTERVAL '5 days' FROM dates ORDER BY ts; + ++-----------------------------------------------------------------------------------------------+ +| dates.i - IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 5, nanoseconds: 0 }") | ++-----------------------------------------------------------------------------------------------+ +| 1993-08-09 | +| | ++-----------------------------------------------------------------------------------------------+ + +-- Test date subtraction resulting in interval +SELECT i - DATE '1993-08-14' FROM dates ORDER BY ts; + ++------------------------------+ +| dates.i - Utf8("1993-08-14") | ++------------------------------+ +| P0D | +| | ++------------------------------+ + +-- Test various date formats +CREATE TABLE date_formats(d DATE, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO date_formats VALUES + ('2021-03-01', 1000), + ('2021-12-31', 2000), + ('2000-01-01', 3000), + ('1970-01-01', 4000); + +Affected Rows: 4 + +SELECT d, extract(year FROM d), extract(month FROM d), extract(day FROM d) FROM date_formats ORDER BY d; + ++------------+----------------------------------------+-----------------------------------------+---------------------------------------+ +| d | date_part(Utf8("YEAR"),date_formats.d) | date_part(Utf8("MONTH"),date_formats.d) | date_part(Utf8("DAY"),date_formats.d) | ++------------+----------------------------------------+-----------------------------------------+---------------------------------------+ +| 1970-01-01 | 1970 | 1 | 1 | +| 2000-01-01 | 2000 | 1 | 1 | +| 2021-03-01 | 2021 | 3 | 1 | +| 2021-12-31 | 2021 | 12 | 31 | ++------------+----------------------------------------+-----------------------------------------+---------------------------------------+ + +-- Test date comparison +SELECT d FROM date_formats WHERE d > '2000-01-01' ORDER BY d; + ++------------+ +| d | ++------------+ +| 2021-03-01 | +| 2021-12-31 | ++------------+ + +SELECT d FROM date_formats WHERE d BETWEEN '2000-01-01' AND '2021-06-01' ORDER BY d; + ++------------+ +| d | ++------------+ +| 2000-01-01 | +| 2021-03-01 | ++------------+ + +-- Test NULL handling +INSERT INTO date_formats VALUES (NULL, 5000); + +Affected Rows: 1 + +SELECT COUNT(*), COUNT(d) FROM date_formats; + ++----------+-----------------------+ +| count(*) | count(date_formats.d) | ++----------+-----------------------+ +| 5 | 4 | ++----------+-----------------------+ + +DROP TABLE dates; + +Affected Rows: 0 + +DROP TABLE 
date_formats; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/date/test_date.sql b/tests/cases/standalone/common/types/date/test_date.sql new file mode 100644 index 0000000000..5bf0db4b4b --- /dev/null +++ b/tests/cases/standalone/common/types/date/test_date.sql @@ -0,0 +1,50 @@ +-- Migrated from DuckDB test: test/sql/types/date/test_date.test +-- Test basic DATE functionality + +-- Create and insert into table +CREATE TABLE dates(i DATE, ts TIMESTAMP TIME INDEX); + +INSERT INTO dates VALUES ('1993-08-14', 1000), (NULL, 2000); + +-- Check that we can select dates +SELECT * FROM dates ORDER BY ts; + +-- extract function +SELECT extract(year FROM i) FROM dates ORDER BY ts; + +-- Check that we can convert dates to string +SELECT CAST(i AS VARCHAR) FROM dates ORDER BY ts; + +-- Check that we can add days to a date +SELECT i + INTERVAL '5 days' FROM dates ORDER BY ts; + +-- Check that we can subtract days from a date +SELECT i - INTERVAL '5 days' FROM dates ORDER BY ts; + +-- Test date subtraction resulting in interval +SELECT i - DATE '1993-08-14' FROM dates ORDER BY ts; + +-- Test various date formats +CREATE TABLE date_formats(d DATE, ts TIMESTAMP TIME INDEX); + +INSERT INTO date_formats VALUES + ('2021-03-01', 1000), + ('2021-12-31', 2000), + ('2000-01-01', 3000), + ('1970-01-01', 4000); + +SELECT d, extract(year FROM d), extract(month FROM d), extract(day FROM d) FROM date_formats ORDER BY d; + +-- Test date comparison +SELECT d FROM date_formats WHERE d > '2000-01-01' ORDER BY d; + +SELECT d FROM date_formats WHERE d BETWEEN '2000-01-01' AND '2021-06-01' ORDER BY d; + +-- Test NULL handling +INSERT INTO date_formats VALUES (NULL, 5000); + +SELECT COUNT(*), COUNT(d) FROM date_formats; + +DROP TABLE dates; + +DROP TABLE date_formats; diff --git a/tests/cases/standalone/common/types/float/ieee_floating_points.result b/tests/cases/standalone/common/types/float/ieee_floating_points.result new file mode 100644 index 0000000000..69198d490e --- /dev/null +++ b/tests/cases/standalone/common/types/float/ieee_floating_points.result @@ -0,0 +1,144 @@ +-- Migrated from DuckDB test: test/sql/types/float/ieee_floating_points.test +-- Test IEEE floating point behavior +-- Test special float values +CREATE TABLE float_special(f FLOAT, d DOUBLE, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +-- Insert special values +INSERT INTO float_special VALUES + (0.0, 0.0, 1000), + (-0.0, -0.0, 2000), + ('inf'::FLOAT, 'inf'::DOUBLE, 3000), + ('-inf'::FLOAT, '-inf'::DOUBLE, 4000), + ('nan'::FLOAT, 'nan'::DOUBLE, 5000); + +Affected Rows: 5 + +-- Test basic operations with special values +SELECT f, d FROM float_special ORDER BY ts; + ++------+------+ +| f | d | ++------+------+ +| 0.0 | 0.0 | +| -0.0 | -0.0 | +| inf | inf | +| -inf | -inf | +| NaN | NaN | ++------+------+ + +-- Test comparison with infinity +-- It doesn't follow the IEEE standard, but follows PG instead. 
+SELECT f, f > 1000000 FROM float_special ORDER BY ts; + ++------+----------------------------------+ +| f | float_special.f > Int64(1000000) | ++------+----------------------------------+ +| 0.0 | false | +| -0.0 | false | +| inf | true | +| -inf | false | +| NaN | true | ++------+----------------------------------+ + +SELECT d, d < -1000000 FROM float_special ORDER BY ts; + ++------+-----------------------------------+ +| d | float_special.d < Int64(-1000000) | ++------+-----------------------------------+ +| 0.0 | false | +| -0.0 | false | +| inf | false | +| -inf | true | +| NaN | false | ++------+-----------------------------------+ + +-- Test NaN behavior +-- NaN != NaN +SELECT f, f = f FROM float_special WHERE f != f ORDER BY ts; + +++ +++ + +SELECT d, d IS NULL FROM float_special ORDER BY ts; + ++------+-------------------------+ +| d | float_special.d IS NULL | ++------+-------------------------+ +| 0.0 | false | +| -0.0 | false | +| inf | false | +| -inf | false | +| NaN | false | ++------+-------------------------+ + +-- Test arithmetic with special values +SELECT f, f + 1 FROM float_special ORDER BY ts; + ++------+----------------------------+ +| f | float_special.f + Int64(1) | ++------+----------------------------+ +| 0.0 | 1.0 | +| -0.0 | 1.0 | +| inf | inf | +| -inf | -inf | +| NaN | NaN | ++------+----------------------------+ + +SELECT d, d * 2 FROM float_special ORDER BY ts; + ++------+----------------------------+ +| d | float_special.d * Int64(2) | ++------+----------------------------+ +| 0.0 | 0.0 | +| -0.0 | -0.0 | +| inf | inf | +| -inf | -inf | +| NaN | NaN | ++------+----------------------------+ + +-- Test normal floating point precision +CREATE TABLE float_precision(f FLOAT, d DOUBLE, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO float_precision VALUES + (1.23456789, 1.23456789012345, 1000), + (0.000001, 0.000000000001, 2000), + (1e10, 1e15, 3000), + (1e-10, 1e-15, 4000); + +Affected Rows: 4 + +SELECT f, d FROM float_precision ORDER BY ts; + ++---------------+--------------------+ +| f | d | ++---------------+--------------------+ +| 1.2345679 | 1.23456789012345 | +| 0.000001 | 1e-12 | +| 10000000000.0 | 1000000000000000.0 | +| 1e-10 | 1e-15 | ++---------------+--------------------+ + +-- Test rounding and precision +SELECT ROUND(f, 3), ROUND(d, 6) FROM float_precision ORDER BY ts; + ++-----------------------------------+-----------------------------------+ +| round(float_precision.f,Int64(3)) | round(float_precision.d,Int64(6)) | ++-----------------------------------+-----------------------------------+ +| 1.235 | 1.234568 | +| 0.0 | 0.0 | +| 10000000000.0 | 1000000000000000.0 | +| 0.0 | 0.0 | ++-----------------------------------+-----------------------------------+ + +DROP TABLE float_special; + +Affected Rows: 0 + +DROP TABLE float_precision; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/float/ieee_floating_points.sql b/tests/cases/standalone/common/types/float/ieee_floating_points.sql new file mode 100644 index 0000000000..755b206554 --- /dev/null +++ b/tests/cases/standalone/common/types/float/ieee_floating_points.sql @@ -0,0 +1,51 @@ +-- Migrated from DuckDB test: test/sql/types/float/ieee_floating_points.test +-- Test IEEE floating point behavior + +-- Test special float values +CREATE TABLE float_special(f FLOAT, d DOUBLE, ts TIMESTAMP TIME INDEX); + +-- Insert special values +INSERT INTO float_special VALUES + (0.0, 0.0, 1000), + (-0.0, -0.0, 2000), + ('inf'::FLOAT, 'inf'::DOUBLE, 3000), + ('-inf'::FLOAT, 
'-inf'::DOUBLE, 4000), + ('nan'::FLOAT, 'nan'::DOUBLE, 5000); + +-- Test basic operations with special values +SELECT f, d FROM float_special ORDER BY ts; + +-- Test comparison with infinity +-- It doesn't follow the IEEE standard, but follows PG instead. +SELECT f, f > 1000000 FROM float_special ORDER BY ts; + +SELECT d, d < -1000000 FROM float_special ORDER BY ts; + +-- Test NaN behavior +-- NaN != NaN +SELECT f, f = f FROM float_special WHERE f != f ORDER BY ts; + +SELECT d, d IS NULL FROM float_special ORDER BY ts; + +-- Test arithmetic with special values +SELECT f, f + 1 FROM float_special ORDER BY ts; + +SELECT d, d * 2 FROM float_special ORDER BY ts; + +-- Test normal floating point precision +CREATE TABLE float_precision(f FLOAT, d DOUBLE, ts TIMESTAMP TIME INDEX); + +INSERT INTO float_precision VALUES + (1.23456789, 1.23456789012345, 1000), + (0.000001, 0.000000000001, 2000), + (1e10, 1e15, 3000), + (1e-10, 1e-15, 4000); + +SELECT f, d FROM float_precision ORDER BY ts; + +-- Test rounding and precision +SELECT ROUND(f, 3), ROUND(d, 6) FROM float_precision ORDER BY ts; + +DROP TABLE float_special; + +DROP TABLE float_precision; diff --git a/tests/cases/standalone/common/types/float/infinity_nan.result b/tests/cases/standalone/common/types/float/infinity_nan.result new file mode 100644 index 0000000000..dcfbdd81ce --- /dev/null +++ b/tests/cases/standalone/common/types/float/infinity_nan.result @@ -0,0 +1,184 @@ +-- Migrated from DuckDB test: test/sql/types/float/infinity_test.test, nan_aggregate.test +-- Test infinity and NaN handling +-- Note: it doesn't follow the IEEE standard, but follows PG instead: https://www.postgresql.org/docs/current/datatype-numeric.html +-- Test infinity operations +CREATE TABLE inf_test(val DOUBLE, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO inf_test VALUES + ('inf'::DOUBLE, 1000), + ('-inf'::DOUBLE, 2000), + (1.0, 3000), + (-1.0, 4000), + (0.0, 5000); + +Affected Rows: 5 + +-- Test infinity comparisons +SELECT val, val > 0 FROM inf_test ORDER BY ts; + ++------+-------------------------+ +| val | inf_test.val > Int64(0) | ++------+-------------------------+ +| inf | true | +| -inf | false | +| 1.0 | true | +| -1.0 | false | +| 0.0 | false | ++------+-------------------------+ + +SELECT val, val < 0 FROM inf_test ORDER BY ts; + ++------+-------------------------+ +| val | inf_test.val < Int64(0) | ++------+-------------------------+ +| inf | false | +| -inf | true | +| 1.0 | false | +| -1.0 | true | +| 0.0 | false | ++------+-------------------------+ + +SELECT val, val = 'inf'::DOUBLE FROM inf_test ORDER BY ts; + ++------+----------------------------+ +| val | inf_test.val = Utf8("inf") | ++------+----------------------------+ +| inf | true | +| -inf | false | +| 1.0 | false | +| -1.0 | false | +| 0.0 | false | ++------+----------------------------+ + +-- Test infinity in aggregates +SELECT MAX(val), MIN(val) FROM inf_test; + ++-------------------+-------------------+ +| max(inf_test.val) | min(inf_test.val) | ++-------------------+-------------------+ +| inf | -inf | ++-------------------+-------------------+ + +SELECT SUM(val), AVG(val) FROM inf_test; + ++-------------------+-------------------+ +| sum(inf_test.val) | avg(inf_test.val) | ++-------------------+-------------------+ +| NaN | NaN | ++-------------------+-------------------+ + +-- Test NaN behavior +CREATE TABLE nan_test(val DOUBLE, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO nan_test VALUES + ('nan'::DOUBLE, 1000), + (1.0, 2000), + (2.0, 3000), + 
('nan'::DOUBLE, 4000), + (3.0, 5000); + +Affected Rows: 5 + +-- Test NaN in aggregates +SELECT COUNT(*), COUNT(val) FROM nan_test; + ++----------+---------------------+ +| count(*) | count(nan_test.val) | ++----------+---------------------+ +| 5 | 5 | ++----------+---------------------+ + +SELECT MAX(val), MIN(val) FROM nan_test; + ++-------------------+-------------------+ +| max(nan_test.val) | min(nan_test.val) | ++-------------------+-------------------+ +| NaN | 1.0 | ++-------------------+-------------------+ + +SELECT SUM(val), AVG(val) FROM nan_test; + ++-------------------+-------------------+ +| sum(nan_test.val) | avg(nan_test.val) | ++-------------------+-------------------+ +| NaN | NaN | ++-------------------+-------------------+ + +-- Test NaN comparisons +SELECT val, val = val FROM nan_test ORDER BY ts; + ++-----+-----------------------------+ +| val | nan_test.val = nan_test.val | ++-----+-----------------------------+ +| NaN | true | +| 1.0 | true | +| 2.0 | true | +| NaN | true | +| 3.0 | true | ++-----+-----------------------------+ + +SELECT val, val IS NULL FROM nan_test ORDER BY ts; + ++-----+----------------------+ +| val | nan_test.val IS NULL | ++-----+----------------------+ +| NaN | false | +| 1.0 | false | +| 2.0 | false | +| NaN | false | +| 3.0 | false | ++-----+----------------------+ + +-- Test arithmetic with infinity and NaN +SELECT 'inf'::DOUBLE + 1; + ++------------------------+ +| Utf8("inf") + Int64(1) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::DOUBLE - 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("inf") - Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'inf'::DOUBLE * 0; + ++------------------------+ +| Utf8("inf") * Int64(0) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE + 1; + ++------------------------+ +| Utf8("nan") + Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE * 0; + ++------------------------+ +| Utf8("nan") * Int64(0) | ++------------------------+ +| NaN | ++------------------------+ + +DROP TABLE inf_test; + +Affected Rows: 0 + +DROP TABLE nan_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/float/infinity_nan.sql b/tests/cases/standalone/common/types/float/infinity_nan.sql new file mode 100644 index 0000000000..5f495170e1 --- /dev/null +++ b/tests/cases/standalone/common/types/float/infinity_nan.sql @@ -0,0 +1,61 @@ +-- Migrated from DuckDB test: test/sql/types/float/infinity_test.test, nan_aggregate.test +-- Test infinity and NaN handling +-- Note: it doesn't follow the IEEE standard, but follows PG instead: https://www.postgresql.org/docs/current/datatype-numeric.html +-- Test infinity operations +CREATE TABLE inf_test(val DOUBLE, ts TIMESTAMP TIME INDEX); + +INSERT INTO inf_test VALUES + ('inf'::DOUBLE, 1000), + ('-inf'::DOUBLE, 2000), + (1.0, 3000), + (-1.0, 4000), + (0.0, 5000); + +-- Test infinity comparisons +SELECT val, val > 0 FROM inf_test ORDER BY ts; + +SELECT val, val < 0 FROM inf_test ORDER BY ts; + +SELECT val, val = 'inf'::DOUBLE FROM inf_test ORDER BY ts; + +-- Test infinity in aggregates +SELECT MAX(val), MIN(val) FROM inf_test; + +SELECT SUM(val), AVG(val) FROM inf_test; + +-- Test NaN behavior +CREATE TABLE nan_test(val DOUBLE, ts TIMESTAMP TIME INDEX); + +INSERT INTO nan_test VALUES + ('nan'::DOUBLE, 1000), + (1.0, 2000), + (2.0, 3000), + ('nan'::DOUBLE, 4000), + (3.0, 5000); + +-- Test NaN in 
aggregates +SELECT COUNT(*), COUNT(val) FROM nan_test; + +SELECT MAX(val), MIN(val) FROM nan_test; + +SELECT SUM(val), AVG(val) FROM nan_test; + +-- Test NaN comparisons +SELECT val, val = val FROM nan_test ORDER BY ts; + +SELECT val, val IS NULL FROM nan_test ORDER BY ts; + +-- Test arithmetic with infinity and NaN +SELECT 'inf'::DOUBLE + 1; + +SELECT 'inf'::DOUBLE - 'inf'::DOUBLE; + +SELECT 'inf'::DOUBLE * 0; + +SELECT 'nan'::DOUBLE + 1; + +SELECT 'nan'::DOUBLE * 0; + +DROP TABLE inf_test; + +DROP TABLE nan_test; diff --git a/tests/cases/standalone/common/types/float/nan_arithmetic_extended.result b/tests/cases/standalone/common/types/float/nan_arithmetic_extended.result new file mode 100644 index 0000000000..392b79fa4a --- /dev/null +++ b/tests/cases/standalone/common/types/float/nan_arithmetic_extended.result @@ -0,0 +1,317 @@ +-- Migrated from DuckDB test: test/sql/types/float/nan_arithmetic.test +-- Test arithmetic on NaN values +-- Test NaN arithmetic with FLOAT +-- Any arithmetic on a NaN value will result in a NaN value +SELECT 'nan'::FLOAT + 1; + ++------------------------+ +| Utf8("nan") + Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::FLOAT + 'inf'::FLOAT; + ++---------------------------+ +| Utf8("nan") + Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::FLOAT - 1; + ++------------------------+ +| Utf8("nan") - Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::FLOAT - 'inf'::FLOAT; + ++---------------------------+ +| Utf8("nan") - Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::FLOAT * 1; + ++------------------------+ +| Utf8("nan") * Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::FLOAT * 'inf'::FLOAT; + ++---------------------------+ +| Utf8("nan") * Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::FLOAT / 1; + ++------------------------+ +| Utf8("nan") / Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::FLOAT / 'inf'::FLOAT; + ++---------------------------+ +| Utf8("nan") / Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::FLOAT % 1; + ++------------------------+ +| Utf8("nan") % Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::FLOAT % 'inf'::FLOAT; + ++---------------------------+ +| Utf8("nan") % Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT -('nan'::FLOAT); + ++-----------------+ +| (- Utf8("nan")) | ++-----------------+ +| NaN | ++-----------------+ + +-- Test NaN arithmetic with DOUBLE +SELECT 'nan'::DOUBLE + 1; + ++------------------------+ +| Utf8("nan") + Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE + 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("nan") + Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::DOUBLE - 1; + ++------------------------+ +| Utf8("nan") - Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE - 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("nan") - Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::DOUBLE * 1; + ++------------------------+ +| 
Utf8("nan") * Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE * 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("nan") * Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::DOUBLE / 1; + ++------------------------+ +| Utf8("nan") / Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE / 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("nan") / Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT 'nan'::DOUBLE % 1; + ++------------------------+ +| Utf8("nan") % Int64(1) | ++------------------------+ +| NaN | ++------------------------+ + +SELECT 'nan'::DOUBLE % 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("nan") % Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +SELECT -('nan'::DOUBLE); + ++-----------------+ +| (- Utf8("nan")) | ++-----------------+ +| NaN | ++-----------------+ + +-- Test infinity arithmetic +SELECT 'inf'::FLOAT + 1; + ++------------------------+ +| Utf8("inf") + Int64(1) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::FLOAT - 1; + ++------------------------+ +| Utf8("inf") - Int64(1) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::FLOAT * 2; + ++------------------------+ +| Utf8("inf") * Int64(2) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::FLOAT / 2; + ++------------------------+ +| Utf8("inf") / Int64(2) | ++------------------------+ +| inf | ++------------------------+ + +SELECT -('inf'::FLOAT); + ++-----------------+ +| (- Utf8("inf")) | ++-----------------+ +| -inf | ++-----------------+ + +SELECT 'inf'::DOUBLE + 1; + ++------------------------+ +| Utf8("inf") + Int64(1) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::DOUBLE - 1; + ++------------------------+ +| Utf8("inf") - Int64(1) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::DOUBLE * 2; + ++------------------------+ +| Utf8("inf") * Int64(2) | ++------------------------+ +| inf | ++------------------------+ + +SELECT 'inf'::DOUBLE / 2; + ++------------------------+ +| Utf8("inf") / Int64(2) | ++------------------------+ +| inf | ++------------------------+ + +SELECT -('inf'::DOUBLE); + ++-----------------+ +| (- Utf8("inf")) | ++-----------------+ +| -inf | ++-----------------+ + +-- Test special infinity cases +-- Should be NaN +SELECT 'inf'::FLOAT - 'inf'::FLOAT; + ++---------------------------+ +| Utf8("inf") - Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +-- Should be NaN +SELECT 'inf'::FLOAT / 'inf'::FLOAT; + ++---------------------------+ +| Utf8("inf") / Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +-- Should be NaN +SELECT 'inf'::FLOAT * 0; + ++------------------------+ +| Utf8("inf") * Int64(0) | ++------------------------+ +| NaN | ++------------------------+ + +-- Should be NaN +SELECT 'inf'::DOUBLE - 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("inf") - Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +-- Should be NaN +SELECT 'inf'::DOUBLE / 'inf'::DOUBLE; + ++---------------------------+ +| Utf8("inf") / Utf8("inf") | ++---------------------------+ +| NaN | ++---------------------------+ + +-- Should be NaN +SELECT 'inf'::DOUBLE * 0; + 
++------------------------+ +| Utf8("inf") * Int64(0) | ++------------------------+ +| NaN | ++------------------------+ + diff --git a/tests/cases/standalone/common/types/float/nan_arithmetic_extended.sql b/tests/cases/standalone/common/types/float/nan_arithmetic_extended.sql new file mode 100644 index 0000000000..fe3d24c35c --- /dev/null +++ b/tests/cases/standalone/common/types/float/nan_arithmetic_extended.sql @@ -0,0 +1,91 @@ +-- Migrated from DuckDB test: test/sql/types/float/nan_arithmetic.test +-- Test arithmetic on NaN values + +-- Test NaN arithmetic with FLOAT +-- Any arithmetic on a NaN value will result in a NaN value + +SELECT 'nan'::FLOAT + 1; + +SELECT 'nan'::FLOAT + 'inf'::FLOAT; + +SELECT 'nan'::FLOAT - 1; + +SELECT 'nan'::FLOAT - 'inf'::FLOAT; + +SELECT 'nan'::FLOAT * 1; + +SELECT 'nan'::FLOAT * 'inf'::FLOAT; + +SELECT 'nan'::FLOAT / 1; + +SELECT 'nan'::FLOAT / 'inf'::FLOAT; + +SELECT 'nan'::FLOAT % 1; + +SELECT 'nan'::FLOAT % 'inf'::FLOAT; + +SELECT -('nan'::FLOAT); + +-- Test NaN arithmetic with DOUBLE +SELECT 'nan'::DOUBLE + 1; + +SELECT 'nan'::DOUBLE + 'inf'::DOUBLE; + +SELECT 'nan'::DOUBLE - 1; + +SELECT 'nan'::DOUBLE - 'inf'::DOUBLE; + +SELECT 'nan'::DOUBLE * 1; + +SELECT 'nan'::DOUBLE * 'inf'::DOUBLE; + +SELECT 'nan'::DOUBLE / 1; + +SELECT 'nan'::DOUBLE / 'inf'::DOUBLE; + +SELECT 'nan'::DOUBLE % 1; + +SELECT 'nan'::DOUBLE % 'inf'::DOUBLE; + +SELECT -('nan'::DOUBLE); + +-- Test infinity arithmetic +SELECT 'inf'::FLOAT + 1; + +SELECT 'inf'::FLOAT - 1; + +SELECT 'inf'::FLOAT * 2; + +SELECT 'inf'::FLOAT / 2; + +SELECT -('inf'::FLOAT); + +SELECT 'inf'::DOUBLE + 1; + +SELECT 'inf'::DOUBLE - 1; + +SELECT 'inf'::DOUBLE * 2; + +SELECT 'inf'::DOUBLE / 2; + +SELECT -('inf'::DOUBLE); + +-- Test special infinity cases +-- Should be NaN +SELECT 'inf'::FLOAT - 'inf'::FLOAT; + +-- Should be NaN +SELECT 'inf'::FLOAT / 'inf'::FLOAT; + +-- Should be NaN +SELECT 'inf'::FLOAT * 0; + +-- Should be NaN +SELECT 'inf'::DOUBLE - 'inf'::DOUBLE; + +-- Should be NaN +SELECT 'inf'::DOUBLE / 'inf'::DOUBLE; + +-- Should be NaN +SELECT 'inf'::DOUBLE * 0; + diff --git a/tests/cases/standalone/common/types/float/nan_cast_extended.result b/tests/cases/standalone/common/types/float/nan_cast_extended.result new file mode 100644 index 0000000000..11098a1001 --- /dev/null +++ b/tests/cases/standalone/common/types/float/nan_cast_extended.result @@ -0,0 +1,252 @@ +-- Migrated from DuckDB test: test/sql/types/float/nan_cast.test +-- Test casting of NaN and inf values +-- Test valid casts between FLOAT, DOUBLE, and VARCHAR +-- NaN casts +SELECT 'nan'::FLOAT::DOUBLE; + ++-------------+ +| Utf8("nan") | ++-------------+ +| NaN | ++-------------+ + +SELECT 'nan'::FLOAT::VARCHAR; + ++-------------+ +| Utf8("nan") | ++-------------+ +| NaN | ++-------------+ + +SELECT 'nan'::DOUBLE::FLOAT; + ++-------------+ +| Utf8("nan") | ++-------------+ +| NaN | ++-------------+ + +SELECT 'nan'::DOUBLE::VARCHAR; + ++-------------+ +| Utf8("nan") | ++-------------+ +| NaN | ++-------------+ + +SELECT 'nan'::VARCHAR::FLOAT; + ++-------------+ +| Utf8("nan") | ++-------------+ +| NaN | ++-------------+ + +SELECT 'nan'::VARCHAR::DOUBLE; + ++-------------+ +| Utf8("nan") | ++-------------+ +| NaN | ++-------------+ + +-- Infinity casts +SELECT 'inf'::FLOAT::DOUBLE; + ++-------------+ +| Utf8("inf") | ++-------------+ +| inf | ++-------------+ + +SELECT 'inf'::FLOAT::VARCHAR; + ++-------------+ +| Utf8("inf") | ++-------------+ +| inf | ++-------------+ + +SELECT 'inf'::DOUBLE::FLOAT; + ++-------------+ +| Utf8("inf") | 
++-------------+ +| inf | ++-------------+ + +SELECT 'inf'::DOUBLE::VARCHAR; + ++-------------+ +| Utf8("inf") | ++-------------+ +| inf | ++-------------+ + +SELECT 'inf'::VARCHAR::FLOAT; + ++-------------+ +| Utf8("inf") | ++-------------+ +| inf | ++-------------+ + +SELECT 'inf'::VARCHAR::DOUBLE; + ++-------------+ +| Utf8("inf") | ++-------------+ +| inf | ++-------------+ + +-- Negative infinity casts +SELECT '-inf'::FLOAT::DOUBLE; + ++--------------+ +| Utf8("-inf") | ++--------------+ +| -inf | ++--------------+ + +SELECT '-inf'::FLOAT::VARCHAR; + ++--------------+ +| Utf8("-inf") | ++--------------+ +| -inf | ++--------------+ + +SELECT '-inf'::DOUBLE::FLOAT; + ++--------------+ +| Utf8("-inf") | ++--------------+ +| -inf | ++--------------+ + +SELECT '-inf'::DOUBLE::VARCHAR; + ++--------------+ +| Utf8("-inf") | ++--------------+ +| -inf | ++--------------+ + +SELECT '-inf'::VARCHAR::FLOAT; + ++--------------+ +| Utf8("-inf") | ++--------------+ +| -inf | ++--------------+ + +SELECT '-inf'::VARCHAR::DOUBLE; + ++--------------+ +| Utf8("-inf") | ++--------------+ +| -inf | ++--------------+ + +-- Test TRY_CAST for invalid conversions (should return NULL) +SELECT TRY_CAST('nan'::FLOAT AS INTEGER); + ++-------------+ +| Utf8("nan") | ++-------------+ +| | ++-------------+ + +SELECT TRY_CAST('inf'::FLOAT AS INTEGER); + ++-------------+ +| Utf8("inf") | ++-------------+ +| | ++-------------+ + +SELECT TRY_CAST('-inf'::FLOAT AS INTEGER); + ++--------------+ +| Utf8("-inf") | ++--------------+ +| | ++--------------+ + +SELECT TRY_CAST('nan'::DOUBLE AS BIGINT); + ++-------------+ +| Utf8("nan") | ++-------------+ +| | ++-------------+ + +SELECT TRY_CAST('inf'::DOUBLE AS BIGINT); + ++-------------+ +| Utf8("inf") | ++-------------+ +| | ++-------------+ + +SELECT TRY_CAST('-inf'::DOUBLE AS BIGINT); + ++--------------+ +| Utf8("-inf") | ++--------------+ +| | ++--------------+ + +-- Test with table data +CREATE TABLE cast_test(f FLOAT, d DOUBLE, s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO cast_test VALUES + ('nan'::FLOAT, 'nan'::DOUBLE, 'nan', 1000), + ('inf'::FLOAT, 'inf'::DOUBLE, 'inf', 2000), + ('-inf'::FLOAT, '-inf'::DOUBLE, '-inf', 3000), + (1.5, 2.5, '3.5', 4000); + +Affected Rows: 4 + +-- Cast between float types +SELECT f, f::DOUBLE AS fd, d, d::FLOAT AS df FROM cast_test ORDER BY ts; + ++------+------+------+------+ +| f | fd | d | df | ++------+------+------+------+ +| NaN | NaN | NaN | NaN | +| inf | inf | inf | inf | +| -inf | -inf | -inf | -inf | +| 1.5 | 1.5 | 2.5 | 2.5 | ++------+------+------+------+ + +-- Cast to string +SELECT f::VARCHAR, d::VARCHAR FROM cast_test ORDER BY ts; + ++-------------+-------------+ +| cast_test.f | cast_test.d | ++-------------+-------------+ +| NaN | NaN | +| inf | inf | +| -inf | -inf | +| 1.5 | 2.5 | ++-------------+-------------+ + +-- Cast from string +SELECT s, TRY_CAST(s AS FLOAT) AS sf, TRY_CAST(s AS DOUBLE) AS sd FROM cast_test ORDER BY ts; + ++------+------+------+ +| s | sf | sd | ++------+------+------+ +| nan | NaN | NaN | +| inf | inf | inf | +| -inf | -inf | -inf | +| 3.5 | 3.5 | 3.5 | ++------+------+------+ + +DROP TABLE cast_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/float/nan_cast_extended.sql b/tests/cases/standalone/common/types/float/nan_cast_extended.sql new file mode 100644 index 0000000000..5c5caeeec9 --- /dev/null +++ b/tests/cases/standalone/common/types/float/nan_cast_extended.sql @@ -0,0 +1,76 @@ +-- Migrated from DuckDB test: 
test/sql/types/float/nan_cast.test +-- Test casting of NaN and inf values + +-- Test valid casts between FLOAT, DOUBLE, and VARCHAR + +-- NaN casts +SELECT 'nan'::FLOAT::DOUBLE; + +SELECT 'nan'::FLOAT::VARCHAR; + +SELECT 'nan'::DOUBLE::FLOAT; + +SELECT 'nan'::DOUBLE::VARCHAR; + +SELECT 'nan'::VARCHAR::FLOAT; + +SELECT 'nan'::VARCHAR::DOUBLE; + +-- Infinity casts +SELECT 'inf'::FLOAT::DOUBLE; + +SELECT 'inf'::FLOAT::VARCHAR; + +SELECT 'inf'::DOUBLE::FLOAT; + +SELECT 'inf'::DOUBLE::VARCHAR; + +SELECT 'inf'::VARCHAR::FLOAT; + +SELECT 'inf'::VARCHAR::DOUBLE; + +-- Negative infinity casts +SELECT '-inf'::FLOAT::DOUBLE; + +SELECT '-inf'::FLOAT::VARCHAR; + +SELECT '-inf'::DOUBLE::FLOAT; + +SELECT '-inf'::DOUBLE::VARCHAR; + +SELECT '-inf'::VARCHAR::FLOAT; + +SELECT '-inf'::VARCHAR::DOUBLE; + +-- Test TRY_CAST for invalid conversions (should return NULL) +SELECT TRY_CAST('nan'::FLOAT AS INTEGER); + +SELECT TRY_CAST('inf'::FLOAT AS INTEGER); + +SELECT TRY_CAST('-inf'::FLOAT AS INTEGER); + +SELECT TRY_CAST('nan'::DOUBLE AS BIGINT); + +SELECT TRY_CAST('inf'::DOUBLE AS BIGINT); + +SELECT TRY_CAST('-inf'::DOUBLE AS BIGINT); + +-- Test with table data +CREATE TABLE cast_test(f FLOAT, d DOUBLE, s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO cast_test VALUES + ('nan'::FLOAT, 'nan'::DOUBLE, 'nan', 1000), + ('inf'::FLOAT, 'inf'::DOUBLE, 'inf', 2000), + ('-inf'::FLOAT, '-inf'::DOUBLE, '-inf', 3000), + (1.5, 2.5, '3.5', 4000); + +-- Cast between float types +SELECT f, f::DOUBLE AS fd, d, d::FLOAT AS df FROM cast_test ORDER BY ts; + +-- Cast to string +SELECT f::VARCHAR, d::VARCHAR FROM cast_test ORDER BY ts; + +-- Cast from string +SELECT s, TRY_CAST(s AS FLOAT) AS sf, TRY_CAST(s AS DOUBLE) AS sd FROM cast_test ORDER BY ts; + +DROP TABLE cast_test; diff --git a/tests/cases/standalone/common/types/null/null_handling.result b/tests/cases/standalone/common/types/null/null_handling.result new file mode 100644 index 0000000000..320ced424c --- /dev/null +++ b/tests/cases/standalone/common/types/null/null_handling.result @@ -0,0 +1,171 @@ +-- Migrated from DuckDB test: test/sql/types/null/test_null.test +-- Test NULL value handling across different contexts +-- Test NULL in basic operations +CREATE TABLE null_test(i INTEGER, s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO null_test VALUES + (1, 'hello', 1000), + (NULL, 'world', 2000), + (3, NULL, 3000), + (NULL, NULL, 4000); + +Affected Rows: 4 + +-- Test NULL comparisons +SELECT i, s FROM null_test WHERE i IS NULL ORDER BY ts; + ++---+-------+ +| i | s | ++---+-------+ +| | world | +| | | ++---+-------+ + +SELECT i, s FROM null_test WHERE i IS NOT NULL ORDER BY ts; + ++---+-------+ +| i | s | ++---+-------+ +| 1 | hello | +| 3 | | ++---+-------+ + +SELECT i, s FROM null_test WHERE s IS NULL ORDER BY ts; + ++---+---+ +| i | s | ++---+---+ +| 3 | | +| | | ++---+---+ + +SELECT i, s FROM null_test WHERE s IS NOT NULL ORDER BY ts; + ++---+-------+ +| i | s | ++---+-------+ +| 1 | hello | +| | world | ++---+-------+ + +-- Test NULL in arithmetic +SELECT i, i + 1, i * 2, i - 5 FROM null_test ORDER BY ts; + ++---+------------------------+------------------------+------------------------+ +| i | null_test.i + Int64(1) | null_test.i * Int64(2) | null_test.i - Int64(5) | ++---+------------------------+------------------------+------------------------+ +| 1 | 2 | 2 | -4 | +| | | | | +| 3 | 4 | 6 | -2 | +| | | | | ++---+------------------------+------------------------+------------------------+ + +-- Test NULL in string operations +SELECT s, 
CONCAT(s, ' test'), UPPER(s), LENGTH(s) FROM null_test ORDER BY ts; + ++-------+-----------------------------------+--------------------+---------------------+ +| s | concat(null_test.s,Utf8(" test")) | upper(null_test.s) | length(null_test.s) | ++-------+-----------------------------------+--------------------+---------------------+ +| hello | hello test | HELLO | 5 | +| world | world test | WORLD | 5 | +| | test | | | +| | test | | | ++-------+-----------------------------------+--------------------+---------------------+ + +-- Test NULL with COALESCE +SELECT i, s, COALESCE(i, -1), COALESCE(s, 'missing') FROM null_test ORDER BY ts; + ++---+-------+---------------------------------+---------------------------------------+ +| i | s | coalesce(null_test.i,Int64(-1)) | coalesce(null_test.s,Utf8("missing")) | ++---+-------+---------------------------------+---------------------------------------+ +| 1 | hello | 1 | hello | +| | world | -1 | world | +| 3 | | 3 | missing | +| | | -1 | missing | ++---+-------+---------------------------------+---------------------------------------+ + +-- Test NULL in aggregates +SELECT COUNT(*), COUNT(i), COUNT(s) FROM null_test; + ++----------+--------------------+--------------------+ +| count(*) | count(null_test.i) | count(null_test.s) | ++----------+--------------------+--------------------+ +| 4 | 2 | 2 | ++----------+--------------------+--------------------+ + +SELECT SUM(i), AVG(i), MAX(i), MIN(i) FROM null_test; + ++------------------+------------------+------------------+------------------+ +| sum(null_test.i) | avg(null_test.i) | max(null_test.i) | min(null_test.i) | ++------------------+------------------+------------------+------------------+ +| 4 | 2.0 | 3 | 1 | ++------------------+------------------+------------------+------------------+ + +-- Test NULL in CASE expressions +SELECT i, s, + CASE + WHEN i IS NULL THEN 'no number' + WHEN i > 2 THEN 'big number' + ELSE 'small number' + END as category +FROM null_test ORDER BY ts; + ++---+-------+--------------+ +| i | s | category | ++---+-------+--------------+ +| 1 | hello | small number | +| | world | no number | +| 3 | | big number | +| | | no number | ++---+-------+--------------+ + +-- Test NULL in GROUP BY +SELECT i, COUNT(*) FROM null_test GROUP BY i ORDER BY i; + ++---+----------+ +| i | count(*) | ++---+----------+ +| 1 | 1 | +| 3 | 1 | +| | 2 | ++---+----------+ + +SELECT s, COUNT(*) FROM null_test GROUP BY s ORDER BY s; + ++-------+----------+ +| s | count(*) | ++-------+----------+ +| hello | 1 | +| world | 1 | +| | 2 | ++-------+----------+ + +-- Test NULLIF function +SELECT i, NULLIF(i, 1) FROM null_test ORDER BY ts; + ++---+------------------------------+ +| i | nullif(null_test.i,Int64(1)) | ++---+------------------------------+ +| 1 | | +| | | +| 3 | 3 | +| | | ++---+------------------------------+ + +SELECT s, NULLIF(s, 'hello') FROM null_test ORDER BY ts; + ++-------+-----------------------------------+ +| s | nullif(null_test.s,Utf8("hello")) | ++-------+-----------------------------------+ +| hello | | +| world | world | +| | | +| | | ++-------+-----------------------------------+ + +DROP TABLE null_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/null/null_handling.sql b/tests/cases/standalone/common/types/null/null_handling.sql new file mode 100644 index 0000000000..e0fb460778 --- /dev/null +++ b/tests/cases/standalone/common/types/null/null_handling.sql @@ -0,0 +1,49 @@ +-- Migrated from DuckDB test: test/sql/types/null/test_null.test +-- Test 
NULL value handling across different contexts + +-- Test NULL in basic operations +CREATE TABLE null_test(i INTEGER, s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO null_test VALUES + (1, 'hello', 1000), + (NULL, 'world', 2000), + (3, NULL, 3000), + (NULL, NULL, 4000); + +-- Test NULL comparisons +SELECT i, s FROM null_test WHERE i IS NULL ORDER BY ts; +SELECT i, s FROM null_test WHERE i IS NOT NULL ORDER BY ts; +SELECT i, s FROM null_test WHERE s IS NULL ORDER BY ts; +SELECT i, s FROM null_test WHERE s IS NOT NULL ORDER BY ts; + +-- Test NULL in arithmetic +SELECT i, i + 1, i * 2, i - 5 FROM null_test ORDER BY ts; + +-- Test NULL in string operations +SELECT s, CONCAT(s, ' test'), UPPER(s), LENGTH(s) FROM null_test ORDER BY ts; + +-- Test NULL with COALESCE +SELECT i, s, COALESCE(i, -1), COALESCE(s, 'missing') FROM null_test ORDER BY ts; + +-- Test NULL in aggregates +SELECT COUNT(*), COUNT(i), COUNT(s) FROM null_test; +SELECT SUM(i), AVG(i), MAX(i), MIN(i) FROM null_test; + +-- Test NULL in CASE expressions +SELECT i, s, + CASE + WHEN i IS NULL THEN 'no number' + WHEN i > 2 THEN 'big number' + ELSE 'small number' + END as category +FROM null_test ORDER BY ts; + +-- Test NULL in GROUP BY +SELECT i, COUNT(*) FROM null_test GROUP BY i ORDER BY i; +SELECT s, COUNT(*) FROM null_test GROUP BY s ORDER BY s; + +-- Test NULLIF function +SELECT i, NULLIF(i, 1) FROM null_test ORDER BY ts; +SELECT s, NULLIF(s, 'hello') FROM null_test ORDER BY ts; + +DROP TABLE null_test; diff --git a/tests/cases/standalone/common/types/string/big_strings.result b/tests/cases/standalone/common/types/string/big_strings.result new file mode 100644 index 0000000000..a81ff17cf5 --- /dev/null +++ b/tests/cases/standalone/common/types/string/big_strings.result @@ -0,0 +1,116 @@ +-- Migrated from DuckDB test: test/sql/types/string/test_big_strings.test +-- Test handling of large strings +-- Test large string creation and manipulation +CREATE TABLE big_strings("id" INTEGER, s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +-- Insert strings of various sizes +INSERT INTO big_strings VALUES + (1, REPEAT('a', 100), 1000), + (2, REPEAT('Hello World! ', 50), 2000), + (3, REPEAT('Unicode 世界 ', 100), 3000), + (4, REPEAT('x', 1000), 4000); + +Affected Rows: 4 + +-- Test length of big strings +SELECT "id", LENGTH(s) FROM big_strings ORDER BY "id"; + ++----+-----------------------+ +| id | length(big_strings.s) | ++----+-----------------------+ +| 1 | 100 | +| 2 | 650 | +| 3 | 1100 | +| 4 | 1000 | ++----+-----------------------+ + +-- Test substring operations on big strings +SELECT "id", SUBSTRING(s, 1, 20) FROM big_strings ORDER BY "id"; + ++----+------------------------------------------+ +| id | substr(big_strings.s,Int64(1),Int64(20)) | ++----+------------------------------------------+ +| 1 | aaaaaaaaaaaaaaaaaaaa | +| 2 | Hello World! Hello W | +| 3 | Unicode 世界 Unicode 世 | +| 4 | xxxxxxxxxxxxxxxxxxxx | ++----+------------------------------------------+ + +SELECT "id", RIGHT(s, 10) FROM big_strings ORDER BY "id"; + ++----+--------------------------------+ +| id | right(big_strings.s,Int64(10)) | ++----+--------------------------------+ +| 1 | aaaaaaaaaa | +| 2 | lo World! 
| +| 3 | nicode 世界 | +| 4 | xxxxxxxxxx | ++----+--------------------------------+ + +-- Test concatenation with big strings +SELECT "id", LENGTH(s || s) FROM big_strings WHERE "id" = 1; + ++----+----------------------------------------+ +| id | length(big_strings.s || big_strings.s) | ++----+----------------------------------------+ +| 1 | 200 | ++----+----------------------------------------+ + +-- Test pattern matching on big strings +SELECT "id", s LIKE '%World%' FROM big_strings ORDER BY "id"; + ++----+------------------------------------+ +| id | big_strings.s LIKE Utf8("%World%") | ++----+------------------------------------+ +| 1 | false | +| 2 | true | +| 3 | false | +| 4 | false | ++----+------------------------------------+ + +-- Test comparison with big strings +SELECT COUNT(*) FROM big_strings WHERE s = REPEAT('a', 100); + ++----------+ +| count(*) | ++----------+ +| 1 | ++----------+ + +-- Test UPPER/LOWER on big strings +SELECT "id", LENGTH(UPPER(s)) FROM big_strings WHERE "id" <= 2 ORDER BY "id"; + ++----+------------------------------+ +| id | length(upper(big_strings.s)) | ++----+------------------------------+ +| 1 | 100 | +| 2 | 650 | ++----+------------------------------+ + +-- Test trimming big strings +CREATE TABLE padded_strings(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO padded_strings VALUES (CONCAT(' ', REPEAT('test', 100), ' '), 1000); + +Affected Rows: 1 + +SELECT LENGTH(s), LENGTH(TRIM(s)) FROM padded_strings; + ++--------------------------+---------------------------------+ +| length(padded_strings.s) | length(btrim(padded_strings.s)) | ++--------------------------+---------------------------------+ +| 406 | 400 | ++--------------------------+---------------------------------+ + +DROP TABLE big_strings; + +Affected Rows: 0 + +DROP TABLE padded_strings; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/string/big_strings.sql b/tests/cases/standalone/common/types/string/big_strings.sql new file mode 100644 index 0000000000..0c654ecb01 --- /dev/null +++ b/tests/cases/standalone/common/types/string/big_strings.sql @@ -0,0 +1,43 @@ +-- Migrated from DuckDB test: test/sql/types/string/test_big_strings.test +-- Test handling of large strings + +-- Test large string creation and manipulation +CREATE TABLE big_strings("id" INTEGER, s VARCHAR, ts TIMESTAMP TIME INDEX); + +-- Insert strings of various sizes +INSERT INTO big_strings VALUES + (1, REPEAT('a', 100), 1000), + (2, REPEAT('Hello World! 
', 50), 2000), + (3, REPEAT('Unicode 世界 ', 100), 3000), + (4, REPEAT('x', 1000), 4000); + +-- Test length of big strings +SELECT "id", LENGTH(s) FROM big_strings ORDER BY "id"; + +-- Test substring operations on big strings +SELECT "id", SUBSTRING(s, 1, 20) FROM big_strings ORDER BY "id"; + +SELECT "id", RIGHT(s, 10) FROM big_strings ORDER BY "id"; + +-- Test concatenation with big strings +SELECT "id", LENGTH(s || s) FROM big_strings WHERE "id" = 1; + +-- Test pattern matching on big strings +SELECT "id", s LIKE '%World%' FROM big_strings ORDER BY "id"; + +-- Test comparison with big strings +SELECT COUNT(*) FROM big_strings WHERE s = REPEAT('a', 100); + +-- Test UPPER/LOWER on big strings +SELECT "id", LENGTH(UPPER(s)) FROM big_strings WHERE "id" <= 2 ORDER BY "id"; + +-- Test trimming big strings +CREATE TABLE padded_strings(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO padded_strings VALUES (CONCAT(' ', REPEAT('test', 100), ' '), 1000); + +SELECT LENGTH(s), LENGTH(TRIM(s)) FROM padded_strings; + +DROP TABLE big_strings; + +DROP TABLE padded_strings; diff --git a/tests/cases/standalone/common/types/string/unicode_extended.result b/tests/cases/standalone/common/types/string/unicode_extended.result new file mode 100644 index 0000000000..6a1ad83b85 --- /dev/null +++ b/tests/cases/standalone/common/types/string/unicode_extended.result @@ -0,0 +1,103 @@ +-- Migrated from DuckDB test: test/sql/types/string/test_unicode.test +-- Test Unicode string handling +-- Test basic Unicode strings +CREATE TABLE unicode_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +INSERT INTO unicode_test VALUES + ('Hello 世界', 1000), + ('Ññññ', 2000), + ('🚀🎉🌟', 3000), + ('Здравствуй мир', 4000), + ('مرحبا بالعالم', 5000), + ('こんにちは世界', 6000); + +Affected Rows: 6 + +-- Test basic selection +SELECT s FROM unicode_test ORDER BY ts; + ++----------------+ +| s | ++----------------+ +| Hello 世界 | +| Ññññ | +| 🚀🎉🌟 | +| Здравствуй мир | +| مرحبا بالعالم | +| こんにちは世界 | ++----------------+ + +-- Test length function with Unicode +SELECT s, LENGTH(s) AS a, CHAR_LENGTH(s) AS b FROM unicode_test ORDER BY ts; + ++----------------+----+----+ +| s | a | b | ++----------------+----+----+ +| Hello 世界 | 8 | 8 | +| Ññññ | 4 | 4 | +| 🚀🎉🌟 | 3 | 3 | +| Здравствуй мир | 14 | 14 | +| مرحبا بالعالم | 13 | 13 | +| こんにちは世界 | 7 | 7 | ++----------------+----+----+ + +-- Test substring with Unicode +SELECT s, SUBSTRING(s, 1, 5) FROM unicode_test ORDER BY ts; + ++----------------+------------------------------------------+ +| s | substr(unicode_test.s,Int64(1),Int64(5)) | ++----------------+------------------------------------------+ +| Hello 世界 | Hello | +| Ññññ | Ññññ | +| 🚀🎉🌟 | 🚀🎉🌟 | +| Здравствуй мир | Здрав | +| مرحبا بالعالم | مرحبا | +| こんにちは世界 | こんにちは | ++----------------+------------------------------------------+ + +-- Test UPPER/LOWER with Unicode +SELECT s, UPPER(s), LOWER(s) FROM unicode_test WHERE s = 'Hello 世界'; + ++------------+-----------------------+-----------------------+ +| s | upper(unicode_test.s) | lower(unicode_test.s) | ++------------+-----------------------+-----------------------+ +| Hello 世界 | HELLO 世界 | hello 世界 | ++------------+-----------------------+-----------------------+ + +-- Test comparison with Unicode +SELECT COUNT(*) FROM unicode_test WHERE s LIKE '%世界%'; + ++----------+ +| count(*) | ++----------+ +| 2 | ++----------+ + +SELECT COUNT(*) FROM unicode_test WHERE s LIKE '%🚀%'; + ++----------+ +| count(*) | ++----------+ +| 1 | ++----------+ + +-- Test concatenation with Unicode +SELECT 
CONCAT(s, ' - test') FROM unicode_test ORDER BY ts; + ++----------------------------------------+ +| concat(unicode_test.s,Utf8(" - test")) | ++----------------------------------------+ +| Hello 世界 - test | +| Ññññ - test | +| 🚀🎉🌟 - test | +| Здравствуй мир - test | +| مرحبا بالعالم - test | +| こんにちは世界 - test | ++----------------------------------------+ + +DROP TABLE unicode_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/string/unicode_extended.sql b/tests/cases/standalone/common/types/string/unicode_extended.sql new file mode 100644 index 0000000000..3e6f47f3c8 --- /dev/null +++ b/tests/cases/standalone/common/types/string/unicode_extended.sql @@ -0,0 +1,35 @@ +-- Migrated from DuckDB test: test/sql/types/string/test_unicode.test +-- Test Unicode string handling + +-- Test basic Unicode strings +CREATE TABLE unicode_test(s VARCHAR, ts TIMESTAMP TIME INDEX); + +INSERT INTO unicode_test VALUES + ('Hello 世界', 1000), + ('Ññññ', 2000), + ('🚀🎉🌟', 3000), + ('Здравствуй мир', 4000), + ('مرحبا بالعالم', 5000), + ('こんにちは世界', 6000); + +-- Test basic selection +SELECT s FROM unicode_test ORDER BY ts; + +-- Test length function with Unicode +SELECT s, LENGTH(s) AS a, CHAR_LENGTH(s) AS b FROM unicode_test ORDER BY ts; + +-- Test substring with Unicode +SELECT s, SUBSTRING(s, 1, 5) FROM unicode_test ORDER BY ts; + +-- Test UPPER/LOWER with Unicode +SELECT s, UPPER(s), LOWER(s) FROM unicode_test WHERE s = 'Hello 世界'; + +-- Test comparison with Unicode +SELECT COUNT(*) FROM unicode_test WHERE s LIKE '%世界%'; + +SELECT COUNT(*) FROM unicode_test WHERE s LIKE '%🚀%'; + +-- Test concatenation with Unicode +SELECT CONCAT(s, ' - test') FROM unicode_test ORDER BY ts; + +DROP TABLE unicode_test; From e386a366d027dd2a8a69827f5ee6038662eefe03 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Sun, 26 Oct 2025 18:41:19 -0700 Subject: [PATCH 009/149] feat: add HTTP endpoint to control prof.gdump feature (#6999) * feat/gdump: ### Add Support for Jemalloc Gdump Flag - **`jemalloc.rs`**: Introduced `PROF_GDUMP` constant and added functions `set_gdump_active` and `is_gdump_active` to manage the gdump flag. - **`error.rs`**: Added error handling for reading and updating the jemalloc gdump flag with `ReadGdump` and `UpdateGdump` errors. - **`lib.rs`**: Exposed `is_gdump_active` and `set_gdump_active` functions for non-Windows platforms. - **`http.rs`**: Added HTTP routes for checking and toggling the jemalloc gdump flag status. - **`mem_prof.rs`**: Implemented handlers `gdump_toggle_handler` and `gdump_status_handler` for managing gdump flag via HTTP requests. 
Signed-off-by: Lei, HUANG * Update docs/how-to/how-to-profile-memory.md Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com> * fix: typo in docs Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG Co-authored-by: shuiyisong <113876041+shuiyisong@users.noreply.github.com> --- docs/how-to/how-to-profile-memory.md | 9 ++++ src/common/mem-prof/src/jemalloc.rs | 14 ++++++ src/common/mem-prof/src/jemalloc/error.rs | 14 ++++++ src/common/mem-prof/src/lib.rs | 12 ++++- src/servers/src/http.rs | 5 ++ src/servers/src/http/mem_prof.rs | 56 +++++++++++++++++++++++ 6 files changed, 109 insertions(+), 1 deletion(-) diff --git a/docs/how-to/how-to-profile-memory.md b/docs/how-to/how-to-profile-memory.md index a860c95246..b4bc00093a 100644 --- a/docs/how-to/how-to-profile-memory.md +++ b/docs/how-to/how-to-profile-memory.md @@ -71,6 +71,15 @@ curl -X POST localhost:4000/debug/prof/mem/activate # Deactivate heap profiling curl -X POST localhost:4000/debug/prof/mem/deactivate + +# Activate gdump feature that dumps memory profiling data every time virtual memory usage exceeds previous maximum value. +curl -X POST localhost:4000/debug/prof/mem/gdump -d 'activate=true' + +# Deactivate gdump. +curl -X POST localhost:4000/debug/prof/mem/gdump -d 'activate=false' + +# Retrieve current gdump status. +curl -X GET localhost:4000/debug/prof/mem/gdump ``` ### Dump memory profiling data diff --git a/src/common/mem-prof/src/jemalloc.rs b/src/common/mem-prof/src/jemalloc.rs index 05966b4754..a9359dad41 100644 --- a/src/common/mem-prof/src/jemalloc.rs +++ b/src/common/mem-prof/src/jemalloc.rs @@ -32,6 +32,7 @@ use crate::error::{FlamegraphSnafu, ParseJeHeapSnafu, Result}; const PROF_DUMP: &[u8] = b"prof.dump\0"; const OPT_PROF: &[u8] = b"opt.prof\0"; const PROF_ACTIVE: &[u8] = b"prof.active\0"; +const PROF_GDUMP: &[u8] = b"prof.gdump\0"; pub async fn dump_profile() -> Result> { ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu); @@ -119,3 +120,16 @@ fn is_prof_enabled() -> Result { // safety: OPT_PROF variable, if present, is always a boolean value. Ok(unsafe { tikv_jemalloc_ctl::raw::read::(OPT_PROF).context(ReadOptProfSnafu)? }) } + +pub fn set_gdump_active(active: bool) -> Result<()> { + ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu); + unsafe { + tikv_jemalloc_ctl::raw::update(PROF_GDUMP, active).context(error::UpdateGdumpSnafu)?; + } + Ok(()) +} + +pub fn is_gdump_active() -> Result { + // safety: PROF_GDUMP, if present, is a boolean value. + unsafe { Ok(tikv_jemalloc_ctl::raw::read::(PROF_GDUMP).context(error::ReadGdumpSnafu)?) } +} diff --git a/src/common/mem-prof/src/jemalloc/error.rs b/src/common/mem-prof/src/jemalloc/error.rs index 1787e97a7d..79e4b8f9a6 100644 --- a/src/common/mem-prof/src/jemalloc/error.rs +++ b/src/common/mem-prof/src/jemalloc/error.rs @@ -71,6 +71,18 @@ pub enum Error { #[snafu(source)] error: tikv_jemalloc_ctl::Error, }, + + #[snafu(display("Failed to read jemalloc gdump flag"))] + ReadGdump { + #[snafu(source)] + error: tikv_jemalloc_ctl::Error, + }, + + #[snafu(display("Failed to update jemalloc gdump flag"))] + UpdateGdump { + #[snafu(source)] + error: tikv_jemalloc_ctl::Error, + }, } impl ErrorExt for Error { @@ -84,6 +96,8 @@ impl ErrorExt for Error { Error::ActivateProf { .. } => StatusCode::Internal, Error::DeactivateProf { .. } => StatusCode::Internal, Error::ReadProfActive { .. } => StatusCode::Internal, + Error::ReadGdump { .. } => StatusCode::Internal, + Error::UpdateGdump { .. 
} => StatusCode::Internal, } } diff --git a/src/common/mem-prof/src/lib.rs b/src/common/mem-prof/src/lib.rs index 3fa6273f6e..9ff67e7277 100644 --- a/src/common/mem-prof/src/lib.rs +++ b/src/common/mem-prof/src/lib.rs @@ -19,7 +19,7 @@ mod jemalloc; #[cfg(not(windows))] pub use jemalloc::{ activate_heap_profile, deactivate_heap_profile, dump_flamegraph, dump_pprof, dump_profile, - is_heap_profile_active, + is_gdump_active, is_heap_profile_active, set_gdump_active, }; #[cfg(windows)] @@ -51,3 +51,13 @@ pub fn deactivate_heap_profile() -> error::Result<()> { pub fn is_heap_profile_active() -> error::Result { error::ProfilingNotSupportedSnafu.fail() } + +#[cfg(windows)] +pub fn is_gdump_active() -> error::Result { + error::ProfilingNotSupportedSnafu.fail() +} + +#[cfg(windows)] +pub fn set_gdump_active(_: bool) -> error::Result<()> { + error::ProfilingNotSupportedSnafu.fail() +} diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 404b087535..8fa658b6bb 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -924,6 +924,11 @@ impl HttpServer { .route( "/mem/status", routing::get(mem_prof::heap_prof_status_handler), + ) // jemalloc gdump flag status and toggle + .route( + "/mem/gdump", + routing::get(mem_prof::gdump_status_handler) + .post(mem_prof::gdump_toggle_handler), ), ), )) diff --git a/src/servers/src/http/mem_prof.rs b/src/servers/src/http/mem_prof.rs index 92995fd2de..e6362aef3f 100644 --- a/src/servers/src/http/mem_prof.rs +++ b/src/servers/src/http/mem_prof.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#[cfg(feature = "mem-prof")] +use axum::Form; #[cfg(feature = "mem-prof")] use axum::extract::Query; use axum::http::StatusCode; @@ -127,3 +129,57 @@ pub async fn heap_prof_status_handler() -> crate::error::Result, +) -> crate::error::Result { + use snafu::ResultExt; + + use crate::error::DumpProfileDataSnafu; + + common_mem_prof::set_gdump_active(form.activate).context(DumpProfileDataSnafu)?; + + let msg = if form.activate { + "gdump activated" + } else { + "gdump deactivated" + }; + Ok((StatusCode::OK, msg)) +} + +#[cfg(not(feature = "mem-prof"))] +#[axum_macros::debug_handler] +pub async fn gdump_toggle_handler() -> crate::error::Result { + Ok(( + StatusCode::NOT_IMPLEMENTED, + "The 'mem-prof' feature is disabled", + )) +} + +#[cfg(feature = "mem-prof")] +#[axum_macros::debug_handler] +pub async fn gdump_status_handler() -> crate::error::Result { + use snafu::ResultExt; + + use crate::error::DumpProfileDataSnafu; + + let is_active = common_mem_prof::is_gdump_active().context(DumpProfileDataSnafu)?; + Ok((StatusCode::OK, format!("{{\"active\": {}}}", is_active))) +} + +#[cfg(not(feature = "mem-prof"))] +#[axum_macros::debug_handler] +pub async fn gdump_status_handler() -> crate::error::Result { + Ok(( + StatusCode::NOT_IMPLEMENTED, + "The 'mem-prof' feature is disabled", + )) +} From 68247fc9b1d9f8e5d249b6b169dbb913c4a79be2 Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Mon, 27 Oct 2025 10:14:45 +0800 Subject: [PATCH 010/149] fix: count_state use stat to eval&predicate w/out region (#7116) * fix: count_state use stat to eval Signed-off-by: discord9 * cleanup Signed-off-by: discord9 * fix: use predicate without region Signed-off-by: discord9 * test: diverge standalone/dist impl Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/common/function/src/aggrs/aggr_wrapper.rs | 47 ++ src/mito2/src/read/scan_region.rs | 5 
+- src/mito2/src/read/seq_scan.rs | 8 +- src/mito2/src/read/series_scan.rs | 8 +- src/mito2/src/read/unordered_scan.rs | 10 +- src/store-api/src/region_engine.rs | 6 +- src/table/src/table/scan.rs | 39 +- .../cases/distributed/optimizer/count.result | 465 ++++++++++++++++++ tests/cases/distributed/optimizer/count.sql | 1 + .../standalone/common/aggregate/count.result | 103 ---- .../standalone/common/aggregate/count.sql | 43 -- tests/cases/standalone/optimizer/count.result | 462 +++++++++++++++++ tests/cases/standalone/optimizer/count.sql | 201 ++++++++ 13 files changed, 1221 insertions(+), 177 deletions(-) create mode 100644 tests/cases/distributed/optimizer/count.result create mode 120000 tests/cases/distributed/optimizer/count.sql delete mode 100644 tests/cases/standalone/common/aggregate/count.result delete mode 100644 tests/cases/standalone/common/aggregate/count.sql create mode 100644 tests/cases/standalone/optimizer/count.result create mode 100644 tests/cases/standalone/optimizer/count.sql diff --git a/src/common/function/src/aggrs/aggr_wrapper.rs b/src/common/function/src/aggrs/aggr_wrapper.rs index ed691296ee..54dc1ac78e 100644 --- a/src/common/function/src/aggrs/aggr_wrapper.rs +++ b/src/common/function/src/aggrs/aggr_wrapper.rs @@ -29,6 +29,8 @@ use arrow::array::StructArray; use arrow_schema::{FieldRef, Fields}; use common_telemetry::debug; use datafusion::functions_aggregate::all_default_aggregate_functions; +use datafusion::functions_aggregate::count::Count; +use datafusion::functions_aggregate::min_max::{Max, Min}; use datafusion::optimizer::AnalyzerRule; use datafusion::optimizer::analyzer::type_coercion::TypeCoercion; use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter; @@ -413,6 +415,51 @@ impl AggregateUDFImpl for StateWrapper { fn coerce_types(&self, arg_types: &[DataType]) -> datafusion_common::Result> { self.inner.coerce_types(arg_types) } + + fn value_from_stats( + &self, + statistics_args: &datafusion_expr::StatisticsArgs, + ) -> Option { + let inner = self.inner().inner().as_any(); + // only count/min/max need special handling here, for getting result from statistics + // the result of count/min/max is also the result of count_state so can return directly + let can_use_stat = inner.is::() || inner.is::() || inner.is::(); + if !can_use_stat { + return None; + } + + // fix return type by extract the first field's data type from the struct type + let state_type = if let DataType::Struct(fields) = &statistics_args.return_type { + if fields.is_empty() { + return None; + } + fields[0].data_type().clone() + } else { + return None; + }; + + let fixed_args = datafusion_expr::StatisticsArgs { + statistics: statistics_args.statistics, + return_type: &state_type, + is_distinct: statistics_args.is_distinct, + exprs: statistics_args.exprs, + }; + + let ret = self.inner().value_from_stats(&fixed_args)?; + + // wrap the result into struct scalar value + let fields = if let DataType::Struct(fields) = &statistics_args.return_type { + fields + } else { + return None; + }; + + let array = ret.to_array().ok()?; + + let struct_array = StructArray::new(fields.clone(), vec![array], None); + let ret = ScalarValue::Struct(Arc::new(struct_array)); + Some(ret) + } } /// The wrapper's input is the same as the original aggregate function's input, diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 536c48e248..29eef2ef6d 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -1106,9 +1106,8 @@ impl 
ScanInput { rows } - /// Returns table predicate of all exprs. - pub(crate) fn predicate(&self) -> Option<&Predicate> { - self.predicate.predicate() + pub(crate) fn predicate_group(&self) -> &PredicateGroup { + &self.predicate } /// Returns number of memtables to scan. diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index 631c40b42a..c90ea89b90 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -632,8 +632,12 @@ impl RegionScanner for SeqScan { Ok(()) } - fn has_predicate(&self) -> bool { - let predicate = self.stream_ctx.input.predicate(); + fn has_predicate_without_region(&self) -> bool { + let predicate = self + .stream_ctx + .input + .predicate_group() + .predicate_without_region(); predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) } diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index 3a006dcb67..a99e3c46bb 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -314,8 +314,12 @@ impl RegionScanner for SeriesScan { Ok(()) } - fn has_predicate(&self) -> bool { - let predicate = self.stream_ctx.input.predicate(); + fn has_predicate_without_region(&self) -> bool { + let predicate = self + .stream_ctx + .input + .predicate_group() + .predicate_without_region(); predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) } diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index 4dc5d59b98..8dbfcf07ec 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -427,8 +427,14 @@ impl RegionScanner for UnorderedScan { .map_err(BoxedError::new) } - fn has_predicate(&self) -> bool { - let predicate = self.stream_ctx.input.predicate(); + /// Returns whether this scanner has a predicate other than the region partition exprs. + fn has_predicate_without_region(&self) -> bool { + let predicate = self + .stream_ctx + .input + .predicate_group() + .predicate_without_region(); + predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) } diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index 000b36cc17..fe8df673d0 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -444,8 +444,8 @@ pub trait RegionScanner: Debug + DisplayAs + Send { partition: usize, ) -> Result; - /// Check if there is any predicate that may be executed in this scanner. - fn has_predicate(&self) -> bool; + /// Check if there is any predicate, excluding region partition exprs, that may be executed in this scanner. + fn has_predicate_without_region(&self) -> bool; /// Sets whether the scanner is reading a logical region.
fn set_logical_region(&mut self, logical_region: bool); @@ -857,7 +857,7 @@ impl RegionScanner for SinglePartitionScanner { Ok(result.unwrap()) } - fn has_predicate(&self) -> bool { + fn has_predicate_without_region(&self) -> bool { false } diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs index 32940dfacc..a60215618b 100644 --- a/src/table/src/table/scan.rs +++ b/src/table/src/table/scan.rs @@ -335,25 +335,26 @@ impl ExecutionPlan for RegionScanExec { return Ok(Statistics::new_unknown(self.schema().as_ref())); } - let statistics = if self.append_mode && !self.scanner.lock().unwrap().has_predicate() { - let column_statistics = self - .arrow_schema - .fields - .iter() - .map(|_| ColumnStatistics { - distinct_count: Precision::Exact(self.total_rows), - null_count: Precision::Exact(0), // all null rows are counted for append-only table - ..Default::default() - }) - .collect(); - Statistics { - num_rows: Precision::Exact(self.total_rows), - total_byte_size: Default::default(), - column_statistics, - } - } else { - Statistics::new_unknown(&self.arrow_schema) - }; + let statistics = + if self.append_mode && !self.scanner.lock().unwrap().has_predicate_without_region() { + let column_statistics = self + .arrow_schema + .fields + .iter() + .map(|_| ColumnStatistics { + distinct_count: Precision::Exact(self.total_rows), + null_count: Precision::Exact(0), // all null rows are counted for append-only table + ..Default::default() + }) + .collect(); + Statistics { + num_rows: Precision::Exact(self.total_rows), + total_byte_size: Default::default(), + column_statistics, + } + } else { + Statistics::new_unknown(&self.arrow_schema) + }; Ok(statistics) } diff --git a/tests/cases/distributed/optimizer/count.result b/tests/cases/distributed/optimizer/count.result new file mode 100644 index 0000000000..79a6fad6f9 --- /dev/null +++ b/tests/cases/distributed/optimizer/count.result @@ -0,0 +1,465 @@ +create table "HelloWorld" (a string, b timestamp time index); + +Affected Rows: 0 + +insert into "HelloWorld" values ("a", 1) ,("b", 2); + +Affected Rows: 2 + +select count(*) from "HelloWorld"; + ++----------+ +| count(*) | ++----------+ +| 2 | ++----------+ + +create table test (a string, "BbB" timestamp time index); + +Affected Rows: 0 + +insert into test values ("c", 1) ; + +Affected Rows: 1 + +select count(*) from test; + ++----------+ +| count(*) | ++----------+ +| 1 | ++----------+ + +select count(*) from (select count(*) from test where a = 'a'); + ++----------+ +| count(*) | ++----------+ +| 1 | ++----------+ + +select count(*) from (select * from test cross join "HelloWorld"); + ++----------+ +| count(*) | ++----------+ +| 2 | ++----------+ + +drop table "HelloWorld"; + +Affected Rows: 0 + +drop table test; + +Affected Rows: 0 + +-- Append table +create table count_where_bug ( + `tag` String, + ts TimestampMillisecond time index, + num Int64, + primary key (`tag`), +) engine=mito with('append_mode'='true'); + +Affected Rows: 0 + +insert into count_where_bug (`tag`, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +Affected Rows: 5 + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE 
(metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[5 as count(Int64(1))] REDACTED +|_|_|_PlaceholderRowExec REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 5 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[count(count_where_bug.ts)@0 as count(Int64(1))] REDACTED +|_|_|_AggregateExec: mode=Final, gby=[], aggr=[count(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where `tag` = 'b'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 2 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[count(count_where_bug.ts)@0 as count(Int64(1))] REDACTED +|_|_|_AggregateExec: mode=Final, gby=[], aggr=[count(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED 
+|_|_|_UnorderedScan: region=REDACTED, "partition_count":REDACTED REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 1 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where num != 3; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[count(count_where_bug.ts)@0 as count(Int64(1))] REDACTED +|_|_|_AggregateExec: mode=Final, gby=[], aggr=[count(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(count_where_bug.ts)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: num@1 != 3, projection=[ts@0] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where num != 3; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 4 | ++-----------------+ + +drop table count_where_bug; + +Affected Rows: 0 + +-- partition-ed Append table +create table count_where_bug ( + `tag` String, + ts TimestampMillisecond time index, + num Int64, + primary key (`tag`), +) PARTITION ON COLUMNS (`tag`) ( + tag <= 'a', + tag > 'a' + ) +engine=mito with('append_mode'='true'); + +Affected Rows: 0 + +insert into count_where_bug (`tag`, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +Affected Rows: 5 + +-- This should use statistics +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[{count[count]:REDACTED} as __count_state(count_where_bug.ts)] REDACTED +|_|_|_PlaceholderRowExec REDACTED +|_|_|_| +| 1_| 1_|_ProjectionExec: expr=[{count[count]:REDACTED} as __count_state(count_where_bug.ts)] REDACTED +|_|_|_PlaceholderRowExec REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + 
+select count(1) from count_where_bug; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 5 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where `tag` = 'b'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 2 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED 
+|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":REDACTED REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":REDACTED REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 1 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where num != 3; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: num@1 != 3, projection=[ts@0] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: num@1 != 3, projection=[ts@0] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where num != 3; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 4 | ++-----------------+ + +drop table count_where_bug; + +Affected Rows: 0 + diff --git a/tests/cases/distributed/optimizer/count.sql b/tests/cases/distributed/optimizer/count.sql new file mode 120000 index 0000000000..b539a04238 --- /dev/null +++ b/tests/cases/distributed/optimizer/count.sql @@ -0,0 +1 @@ +../../standalone/optimizer/count.sql \ No newline at end of file diff --git a/tests/cases/standalone/common/aggregate/count.result b/tests/cases/standalone/common/aggregate/count.result deleted file mode 100644 index 0a8e96b357..0000000000 --- a/tests/cases/standalone/common/aggregate/count.result +++ /dev/null @@ -1,103 +0,0 @@ -create table "HelloWorld" (a string, b timestamp time index); - -Affected Rows: 0 - -insert into "HelloWorld" values ("a", 1) ,("b", 2); - -Affected Rows: 2 - -select count(*) from "HelloWorld"; - -+----------+ -| count(*) | -+----------+ -| 2 | -+----------+ - -create table test (a string, "BbB" timestamp time index); - -Affected Rows: 0 - -insert into test values ("c", 1) ; - -Affected Rows: 1 - 
-select count(*) from test; - -+----------+ -| count(*) | -+----------+ -| 1 | -+----------+ - -select count(*) from (select count(*) from test where a = 'a'); - -+----------+ -| count(*) | -+----------+ -| 1 | -+----------+ - -select count(*) from (select * from test cross join "HelloWorld"); - -+----------+ -| count(*) | -+----------+ -| 2 | -+----------+ - -drop table "HelloWorld"; - -Affected Rows: 0 - -drop table test; - -Affected Rows: 0 - --- Append table -create table count_where_bug ( - `tag` String, - ts TimestampMillisecond time index, - num Int64, - primary key (`tag`), -) engine=mito with('append_mode'='true'); - -Affected Rows: 0 - -insert into count_where_bug (`tag`, ts, num) -values ('a', '2024-09-06T06:00:01Z', 1), - ('a', '2024-09-06T06:00:02Z', 2), - ('a', '2024-09-06T06:00:03Z', 3), - ('b', '2024-09-06T06:00:04Z', 4), - ('b', '2024-09-06T06:00:05Z', 5); - -Affected Rows: 5 - -select count(1) from count_where_bug where `tag` = 'b'; - -+-----------------+ -| count(Int64(1)) | -+-----------------+ -| 2 | -+-----------------+ - -select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; - -+-----------------+ -| count(Int64(1)) | -+-----------------+ -| 1 | -+-----------------+ - -select count(1) from count_where_bug where num != 3; - -+-----------------+ -| count(Int64(1)) | -+-----------------+ -| 4 | -+-----------------+ - -drop table count_where_bug; - -Affected Rows: 0 - diff --git a/tests/cases/standalone/common/aggregate/count.sql b/tests/cases/standalone/common/aggregate/count.sql deleted file mode 100644 index dfc86f9273..0000000000 --- a/tests/cases/standalone/common/aggregate/count.sql +++ /dev/null @@ -1,43 +0,0 @@ -create table "HelloWorld" (a string, b timestamp time index); - -insert into "HelloWorld" values ("a", 1) ,("b", 2); - -select count(*) from "HelloWorld"; - -create table test (a string, "BbB" timestamp time index); - -insert into test values ("c", 1) ; - -select count(*) from test; - -select count(*) from (select count(*) from test where a = 'a'); - -select count(*) from (select * from test cross join "HelloWorld"); - -drop table "HelloWorld"; - -drop table test; - --- Append table - -create table count_where_bug ( - `tag` String, - ts TimestampMillisecond time index, - num Int64, - primary key (`tag`), -) engine=mito with('append_mode'='true'); - -insert into count_where_bug (`tag`, ts, num) -values ('a', '2024-09-06T06:00:01Z', 1), - ('a', '2024-09-06T06:00:02Z', 2), - ('a', '2024-09-06T06:00:03Z', 3), - ('b', '2024-09-06T06:00:04Z', 4), - ('b', '2024-09-06T06:00:05Z', 5); - -select count(1) from count_where_bug where `tag` = 'b'; - -select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; - -select count(1) from count_where_bug where num != 3; - -drop table count_where_bug; diff --git a/tests/cases/standalone/optimizer/count.result b/tests/cases/standalone/optimizer/count.result new file mode 100644 index 0000000000..3819243393 --- /dev/null +++ b/tests/cases/standalone/optimizer/count.result @@ -0,0 +1,462 @@ +create table "HelloWorld" (a string, b timestamp time index); + +Affected Rows: 0 + +insert into "HelloWorld" values ("a", 1) ,("b", 2); + +Affected Rows: 2 + +select count(*) from "HelloWorld"; + ++----------+ +| count(*) | ++----------+ +| 2 | ++----------+ + +create table test (a string, "BbB" timestamp time index); + +Affected Rows: 0 + +insert into test values ("c", 1) ; + +Affected Rows: 1 + +select count(*) from test; + ++----------+ +| count(*) | ++----------+ +| 1 | ++----------+ + +select count(*) from 
(select count(*) from test where a = 'a'); + ++----------+ +| count(*) | ++----------+ +| 1 | ++----------+ + +select count(*) from (select * from test cross join "HelloWorld"); + ++----------+ +| count(*) | ++----------+ +| 2 | ++----------+ + +drop table "HelloWorld"; + +Affected Rows: 0 + +drop table test; + +Affected Rows: 0 + +-- Append table +create table count_where_bug ( + `tag` String, + ts TimestampMillisecond time index, + num Int64, + primary key (`tag`), +) engine=mito with('append_mode'='true'); + +Affected Rows: 0 + +insert into count_where_bug (`tag`, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +Affected Rows: 5 + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[5 as count(Int64(1))] REDACTED +|_|_|_PlaceholderRowExec REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 5 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where `tag` = 'b'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 2 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED 
+-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":REDACTED REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 1 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where num != 3; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: num@1 != 3, projection=[ts@0] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where num != 3; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 4 | ++-----------------+ + +drop table count_where_bug; + +Affected Rows: 0 + +-- partition-ed Append table +create table count_where_bug ( + `tag` String, + ts TimestampMillisecond time index, + num Int64, + primary key (`tag`), +) PARTITION ON COLUMNS (`tag`) ( + tag <= 'a', + tag > 'a' + ) +engine=mito with('append_mode'='true'); + +Affected Rows: 0 + +insert into count_where_bug (`tag`, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +Affected Rows: 5 + +-- This should use statistics +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS 
REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[{count[count]:REDACTED} as __count_state(count_where_bug.ts)] REDACTED +|_|_|_PlaceholderRowExec REDACTED +|_|_|_| +| 1_| 1_|_ProjectionExec: expr=[{count[count]:REDACTED} as __count_state(count_where_bug.ts)] REDACTED +|_|_|_PlaceholderRowExec REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 5 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where `tag` = 'b'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 2 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE 
(\{count\[count]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":REDACTED REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":REDACTED REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 1 | ++-----------------+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where num != 3; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: num@1 != 3, projection=[ts@0] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=Final, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[], aggr=[__count_state(count_where_bug.ts)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: num@1 != 3, projection=[ts@0] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_UnorderedScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 1_| ++-+-+-+ + +select count(1) from count_where_bug where num != 3; + ++-----------------+ +| count(Int64(1)) | ++-----------------+ +| 4 | ++-----------------+ + +drop table count_where_bug; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/optimizer/count.sql b/tests/cases/standalone/optimizer/count.sql new file mode 100644 index 0000000000..caa961b1f0 --- /dev/null +++ b/tests/cases/standalone/optimizer/count.sql @@ 
-0,0 +1,201 @@ +create table "HelloWorld" (a string, b timestamp time index); + +insert into "HelloWorld" values ("a", 1) ,("b", 2); + +select count(*) from "HelloWorld"; + +create table test (a string, "BbB" timestamp time index); + +insert into test values ("c", 1) ; + +select count(*) from test; + +select count(*) from (select count(*) from test where a = 'a'); + +select count(*) from (select * from test cross join "HelloWorld"); + +drop table "HelloWorld"; + +drop table test; + +-- Append table + +create table count_where_bug ( + `tag` String, + ts TimestampMillisecond time index, + num Int64, + primary key (`tag`), +) engine=mito with('append_mode'='true'); + +insert into count_where_bug (`tag`, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug; + +select count(1) from count_where_bug; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + +select count(1) from count_where_bug where `tag` = 'b'; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE 
region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where num != 3; + +select count(1) from count_where_bug where num != 3; + +drop table count_where_bug; + +-- partition-ed Append table +create table count_where_bug ( + `tag` String, + ts TimestampMillisecond time index, + num Int64, + primary key (`tag`), +) PARTITION ON COLUMNS (`tag`) ( + tag <= 'a', + tag > 'a' + ) +engine=mito with('append_mode'='true'); + +insert into count_where_bug (`tag`, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +-- This should use statistics +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count\]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug; + +select count(1) from count_where_bug; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where `tag` = 'b'; + +select count(1) from count_where_bug where `tag` = 'b'; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (\{count\[count]:)\d+(\}) {count[count]:REDACTED} +-- SQLNESS REPLACE "partition_count":\{(.*?)\} "partition_count":REDACTED +explain analyze +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +explain analyze +select count(1) from count_where_bug where num != 3; + +select count(1) from count_where_bug where num != 3; + +drop table count_where_bug; \ No newline at end of file From d7ed6a69aba2fe8e7f0d235ff22355073f355fa2 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:30:52 
+0800 Subject: [PATCH 011/149] feat: merge json datatype (#7142) * feat: merge json datatype Signed-off-by: luofucong * resolve PR comments Signed-off-by: luofucong --------- Signed-off-by: luofucong --- src/datatypes/src/data_type.rs | 4 +- src/datatypes/src/error.rs | 10 +- src/datatypes/src/json.rs | 6 +- src/datatypes/src/types/json_type.rs | 191 ++++++++++++++++++++++++- src/datatypes/src/types/struct_type.rs | 2 +- 5 files changed, 204 insertions(+), 9 deletions(-) diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs index eb47d30305..bb84e5a30b 100644 --- a/src/datatypes/src/data_type.rs +++ b/src/datatypes/src/data_type.rs @@ -348,9 +348,9 @@ impl ConcreteDataType { } } - pub fn as_json(&self) -> Option { + pub fn as_json(&self) -> Option<&JsonType> { match self { - ConcreteDataType::Json(j) => Some(j.clone()), + ConcreteDataType::Json(j) => Some(j), _ => None, } } diff --git a/src/datatypes/src/error.rs b/src/datatypes/src/error.rs index 85e78ce1eb..064c78e89d 100644 --- a/src/datatypes/src/error.rs +++ b/src/datatypes/src/error.rs @@ -259,6 +259,13 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to merge JSON datatype: {reason}"))] + MergeJsonDatatype { + reason: String, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { @@ -281,7 +288,8 @@ impl ErrorExt for Error { | InvalidJsonb { .. } | InvalidVector { .. } | InvalidFulltextOption { .. } - | InvalidSkippingIndexOption { .. } => StatusCode::InvalidArguments, + | InvalidSkippingIndexOption { .. } + | MergeJsonDatatype { .. } => StatusCode::InvalidArguments, ValueExceedsPrecision { .. } | CastType { .. } diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs index 902b84a131..64952bb39a 100644 --- a/src/datatypes/src/json.rs +++ b/src/datatypes/src/json.rs @@ -30,7 +30,7 @@ use snafu::{ResultExt, ensure}; use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Error}; -use crate::types::{StructField, StructType}; +use crate::types::{ListType, StructField, StructType}; use crate::value::{ListValue, StructValue, Value}; /// The configuration of JSON encoding @@ -375,8 +375,8 @@ fn encode_json_value_with_context<'a>( } Json::Array(arr) => { let list_value = encode_json_array_with_context(arr, expected_type, context)?; - let data_type = list_value.datatype().clone(); - Ok((Value::List(list_value), (*data_type).clone())) + let datatype = ConcreteDataType::List(ListType::new(list_value.datatype())); + Ok((Value::List(list_value), datatype)) } Json::Object(obj) => { let struct_value = encode_json_object_with_context(obj, None, context)?; diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 99dcf9c571..141db03728 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::BTreeMap; use std::str::FromStr; +use std::sync::Arc; use arrow::datatypes::DataType as ArrowDataType; use arrow_schema::Fields; @@ -21,10 +23,13 @@ use serde::{Deserialize, Serialize}; use snafu::ResultExt; use crate::data_type::DataType; -use crate::error::{DeserializeSnafu, InvalidJsonSnafu, InvalidJsonbSnafu, Result}; +use crate::error::{ + DeserializeSnafu, InvalidJsonSnafu, InvalidJsonbSnafu, MergeJsonDatatypeSnafu, Result, +}; use crate::prelude::ConcreteDataType; use crate::scalars::ScalarVectorBuilder; use crate::type_id::LogicalTypeId; +use crate::types::{ListType, StructField, StructType}; use crate::value::Value; use crate::vectors::{BinaryVectorBuilder, MutableVector}; @@ -48,11 +53,101 @@ impl JsonType { pub fn new(format: JsonFormat) -> Self { Self { format } } + + // TODO(LFC): remove "allow unused" + #[allow(unused)] + /// Make json type a struct type, by: + /// - if the json is an object, its entries are mapped to struct fields, obviously; + /// - if not, the json is one of bool, number, string or array, make it a special field called + /// "__plain" in a struct with only that field. + pub(crate) fn as_struct_type(&self) -> StructType { + match &self.format { + JsonFormat::Jsonb => StructType::default(), + JsonFormat::Native(inner) => match inner.as_ref() { + ConcreteDataType::Struct(t) => t.clone(), + x => StructType::new(Arc::new(vec![StructField::new( + "__plain".to_string(), + x.clone(), + true, + )])), + }, + } + } + + // TODO(LFC): remove "allow unused" + #[allow(unused)] + /// Try to merge this json type with others, error on datatype conflict. + pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> { + match (&self.format, &other.format) { + (JsonFormat::Jsonb, JsonFormat::Jsonb) => Ok(()), + (JsonFormat::Native(this), JsonFormat::Native(that)) => { + let merged = merge(this.as_ref(), that.as_ref())?; + self.format = JsonFormat::Native(Box::new(merged)); + Ok(()) + } + _ => MergeJsonDatatypeSnafu { + reason: "json format not match", + } + .fail(), + } + } +} + +fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result { + match (this, that) { + (this, that) if this == that => Ok(this.clone()), + (ConcreteDataType::List(this), ConcreteDataType::List(that)) => { + merge_list(this, that).map(ConcreteDataType::List) + } + (ConcreteDataType::Struct(this), ConcreteDataType::Struct(that)) => { + merge_struct(this, that).map(ConcreteDataType::Struct) + } + (ConcreteDataType::Null(_), x) | (x, ConcreteDataType::Null(_)) => Ok(x.clone()), + _ => MergeJsonDatatypeSnafu { + reason: format!("datatypes have conflict, this: {this}, that: {that}"), + } + .fail(), + } +} + +fn merge_list(this: &ListType, that: &ListType) -> Result { + let merged = merge(this.item_type(), that.item_type())?; + Ok(ListType::new(Arc::new(merged))) +} + +fn merge_struct(this: &StructType, that: &StructType) -> Result { + let this = Arc::unwrap_or_clone(this.fields()); + let that = Arc::unwrap_or_clone(that.fields()); + + let mut this: BTreeMap = this + .into_iter() + .map(|x| (x.name().to_string(), x)) + .collect(); + // merge "that" into "this" directly: + for that_field in that { + let field_name = that_field.name().to_string(); + if let Some(this_field) = this.get(&field_name) { + let merged_field = StructField::new( + field_name.clone(), + merge(this_field.data_type(), that_field.data_type())?, + true, // the value in json object must be always nullable + ); + this.insert(field_name, merged_field); + } else { + this.insert(field_name, that_field); + } + } 
+ + let fields = this.into_values().collect::>(); + Ok(StructType::new(Arc::new(fields))) } impl DataType for JsonType { fn name(&self) -> String { - JSON_TYPE_NAME.to_string() + match &self.format { + JsonFormat::Jsonb => JSON_TYPE_NAME.to_string(), + JsonFormat::Native(x) => format!("Json<{x}>"), + } } fn logical_type_id(&self) -> LogicalTypeId { @@ -106,3 +201,95 @@ pub fn parse_string_to_jsonb(s: &str) -> Result> { .map_err(|_| InvalidJsonSnafu { value: s }.build()) .map(|json| json.to_vec()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::json::JsonStructureSettings; + + #[test] + fn test_merge_json_type() -> Result<()> { + fn test( + json: &str, + json_type: &mut JsonType, + expected: std::result::Result<&str, &str>, + ) -> Result<()> { + let json: serde_json::Value = serde_json::from_str(json).unwrap(); + + let settings = JsonStructureSettings::Structured(None); + let value = settings.encode(json)?; + let value_type = value.data_type(); + let Some(other) = value_type.as_json() else { + unreachable!() + }; + + let result = json_type.merge(other); + match (result, expected) { + (Ok(()), Ok(expected)) => { + assert_eq!(json_type.name(), expected) + } + (Err(err), Err(expected)) => { + assert_eq!(err.to_string(), expected) + } + _ => unreachable!(), + } + Ok(()) + } + + let json_type = &mut JsonType::new(JsonFormat::Native(Box::new( + ConcreteDataType::null_datatype(), + ))); + + // can merge with json object: + let json = r#"{ + "hello": "world", + "list": [1, 2, 3], + "object": {"a": 1} + }"#; + let expected = + r#"Json, "object": Struct<"a": Int64>>>"#; + test(json, json_type, Ok(expected))?; + + // cannot merge with other non-object json values: + let jsons = [r#""s""#, "1", "[1]"]; + let expects = [ + r#"Failed to merge JSON datatype: datatypes have conflict, this: Struct<"hello": String, "list": List, "object": Struct<"a": Int64>>, that: String"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Struct<"hello": String, "list": List, "object": Struct<"a": Int64>>, that: Int64"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Struct<"hello": String, "list": List, "object": Struct<"a": Int64>>, that: List"#, + ]; + for (json, expect) in jsons.into_iter().zip(expects.into_iter()) { + test(json, json_type, Err(expect))?; + } + + // cannot merge with other json object with conflict field datatype: + let json = r#"{ + "hello": 1, + "float": 0.123, + "no": 42 + }"#; + let expected = + r#"Failed to merge JSON datatype: datatypes have conflict, this: String, that: Int64"#; + test(json, json_type, Err(expected))?; + + // can merge with another json object: + let json = r#"{ + "hello": "greptime", + "float": 0.123, + "int": 42 + }"#; + let expected = r#"Json, "object": Struct<"a": Int64>>>"#; + test(json, json_type, Ok(expected))?; + + // can merge with some complex nested json object: + let json = r#"{ + "list": [4], + "object": {"foo": "bar", "l": ["x"], "o": {"key": "value"}}, + "float": 0.456, + "int": 0 + }"#; + let expected = r#"Json, "object": Struct<"a": Int64, "foo": String, "l": List, "o": Struct<"key": String>>>>"#; + test(json, json_type, Ok(expected))?; + + Ok(()) + } +} diff --git a/src/datatypes/src/types/struct_type.rs b/src/datatypes/src/types/struct_type.rs index 5e3156498f..c082aeb9e6 100644 --- a/src/datatypes/src/types/struct_type.rs +++ b/src/datatypes/src/types/struct_type.rs @@ -52,7 +52,7 @@ impl DataType for StructType { "Struct<{}>", self.fields .iter() - .map(|f| f.name()) + .map(|f| format!(r#""{}": {}"#, f.name(), 
f.data_type())) .collect::>() .join(", ") ) From 0a3961927d5a70992e32727f41801a5469e7f725 Mon Sep 17 00:00:00 2001 From: zyy17 Date: Mon, 27 Oct 2025 11:36:22 +0800 Subject: [PATCH 012/149] refactor!: add a `opentelemetry_traces_operations` table to aggregate `(service_name, span_name, span_kind)` to improve query performance (#7144) refactor: add a `*_operations` table to aggregate `(service_name, span_name, span_kind)` to improve query performance Signed-off-by: zyy17 --- src/common/catalog/src/consts.rs | 5 +++ src/frontend/src/instance/jaeger.rs | 24 +++++------ src/operator/src/insert.rs | 9 +++-- src/servers/src/http/jaeger.rs | 63 +---------------------------- src/servers/src/otlp/trace/v0.rs | 56 ++++++++++++++++++++++++- src/servers/src/otlp/trace/v1.rs | 56 ++++++++++++++++++++++++- src/servers/src/query_handler.rs | 2 - tests-integration/tests/http.rs | 44 +++++++++++++++++--- 8 files changed, 169 insertions(+), 90 deletions(-) diff --git a/src/common/catalog/src/consts.rs b/src/common/catalog/src/consts.rs index 2bc5db9824..7dd6da9b4f 100644 --- a/src/common/catalog/src/consts.rs +++ b/src/common/catalog/src/consts.rs @@ -150,4 +150,9 @@ pub const TRACE_TABLE_NAME_SESSION_KEY: &str = "trace_table_name"; pub fn trace_services_table_name(trace_table_name: &str) -> String { format!("{}_services", trace_table_name) } + +/// Generate the trace operations table name from the trace table name by adding `_operations` suffix. +pub fn trace_operations_table_name(trace_table_name: &str) -> String { + format!("{}_operations", trace_table_name) +} // ---- End of special table and fields ---- diff --git a/src/frontend/src/instance/jaeger.rs b/src/frontend/src/instance/jaeger.rs index 6208866db2..7654f7bb41 100644 --- a/src/frontend/src/instance/jaeger.rs +++ b/src/frontend/src/instance/jaeger.rs @@ -17,7 +17,9 @@ use std::sync::Arc; use async_trait::async_trait; use catalog::CatalogManagerRef; -use common_catalog::consts::{TRACE_TABLE_NAME, trace_services_table_name}; +use common_catalog::consts::{ + TRACE_TABLE_NAME, trace_operations_table_name, trace_services_table_name, +}; use common_function::function::FunctionRef; use common_function::scalars::json::json_get::{ JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString, @@ -76,8 +78,6 @@ impl JaegerQueryHandler for Instance { ctx: QueryContextRef, service_name: &str, span_kind: Option<&str>, - start_time: Option, - end_time: Option, ) -> ServerResult { let mut filters = vec![col(SERVICE_NAME_COLUMN).eq(lit(service_name))]; @@ -89,16 +89,6 @@ impl JaegerQueryHandler for Instance { )))); } - if let Some(start_time) = start_time { - // Microseconds to nanoseconds. - filters.push(col(TIMESTAMP_COLUMN).gt_eq(lit_timestamp_nano(start_time * 1_000))); - } - - if let Some(end_time) = end_time { - // Microseconds to nanoseconds. - filters.push(col(TIMESTAMP_COLUMN).lt_eq(lit_timestamp_nano(end_time * 1_000))); - } - // It's equivalent to the following SQL query: // // ``` @@ -107,8 +97,6 @@ impl JaegerQueryHandler for Instance { // {db}.{trace_table} // WHERE // service_name = '{service_name}' AND - // timestamp >= {start_time} AND - // timestamp <= {end_time} AND // span_kind = '{span_kind}' // ORDER BY // span_name ASC @@ -301,12 +289,18 @@ async fn query_trace_table( .unwrap_or(TRACE_TABLE_NAME); // If only select services, use the trace services table. + // If querying operations (distinct by span_name and span_kind), use the trace operations table. 
let table_name = { if match selects.as_slice() { [SelectExpr::Expression(x)] => x == &col(SERVICE_NAME_COLUMN), _ => false, } { &trace_services_table_name(trace_table_name) + } else if !distincts.is_empty() + && distincts.contains(&col(SPAN_NAME_COLUMN)) + && distincts.contains(&col(SPAN_KIND_COLUMN)) + { + &trace_operations_table_name(trace_table_name) } else { trace_table_name } diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index cb63b07772..59ab06c95e 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -29,7 +29,8 @@ use catalog::CatalogManagerRef; use client::{OutputData, OutputMeta}; use common_catalog::consts::{ PARENT_SPAN_ID_COLUMN, SERVICE_NAME_COLUMN, TRACE_ID_COLUMN, TRACE_TABLE_NAME, - TRACE_TABLE_NAME_SESSION_KEY, default_engine, trace_services_table_name, + TRACE_TABLE_NAME_SESSION_KEY, default_engine, trace_operations_table_name, + trace_services_table_name, }; use common_grpc_expr::util::ColumnExpr; use common_meta::cache::TableFlownodeSetCacheRef; @@ -618,8 +619,10 @@ impl Inserter { // note that auto create table shouldn't be ttl instant table // for it's a very unexpected behavior and should be set by user explicitly for mut create_table in create_tables { - if create_table.table_name == trace_services_table_name(trace_table_name) { - // Disable append mode for trace services table since it requires upsert behavior. + if create_table.table_name == trace_services_table_name(trace_table_name) + || create_table.table_name == trace_operations_table_name(trace_table_name) + { + // Disable append mode for auxiliary tables (services/operations) since they require upsert behavior. create_table .table_options .insert(APPEND_MODE_KEY.to_string(), "false".to_string()); diff --git a/src/servers/src/http/jaeger.rs b/src/servers/src/http/jaeger.rs index 77b598ad1a..9420c5ca2f 100644 --- a/src/servers/src/http/jaeger.rs +++ b/src/servers/src/http/jaeger.rs @@ -21,7 +21,6 @@ use axum::Extension; use axum::extract::{Path, Query, State}; use axum::http::{HeaderMap, StatusCode as HttpStatusCode}; use axum::response::IntoResponse; -use chrono::Utc; use common_catalog::consts::{PARENT_SPAN_ID_COLUMN, TRACE_TABLE_NAME}; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; @@ -52,7 +51,6 @@ pub const JAEGER_QUERY_TABLE_NAME_KEY: &str = "jaeger_query_table_name"; const REF_TYPE_CHILD_OF: &str = "CHILD_OF"; const SPAN_KIND_TIME_FMTS: [&str; 2] = ["%Y-%m-%d %H:%M:%S%.6f%z", "%Y-%m-%d %H:%M:%S%.9f%z"]; -pub const JAEGER_TIME_RANGE_FOR_OPERATIONS_HEADER: &str = "x-greptime-jaeger-query-time-range"; /// JaegerAPIResponse is the response of Jaeger HTTP API. /// The original version is `structuredResponse` which is defined in https://github.com/jaegertracing/jaeger/blob/main/cmd/query/app/http_handler.go. 
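Since operations are now served from the pre-aggregated `{trace_table}_operations` table, the handlers below no longer parse a time range and the `x-greptime-jaeger-query-time-range` header is removed. A minimal client-side sketch, assuming `reqwest`; the base URL and `mytable` are placeholders, while the endpoint path and the `x-greptime-trace-table-name` header follow the integration tests in this patch:

```rust
// Sketch: fetch Jaeger operations for a service; no time-range header is needed anymore.
async fn fetch_operations(base: &str, service: &str) -> Result<String, reqwest::Error> {
    let url = format!("{base}/v1/jaeger/api/operations?service={service}");
    reqwest::Client::new()
        .get(url)
        // Optional: query a non-default trace table.
        .header("x-greptime-trace-table-name", "mytable")
        .send()
        .await?
        .text()
        .await
}
```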
@@ -528,13 +526,6 @@ pub async fn handle_get_operations( query_params, query_ctx, headers ); - let (start, end) = match parse_jaeger_time_range_for_operations(&headers, &query_params) { - Ok((start, end)) => (start, end), - Err(e) => return error_response(e), - }; - - debug!("Get operations with start: {:?}, end: {:?}", start, end); - if let Some(service_name) = &query_params.service_name { update_query_context(&mut query_ctx, table_name); let query_ctx = Arc::new(query_ctx); @@ -546,13 +537,7 @@ pub async fn handle_get_operations( .start_timer(); match handler - .get_operations( - query_ctx, - service_name, - query_params.span_kind.as_deref(), - start, - end, - ) + .get_operations(query_ctx, service_name, query_params.span_kind.as_deref()) .await { Ok(output) => match covert_to_records(output).await { @@ -625,15 +610,7 @@ pub async fn handle_get_operations_by_service( .with_label_values(&[&db, "/api/services"]) .start_timer(); - let (start, end) = match parse_jaeger_time_range_for_operations(&headers, &query_params) { - Ok((start, end)) => (start, end), - Err(e) => return error_response(e), - }; - - match handler - .get_operations(query_ctx, &service_name, None, start, end) - .await - { + match handler.get_operations(query_ctx, &service_name, None).await { Ok(output) => match covert_to_records(output).await { Ok(Some(records)) => match operations_from_records(records, false) { Ok(operations) => { @@ -1117,42 +1094,6 @@ fn convert_string_to_boolean(input: &serde_json::Value) -> Option Result<(Option, Option)> { - if let Some(time_range) = headers.get(JAEGER_TIME_RANGE_FOR_OPERATIONS_HEADER) { - match time_range.to_str() { - Ok(time_range) => match humantime::parse_duration(time_range) { - Ok(duration) => { - debug!( - "Get operations with time range: {:?}, duration: {:?}", - time_range, duration - ); - let now = Utc::now().timestamp_micros(); - Ok((Some(now - duration.as_micros() as i64), Some(now))) - } - Err(e) => { - error!("Failed to parse time range header: {:?}", e); - Err(InvalidJaegerQuerySnafu { - reason: format!("invalid time range header: {:?}", time_range), - } - .build()) - } - }, - Err(e) => { - error!("Failed to convert time range header to string: {:?}", e); - Err(InvalidJaegerQuerySnafu { - reason: format!("invalid time range header: {:?}", time_range), - } - .build()) - } - } - } else { - Ok((query_params.start, query_params.end)) - } -} - #[cfg(test)] mod tests { use serde_json::{Number, Value as JsonValue, json}; diff --git a/src/servers/src/otlp/trace/v0.rs b/src/servers/src/otlp/trace/v0.rs index 03f279fccb..d45b0e6802 100644 --- a/src/servers/src/otlp/trace/v0.rs +++ b/src/servers/src/otlp/trace/v0.rs @@ -16,7 +16,7 @@ use std::collections::HashSet; use api::v1::value::ValueData; use api::v1::{ColumnDataType, RowInsertRequests}; -use common_catalog::consts::trace_services_table_name; +use common_catalog::consts::{trace_operations_table_name, trace_services_table_name}; use common_grpc::precision::Precision; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use pipeline::{GreptimePipelineParams, PipelineWay}; @@ -35,6 +35,9 @@ use crate::row_writer::{self, MultiTableData, TableData}; const APPROXIMATE_COLUMN_COUNT: usize = 24; +// Use a timestamp(2100-01-01 00:00:00) as large as possible. +const MAX_TIMESTAMP: i64 = 4102444800000000000; + /// Convert SpanTraces to GreptimeDB row insert requests. 
/// Returns `InsertRequests` and total number of rows to ingest pub fn v0_to_grpc_insert_requests( @@ -49,23 +52,40 @@ pub fn v0_to_grpc_insert_requests( let mut multi_table_writer = MultiTableData::default(); let mut trace_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, spans.len()); let mut trace_services_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); + let mut trace_operations_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); let mut services = HashSet::new(); + let mut operations = HashSet::new(); for span in spans { if let Some(service_name) = &span.service_name { // Only insert the service name if it's not already in the set. if !services.contains(service_name) { services.insert(service_name.clone()); } + + // Collect operations (service_name + span_name + span_kind). + let operation = ( + service_name.clone(), + span.span_name.clone(), + span.span_kind.clone(), + ); + if !operations.contains(&operation) { + operations.insert(operation); + } } write_span_to_row(&mut trace_writer, span)?; } write_trace_services_to_row(&mut trace_services_writer, services)?; + write_trace_operations_to_row(&mut trace_operations_writer, operations)?; multi_table_writer.add_table_data( trace_services_table_name(&table_name), trace_services_writer, ); + multi_table_writer.add_table_data( + trace_operations_table_name(&table_name), + trace_operations_writer, + ); multi_table_writer.add_table_data(table_name, trace_writer); Ok(multi_table_writer.into_row_insert_requests()) @@ -161,7 +181,7 @@ fn write_trace_services_to_row(writer: &mut TableData, services: HashSet row_writer::write_ts_to_nanos( writer, TIMESTAMP_COLUMN, - Some(4102444800000000000), // Use a timestamp(2100-01-01 00:00:00) as large as possible. + Some(MAX_TIMESTAMP), Precision::Nanosecond, &mut row, )?; @@ -180,3 +200,35 @@ fn write_trace_services_to_row(writer: &mut TableData, services: HashSet Ok(()) } + +fn write_trace_operations_to_row( + writer: &mut TableData, + operations: HashSet<(String, String, String)>, +) -> Result<()> { + for (service_name, span_name, span_kind) in operations { + let mut row = writer.alloc_one_row(); + // Write the timestamp as 0. + row_writer::write_ts_to_nanos( + writer, + TIMESTAMP_COLUMN, + Some(MAX_TIMESTAMP), + Precision::Nanosecond, + &mut row, + )?; + + // Write the `service_name`, `span_name`, and `span_kind` columns. + row_writer::write_fields( + writer, + vec![ + make_string_column_data(SERVICE_NAME_COLUMN, Some(service_name)), + make_string_column_data(SPAN_NAME_COLUMN, Some(span_name)), + make_string_column_data(SPAN_KIND_COLUMN, Some(span_kind)), + ] + .into_iter(), + &mut row, + )?; + writer.add_row(row); + } + + Ok(()) +} diff --git a/src/servers/src/otlp/trace/v1.rs b/src/servers/src/otlp/trace/v1.rs index 306444bc18..b7dcdbce7c 100644 --- a/src/servers/src/otlp/trace/v1.rs +++ b/src/servers/src/otlp/trace/v1.rs @@ -16,7 +16,7 @@ use std::collections::HashSet; use api::v1::value::ValueData; use api::v1::{ColumnDataType, RowInsertRequests, Value}; -use common_catalog::consts::trace_services_table_name; +use common_catalog::consts::{trace_operations_table_name, trace_services_table_name}; use common_grpc::precision::Precision; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use opentelemetry_proto::tonic::common::v1::any_value::Value as OtlpValue; @@ -37,6 +37,9 @@ use crate::row_writer::{self, MultiTableData, TableData}; const APPROXIMATE_COLUMN_COUNT: usize = 30; +// Use a timestamp(2100-01-01 00:00:00) as large as possible. 
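// Together with `append_mode = "false"` on the auxiliary tables (see the change in
// `src/operator/src/insert.rs`), writing every row at this single fixed timestamp makes
// the `_services`/`_operations` tables behave as upserts: re-ingesting the same
// (service_name, span_name, span_kind) key overwrites the existing row rather than
// accumulating duplicates.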
+const MAX_TIMESTAMP: i64 = 4102444800000000000; + /// Convert SpanTraces to GreptimeDB row insert requests. /// Returns `InsertRequests` and total number of rows to ingest /// @@ -60,23 +63,40 @@ pub fn v1_to_grpc_insert_requests( let mut multi_table_writer = MultiTableData::default(); let mut trace_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, spans.len()); let mut trace_services_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); + let mut trace_operations_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); let mut services = HashSet::new(); + let mut operations = HashSet::new(); for span in spans { if let Some(service_name) = &span.service_name { // Only insert the service name if it's not already in the set. if !services.contains(service_name) { services.insert(service_name.clone()); } + + // Only insert the operation if it's not already in the set. + let operation = ( + service_name.clone(), + span.span_name.clone(), + span.span_kind.clone(), + ); + if !operations.contains(&operation) { + operations.insert(operation); + } } write_span_to_row(&mut trace_writer, span)?; } write_trace_services_to_row(&mut trace_services_writer, services)?; + write_trace_operations_to_row(&mut trace_operations_writer, operations)?; multi_table_writer.add_table_data( trace_services_table_name(&table_name), trace_services_writer, ); + multi_table_writer.add_table_data( + trace_operations_table_name(&table_name), + trace_operations_writer, + ); multi_table_writer.add_table_data(table_name, trace_writer); Ok(multi_table_writer.into_row_insert_requests()) @@ -160,7 +180,7 @@ fn write_trace_services_to_row(writer: &mut TableData, services: HashSet row_writer::write_ts_to_nanos( writer, TIMESTAMP_COLUMN, - Some(4102444800000000000), // Use a timestamp(2100-01-01 00:00:00) as large as possible. + Some(MAX_TIMESTAMP), Precision::Nanosecond, &mut row, )?; @@ -177,6 +197,38 @@ fn write_trace_services_to_row(writer: &mut TableData, services: HashSet Ok(()) } +fn write_trace_operations_to_row( + writer: &mut TableData, + operations: HashSet<(String, String, String)>, +) -> Result<()> { + for (service_name, span_name, span_kind) in operations { + let mut row = writer.alloc_one_row(); + // Write the timestamp as 0. + row_writer::write_ts_to_nanos( + writer, + TIMESTAMP_COLUMN, + Some(MAX_TIMESTAMP), + Precision::Nanosecond, + &mut row, + )?; + + // Write the `service_name`, `span_name`, and `span_kind` columns as tags. + row_writer::write_tags( + writer, + vec![ + (SERVICE_NAME_COLUMN.to_string(), service_name), + (SPAN_NAME_COLUMN.to_string(), span_name), + (SPAN_KIND_COLUMN.to_string(), span_kind), + ] + .into_iter(), + &mut row, + )?; + writer.add_row(row); + } + + Ok(()) +} + fn write_attributes( writer: &mut TableData, prefix: &str, diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 74204be3e2..d41f68555b 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -198,8 +198,6 @@ pub trait JaegerQueryHandler { ctx: QueryContextRef, service_name: &str, span_kind: Option<&str>, - start_time: Option, - end_time: Option, ) -> Result; /// Retrieves a trace by its unique identifier. 
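The `MAX_TIMESTAMP` constant is duplicated with the same value in the v0 and v1 converters; hoisting it into a shared module would keep the two copies from drifting apart. The value itself corresponds to 2100-01-01T00:00:00Z. A quick verification sketch, assuming a recent `chrono` (already used by the integration tests):

```rust
use chrono::{TimeZone, Utc};

fn main() {
    // 2100-01-01T00:00:00Z expressed as nanoseconds since the Unix epoch.
    let ns = Utc
        .with_ymd_and_hms(2100, 1, 1, 0, 0, 0)
        .unwrap()
        .timestamp_nanos_opt()
        .unwrap();
    assert_eq!(ns, 4_102_444_800_000_000_000_i64);
}
```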
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index f6c28c6602..9113b356ae 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -25,7 +25,8 @@ use auth::user_provider_from_option; use axum::http::{HeaderName, HeaderValue, StatusCode}; use chrono::Utc; use common_catalog::consts::{ - DEFAULT_PRIVATE_SCHEMA_NAME, TRACE_TABLE_NAME, trace_services_table_name, + DEFAULT_PRIVATE_SCHEMA_NAME, TRACE_TABLE_NAME, trace_operations_table_name, + trace_services_table_name, }; use common_error::status_code::StatusCode as ErrorCode; use common_frontend::slow_query_event::{ @@ -48,7 +49,6 @@ use servers::http::header::constants::{ GREPTIME_LOG_TABLE_NAME_HEADER_NAME, GREPTIME_PIPELINE_NAME_HEADER_NAME, }; use servers::http::header::{GREPTIME_DB_HEADER_NAME, GREPTIME_TIMEZONE_HEADER_NAME}; -use servers::http::jaeger::JAEGER_TIME_RANGE_FOR_OPERATIONS_HEADER; use servers::http::prometheus::{PrometheusJsonResponse, PrometheusResponse}; use servers::http::result::error_result::ErrorResponse; use servers::http::result::greptime_result_v1::GreptimedbV1Response; @@ -4496,6 +4496,19 @@ pub async fn test_otlp_traces_v0(store_type: StorageType) { ) .await; + // Validate operations table + let expected = r#"[["telemetrygen","SPAN_KIND_CLIENT","lets-go"],["telemetrygen","SPAN_KIND_SERVER","okey-dokey-0"]]"#; + validate_data( + "otlp_traces_operations", + &client, + &format!( + "select service_name, span_kind, span_name from {} order by span_kind, span_name;", + trace_operations_table_name(TRACE_TABLE_NAME) + ), + expected, + ) + .await; + // select traces data let expected = r#"[[1736480942444376000,1736480942444499000,123000,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23",null,"SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen",{"net.peer.ip":"1.2.3.4","peer.service":"telemetrygen-server"},[],[],"telemetrygen","",{},{"service.name":"telemetrygen"}],[1736480942444376000,1736480942444499000,123000,"c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","d24f921c75f68e23","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen",{"net.peer.ip":"1.2.3.4","peer.service":"telemetrygen-client"},[],[],"telemetrygen","",{},{"service.name":"telemetrygen"}],[1736480942444589000,1736480942444712000,123000,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179",null,"SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen",{"net.peer.ip":"1.2.3.4","peer.service":"telemetrygen-server"},[],[],"telemetrygen","",{},{"service.name":"telemetrygen"}],[1736480942444589000,1736480942444712000,123000,"cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","eba7be77e3558179","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen",{"net.peer.ip":"1.2.3.4","peer.service":"telemetrygen-client"},[],[],"telemetrygen","",{},{"service.name":"telemetrygen"}]]"#; validate_data( @@ -4611,6 +4624,19 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; + // Validate operations table + let expected = r#"[["telemetrygen","SPAN_KIND_CLIENT","lets-go"],["telemetrygen","SPAN_KIND_SERVER","okey-dokey-0"]]"#; + validate_data( + "otlp_traces_operations_v1", + &client, + &format!( + "select service_name, span_kind, span_name from {} order by span_kind, span_name;", + trace_operations_table_name(trace_table_name) + ), + expected, + ) + .await; + // select traces data let expected = 
r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#; validate_data("otlp_traces", &client, "select * from mytable;", expected).await; @@ -6017,19 +6043,26 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { let res = client .get("/v1/jaeger/api/operations?service=test-jaeger-query-api") .header("x-greptime-trace-table-name", trace_table_name) - .header(JAEGER_TIME_RANGE_FOR_OPERATIONS_HEADER, "3 days") .send() .await; assert_eq!(StatusCode::OK, res.status()); let expected = r#" { "data": [ + { + "name": "access-mysql", + "spanKind": "server" + }, { "name": "access-pg", "spanKind": "server" + }, + { + "name": "access-redis", + "spanKind": "server" } ], - "total": 1, + "total": 3, "limit": 0, "offset": 0, "errors": [] @@ -6050,9 +6083,10 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { { "data": [ "access-mysql", + "access-pg", "access-redis" ], - "total": 2, + "total": 3, "limit": 0, "offset": 0, "errors": [] From a20ac4f9e53594f30b9bee17c6060dd7aaa5e2bc Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Mon, 27 Oct 2025 16:00:03 +0800 Subject: [PATCH 013/149] feat: prefix option for timestamp index and value column (#7125) * refactor: use GREPTIME_TIMESTAMP const Signed-off-by: shuiyisong * feat: add config for default ts col name Signed-off-by: shuiyisong * refactor: replace GREPTIME_TIMESTAMP with function get Signed-off-by: shuiyisong * chore: update config doc * fix: test Signed-off-by: shuiyisong * chore: remove opts on flownode and metasrv Signed-off-by: shuiyisong * chore: add validation for ts column name Signed-off-by: shuiyisong * chore: use get_or_init to avoid test error Signed-off-by: shuiyisong * chore: fmt Signed-off-by: shuiyisong * chore: update docs Signed-off-by: shuiyisong * chore: using empty string to disable prefix Signed-off-by: shuiyisong * chore: update comment Signed-off-by: shuiyisong * chore: address CR issues Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- Cargo.lock | 4 ++ config/config.md | 3 + config/datanode.example.toml | 4 ++ config/frontend.example.toml | 4 ++ config/standalone.example.toml | 4 ++ src/cmd/src/frontend.rs | 5 ++ src/cmd/src/standalone.rs | 5 ++ src/cmd/tests/load_config_test.rs | 3 + src/common/base/Cargo.toml | 2 + src/common/base/src/lib.rs | 1 + src/common/base/src/regex_pattern.rs | 22 ++++++++ src/common/meta/src/key.rs | 6 +- src/common/query/Cargo.toml | 2 + src/common/query/src/error.rs | 6 +- src/common/query/src/prelude.rs | 56 +++++++++++++++++-- src/datanode/src/config.rs | 2 + 
src/datanode/src/datanode.rs | 10 +++- src/datanode/src/error.rs | 10 +++- src/frontend/src/frontend.rs | 2 + .../src/handler/persist_stats_handler.rs | 1 + src/metric-engine/src/data_region.rs | 5 +- src/metric-engine/src/engine/alter.rs | 11 ++-- src/metric-engine/src/engine/create.rs | 5 +- src/metric-engine/src/engine/sync.rs | 3 +- src/metric-engine/src/test_util.rs | 17 +++--- src/mito-codec/Cargo.toml | 1 + src/mito-codec/src/primary_key_filter.rs | 5 +- src/mito-codec/src/row_converter/sparse.rs | 5 +- src/mito2/src/memtable/partition_tree.rs | 5 +- src/operator/src/insert.rs | 6 +- src/operator/src/statement/ddl.rs | 8 +-- .../src/etl/transform/transformer/greptime.rs | 16 ++---- src/pipeline/tests/date.rs | 3 +- src/pipeline/tests/dissect.rs | 11 ++-- src/pipeline/tests/epoch.rs | 7 ++- src/pipeline/tests/join.rs | 3 +- src/pipeline/tests/letter.rs | 7 ++- src/pipeline/tests/on_failure.rs | 9 +-- src/pipeline/tests/regex.rs | 5 +- src/pipeline/tests/simple_extract.rs | 3 +- src/pipeline/tests/urlencoding.rs | 5 +- src/query/src/promql/planner.rs | 11 ++-- src/query/src/sql.rs | 8 +-- src/servers/src/http/loki.rs | 8 +-- src/servers/src/influxdb.rs | 9 +-- src/servers/src/opentsdb.rs | 6 +- src/servers/src/opentsdb/codec.rs | 10 ++-- src/servers/src/otlp/metrics.rs | 54 +++++++++--------- src/servers/src/prom_row_builder.rs | 13 +++-- src/servers/src/prom_store.rs | 49 ++++++++-------- src/servers/src/proto.rs | 6 +- src/standalone/src/options.rs | 2 + 52 files changed, 305 insertions(+), 163 deletions(-) create mode 100644 src/common/base/src/regex_pattern.rs diff --git a/Cargo.lock b/Cargo.lock index c2bad3d971..f4ff29fb70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2004,9 +2004,11 @@ dependencies = [ "common-macro", "common-test-util", "futures", + "lazy_static", "paste", "pin-project", "rand 0.9.1", + "regex", "serde", "snafu 0.8.6", "tokio", @@ -2454,6 +2456,7 @@ dependencies = [ "datafusion-expr", "datatypes", "futures-util", + "once_cell", "serde", "snafu 0.8.6", "sqlparser", @@ -7579,6 +7582,7 @@ dependencies = [ "common-decimal", "common-error", "common-macro", + "common-query", "common-recordbatch", "common-telemetry", "common-time", diff --git a/config/config.md b/config/config.md index 72d48b5bcb..1c20e66540 100644 --- a/config/config.md +++ b/config/config.md @@ -13,6 +13,7 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `default_timezone` | String | Unset | The default timezone of the server. | +| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | | `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. | @@ -226,6 +227,7 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `default_timezone` | String | Unset | The default timezone of the server. | +| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | | `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. | | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | @@ -440,6 +442,7 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `node_id` | Integer | Unset | The datanode identifier and should be unique in the cluster. | +| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | | `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.
It will block the datanode start if it can't receive leases in the heartbeat from metasrv. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 82ee07bd84..b232f5109f 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -2,6 +2,10 @@ ## @toml2docs:none-default node_id = 42 +## The default column prefix for auto-created time index and value columns. +## @toml2docs:none-default +default_column_prefix = "greptime" + ## Start services after regions have obtained leases. ## It will block the datanode start if it can't receive leases in the heartbeat from metasrv. require_lease_before_startup = false diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 9ffcdad540..70c61c82c7 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -2,6 +2,10 @@ ## @toml2docs:none-default default_timezone = "UTC" +## The default column prefix for auto-created time index and value columns. +## @toml2docs:none-default +default_column_prefix = "greptime" + ## The maximum in-flight write bytes. ## @toml2docs:none-default #+ max_in_flight_write_bytes = "500MB" diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 744dbbe751..22f5574ef5 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -2,6 +2,10 @@ ## @toml2docs:none-default default_timezone = "UTC" +## The default column prefix for auto-created time index and value columns. +## @toml2docs:none-default +default_column_prefix = "greptime" + ## Initialize all regions in the background during the startup. ## By default, it provides services after all regions have been initialized. init_regions_in_background = false diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index fda6d968bf..89992eba37 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -25,11 +25,13 @@ use clap::Parser; use client::client_manager::NodeClients; use common_base::Plugins; use common_config::{Configurable, DEFAULT_DATA_HOME}; +use common_error::ext::BoxedError; use common_grpc::channel_manager::ChannelConfig; use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; use common_meta::heartbeat::handler::HandlerGroupExecutor; use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler; use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; +use common_query::prelude::set_default_prefix; use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; @@ -333,6 +335,9 @@ impl StartCommand { .context(error::StartFrontendSnafu)?; set_default_timezone(opts.default_timezone.as_deref()).context(error::InitTimezoneSnafu)?; + set_default_prefix(opts.default_column_prefix.as_deref()) + .map_err(BoxedError::new) + .context(error::BuildCliSnafu)?; let meta_client_options = opts .meta_client diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 58602d0a39..bf5aff7825 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -41,6 +41,7 @@ use common_meta::region_registry::LeaderRegionRegistry; use common_meta::sequence::SequenceBuilder; use common_meta::wal_options_allocator::{WalOptionsAllocatorRef, build_wal_options_allocator}; use common_procedure::ProcedureManagerRef; +use common_query::prelude::set_default_prefix; use common_telemetry::info; use 
common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_time::timezone::set_default_timezone; @@ -355,6 +356,10 @@ impl StartCommand { let mut plugins = Plugins::new(); let plugin_opts = opts.plugins; let mut opts = opts.component; + set_default_prefix(opts.default_column_prefix.as_deref()) + .map_err(BoxedError::new) + .context(error::BuildCliSnafu)?; + opts.grpc.detect_server_addr(); let fe_opts = opts.frontend_options(); let dn_opts = opts.datanode_options(); diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs index b92cf9631d..f4ee324b69 100644 --- a/src/cmd/tests/load_config_test.rs +++ b/src/cmd/tests/load_config_test.rs @@ -48,6 +48,7 @@ fn test_load_datanode_example_config() { let expected = GreptimeOptions:: { component: DatanodeOptions { node_id: Some(42), + default_column_prefix: Some("greptime".to_string()), meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), @@ -113,6 +114,7 @@ fn test_load_frontend_example_config() { let expected = GreptimeOptions:: { component: FrontendOptions { default_timezone: Some("UTC".to_string()), + default_column_prefix: Some("greptime".to_string()), meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), @@ -273,6 +275,7 @@ fn test_load_standalone_example_config() { let expected = GreptimeOptions:: { component: StandaloneOptions { default_timezone: Some("UTC".to_string()), + default_column_prefix: Some("greptime".to_string()), wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig { dir: Some(format!("{}/{}", DEFAULT_DATA_HOME, WAL_DIR)), sync_period: Some(Duration::from_secs(10)), diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index ae2945b1f5..4a881990b4 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -18,9 +18,11 @@ bytes.workspace = true common-error.workspace = true common-macro.workspace = true futures.workspace = true +lazy_static.workspace = true paste.workspace = true pin-project.workspace = true rand.workspace = true +regex.workspace = true serde = { version = "1.0", features = ["derive"] } snafu.workspace = true tokio.workspace = true diff --git a/src/common/base/src/lib.rs b/src/common/base/src/lib.rs index cc5acdbf47..1f530c2753 100644 --- a/src/common/base/src/lib.rs +++ b/src/common/base/src/lib.rs @@ -19,6 +19,7 @@ pub mod plugins; pub mod range_read; #[allow(clippy::all)] pub mod readable_size; +pub mod regex_pattern; pub mod secrets; pub mod serde; diff --git a/src/common/base/src/regex_pattern.rs b/src/common/base/src/regex_pattern.rs new file mode 100644 index 0000000000..7ff46693ba --- /dev/null +++ b/src/common/base/src/regex_pattern.rs @@ -0,0 +1,22 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use lazy_static::lazy_static; +use regex::Regex; + +pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*"; + +lazy_static! 
{ + pub static ref NAME_PATTERN_REG: Regex = Regex::new(&format!("^{NAME_PATTERN}$")).unwrap(); +} diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index a1d98db301..55dbc0ad01 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -121,6 +121,7 @@ use std::ops::{Deref, DerefMut}; use std::sync::Arc; use bytes::Bytes; +use common_base::regex_pattern::NAME_PATTERN; use common_catalog::consts::{ DEFAULT_CATALOG_NAME, DEFAULT_PRIVATE_SCHEMA_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, }; @@ -164,7 +165,6 @@ use crate::rpc::router::{LeaderState, RegionRoute, region_distribution}; use crate::rpc::store::BatchDeleteRequest; use crate::state_store::PoisonValue; -pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*"; pub const TOPIC_NAME_PATTERN: &str = r"[a-zA-Z0-9_:-][a-zA-Z0-9_:\-\.@#]*"; pub const LEGACY_MAINTENANCE_KEY: &str = "__maintenance"; pub const MAINTENANCE_KEY: &str = "__switches/maintenance"; @@ -269,10 +269,6 @@ pub type FlowId = u32; /// The partition of flow. pub type FlowPartitionId = u32; -lazy_static! { - pub static ref NAME_PATTERN_REGEX: Regex = Regex::new(NAME_PATTERN).unwrap(); -} - lazy_static! { pub static ref TOPIC_NAME_PATTERN_REGEX: Regex = Regex::new(TOPIC_NAME_PATTERN).unwrap(); } diff --git a/src/common/query/Cargo.toml b/src/common/query/Cargo.toml index 7cdc5a8a45..48328ea612 100644 --- a/src/common/query/Cargo.toml +++ b/src/common/query/Cargo.toml @@ -14,6 +14,7 @@ workspace = true api.workspace = true async-trait.workspace = true bytes.workspace = true +common-base.workspace = true common-error.workspace = true common-macro.workspace = true common-recordbatch.workspace = true @@ -22,6 +23,7 @@ datafusion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true datatypes.workspace = true +once_cell.workspace = true serde.workspace = true snafu.workspace = true sqlparser.workspace = true diff --git a/src/common/query/src/error.rs b/src/common/query/src/error.rs index 163efb30a7..618795bb4a 100644 --- a/src/common/query/src/error.rs +++ b/src/common/query/src/error.rs @@ -199,6 +199,9 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Invalid character in prefix config: {}", prefix))] + InvalidColumnPrefix { prefix: String }, } pub type Result = std::result::Result; @@ -227,7 +230,8 @@ impl ErrorExt for Error { Error::UnsupportedInputDataType { .. } | Error::TypeCast { .. } - | Error::InvalidFuncArgs { .. } => StatusCode::InvalidArguments, + | Error::InvalidFuncArgs { .. } + | Error::InvalidColumnPrefix { .. } => StatusCode::InvalidArguments, Error::ConvertDfRecordBatchStream { source, .. } => source.status_code(), diff --git a/src/common/query/src/prelude.rs b/src/common/query/src/prelude.rs index f467906402..c27b94294e 100644 --- a/src/common/query/src/prelude.rs +++ b/src/common/query/src/prelude.rs @@ -12,15 +12,61 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_base::regex_pattern::NAME_PATTERN_REG; pub use datafusion_common::ScalarValue; +use once_cell::sync::OnceCell; +use snafu::ensure; pub use crate::columnar_value::ColumnarValue; +use crate::error::{InvalidColumnPrefixSnafu, Result}; -/// Default timestamp column name for Prometheus metrics. -pub const GREPTIME_TIMESTAMP: &str = "greptime_timestamp"; -/// Default value column name for Prometheus metrics. -pub const GREPTIME_VALUE: &str = "greptime_value"; -/// Default counter column name for OTLP metrics. 
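The hard-coded constants above are replaced by the configurable accessors added below. A usage sketch, assuming `set_default_prefix` is called once at process startup before any reader touches the defaults (the values are latched in `OnceCell`s, so later calls cannot change them):

```rust
use common_query::prelude::{greptime_timestamp, greptime_value, set_default_prefix};

fn main() {
    // None        -> "greptime_timestamp" / "greptime_value" (backward compatible)
    // Some("")    -> "timestamp" / "value" (prefix disabled)
    // Some("app") -> "app_timestamp" / "app_value"
    set_default_prefix(Some("app")).unwrap();
    assert_eq!(greptime_timestamp(), "app_timestamp");
    assert_eq!(greptime_value(), "app_value");
}
```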
+/// Default time index column name. +static GREPTIME_TIMESTAMP_CELL: OnceCell = OnceCell::new(); + +/// Default value column name. +static GREPTIME_VALUE_CELL: OnceCell = OnceCell::new(); + +pub fn set_default_prefix(prefix: Option<&str>) -> Result<()> { + match prefix { + None => { + // use default greptime prefix + GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string()); + GREPTIME_VALUE_CELL.get_or_init(|| GREPTIME_VALUE.to_string()); + } + Some(s) if s.trim().is_empty() => { + // use "" to disable prefix + GREPTIME_TIMESTAMP_CELL.get_or_init(|| "timestamp".to_string()); + GREPTIME_VALUE_CELL.get_or_init(|| "value".to_string()); + } + Some(x) => { + ensure!( + NAME_PATTERN_REG.is_match(x), + InvalidColumnPrefixSnafu { prefix: x } + ); + GREPTIME_TIMESTAMP_CELL.get_or_init(|| format!("{}_timestamp", x)); + GREPTIME_VALUE_CELL.get_or_init(|| format!("{}_value", x)); + } + } + Ok(()) +} + +/// Get the default timestamp column name. +/// Returns the configured value, or `greptime_timestamp` if not set. +pub fn greptime_timestamp() -> &'static str { + GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string()) +} + +/// Get the default value column name. +/// Returns the configured value, or `greptime_value` if not set. +pub fn greptime_value() -> &'static str { + GREPTIME_VALUE_CELL.get_or_init(|| GREPTIME_VALUE.to_string()) +} + +/// Default timestamp column name constant for backward compatibility. +const GREPTIME_TIMESTAMP: &str = "greptime_timestamp"; +/// Default value column name constant for backward compatibility. +const GREPTIME_VALUE: &str = "greptime_value"; +/// Default counter column name for OTLP metrics (legacy mode). pub const GREPTIME_COUNT: &str = "greptime_count"; /// Default physical table name pub const GREPTIME_PHYSICAL_TABLE: &str = "greptime_physical_table"; diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs index 2f2fcd2697..e40a52bd6b 100644 --- a/src/datanode/src/config.rs +++ b/src/datanode/src/config.rs @@ -66,6 +66,7 @@ impl Default for StorageConfig { #[serde(default)] pub struct DatanodeOptions { pub node_id: Option, + pub default_column_prefix: Option, pub workload_types: Vec, pub require_lease_before_startup: bool, pub init_regions_in_background: bool, @@ -119,6 +120,7 @@ impl Default for DatanodeOptions { fn default() -> Self { Self { node_id: None, + default_column_prefix: None, workload_types: vec![DatanodeWorkloadType::Hybrid], require_lease_before_startup: false, init_regions_in_background: false, diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index b9b8edcdba..50d0ef4076 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -27,6 +27,7 @@ use common_meta::key::runtime_switch::RuntimeSwitchManager; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; pub use common_procedure::options::ProcedureConfig; +use common_query::prelude::set_default_prefix; use common_stat::ResourceStatImpl; use common_telemetry::{error, info, warn}; use common_wal::config::DatanodeWalConfig; @@ -59,9 +60,9 @@ use tokio::sync::Notify; use crate::config::{DatanodeOptions, RegionEngineConfig, StorageConfig}; use crate::error::{ - self, BuildMetricEngineSnafu, BuildMitoEngineSnafu, CreateDirSnafu, GetMetadataSnafu, - MissingCacheSnafu, MissingNodeIdSnafu, OpenLogStoreSnafu, Result, ShutdownInstanceSnafu, - ShutdownServerSnafu, StartServerSnafu, + self, BuildDatanodeSnafu, BuildMetricEngineSnafu, BuildMitoEngineSnafu, 
CreateDirSnafu, + GetMetadataSnafu, MissingCacheSnafu, MissingNodeIdSnafu, OpenLogStoreSnafu, Result, + ShutdownInstanceSnafu, ShutdownServerSnafu, StartServerSnafu, }; use crate::event_listener::{ NoopRegionServerEventListener, RegionServerEventListenerRef, RegionServerEventReceiver, @@ -220,6 +221,9 @@ impl DatanodeBuilder { pub async fn build(mut self) -> Result { let node_id = self.opts.node_id.context(MissingNodeIdSnafu)?; + set_default_prefix(self.opts.default_column_prefix.as_deref()) + .map_err(BoxedError::new) + .context(BuildDatanodeSnafu)?; let meta_client = self.meta_client.take(); diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index a2e6f674e2..eda483a1e2 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -165,6 +165,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to build datanode"))] + BuildDatanode { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Failed to build http client"))] BuildHttpClient { #[snafu(implicit)] @@ -429,7 +436,8 @@ impl ErrorExt for Error { | MissingRequiredField { .. } | RegionEngineNotFound { .. } | ParseAddr { .. } - | TomlFormat { .. } => StatusCode::InvalidArguments, + | TomlFormat { .. } + | BuildDatanode { .. } => StatusCode::InvalidArguments, PayloadNotExist { .. } | Unexpected { .. } diff --git a/src/frontend/src/frontend.rs b/src/frontend/src/frontend.rs index bf2e7a0558..dce9ffd158 100644 --- a/src/frontend/src/frontend.rs +++ b/src/frontend/src/frontend.rs @@ -45,6 +45,7 @@ use crate::service_config::{ pub struct FrontendOptions { pub node_id: Option, pub default_timezone: Option, + pub default_column_prefix: Option, pub heartbeat: HeartbeatOptions, pub http: HttpOptions, pub grpc: GrpcOptions, @@ -77,6 +78,7 @@ impl Default for FrontendOptions { Self { node_id: None, default_timezone: None, + default_column_prefix: None, heartbeat: HeartbeatOptions::frontend_default(), http: HttpOptions::default(), grpc: GrpcOptions::default(), diff --git a/src/meta-srv/src/handler/persist_stats_handler.rs b/src/meta-srv/src/handler/persist_stats_handler.rs index 1dc81f49eb..abc2fa3c3e 100644 --- a/src/meta-srv/src/handler/persist_stats_handler.rs +++ b/src/meta-srv/src/handler/persist_stats_handler.rs @@ -77,6 +77,7 @@ struct PersistRegionStat<'a> { sst_size: u64, write_bytes_delta: u64, #[col( + // This col name is for the information schema table, so we don't touch it name = "greptime_timestamp", semantic = "Timestamp", datatype = "TimestampMillisecond" diff --git a/src/metric-engine/src/data_region.rs b/src/metric-engine/src/data_region.rs index 5056cd0352..beab78cd70 100644 --- a/src/metric-engine/src/data_region.rs +++ b/src/metric-engine/src/data_region.rs @@ -240,6 +240,7 @@ impl DataRegion { #[cfg(test)] mod test { + use common_query::prelude::{greptime_timestamp, greptime_value}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -300,8 +301,8 @@ mod test { .map(|c| &c.column_schema.name) .collect::>(); let expected = vec![ - "greptime_timestamp", - "greptime_value", + greptime_timestamp(), + greptime_value(), "__table_id", "__tsid", "job", diff --git a/src/metric-engine/src/engine/alter.rs b/src/metric-engine/src/engine/alter.rs index 1c4cb93639..1ae63915e9 100644 --- a/src/metric-engine/src/engine/alter.rs +++ b/src/metric-engine/src/engine/alter.rs @@ -224,6 +224,7 @@ mod test { use api::v1::SemanticType; use common_meta::ddl::test_util::assert_column_name_and_id; use 
common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; + use common_query::prelude::{greptime_timestamp, greptime_value}; use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY; use store_api::region_engine::RegionEngine; use store_api::region_request::{ @@ -295,7 +296,7 @@ mod test { .unwrap(); assert_eq!(semantic_type, SemanticType::Tag); let timestamp_index = metadata_region - .column_semantic_type(physical_region_id, logical_region_id, "greptime_timestamp") + .column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp()) .await .unwrap() .unwrap(); @@ -305,8 +306,8 @@ mod test { assert_column_name_and_id( &column_metadatas, &[ - ("greptime_timestamp", 0), - ("greptime_value", 1), + (greptime_timestamp(), 0), + (greptime_value(), 1), ("__table_id", ReservedColumnId::table_id()), ("__tsid", ReservedColumnId::tsid()), ("job", 2), @@ -364,8 +365,8 @@ mod test { assert_column_name_and_id( &column_metadatas, &[ - ("greptime_timestamp", 0), - ("greptime_value", 1), + (greptime_timestamp(), 0), + (greptime_value(), 1), ("__table_id", ReservedColumnId::table_id()), ("__tsid", ReservedColumnId::tsid()), ("job", 2), diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs index c506c0e2b4..2796d3652b 100644 --- a/src/metric-engine/src/engine/create.rs +++ b/src/metric-engine/src/engine/create.rs @@ -619,6 +619,7 @@ pub(crate) fn region_options_for_metadata_region( mod test { use common_meta::ddl::test_util::assert_column_name_and_id; use common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; + use common_query::prelude::{greptime_timestamp, greptime_value}; use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY}; use store_api::region_request::BatchRegionDdlRequest; @@ -856,8 +857,8 @@ mod test { assert_column_name_and_id( &column_metadatas, &[ - ("greptime_timestamp", 0), - ("greptime_value", 1), + (greptime_timestamp(), 0), + (greptime_value(), 1), ("__table_id", ReservedColumnId::table_id()), ("__tsid", ReservedColumnId::tsid()), ("job", 2), diff --git a/src/metric-engine/src/engine/sync.rs b/src/metric-engine/src/engine/sync.rs index b62b138dab..741938f8d7 100644 --- a/src/metric-engine/src/engine/sync.rs +++ b/src/metric-engine/src/engine/sync.rs @@ -110,6 +110,7 @@ mod tests { use std::collections::HashMap; use api::v1::SemanticType; + use common_query::prelude::greptime_timestamp; use common_telemetry::info; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -243,7 +244,7 @@ mod tests { .unwrap(); assert_eq!(semantic_type, SemanticType::Tag); let timestamp_index = metadata_region - .column_semantic_type(physical_region_id, logical_region_id, "greptime_timestamp") + .column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp()) .await .unwrap() .unwrap(); diff --git a/src/metric-engine/src/test_util.rs b/src/metric-engine/src/test_util.rs index d594541d84..cc173e534c 100644 --- a/src/metric-engine/src/test_util.rs +++ b/src/metric-engine/src/test_util.rs @@ -17,6 +17,7 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema as PbColumnSchema, Row, SemanticType, Value}; use common_meta::ddl::utils::parse_column_metadatas; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_telemetry::debug; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -132,7 +133,7 @@ impl TestEnv { column_id: 0, semantic_type: 
SemanticType::Timestamp, column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), false, ), @@ -141,7 +142,7 @@ impl TestEnv { column_id: 1, semantic_type: SemanticType::Field, column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -204,8 +205,8 @@ impl TestEnv { assert_eq!( column_names, vec![ - "greptime_timestamp", - "greptime_value", + greptime_timestamp(), + greptime_value(), "__table_id", "__tsid", "job", @@ -300,7 +301,7 @@ pub fn create_logical_region_request( column_id: 0, semantic_type: SemanticType::Timestamp, column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), false, ), @@ -309,7 +310,7 @@ pub fn create_logical_region_request( column_id: 1, semantic_type: SemanticType::Field, column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -372,14 +373,14 @@ pub fn alter_logical_region_request(tags: &[&str]) -> RegionAlterRequest { pub fn row_schema_with_tags(tags: &[&str]) -> Vec { let mut schema = vec![ PbColumnSchema { - column_name: "greptime_timestamp".to_string(), + column_name: greptime_timestamp().to_string(), datatype: ColumnDataType::TimestampMillisecond as i32, semantic_type: SemanticType::Timestamp as _, datatype_extension: None, options: None, }, PbColumnSchema { - column_name: "greptime_value".to_string(), + column_name: greptime_value().to_string(), datatype: ColumnDataType::Float64 as i32, semantic_type: SemanticType::Field as _, datatype_extension: None, diff --git a/src/mito-codec/Cargo.toml b/src/mito-codec/Cargo.toml index 99a46e8ac9..81808f2714 100644 --- a/src/mito-codec/Cargo.toml +++ b/src/mito-codec/Cargo.toml @@ -15,6 +15,7 @@ common-base.workspace = true common-decimal.workspace = true common-error.workspace = true common-macro.workspace = true +common-query.workspace = true common-recordbatch.workspace = true common-telemetry.workspace = true common-time.workspace = true diff --git a/src/mito-codec/src/primary_key_filter.rs b/src/mito-codec/src/primary_key_filter.rs index e4d1ce5056..c71fafc974 100644 --- a/src/mito-codec/src/primary_key_filter.rs +++ b/src/mito-codec/src/primary_key_filter.rs @@ -154,6 +154,7 @@ mod tests { use std::sync::Arc; use api::v1::SemanticType; + use common_query::prelude::{greptime_timestamp, greptime_value}; use datafusion_common::Column; use datafusion_expr::{BinaryExpr, Expr, Literal, Operator}; use datatypes::prelude::ConcreteDataType; @@ -193,7 +194,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -202,7 +203,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_nanosecond_datatype(), false, ), diff --git a/src/mito-codec/src/row_converter/sparse.rs b/src/mito-codec/src/row_converter/sparse.rs index edc26db8f0..191c2bd011 100644 --- a/src/mito-codec/src/row_converter/sparse.rs +++ b/src/mito-codec/src/row_converter/sparse.rs @@ -385,6 +385,7 @@ mod tests { use std::sync::Arc; use api::v1::SemanticType; + use common_query::prelude::{greptime_timestamp, greptime_value}; use common_time::Timestamp; use common_time::timestamp::TimeUnit; use datatypes::schema::ColumnSchema; @@ -461,7 +462,7 @@ 
mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -470,7 +471,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_nanosecond_datatype(), false, ), diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs index e404a5851e..31cadac4f1 100644 --- a/src/mito2/src/memtable/partition_tree.rs +++ b/src/mito2/src/memtable/partition_tree.rs @@ -384,6 +384,7 @@ mod tests { use api::v1::helper::{field_column_schema, row, tag_column_schema, time_index_column_schema}; use api::v1::value::ValueData; use api::v1::{Mutation, OpType, Rows, SemanticType}; + use common_query::prelude::{greptime_timestamp, greptime_value}; use common_time::Timestamp; use datafusion_common::Column; use datafusion_expr::{BinaryExpr, Expr, Literal, Operator}; @@ -694,7 +695,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), false, ), @@ -703,7 +704,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), true, ), diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index 59ab06c95e..9de4fb3fba 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -37,7 +37,7 @@ use common_meta::cache::TableFlownodeSetCacheRef; use common_meta::node_manager::{AffectedRows, NodeManagerRef}; use common_meta::peer::Peer; use common_query::Output; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_telemetry::tracing_context::TracingContext; use common_telemetry::{error, info, warn}; use datatypes::schema::SkippingIndexOptions; @@ -721,14 +721,14 @@ impl Inserter { // schema with timestamp and field column let default_schema = vec![ ColumnSchema { - column_name: GREPTIME_TIMESTAMP.to_string(), + column_name: greptime_timestamp().to_string(), datatype: ColumnDataType::TimestampMillisecond as _, semantic_type: SemanticType::Timestamp as _, datatype_extension: None, options: None, }, ColumnSchema { - column_name: GREPTIME_VALUE.to_string(), + column_name: greptime_value().to_string(), datatype: ColumnDataType::Float64 as _, semantic_type: SemanticType::Field as _, datatype_extension: None, diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs index 3b626d13d0..295e33e43e 100644 --- a/src/operator/src/statement/ddl.rs +++ b/src/operator/src/statement/ddl.rs @@ -26,13 +26,13 @@ use api::v1::{ }; use catalog::CatalogManagerRef; use chrono::Utc; +use common_base::regex_pattern::NAME_PATTERN_REG; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_readonly_schema}; use common_catalog::{format_full_flow_name, format_full_table_name}; use common_error::ext::BoxedError; use common_meta::cache_invalidator::Context; use common_meta::ddl::create_flow::FlowType; use common_meta::instruction::CacheIdent; -use common_meta::key::NAME_PATTERN; use common_meta::key::schema_name::{SchemaName, SchemaNameKey}; use common_meta::procedure_executor::ExecutorContext; #[cfg(feature = "enterprise")] @@ -52,14 +52,12 @@ use datafusion_expr::LogicalPlan; use 
datatypes::prelude::ConcreteDataType; use datatypes::schema::{RawSchema, Schema}; use datatypes::value::Value; -use lazy_static::lazy_static; use partition::expr::{Operand, PartitionExpr, RestrictedOp}; use partition::multi_dim::MultiDimPartitionRule; use query::parser::QueryStatement; use query::plan::extract_and_rewrite_full_table_names; use query::query_engine::DefaultSerializer; use query::sql::create_table_stmt; -use regex::Regex; use session::context::QueryContextRef; use session::table_name::table_idents_to_full_name; use snafu::{OptionExt, ResultExt, ensure}; @@ -96,10 +94,6 @@ use crate::expr_helper; use crate::statement::StatementExecutor; use crate::statement::show::create_partitions_stmt; -lazy_static! { - pub static ref NAME_PATTERN_REG: Regex = Regex::new(&format!("^{NAME_PATTERN}$")).unwrap(); -} - impl StatementExecutor { pub fn catalog_manager(&self) -> CatalogManagerRef { self.catalog_manager.clone() diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index de98213972..6774842ef1 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -24,7 +24,7 @@ use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; use coerce::{coerce_columns, coerce_value}; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_telemetry::warn; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; @@ -48,7 +48,6 @@ use crate::etl::transform::index::Index; use crate::etl::transform::{Transform, Transforms}; use crate::{PipelineContext, truthy, unwrap_or_continue_if_err}; -const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; /// fields not in the columns will be discarded @@ -138,10 +137,7 @@ impl GreptimeTransformer { let default = None; let transform = Transform { - fields: Fields::one(Field::new( - DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - None, - )), + fields: Fields::one(Field::new(greptime_timestamp().to_string(), None)), type_, default, index: Some(Index::Time), @@ -347,7 +343,7 @@ fn calc_ts(p_ctx: &PipelineContext, values: &VrlValue) -> Result { let ts = values .as_object() - .and_then(|m| m.get(GREPTIME_TIMESTAMP)) + .and_then(|m| m.get(greptime_timestamp())) .and_then(|ts| ts.try_into_i64().ok()) .unwrap_or_default(); Ok(Some(ValueData::TimestampMillisecondValue(ts))) @@ -395,7 +391,7 @@ pub(crate) fn values_to_row( // skip ts column let ts_column_name = custom_ts .as_ref() - .map_or(DEFAULT_GREPTIME_TIMESTAMP_COLUMN, |ts| ts.get_column_name()); + .map_or(greptime_timestamp(), |ts| ts.get_column_name()); let values = values.into_object().context(ValueMustBeMapSnafu)?; @@ -416,7 +412,7 @@ pub(crate) fn values_to_row( } fn decide_semantic(p_ctx: &PipelineContext, column_name: &str) -> i32 { - if p_ctx.channel == Channel::Prometheus && column_name != GREPTIME_VALUE { + if p_ctx.channel == Channel::Prometheus && column_name != greptime_value() { SemanticType::Tag as i32 } else { SemanticType::Field as i32 @@ -563,7 +559,7 @@ fn identity_pipeline_inner( schema_info.schema.push(ColumnSchema { column_name: custom_ts .map(|ts| ts.get_column_name().to_string()) - .unwrap_or_else(|| 
DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()), + .unwrap_or_else(|| greptime_timestamp().to_string()), datatype: custom_ts.map(|c| c.get_datatype()).unwrap_or_else(|| { if pipeline_ctx.channel == Channel::Prometheus { ColumnDataType::TimestampMillisecond diff --git a/src/pipeline/tests/date.rs b/src/pipeline/tests/date.rs index fc9e726b61..0164dd4c22 100644 --- a/src/pipeline/tests/date.rs +++ b/src/pipeline/tests/date.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; use lazy_static::lazy_static; @@ -35,7 +36,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index a24e374532..b948110511 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -14,6 +14,7 @@ mod common; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; use pipeline::{PipelineContext, setup_pipeline}; @@ -51,7 +52,7 @@ transform: make_string_column_schema("a".to_string()), make_string_column_schema("b".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -99,7 +100,7 @@ transform: make_string_column_schema("a".to_string()), make_string_column_schema("b".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -142,7 +143,7 @@ transform: make_string_column_schema("a".to_string()), make_string_column_schema("b".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -185,7 +186,7 @@ transform: make_string_column_schema("key3".to_string()), make_string_column_schema("key5".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -236,7 +237,7 @@ transform: let expected_schema = vec![ make_string_column_schema("key1".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/epoch.rs b/src/pipeline/tests/epoch.rs index 84662793b9..ead018ad42 100644 --- a/src/pipeline/tests/epoch.rs +++ b/src/pipeline/tests/epoch.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; @@ -128,7 +129,7 @@ transform: make_time_field("input_nanosecond", ColumnDataType::TimestampNanosecond), make_time_field("input_nano", ColumnDataType::TimestampNanosecond), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -187,7 +188,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + 
greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -238,7 +239,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/join.rs b/src/pipeline/tests/join.rs index 3625160361..dbc966404f 100644 --- a/src/pipeline/tests/join.rs +++ b/src/pipeline/tests/join.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, ColumnSchema, SemanticType}; use lazy_static::lazy_static; @@ -38,7 +39,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/letter.rs b/src/pipeline/tests/letter.rs index d6d9a2cccb..307da50867 100644 --- a/src/pipeline/tests/letter.rs +++ b/src/pipeline/tests/letter.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; use lazy_static::lazy_static; @@ -27,7 +28,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -125,7 +126,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -175,7 +176,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/on_failure.rs b/src/pipeline/tests/on_failure.rs index 2662a3fa96..d7df1ad7fa 100644 --- a/src/pipeline/tests/on_failure.rs +++ b/src/pipeline/tests/on_failure.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::{U8Value, U16Value}; use greptime_proto::v1::{ColumnDataType, SemanticType}; @@ -46,7 +47,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -87,7 +88,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -123,7 +124,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -175,7 +176,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/regex.rs b/src/pipeline/tests/regex.rs index a8a7daaf5c..a0a3944c8e 100644 --- a/src/pipeline/tests/regex.rs +++ b/src/pipeline/tests/regex.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; use lazy_static::lazy_static; @@ -27,7 +28,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -156,7 +157,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/simple_extract.rs b/src/pipeline/tests/simple_extract.rs index ee2fbcbcae..2a93e5d135 100644 --- a/src/pipeline/tests/simple_extract.rs +++ b/src/pipeline/tests/simple_extract.rs @@ -16,6 +16,7 @@ mod common; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema, SemanticType}; +use common_query::prelude::greptime_timestamp; use lazy_static::lazy_static; lazy_static! { @@ -26,7 +27,7 @@ lazy_static! 
{ SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/urlencoding.rs b/src/pipeline/tests/urlencoding.rs index dd0c4ffe9f..b8366aa044 100644 --- a/src/pipeline/tests/urlencoding.rs +++ b/src/pipeline/tests/urlencoding.rs @@ -14,6 +14,7 @@ mod common; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; @@ -54,7 +55,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -100,7 +101,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index a1dc1b640a..5cc26cee05 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -22,7 +22,7 @@ use catalog::table_source::DfTableSourceProvider; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_function::function::FunctionContext; -use common_query::prelude::GREPTIME_VALUE; +use common_query::prelude::greptime_value; use datafusion::common::DFSchemaRef; use datafusion::datasource::DefaultTableSource; use datafusion::functions_aggregate::average::avg_udaf; @@ -2576,7 +2576,7 @@ impl PromPlanner { self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string()); self.ctx.reset_table_name_and_schema(); self.ctx.tag_columns = vec![]; - self.ctx.field_columns = vec![GREPTIME_VALUE.to_string()]; + self.ctx.field_columns = vec![greptime_value().to_string()]; Ok(LogicalPlan::Extension(Extension { node: Arc::new( EmptyMetric::new( @@ -2584,7 +2584,7 @@ impl PromPlanner { self.ctx.end, self.ctx.interval, SPECIAL_TIME_FUNCTION.to_string(), - GREPTIME_VALUE.to_string(), + greptime_value().to_string(), Some(lit), ) .context(DataFusionPlanningSnafu)?, @@ -3433,6 +3433,7 @@ mod test { use catalog::memory::{MemoryCatalogManager, new_memory_catalog_manager}; use common_base::Plugins; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; + use common_query::prelude::greptime_timestamp; use common_query::test_util::DummyDecoder; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; @@ -3543,14 +3544,14 @@ mod test { } columns.push( ColumnSchema::new( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ConcreteDataType::timestamp_millisecond_datatype(), false, ) .with_time_index(true), ); columns.push(ColumnSchema::new( - "greptime_value".to_string(), + greptime_value().to_string(), ConcreteDataType::float64_datatype(), true, )); diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs index 6b6ee2ed07..693b1aa068 100644 --- a/src/query/src/sql.rs +++ b/src/query/src/sql.rs @@ -34,7 +34,7 @@ use common_datasource::util::find_dir_and_filename; use common_meta::SchemaOptions; use common_meta::key::flow::flow_info::FlowInfoValue; use common_query::Output; -use common_query::prelude::GREPTIME_TIMESTAMP; +use common_query::prelude::greptime_timestamp; use common_recordbatch::RecordBatches; use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_time::Timestamp; @@ -1195,14 +1195,14 @@ pub 
fn file_column_schemas_to_table( let timestamp_type = ConcreteDataType::timestamp_millisecond_datatype(); let default_zero = Value::Timestamp(Timestamp::new_millisecond(0)); - let timestamp_column_schema = ColumnSchema::new(GREPTIME_TIMESTAMP, timestamp_type, false) + let timestamp_column_schema = ColumnSchema::new(greptime_timestamp(), timestamp_type, false) .with_time_index(true) .with_default_constraint(Some(ColumnDefaultConstraint::Value(default_zero))) .unwrap(); if let Some(column_schema) = column_schemas .iter_mut() - .find(|column_schema| column_schema.name == GREPTIME_TIMESTAMP) + .find(|column_schema| column_schema.name == greptime_timestamp()) { // Replace the column schema with the default one *column_schema = timestamp_column_schema; @@ -1210,7 +1210,7 @@ pub fn file_column_schemas_to_table( column_schemas.push(timestamp_column_schema); } - (column_schemas, GREPTIME_TIMESTAMP.to_string()) + (column_schemas, greptime_timestamp().to_string()) } /// This function checks if the column schemas from a file can be matched with diff --git a/src/servers/src/http/loki.rs b/src/servers/src/http/loki.rs index 45d6eadadd..f10ab53190 100644 --- a/src/servers/src/http/loki.rs +++ b/src/servers/src/http/loki.rs @@ -26,7 +26,7 @@ use axum::extract::State; use axum_extra::TypedHeader; use bytes::Bytes; use chrono::DateTime; -use common_query::prelude::GREPTIME_TIMESTAMP; +use common_query::prelude::greptime_timestamp; use common_query::{Output, OutputData}; use common_telemetry::{error, warn}; use headers::ContentType; @@ -73,7 +73,7 @@ const LINES_KEY: &str = "values"; lazy_static! { static ref LOKI_INIT_SCHEMAS: Vec = vec![ ColumnSchema { - column_name: GREPTIME_TIMESTAMP.to_string(), + column_name: greptime_timestamp().to_string(), datatype: ColumnDataType::TimestampNanosecond.into(), semantic_type: SemanticType::Timestamp.into(), datatype_extension: None, @@ -453,7 +453,7 @@ impl From> for LokiPipeline { let mut map = BTreeMap::new(); map.insert( - KeyString::from(GREPTIME_TIMESTAMP), + KeyString::from(greptime_timestamp()), VrlValue::Timestamp(DateTime::from_timestamp_nanos(ts)), ); map.insert( @@ -586,7 +586,7 @@ impl From>> for LokiPipeline { let mut map = BTreeMap::new(); map.insert( - KeyString::from(GREPTIME_TIMESTAMP), + KeyString::from(greptime_timestamp()), VrlValue::Timestamp(DateTime::from_timestamp_nanos(ts)), ); map.insert( diff --git a/src/servers/src/influxdb.rs b/src/servers/src/influxdb.rs index 2ebfd9dd08..9bff0bbc6e 100644 --- a/src/servers/src/influxdb.rs +++ b/src/servers/src/influxdb.rs @@ -15,7 +15,7 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, RowInsertRequests}; use common_grpc::precision::Precision; -use common_query::prelude::GREPTIME_TIMESTAMP; +use common_query::prelude::greptime_timestamp; use hyper::Request; use influxdb_line_protocol::{FieldValue, parse_lines}; use snafu::ResultExt; @@ -91,7 +91,7 @@ impl TryFrom for RowInsertRequests { // timestamp row_writer::write_ts_to_nanos( table_data, - GREPTIME_TIMESTAMP, + greptime_timestamp(), ts, precision, &mut one_row, @@ -117,6 +117,7 @@ fn unwrap_or_default_precision(precision: Option) -> Precision { mod tests { use api::v1::value::ValueData; use api::v1::{ColumnDataType, RowInsertRequests, Rows, SemanticType}; + use common_query::prelude::greptime_timestamp; use crate::influxdb::InfluxdbRequest; @@ -193,7 +194,7 @@ monitor2,host=host4 cpu=66.3,memory=1029 1663840496400340003"; } } } - "greptime_timestamp" => { + _ if column_schema.column_name == greptime_timestamp() => { assert_eq!( 
ColumnDataType::TimestampNanosecond as i32, column_schema.datatype @@ -268,7 +269,7 @@ monitor2,host=host4 cpu=66.3,memory=1029 1663840496400340003"; } } } - "greptime_timestamp" => { + _ if column_schema.column_name == greptime_timestamp() => { assert_eq!( ColumnDataType::TimestampNanosecond as i32, column_schema.datatype diff --git a/src/servers/src/opentsdb.rs b/src/servers/src/opentsdb.rs index 9ae63c1b9e..203eef8c2b 100644 --- a/src/servers/src/opentsdb.rs +++ b/src/servers/src/opentsdb.rs @@ -16,7 +16,7 @@ pub mod codec; use api::v1::RowInsertRequests; use common_grpc::precision::Precision; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use self::codec::DataPoint; use crate::error::Result; @@ -42,11 +42,11 @@ pub fn data_point_to_grpc_row_insert_requests( row_writer::write_tags(table_data, tags.into_iter(), &mut one_row)?; // value - row_writer::write_f64(table_data, GREPTIME_VALUE, value, &mut one_row)?; + row_writer::write_f64(table_data, greptime_value(), value, &mut one_row)?; // timestamp row_writer::write_ts_to_millis( table_data, - GREPTIME_TIMESTAMP, + greptime_timestamp(), Some(timestamp), Precision::Millisecond, &mut one_row, diff --git a/src/servers/src/opentsdb/codec.rs b/src/servers/src/opentsdb/codec.rs index 16aa9b6381..c4760aa74d 100644 --- a/src/servers/src/opentsdb/codec.rs +++ b/src/servers/src/opentsdb/codec.rs @@ -13,7 +13,7 @@ // limitations under the License. use api::v1::{Column, ColumnDataType, InsertRequest as GrpcInsertRequest, SemanticType, column}; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use crate::error::{self, Result}; @@ -129,7 +129,7 @@ impl DataPoint { let mut columns = Vec::with_capacity(2 + self.tags.len()); let ts_column = Column { - column_name: GREPTIME_TIMESTAMP.to_string(), + column_name: greptime_timestamp().to_string(), values: Some(column::Values { timestamp_millisecond_values: vec![self.ts_millis], ..Default::default() @@ -141,7 +141,7 @@ impl DataPoint { columns.push(ts_column); let field_column = Column { - column_name: GREPTIME_VALUE.to_string(), + column_name: greptime_value().to_string(), values: Some(column::Values { f64_values: vec![self.value], ..Default::default() @@ -267,7 +267,7 @@ mod test { assert_eq!(row_count, 1); assert_eq!(columns.len(), 4); - assert_eq!(columns[0].column_name, GREPTIME_TIMESTAMP); + assert_eq!(columns[0].column_name, greptime_timestamp()); assert_eq!( columns[0] .values @@ -277,7 +277,7 @@ mod test { vec![1000] ); - assert_eq!(columns[1].column_name, GREPTIME_VALUE); + assert_eq!(columns[1].column_name, greptime_value()); assert_eq!(columns[1].values.as_ref().unwrap().f64_values, vec![1.0]); assert_eq!(columns[2].column_name, "tagk1"); diff --git a/src/servers/src/otlp/metrics.rs b/src/servers/src/otlp/metrics.rs index 274a0ba41e..d89cd3f277 100644 --- a/src/servers/src/otlp/metrics.rs +++ b/src/servers/src/otlp/metrics.rs @@ -15,7 +15,7 @@ use ahash::{HashMap, HashSet}; use api::v1::{RowInsertRequests, Value}; use common_grpc::precision::Precision; -use common_query::prelude::{GREPTIME_COUNT, GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{GREPTIME_COUNT, greptime_timestamp, greptime_value}; use lazy_static::lazy_static; use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; use otel_arrow_rust::proto::opentelemetry::common::v1::{AnyValue, KeyValue, any_value}; @@ -481,7 
+481,7 @@ fn write_timestamp( if legacy_mode { row_writer::write_ts_to_nanos( table, - GREPTIME_TIMESTAMP, + greptime_timestamp(), Some(time_nano), Precision::Nanosecond, row, @@ -489,7 +489,7 @@ fn write_timestamp( } else { row_writer::write_ts_to_millis( table, - GREPTIME_TIMESTAMP, + greptime_timestamp(), Some(time_nano / 1000000), Precision::Millisecond, row, @@ -571,7 +571,7 @@ fn encode_gauge( metric_ctx, )?; - write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?; + write_data_point_value(table, &mut row, greptime_value(), &data_point.value)?; table.add_row(row); } @@ -606,7 +606,7 @@ fn encode_sum( data_point.time_unix_nano as i64, metric_ctx, )?; - write_data_point_value(table, &mut row, GREPTIME_VALUE, &data_point.value)?; + write_data_point_value(table, &mut row, greptime_value(), &data_point.value)?; table.add_row(row); } @@ -680,7 +680,7 @@ fn encode_histogram( accumulated_count += count; row_writer::write_f64( &mut bucket_table, - GREPTIME_VALUE, + greptime_value(), accumulated_count as f64, &mut bucket_row, )?; @@ -700,7 +700,7 @@ fn encode_histogram( metric_ctx, )?; - row_writer::write_f64(&mut sum_table, GREPTIME_VALUE, sum, &mut sum_row)?; + row_writer::write_f64(&mut sum_table, greptime_value(), sum, &mut sum_row)?; sum_table.add_row(sum_row); } @@ -717,7 +717,7 @@ fn encode_histogram( row_writer::write_f64( &mut count_table, - GREPTIME_VALUE, + greptime_value(), data_point.count as f64, &mut count_row, )?; @@ -807,7 +807,7 @@ fn encode_summary( row_writer::write_tag(quantile_table, "quantile", quantile.quantile, &mut row)?; row_writer::write_f64( quantile_table, - GREPTIME_VALUE, + greptime_value(), quantile.value, &mut row, )?; @@ -833,7 +833,7 @@ fn encode_summary( row_writer::write_f64( count_table, - GREPTIME_VALUE, + greptime_value(), data_point.count as f64, &mut row, )?; @@ -858,7 +858,7 @@ fn encode_summary( metric_ctx, )?; - row_writer::write_f64(sum_table, GREPTIME_VALUE, data_point.sum, &mut row)?; + row_writer::write_f64(sum_table, greptime_value(), data_point.sum, &mut row)?; sum_table.add_row(row); } @@ -1494,8 +1494,8 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", - "greptime_value" + greptime_timestamp(), + greptime_value() ] ); } @@ -1544,8 +1544,8 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", - "greptime_value" + greptime_timestamp(), + greptime_value() ] ); } @@ -1594,9 +1594,9 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", + greptime_timestamp(), "quantile", - "greptime_value" + greptime_value() ] ); @@ -1612,8 +1612,8 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", - "greptime_value" + greptime_timestamp(), + greptime_value() ] ); @@ -1629,8 +1629,8 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", - "greptime_value" + greptime_timestamp(), + greptime_value() ] ); } @@ -1681,9 +1681,9 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", + greptime_timestamp(), "le", - "greptime_value", + greptime_value(), ] ); @@ -1699,8 +1699,8 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", - "greptime_value" + greptime_timestamp(), + greptime_value() ] ); @@ -1716,8 +1716,8 @@ mod tests { vec![ "otel_scope_scope", "host", - "greptime_timestamp", - "greptime_value" + greptime_timestamp(), + greptime_value() ] ); } diff --git a/src/servers/src/prom_row_builder.rs b/src/servers/src/prom_row_builder.rs index a6591dbfe1..b17048a4dd 100644 --- 
a/src/servers/src/prom_row_builder.rs +++ b/src/servers/src/prom_row_builder.rs @@ -20,7 +20,7 @@ use api::prom_store::remote::Sample; use api::v1::helper::{field_column_schema, tag_column_schema, time_index_column_schema}; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema, Row, RowInsertRequest, Rows, SemanticType, Value}; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use pipeline::{ContextOpt, ContextReq}; use prost::DecodeError; @@ -114,15 +114,18 @@ impl Default for TableBuilder { impl TableBuilder { pub(crate) fn with_capacity(cols: usize, rows: usize) -> Self { let mut col_indexes = HashMap::with_capacity_and_hasher(cols, Default::default()); - col_indexes.insert(GREPTIME_TIMESTAMP.to_string(), 0); - col_indexes.insert(GREPTIME_VALUE.to_string(), 1); + col_indexes.insert(greptime_timestamp().to_string(), 0); + col_indexes.insert(greptime_value().to_string(), 1); let mut schema = Vec::with_capacity(cols); schema.push(time_index_column_schema( - GREPTIME_TIMESTAMP, + greptime_timestamp(), ColumnDataType::TimestampMillisecond, )); - schema.push(field_column_schema(GREPTIME_VALUE, ColumnDataType::Float64)); + schema.push(field_column_schema( + greptime_value(), + ColumnDataType::Float64, + )); Self { schema, diff --git a/src/servers/src/prom_store.rs b/src/servers/src/prom_store.rs index 9738bbc8a0..81268d8663 100644 --- a/src/servers/src/prom_store.rs +++ b/src/servers/src/prom_store.rs @@ -22,7 +22,7 @@ use api::prom_store::remote::label_matcher::Type as MatcherType; use api::prom_store::remote::{Label, Query, ReadRequest, Sample, TimeSeries, WriteRequest}; use api::v1::RowInsertRequests; use common_grpc::precision::Precision; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_recordbatch::{RecordBatch, RecordBatches}; use common_telemetry::tracing; use common_time::timestamp::TimeUnit; @@ -111,8 +111,8 @@ pub fn query_to_plan(dataframe: DataFrame, q: &Query) -> Result { let mut conditions = Vec::with_capacity(label_matches.len() + 1); - conditions.push(col(GREPTIME_TIMESTAMP).gt_eq(lit_timestamp_millisecond(start_timestamp_ms))); - conditions.push(col(GREPTIME_TIMESTAMP).lt_eq(lit_timestamp_millisecond(end_timestamp_ms))); + conditions.push(col(greptime_timestamp()).gt_eq(lit_timestamp_millisecond(start_timestamp_ms))); + conditions.push(col(greptime_timestamp()).lt_eq(lit_timestamp_millisecond(end_timestamp_ms))); for m in label_matches { let name = &m.name; @@ -241,7 +241,8 @@ fn collect_timeseries_ids(table_name: &str, recordbatch: &RecordBatch) -> Vec Result> { - let ts_column = recordbatch.column_by_name(GREPTIME_TIMESTAMP).context( + let ts_column = recordbatch.column_by_name(greptime_timestamp()).context( error::InvalidPromRemoteReadQueryResultSnafu { msg: "missing greptime_timestamp column in query result", }, @@ -289,7 +290,7 @@ fn recordbatch_to_timeseries(table: &str, recordbatch: RecordBatch) -> Result Result<(RowInsertR // value row_writer::write_f64( table_data, - GREPTIME_VALUE, + greptime_value(), series.samples[0].value, &mut one_row, )?; // timestamp row_writer::write_ts_to_millis( table_data, - GREPTIME_TIMESTAMP, + greptime_timestamp(), Some(series.samples[0].timestamp), Precision::Millisecond, &mut one_row, @@ -403,11 +404,11 @@ pub fn to_grpc_row_insert_requests(request: &WriteRequest) -> Result<(RowInsertR let kvs = kvs.clone(); row_writer::write_tags(table_data, 
kvs, &mut one_row)?; // value - row_writer::write_f64(table_data, GREPTIME_VALUE, *value, &mut one_row)?; + row_writer::write_f64(table_data, greptime_value(), *value, &mut one_row)?; // timestamp row_writer::write_ts_to_millis( table_data, - GREPTIME_TIMESTAMP, + greptime_timestamp(), Some(*timestamp), Precision::Millisecond, &mut one_row, @@ -628,11 +629,11 @@ mod tests { let schema = Arc::new(Schema::new(vec![ ColumnSchema::new( - GREPTIME_TIMESTAMP, + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), true, ), - ColumnSchema::new(GREPTIME_VALUE, ConcreteDataType::float64_datatype(), true), + ColumnSchema::new(greptime_value(), ConcreteDataType::float64_datatype(), true), ColumnSchema::new("instance", ConcreteDataType::string_datatype(), true), ColumnSchema::new("job", ConcreteDataType::string_datatype(), true), ])); @@ -655,10 +656,12 @@ mod tests { let plan = query_to_plan(DataFrame::DataFusion(dataframe), &q).unwrap(); let display_string = format!("{}", plan.display_indent()); - assert_eq!( - "Filter: ?table?.greptime_timestamp >= TimestampMillisecond(1000, None) AND ?table?.greptime_timestamp <= TimestampMillisecond(2000, None)\n TableScan: ?table?", - display_string + let ts_col = greptime_timestamp(); + let expected = format!( + "Filter: ?table?.{} >= TimestampMillisecond(1000, None) AND ?table?.{} <= TimestampMillisecond(2000, None)\n TableScan: ?table?", + ts_col, ts_col ); + assert_eq!(expected, display_string); let q = Query { start_timestamp_ms: 1000, @@ -687,22 +690,24 @@ mod tests { let plan = query_to_plan(DataFrame::DataFusion(dataframe), &q).unwrap(); let display_string = format!("{}", plan.display_indent()); - assert_eq!( - "Filter: ?table?.greptime_timestamp >= TimestampMillisecond(1000, None) AND ?table?.greptime_timestamp <= TimestampMillisecond(2000, None) AND regexp_match(?table?.job, Utf8(\"*prom*\")) IS NOT NULL AND ?table?.instance != Utf8(\"localhost\")\n TableScan: ?table?", - display_string + let ts_col = greptime_timestamp(); + let expected = format!( + "Filter: ?table?.{} >= TimestampMillisecond(1000, None) AND ?table?.{} <= TimestampMillisecond(2000, None) AND regexp_match(?table?.job, Utf8(\"*prom*\")) IS NOT NULL AND ?table?.instance != Utf8(\"localhost\")\n TableScan: ?table?", + ts_col, ts_col ); + assert_eq!(expected, display_string); } fn column_schemas_with( mut kts_iter: Vec<(&str, ColumnDataType, SemanticType)>, ) -> Vec { kts_iter.push(( - "greptime_value", + greptime_value(), ColumnDataType::Float64, SemanticType::Field, )); kts_iter.push(( - "greptime_timestamp", + greptime_timestamp(), ColumnDataType::TimestampMillisecond, SemanticType::Timestamp, )); @@ -837,11 +842,11 @@ mod tests { fn test_recordbatches_to_timeseries() { let schema = Arc::new(Schema::new(vec![ ColumnSchema::new( - GREPTIME_TIMESTAMP, + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), true, ), - ColumnSchema::new(GREPTIME_VALUE, ConcreteDataType::float64_datatype(), true), + ColumnSchema::new(greptime_value(), ConcreteDataType::float64_datatype(), true), ColumnSchema::new("instance", ConcreteDataType::string_datatype(), true), ])); diff --git a/src/servers/src/proto.rs b/src/servers/src/proto.rs index 564943a152..1ef01c443b 100644 --- a/src/servers/src/proto.rs +++ b/src/servers/src/proto.rs @@ -19,7 +19,7 @@ use std::slice; use api::prom_store::remote::Sample; use bytes::{Buf, Bytes}; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use 
common_telemetry::warn; use pipeline::{ContextReq, GreptimePipelineParams, PipelineContext, PipelineDefinition}; use prost::DecodeError; @@ -407,10 +407,10 @@ impl PromSeriesProcessor { let timestamp = s.timestamp; pipeline_map.insert( - KeyString::from(GREPTIME_TIMESTAMP), + KeyString::from(greptime_timestamp()), VrlValue::Integer(timestamp), ); - pipeline_map.insert(KeyString::from(GREPTIME_VALUE), VrlValue::Float(value)); + pipeline_map.insert(KeyString::from(greptime_value()), VrlValue::Float(value)); if one_sample { vec_pipeline_map.push(VrlValue::Object(pipeline_map)); break; diff --git a/src/standalone/src/options.rs b/src/standalone/src/options.rs index 20dbcbb850..abbfcf64e2 100644 --- a/src/standalone/src/options.rs +++ b/src/standalone/src/options.rs @@ -37,6 +37,7 @@ use servers::http::HttpOptions; pub struct StandaloneOptions { pub enable_telemetry: bool, pub default_timezone: Option, + pub default_column_prefix: Option, pub http: HttpOptions, pub grpc: GrpcOptions, pub mysql: MysqlOptions, @@ -69,6 +70,7 @@ impl Default for StandaloneOptions { Self { enable_telemetry: true, default_timezone: None, + default_column_prefix: None, http: HttpOptions::default(), grpc: GrpcOptions::default(), mysql: MysqlOptions::default(), From 785f9d7fd7531e85fc793e5ab3a87c32f9dfe25b Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Mon, 27 Oct 2025 16:07:51 +0800 Subject: [PATCH 014/149] fix: add delays in reconcile tests for async cache invalidation (#7147) Signed-off-by: WenyXu --- src/frontend/src/heartbeat.rs | 3 +++ tests-integration/src/tests/reconcile_table.rs | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/src/frontend/src/heartbeat.rs b/src/frontend/src/heartbeat.rs index 95645ad1ca..9c3954b0c6 100644 --- a/src/frontend/src/heartbeat.rs +++ b/src/frontend/src/heartbeat.rs @@ -104,6 +104,9 @@ impl HeartbeatTask { match resp_stream.message().await { Ok(Some(resp)) => { debug!("Receiving heartbeat response: {:?}", resp); + if let Some(message) = &resp.mailbox_message { + info!("Received mailbox message: {message:?}"); + } let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), resp); if let Err(e) = capture_self.handle_response(ctx).await { error!(e; "Error while handling heartbeat response"); diff --git a/tests-integration/src/tests/reconcile_table.rs b/tests-integration/src/tests/reconcile_table.rs index bd83a7d930..3e8414436d 100644 --- a/tests-integration/src/tests/reconcile_table.rs +++ b/tests-integration/src/tests/reconcile_table.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::time::Duration; + use client::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, OutputData}; use common_meta::reconciliation::ResolveStrategy; use common_meta::reconciliation::manager::ReconciliationManagerRef; @@ -165,6 +167,8 @@ async fn test_reconcile_dropped_column() { "grpc_latencies", ) .await; + // Try best effort to wait for the cache to be invalidated. + tokio::time::sleep(Duration::from_secs(1)).await; // Now we should able to query table again. let output = execute_sql(&frontend, "SELECT * FROM grpc_latencies ORDER BY host").await; @@ -268,6 +272,8 @@ async fn test_reconcile_added_column() { "grpc_latencies", ) .await; + // Try best effort to wait for the cache to be invalidated. + tokio::time::sleep(Duration::from_secs(1)).await; // Now the column cloud_provider is available. 
let output = execute_sql(&frontend, "SELECT * FROM grpc_latencies ORDER BY host").await; @@ -342,6 +348,8 @@ async fn test_reconcile_modify_column_type() { "grpc_latencies", ) .await; + // Try best effort to wait for the cache to be invalidated. + tokio::time::sleep(Duration::from_secs(1)).await; // Now we can query the table again. let output = execute_sql(&frontend, "SELECT * FROM grpc_latencies ORDER BY host").await; From f2bc92b9e60f57cd418b5f691cf9cd72767bd34c Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:09:48 +0800 Subject: [PATCH 015/149] refactor: use generic for heartbeat instruction handler (#7149) * refactor: use generic Signed-off-by: discord9 * w Signed-off-by: discord9 * per review Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/datanode/src/heartbeat/handler.rs | 113 ++++++++++++++---- .../src/heartbeat/handler/close_region.rs | 9 +- .../src/heartbeat/handler/downgrade_region.rs | 44 +++---- .../src/heartbeat/handler/flush_region.rs | 32 ++--- .../src/heartbeat/handler/open_region.rs | 7 +- .../src/heartbeat/handler/upgrade_region.rs | 42 +++---- 6 files changed, 148 insertions(+), 99 deletions(-) diff --git a/src/datanode/src/heartbeat/handler.rs b/src/datanode/src/heartbeat/handler.rs index 71b3181a04..8566f8806c 100644 --- a/src/datanode/src/heartbeat/handler.rs +++ b/src/datanode/src/heartbeat/handler.rs @@ -47,10 +47,11 @@ pub struct RegionHeartbeatResponseHandler { #[async_trait::async_trait] pub trait InstructionHandler: Send + Sync { + type Instruction; async fn handle( &self, ctx: &HandlerContext, - instruction: Instruction, + instruction: Self::Instruction, ) -> Option; } @@ -93,39 +94,101 @@ impl RegionHeartbeatResponseHandler { self } - fn build_handler(&self, instruction: &Instruction) -> MetaResult> { + fn build_handler(&self, instruction: &Instruction) -> MetaResult> { match instruction { - Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler)), - Instruction::OpenRegions(_) => Ok(Box::new(OpenRegionsHandler { - open_region_parallelism: self.open_region_parallelism, - })), - Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler)), - Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler)), - Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler)), + Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler.into())), + Instruction::OpenRegions(_) => Ok(Box::new( + OpenRegionsHandler { + open_region_parallelism: self.open_region_parallelism, + } + .into(), + )), + Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())), + Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())), + Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler.into())), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), } } } +#[allow(clippy::enum_variant_names)] +pub enum InstructionHandlers { + CloseRegions(CloseRegionsHandler), + OpenRegions(OpenRegionsHandler), + FlushRegions(FlushRegionsHandler), + DowngradeRegions(DowngradeRegionsHandler), + UpgradeRegions(UpgradeRegionsHandler), +} + +macro_rules! 
impl_from_handler { + ($($handler:ident => $variant:ident),*) => { + $( + impl From<$handler> for InstructionHandlers { + fn from(handler: $handler) -> Self { + InstructionHandlers::$variant(handler) + } + } + )* + }; +} + +impl_from_handler!( + CloseRegionsHandler => CloseRegions, + OpenRegionsHandler => OpenRegions, + FlushRegionsHandler => FlushRegions, + DowngradeRegionsHandler => DowngradeRegions, + UpgradeRegionsHandler => UpgradeRegions +); + +macro_rules! dispatch_instr { + ( + $( $instr_variant:ident => $handler_variant:ident ),* $(,)? + ) => { + impl InstructionHandlers { + pub async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option { + match (self, instruction) { + $( + ( + InstructionHandlers::$handler_variant(handler), + Instruction::$instr_variant(instr), + ) => handler.handle(ctx, instr).await, + )* + // Safety: must be used in pairs with `build_handler`. + _ => unreachable!(), + } + } + /// Check whether this instruction is acceptable by any handler. + pub fn is_acceptable(instruction: &Instruction) -> bool { + matches!( + instruction, + $( + Instruction::$instr_variant { .. } + )|* + ) + } + } + }; +} + +dispatch_instr!( + CloseRegions => CloseRegions, + OpenRegions => OpenRegions, + FlushRegions => FlushRegions, + DowngradeRegions => DowngradeRegions, + UpgradeRegion => UpgradeRegions, +); + #[async_trait] impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { fn is_acceptable(&self, ctx: &HeartbeatResponseHandlerContext) -> bool { - matches!(ctx.incoming_message.as_ref(), |Some(( - _, - Instruction::DowngradeRegions { .. }, - ))| Some(( - _, - Instruction::UpgradeRegion { .. } - )) | Some(( - _, - Instruction::FlushRegions { .. } - )) | Some(( - _, - Instruction::OpenRegions { .. } - )) | Some(( - _, - Instruction::CloseRegions { .. } - ))) + if let Some((_, instruction)) = ctx.incoming_message.as_ref() { + return InstructionHandlers::is_acceptable(instruction); + } + false } async fn handle(&self, ctx: &mut HeartbeatResponseHandlerContext) -> MetaResult { diff --git a/src/datanode/src/heartbeat/handler/close_region.rs b/src/datanode/src/heartbeat/handler/close_region.rs index 88ed043fab..770d6a75cc 100644 --- a/src/datanode/src/heartbeat/handler/close_region.rs +++ b/src/datanode/src/heartbeat/handler/close_region.rs @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; +use common_meta::RegionIdent; +use common_meta::instruction::{InstructionReply, SimpleReply}; use common_telemetry::warn; use futures::future::join_all; use store_api::region_request::{RegionCloseRequest, RegionRequest}; @@ -26,13 +27,13 @@ pub struct CloseRegionsHandler; #[async_trait::async_trait] impl InstructionHandler for CloseRegionsHandler { + type Instruction = Vec; + async fn handle( &self, ctx: &HandlerContext, - instruction: Instruction, + region_idents: Self::Instruction, ) -> Option { - // Safety: must be `Instruction::CloseRegions` instruction. 
- let region_idents = instruction.into_close_regions().unwrap(); let region_ids = region_idents .into_iter() .map(|region_ident| RegionId::new(region_ident.table_id, region_ident.region_number)) diff --git a/src/datanode/src/heartbeat/handler/downgrade_region.rs b/src/datanode/src/heartbeat/handler/downgrade_region.rs index 91ceddb91a..779023a52f 100644 --- a/src/datanode/src/heartbeat/handler/downgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/downgrade_region.rs @@ -13,7 +13,7 @@ // limitations under the License. use common_meta::instruction::{ - DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply, + DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, InstructionReply, }; use common_telemetry::tracing::info; use common_telemetry::{error, warn}; @@ -156,13 +156,13 @@ impl DowngradeRegionsHandler { #[async_trait::async_trait] impl InstructionHandler for DowngradeRegionsHandler { + type Instruction = Vec; + async fn handle( &self, ctx: &HandlerContext, - instruction: Instruction, + downgrade_regions: Self::Instruction, ) -> Option { - // Safety: must be `Instruction::DowngradeRegion` instruction. - let downgrade_regions = instruction.into_downgrade_regions().unwrap(); let futures = downgrade_regions .into_iter() .map(|downgrade_region| Self::handle_downgrade_region(ctx, downgrade_region)); @@ -263,10 +263,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout, - }]), + }], ) .await; @@ -306,10 +306,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout, - }]), + }], ) .await; @@ -341,10 +341,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout: Some(flush_timeout), - }]), + }], ) .await; @@ -380,10 +380,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout, - }]), + }], ) .await; @@ -396,10 +396,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout: Some(Duration::from_millis(500)), - }]), + }], ) .await; // Must less than 300 ms. @@ -443,10 +443,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout, - }]), + }], ) .await; let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; @@ -458,10 +458,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout: Some(Duration::from_millis(500)), - }]), + }], ) .await; // Must less than 300 ms. 
@@ -487,10 +487,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout: None, - }]), + }], ) .await; let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; @@ -518,10 +518,10 @@ mod tests { let reply = DowngradeRegionsHandler .handle( &handler_context, - Instruction::DowngradeRegions(vec![DowngradeRegion { + vec![DowngradeRegion { region_id, flush_timeout: None, - }]), + }], ) .await; let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; diff --git a/src/datanode/src/heartbeat/handler/flush_region.rs b/src/datanode/src/heartbeat/handler/flush_region.rs index 56b841bf00..721673432e 100644 --- a/src/datanode/src/heartbeat/handler/flush_region.rs +++ b/src/datanode/src/heartbeat/handler/flush_region.rs @@ -15,7 +15,7 @@ use std::time::Instant; use common_meta::instruction::{ - FlushErrorStrategy, FlushRegionReply, FlushStrategy, Instruction, InstructionReply, + FlushErrorStrategy, FlushRegionReply, FlushRegions, FlushStrategy, InstructionReply, }; use common_telemetry::{debug, warn}; use store_api::region_request::{RegionFlushRequest, RegionRequest}; @@ -28,13 +28,14 @@ pub struct FlushRegionsHandler; #[async_trait::async_trait] impl InstructionHandler for FlushRegionsHandler { + type Instruction = FlushRegions; + async fn handle( &self, ctx: &HandlerContext, - instruction: Instruction, + flush_regions: FlushRegions, ) -> Option { let start_time = Instant::now(); - let flush_regions = instruction.into_flush_regions().unwrap(); let strategy = flush_regions.strategy; let region_ids = flush_regions.region_ids; let error_strategy = flush_regions.error_strategy; @@ -205,10 +206,7 @@ mod tests { // Async hint mode let flush_instruction = FlushRegions::async_batch(region_ids.clone()); let reply = FlushRegionsHandler - .handle( - &handler_context, - Instruction::FlushRegions(flush_instruction), - ) + .handle(&handler_context, flush_instruction) .await; assert!(reply.is_none()); // Hint mode returns no reply assert_eq!(*flushed_region_ids.read().unwrap(), region_ids); @@ -218,10 +216,7 @@ mod tests { let not_found_region_ids = (0..2).map(|i| RegionId::new(2048, i)).collect::>(); let flush_instruction = FlushRegions::async_batch(not_found_region_ids); let reply = FlushRegionsHandler - .handle( - &handler_context, - Instruction::FlushRegions(flush_instruction), - ) + .handle(&handler_context, flush_instruction) .await; assert!(reply.is_none()); assert!(flushed_region_ids.read().unwrap().is_empty()); @@ -247,10 +242,7 @@ mod tests { let flush_instruction = FlushRegions::sync_single(region_id); let reply = FlushRegionsHandler - .handle( - &handler_context, - Instruction::FlushRegions(flush_instruction), - ) + .handle(&handler_context, flush_instruction) .await; let flush_reply = reply.unwrap().expect_flush_regions_reply(); assert!(flush_reply.overall_success); @@ -287,10 +279,7 @@ mod tests { let flush_instruction = FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::FailFast); let reply = FlushRegionsHandler - .handle( - &handler_context, - Instruction::FlushRegions(flush_instruction), - ) + .handle(&handler_context, flush_instruction) .await; let flush_reply = reply.unwrap().expect_flush_regions_reply(); assert!(!flush_reply.overall_success); // Should fail due to non-existent regions @@ -321,10 +310,7 @@ mod tests { let flush_instruction = FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::TryAll); let reply = 
FlushRegionsHandler - .handle( - &handler_context, - Instruction::FlushRegions(flush_instruction), - ) + .handle(&handler_context, flush_instruction) .await; let flush_reply = reply.unwrap().expect_flush_regions_reply(); assert!(!flush_reply.overall_success); // Should fail due to one non-existent region diff --git a/src/datanode/src/heartbeat/handler/open_region.rs b/src/datanode/src/heartbeat/handler/open_region.rs index 77cd4fe6a0..76ca806a98 100644 --- a/src/datanode/src/heartbeat/handler/open_region.rs +++ b/src/datanode/src/heartbeat/handler/open_region.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply}; +use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply}; use common_meta::wal_options_allocator::prepare_wal_options; use store_api::path_utils::table_dir; use store_api::region_request::{PathType, RegionOpenRequest}; @@ -26,13 +26,12 @@ pub struct OpenRegionsHandler { #[async_trait::async_trait] impl InstructionHandler for OpenRegionsHandler { + type Instruction = Vec; async fn handle( &self, ctx: &HandlerContext, - instruction: Instruction, + open_regions: Self::Instruction, ) -> Option { - let open_regions = instruction.into_open_regions().unwrap(); - let requests = open_regions .into_iter() .map(|open_region| { diff --git a/src/datanode/src/heartbeat/handler/upgrade_region.rs b/src/datanode/src/heartbeat/handler/upgrade_region.rs index 239eaf1e4c..06769dcb77 100644 --- a/src/datanode/src/heartbeat/handler/upgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/upgrade_region.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply}; +use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply}; use common_telemetry::{info, warn}; use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint}; @@ -24,12 +24,12 @@ pub struct UpgradeRegionsHandler; #[async_trait::async_trait] impl InstructionHandler for UpgradeRegionsHandler { + type Instruction = UpgradeRegion; + async fn handle( &self, ctx: &HandlerContext, - instruction: Instruction, - ) -> Option { - let UpgradeRegion { + UpgradeRegion { region_id, last_entry_id, metadata_last_entry_id, @@ -37,8 +37,8 @@ impl InstructionHandler for UpgradeRegionsHandler { location_id, replay_entry_id, metadata_replay_entry_id, - } = instruction.into_upgrade_regions().unwrap(); - + }: UpgradeRegion, + ) -> Option { let Some(writable) = ctx.region_server.is_region_leader(region_id) else { return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { ready: false, @@ -138,7 +138,7 @@ impl InstructionHandler for UpgradeRegionsHandler { mod tests { use std::time::Duration; - use common_meta::instruction::{Instruction, UpgradeRegion}; + use common_meta::instruction::UpgradeRegion; use mito2::engine::MITO_ENGINE_NAME; use store_api::region_engine::RegionRole; use store_api::storage::RegionId; @@ -164,11 +164,11 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, replay_timeout, ..Default::default() - }), + }, ) .await; @@ -201,11 +201,11 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, replay_timeout, ..Default::default() - }), + }, ) .await; @@ -239,11 +239,11 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, replay_timeout, ..Default::default() - }), + }, ) .await; @@ -280,11 +280,11 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, replay_timeout, ..Default::default() - }), + }, ) .await; @@ -298,11 +298,11 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, replay_timeout: Some(Duration::from_millis(500)), ..Default::default() - }), + }, ) .await; // Must less than 300 ms. 
@@ -339,10 +339,10 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, ..Default::default() - }), + }, ) .await; @@ -355,11 +355,11 @@ mod tests { let reply = UpgradeRegionsHandler .handle( &handler_context, - Instruction::UpgradeRegion(UpgradeRegion { + UpgradeRegion { region_id, replay_timeout: Some(Duration::from_millis(200)), ..Default::default() - }), + }, ) .await; From da976e534db807edfa2afaa0c964bba06eb45f18 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:16:00 +0800 Subject: [PATCH 016/149] refactor: add test feature gate to numbers table (#7148) * refactor: add test feature gate to numbers table Signed-off-by: shuiyisong * chore: add debug_assertions Signed-off-by: shuiyisong * refactor: extract numbers table provider Signed-off-by: shuiyisong * chore: address CR issues Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/catalog/src/kvbackend/builder.rs | 2 + src/catalog/src/kvbackend/manager.rs | 16 +++-- src/catalog/src/system_schema.rs | 1 + .../system_schema/numbers_table_provider.rs | 59 +++++++++++++++++++ 4 files changed, 69 insertions(+), 9 deletions(-) create mode 100644 src/catalog/src/system_schema/numbers_table_provider.rs diff --git a/src/catalog/src/kvbackend/builder.rs b/src/catalog/src/kvbackend/builder.rs index 51a6154948..247a111124 100644 --- a/src/catalog/src/kvbackend/builder.rs +++ b/src/catalog/src/kvbackend/builder.rs @@ -29,6 +29,7 @@ use crate::information_schema::{InformationExtensionRef, InformationSchemaProvid use crate::kvbackend::KvBackendCatalogManager; use crate::kvbackend::manager::{CATALOG_CACHE_MAX_CAPACITY, SystemCatalog}; use crate::process_manager::ProcessManagerRef; +use crate::system_schema::numbers_table_provider::NumbersTableProvider; use crate::system_schema::pg_catalog::PGCatalogProvider; pub struct KvBackendCatalogManagerBuilder { @@ -119,6 +120,7 @@ impl KvBackendCatalogManagerBuilder { DEFAULT_CATALOG_NAME.to_string(), me.clone(), )), + numbers_table_provider: NumbersTableProvider, backend, process_manager, #[cfg(feature = "enterprise")] diff --git a/src/catalog/src/kvbackend/manager.rs b/src/catalog/src/kvbackend/manager.rs index 902f15c09e..29e0cc4ce8 100644 --- a/src/catalog/src/kvbackend/manager.rs +++ b/src/catalog/src/kvbackend/manager.rs @@ -18,8 +18,7 @@ use std::sync::{Arc, Weak}; use async_stream::try_stream; use common_catalog::consts::{ - DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, NUMBERS_TABLE_ID, - PG_CATALOG_NAME, + DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME, }; use common_error::ext::BoxedError; use common_meta::cache::{ @@ -45,7 +44,6 @@ use table::TableRef; use table::dist_table::DistTable; use table::metadata::{TableId, TableInfoRef}; use table::table::PartitionRules; -use table::table::numbers::{NUMBERS_TABLE_NAME, NumbersTable}; use table::table_name::TableName; use tokio::sync::Semaphore; use tokio_stream::wrappers::ReceiverStream; @@ -61,6 +59,7 @@ use crate::information_schema::{InformationExtensionRef, InformationSchemaProvid use crate::kvbackend::TableCacheRef; use crate::process_manager::ProcessManagerRef; use crate::system_schema::SystemSchemaProvider; +use crate::system_schema::numbers_table_provider::NumbersTableProvider; use crate::system_schema::pg_catalog::PGCatalogProvider; /// Access all existing catalog, schema and tables. 
@@ -555,6 +554,7 @@ pub(super) struct SystemCatalog { // system_schema_provider for default catalog pub(super) information_schema_provider: Arc, pub(super) pg_catalog_provider: Arc, + pub(super) numbers_table_provider: NumbersTableProvider, pub(super) backend: KvBackendRef, pub(super) process_manager: Option, #[cfg(feature = "enterprise")] @@ -584,9 +584,7 @@ impl SystemCatalog { PG_CATALOG_NAME if channel == Channel::Postgres => { self.pg_catalog_provider.table_names() } - DEFAULT_SCHEMA_NAME => { - vec![NUMBERS_TABLE_NAME.to_string()] - } + DEFAULT_SCHEMA_NAME => self.numbers_table_provider.table_names(), _ => vec![], } } @@ -604,7 +602,7 @@ impl SystemCatalog { if schema == INFORMATION_SCHEMA_NAME { self.information_schema_provider.table(table).is_some() } else if schema == DEFAULT_SCHEMA_NAME { - table == NUMBERS_TABLE_NAME + self.numbers_table_provider.table_exists(table) } else if schema == PG_CATALOG_NAME && channel == Channel::Postgres { self.pg_catalog_provider.table(table).is_some() } else { @@ -649,8 +647,8 @@ impl SystemCatalog { }); pg_catalog_provider.table(table_name) } - } else if schema == DEFAULT_SCHEMA_NAME && table_name == NUMBERS_TABLE_NAME { - Some(NumbersTable::table(NUMBERS_TABLE_ID)) + } else if schema == DEFAULT_SCHEMA_NAME { + self.numbers_table_provider.table(table_name) } else { None } diff --git a/src/catalog/src/system_schema.rs b/src/catalog/src/system_schema.rs index c813ab6ab7..2e1c890427 100644 --- a/src/catalog/src/system_schema.rs +++ b/src/catalog/src/system_schema.rs @@ -14,6 +14,7 @@ pub mod information_schema; mod memory_table; +pub mod numbers_table_provider; pub mod pg_catalog; pub mod predicate; mod utils; diff --git a/src/catalog/src/system_schema/numbers_table_provider.rs b/src/catalog/src/system_schema/numbers_table_provider.rs new file mode 100644 index 0000000000..6ea6d554b7 --- /dev/null +++ b/src/catalog/src/system_schema/numbers_table_provider.rs @@ -0,0 +1,59 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[cfg(any(test, feature = "testing", debug_assertions))] +use common_catalog::consts::NUMBERS_TABLE_ID; +use table::TableRef; +#[cfg(any(test, feature = "testing", debug_assertions))] +use table::table::numbers::NUMBERS_TABLE_NAME; +#[cfg(any(test, feature = "testing", debug_assertions))] +use table::table::numbers::NumbersTable; + +// NumbersTableProvider is a dedicated provider for feature-gating the numbers table. 
+#[derive(Clone)] +pub struct NumbersTableProvider; + +#[cfg(any(test, feature = "testing", debug_assertions))] +impl NumbersTableProvider { + pub(crate) fn table_exists(&self, name: &str) -> bool { + name == NUMBERS_TABLE_NAME + } + + pub(crate) fn table_names(&self) -> Vec { + vec![NUMBERS_TABLE_NAME.to_string()] + } + + pub(crate) fn table(&self, name: &str) -> Option { + if name == NUMBERS_TABLE_NAME { + Some(NumbersTable::table(NUMBERS_TABLE_ID)) + } else { + None + } + } +} + +#[cfg(not(any(test, feature = "testing", debug_assertions)))] +impl NumbersTableProvider { + pub(crate) fn table_exists(&self, _name: &str) -> bool { + false + } + + pub(crate) fn table_names(&self) -> Vec { + vec![] + } + + pub(crate) fn table(&self, _name: &str) -> Option { + None + } +} From 22d9eb69301e9ff4b1a9ded4685a47541b125a11 Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Tue, 28 Oct 2025 10:44:29 +0800 Subject: [PATCH 017/149] feat: part sort provide dyn filter (#7140) * feat: part sort provide dyn filter Signed-off-by: discord9 * fix: reset_state reset dynamic filter Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/query/src/part_sort.rs | 79 +++++++++++++++++++++++++++++++++++-- src/table/src/table/scan.rs | 13 ++++++ 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/src/query/src/part_sort.rs b/src/query/src/part_sort.rs index 64ba76a149..ebf4fddc1e 100644 --- a/src/query/src/part_sort.rs +++ b/src/query/src/part_sort.rs @@ -30,14 +30,18 @@ use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream}; use datafusion::common::arrow::compute::sort_to_indices; use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion::execution::{RecordBatchStream, TaskContext}; +use datafusion::physical_plan::execution_plan::CardinalityEffect; +use datafusion::physical_plan::filter_pushdown::{ + ChildFilterDescription, FilterDescription, FilterPushdownPhase, +}; use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, TopK, TopKDynamicFilters, }; use datafusion_common::{DataFusionError, internal_err}; -use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use futures::{Stream, StreamExt}; use itertools::Itertools; use parking_lot::RwLock; @@ -61,6 +65,10 @@ pub struct PartSortExec { metrics: ExecutionPlanMetricsSet, partition_ranges: Vec>, properties: PlanProperties, + /// Filter matching the state of the sort for dynamic filter pushdown. + /// If `limit` is `Some`, this will also be set and a TopK operator may be used. + /// If `limit` is `None`, this will be `None`. + filter: Option>>, } impl PartSortExec { @@ -79,6 +87,10 @@ impl PartSortExec { properties.boundedness, ); + let filter = limit + .is_some() + .then(|| Self::create_filter(expression.expr.clone())); + Self { expression, limit, @@ -86,9 +98,17 @@ impl PartSortExec { metrics, partition_ranges, properties, + filter, } } + /// Add or reset `self.filter` to a new `TopKDynamicFilters`. 
+ fn create_filter(expr: Arc) -> Arc> { + Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new( + DynamicFilterPhysicalExpr::new(vec![expr], lit(true)), + )))) + } + pub fn to_stream( &self, context: Arc, @@ -113,6 +133,7 @@ impl PartSortExec { input_stream, self.partition_ranges[partition].clone(), partition, + self.filter.clone(), )?) as _; Ok(df_stream) @@ -192,6 +213,51 @@ impl ExecutionPlan for PartSortExec { fn benefits_from_input_partitioning(&self) -> Vec { vec![false] } + + fn cardinality_effect(&self) -> CardinalityEffect { + if self.limit.is_none() { + CardinalityEffect::Equal + } else { + CardinalityEffect::LowerEqual + } + } + + fn gather_filters_for_pushdown( + &self, + phase: FilterPushdownPhase, + parent_filters: Vec>, + _config: &datafusion::config::ConfigOptions, + ) -> datafusion_common::Result { + if !matches!(phase, FilterPushdownPhase::Post) { + return FilterDescription::from_children(parent_filters, &self.children()); + } + + let mut child = ChildFilterDescription::from_child(&parent_filters, &self.input)?; + + if let Some(filter) = &self.filter { + child = child.with_self_filter(filter.read().expr()); + } + + Ok(FilterDescription::new().with_child(child)) + } + + fn reset_state(self: Arc) -> datafusion_common::Result> { + // shared dynamic filter needs to be reset + let new_filter = self + .limit + .is_some() + .then(|| Self::create_filter(self.expression.expr.clone())); + + Ok(Arc::new(Self { + expression: self.expression.clone(), + limit: self.limit, + input: self.input.clone(), + metrics: self.metrics.clone(), + partition_ranges: self.partition_ranges.clone(), + properties: self.properties.clone(), + filter: new_filter, + })) + } } enum PartSortBuffer { @@ -240,11 +306,16 @@ impl PartSortStream { input: DfSendableRecordBatchStream, partition_ranges: Vec, partition: usize, + filter: Option>>, ) -> datafusion_common::Result { let buffer = if let Some(limit) = limit { - let filter = Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new( - DynamicFilterPhysicalExpr::new(vec![], lit(true)), - )))); + let Some(filter) = filter else { + return internal_err!( + "TopKDynamicFilters must be provided when limit is set at {}", + snafu::location!() + ); + }; + PartSortBuffer::Top( TopK::try_new( partition, diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs index a60215618b..1dc3982ed2 100644 --- a/src/table/src/table/scan.rs +++ b/src/table/src/table/scan.rs @@ -26,6 +26,9 @@ use common_telemetry::warn; use datafusion::error::Result as DfResult; use datafusion::execution::context::TaskContext; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::filter_pushdown::{ + ChildPushdownResult, FilterPushdownPhase, FilterPushdownPropagation, +}; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, @@ -361,6 +364,16 @@ impl ExecutionPlan for RegionScanExec { fn name(&self) -> &str { "RegionScanExec" } + + fn handle_child_pushdown_result( + &self, + _phase: FilterPushdownPhase, + child_pushdown_result: ChildPushdownResult, + _config: &datafusion::config::ConfigOptions, + ) -> DfResult>> { + // TODO(discord9): use the pushdown result to update the scanner's predicate + Ok(FilterPushdownPropagation::if_all(child_pushdown_result)) + } } impl DisplayAs for RegionScanExec { From a9d1d3313888d5ad92307ddf8e730e03f770e760 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Wed, 29 Oct 2025 02:42:03 +0800 
Subject: [PATCH 018/149] feat: update datafusion-pg-catalog for better dbeaver support (#7143) * chore: update datafusion-pg-catalog to 0.12.1 * feat: import more udfs --- Cargo.lock | 37 +++++++++----------- Cargo.toml | 2 +- src/common/function/src/system/pg_catalog.rs | 7 ++-- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4ff29fb70..71caf8c8b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3717,9 +3717,9 @@ dependencies = [ [[package]] name = "datafusion-pg-catalog" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f258caedd1593e7dca3bf53912249de6685fa224bcce897ede1fbb7b040ac6f6" +checksum = "15824c98ff2009c23b0398d441499b147f7c5ac0e5ee993e7a473d79040e3626" dependencies = [ "async-trait", "datafusion", @@ -6307,17 +6307,6 @@ dependencies = [ "derive_utils", ] -[[package]] -name = "io-uring" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" -dependencies = [ - "bitflags 2.9.1", - "cfg-if", - "libc", -] - [[package]] name = "ipnet" version = "2.11.0" @@ -13256,23 +13245,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.47.1" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "backtrace", "bytes", - "io-uring", "libc", "mio", "parking_lot 0.12.4", "pin-project-lite", "signal-hook-registry", - "slab", "socket2 0.6.0", "tokio-macros", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -13287,9 +13273,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", @@ -14711,6 +14697,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", +] + [[package]] name = "windows-targets" version = "0.48.5" diff --git a/Cargo.toml b/Cargo.toml index ebafce51ba..e51b0d7632 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -130,7 +130,7 @@ datafusion-functions = "50" datafusion-functions-aggregate-common = "50" datafusion-optimizer = "50" datafusion-orc = "0.5" -datafusion-pg-catalog = "0.11" +datafusion-pg-catalog = "0.12.1" datafusion-physical-expr = "50" datafusion-physical-plan = "50" datafusion-sql = "50" diff --git a/src/common/function/src/system/pg_catalog.rs b/src/common/function/src/system/pg_catalog.rs index b66e208ea9..4ea378b53a 100644 --- a/src/common/function/src/system/pg_catalog.rs +++ b/src/common/function/src/system/pg_catalog.rs @@ -191,7 +191,10 @@ impl PGCatalogFunction { registry.register(pg_catalog::create_pg_get_userbyid_udf()); registry.register(pg_catalog::create_pg_table_is_visible()); registry.register(pg_catalog::pg_get_expr_udf::create_pg_get_expr_udf()); - // TODO(sunng87): upgrade datafusion to add - 
//registry.register(pg_catalog::create_pg_encoding_to_char_udf()); + registry.register(pg_catalog::create_pg_encoding_to_char_udf()); + registry.register(pg_catalog::create_pg_relation_size_udf()); + registry.register(pg_catalog::create_pg_total_relation_size_udf()); + registry.register(pg_catalog::create_pg_stat_get_numscans()); + registry.register(pg_catalog::create_pg_get_constraintdef()); } } From 37bc2e6b076dfa6f5e7d1cc558e19e0688551024 Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:59:36 +0800 Subject: [PATCH 019/149] feat: gc worker heartbeat instruction (#7118) again false by default test: config api refactor: per code review less info! even less info!! docs: gc regions instr refactor: grp by region id per code review per review error handling? test: fix todos aft rebase fix after refactor Signed-off-by: discord9 --- src/common/meta/src/datanode.rs | 58 +++++- src/common/meta/src/instruction.rs | 136 ++++++++++++- src/datanode/src/error.rs | 25 ++- src/datanode/src/heartbeat.rs | 21 +- src/datanode/src/heartbeat/handler.rs | 21 +- .../src/heartbeat/handler/file_ref.rs | 62 ++++++ .../src/heartbeat/handler/gc_worker.rs | 156 +++++++++++++++ src/datanode/src/region_server.rs | 33 +++- src/meta-srv/src/service/admin/heartbeat.rs | 2 +- src/mito2/src/config.rs | 4 + src/mito2/src/engine.rs | 32 ++- src/mito2/src/error.rs | 8 +- src/mito2/src/gc.rs | 182 +++++++++++++----- src/mito2/src/metrics.rs | 6 +- src/mito2/src/region.rs | 4 + src/mito2/src/sst/file_purger.rs | 1 + src/mito2/src/sst/file_ref.rs | 145 +++++++------- src/mito2/src/worker.rs | 11 ++ src/store-api/src/storage.rs | 2 +- src/store-api/src/storage/descriptors.rs | 26 ++- src/store-api/src/storage/file.rs | 73 +++++++ tests-integration/tests/http.rs | 7 + 22 files changed, 869 insertions(+), 146 deletions(-) create mode 100644 src/datanode/src/heartbeat/handler/file_ref.rs create mode 100644 src/datanode/src/heartbeat/handler/gc_worker.rs diff --git a/src/common/meta/src/datanode.rs b/src/common/meta/src/datanode.rs index 2083b5886b..ffa85b4a7e 100644 --- a/src/common/meta/src/datanode.rs +++ b/src/common/meta/src/datanode.rs @@ -25,8 +25,7 @@ use store_api::region_engine::{RegionRole, RegionStatistic}; use store_api::storage::RegionId; use table::metadata::TableId; -use crate::error; -use crate::error::Result; +use crate::error::{self, DeserializeFromJsonSnafu, Result}; use crate::heartbeat::utils::get_datanode_workloads; const DATANODE_STAT_PREFIX: &str = "__meta_datanode_stat"; @@ -66,10 +65,12 @@ pub struct Stat { pub node_epoch: u64, /// The datanode workloads. pub datanode_workloads: DatanodeWorkloads, + /// The GC statistics of the datanode. + pub gc_stat: Option, } /// The statistics of a region. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct RegionStat { /// The region_id. pub id: RegionId, @@ -126,7 +127,7 @@ pub trait TopicStatsReporter: Send + Sync { fn reportable_topics(&mut self) -> Vec; } -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] pub enum RegionManifestInfo { Mito { manifest_version: u64, @@ -222,11 +223,12 @@ impl TryFrom<&HeartbeatRequest> for Stat { node_epoch, node_workloads, topic_stats, + extensions, .. 
} = value; match (header, peer) { - (Some(_header), Some(peer)) => { + (Some(header), Some(peer)) => { let region_stats = region_stats .iter() .map(RegionStat::from) @@ -234,6 +236,14 @@ impl TryFrom<&HeartbeatRequest> for Stat { let topic_stats = topic_stats.iter().map(TopicStat::from).collect::>(); let datanode_workloads = get_datanode_workloads(node_workloads.as_ref()); + + let gc_stat = GcStat::from_extensions(extensions).map_err(|err| { + common_telemetry::error!( + "Failed to deserialize GcStat from extensions: {}", + err + ); + header.clone() + })?; Ok(Self { timestamp_millis: time_util::current_time_millis(), // datanode id @@ -247,6 +257,7 @@ impl TryFrom<&HeartbeatRequest> for Stat { topic_stats, node_epoch: *node_epoch, datanode_workloads, + gc_stat, }) } (header, _) => Err(header.clone()), @@ -319,6 +330,43 @@ impl From<&api::v1::meta::TopicStat> for TopicStat { } } +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct GcStat { + /// Number of GC tasks currently running on the datanode. + pub running_gc_tasks: u32, + /// The maximum number of concurrent GC tasks the datanode can handle. + pub gc_concurrency: u32, +} + +impl GcStat { + pub const GC_STAT_KEY: &str = "__gc_stat"; + + pub fn new(running_gc_tasks: u32, gc_concurrency: u32) -> Self { + Self { + running_gc_tasks, + gc_concurrency, + } + } + + pub fn into_extensions(&self, extensions: &mut std::collections::HashMap>) { + let bytes = serde_json::to_vec(self).unwrap_or_default(); + extensions.insert(Self::GC_STAT_KEY.to_string(), bytes); + } + + pub fn from_extensions( + extensions: &std::collections::HashMap>, + ) -> Result> { + extensions + .get(Self::GC_STAT_KEY) + .map(|bytes| { + serde_json::from_slice(bytes).with_context(|_| DeserializeFromJsonSnafu { + input: String::from_utf8_lossy(bytes).to_string(), + }) + }) + .transpose() + } +} + /// The key of the datanode stat in the memory store. /// /// The format is `__meta_datanode_stat-0-{node_id}`. diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs index c7bd82d675..d8e5affe30 100644 --- a/src/common/meta/src/instruction.rs +++ b/src/common/meta/src/instruction.rs @@ -17,7 +17,7 @@ use std::fmt::{Display, Formatter}; use std::time::Duration; use serde::{Deserialize, Deserializer, Serialize}; -use store_api::storage::{RegionId, RegionNumber}; +use store_api::storage::{FileRefsManifest, GcReport, RegionId, RegionNumber}; use strum::Display; use table::metadata::TableId; use table::table_name::TableName; @@ -417,6 +417,88 @@ where }) } +/// Instruction to get file references for specified regions. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GetFileRefs { + /// List of region IDs to get file references for. + pub region_ids: Vec, +} + +impl Display for GetFileRefs { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "GetFileRefs(region_ids={:?})", self.region_ids) + } +} + +/// Instruction to trigger garbage collection for a region. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GcRegions { + /// The region ID to perform GC on. + pub regions: Vec, + /// The file references manifest containing temporary file references. + pub file_refs_manifest: FileRefsManifest, + /// Whether to perform a full file listing to find orphan files. 
+ pub full_file_listing: bool, +} + +impl Display for GcRegions { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "GcRegion(regions={:?}, file_refs_count={}, full_file_listing={})", + self.regions, + self.file_refs_manifest.file_refs.len(), + self.full_file_listing + ) + } +} + +/// Reply for GetFileRefs instruction. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GetFileRefsReply { + /// The file references manifest. + pub file_refs_manifest: FileRefsManifest, + /// Whether the operation was successful. + pub success: bool, + /// Error message if any. + pub error: Option, +} + +impl Display for GetFileRefsReply { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "GetFileRefsReply(success={}, file_refs_count={}, error={:?})", + self.success, + self.file_refs_manifest.file_refs.len(), + self.error + ) + } +} + +/// Reply for GC instruction. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GcRegionsReply { + pub result: Result, +} + +impl Display for GcRegionsReply { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "GcReply(result={})", + match &self.result { + Ok(report) => format!( + "GcReport(deleted_files_count={}, need_retry_regions_count={})", + report.deleted_files.len(), + report.need_retry_regions.len() + ), + Err(err) => format!("Err({})", err), + } + ) + } +} + #[derive(Debug, Clone, Serialize, Deserialize, Display, PartialEq)] pub enum Instruction { /// Opens regions. @@ -437,6 +519,10 @@ pub enum Instruction { InvalidateCaches(Vec), /// Flushes regions. FlushRegions(FlushRegions), + /// Gets file references for regions. + GetFileRefs(GetFileRefs), + /// Triggers garbage collection for a region. + GcRegions(GcRegions), } impl Instruction { @@ -479,6 +565,20 @@ impl Instruction { _ => None, } } + + pub fn into_get_file_refs(self) -> Option { + match self { + Self::GetFileRefs(get_file_refs) => Some(get_file_refs), + _ => None, + } + } + + pub fn into_gc_regions(self) -> Option { + match self { + Self::GcRegions(gc_regions) => Some(gc_regions), + _ => None, + } + } } /// The reply of [UpgradeRegion]. 
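A minimal sketch (not part of the patch) of how a caller on the metasrv side might build the new GC instruction and inspect its reply. The `build_gc_instruction` and `handle_reply` helpers are hypothetical; only the `Instruction`, `GcRegions`, `InstructionReply` and `GcRegionsReply` types come from this change, and the reply variant is added in the hunk below.

use common_meta::instruction::{GcRegions, GcRegionsReply, Instruction, InstructionReply};
use store_api::storage::{FileRefsManifest, RegionId};

// Hypothetical helper: wrap a set of regions (all of one table) into a GcRegions instruction.
fn build_gc_instruction(regions: Vec<RegionId>, refs: FileRefsManifest) -> Instruction {
    Instruction::GcRegions(GcRegions {
        regions,
        file_refs_manifest: refs,
        // false = fast mode: only files already tracked as removed in the manifest are deleted
        full_file_listing: false,
    })
}

// Hypothetical helper: unpack the reply variant added below.
fn handle_reply(reply: InstructionReply) {
    if let InstructionReply::GcRegions(GcRegionsReply { result }) = reply {
        match result {
            Ok(report) => println!("gc deleted files in {} regions", report.deleted_files.len()),
            Err(err) => eprintln!("gc failed: {err}"),
        }
    }
}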
@@ -549,6 +649,8 @@ pub enum InstructionReply { )] DowngradeRegions(DowngradeRegionsReply), FlushRegions(FlushRegionReply), + GetFileRefs(GetFileRefsReply), + GcRegions(GcRegionsReply), } impl Display for InstructionReply { @@ -561,6 +663,8 @@ impl Display for InstructionReply { write!(f, "InstructionReply::DowngradeRegions({:?})", reply) } Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply), + Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply), + Self::GcRegions(reply) => write!(f, "InstructionReply::GcRegion({})", reply), } } } @@ -605,6 +709,10 @@ impl InstructionReply { #[cfg(test)] mod tests { + use std::collections::HashSet; + + use store_api::storage::FileId; + use super::*; #[test] @@ -903,4 +1011,30 @@ mod tests { _ => panic!("Expected FlushRegions instruction"), } } + + #[test] + fn test_serialize_get_file_refs_instruction_reply() { + let mut manifest = FileRefsManifest::default(); + let r0 = RegionId::new(1024, 1); + let r1 = RegionId::new(1024, 2); + manifest + .file_refs + .insert(r0, HashSet::from([FileId::random()])); + manifest + .file_refs + .insert(r1, HashSet::from([FileId::random()])); + manifest.manifest_version.insert(r0, 10); + manifest.manifest_version.insert(r1, 20); + + let instruction_reply = InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: manifest, + success: true, + error: None, + }); + + let serialized = serde_json::to_string(&instruction_reply).unwrap(); + let deserialized = serde_json::from_str(&serialized).unwrap(); + + assert_eq!(instruction_reply, deserialized); + } } diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index eda483a1e2..74bddbaede 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -322,6 +322,21 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to run gc for region {}", region_id))] + GcMitoEngine { + region_id: RegionId, + source: mito2::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Invalid arguments for GC: {}", msg))] + InvalidGcArgs { + msg: String, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to list SST entries from storage"))] ListStorageSsts { #[snafu(implicit)] @@ -446,9 +461,11 @@ impl ErrorExt for Error { AsyncTaskExecute { source, .. } => source.status_code(), - CreateDir { .. } | RemoveDir { .. } | ShutdownInstance { .. } | DataFusion { .. } => { - StatusCode::Internal - } + CreateDir { .. } + | RemoveDir { .. } + | ShutdownInstance { .. } + | DataFusion { .. } + | InvalidGcArgs { .. } => StatusCode::Internal, RegionNotFound { .. } => StatusCode::RegionNotFound, RegionNotReady { .. } => StatusCode::RegionNotReady, @@ -466,7 +483,7 @@ impl ErrorExt for Error { StopRegionEngine { source, .. } => source.status_code(), FindLogicalRegions { source, .. } => source.status_code(), - BuildMitoEngine { source, .. } => source.status_code(), + BuildMitoEngine { source, .. } | GcMitoEngine { source, .. } => source.status_code(), BuildMetricEngine { source, .. } => source.status_code(), ListStorageSsts { source, .. } => source.status_code(), ConcurrentQueryLimiterClosed { .. } | ConcurrentQueryLimiterTimeout { .. 
} => { diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 607e031b43..33ba648830 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -36,14 +36,14 @@ use common_workload::DatanodeWorkloadType; use meta_client::MetaClientRef; use meta_client::client::{HeartbeatSender, MetaClient}; use servers::addrs; -use snafu::ResultExt; +use snafu::{OptionExt as _, ResultExt}; use tokio::sync::{Notify, mpsc}; use tokio::time::Instant; use self::handler::RegionHeartbeatResponseHandler; use crate::alive_keeper::{CountdownTaskHandlerExtRef, RegionAliveKeeper}; use crate::config::DatanodeOptions; -use crate::error::{self, MetaClientInitSnafu, Result}; +use crate::error::{self, MetaClientInitSnafu, RegionEngineNotFoundSnafu, Result}; use crate::event_listener::RegionServerEventReceiver; use crate::metrics::{self, HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT}; use crate::region_server::RegionServer; @@ -242,12 +242,18 @@ impl HeartbeatTask { let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); let resource_stat = self.resource_stat.clone(); + let gc_limiter = self + .region_server + .mito_engine() + .context(RegionEngineNotFoundSnafu { name: "mito" })? + .gc_limiter(); common_runtime::spawn_hb(async move { let sleep = tokio::time::sleep(Duration::from_millis(0)); tokio::pin!(sleep); let build_info = common_version::build_info(); + let heartbeat_request = HeartbeatRequest { peer: self_peer, node_epoch, @@ -283,8 +289,13 @@ impl HeartbeatTask { if let Some(message) = message { match outgoing_message_to_mailbox_message(message) { Ok(message) => { + let mut extensions = heartbeat_request.extensions.clone(); + let gc_stat = gc_limiter.gc_stat(); + gc_stat.into_extensions(&mut extensions); + let req = HeartbeatRequest { mailbox_message: Some(message), + extensions, ..heartbeat_request.clone() }; HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc(); @@ -305,10 +316,16 @@ impl HeartbeatTask { let topic_stats = region_server_clone.topic_stats(); let now = Instant::now(); let duration_since_epoch = (now - epoch).as_millis() as u64; + + let mut extensions = heartbeat_request.extensions.clone(); + let gc_stat = gc_limiter.gc_stat(); + gc_stat.into_extensions(&mut extensions); + let mut req = HeartbeatRequest { region_stats, topic_stats, duration_since_epoch, + extensions, ..heartbeat_request.clone() }; diff --git a/src/datanode/src/heartbeat/handler.rs b/src/datanode/src/heartbeat/handler.rs index 8566f8806c..8573314b82 100644 --- a/src/datanode/src/heartbeat/handler.rs +++ b/src/datanode/src/heartbeat/handler.rs @@ -20,16 +20,21 @@ use common_meta::heartbeat::handler::{ use common_meta::instruction::{Instruction, InstructionReply}; use common_telemetry::error; use snafu::OptionExt; +use store_api::storage::GcReport; mod close_region; mod downgrade_region; +mod file_ref; mod flush_region; +mod gc_worker; mod open_region; mod upgrade_region; use crate::heartbeat::handler::close_region::CloseRegionsHandler; use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler; +use crate::heartbeat::handler::file_ref::GetFileRefsHandler; use crate::heartbeat::handler::flush_region::FlushRegionsHandler; +use crate::heartbeat::handler::gc_worker::GcRegionsHandler; use crate::heartbeat::handler::open_region::OpenRegionsHandler; use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; use crate::heartbeat::task_tracker::TaskTracker; @@ -43,6 +48,7 @@ pub 
struct RegionHeartbeatResponseHandler { downgrade_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>, open_region_parallelism: usize, + gc_tasks: TaskTracker, } #[async_trait::async_trait] @@ -61,6 +67,7 @@ pub struct HandlerContext { catchup_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>, + gc_tasks: TaskTracker, } impl HandlerContext { @@ -71,6 +78,7 @@ impl HandlerContext { catchup_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(), + gc_tasks: TaskTracker::new(), } } } @@ -85,6 +93,7 @@ impl RegionHeartbeatResponseHandler { flush_tasks: TaskTracker::new(), // Default to half of the number of CPUs. open_region_parallelism: (num_cpus::get() / 2).max(1), + gc_tasks: TaskTracker::new(), } } @@ -106,6 +115,8 @@ impl RegionHeartbeatResponseHandler { Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())), Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())), Instruction::UpgradeRegion(_) => Ok(Box::new(UpgradeRegionsHandler.into())), + Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())), + Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), } } @@ -118,6 +129,8 @@ pub enum InstructionHandlers { FlushRegions(FlushRegionsHandler), DowngradeRegions(DowngradeRegionsHandler), UpgradeRegions(UpgradeRegionsHandler), + GetFileRefs(GetFileRefsHandler), + GcRegions(GcRegionsHandler), } macro_rules! impl_from_handler { @@ -137,7 +150,9 @@ impl_from_handler!( OpenRegionsHandler => OpenRegions, FlushRegionsHandler => FlushRegions, DowngradeRegionsHandler => DowngradeRegions, - UpgradeRegionsHandler => UpgradeRegions + UpgradeRegionsHandler => UpgradeRegions, + GetFileRefsHandler => GetFileRefs, + GcRegionsHandler => GcRegions ); macro_rules! dispatch_instr { @@ -180,6 +195,8 @@ dispatch_instr!( FlushRegions => FlushRegions, DowngradeRegions => DowngradeRegions, UpgradeRegion => UpgradeRegions, + GetFileRefs => GetFileRefs, + GcRegions => GcRegions, ); #[async_trait] @@ -202,6 +219,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { let catchup_tasks = self.catchup_tasks.clone(); let downgrade_tasks = self.downgrade_tasks.clone(); let flush_tasks = self.flush_tasks.clone(); + let gc_tasks = self.gc_tasks.clone(); let handler = self.build_handler(&instruction)?; let _handle = common_runtime::spawn_global(async move { let reply = handler @@ -211,6 +229,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { catchup_tasks, downgrade_tasks, flush_tasks, + gc_tasks, }, instruction, ) diff --git a/src/datanode/src/heartbeat/handler/file_ref.rs b/src/datanode/src/heartbeat/handler/file_ref.rs new file mode 100644 index 0000000000..ccad7922b5 --- /dev/null +++ b/src/datanode/src/heartbeat/handler/file_ref.rs @@ -0,0 +1,62 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_error::ext::ErrorExt; +use common_meta::instruction::{GetFileRefs, GetFileRefsReply, InstructionReply}; +use store_api::storage::FileRefsManifest; + +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; + +pub struct GetFileRefsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for GetFileRefsHandler { + type Instruction = GetFileRefs; + + async fn handle( + &self, + ctx: &HandlerContext, + get_file_refs: Self::Instruction, + ) -> Option { + let region_server = &ctx.region_server; + + // Get the MitoEngine + let Some(mito_engine) = region_server.mito_engine() else { + return Some(InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: FileRefsManifest::default(), + success: false, + error: Some("MitoEngine not found".to_string()), + })); + }; + + match mito_engine + .get_snapshot_of_unmanifested_refs(get_file_refs.region_ids) + .await + { + Ok(all_file_refs) => { + // Return the file references + Some(InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: all_file_refs, + success: true, + error: None, + })) + } + Err(e) => Some(InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: FileRefsManifest::default(), + success: false, + error: Some(format!("Failed to get file refs: {}", e.output_msg())), + })), + } + } +} diff --git a/src/datanode/src/heartbeat/handler/gc_worker.rs b/src/datanode/src/heartbeat/handler/gc_worker.rs new file mode 100644 index 0000000000..75b0005e93 --- /dev/null +++ b/src/datanode/src/heartbeat/handler/gc_worker.rs @@ -0,0 +1,156 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_meta::instruction::{GcRegions, GcRegionsReply, InstructionReply}; +use common_telemetry::{debug, warn}; +use mito2::gc::LocalGcWorker; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::{FileRefsManifest, RegionId}; + +use crate::error::{GcMitoEngineSnafu, InvalidGcArgsSnafu, Result, UnexpectedSnafu}; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; + +pub struct GcRegionsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for GcRegionsHandler { + type Instruction = GcRegions; + + async fn handle( + &self, + ctx: &HandlerContext, + gc_regions: Self::Instruction, + ) -> Option { + let region_ids = gc_regions.regions.clone(); + debug!("Received gc regions instruction: {:?}", region_ids); + + let is_same_table = region_ids.windows(2).all(|w| { + let t1 = w[0].table_id(); + let t2 = w[1].table_id(); + t1 == t2 + }); + if !is_same_table { + return Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!( + "Regions to GC should belong to the same table, found: {:?}", + region_ids + )), + })); + } + + let (region_id, gc_worker) = match self + .create_gc_worker( + ctx, + region_ids, + &gc_regions.file_refs_manifest, + gc_regions.full_file_listing, + ) + .await + { + Ok(worker) => worker, + Err(e) => { + return Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!("Failed to create GC worker: {}", e)), + })); + } + }; + + let register_result = ctx + .gc_tasks + .try_register( + region_id, + Box::pin(async move { + debug!("Starting gc worker for region {}", region_id); + let report = gc_worker + .run() + .await + .context(GcMitoEngineSnafu { region_id })?; + debug!("Gc worker for region {} finished", region_id); + Ok(report) + }), + ) + .await; + if register_result.is_busy() { + warn!("Another gc task is running for the region: {region_id}"); + return Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!( + "Another gc task is running for the region: {region_id}" + )), + })); + } + let mut watcher = register_result.into_watcher(); + let result = ctx.gc_tasks.wait_until_finish(&mut watcher).await; + match result { + Ok(report) => Some(InstructionReply::GcRegions(GcRegionsReply { + result: Ok(report), + })), + Err(err) => Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!("{err:?}")), + })), + } + } +} + +impl GcRegionsHandler { + async fn create_gc_worker( + &self, + ctx: &HandlerContext, + mut region_ids: Vec, + file_ref_manifest: &FileRefsManifest, + full_file_listing: bool, + ) -> Result<(RegionId, LocalGcWorker)> { + // always use the smallest region id on datanode as the target region id + region_ids.sort_by_key(|r| r.region_number()); + let mito_engine = ctx + .region_server + .mito_engine() + .with_context(|| UnexpectedSnafu { + violated: "MitoEngine not found".to_string(), + })?; + let region_id = *region_ids.first().with_context(|| UnexpectedSnafu { + violated: "No region ids provided".to_string(), + })?; + + let mito_config = mito_engine.mito_config(); + + // Find the access layer from one of the regions that exists on this datanode + let access_layer = region_ids + .iter() + .find_map(|rid| mito_engine.find_region(*rid)) + .with_context(|| InvalidGcArgsSnafu { + msg: format!( + "None of the regions is on current datanode:{:?}", + region_ids + ), + })? 
+ .access_layer(); + + let cache_manager = mito_engine.cache_manager(); + + let gc_worker = LocalGcWorker::try_new( + access_layer.clone(), + Some(cache_manager), + region_ids.into_iter().collect(), + Default::default(), + mito_config.clone().into(), + file_ref_manifest.clone(), + &mito_engine.gc_limiter(), + full_file_listing, + ) + .await + .context(GcMitoEngineSnafu { region_id })?; + + Ok((region_id, gc_worker)) + } +} diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index 341ee9442c..70373ca10c 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -158,6 +158,27 @@ impl RegionServer { } } + /// Gets the MitoEngine if it's registered. + pub fn mito_engine(&self) -> Option { + if let Some(mito) = self.inner.mito_engine.read().unwrap().clone() { + Some(mito) + } else { + self.inner + .engines + .read() + .unwrap() + .get(MITO_ENGINE_NAME) + .cloned() + .and_then(|e| { + let mito = e.as_any().downcast_ref::().cloned(); + if mito.is_none() { + warn!("Mito engine not found in region server engines"); + } + mito + }) + } + } + #[tracing::instrument(skip_all)] pub async fn handle_batch_open_requests( &self, @@ -676,14 +697,14 @@ struct RegionServerInner { runtime: Runtime, event_listener: RegionServerEventListenerRef, table_provider_factory: TableProviderFactoryRef, - // The number of queries allowed to be executed at the same time. - // Act as last line of defense on datanode to prevent query overloading. + /// The number of queries allowed to be executed at the same time. + /// Act as last line of defense on datanode to prevent query overloading. parallelism: Option, - // The topic stats reporter. + /// The topic stats reporter. topic_stats_reporter: RwLock>>, - // HACK(zhongzc): Direct MitoEngine handle for diagnostics. This couples the - // server with a concrete engine; acceptable for now to fetch Mito-specific - // info (e.g., list SSTs). Consider a diagnostics trait later. + /// HACK(zhongzc): Direct MitoEngine handle for diagnostics. This couples the + /// server with a concrete engine; acceptable for now to fetch Mito-specific + /// info (e.g., list SSTs). Consider a diagnostics trait later. mito_engine: RwLock>, } diff --git a/src/meta-srv/src/service/admin/heartbeat.rs b/src/meta-srv/src/service/admin/heartbeat.rs index cb13764d30..35ada0d3ae 100644 --- a/src/meta-srv/src/service/admin/heartbeat.rs +++ b/src/meta-srv/src/service/admin/heartbeat.rs @@ -254,7 +254,7 @@ mod tests { assert_eq!(status, http::StatusCode::OK); assert_eq!( body, - "[[{\"timestamp_millis\":3,\"id\":0,\"addr\":\"127.0.0.1:3001\",\"rcus\":0,\"wcus\":0,\"region_num\":0,\"region_stats\":[],\"topic_stats\":[],\"node_epoch\":0,\"datanode_workloads\":{\"types\":[]}}]]" + "[[{\"timestamp_millis\":3,\"id\":0,\"addr\":\"127.0.0.1:3001\",\"rcus\":0,\"wcus\":0,\"region_num\":0,\"region_stats\":[],\"topic_stats\":[],\"node_epoch\":0,\"datanode_workloads\":{\"types\":[]},\"gc_stat\":null}]]" ); } } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index edf0709960..e76a8dbe19 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -25,6 +25,7 @@ use serde::{Deserialize, Serialize}; use serde_with::serde_as; use crate::error::Result; +use crate::gc::GcConfig; use crate::memtable::MemtableConfig; use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; @@ -148,6 +149,8 @@ pub struct MitoConfig { /// Whether to enable experimental flat format as the default format. /// When enabled, forces using BulkMemtable and BulkMemtableBuilder. 
pub default_experimental_flat_format: bool, + + pub gc: GcConfig, } impl Default for MitoConfig { @@ -186,6 +189,7 @@ impl Default for MitoConfig { memtable: MemtableConfig::default(), min_compaction_interval: Duration::from_secs(0), default_experimental_flat_format: false, + gc: GcConfig::default(), }; // Adjust buffer and cache size according to system memory if we can. diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index 3fb3a8abd8..73cb930f77 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -102,7 +102,7 @@ use store_api::region_engine::{ }; use store_api::region_request::{AffectedRows, RegionOpenRequest, RegionRequest}; use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry}; -use store_api::storage::{FileId, RegionId, ScanRequest, SequenceNumber}; +use store_api::storage::{FileId, FileRefsManifest, RegionId, ScanRequest, SequenceNumber}; use tokio::sync::{Semaphore, oneshot}; use crate::access_layer::RegionFilePathFactory; @@ -115,6 +115,7 @@ use crate::error::{ }; #[cfg(feature = "enterprise")] use crate::extension::BoxedExtensionRangeProviderFactory; +use crate::gc::GcLimiterRef; use crate::manifest::action::RegionEdit; use crate::memtable::MemtableStats; use crate::metrics::HANDLE_REQUEST_ELAPSED; @@ -261,6 +262,33 @@ impl MitoEngine { self.inner.workers.file_ref_manager() } + pub fn gc_limiter(&self) -> GcLimiterRef { + self.inner.workers.gc_limiter() + } + + /// Get all tmp ref files for given region ids, excluding files that's already in manifest. + pub async fn get_snapshot_of_unmanifested_refs( + &self, + region_ids: impl IntoIterator, + ) -> Result { + let file_ref_mgr = self.file_ref_manager(); + + let region_ids = region_ids.into_iter().collect::>(); + + // Convert region IDs to MitoRegionRef objects, error if any region doesn't exist + let regions: Vec = region_ids + .into_iter() + .map(|region_id| { + self.find_region(region_id) + .with_context(|| RegionNotFoundSnafu { region_id }) + }) + .collect::>()?; + + file_ref_mgr + .get_snapshot_of_unmanifested_refs(regions) + .await + } + /// Returns true if the specific region exists. pub fn is_region_exists(&self, region_id: RegionId) -> bool { self.inner.workers.is_region_exists(region_id) @@ -357,7 +385,7 @@ impl MitoEngine { self.find_region(id) } - pub(crate) fn find_region(&self, region_id: RegionId) -> Option { + pub fn find_region(&self, region_id: RegionId) -> Option { self.inner.workers.get_region(region_id) } diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index ad6d7c7caa..2a6fc855bc 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -1121,6 +1121,12 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("GC job permit exhausted"))] + TooManyGcJobs { + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -1291,7 +1297,7 @@ impl ErrorExt for Error { InconsistentTimestampLength { .. } => StatusCode::InvalidArguments, - TooManyFilesToRead { .. } => StatusCode::RateLimited, + TooManyFilesToRead { .. } | TooManyGcJobs { .. } => StatusCode::RateLimited, } } diff --git a/src/mito2/src/gc.rs b/src/mito2/src/gc.rs index e4d384d0f9..f7cd266eb4 100644 --- a/src/mito2/src/gc.rs +++ b/src/mito2/src/gc.rs @@ -22,14 +22,17 @@ //! 
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::sync::Arc; use std::time::Duration; -use common_telemetry::{error, info, warn}; +use common_meta::datanode::GcStat; +use common_telemetry::{debug, error, info, warn}; use common_time::Timestamp; use object_store::{Entry, Lister}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt as _, ensure}; -use store_api::storage::{FileId, RegionId}; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; +use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use tokio_stream::StreamExt; use crate::access_layer::AccessLayerRef; @@ -37,26 +40,64 @@ use crate::cache::CacheManagerRef; use crate::config::MitoConfig; use crate::error::{ DurationOutOfRangeSnafu, EmptyRegionDirSnafu, JoinSnafu, OpenDalSnafu, RegionNotFoundSnafu, - Result, UnexpectedSnafu, + Result, TooManyGcJobsSnafu, UnexpectedSnafu, }; use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; use crate::manifest::storage::manifest_compress_type; -use crate::metrics::GC_FILE_CNT; +use crate::metrics::GC_DEL_FILE_CNT; use crate::region::opener::new_manifest_dir; use crate::sst::file::delete_files; -use crate::sst::file_ref::TableFileRefsManifest; use crate::sst::location::{self, region_dir_from_table_dir}; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct GcReport { - /// deleted files per region - pub deleted_files: HashMap>, - /// Regions that need retry in next gc round, usually because their tmp ref files are outdated - pub need_retry_regions: HashSet, +/// Limit the amount of concurrent GC jobs on the datanode +pub struct GcLimiter { + pub gc_job_limit: Arc, + gc_concurrency: usize, } -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct FileGcOption { +pub type GcLimiterRef = Arc; + +impl GcLimiter { + pub fn new(gc_concurrency: usize) -> Self { + Self { + gc_job_limit: Arc::new(tokio::sync::Semaphore::new(gc_concurrency)), + gc_concurrency, + } + } + + pub fn running_gc_tasks(&self) -> u32 { + (self.gc_concurrency - self.gc_job_limit.available_permits()) as u32 + } + + pub fn gc_concurrency(&self) -> u32 { + self.gc_concurrency as u32 + } + + pub fn gc_stat(&self) -> GcStat { + GcStat::new(self.running_gc_tasks(), self.gc_concurrency()) + } + + /// Try to acquire a permit for a GC job. + /// + /// If no permit is available, returns an `TooManyGcJobs` error. + pub fn permit(&self) -> Result { + self.gc_job_limit + .clone() + .try_acquire_owned() + .map_err(|e| match e { + TryAcquireError::Closed => UnexpectedSnafu { + reason: format!("Failed to acquire gc permit: {e}"), + } + .build(), + TryAcquireError::NoPermits => TooManyGcJobsSnafu {}.build(), + }) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GcConfig { + /// Whether GC is enabled. + pub enable: bool, /// Lingering time before deleting files. /// Should be long enough to allow long running queries to finish. /// @@ -73,16 +114,22 @@ pub struct FileGcOption { /// Maximum concurrent list operations per GC job. /// This is used to limit the number of concurrent listing operations and speed up listing. pub max_concurrent_lister_per_gc_job: usize, + /// Maximum concurrent GC jobs. + /// This is used to limit the number of concurrent GC jobs running on the datanode + /// to prevent too many concurrent GC jobs from overwhelming the datanode. 
+ pub max_concurrent_gc_job: usize, } -impl Default for FileGcOption { +impl Default for GcConfig { fn default() -> Self { Self { + enable: false, // expect long running queries to be finished within a reasonable time lingering_time: Duration::from_secs(60 * 5), // 6 hours, for unknown expel time, which is when this file get removed from manifest, it should rarely happen, can keep it longer unknown_file_lingering_time: Duration::from_secs(60 * 60 * 6), max_concurrent_lister_per_gc_job: 32, + max_concurrent_gc_job: 4, } } } @@ -92,13 +139,23 @@ pub struct LocalGcWorker { pub(crate) cache_manager: Option, pub(crate) manifest_mgrs: HashMap, /// Lingering time before deleting files. - pub(crate) opt: FileGcOption, + pub(crate) opt: GcConfig, pub(crate) manifest_open_config: ManifestOpenConfig, /// Tmp ref files manifest, used to determine which files are still in use by ongoing queries. /// /// Also contains manifest versions of regions when the tmp ref files are generated. /// Used to determine whether the tmp ref files are outdated. - pub(crate) file_ref_manifest: TableFileRefsManifest, + pub(crate) file_ref_manifest: FileRefsManifest, + _permit: OwnedSemaphorePermit, + /// Whether to perform full file listing during GC. + /// When set to false, GC will only delete files that are tracked in the manifest's removed_files, + /// which can significantly improve performance by avoiding expensive list operations. + /// When set to true, GC will perform a full listing to find and delete orphan files + /// (files not tracked in the manifest). + /// + /// Set to false for regular GC operations to optimize performance. + /// Set to true periodically or when you need to clean up orphan files. + pub full_file_listing: bool, } pub struct ManifestOpenConfig { @@ -125,13 +182,16 @@ impl LocalGcWorker { /// Create a new LocalGcWorker, with `regions_to_gc` regions to GC. /// The regions are specified by their `RegionId` and should all belong to the same table. /// + #[allow(clippy::too_many_arguments)] pub async fn try_new( access_layer: AccessLayerRef, cache_manager: Option, regions_to_gc: BTreeSet, - opt: FileGcOption, + opt: GcConfig, manifest_open_config: ManifestOpenConfig, - file_ref_manifest: TableFileRefsManifest, + file_ref_manifest: FileRefsManifest, + limiter: &GcLimiterRef, + full_file_listing: bool, ) -> Result { let table_id = regions_to_gc .first() @@ -139,6 +199,7 @@ impl LocalGcWorker { reason: "Expect at least one region, found none", })? 
.table_id(); + let permit = limiter.permit()?; let mut zelf = Self { access_layer, cache_manager, @@ -146,6 +207,8 @@ impl LocalGcWorker { opt, manifest_open_config, file_ref_manifest, + _permit: permit, + full_file_listing, }; // dedup just in case @@ -193,15 +256,15 @@ impl LocalGcWorker { // TODO(discord9): verify manifest version before reading tmp ref files let mut tmp_ref_files = HashMap::new(); - for file_ref in &self.file_ref_manifest.file_refs { - if outdated_regions.contains(&file_ref.region_id) { + for (region_id, file_refs) in &self.file_ref_manifest.file_refs { + if outdated_regions.contains(region_id) { // skip outdated regions continue; } tmp_ref_files - .entry(file_ref.region_id) + .entry(*region_id) .or_insert_with(HashSet::new) - .insert(file_ref.file_id); + .extend(file_refs.clone()); } Ok(tmp_ref_files) @@ -220,14 +283,14 @@ impl LocalGcWorker { let mut deleted_files = HashMap::new(); let tmp_ref_files = self.read_tmp_ref_files(&mut outdated_regions).await?; for region_id in self.manifest_mgrs.keys() { - info!("Doing gc for region {}", region_id); + debug!("Doing gc for region {}", region_id); let tmp_ref_files = tmp_ref_files .get(region_id) .cloned() .unwrap_or_else(HashSet::new); let files = self.do_region_gc(*region_id, &tmp_ref_files).await?; deleted_files.insert(*region_id, files); - info!("Gc for region {} finished", region_id); + debug!("Gc for region {} finished", region_id); } info!( "LocalGcWorker finished after {} secs.", @@ -244,7 +307,7 @@ impl LocalGcWorker { impl LocalGcWorker { /// concurrency of listing files per region. /// This is used to limit the number of concurrent listing operations and speed up listing - pub const CONCURRENCY_LIST_PER_FILES: usize = 512; + pub const CONCURRENCY_LIST_PER_FILES: usize = 1024; /// Perform GC for the region. /// 1. 
Get all the removed files in delta manifest files and their expel times @@ -259,7 +322,7 @@ impl LocalGcWorker { region_id: RegionId, tmp_ref_files: &HashSet, ) -> Result> { - info!("Doing gc for region {}", region_id); + debug!("Doing gc for region {}", region_id); let manifest = self .manifest_mgrs .get(®ion_id) @@ -272,10 +335,10 @@ impl LocalGcWorker { if recently_removed_files.is_empty() { // no files to remove, skip - info!("No recently removed files to gc for region {}", region_id); + debug!("No recently removed files to gc for region {}", region_id); } - info!( + debug!( "Found {} recently removed files sets for region {}", recently_removed_files.len(), region_id @@ -291,27 +354,20 @@ impl LocalGcWorker { .chain(tmp_ref_files.clone().into_iter()) .collect(); - let true_tmp_ref_files = tmp_ref_files - .iter() - .filter(|f| !current_files.contains_key(f)) - .collect::>(); - - info!("True tmp ref files: {:?}", true_tmp_ref_files); - let unused_files = self .list_to_be_deleted_files(region_id, in_used, recently_removed_files, concurrency) .await?; let unused_len = unused_files.len(); - info!( + debug!( "Found {} unused files to delete for region {}", unused_len, region_id ); self.delete_files(region_id, &unused_files).await?; - info!( + debug!( "Successfully deleted {} unused files for region {}", unused_len, region_id ); @@ -329,7 +385,8 @@ impl LocalGcWorker { ) .await?; - GC_FILE_CNT.add(file_ids.len() as i64); + // FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now + GC_DEL_FILE_CNT.add(file_ids.len() as i64); Ok(()) } @@ -491,7 +548,7 @@ impl LocalGcWorker { entries: Vec, in_use_filenames: &HashSet<&FileId>, may_linger_filenames: &HashSet<&FileId>, - all_files_appear_in_delta_manifests: &HashSet<&FileId>, + eligible_for_removal: &HashSet<&FileId>, unknown_file_may_linger_until: chrono::DateTime, ) -> (Vec, HashSet) { let mut all_unused_files_ready_for_delete = vec![]; @@ -515,7 +572,7 @@ impl LocalGcWorker { let should_delete = !in_use_filenames.contains(&file_id) && !may_linger_filenames.contains(&file_id) && { - if !all_files_appear_in_delta_manifests.contains(&file_id) { + if !eligible_for_removal.contains(&file_id) { // if the file's expel time is unknown(because not appear in delta manifest), we keep it for a while // using it's last modified time // notice unknown files use a different lingering time @@ -541,6 +598,11 @@ impl LocalGcWorker { /// Concurrently list unused files in the region dir /// because there may be a lot of files in the region dir /// and listing them may take a long time. + /// + /// When `full_file_listing` is false, this method will only delete files tracked in + /// `recently_removed_files` without performing expensive list operations, which significantly + /// improves performance. When `full_file_listing` is true, it performs a full listing to + /// find and delete orphan files. 
pub async fn list_to_be_deleted_files( &self, region_id: RegionId, @@ -548,6 +610,7 @@ impl LocalGcWorker { recently_removed_files: BTreeMap>, concurrency: usize, ) -> Result> { + let start = tokio::time::Instant::now(); let now = chrono::Utc::now(); let may_linger_until = now - chrono::Duration::from_std(self.opt.lingering_time).with_context(|_| { @@ -569,7 +632,7 @@ impl LocalGcWorker { let may_linger_files = recently_removed_files.split_off(&threshold); let may_linger_filenames = may_linger_files.values().flatten().collect::>(); - let all_files_appear_in_delta_manifests = recently_removed_files + let eligible_for_removal = recently_removed_files .values() .flatten() .collect::>(); @@ -577,23 +640,56 @@ impl LocalGcWorker { // in use filenames, include sst and index files let in_use_filenames = in_used.iter().collect::>(); + // When full_file_listing is false, skip expensive list operations and only delete + // files that are tracked in recently_removed_files + if !self.full_file_listing { + // Only delete files that: + // 1. Are in recently_removed_files (tracked in manifest) + // 2. Are not in use + // 3. Have passed the lingering time + let files_to_delete: Vec = eligible_for_removal + .iter() + .filter(|file_id| !in_use_filenames.contains(*file_id)) + .map(|&f| *f) + .collect(); + + info!( + "gc: fast mode (no full listing) cost {} secs for region {}, found {} files to delete from manifest", + start.elapsed().as_secs_f64(), + region_id, + files_to_delete.len() + ); + + return Ok(files_to_delete); + } + + // Full file listing mode: perform expensive list operations to find orphan files // Step 1: Create partitioned listers for concurrent processing let listers = self.partition_region_files(region_id, concurrency).await?; + let lister_cnt = listers.len(); // Step 2: Concurrently list all files in the region directory let all_entries = self.list_region_files_concurrent(listers).await?; + let cnt = all_entries.len(); + // Step 3: Filter files to determine which ones can be deleted let (all_unused_files_ready_for_delete, all_in_exist_linger_files) = self .filter_deletable_files( all_entries, &in_use_filenames, &may_linger_filenames, - &all_files_appear_in_delta_manifests, + &eligible_for_removal, unknown_file_may_linger_until, ); - info!("All in exist linger files: {:?}", all_in_exist_linger_files); + info!( + "gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}, found {} unused files to delete", + start.elapsed().as_secs_f64(), + region_id, + all_unused_files_ready_for_delete.len() + ); + debug!("All in exist linger files: {:?}", all_in_exist_linger_files); Ok(all_unused_files_ready_for_delete) } diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index 0f923f60a6..fd8110b526 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -437,7 +437,7 @@ lazy_static! { "mito stalled write request in each worker", &[WORKER_LABEL] ).unwrap(); - /// Number of ref files per table + /// Number of ref files pub static ref GC_REF_FILE_CNT: IntGauge = register_int_gauge!( "greptime_gc_ref_file_count", "gc ref file count", @@ -458,9 +458,9 @@ lazy_static! { .unwrap(); /// Counter for the number of files deleted by the GC worker. 
- pub static ref GC_FILE_CNT: IntGauge = + pub static ref GC_DEL_FILE_CNT: IntGauge = register_int_gauge!( - "greptime_mito_gc_file_count", + "greptime_mito_gc_delete_file_count", "mito gc deleted file count", ).unwrap(); } diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index ee49da763e..f4a9deb9c6 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -565,6 +565,10 @@ impl MitoRegion { Ok(()) } + pub fn access_layer(&self) -> AccessLayerRef { + self.access_layer.clone() + } + /// Returns the SST entries of the region. pub async fn manifest_sst_entries(&self) -> Vec { let table_dir = self.table_dir(); diff --git a/src/mito2/src/sst/file_purger.rs b/src/mito2/src/sst/file_purger.rs index c5197ea2fb..11f38ac1ad 100644 --- a/src/mito2/src/sst/file_purger.rs +++ b/src/mito2/src/sst/file_purger.rs @@ -162,6 +162,7 @@ impl FilePurger for ObjectStoreFilePurger { // notice that no matter whether the file is deleted or not, we need to remove the reference // because the file is no longer in use nonetheless. self.file_ref_manager.remove_file(&file_meta); + // TODO(discord9): consider impl a .tombstone file to reduce files needed to list } fn new_file(&self, file_meta: &FileMeta) { diff --git a/src/mito2/src/sst/file_ref.rs b/src/mito2/src/sst/file_ref.rs index de071b3f04..28f3e95f89 100644 --- a/src/mito2/src/sst/file_ref.rs +++ b/src/mito2/src/sst/file_ref.rs @@ -17,38 +17,23 @@ use std::sync::Arc; use common_telemetry::debug; use dashmap::{DashMap, Entry}; -use serde::{Deserialize, Serialize}; -use store_api::ManifestVersion; -use store_api::storage::{FileId, RegionId, TableId}; +use store_api::storage::{FileRef, FileRefsManifest, RegionId}; use crate::error::Result; use crate::metrics::GC_REF_FILE_CNT; -use crate::region::RegionMapRef; +use crate::region::MitoRegionRef; use crate::sst::file::FileMeta; -#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct FileRef { - pub region_id: RegionId, - pub file_id: FileId, -} - -impl FileRef { - pub fn new(region_id: RegionId, file_id: FileId) -> Self { - Self { region_id, file_id } - } -} - -/// File references for a table. -/// It contains all files referenced by the table. +/// File references for a region. +/// It contains all files referenced by the region. #[derive(Debug, Clone, Default)] -pub struct TableFileRefs { +pub struct RegionFileRefs { /// (FileRef, Ref Count) meaning how many FileHandleInner is opened for this file. pub files: HashMap, } /// Manages all file references in one datanode. /// It keeps track of which files are referenced and group by table ids. -/// And periodically update the references to tmp file in object storage. /// This is useful for ensuring that files are not deleted while they are still in use by any /// query. #[derive(Debug)] @@ -56,33 +41,24 @@ pub struct FileReferenceManager { /// Datanode id. used to determine tmp ref file name. node_id: Option, /// TODO(discord9): use no hash hasher since table id is sequential. - files_per_table: DashMap, + files_per_region: DashMap, } pub type FileReferenceManagerRef = Arc; -/// The tmp file uploaded to object storage to record one table's file references. 
-#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] -pub struct TableFileRefsManifest { - pub file_refs: HashSet, - /// Manifest version when this manifest is read for it's files - pub manifest_version: HashMap, -} - impl FileReferenceManager { pub fn new(node_id: Option) -> Self { Self { node_id, - files_per_table: Default::default(), + files_per_region: Default::default(), } } - fn ref_file_set(&self, table_id: TableId) -> Option> { - let file_refs = if let Some(file_refs) = self.files_per_table.get(&table_id) { + fn ref_file_set(&self, region_id: RegionId) -> Option> { + let file_refs = if let Some(file_refs) = self.files_per_region.get(®ion_id) { file_refs.clone() } else { - // still return an empty manifest to indicate no files are referenced. - // and differentiate from error case where table_id not found. + // region id not found. return None; }; @@ -95,8 +71,8 @@ impl FileReferenceManager { let ref_file_set: HashSet = file_refs.files.keys().cloned().collect(); debug!( - "Get file refs for table {}, node {:?}, {} files", - table_id, + "Get file refs for region {}, node {:?}, {} files", + region_id, self.node_id, ref_file_set.len(), ); @@ -120,22 +96,19 @@ impl FileReferenceManager { #[allow(unused)] pub(crate) async fn get_snapshot_of_unmanifested_refs( &self, - table_id: TableId, - region_map: &RegionMapRef, - ) -> Result { - let Some(ref_files) = self.ref_file_set(table_id) else { - return Ok(Default::default()); - }; - let region_list = region_map.list_regions(); - let table_regions = region_list - .iter() - .filter(|r| r.region_id().table_id() == table_id) - .collect::>(); + regions: Vec, + ) -> Result { + let mut ref_files = HashMap::new(); + for region_id in regions.iter().map(|r| r.region_id()) { + if let Some(files) = self.ref_file_set(region_id) { + ref_files.insert(region_id, files); + } + } let mut in_manifest_files = HashSet::new(); let mut manifest_version = HashMap::new(); - for r in &table_regions { + for r in ®ions { let manifest = r.manifest_ctx.manifest().await; let files = manifest.files.keys().cloned().collect::>(); in_manifest_files.extend(files); @@ -144,11 +117,18 @@ impl FileReferenceManager { let ref_files_excluding_in_manifest = ref_files .iter() - .filter(|f| !in_manifest_files.contains(&f.file_id)) - .cloned() - .collect::>(); - - Ok(TableFileRefsManifest { + .map(|(r, f)| { + ( + *r, + f.iter() + .filter_map(|f| { + (!in_manifest_files.contains(&f.file_id)).then_some(f.file_id) + }) + .collect::>(), + ) + }) + .collect(); + Ok(FileRefsManifest { file_refs: ref_files_excluding_in_manifest, manifest_version, }) @@ -158,12 +138,12 @@ impl FileReferenceManager { /// Also records the access layer for the table if not exists. /// The access layer will be used to upload ref file to object storage. pub fn add_file(&self, file_meta: &FileMeta) { - let table_id = file_meta.region_id.table_id(); + let region_id = file_meta.region_id; let mut is_new = false; { let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id); - self.files_per_table - .entry(table_id) + self.files_per_region + .entry(region_id) .and_modify(|refs| { refs.files .entry(file_ref.clone()) @@ -173,7 +153,7 @@ impl FileReferenceManager { 1 }); }) - .or_insert_with(|| TableFileRefs { + .or_insert_with(|| RegionFileRefs { files: HashMap::from_iter([(file_ref, 1)]), }); } @@ -185,14 +165,14 @@ impl FileReferenceManager { /// Removes a file reference. /// If the reference count reaches zero, the file reference will be removed from the manager. 
pub fn remove_file(&self, file_meta: &FileMeta) { - let table_id = file_meta.region_id.table_id(); + let region_id = file_meta.region_id; let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id); let mut remove_table_entry = false; let mut remove_file_ref = false; let mut file_cnt = 0; - let table_ref = self.files_per_table.entry(table_id).and_modify(|refs| { + let region_ref = self.files_per_region.entry(region_id).and_modify(|refs| { let entry = refs.files.entry(file_ref.clone()).and_modify(|count| { if *count > 0 { *count -= 1; @@ -214,7 +194,7 @@ impl FileReferenceManager { } }); - if let Entry::Occupied(o) = table_ref + if let Entry::Occupied(o) = region_ref && remove_table_entry { o.remove_entry(); @@ -234,7 +214,7 @@ mod tests { use std::num::NonZeroU64; use smallvec::SmallVec; - use store_api::storage::RegionId; + use store_api::storage::{FileId, RegionId}; use super::*; use crate::sst::file::{FileMeta, FileTimeRange, IndexType, RegionFileId}; @@ -265,54 +245,69 @@ mod tests { file_ref_mgr.add_file(&file_meta); assert_eq!( - file_ref_mgr.files_per_table.get(&0).unwrap().files, + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .unwrap() + .files, HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)]) ); file_ref_mgr.add_file(&file_meta); - let expected_table_ref_manifest = + let expected_region_ref_manifest = HashSet::from_iter([FileRef::new(file_meta.region_id, file_meta.file_id)]); assert_eq!( - file_ref_mgr.ref_file_set(0).unwrap(), - expected_table_ref_manifest + file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(), + expected_region_ref_manifest ); assert_eq!( - file_ref_mgr.files_per_table.get(&0).unwrap().files, + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .unwrap() + .files, HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 2)]) ); assert_eq!( - file_ref_mgr.ref_file_set(0).unwrap(), - expected_table_ref_manifest + file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(), + expected_region_ref_manifest ); file_ref_mgr.remove_file(&file_meta); assert_eq!( - file_ref_mgr.files_per_table.get(&0).unwrap().files, + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .unwrap() + .files, HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)]) ); assert_eq!( - file_ref_mgr.ref_file_set(0).unwrap(), - expected_table_ref_manifest + file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(), + expected_region_ref_manifest ); file_ref_mgr.remove_file(&file_meta); assert!( - file_ref_mgr.files_per_table.get(&0).is_none(), + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .is_none(), "{:?}", - file_ref_mgr.files_per_table + file_ref_mgr.files_per_region ); assert!( - file_ref_mgr.ref_file_set(0).is_none(), + file_ref_mgr.ref_file_set(file_meta.region_id).is_none(), "{:?}", - file_ref_mgr.files_per_table + file_ref_mgr.files_per_region ); } } diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 322141fd1b..75aff36b52 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -58,6 +58,7 @@ use crate::compaction::CompactionScheduler; use crate::config::MitoConfig; use crate::error::{self, CreateDirSnafu, JoinSnafu, Result, WorkerStoppedSnafu}; use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef}; +use crate::gc::{GcLimiter, GcLimiterRef}; use crate::memtable::MemtableBuilderProvider; use crate::metrics::{REGION_COUNT, REQUEST_WAIT_TIME, WRITE_STALLING}; use 
crate::region::opener::PartitionExprFetcherRef; @@ -138,6 +139,8 @@ pub(crate) struct WorkerGroup { cache_manager: CacheManagerRef, /// File reference manager. file_ref_manager: FileReferenceManagerRef, + /// Gc limiter to limit concurrent gc jobs. + gc_limiter: GcLimiterRef, } impl WorkerGroup { @@ -196,6 +199,7 @@ impl WorkerGroup { .build(), ); let time_provider = Arc::new(StdTimeProvider); + let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job)); let workers = (0..config.num_workers) .map(|id| { @@ -234,6 +238,7 @@ impl WorkerGroup { purge_scheduler, cache_manager, file_ref_manager, + gc_limiter, }) } @@ -291,6 +296,10 @@ impl WorkerGroup { self.file_ref_manager.clone() } + pub(crate) fn gc_limiter(&self) -> GcLimiterRef { + self.gc_limiter.clone() + } + /// Get worker for specific `region_id`. pub(crate) fn worker(&self, region_id: RegionId) -> &RegionWorker { let index = region_id_to_index(region_id, self.workers.len()); @@ -361,6 +370,7 @@ impl WorkerGroup { .write_cache(write_cache) .build(), ); + let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job)); let workers = (0..config.num_workers) .map(|id| { WorkerStarter { @@ -398,6 +408,7 @@ impl WorkerGroup { purge_scheduler, cache_manager, file_ref_manager, + gc_limiter, }) } diff --git a/src/store-api/src/storage.rs b/src/store-api/src/storage.rs index 1df7a0aff6..2cafaf027c 100644 --- a/src/store-api/src/storage.rs +++ b/src/store-api/src/storage.rs @@ -26,6 +26,6 @@ pub use datatypes::schema::{ }; pub use self::descriptors::*; -pub use self::file::{FileId, ParseIdError}; +pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, ParseIdError}; pub use self::requests::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector}; pub use self::types::{SequenceNumber, SequenceRange}; diff --git a/src/store-api/src/storage/descriptors.rs b/src/store-api/src/storage/descriptors.rs index 84bdbdf7a8..ae4f617b88 100644 --- a/src/store-api/src/storage/descriptors.rs +++ b/src/store-api/src/storage/descriptors.rs @@ -48,7 +48,24 @@ pub const MAX_REGION_SEQ: u32 = REGION_SEQ_MASK; /// Region Number(32) /// ``` #[derive(Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -pub struct RegionId(u64); +pub struct RegionId(#[serde(deserialize_with = "str_or_u64")] u64); + +/// FIXME(discord9): workaround for serde issue: https://github.com/serde-rs/json/issues/1254 +fn str_or_u64<'de, D>(deserializer: D) -> Result +where + D: serde::Deserializer<'de>, +{ + #[derive(Deserialize)] + #[serde(untagged)] + enum StrOrU64 { + U64(u64), + Str(String), + } + match StrOrU64::deserialize(deserializer)? { + StrOrU64::U64(v) => Ok(v), + StrOrU64::Str(s) => s.parse::().map_err(serde::de::Error::custom), + } +} impl RegionId { /// Construct a new [RegionId] from table id and region number. 
@@ -328,6 +345,13 @@ mod tests { assert_eq!(region_id, parsed); } + #[test] + fn test_region_id_from_str() { + let region_id_str = "\"8589934602\""; + let region_id: RegionId = serde_json::from_str(region_id_str).unwrap(); + assert_eq!(RegionId::new(2, 10), region_id); + } + #[test] fn test_retrieve_region_group_and_seq() { let region_id = RegionId::with_group_and_seq(111, 222, 333); diff --git a/src/store-api/src/storage/file.rs b/src/store-api/src/storage/file.rs index 6e2fa334e4..a028ec0401 100644 --- a/src/store-api/src/storage/file.rs +++ b/src/store-api/src/storage/file.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::{HashMap, HashSet}; use std::fmt; use std::fmt::Debug; use std::str::FromStr; @@ -20,6 +21,9 @@ use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use uuid::Uuid; +use crate::ManifestVersion; +use crate::storage::RegionId; + #[derive(Debug, Snafu, PartialEq)] pub struct ParseIdError { source: uuid::Error, @@ -66,6 +70,60 @@ impl FromStr for FileId { } } +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct FileRef { + pub region_id: RegionId, + pub file_id: FileId, +} + +impl FileRef { + pub fn new(region_id: RegionId, file_id: FileId) -> Self { + Self { region_id, file_id } + } +} + +/// The tmp file manifest which record a table's file references. +/// Also record the manifest version when these tmp files are read. +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct FileRefsManifest { + pub file_refs: HashMap>, + /// Manifest version when this manifest is read for it's files + pub manifest_version: HashMap, +} + +#[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GcReport { + /// deleted files per region + pub deleted_files: HashMap>, + /// Regions that need retry in next gc round, usually because their tmp ref files are outdated + pub need_retry_regions: HashSet, +} + +impl GcReport { + pub fn new( + deleted_files: HashMap>, + need_retry_regions: HashSet, + ) -> Self { + Self { + deleted_files, + need_retry_regions, + } + } + + pub fn merge(&mut self, other: GcReport) { + for (region, files) in other.deleted_files { + let self_files = self.deleted_files.entry(region).or_default(); + let dedup: HashSet = HashSet::from_iter( + std::mem::take(self_files) + .into_iter() + .chain(files.iter().cloned()), + ); + *self_files = dedup.into_iter().collect(); + } + self.need_retry_regions.extend(other.need_retry_regions); + } +} + #[cfg(test)] mod tests { @@ -92,4 +150,19 @@ mod tests { let parsed = serde_json::from_str(&json).unwrap(); assert_eq!(id, parsed); } + + #[test] + fn test_file_refs_manifest_serialization() { + let mut manifest = FileRefsManifest::default(); + let r0 = RegionId::new(1024, 1); + let r1 = RegionId::new(1024, 2); + manifest.file_refs.insert(r0, [FileId::random()].into()); + manifest.file_refs.insert(r1, [FileId::random()].into()); + manifest.manifest_version.insert(r0, 10); + manifest.manifest_version.insert(r1, 20); + + let json = serde_json::to_string(&manifest).unwrap(); + let parsed: FileRefsManifest = serde_json::from_str(&json).unwrap(); + assert_eq!(manifest, parsed); + } } diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 9113b356ae..bd193769ee 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1534,6 +1534,13 @@ mem_threshold_on_create = "auto" 
[region_engine.mito.memtable] type = "time_series" +[region_engine.mito.gc] +enable = false +lingering_time = "5m" +unknown_file_lingering_time = "6h" +max_concurrent_lister_per_gc_job = 32 +max_concurrent_gc_job = 4 + [[region_engine]] [region_engine.file] From f0afd675e3c23d78ff1265632902e4394a8796f8 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Tue, 28 Oct 2025 22:26:29 -0700 Subject: [PATCH 020/149] feat: objbench sub command for datanode (#7114) * feat/objbench-subcmd: ### Add Object Storage Benchmark Tool and Update Dependencies - **`Cargo.lock` & `Cargo.toml`**: Added dependencies for `colored`, `parquet`, and `pprof` to support new features. - **`datanode.rs`**: Introduced `ObjbenchCommand` for benchmarking object storage, including command-line options for configuration and execution. Added `StorageConfig` and `StorageConfigWrapper` for storage engine configuration. - **`datanode.rs`**: Implemented a stub for `build_object_store` function to initialize object storage. These changes introduce a new subcommand for object storage benchmarking and update dependencies to support additional functionality. Signed-off-by: Lei, HUANG * init Signed-off-by: Lei, HUANG * fix: code style and clippy * feat/objbench-subcmd: Improve error handling in `objbench.rs` - Enhanced error handling in `parse_config` and `parse_file_dir_components` functions by replacing `unwrap` with `OptionExt` and `context` for better error messages. - Updated `build_access_layer_simple` and `build_cache_manager` functions to use `map_err` for more descriptive error handling. Signed-off-by: Lei, HUANG * chore: rebase main Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- Cargo.lock | 14 + src/cmd/Cargo.toml | 5 + src/cmd/src/bin/greptime.rs | 15 +- src/cmd/src/datanode.rs | 47 +- src/cmd/src/datanode/objbench.rs | 676 ++++++++++++++++++++++++++ src/datanode/src/store.rs | 18 +- src/mito2/src/access_layer.rs | 20 +- src/mito2/src/cache/write_cache.rs | 24 +- src/mito2/src/compaction/compactor.rs | 9 +- src/mito2/src/flush.rs | 20 +- src/mito2/src/lib.rs | 2 +- src/mito2/src/sst/file.rs | 4 +- src/mito2/src/sst/index.rs | 6 +- src/mito2/src/sst/parquet.rs | 30 +- src/mito2/src/sst/parquet/reader.rs | 9 +- src/mito2/src/sst/parquet/writer.rs | 21 +- src/mito2/src/worker.rs | 2 +- 17 files changed, 834 insertions(+), 88 deletions(-) create mode 100644 src/cmd/src/datanode/objbench.rs diff --git a/Cargo.lock b/Cargo.lock index 71caf8c8b0..7b076d9273 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1896,6 +1896,7 @@ dependencies = [ "clap 4.5.40", "cli", "client", + "colored", "common-base", "common-catalog", "common-config", @@ -1917,6 +1918,7 @@ dependencies = [ "common-wal", "datanode", "datatypes", + "either", "etcd-client", "file-engine", "flow", @@ -1932,7 +1934,9 @@ dependencies = [ "moka", "nu-ansi-term", "object-store", + "parquet", "plugins", + "pprof", "prometheus", "prost 0.13.5", "query", @@ -1975,6 +1979,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "comfy-table" version = "7.1.2" diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 744d13faeb..94dc3da56b 
100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -29,9 +29,11 @@ base64.workspace = true cache.workspace = true catalog.workspace = true chrono.workspace = true +either = "1.15" clap.workspace = true cli.workspace = true client.workspace = true +colored = "2.1.0" common-base.workspace = true common-catalog.workspace = true common-config.workspace = true @@ -63,10 +65,13 @@ lazy_static.workspace = true meta-client.workspace = true meta-srv.workspace = true metric-engine.workspace = true +mito2.workspace = true moka.workspace = true nu-ansi-term = "0.46" object-store.workspace = true +parquet = { workspace = true, features = ["object_store"] } plugins.workspace = true +pprof = "0.14.0" prometheus.workspace = true prost.workspace = true query.workspace = true diff --git a/src/cmd/src/bin/greptime.rs b/src/cmd/src/bin/greptime.rs index cf72b3d32f..f6bbebf7fb 100644 --- a/src/cmd/src/bin/greptime.rs +++ b/src/cmd/src/bin/greptime.rs @@ -103,12 +103,15 @@ async fn main_body() -> Result<()> { async fn start(cli: Command) -> Result<()> { match cli.subcmd { - SubCommand::Datanode(cmd) => { - let opts = cmd.load_options(&cli.global_options)?; - let plugins = Plugins::new(); - let builder = InstanceBuilder::try_new_with_init(opts, plugins).await?; - cmd.build_with(builder).await?.run().await - } + SubCommand::Datanode(cmd) => match cmd.subcmd { + datanode::SubCommand::Start(ref start) => { + let opts = start.load_options(&cli.global_options)?; + let plugins = Plugins::new(); + let builder = InstanceBuilder::try_new_with_init(opts, plugins).await?; + cmd.build_with(builder).await?.run().await + } + datanode::SubCommand::Objbench(ref bench) => bench.run().await, + }, SubCommand::Flownode(cmd) => { cmd.build(cmd.load_options(&cli.global_options)?) .await? diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 641d3fc5fd..23ca644ffc 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -13,6 +13,8 @@ // limitations under the License. 
pub mod builder; +#[allow(clippy::print_stdout)] +mod objbench; use std::path::Path; use std::time::Duration; @@ -23,13 +25,16 @@ use common_config::Configurable; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_telemetry::{info, warn}; use common_wal::config::DatanodeWalConfig; +use datanode::config::RegionEngineConfig; use datanode::datanode::Datanode; use meta_client::MetaClientOptions; +use serde::{Deserialize, Serialize}; use snafu::{ResultExt, ensure}; use tracing_appender::non_blocking::WorkerGuard; use crate::App; use crate::datanode::builder::InstanceBuilder; +use crate::datanode::objbench::ObjbenchCommand; use crate::error::{ LoadLayeredConfigSnafu, MissingConfigSnafu, Result, ShutdownDatanodeSnafu, StartDatanodeSnafu, }; @@ -89,7 +94,7 @@ impl App for Instance { #[derive(Parser)] pub struct Command { #[clap(subcommand)] - subcmd: SubCommand, + pub subcmd: SubCommand, } impl Command { @@ -100,13 +105,26 @@ impl Command { pub fn load_options(&self, global_options: &GlobalOptions) -> Result { match &self.subcmd { SubCommand::Start(cmd) => cmd.load_options(global_options), + SubCommand::Objbench(_) => { + // For objbench command, we don't need to load DatanodeOptions + // It's a standalone utility command + let mut opts = datanode::config::DatanodeOptions::default(); + opts.sanitize(); + Ok(DatanodeOptions { + runtime: Default::default(), + plugins: Default::default(), + component: opts, + }) + } } } } #[derive(Parser)] -enum SubCommand { +pub enum SubCommand { Start(StartCommand), + /// Object storage benchmark tool + Objbench(ObjbenchCommand), } impl SubCommand { @@ -116,12 +134,33 @@ impl SubCommand { info!("Building datanode with {:#?}", cmd); builder.build().await } + SubCommand::Objbench(cmd) => { + cmd.run().await?; + std::process::exit(0); + } } } } +/// Storage engine config +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +#[serde(default)] +pub struct StorageConfig { + /// The working directory of database + pub data_home: String, + #[serde(flatten)] + pub store: object_store::config::ObjectStoreConfig, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +#[serde(default)] +struct StorageConfigWrapper { + storage: StorageConfig, + region_engine: Vec, +} + #[derive(Debug, Parser, Default)] -struct StartCommand { +pub struct StartCommand { #[clap(long)] node_id: Option, /// The address to bind the gRPC server. @@ -149,7 +188,7 @@ struct StartCommand { } impl StartCommand { - fn load_options(&self, global_options: &GlobalOptions) -> Result { + pub fn load_options(&self, global_options: &GlobalOptions) -> Result { let mut opts = DatanodeOptions::load_layered_options( self.config_file.as_deref(), self.env_prefix.as_ref(), diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs new file mode 100644 index 0000000000..564e8c744b --- /dev/null +++ b/src/cmd/src/datanode/objbench.rs @@ -0,0 +1,676 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use clap::Parser; +use colored::Colorize; +use datanode::config::RegionEngineConfig; +use datanode::store; +use either::Either; +use mito2::access_layer::{ + AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType, +}; +use mito2::cache::{CacheManager, CacheManagerRef}; +use mito2::config::{FulltextIndexConfig, MitoConfig, Mode}; +use mito2::read::Source; +use mito2::sst::file::{FileHandle, FileMeta}; +use mito2::sst::file_purger::{FilePurger, FilePurgerRef}; +use mito2::sst::index::intermediate::IntermediateManager; +use mito2::sst::index::puffin_manager::PuffinManagerFactory; +use mito2::sst::parquet::reader::ParquetReaderBuilder; +use mito2::sst::parquet::{PARQUET_METADATA_KEY, WriteOptions}; +use mito2::worker::write_cache_from_config; +use object_store::ObjectStore; +use regex::Regex; +use snafu::OptionExt; +use store_api::metadata::{RegionMetadata, RegionMetadataRef}; +use store_api::path_utils::region_name; +use store_api::region_request::PathType; +use store_api::storage::FileId; + +use crate::datanode::{StorageConfig, StorageConfigWrapper}; +use crate::error; + +/// Object storage benchmark command +#[derive(Debug, Parser)] +pub struct ObjbenchCommand { + /// Path to the object-store config file (TOML). Must deserialize into object_store::config::ObjectStoreConfig. + #[clap(long, value_name = "FILE")] + pub config: PathBuf, + + /// Source SST file path in object-store (e.g. "region_dir/.parquet"). + #[clap(long, value_name = "PATH")] + pub source: String, + + /// Verbose output + #[clap(short, long, default_value_t = false)] + pub verbose: bool, + + /// Output file path for pprof flamegraph (enables profiling) + #[clap(long, value_name = "FILE")] + pub pprof_file: Option, +} + +fn parse_config(config_path: &PathBuf) -> error::Result<(StorageConfig, MitoConfig)> { + let cfg_str = std::fs::read_to_string(config_path).map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("failed to read config {}: {e}", config_path.display()), + } + .build() + })?; + + let store_cfg: StorageConfigWrapper = toml::from_str(&cfg_str).map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("failed to parse config {}: {e}", config_path.display()), + } + .build() + })?; + + let storage_config = store_cfg.storage; + let mito_engine_config = store_cfg + .region_engine + .into_iter() + .filter_map(|c| { + if let RegionEngineConfig::Mito(mito) = c { + Some(mito) + } else { + None + } + }) + .next() + .with_context(|| error::IllegalConfigSnafu { + msg: format!("Engine config not found in {:?}", config_path), + })?; + Ok((storage_config, mito_engine_config)) +} + +impl ObjbenchCommand { + pub async fn run(&self) -> error::Result<()> { + if self.verbose { + common_telemetry::init_default_ut_logging(); + } + + println!("{}", "Starting objbench with config:".cyan().bold()); + + // Build object store from config + let (store_cfg, mut mito_engine_config) = parse_config(&self.config)?; + + let object_store = build_object_store(&store_cfg).await?; + println!("{} Object store initialized", "✓".green()); + + // Prepare source identifiers + let components = parse_file_dir_components(&self.source)?; + println!( + "{} Source path parsed: {}, components: {:?}", + "✓".green(), + self.source, + components + ); + + // Load parquet metadata to extract RegionMetadata and file stats + println!("{}", "Loading parquet 
metadata...".yellow()); + let file_size = object_store + .stat(&self.source) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("stat failed: {e}"), + } + .build() + })? + .content_length(); + let parquet_meta = load_parquet_metadata(object_store.clone(), &self.source, file_size) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("read parquet metadata failed: {e}"), + } + .build() + })?; + + let region_meta = extract_region_metadata(&self.source, &parquet_meta)?; + let num_rows = parquet_meta.file_metadata().num_rows() as u64; + let num_row_groups = parquet_meta.num_row_groups() as u64; + + println!( + "{} Metadata loaded - rows: {}, size: {} bytes", + "✓".green(), + num_rows, + file_size + ); + + // Build a FileHandle for the source file + let file_meta = FileMeta { + region_id: region_meta.region_id, + file_id: components.file_id, + time_range: Default::default(), + level: 0, + file_size, + available_indexes: Default::default(), + index_file_size: 0, + num_rows, + num_row_groups, + sequence: None, + partition_expr: None, + num_series: 0, + }; + let src_handle = FileHandle::new(file_meta, new_noop_file_purger()); + + // Build the reader for a single file via ParquetReaderBuilder + let table_dir = components.table_dir(); + let (src_access_layer, cache_manager) = build_access_layer_simple( + &components, + object_store.clone(), + &mut mito_engine_config, + &store_cfg.data_home, + ) + .await?; + let reader_build_start = Instant::now(); + + let reader = ParquetReaderBuilder::new( + table_dir, + components.path_type, + src_handle.clone(), + object_store.clone(), + ) + .expected_metadata(Some(region_meta.clone())) + .build() + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("build reader failed: {e:?}"), + } + .build() + })?; + + let reader_build_elapsed = reader_build_start.elapsed(); + let total_rows = reader.parquet_metadata().file_metadata().num_rows(); + println!("{} Reader built in {:?}", "✓".green(), reader_build_elapsed); + + // Build write request + let fulltext_index_config = FulltextIndexConfig { + create_on_compaction: Mode::Disable, + ..Default::default() + }; + + let write_req = SstWriteRequest { + op_type: OperationType::Flush, + metadata: region_meta, + source: Either::Left(Source::Reader(Box::new(reader))), + cache_manager, + storage: None, + max_sequence: None, + index_options: Default::default(), + index_config: mito_engine_config.index.clone(), + inverted_index_config: MitoConfig::default().inverted_index, + fulltext_index_config, + bloom_filter_index_config: MitoConfig::default().bloom_filter_index, + }; + + // Write SST + println!("{}", "Writing SST...".yellow()); + + // Start profiling if pprof_file is specified + #[cfg(unix)] + let profiler_guard = if self.pprof_file.is_some() { + println!("{} Starting profiling...", "⚡".yellow()); + Some( + pprof::ProfilerGuardBuilder::default() + .frequency(99) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build() + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to start profiler: {e}"), + } + .build() + })?, + ) + } else { + None + }; + + #[cfg(not(unix))] + if self.pprof_file.is_some() { + eprintln!( + "{}: Profiling is not supported on this platform", + "Warning".yellow() + ); + } + + let write_start = Instant::now(); + let mut metrics = Metrics::new(WriteType::Flush); + let infos = src_access_layer + .write_sst(write_req, &WriteOptions::default(), &mut metrics) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: 
format!("write_sst failed: {e:?}"), + } + .build() + })?; + + let write_elapsed = write_start.elapsed(); + + // Stop profiling and generate flamegraph if enabled + #[cfg(unix)] + if let (Some(guard), Some(pprof_file)) = (profiler_guard, &self.pprof_file) { + println!("{} Generating flamegraph...", "🔥".yellow()); + match guard.report().build() { + Ok(report) => { + let mut flamegraph_data = Vec::new(); + if let Err(e) = report.flamegraph(&mut flamegraph_data) { + println!("{}: Failed to generate flamegraph: {}", "Error".red(), e); + } else if let Err(e) = std::fs::write(pprof_file, flamegraph_data) { + println!( + "{}: Failed to write flamegraph to {}: {}", + "Error".red(), + pprof_file.display(), + e + ); + } else { + println!( + "{} Flamegraph saved to {}", + "✓".green(), + pprof_file.display().to_string().cyan() + ); + } + } + Err(e) => { + println!("{}: Failed to generate pprof report: {}", "Error".red(), e); + } + } + } + assert_eq!(infos.len(), 1); + let dst_file_id = infos[0].file_id; + let dst_file_path = format!("{}/{}.parquet", components.region_dir(), dst_file_id); + let mut dst_index_path = None; + if infos[0].index_metadata.file_size > 0 { + dst_index_path = Some(format!( + "{}/index/{}.puffin", + components.region_dir(), + dst_file_id + )); + } + + // Report results with ANSI colors + println!("\n{} {}", "Write complete!".green().bold(), "✓".green()); + println!(" {}: {}", "Destination file".bold(), dst_file_path.cyan()); + println!(" {}: {}", "Rows".bold(), total_rows.to_string().cyan()); + println!( + " {}: {}", + "File size".bold(), + format!("{} bytes", file_size).cyan() + ); + println!( + " {}: {:?}", + "Reader build time".bold(), + reader_build_elapsed + ); + println!(" {}: {:?}", "Total time".bold(), write_elapsed); + + // Print metrics in a formatted way + println!(" {}: {:?}", "Metrics".bold(), metrics,); + + // Print infos + println!(" {}: {:?}", "Index".bold(), infos[0].index_metadata); + + // Cleanup + println!("\n{}", "Cleaning up...".yellow()); + object_store.delete(&dst_file_path).await.map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to delete dest file {}: {}", dst_file_path, e), + } + .build() + })?; + println!("{} Temporary file {} deleted", "✓".green(), dst_file_path); + + if let Some(index_path) = dst_index_path { + object_store.delete(&index_path).await.map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to delete dest index file {}: {}", index_path, e), + } + .build() + })?; + println!( + "{} Temporary index file {} deleted", + "✓".green(), + index_path + ); + } + + println!("\n{}", "Benchmark completed successfully!".green().bold()); + Ok(()) + } +} + +#[derive(Debug)] +struct FileDirComponents { + catalog: String, + schema: String, + table_id: u32, + region_sequence: u32, + path_type: PathType, + file_id: FileId, +} + +impl FileDirComponents { + fn table_dir(&self) -> String { + format!("data/{}/{}/{}", self.catalog, self.schema, self.table_id) + } + + fn region_dir(&self) -> String { + let region_name = region_name(self.table_id, self.region_sequence); + match self.path_type { + PathType::Bare => { + format!( + "data/{}/{}/{}/{}", + self.catalog, self.schema, self.table_id, region_name + ) + } + PathType::Data => { + format!( + "data/{}/{}/{}/{}/data", + self.catalog, self.schema, self.table_id, region_name + ) + } + PathType::Metadata => { + format!( + "data/{}/{}/{}/{}/metadata", + self.catalog, self.schema, self.table_id, region_name + ) + } + } + } +} + +fn parse_file_dir_components(path: &str) -> error::Result { + 
// Define the regex pattern to match all three path styles + let pattern = + r"^data/([^/]+)/([^/]+)/([^/]+)/([^/]+)_([^/]+)(?:/data|/metadata)?/(.+).parquet$"; + + // Compile the regex + let re = Regex::new(pattern).expect("Invalid regex pattern"); + + // Determine the path type + let path_type = if path.contains("/data/") { + PathType::Data + } else if path.contains("/metadata/") { + PathType::Metadata + } else { + PathType::Bare + }; + + // Try to match the path + let components = (|| { + let captures = re.captures(path)?; + if captures.len() != 7 { + return None; + } + let mut components = FileDirComponents { + catalog: "".to_string(), + schema: "".to_string(), + table_id: 0, + region_sequence: 0, + path_type, + file_id: FileId::default(), + }; + // Extract the components + components.catalog = captures.get(1)?.as_str().to_string(); + components.schema = captures.get(2)?.as_str().to_string(); + components.table_id = captures[3].parse().ok()?; + components.region_sequence = captures[5].parse().ok()?; + let file_id_str = &captures[6]; + components.file_id = FileId::parse_str(file_id_str).ok()?; + Some(components) + })(); + components.context(error::IllegalConfigSnafu { + msg: format!("Expect valid source file path, got: {}", path), + }) +} + +fn extract_region_metadata( + file_path: &str, + meta: &parquet::file::metadata::ParquetMetaData, +) -> error::Result { + use parquet::format::KeyValue; + let kvs: Option<&Vec> = meta.file_metadata().key_value_metadata(); + let Some(kvs) = kvs else { + return Err(error::IllegalConfigSnafu { + msg: format!("{file_path}: missing parquet key_value metadata"), + } + .build()); + }; + let json = kvs + .iter() + .find(|kv| kv.key == PARQUET_METADATA_KEY) + .and_then(|kv| kv.value.as_ref()) + .ok_or_else(|| { + error::IllegalConfigSnafu { + msg: format!("{file_path}: key {PARQUET_METADATA_KEY} not found or empty"), + } + .build() + })?; + let region: RegionMetadata = RegionMetadata::from_json(json).map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("invalid region metadata json: {e}"), + } + .build() + })?; + Ok(Arc::new(region)) +} + +async fn build_object_store(sc: &StorageConfig) -> error::Result { + store::new_object_store(sc.store.clone(), &sc.data_home) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build object store: {e:?}"), + } + .build() + }) +} + +async fn build_access_layer_simple( + components: &FileDirComponents, + object_store: ObjectStore, + config: &mut MitoConfig, + data_home: &str, +) -> error::Result<(AccessLayerRef, CacheManagerRef)> { + let _ = config.index.sanitize(data_home, &config.inverted_index); + let puffin_manager = PuffinManagerFactory::new( + &config.index.aux_path, + config.index.staging_size.as_bytes(), + Some(config.index.write_buffer_size.as_bytes() as _), + config.index.staging_ttl, + ) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build access layer: {e:?}"), + } + .build() + })?; + + let intermediate_manager = IntermediateManager::init_fs(&config.index.aux_path) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build IntermediateManager: {e:?}"), + } + .build() + })? 
+ .with_buffer_size(Some(config.index.write_buffer_size.as_bytes() as _)); + + let cache_manager = + build_cache_manager(config, puffin_manager.clone(), intermediate_manager.clone()).await?; + let layer = AccessLayer::new( + components.table_dir(), + components.path_type, + object_store, + puffin_manager, + intermediate_manager, + ); + Ok((Arc::new(layer), cache_manager)) +} + +async fn build_cache_manager( + config: &MitoConfig, + puffin_manager: PuffinManagerFactory, + intermediate_manager: IntermediateManager, +) -> error::Result { + let write_cache = write_cache_from_config(config, puffin_manager, intermediate_manager) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build write cache: {e:?}"), + } + .build() + })?; + let cache_manager = Arc::new( + CacheManager::builder() + .sst_meta_cache_size(config.sst_meta_cache_size.as_bytes()) + .vector_cache_size(config.vector_cache_size.as_bytes()) + .page_cache_size(config.page_cache_size.as_bytes()) + .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) + .index_metadata_size(config.index.metadata_cache_size.as_bytes()) + .index_content_size(config.index.content_cache_size.as_bytes()) + .index_content_page_size(config.index.content_cache_page_size.as_bytes()) + .index_result_cache_size(config.index.result_cache_size.as_bytes()) + .puffin_metadata_size(config.index.metadata_cache_size.as_bytes()) + .write_cache(write_cache) + .build(), + ); + Ok(cache_manager) +} + +fn new_noop_file_purger() -> FilePurgerRef { + #[derive(Debug)] + struct Noop; + impl FilePurger for Noop { + fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {} + } + Arc::new(Noop) +} + +async fn load_parquet_metadata( + object_store: ObjectStore, + path: &str, + file_size: u64, +) -> Result> { + use parquet::file::FOOTER_SIZE; + use parquet::file::metadata::ParquetMetaDataReader; + let actual_size = if file_size == 0 { + object_store.stat(path).await?.content_length() + } else { + file_size + }; + if actual_size < FOOTER_SIZE as u64 { + return Err("file too small".into()); + } + let prefetch: u64 = 64 * 1024; + let start = actual_size.saturating_sub(prefetch); + let buffer = object_store + .read_with(path) + .range(start..actual_size) + .await? + .to_vec(); + let buffer_len = buffer.len(); + let mut footer = [0; 8]; + footer.copy_from_slice(&buffer[buffer_len - FOOTER_SIZE..]); + let footer = ParquetMetaDataReader::decode_footer_tail(&footer)?; + let metadata_len = footer.metadata_length() as u64; + if actual_size - (FOOTER_SIZE as u64) < metadata_len { + return Err("invalid footer/metadata length".into()); + } + if (metadata_len as usize) <= buffer_len - FOOTER_SIZE { + let metadata_start = buffer_len - metadata_len as usize - FOOTER_SIZE; + let meta = ParquetMetaDataReader::decode_metadata( + &buffer[metadata_start..buffer_len - FOOTER_SIZE], + )?; + Ok(meta) + } else { + let metadata_start = actual_size - metadata_len - FOOTER_SIZE as u64; + let data = object_store + .read_with(path) + .range(metadata_start..(actual_size - FOOTER_SIZE as u64)) + .await? 
+ .to_vec(); + let meta = ParquetMetaDataReader::decode_metadata(&data)?; + Ok(meta) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + use std::str::FromStr; + + use common_base::readable_size::ReadableSize; + use store_api::region_request::PathType; + + use crate::datanode::objbench::{parse_config, parse_file_dir_components}; + + #[test] + fn test_parse_dir() { + let meta_path = "data/greptime/public/1024/1024_0000000000/metadata/00020380-009c-426d-953e-b4e34c15af34.parquet"; + let c = parse_file_dir_components(meta_path).unwrap(); + assert_eq!( + c.file_id.to_string(), + "00020380-009c-426d-953e-b4e34c15af34" + ); + assert_eq!(c.catalog, "greptime"); + assert_eq!(c.schema, "public"); + assert_eq!(c.table_id, 1024); + assert_eq!(c.region_sequence, 0); + assert_eq!(c.path_type, PathType::Metadata); + + let c = parse_file_dir_components( + "data/greptime/public/1024/1024_0000000000/data/00020380-009c-426d-953e-b4e34c15af34.parquet", + ).unwrap(); + assert_eq!( + c.file_id.to_string(), + "00020380-009c-426d-953e-b4e34c15af34" + ); + assert_eq!(c.catalog, "greptime"); + assert_eq!(c.schema, "public"); + assert_eq!(c.table_id, 1024); + assert_eq!(c.region_sequence, 0); + assert_eq!(c.path_type, PathType::Data); + + let c = parse_file_dir_components( + "data/greptime/public/1024/1024_0000000000/00020380-009c-426d-953e-b4e34c15af34.parquet", + ).unwrap(); + assert_eq!( + c.file_id.to_string(), + "00020380-009c-426d-953e-b4e34c15af34" + ); + assert_eq!(c.catalog, "greptime"); + assert_eq!(c.schema, "public"); + assert_eq!(c.table_id, 1024); + assert_eq!(c.region_sequence, 0); + assert_eq!(c.path_type, PathType::Bare); + } + + #[test] + fn test_parse_config() { + let path = "../../config/datanode.example.toml"; + let (storage, engine) = parse_config(&PathBuf::from_str(path).unwrap()).unwrap(); + assert_eq!(storage.data_home, "./greptimedb_data"); + assert_eq!(engine.index.staging_size, ReadableSize::gb(2)); + } +} diff --git a/src/datanode/src/store.rs b/src/datanode/src/store.rs index 8d1c8c99dc..6dc6f280c6 100644 --- a/src/datanode/src/store.rs +++ b/src/datanode/src/store.rs @@ -47,10 +47,7 @@ pub(crate) async fn new_object_store_without_cache( Ok(object_store) } -pub(crate) async fn new_object_store( - store: ObjectStoreConfig, - data_home: &str, -) -> Result { +pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Result { let object_store = new_raw_object_store(&store, data_home) .await .context(error::ObjectStoreSnafu)?; @@ -59,7 +56,7 @@ pub(crate) async fn new_object_store( let object_store = { // It's safe to unwrap here because we already checked above. let cache_config = store.cache_config().unwrap(); - if let Some(cache_layer) = build_cache_layer(cache_config).await? { + if let Some(cache_layer) = build_cache_layer(cache_config, data_home).await? { // Adds cache layer object_store.layer(cache_layer) } else { @@ -79,17 +76,22 @@ pub(crate) async fn new_object_store( async fn build_cache_layer( cache_config: &ObjectStorageCacheConfig, + data_home: &str, ) -> Result>> { // No need to build cache layer if read cache is disabled. 
if !cache_config.enable_read_cache { return Ok(None); } - - let atomic_temp_dir = join_dir(&cache_config.cache_path, ATOMIC_WRITE_DIR); + let cache_base_dir = if cache_config.cache_path.is_empty() { + data_home + } else { + &cache_config.cache_path + }; + let atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR); clean_temp_dir(&atomic_temp_dir).context(error::ObjectStoreSnafu)?; let cache_store = Fs::default() - .root(&cache_config.cache_path) + .root(cache_base_dir) .atomic_write_dir(&atomic_temp_dir) .build() .context(error::BuildCacheStoreSnafu)?; diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index e5401209ca..2eb8a2ea0e 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -72,7 +72,7 @@ pub struct Metrics { } impl Metrics { - pub(crate) fn new(write_type: WriteType) -> Self { + pub fn new(write_type: WriteType) -> Self { Self { write_type, iter_source: Default::default(), @@ -255,12 +255,12 @@ impl AccessLayer { &self, request: SstWriteRequest, write_opts: &WriteOptions, - write_type: WriteType, - ) -> Result<(SstInfoArray, Metrics)> { + metrics: &mut Metrics, + ) -> Result { let region_id = request.metadata.region_id; let cache_manager = request.cache_manager.clone(); - let (sst_info, metrics) = if let Some(write_cache) = cache_manager.write_cache() { + let sst_info = if let Some(write_cache) = cache_manager.write_cache() { // Write to the write cache. write_cache .write_and_upload_sst( @@ -273,7 +273,7 @@ impl AccessLayer { remote_store: self.object_store.clone(), }, write_opts, - write_type, + metrics, ) .await? } else { @@ -303,11 +303,11 @@ impl AccessLayer { request.index_config, indexer_builder, path_provider, - Metrics::new(write_type), + metrics, ) .await .with_file_cleaner(cleaner); - let ssts = match request.source { + match request.source { Either::Left(source) => { writer .write_all(source, request.max_sequence, write_opts) @@ -316,9 +316,7 @@ impl AccessLayer { Either::Right(flat_source) => { writer.write_all_flat(flat_source, write_opts).await? } - }; - let metrics = writer.into_metrics(); - (ssts, metrics) + } }; // Put parquet metadata to cache manager. @@ -333,7 +331,7 @@ impl AccessLayer { } } - Ok((sst_info, metrics)) + Ok(sst_info) } /// Puts encoded SST bytes to the write cache (if enabled) and uploads it to the object store. diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index d2b7e34997..96f2030562 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -169,8 +169,8 @@ impl WriteCache { write_request: SstWriteRequest, upload_request: SstUploadRequest, write_opts: &WriteOptions, - write_type: WriteType, - ) -> Result<(SstInfoArray, Metrics)> { + metrics: &mut Metrics, + ) -> Result { let region_id = write_request.metadata.region_id; let store = self.file_cache.local_store(); @@ -197,7 +197,7 @@ impl WriteCache { write_request.index_config, indexer, path_provider.clone(), - Metrics::new(write_type), + metrics, ) .await .with_file_cleaner(cleaner); @@ -210,11 +210,10 @@ impl WriteCache { } either::Right(flat_source) => writer.write_all_flat(flat_source, write_opts).await?, }; - let mut metrics = writer.into_metrics(); // Upload sst file to remote object store. 
if sst_info.is_empty() { - return Ok((sst_info, metrics)); + return Ok(sst_info); } let mut upload_tracker = UploadTracker::new(region_id); @@ -256,7 +255,7 @@ impl WriteCache { return Err(err); } - Ok((sst_info, metrics)) + Ok(sst_info) } /// Removes a file from the cache by `index_key`. @@ -559,8 +558,9 @@ mod tests { }; // Write to cache and upload sst to mock remote store - let (mut sst_infos, _) = write_cache - .write_and_upload_sst(write_request, upload_request, &write_opts, WriteType::Flush) + let mut metrics = Metrics::new(WriteType::Flush); + let mut sst_infos = write_cache + .write_and_upload_sst(write_request, upload_request, &write_opts, &mut metrics) .await .unwrap(); let sst_info = sst_infos.remove(0); @@ -655,8 +655,9 @@ mod tests { remote_store: mock_store.clone(), }; - let (mut sst_infos, _) = write_cache - .write_and_upload_sst(write_request, upload_request, &write_opts, WriteType::Flush) + let mut metrics = Metrics::new(WriteType::Flush); + let mut sst_infos = write_cache + .write_and_upload_sst(write_request, upload_request, &write_opts, &mut metrics) .await .unwrap(); let sst_info = sst_infos.remove(0); @@ -735,8 +736,9 @@ mod tests { remote_store: mock_store.clone(), }; + let mut metrics = Metrics::new(WriteType::Flush); write_cache - .write_and_upload_sst(write_request, upload_request, &write_opts, WriteType::Flush) + .write_and_upload_sst(write_request, upload_request, &write_opts, &mut metrics) .await .unwrap_err(); let atomic_write_dir = write_cache_dir.path().join(ATOMIC_WRITE_DIR); diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index 2b871947c0..52b8fe068a 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -30,7 +30,9 @@ use store_api::metadata::RegionMetadataRef; use store_api::region_request::PathType; use store_api::storage::RegionId; -use crate::access_layer::{AccessLayer, AccessLayerRef, OperationType, SstWriteRequest, WriteType}; +use crate::access_layer::{ + AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType, +}; use crate::cache::{CacheManager, CacheManagerRef}; use crate::compaction::picker::{PickerOutput, new_picker}; use crate::compaction::{CompactionSstReaderBuilder, find_ttl}; @@ -387,7 +389,8 @@ impl Compactor for DefaultCompactor { let reader = builder.build_sst_reader().await?; either::Left(Source::Reader(reader)) }; - let (sst_infos, metrics) = sst_layer + let mut metrics = Metrics::new(WriteType::Compaction); + let sst_infos = sst_layer .write_sst( SstWriteRequest { op_type: OperationType::Compact, @@ -403,7 +406,7 @@ impl Compactor for DefaultCompactor { bloom_filter_index_config, }, &write_opts, - WriteType::Compaction, + &mut metrics, ) .await?; // Convert partition expression once outside the map diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index ddad947f8a..0b0b4b05db 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -525,21 +525,19 @@ impl RegionFlushTask { let source = Either::Left(source); let write_request = self.new_write_request(version, max_sequence, source); - let (ssts_written, metrics) = self + let mut metrics = Metrics::new(WriteType::Flush); + let ssts_written = self .access_layer - .write_sst(write_request, &write_opts, WriteType::Flush) + .write_sst(write_request, &write_opts, &mut metrics) .await?; if ssts_written.is_empty() { // No data written. 
continue; } - common_telemetry::debug!( + debug!( "Region {} flush one memtable, num_mem_ranges: {}, num_rows: {}, metrics: {:?}", - self.region_id, - num_mem_ranges, - num_mem_rows, - metrics + self.region_id, num_mem_ranges, num_mem_rows, metrics ); flush_metrics = flush_metrics.merge(metrics); @@ -591,9 +589,11 @@ impl RegionFlushTask { let semaphore = self.flush_semaphore.clone(); let task = common_runtime::spawn_global(async move { let _permit = semaphore.acquire().await.unwrap(); - access_layer - .write_sst(write_request, &write_opts, WriteType::Flush) - .await + let mut metrics = Metrics::new(WriteType::Flush); + let ssts = access_layer + .write_sst(write_request, &write_opts, &mut metrics) + .await?; + Ok((ssts, metrics)) }); tasks.push(task); } diff --git a/src/mito2/src/lib.rs b/src/mito2/src/lib.rs index 45ce635148..a15711b34a 100644 --- a/src/mito2/src/lib.rs +++ b/src/mito2/src/lib.rs @@ -47,7 +47,7 @@ pub mod schedule; pub mod sst; mod time_provider; pub mod wal; -mod worker; +pub mod worker; #[cfg_attr(doc, aquamarine::aquamarine)] /// # Mito developer document diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index ae255e9407..dc5727f9cc 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -295,8 +295,8 @@ impl FileHandle { } /// Returns the complete file path of the file. - pub fn file_path(&self, file_dir: &str, path_type: PathType) -> String { - location::sst_file_path(file_dir, self.file_id(), path_type) + pub fn file_path(&self, table_dir: &str, path_type: PathType) -> String { + location::sst_file_path(table_dir, self.file_id(), path_type) } /// Returns the time range of the file. diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index cc8469332a..e67a4b6e98 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -791,7 +791,7 @@ mod tests { use tokio::sync::mpsc; use super::*; - use crate::access_layer::{FilePathProvider, SstWriteRequest, WriteType}; + use crate::access_layer::{FilePathProvider, Metrics, SstWriteRequest, WriteType}; use crate::cache::write_cache::WriteCache; use crate::config::{FulltextIndexConfig, IndexBuildMode, MitoConfig, Mode}; use crate::memtable::time_partition::TimePartitions; @@ -927,11 +927,11 @@ mod tests { fulltext_index_config: Default::default(), bloom_filter_index_config: Default::default(), }; + let mut metrics = Metrics::new(WriteType::Flush); env.access_layer - .write_sst(write_request, &WriteOptions::default(), WriteType::Flush) + .write_sst(write_request, &WriteOptions::default(), &mut metrics) .await .unwrap() - .0 .remove(0) } diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 83cd17acc8..5a11a15e70 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -181,13 +181,14 @@ mod tests { ..Default::default() }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), NoopIndexBuilder, file_path, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -243,6 +244,7 @@ mod tests { ..Default::default() }; // Prepare data. 
+ let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -251,7 +253,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -329,6 +331,7 @@ mod tests { // write the sst file and get sst info // sst info contains the parquet metadata, which is converted from FileMetaData + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -337,7 +340,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -378,6 +381,7 @@ mod tests { ..Default::default() }; // Prepare data. + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -386,7 +390,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; writer @@ -437,6 +441,7 @@ mod tests { ..Default::default() }; // Prepare data. + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -445,7 +450,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; writer @@ -481,6 +486,7 @@ mod tests { ..Default::default() }; // Prepare data. + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -489,7 +495,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -639,13 +645,14 @@ mod tests { table_dir: "test".to_string(), path_type: PathType::Bare, }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), NoopIndexBuilder, path_provider, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -716,13 +723,14 @@ mod tests { bloom_filter_index_config: Default::default(), }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), indexer_builder, file_path.clone(), - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -1092,13 +1100,14 @@ mod tests { bloom_filter_index_config: Default::default(), }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), indexer_builder, file_path.clone(), - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -1148,13 +1157,14 @@ mod tests { ..Default::default() }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), NoopIndexBuilder, file_path, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 60cf654380..21fc6511f8 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -91,7 +91,7 @@ macro_rules! handle_index_error { /// Parquet SST reader builder. 
pub struct ParquetReaderBuilder { /// SST directory. - file_dir: String, + table_dir: String, /// Path type for generating file paths. path_type: PathType, file_handle: FileHandle, @@ -122,13 +122,13 @@ pub struct ParquetReaderBuilder { impl ParquetReaderBuilder { /// Returns a new [ParquetReaderBuilder] to read specific SST. pub fn new( - file_dir: String, + table_dir: String, path_type: PathType, file_handle: FileHandle, object_store: ObjectStore, ) -> ParquetReaderBuilder { ParquetReaderBuilder { - file_dir, + table_dir, path_type, file_handle, object_store, @@ -237,7 +237,7 @@ impl ParquetReaderBuilder { ) -> Result<(FileRangeContext, RowGroupSelection)> { let start = Instant::now(); - let file_path = self.file_handle.file_path(&self.file_dir, self.path_type); + let file_path = self.file_handle.file_path(&self.table_dir, self.path_type); let file_size = self.file_handle.meta_ref().file_size; // Loads parquet metadata of the file. @@ -1227,7 +1227,6 @@ impl ParquetReader { self.context.read_format().metadata() } - #[cfg(test)] pub fn parquet_metadata(&self) -> Arc { self.context.reader_builder().parquet_meta.clone() } diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index d52615690f..857ec08878 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -62,7 +62,7 @@ use crate::sst::{ }; /// Parquet SST writer. -pub struct ParquetWriter { +pub struct ParquetWriter<'a, F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> { /// Path provider that creates SST and index file paths according to file id. path_provider: P, writer: Option>>, @@ -81,7 +81,7 @@ pub struct ParquetWriter, /// Write metrics - metrics: Metrics, + metrics: &'a mut Metrics, } pub trait WriterFactory { @@ -107,7 +107,7 @@ impl WriterFactory for ObjectStoreWriterFactory { } } -impl ParquetWriter +impl<'a, I, P> ParquetWriter<'a, ObjectStoreWriterFactory, I, P> where P: FilePathProvider, I: IndexerBuilder, @@ -118,8 +118,8 @@ where index_config: IndexConfig, indexer_builder: I, path_provider: P, - metrics: Metrics, - ) -> ParquetWriter { + metrics: &'a mut Metrics, + ) -> ParquetWriter<'a, ObjectStoreWriterFactory, I, P> { ParquetWriter::new( ObjectStoreWriterFactory { object_store }, metadata, @@ -137,7 +137,7 @@ where } } -impl ParquetWriter +impl<'a, F, I, P> ParquetWriter<'a, F, I, P> where F: WriterFactory, I: IndexerBuilder, @@ -150,8 +150,8 @@ where index_config: IndexConfig, indexer_builder: I, path_provider: P, - metrics: Metrics, - ) -> ParquetWriter { + metrics: &'a mut Metrics, + ) -> ParquetWriter<'a, F, I, P> { let init_file = FileId::random(); let indexer = indexer_builder.build(init_file).await; @@ -487,11 +487,6 @@ where Ok(self.writer.as_mut().unwrap()) } } - - /// Consumes write and return the collected metrics. 
- pub fn into_metrics(self) -> Metrics { - self.metrics - } } #[derive(Default)] diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 75aff36b52..adb88f3467 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -423,7 +423,7 @@ fn region_id_to_index(id: RegionId, num_workers: usize) -> usize { % num_workers } -async fn write_cache_from_config( +pub async fn write_cache_from_config( config: &MitoConfig, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, From 6576e3555dcd9a1d51e118d436a9bdd7dbb32378 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 29 Oct 2025 17:57:28 +0800 Subject: [PATCH 021/149] fix: cache estimate methods (#7157) * fix: cache estimate methods Signed-off-by: Ruihang Xia * revert page value change Signed-off-by: Ruihang Xia * Apply suggestion from @evenyag Co-authored-by: Yingwen * update test Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia Co-authored-by: Yingwen --- .../src/cache/index/bloom_filter_index.rs | 53 +++++++++++++++++-- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/src/mito2/src/cache/index/bloom_filter_index.rs b/src/mito2/src/cache/index/bloom_filter_index.rs index 9e8d864d7d..b4e7804b93 100644 --- a/src/mito2/src/cache/index/bloom_filter_index.rs +++ b/src/mito2/src/cache/index/bloom_filter_index.rs @@ -15,7 +15,7 @@ use std::ops::Range; use std::sync::Arc; -use api::v1::index::BloomFilterMeta; +use api::v1::index::{BloomFilterLoc, BloomFilterMeta}; use async_trait::async_trait; use bytes::Bytes; use index::bloom_filter::error::Result; @@ -60,11 +60,17 @@ impl BloomFilterIndexCache { /// Calculates weight for bloom filter index metadata. fn bloom_filter_index_metadata_weight( k: &(FileId, ColumnId, Tag), - _: &Arc, + meta: &Arc, ) -> u32 { - (k.0.as_bytes().len() + let base = k.0.as_bytes().len() + std::mem::size_of::() - + std::mem::size_of::()) as u32 + + std::mem::size_of::() + + std::mem::size_of::(); + + let vec_estimated = meta.segment_loc_indices.len() * std::mem::size_of::() + + meta.bloom_filter_locs.len() * std::mem::size_of::(); + + (base + vec_estimated) as u32 } /// Calculates weight for bloom filter index content. 
@@ -171,6 +177,45 @@ mod test { const FUZZ_REPEAT_TIMES: usize = 100; + #[test] + fn bloom_filter_metadata_weight_counts_vec_contents() { + let file_id = FileId::parse_str("00000000-0000-0000-0000-000000000001").unwrap(); + let column_id: ColumnId = 42; + let tag = Tag::Skipping; + + let meta = BloomFilterMeta { + rows_per_segment: 128, + segment_count: 2, + row_count: 256, + bloom_filter_size: 1024, + segment_loc_indices: vec![0, 64, 128, 192], + bloom_filter_locs: vec![ + BloomFilterLoc { + offset: 0, + size: 512, + element_count: 1000, + }, + BloomFilterLoc { + offset: 512, + size: 512, + element_count: 1000, + }, + ], + }; + + let weight = + bloom_filter_index_metadata_weight(&(file_id, column_id, tag), &Arc::new(meta.clone())); + + let base = file_id.as_bytes().len() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::(); + let expected_dynamic = meta.segment_loc_indices.len() * std::mem::size_of::() + + meta.bloom_filter_locs.len() * std::mem::size_of::(); + + assert_eq!(weight as usize, base + expected_dynamic); + } + #[test] fn fuzz_index_calculation() { let mut rng = rand::rng(); From 6efffa427d4b8ef794ef923843375ae07fc16a68 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Wed, 29 Oct 2025 19:41:21 +0800 Subject: [PATCH 022/149] fix: missing flamegraph feature in pprof dependency (#7158) fix: fix pprof deps Signed-off-by: WenyXu --- src/cmd/Cargo.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 94dc3da56b..7a957b509b 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -71,7 +71,6 @@ nu-ansi-term = "0.46" object-store.workspace = true parquet = { workspace = true, features = ["object_store"] } plugins.workspace = true -pprof = "0.14.0" prometheus.workspace = true prost.workspace = true query.workspace = true @@ -93,6 +92,11 @@ toml.workspace = true tonic.workspace = true tracing-appender.workspace = true +[target.'cfg(unix)'.dependencies] +pprof = { version = "0.14", features = [ + "flamegraph", +] } + [target.'cfg(not(windows))'.dependencies] tikv-jemallocator = "0.6" From e8b7b0ad162e7c4bac7a36789f3bc534582339dd Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Wed, 29 Oct 2025 21:44:36 +0800 Subject: [PATCH 023/149] fix: memtable value push result was ignored (#7136) * fix: memtable value push result was ignored Signed-off-by: luofucong * chore: apply suggestion Co-authored-by: Yingwen --------- Signed-off-by: luofucong Co-authored-by: dennis zhuang Co-authored-by: Yingwen --- src/mito2/src/memtable/time_series.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index 60fe2f0bcd..9138fd6da8 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs @@ -922,7 +922,9 @@ impl ValueBuilder { ) }; mutable_vector.push_nulls(num_rows - 1); - let _ = mutable_vector.push(field_value); + mutable_vector + .push(field_value) + .unwrap_or_else(|e| panic!("unexpected field value: {e:?}")); self.fields[idx] = Some(mutable_vector); MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT.inc(); } From 11c0381fc133eec236cf25fe8f8a16ecbf792d67 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Thu, 30 Oct 2025 02:10:58 +0800 Subject: [PATCH 024/149] chore: set default catalog using build env (#7156) * chore: update reference to const Signed-off-by: shuiyisong * chore: use option_env to set default 
catalog Signed-off-by: shuiyisong * chore: use const_format Signed-off-by: shuiyisong * chore: update reference in cli Signed-off-by: shuiyisong * chore: introduce a build.rs to set default catalog Signed-off-by: shuiyisong * chore: remove unused feature gate Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- Cargo.lock | 3 +++ Cargo.toml | 1 + src/cli/src/data.rs | 5 ++++ src/cli/src/data/export.rs | 3 ++- src/cli/src/data/import.rs | 3 ++- src/common/catalog/Cargo.toml | 1 + src/common/catalog/build.rs | 27 ++++++++++++++++++++ src/common/catalog/src/consts.rs | 6 +++-- src/common/function/src/system/pg_catalog.rs | 9 ++++--- src/common/version/Cargo.toml | 2 +- src/flow/src/adapter/refill.rs | 5 ++-- 11 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 src/common/catalog/build.rs diff --git a/Cargo.lock b/Cargo.lock index 7b076d9273..130eaaeb61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2033,6 +2033,9 @@ dependencies = [ [[package]] name = "common-catalog" version = "0.18.0" +dependencies = [ + "const_format", +] [[package]] name = "common-config" diff --git a/Cargo.toml b/Cargo.toml index e51b0d7632..3ce3ff48dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -121,6 +121,7 @@ chrono = { version = "0.4", features = ["serde"] } chrono-tz = "0.10.1" clap = { version = "4.4", features = ["derive"] } config = "0.13.0" +const_format = "0.2" crossbeam-utils = "0.8" dashmap = "6.1" datafusion = "50" diff --git a/src/cli/src/data.rs b/src/cli/src/data.rs index bac7f3e308..86d2b43a98 100644 --- a/src/cli/src/data.rs +++ b/src/cli/src/data.rs @@ -16,6 +16,7 @@ mod export; mod import; use clap::Subcommand; +use client::DEFAULT_CATALOG_NAME; use common_error::ext::BoxedError; use crate::Tool; @@ -37,3 +38,7 @@ impl DataCommand { } } } + +pub(crate) fn default_database() -> String { + format!("{DEFAULT_CATALOG_NAME}-*") +} diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs index a9f68bf9c9..33ce5a7746 100644 --- a/src/cli/src/data/export.rs +++ b/src/cli/src/data/export.rs @@ -30,6 +30,7 @@ use snafu::{OptionExt, ResultExt}; use tokio::sync::Semaphore; use tokio::time::Instant; +use crate::data::default_database; use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::error::{ EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu, @@ -63,7 +64,7 @@ pub struct ExportCommand { output_dir: Option, /// The name of the catalog to export. - #[clap(long, default_value = "greptime-*")] + #[clap(long, default_value_t = default_database())] database: String, /// Parallelism of the export. diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs index 102de8ac91..db2fd42e37 100644 --- a/src/cli/src/data/import.rs +++ b/src/cli/src/data/import.rs @@ -25,6 +25,7 @@ use snafu::{OptionExt, ResultExt}; use tokio::sync::Semaphore; use tokio::time::Instant; +use crate::data::default_database; use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::error::{Error, FileIoSnafu, Result, SchemaNotFoundSnafu}; use crate::{Tool, database}; @@ -52,7 +53,7 @@ pub struct ImportCommand { input_dir: String, /// The name of the catalog to import. - #[clap(long, default_value = "greptime-*")] + #[clap(long, default_value_t = default_database())] database: String, /// Parallelism of the import. 
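The clap change above replaces the hard-coded "greptime-*" literal with a default computed from the catalog constant via `default_value_t`. A minimal, self-contained sketch of the same pattern, assuming the `clap` crate with its `derive` feature; `DEFAULT_CATALOG_NAME` and the `Cmd` struct are local stand-ins here, not the real re-export through `client`:

    use clap::Parser;

    // Stand-in for the constant re-exported through `client`.
    const DEFAULT_CATALOG_NAME: &str = "greptime";

    fn default_database() -> String {
        format!("{DEFAULT_CATALOG_NAME}-*")
    }

    #[derive(Parser, Debug)]
    struct Cmd {
        /// The name of the catalog to export.
        #[clap(long, default_value_t = default_database())]
        database: String,
    }

    fn main() {
        // Without `--database`, this prints "greptime-*"; the default now follows
        // whatever catalog name the binary was built with instead of a literal.
        let cmd = Cmd::parse();
        println!("{}", cmd.database);
    }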
diff --git a/src/common/catalog/Cargo.toml b/src/common/catalog/Cargo.toml index 051675fe93..357f180a33 100644 --- a/src/common/catalog/Cargo.toml +++ b/src/common/catalog/Cargo.toml @@ -8,5 +8,6 @@ license.workspace = true workspace = true [dependencies] +const_format.workspace = true [dev-dependencies] diff --git a/src/common/catalog/build.rs b/src/common/catalog/build.rs new file mode 100644 index 0000000000..311d6eef3f --- /dev/null +++ b/src/common/catalog/build.rs @@ -0,0 +1,27 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +fn main() { + // Set DEFAULT_CATALOG_NAME from environment variable or use default value + let default_catalog_name = + std::env::var("DEFAULT_CATALOG_NAME").unwrap_or_else(|_| "greptime".to_string()); + + println!( + "cargo:rustc-env=DEFAULT_CATALOG_NAME={}", + default_catalog_name + ); + + // Rerun build script if the environment variable changes + println!("cargo:rerun-if-env-changed=DEFAULT_CATALOG_NAME"); +} diff --git a/src/common/catalog/src/consts.rs b/src/common/catalog/src/consts.rs index 7dd6da9b4f..8a59a15cc6 100644 --- a/src/common/catalog/src/consts.rs +++ b/src/common/catalog/src/consts.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +use const_format::concatcp; + pub const SYSTEM_CATALOG_NAME: &str = "system"; pub const INFORMATION_SCHEMA_NAME: &str = "information_schema"; pub const PG_CATALOG_NAME: &str = "pg_catalog"; pub const SYSTEM_CATALOG_TABLE_NAME: &str = "system_catalog"; -pub const DEFAULT_CATALOG_NAME: &str = "greptime"; +pub const DEFAULT_CATALOG_NAME: &str = env!("DEFAULT_CATALOG_NAME"); pub const DEFAULT_SCHEMA_NAME: &str = "public"; -pub const DEFAULT_PRIVATE_SCHEMA_NAME: &str = "greptime_private"; +pub const DEFAULT_PRIVATE_SCHEMA_NAME: &str = concatcp!(DEFAULT_CATALOG_NAME, "_private"); /// Reserves [0,MIN_USER_FLOW_ID) for internal usage. /// User defined table id starts from this value. 
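The build.rs above emits `cargo:rustc-env=DEFAULT_CATALOG_NAME=...`, which consts.rs then reads at compile time and extends with `concatcp!`. A minimal sketch of how the pieces compose, assuming the `const_format` crate; the "greptime" fallback only stands in for the default the build script would emit, so the sketch compiles on its own:

    use const_format::concatcp;

    // Read the compile-time env var set by build.rs; fall back to "greptime"
    // when it is absent so this sketch builds without that build script.
    const DEFAULT_CATALOG_NAME: &str = match option_env!("DEFAULT_CATALOG_NAME") {
        Some(name) => name,
        None => "greptime",
    };

    // Derived at compile time as well: "<catalog>_private".
    const DEFAULT_PRIVATE_SCHEMA_NAME: &str = concatcp!(DEFAULT_CATALOG_NAME, "_private");

    fn main() {
        // Prints "greptime / greptime_private" unless the crate was built with a
        // different DEFAULT_CATALOG_NAME.
        println!("{DEFAULT_CATALOG_NAME} / {DEFAULT_PRIVATE_SCHEMA_NAME}");
    }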
diff --git a/src/common/function/src/system/pg_catalog.rs b/src/common/function/src/system/pg_catalog.rs index 4ea378b53a..07e7d2abaf 100644 --- a/src/common/function/src/system/pg_catalog.rs +++ b/src/common/function/src/system/pg_catalog.rs @@ -16,6 +16,9 @@ mod version; use std::sync::Arc; +use common_catalog::consts::{ + DEFAULT_PRIVATE_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME, +}; use datafusion::arrow::array::{ArrayRef, StringArray, as_boolean_array}; use datafusion::catalog::TableFunction; use datafusion::common::ScalarValue; @@ -143,9 +146,9 @@ impl Function for CurrentSchemasFunction { let mut values = vec!["public"]; // include implicit schemas if input.value(0) { - values.push("information_schema"); - values.push("pg_catalog"); - values.push("greptime_private"); + values.push(INFORMATION_SCHEMA_NAME); + values.push(PG_CATALOG_NAME); + values.push(DEFAULT_PRIVATE_SCHEMA_NAME); } let list_array = SingleRowListArrayBuilder::new(Arc::new(StringArray::from(values))); diff --git a/src/common/version/Cargo.toml b/src/common/version/Cargo.toml index 3a8a2a511e..adee41afd7 100644 --- a/src/common/version/Cargo.toml +++ b/src/common/version/Cargo.toml @@ -11,7 +11,7 @@ workspace = true codec = ["dep:serde"] [dependencies] -const_format = "0.2" +const_format.workspace = true serde = { workspace = true, optional = true } shadow-rs = { version = "1.2.1", default-features = false } diff --git a/src/flow/src/adapter/refill.rs b/src/flow/src/adapter/refill.rs index 89b7344c0c..6d66505e89 100644 --- a/src/flow/src/adapter/refill.rs +++ b/src/flow/src/adapter/refill.rs @@ -18,6 +18,7 @@ use std::collections::BTreeSet; use std::sync::Arc; use catalog::CatalogManagerRef; +use client::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_error::ext::BoxedError; use common_meta::key::flow::FlowMetadataManagerRef; use common_recordbatch::{RecordBatch, RecordBatches, SendableRecordBatchStream}; @@ -396,8 +397,8 @@ impl RefillTask { // we don't need information from query context in this query so a default query context is enough let query_ctx = Arc::new( QueryContextBuilder::default() - .current_catalog("greptime".to_string()) - .current_schema("public".to_string()) + .current_catalog(DEFAULT_CATALOG_NAME.to_string()) + .current_schema(DEFAULT_SCHEMA_NAME.to_string()) .build(), ); From 5d0ef376de319ebda1f74ebb78682ede37100604 Mon Sep 17 00:00:00 2001 From: liyang Date: Thu, 30 Oct 2025 02:11:55 +0800 Subject: [PATCH 025/149] fix: initializer container not work (#7152) * fix: initializer not work Signed-off-by: liyang * use a one version of operator Signed-off-by: liyang --------- Signed-off-by: liyang --- .github/scripts/deploy-greptimedb.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/scripts/deploy-greptimedb.sh b/.github/scripts/deploy-greptimedb.sh index fca21993b4..10831f8625 100755 --- a/.github/scripts/deploy-greptimedb.sh +++ b/.github/scripts/deploy-greptimedb.sh @@ -7,6 +7,8 @@ KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.32.0}" ENABLE_STANDALONE_MODE="${ENABLE_STANDALONE_MODE:-true}" DEFAULT_INSTALL_NAMESPACE=${DEFAULT_INSTALL_NAMESPACE:-default} GREPTIMEDB_IMAGE_TAG=${GREPTIMEDB_IMAGE_TAG:-latest} +GREPTIMEDB_OPERATOR_IMAGE_TAG=${GREPTIMEDB_OPERATOR_IMAGE_TAG:-v0.5.1} +GREPTIMEDB_INITIALIZER_IMAGE_TAG="${GREPTIMEDB_OPERATOR_IMAGE_TAG}" GREPTIME_CHART="https://greptimeteam.github.io/helm-charts/" ETCD_CHART="oci://registry-1.docker.io/bitnamicharts/etcd" ETCD_CHART_VERSION="${ETCD_CHART_VERSION:-12.0.8}" @@ -58,7 +60,7 @@ function 
deploy_greptimedb_operator() { # Use the latest chart and image. helm upgrade --install greptimedb-operator greptime/greptimedb-operator \ --create-namespace \ - --set image.tag=latest \ + --set image.tag="$GREPTIMEDB_OPERATOR_IMAGE_TAG" \ -n "$DEFAULT_INSTALL_NAMESPACE" # Wait for greptimedb-operator to be ready. @@ -78,6 +80,7 @@ function deploy_greptimedb_cluster() { helm upgrade --install "$cluster_name" greptime/greptimedb-cluster \ --create-namespace \ --set image.tag="$GREPTIMEDB_IMAGE_TAG" \ + --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \ --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \ --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \ -n "$install_namespace" @@ -115,6 +118,7 @@ function deploy_greptimedb_cluster_with_s3_storage() { helm upgrade --install "$cluster_name" greptime/greptimedb-cluster -n "$install_namespace" \ --create-namespace \ --set image.tag="$GREPTIMEDB_IMAGE_TAG" \ + --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \ --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \ --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \ --set objectStorage.s3.bucket="$AWS_CI_TEST_BUCKET" \ From ee5b7ff3c80afdae4da601d2b1914ce58b0ce853 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Thu, 30 Oct 2025 12:26:02 +0800 Subject: [PATCH 026/149] chore: unify initialization of channel manager (#7159) * chore: unify initialization of channel manager and extract loading tls Signed-off-by: shuiyisong * chore: fix cr issue Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/client/src/client.rs | 7 +- src/client/src/client_manager.rs | 2 +- src/common/frontend/src/selector.rs | 2 +- src/common/grpc/src/channel_manager.rs | 87 ++++++++----------- src/common/grpc/tests/mod.rs | 18 ++-- src/flow/src/batching_mode/frontend_client.rs | 12 ++- src/meta-client/examples/meta_client.rs | 2 +- src/meta-client/src/lib.rs | 6 +- src/meta-srv/src/mocks.rs | 2 +- 9 files changed, 67 insertions(+), 71 deletions(-) diff --git a/src/client/src/client.rs b/src/client/src/client.rs index 1506ac5208..611cce954d 100644 --- a/src/client/src/client.rs +++ b/src/client/src/client.rs @@ -20,7 +20,9 @@ use api::v1::health_check_client::HealthCheckClient; use api::v1::prometheus_gateway_client::PrometheusGatewayClient; use api::v1::region::region_client::RegionClient as PbRegionClient; use arrow_flight::flight_service_client::FlightServiceClient; -use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption}; +use common_grpc::channel_manager::{ + ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config, +}; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; use tonic::codec::CompressionEncoding; @@ -94,8 +96,9 @@ impl Client { A: AsRef<[U]>, { let channel_config = ChannelConfig::default().client_tls_config(client_tls); - let channel_manager = ChannelManager::with_tls_config(channel_config) + let tls_config = load_tls_config(channel_config.client_tls.as_ref()) .context(error::CreateTlsChannelSnafu)?; + let channel_manager = ChannelManager::with_config(channel_config, tls_config); Ok(Self::with_manager_and_urls(channel_manager, urls)) } diff --git a/src/client/src/client_manager.rs b/src/client/src/client_manager.rs index 80afd2fb32..edac45a9fe 100644 --- a/src/client/src/client_manager.rs +++ b/src/client/src/client_manager.rs @@ -74,7 +74,7 @@ impl FlownodeManager for NodeClients { impl NodeClients { pub fn new(config: 
ChannelConfig) -> Self { Self { - channel_manager: ChannelManager::with_config(config), + channel_manager: ChannelManager::with_config(config, None), clients: CacheBuilder::new(1024) .time_to_live(Duration::from_secs(30 * 60)) .time_to_idle(Duration::from_secs(5 * 60)) diff --git a/src/common/frontend/src/selector.rs b/src/common/frontend/src/selector.rs index 4e6cc9566c..f2dc337cc2 100644 --- a/src/common/frontend/src/selector.rs +++ b/src/common/frontend/src/selector.rs @@ -104,7 +104,7 @@ impl MetaClientSelector { let cfg = ChannelConfig::new() .connect_timeout(Duration::from_secs(30)) .timeout(Duration::from_secs(30)); - let channel_manager = ChannelManager::with_config(cfg); + let channel_manager = ChannelManager::with_config(cfg, None); Self { meta_client, channel_manager, diff --git a/src/common/grpc/src/channel_manager.rs b/src/common/grpc/src/channel_manager.rs index cdea89cb86..667b73f5f3 100644 --- a/src/common/grpc/src/channel_manager.rs +++ b/src/common/grpc/src/channel_manager.rs @@ -22,14 +22,14 @@ use dashmap::DashMap; use dashmap::mapref::entry::Entry; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use tokio_util::sync::CancellationToken; use tonic::transport::{ Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri, }; use tower::Service; -use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, InvalidTlsConfigSnafu, Result}; +use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, Result}; const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60; pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10; @@ -91,57 +91,18 @@ impl ChannelManager { Default::default() } - pub fn with_config(config: ChannelConfig) -> Self { - let inner = Inner::with_config(config); + /// unified with config function that support tls config + /// use [`load_tls_config`] to load tls config from file system + pub fn with_config(config: ChannelConfig, tls_config: Option) -> Self { + let mut inner = Inner::with_config(config.clone()); + if let Some(tls_config) = tls_config { + inner.client_tls_config = Some(tls_config); + } Self { inner: Arc::new(inner), } } - /// Read tls cert and key files and create a ChannelManager with TLS config. 
- pub fn with_tls_config(config: ChannelConfig) -> Result { - let mut inner = Inner::with_config(config.clone()); - - // setup tls - let path_config = config.client_tls.context(InvalidTlsConfigSnafu { - msg: "no config input", - })?; - - if !path_config.enabled { - // if TLS not enabled, just ignore other tls config - // and not set `client_tls_config` hence not use TLS - return Ok(Self { - inner: Arc::new(inner), - }); - } - - let mut tls_config = ClientTlsConfig::new(); - - if let Some(server_ca) = path_config.server_ca_cert_path { - let server_root_ca_cert = - std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?; - let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert); - tls_config = tls_config.ca_certificate(server_root_ca_cert); - } - - if let (Some(client_cert_path), Some(client_key_path)) = - (&path_config.client_cert_path, &path_config.client_key_path) - { - let client_cert = - std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?; - let client_key = - std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?; - let client_identity = Identity::from_pem(client_cert, client_key); - tls_config = tls_config.identity(client_identity); - } - - inner.client_tls_config = Some(tls_config); - - Ok(Self { - inner: Arc::new(inner), - }) - } - pub fn config(&self) -> &ChannelConfig { &self.inner.config } @@ -287,6 +248,34 @@ impl ChannelManager { } } +pub fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result> { + let path_config = match tls_option { + Some(path_config) if path_config.enabled => path_config, + _ => return Ok(None), + }; + + let mut tls_config = ClientTlsConfig::new(); + + if let Some(server_ca) = &path_config.server_ca_cert_path { + let server_root_ca_cert = + std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?; + let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert); + tls_config = tls_config.ca_certificate(server_root_ca_cert); + } + + if let (Some(client_cert_path), Some(client_key_path)) = + (&path_config.client_cert_path, &path_config.client_key_path) + { + let client_cert = + std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?; + let client_key = + std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?; + let client_identity = Identity::from_pem(client_cert, client_key); + tls_config = tls_config.identity(client_identity); + } + Ok(Some(tls_config)) +} + #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ClientTlsOption { /// Whether to enable TLS for client. @@ -659,7 +648,7 @@ mod tests { .http2_adaptive_window(true) .tcp_keepalive(Duration::from_secs(2)) .tcp_nodelay(true); - let mgr = ChannelManager::with_config(config); + let mgr = ChannelManager::with_config(config, None); let res = mgr.build_endpoint("test_addr"); diff --git a/src/common/grpc/tests/mod.rs b/src/common/grpc/tests/mod.rs index d119f22836..a437d21cd9 100644 --- a/src/common/grpc/tests/mod.rs +++ b/src/common/grpc/tests/mod.rs @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption}; +use common_grpc::channel_manager::{ + ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config, +}; #[tokio::test] async fn test_mtls_config() { // test no config let config = ChannelConfig::new(); - let re = ChannelManager::with_tls_config(config); - assert!(re.is_err()); + let re = load_tls_config(config.client_tls.as_ref()); + assert!(re.is_ok()); + assert!(re.unwrap().is_none()); // test wrong file let config = ChannelConfig::new().client_tls_config(ClientTlsOption { @@ -29,7 +32,7 @@ async fn test_mtls_config() { client_key_path: Some("tests/tls/wrong_client.key".to_string()), }); - let re = ChannelManager::with_tls_config(config); + let re = load_tls_config(config.client_tls.as_ref()); assert!(re.is_err()); // test corrupted file content @@ -40,7 +43,9 @@ async fn test_mtls_config() { client_key_path: Some("tests/tls/corrupted".to_string()), }); - let re = ChannelManager::with_tls_config(config).unwrap(); + let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); + let re = ChannelManager::with_config(config, tls_config); + let re = re.get("127.0.0.1:0"); assert!(re.is_err()); @@ -52,7 +57,8 @@ async fn test_mtls_config() { client_key_path: Some("tests/tls/client.key".to_string()), }); - let re = ChannelManager::with_tls_config(config).unwrap(); + let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); + let re = ChannelManager::with_config(config, tls_config); let re = re.get("127.0.0.1:0"); let _ = re.unwrap(); } diff --git a/src/flow/src/batching_mode/frontend_client.rs b/src/flow/src/batching_mode/frontend_client.rs index cba8f896d5..e9994b5b14 100644 --- a/src/flow/src/batching_mode/frontend_client.rs +++ b/src/flow/src/batching_mode/frontend_client.rs @@ -23,7 +23,7 @@ use api::v1::query_request::Query; use api::v1::{CreateTableExpr, QueryRequest}; use client::{Client, Database}; use common_error::ext::{BoxedError, ErrorExt}; -use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; +use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_tls_config}; use common_meta::cluster::{NodeInfo, NodeInfoKey, Role}; use common_meta::peer::Peer; use common_meta::rpc::store::RangeRequest; @@ -123,12 +123,10 @@ impl FrontendClient { let cfg = ChannelConfig::new() .connect_timeout(batch_opts.grpc_conn_timeout) .timeout(batch_opts.query_timeout); - if let Some(tls) = &batch_opts.frontend_tls { - let cfg = cfg.client_tls_config(tls.clone()); - ChannelManager::with_tls_config(cfg).context(InvalidClientConfigSnafu)? 
- } else { - ChannelManager::with_config(cfg) - } + + let tls_config = load_tls_config(batch_opts.frontend_tls.as_ref()) + .context(InvalidClientConfigSnafu)?; + ChannelManager::with_config(cfg, tls_config) }, auth, query, diff --git a/src/meta-client/examples/meta_client.rs b/src/meta-client/examples/meta_client.rs index fb5125224c..175888f170 100644 --- a/src/meta-client/examples/meta_client.rs +++ b/src/meta-client/examples/meta_client.rs @@ -36,7 +36,7 @@ async fn run() { .timeout(Duration::from_secs(3)) .connect_timeout(Duration::from_secs(5)) .tcp_nodelay(true); - let channel_manager = ChannelManager::with_config(config); + let channel_manager = ChannelManager::with_config(config, None); let mut meta_client = MetaClientBuilder::datanode_default_options(id) .channel_manager(channel_manager) .build(); diff --git a/src/meta-client/src/lib.rs b/src/meta-client/src/lib.rs index 47384785e2..5b56b8e181 100644 --- a/src/meta-client/src/lib.rs +++ b/src/meta-client/src/lib.rs @@ -101,7 +101,7 @@ pub async fn create_meta_client( if let MetaClientType::Frontend = client_type { let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout); - builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config)); + builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config, None)); if let Some(plugins) = plugins { let region_follower = plugins.get::(); if let Some(region_follower) = region_follower { @@ -112,8 +112,8 @@ pub async fn create_meta_client( } builder = builder - .channel_manager(ChannelManager::with_config(base_config)) - .heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config)); + .channel_manager(ChannelManager::with_config(base_config, None)) + .heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config, None)); let mut meta_client = builder.build(); diff --git a/src/meta-srv/src/mocks.rs b/src/meta-srv/src/mocks.rs index 6c2f0d3892..c805f8ea1b 100644 --- a/src/meta-srv/src/mocks.rs +++ b/src/meta-srv/src/mocks.rs @@ -134,7 +134,7 @@ pub async fn mock( .timeout(Duration::from_secs(10)) .connect_timeout(Duration::from_secs(10)) .tcp_nodelay(true); - let channel_manager = ChannelManager::with_config(config); + let channel_manager = ChannelManager::with_config(config, None); // Move client to an option so we can _move_ the inner value // on the first attempt to connect. All other attempts will fail. From 109b70750afd289b7bc84929d93a4f03c3b7769c Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Thu, 30 Oct 2025 18:24:12 +0800 Subject: [PATCH 027/149] refactor: convert to prometheus values directly from arrow (#7153) * refactor: convert to prometheus values directly from arrow Signed-off-by: luofucong * resolve PR comments Signed-off-by: luofucong --------- Signed-off-by: luofucong --- src/servers/src/http/prometheus.rs | 402 +++++++++++++++++++++++++++-- tests-integration/tests/http.rs | 8 +- 2 files changed, 388 insertions(+), 22 deletions(-) diff --git a/src/servers/src/http/prometheus.rs b/src/servers/src/http/prometheus.rs index e4c9677a4c..f9d1e1c21b 100644 --- a/src/servers/src/http/prometheus.rs +++ b/src/servers/src/http/prometheus.rs @@ -13,22 +13,43 @@ // limitations under the License. //! 
prom supply the prometheus HTTP API Server compliance + +use std::borrow::Borrow; use std::collections::{BTreeMap, HashMap, HashSet}; +use std::hash::{Hash, Hasher}; use std::sync::Arc; +use arrow::array::AsArray; +use arrow::datatypes::{ + Date32Type, Date64Type, Decimal128Type, DurationMicrosecondType, DurationMillisecondType, + DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, + Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, + Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, +}; +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use axum::extract::{Path, Query, State}; use axum::{Extension, Form}; use catalog::CatalogManagerRef; use common_catalog::parse_catalog_and_schema_from_db_string; +use common_decimal::Decimal128; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; -use common_recordbatch::RecordBatches; +use common_recordbatch::{RecordBatch, RecordBatches}; use common_telemetry::{debug, tracing}; +use common_time::time::Time; use common_time::util::{current_time_rfc3339, yesterday_rfc3339}; +use common_time::{ + Date, Duration, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp, +}; use common_version::OwnedBuildInfo; +use datafusion_common::ScalarValue; use datatypes::prelude::ConcreteDataType; use datatypes::scalars::ScalarVector; +use datatypes::schema::{ColumnSchema, SchemaRef}; +use datatypes::types::jsonb_to_string; use datatypes::vectors::Float64Vector; use futures::StreamExt; use futures::future::join_all; @@ -53,8 +74,9 @@ use store_api::metric_engine_consts::{ pub use super::result::prometheus_resp::PrometheusJsonResponse; use crate::error::{ - CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, ParseTimestampSnafu, Result, - TableNotFoundSnafu, UnexpectedResultSnafu, + CatalogSnafu, CollectRecordbatchSnafu, ConvertScalarValueSnafu, DataFusionSnafu, Error, + InvalidQuerySnafu, NotSupportedSnafu, ParseTimestampSnafu, Result, TableNotFoundSnafu, + UnexpectedResultSnafu, }; use crate::http::header::collect_plan_metrics; use crate::prom_store::{DATABASE_LABEL, FIELD_NAME_LABEL, METRIC_NAME_LABEL, SCHEMA_LABEL}; @@ -98,12 +120,23 @@ pub struct PromData { pub result: PromQueryResult, } +/// A "holder" for the reference([Arc]) to a column name, +/// to help avoiding cloning [String]s when used as a [HashMap] key. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct Column(Arc); + +impl From<&str> for Column { + fn from(s: &str) -> Self { + Self(Arc::new(s.to_string())) + } +} + #[derive(Debug, Default, Serialize, Deserialize, PartialEq)] #[serde(untagged)] pub enum PrometheusResponse { PromData(PromData), Labels(Vec), - Series(Vec>), + Series(Vec>), LabelValues(Vec), FormatQuery(String), BuildInfo(OwnedBuildInfo), @@ -622,7 +655,7 @@ async fn get_all_column_names( async fn retrieve_series_from_query_result( result: Result, - series: &mut Vec>, + series: &mut Vec>, query_ctx: &QueryContext, table_name: &str, manager: &CatalogManagerRef, @@ -700,7 +733,7 @@ async fn retrieve_labels_name_from_query_result( fn record_batches_to_series( batches: RecordBatches, - series: &mut Vec>, + series: &mut Vec>, table_name: &str, tag_columns: &HashSet, ) -> Result<()> { @@ -723,22 +756,355 @@ fn record_batches_to_series( .try_project(&projection) .context(CollectRecordbatchSnafu)?; - for row in batch.rows() { - let mut element: HashMap = row - .iter() - .enumerate() - .map(|(idx, column)| { - let column_name = batch.schema.column_name_by_index(idx); - (column_name.to_string(), column.to_string()) - }) - .collect(); - let _ = element.insert("__name__".to_string(), table_name.to_string()); - series.push(element); - } + let mut writer = RowWriter::new(&batch.schema, table_name); + writer.write(batch, series)?; } Ok(()) } +/// Writer from a row in the record batch to a Prometheus time series: +/// +/// `{__name__="",