From 3cdf03d830930424125fe5f31677dadb672071b7 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 13 Mar 2026 11:40:04 +0800 Subject: [PATCH 001/195] feat: introduce APIs for storing perses dashboard definition (#7791) * feat: introduce APIs for storing perses dashboard definition * test: ensure we can update dashboard * refactor: construct dashboard definition directly * refactor: don't create table on list requests --- src/frontend/src/instance.rs | 1 + src/frontend/src/instance/dashboard.rs | 405 ++++++++++++++++++ src/frontend/src/server.rs | 2 + src/servers/src/http.rs | 37 +- src/servers/src/http/dashboard.rs | 114 ++++- .../src/http/result/greptime_manage_resp.rs | 27 ++ src/servers/src/query_handler.rs | 18 + tests-integration/Cargo.toml | 2 +- tests-integration/src/test_util.rs | 1 + tests-integration/tests/http.rs | 116 +++++ 10 files changed, 717 insertions(+), 6 deletions(-) create mode 100644 src/frontend/src/instance/dashboard.rs diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index fa8a74cad2..ce589bb677 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod builder; +mod dashboard; mod grpc; mod influxdb; mod jaeger; diff --git a/src/frontend/src/instance/dashboard.rs b/src/frontend/src/instance/dashboard.rs new file mode 100644 index 0000000000..373961dbfa --- /dev/null +++ b/src/frontend/src/instance/dashboard.rs @@ -0,0 +1,405 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use api::v1::value::ValueData; +use api::v1::{ + ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest, + RowInsertRequests, Rows, SemanticType, +}; +use async_trait::async_trait; +use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine}; +use common_error::ext::BoxedError; +use common_query::OutputData; +use common_recordbatch::util as record_util; +use common_telemetry::info; +use common_time::FOREVER; +use datafusion::datasource::DefaultTableSource; +use datafusion::logical_expr::col; +use datafusion::sql::TableReference; +use datafusion_expr::{DmlStatement, LogicalPlan, lit}; +use datatypes::arrow::array::{Array, AsArray}; +use servers::error::{ + CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, ExecuteQuerySnafu, NotSupportedSnafu, + TableNotFoundSnafu, +}; +use servers::query_handler::DashboardDefinition; +use session::context::{QueryContextBuilder, QueryContextRef}; +use snafu::{OptionExt, ResultExt}; +use table::TableRef; +use table::metadata::TableInfo; +use table::requests::TTL_KEY; +use table::table::adapter::DfTableProviderAdapter; + +use crate::instance::Instance; + +pub const DASHBOARD_TABLE_NAME: &str = "dashboard"; +pub const DASHBOARD_TABLE_NAME_COLUMN_NAME: &str = "name"; +pub const DASHBOARD_TABLE_DEFINITION_COLUMN_NAME: &str = "definition"; +pub const DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME: &str = "created_at"; + +impl Instance { + /// Build a schema for dashboard table. + /// Returns the (time index, primary keys, column) definitions. 
+ fn build_dashboard_schema() -> (String, Vec, Vec) { + ( + DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), + vec![DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string()], + vec![ + ColumnDef { + name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(), + data_type: ColumnDataType::String as i32, + is_nullable: false, + default_constraint: vec![], + semantic_type: SemanticType::Tag as i32, + comment: String::new(), + datatype_extension: None, + options: None, + }, + ColumnDef { + name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(), + data_type: ColumnDataType::String as i32, + is_nullable: false, + default_constraint: vec![], + semantic_type: SemanticType::Field as i32, + comment: String::new(), + datatype_extension: None, + options: None, + }, + ColumnDef { + name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), + data_type: ColumnDataType::TimestampNanosecond as i32, + is_nullable: false, + default_constraint: vec![], + semantic_type: SemanticType::Timestamp as i32, + comment: String::new(), + datatype_extension: None, + options: None, + }, + ], + ) + } + + /// Build a column schemas for inserting a row into the dashboard table. 
+ fn build_dashboard_insert_column_schemas() -> Vec { + vec![ + PbColumnSchema { + column_name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(), + datatype: ColumnDataType::String.into(), + semantic_type: SemanticType::Tag.into(), + ..Default::default() + }, + PbColumnSchema { + column_name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(), + datatype: ColumnDataType::String.into(), + semantic_type: SemanticType::Field.into(), + ..Default::default() + }, + PbColumnSchema { + column_name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), + datatype: ColumnDataType::TimestampNanosecond.into(), + semantic_type: SemanticType::Timestamp.into(), + ..Default::default() + }, + ] + } + + fn dashboard_query_ctx(table_info: &TableInfo) -> QueryContextRef { + QueryContextBuilder::default() + .current_catalog(table_info.catalog_name.clone()) + .current_schema(table_info.schema_name.clone()) + .build() + .into() + } + + async fn create_dashboard_table_if_not_exists( + &self, + ctx: QueryContextRef, + ) -> servers::error::Result { + let catalog = ctx.current_catalog(); + + if let Some(table) = self + .catalog_manager + .table( + catalog, + DEFAULT_PRIVATE_SCHEMA_NAME, + DASHBOARD_TABLE_NAME, + Some(&ctx), + ) + .await + .context(CatalogSnafu)? 
+ { + return Ok(table); + } + + let (time_index, primary_keys, column_defs) = Self::build_dashboard_schema(); + + let mut table_options = HashMap::new(); + table_options.insert(TTL_KEY.to_string(), FOREVER.to_string()); + + let mut create_table_expr = api::v1::CreateTableExpr { + catalog_name: catalog.to_string(), + schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(), + table_name: DASHBOARD_TABLE_NAME.to_string(), + desc: "GreptimeDB dashboard table".to_string(), + column_defs, + time_index, + primary_keys, + create_if_not_exists: true, + table_options, + table_id: None, + engine: default_engine().to_string(), + }; + + self.statement_executor + .create_table_inner(&mut create_table_expr, None, ctx.clone()) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let table = self + .catalog_manager + .table( + catalog, + DEFAULT_PRIVATE_SCHEMA_NAME, + DASHBOARD_TABLE_NAME, + Some(&ctx), + ) + .await + .context(CatalogSnafu)? + .context(TableNotFoundSnafu { + catalog: catalog.to_string(), + schema: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(), + table: DASHBOARD_TABLE_NAME.to_string(), + })?; + + Ok(table) + } + + /// Insert a dashboard into the dashboard table. 
+ async fn insert_dashboard( + &self, + name: &str, + definition: &str, + query_ctx: QueryContextRef, + ) -> servers::error::Result<()> { + let table = self + .create_dashboard_table_if_not_exists(query_ctx.clone()) + .await?; + let table_info = table.table_info(); + + let insert = RowInsertRequest { + table_name: DASHBOARD_TABLE_NAME.to_string(), + rows: Some(Rows { + schema: Self::build_dashboard_insert_column_schemas(), + rows: vec![Row { + values: vec![ + ValueData::StringValue(name.to_string()).into(), + ValueData::StringValue(definition.to_string()).into(), + ValueData::TimestampNanosecondValue(0).into(), + ], + }], + }), + }; + + let requests = RowInsertRequests { + inserts: vec![insert], + }; + + let output = self + .inserter + .handle_row_inserts( + requests, + Self::dashboard_query_ctx(&table_info), + &self.statement_executor, + false, + false, + ) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + info!( + "Insert dashboard success, name: {}, table: {}, output: {:?}", + name, + table_info.full_table_name(), + output + ); + + Ok(()) + } + + /// List all dashboards. + async fn list_dashboards( + &self, + query_ctx: QueryContextRef, + ) -> servers::error::Result> { + let table = if let Some(table) = self + .catalog_manager + .table( + query_ctx.current_catalog(), + DEFAULT_PRIVATE_SCHEMA_NAME, + DASHBOARD_TABLE_NAME, + Some(&query_ctx), + ) + .await + .context(CatalogSnafu)? 
+ { + table + } else { + return Ok(vec![]); + }; + + let table_info = table.table_info(); + + let dataframe = self + .query_engine + .read_table(table.clone()) + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let dataframe = dataframe + .select_columns(&[ + DASHBOARD_TABLE_NAME_COLUMN_NAME, + DASHBOARD_TABLE_DEFINITION_COLUMN_NAME, + ]) + .context(DataFusionSnafu)?; + + let plan = dataframe.into_parts().1; + + let output = self + .query_engine + .execute(plan, Self::dashboard_query_ctx(&table_info)) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let stream = match output.data { + OutputData::Stream(stream) => stream, + OutputData::RecordBatches(record_batches) => record_batches.as_stream(), + _ => unreachable!(), + }; + + let records = record_util::collect(stream) + .await + .context(CollectRecordbatchSnafu)?; + + let mut dashboards = Vec::new(); + + for r in &records { + let name_column = r.column(0); + let definition_column = r.column(1); + + let name = name_column + .as_string_opt::() + .context(NotSupportedSnafu { + feat: "Invalid data type for greptime_private.dashboard.name", + })?; + + let definition = + definition_column + .as_string_opt::() + .context(NotSupportedSnafu { + feat: "Invalid data type for greptime_private.dashboard.definition", + })?; + + for i in 0..name.len() { + dashboards.push(DashboardDefinition { + name: name.value(i).to_string(), + definition: definition.value(i).to_string(), + }); + } + } + + Ok(dashboards) + } + + /// Delete a dashboard by name. 
+ async fn delete_dashboard( + &self, + name: &str, + query_ctx: QueryContextRef, + ) -> servers::error::Result<()> { + let table = self + .create_dashboard_table_if_not_exists(query_ctx.clone()) + .await?; + let table_info = table.table_info(); + + let dataframe = self + .query_engine + .read_table(table.clone()) + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let name_condition = col(DASHBOARD_TABLE_NAME_COLUMN_NAME).eq(lit(name)); + + let dataframe = dataframe.filter(name_condition).context(DataFusionSnafu)?; + + let table_name = TableReference::full( + table_info.catalog_name.clone(), + table_info.schema_name.clone(), + table_info.name.clone(), + ); + + let table_provider = Arc::new(DfTableProviderAdapter::new(table.clone())); + let table_source = Arc::new(DefaultTableSource::new(table_provider)); + + let stmt = DmlStatement::new( + table_name, + table_source, + datafusion_expr::WriteOp::Delete, + Arc::new(dataframe.into_parts().1), + ); + + let plan = LogicalPlan::Dml(stmt); + + let output = self + .query_engine + .execute(plan, Self::dashboard_query_ctx(&table_info)) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + info!( + "Delete dashboard success, name: {}, table: {}, output: {:?}", + name, + table_info.full_table_name(), + output + ); + + Ok(()) + } +} + +#[async_trait] +impl servers::query_handler::DashboardHandler for Instance { + async fn save( + &self, + name: &str, + definition: &str, + ctx: QueryContextRef, + ) -> servers::error::Result<()> { + self.insert_dashboard(name, definition, ctx).await + } + + async fn list(&self, ctx: QueryContextRef) -> servers::error::Result> { + self.list_dashboards(ctx).await + } + + async fn delete(&self, name: &str, ctx: QueryContextRef) -> servers::error::Result<()> { + self.delete_dashboard(name, ctx).await + } +} diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs index 45c3ec3649..4b51efbd33 100644 --- a/src/frontend/src/server.rs +++ 
b/src/frontend/src/server.rs @@ -143,6 +143,8 @@ where builder = builder.with_jaeger_handler(self.instance.clone()); } + builder = builder.with_dashboard_handler(self.instance.clone()); + if let Some(configurator) = self.plugins.get::() { info!("Adding extra router from plugins"); builder = builder.with_extra_router(configurator.router()); diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index ca6a77a077..ffd0745041 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -78,7 +78,7 @@ use crate::metrics_handler::MetricsHandler; use crate::prometheus_handler::PrometheusHandlerRef; use crate::query_handler::sql::ServerSqlQueryHandlerRef; use crate::query_handler::{ - InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef, + DashboardHandlerRef, InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef, OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef, PipelineHandlerRef, PromStoreProtocolHandlerRef, }; @@ -507,6 +507,11 @@ pub struct GreptimeOptionsConfigState { pub greptime_config_options: String, } +#[derive(Clone)] +pub struct DashboardState { + pub handler: DashboardHandlerRef, +} + pub struct HttpServerBuilder { options: HttpOptions, plugins: Plugins, @@ -703,6 +708,16 @@ impl HttpServerBuilder { } } + pub fn with_dashboard_handler(self, handler: DashboardHandlerRef) -> Self { + Self { + router: self.router.nest( + &format!("/{HTTP_API_VERSION}/dashboards"), + HttpServer::route_dashboard(handler), + ), + ..self + } + } + pub fn with_extra_router(self, router: Router) -> Self { Self { router: self.router.merge(router), @@ -1169,6 +1184,26 @@ impl HttpServer { ) .with_state(handler) } + + #[cfg(feature = "dashboard")] + fn route_dashboard(handler: DashboardHandlerRef) -> Router { + use crate::http::dashboard::{add_dashboard, delete_dashboard, list_dashboards}; + + Router::new() + .route("/", routing::get(list_dashboards)) + .route("/{dashboard_name}", routing::post(add_dashboard)) + 
.route("/{dashboard_name}", routing::delete(delete_dashboard)) + .layer( + ServiceBuilder::new() + .layer(RequestDecompressionLayer::new().pass_through_unaccepted(true)), + ) + .with_state(DashboardState { handler }) + } + + #[cfg(not(feature = "dashboard"))] + fn route_dashboard(handler: DashboardHandlerRef) -> Router { + Router::new().with_state(DashboardState { handler }) + } } pub const HTTP_SERVER: &str = "HTTP_SERVER"; diff --git a/src/servers/src/http/dashboard.rs b/src/servers/src/http/dashboard.rs index bdb98490f0..ea894ca7d0 100644 --- a/src/servers/src/http/dashboard.rs +++ b/src/servers/src/http/dashboard.rs @@ -12,14 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -use axum::body::Body; +use std::sync::Arc; +use std::time::Instant; + +use axum::body::{Body, Bytes}; +use axum::extract::{Extension, Path, State}; use axum::http::{StatusCode, Uri, header}; use axum::response::Response; -use common_telemetry::debug; +use common_telemetry::{debug, error}; use rust_embed::RustEmbed; -use snafu::ResultExt; +use session::context::{Channel, QueryContext}; +use snafu::{ResultExt, ensure}; -use crate::error::{BuildHttpResponseSnafu, Result}; +use crate::error::{BuildHttpResponseSnafu, InvalidParameterSnafu, Result}; +use crate::http::DashboardState; +use crate::http::result::greptime_manage_resp::{DashboardOutput, GreptimedbManageResponse}; #[derive(RustEmbed)] #[folder = "dashboard/dist/"] @@ -61,3 +68,102 @@ fn get_assets(path: &str) -> Result { } .context(BuildHttpResponseSnafu) } + +#[axum_macros::debug_handler] +pub async fn add_dashboard( + State(state): State, + Path(dashboard_name): Path, + Extension(mut query_ctx): Extension, + payload: Bytes, +) -> Result { + let start = Instant::now(); + let handler = state.handler; + ensure!( + !dashboard_name.is_empty(), + InvalidParameterSnafu { + reason: "dashboard_name is required in path", + } + ); + + let definition = 
String::from_utf8_lossy(&payload).to_string(); + + query_ctx.set_channel(Channel::HttpSql); + let query_ctx = Arc::new(query_ctx); + + handler + .save(&dashboard_name, &definition, query_ctx) + .await + .map(|_| { + GreptimedbManageResponse::from_dashboard( + dashboard_name, + start.elapsed().as_millis() as u64, + ) + }) + .map_err(|e| { + error!(e; "failed to save dashboard"); + e + }) +} + +#[axum_macros::debug_handler] +pub async fn list_dashboards( + State(state): State, + Extension(mut query_ctx): Extension, +) -> Result { + let start = Instant::now(); + let handler = state.handler; + + query_ctx.set_channel(Channel::HttpSql); + let query_ctx = Arc::new(query_ctx); + + handler + .list(query_ctx) + .await + .map(|dashboards| { + let outputs: Vec = dashboards + .into_iter() + .map(|d| DashboardOutput { + name: d.name, + definition: d.definition, + }) + .collect(); + GreptimedbManageResponse::from_dashboards(outputs, start.elapsed().as_millis() as u64) + }) + .map_err(|e| { + error!(e; "failed to list dashboards"); + e + }) +} + +#[axum_macros::debug_handler] +pub async fn delete_dashboard( + State(state): State, + Extension(mut query_ctx): Extension, + Path(dashboard_name): Path, +) -> Result { + let start = Instant::now(); + let handler = state.handler; + ensure!( + !dashboard_name.is_empty(), + InvalidParameterSnafu { + reason: "dashboard_name is required", + } + ); + + query_ctx.set_channel(Channel::HttpSql); + let query_ctx = Arc::new(query_ctx); + + handler + .delete(&dashboard_name, query_ctx) + .await + .map(|_| { + GreptimedbManageResponse::from_dashboard( + dashboard_name, + start.elapsed().as_millis() as u64, + ) + }) + .map_err(|e| { + error!(e; "failed to delete dashboard"); + e + }) +} diff --git a/src/servers/src/http/result/greptime_manage_resp.rs b/src/servers/src/http/result/greptime_manage_resp.rs index 3f7f3c6eec..2b3a5d455c 100644 --- a/src/servers/src/http/result/greptime_manage_resp.rs +++ 
b/src/servers/src/http/result/greptime_manage_resp.rs @@ -62,6 +62,25 @@ impl GreptimedbManageResponse { } } + pub fn from_dashboard(name: String, execution_time_ms: u64) -> Self { + GreptimedbManageResponse { + manage_result: ManageResult::Dashboards { + dashboards: vec![DashboardOutput { + name, + definition: String::new(), + }], + }, + execution_time_ms, + } + } + + pub fn from_dashboards(dashboards: Vec, execution_time_ms: u64) -> Self { + GreptimedbManageResponse { + manage_result: ManageResult::Dashboards { dashboards }, + execution_time_ms, + } + } + pub fn with_execution_time(mut self, execution_time: u64) -> Self { self.execution_time_ms = execution_time; self @@ -77,6 +96,7 @@ impl GreptimedbManageResponse { pub enum ManageResult { Pipelines { pipelines: Vec }, Sql { sql: SqlOutput }, + Dashboards { dashboards: Vec }, } #[derive(Serialize, Deserialize, Debug)] @@ -87,6 +107,13 @@ pub struct PipelineOutput { pipeline: Option, } +#[derive(Serialize, Deserialize, Debug)] +pub struct DashboardOutput { + pub name: String, + #[serde(skip_serializing_if = "String::is_empty")] + pub definition: String, +} + #[derive(Serialize, Deserialize, Debug)] pub struct SqlOutput { pub(crate) sql: String, diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 60efe69faa..21c7646560 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -44,6 +44,12 @@ use pipeline::{GreptimePipelineParams, Pipeline, PipelineInfo, PipelineVersion, use serde_json::Value; use session::context::{QueryContext, QueryContextRef}; +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct DashboardDefinition { + pub name: String, + pub definition: String, +} + use crate::error::Result; use crate::http::jaeger::QueryTraceParams; use crate::influxdb::InfluxdbRequest; @@ -176,6 +182,18 @@ pub trait PipelineHandler { ) -> Result<(String, TimestampNanosecond)>; } +/// Handling dashboard as code CRUD +pub type 
DashboardHandlerRef = Arc; + +#[async_trait] +pub trait DashboardHandler { + async fn save(&self, name: &str, definition: &str, ctx: QueryContextRef) -> Result<()>; + + async fn list(&self, ctx: QueryContextRef) -> Result>; + + async fn delete(&self, name: &str, ctx: QueryContextRef) -> Result<()>; +} + /// Handle log query requests. #[async_trait] pub trait LogQueryHandler { diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index 0c6b965fd3..ec35205a55 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -5,7 +5,7 @@ edition.workspace = true license.workspace = true [features] -dashboard = [] +dashboard = ["servers/dashboard"] vector_index = [] [lints] diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index fd0d1ef3c4..2bf6e812c7 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -534,6 +534,7 @@ pub async fn setup_test_http_app_with_frontend_and_custom_options( .with_influxdb_handler(instance.fe_instance().clone()) .with_otlp_handler(instance.fe_instance().clone(), true) .with_jaeger_handler(instance.fe_instance().clone()) + .with_dashboard_handler(instance.fe_instance().clone()) .with_greptime_config_options(instance.opts.to_toml().unwrap()); if let Some(user_provider) = user_provider { diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 68fa2a228d..c259d3ff24 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -106,6 +106,7 @@ macro_rules! 
http_tests { test_config_api, test_dynamic_tracer_toggle, test_dashboard_path, + test_dashboard_api, test_prometheus_remote_write, test_prometheus_remote_special_labels, test_prometheus_remote_schema_labels, @@ -1720,6 +1721,121 @@ pub async fn test_dashboard_path(store_type: StorageType) { #[cfg(not(feature = "dashboard"))] pub async fn test_dashboard_path(_: StorageType) {} +#[cfg(feature = "dashboard")] +pub async fn test_dashboard_api(store_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "dashboard_api").await; + let client = TestClient::new(app).await; + + // 1. List dashboards - should be empty initially + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert!(dashboards.is_empty()); + + // 2. Save a dashboard + let dashboard_definition = r#"{"title": "My Dashboard", "panels": []}"#; + let res = client + .post("/v1/dashboards/test_dashboard") + .body(dashboard_definition) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard"); + + // 3. Save another dashboard + let res = client + .post("/v1/dashboards/another_dashboard") + .body(r#"{"title": "Another Dashboard"}"#) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 4. 
List dashboards - should have 2 + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 2); + + let names: Vec<&str> = dashboards + .iter() + .map(|d| d.get("name").unwrap().as_str().unwrap()) + .collect(); + assert!(names.contains(&"test_dashboard")); + assert!(names.contains(&"another_dashboard")); + + // 5. Update a dashboard by posting again with new definition + let updated_definition = r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"#; + let res = client + .post("/v1/dashboards/test_dashboard") + .body(updated_definition) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard"); + + // Verify the definition was updated by listing again + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 2); + + // Find test_dashboard and verify it has updated definition + let test_db = dashboards + .iter() + .find(|d| d.get("name").unwrap() == "test_dashboard") + .unwrap(); + assert_eq!( + test_db.get("definition").unwrap(), + r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"# + ); + + // 6. Delete one dashboard + let res = client.delete("/v1/dashboards/test_dashboard").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard"); + + // 7. 
List dashboards - should have 1 + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "another_dashboard"); + + // 8. Delete the remaining dashboard + let res = client + .delete("/v1/dashboards/another_dashboard") + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 9. List dashboards - should be empty + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert!(dashboards.is_empty()); + + guard.remove_all().await; +} + +#[cfg(not(feature = "dashboard"))] +pub async fn test_dashboard_api(_: StorageType) {} + pub async fn test_prometheus_remote_write(store_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = From 0572a680af48c1e0fad55a3eea0087852940a273 Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Fri, 13 Mar 2026 11:57:08 +0800 Subject: [PATCH 002/195] fix: allow empty string for env values (#7803) * fix: allow empty string for env values Signed-off-by: Dennis Zhuang * chore: strip suffix Signed-off-by: Dennis Zhuang --------- Signed-off-by: Dennis Zhuang --- src/common/config/src/config.rs | 29 +++++++++++++++++- src/common/query/src/prelude.rs | 53 ++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/src/common/config/src/config.rs b/src/common/config/src/config.rs index e25c46a0c0..85ce3d206f 100644 --- a/src/common/config/src/config.rs +++ b/src/common/config/src/config.rs @@ -53,7 +53,7 @@ pub trait Configurable: Serialize + DeserializeOwned + Default + Sized { env.try_parsing(true) .separator(ENV_VAR_SEP) - .ignore_empty(true) + .ignore_empty(false) }; // Workaround: 
Replacement for `Config::try_from(&default_opts)` due to @@ -237,4 +237,31 @@ mod tests { }, ); } + + #[derive(Debug, Serialize, Deserialize, Default)] + struct SimpleConfig { + name: Option, + prefix: Option, + } + + impl Configurable for SimpleConfig {} + + #[test] + fn test_empty_env_var_is_not_ignored() { + let env_prefix = "SIMPLE_CFG_UT"; + temp_env::with_vars( + [( + [env_prefix.to_string(), "PREFIX".to_string()].join(ENV_VAR_SEP), + Some(""), + )], + || { + let opts = SimpleConfig::load_layered_options(None, env_prefix).unwrap(); + // With ignore_empty(false), an empty env var should yield Some("") + // rather than None (which was the previous behavior with ignore_empty(true)). + assert_eq!(opts.prefix, Some("".to_string())); + // Unset env var should remain None. + assert_eq!(opts.name, None); + }, + ); + } } diff --git a/src/common/query/src/prelude.rs b/src/common/query/src/prelude.rs index c27b94294e..50668bbbb1 100644 --- a/src/common/query/src/prelude.rs +++ b/src/common/query/src/prelude.rs @@ -27,7 +27,16 @@ static GREPTIME_TIMESTAMP_CELL: OnceCell = OnceCell::new(); static GREPTIME_VALUE_CELL: OnceCell = OnceCell::new(); pub fn set_default_prefix(prefix: Option<&str>) -> Result<()> { - match prefix { + // Strip surrounding double quotes as a defensive measure against upstream + // sources (scripts, CI, template engines, incorrect shell escaping) that may + // pass literal `""` as the value instead of an empty string. 
+ let stripped = prefix.map(|s| { + s.strip_prefix('"') + .and_then(|s| s.strip_suffix('"')) + .unwrap_or(s) + }); + + match stripped { None => { // use default greptime prefix GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string()); @@ -70,3 +79,45 @@ const GREPTIME_VALUE: &str = "greptime_value"; pub const GREPTIME_COUNT: &str = "greptime_count"; /// Default physical table name pub const GREPTIME_PHYSICAL_TABLE: &str = "greptime_physical_table"; + +#[cfg(test)] +mod tests { + use super::*; + + // Each test runs in a separate process via `cargo nextest`, so OnceCell + // state does not leak between tests. + + #[test] + fn test_set_default_prefix_none() { + set_default_prefix(None).unwrap(); + assert_eq!(greptime_timestamp(), "greptime_timestamp"); + assert_eq!(greptime_value(), "greptime_value"); + } + + #[test] + fn test_set_default_prefix_empty_string() { + set_default_prefix(Some("")).unwrap(); + assert_eq!(greptime_timestamp(), "timestamp"); + assert_eq!(greptime_value(), "value"); + } + + #[test] + fn test_set_default_prefix_quoted_empty() { + // Handles upstream sources that pass literal `""` instead of an empty string + set_default_prefix(Some("\"\"")).unwrap(); + assert_eq!(greptime_timestamp(), "timestamp"); + assert_eq!(greptime_value(), "value"); + } + + #[test] + fn test_set_default_prefix_custom() { + set_default_prefix(Some("mydb")).unwrap(); + assert_eq!(greptime_timestamp(), "mydb_timestamp"); + assert_eq!(greptime_value(), "mydb_value"); + } + + #[test] + fn test_set_default_prefix_invalid() { + assert!(set_default_prefix(Some("invalid prefix!")).is_err()); + } +} From 37105c8354c0fa86f941f563af1f8db8399e9e14 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:28:58 +0800 Subject: [PATCH 003/195] chore(deps): bump quinn-proto from 0.11.12 to 0.11.14 (#7805) Bumps [quinn-proto](https://github.com/quinn-rs/quinn) from 0.11.12 to 0.11.14. 
- [Release notes](https://github.com/quinn-rs/quinn/releases) - [Commits](https://github.com/quinn-rs/quinn/compare/quinn-proto-0.11.12...quinn-proto-0.11.14) --- updated-dependencies: - dependency-name: quinn-proto dependency-version: 0.11.14 dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 85c2b1ed2d..94f7a3eca1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -10771,9 +10771,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.12" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "bytes", "getrandom 0.3.3", From 20f38d8a6aabeb905e2f4a0c21743ad98fa7aee2 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Fri, 13 Mar 2026 16:00:09 +0800 Subject: [PATCH 004/195] test(fuzz): add metric table repartition fuzz target (#7754) * test: add fuzz_repartition_metric_table target scaffold Signed-off-by: WenyXu * test: add metric logical lifecycle in repartition fuzz target Signed-off-by: WenyXu * test: support partitioned metric tables in repartition fuzz Signed-off-by: WenyXu * test: add repartition loop and partition assertions for metric target Signed-off-by: WenyXu * test: use shared timestamp clock in metric repartition writes Signed-off-by: WenyXu * refactor: unify string value and bound generation for fuzzing Signed-off-by: WenyXu * test: use fixed physical table name in metric 
repartition fuzz Signed-off-by: WenyXu * chore: fmt Signed-off-by: WenyXu * ci: update ci config Signed-off-by: WenyXu * refactor: use btreemap Signed-off-by: WenyXu * print count result Signed-off-by: WenyXu * test: add csv translator for insert expr Introduce a dedicated top-level csv translator so fuzz insert expressions can be converted into writer-ready records through a structured path instead of ad-hoc formatting in targets. Signed-off-by: WenyXu * test: add csv dump session utilities Introduce CSV dump env helpers and a session writer that creates run directories, emits seed metadata, and flushes staged CSV records for fuzz workflows. Signed-off-by: WenyXu * test: bound csv dump buffer with auto flush Parse readable buffer sizes from env and flush staged CSV records automatically when the in-memory threshold is reached to prevent unbounded growth during long fuzz runs. Signed-off-by: WenyXu * test: flush csv dump before repartition validation Wire csv dump session into the metric repartition fuzz flow so successful inserts are translated from insert expressions into CSV records during write loops and flushed to disk right before row validation. Signed-off-by: WenyXu * test: keep csv dumps on failure and cleanup on pass Capture run outcomes in metric repartition fuzz, remove dump directories only after successful validation, and retain dump paths on failures so CI and local investigations can use the same artifacts. Signed-off-by: WenyXu * test: align partial csv records with table headers Keep append payload compact by storing partial insert-expression columns, then expand to full table-context headers at flush time and fill missing values with empty strings. 
Signed-off-by: WenyXu * chore: add logs Signed-off-by: WenyXu * dump csv Signed-off-by: WenyXu * ci: dump csv Signed-off-by: WenyXu * refactor Signed-off-by: WenyXu * test: add table-scoped sql dump writer primitives Signed-off-by: WenyXu * test: capture table-scoped sql traces after execution Record insert and repartition SQL only after successful execution, include started_at_ms and elapsed_ms in trace comments, and broadcast repartition events into every logical-table trace file for consistent debugging context. Signed-off-by: WenyXu * test: harden sql trace comments and include create sql Normalize multiline trace comments into valid SQL comment lines and append logical-table CREATE SQL to per-table traces for better timeline reconstruction during repartition debugging. Signed-off-by: WenyXu * test: dump physical create and repartition SQL traces Signed-off-by: WenyXu * dump repartition sql Signed-off-by: WenyXu * test: scaffold writer control channel for barrier flow Add Barrier/Resume/Stop control skeleton and channel wiring in write_loop to prepare per-repartition validation barriers. Also align SQL dump tests with broadcast SQL payload behavior. Signed-off-by: WenyXu * test: implement writer barrier pause and resume control Make writer control messages effective by pausing writes on barrier, resuming on resume, and stopping via channel signaling so the next commit can enforce deterministic per-repartition validation boundaries. Signed-off-by: WenyXu * test: validate rows after each repartition barrier Add per-action barrier/ack synchronization with timeout, run immediate logical-table row validation after each repartition, and resume writer only after validation completes to improve minimal failure localization. Signed-off-by: WenyXu * test: flush dump sessions before per-epoch validation Extract a shared flush-and-snapshot helper and call it before each immediate row validation so CSV/SQL artifacts are persisted at the same epoch boundary being validated. 
Signed-off-by: WenyXu * fix: fix unit tests Signed-off-by: WenyXu * chore: add retry Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- .github/workflows/develop.yml | 16 +- tests-fuzz/Cargo.toml | 7 + tests-fuzz/README.md | 20 + tests-fuzz/src/fake.rs | 20 + tests-fuzz/src/generator/create_expr.rs | 129 +++- tests-fuzz/src/ir.rs | 37 +- tests-fuzz/src/ir/partition_expr.rs | 6 +- tests-fuzz/src/ir/string_value.rs | 162 +++++ tests-fuzz/src/translator.rs | 2 + tests-fuzz/src/translator/csv.rs | 121 ++++ tests-fuzz/src/utils.rs | 42 ++ tests-fuzz/src/utils/csv_dump_writer.rs | 383 ++++++++++ tests-fuzz/src/utils/retry.rs | 49 ++ tests-fuzz/src/utils/sql_dump_writer.rs | 267 +++++++ .../ddl/fuzz_repartition_metric_table.rs | 684 ++++++++++++++++++ 15 files changed, 1892 insertions(+), 53 deletions(-) create mode 100644 tests-fuzz/src/ir/string_value.rs create mode 100644 tests-fuzz/src/translator/csv.rs create mode 100644 tests-fuzz/src/utils/csv_dump_writer.rs create mode 100644 tests-fuzz/src/utils/retry.rs create mode 100644 tests-fuzz/src/utils/sql_dump_writer.rs create mode 100644 tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 0238e92c8d..b6ab0f8926 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -319,7 +319,13 @@ jobs: include: - target: "fuzz_repartition_table" mode: - name: "Local WAL Repartition GC" + name: "Local WAL mito table repartition" + minio: true + kafka: false + values: "with-minio-repartition-gc.yaml" + - target: "fuzz_repartition_metric_table" + mode: + name: "Local WAL metric table repartition" minio: true kafka: false values: "with-minio-repartition-gc.yaml" @@ -455,6 +461,14 @@ jobs: path: /tmp/fuzz-monitor-dumps if-no-files-found: warn retention-days: 3 + - name: Upload CSV dumps + if: failure() + uses: actions/upload-artifact@v4 + with: + name: 
fuzz-tests-csv-dumps-${{ matrix.mode.name }}-${{ matrix.target }} + path: /tmp/greptime-fuzz-dumps + if-no-files-found: warn + retention-days: 3 - name: Delete cluster if: success() shell: bash diff --git a/tests-fuzz/Cargo.toml b/tests-fuzz/Cargo.toml index a537ca0687..bc687092c0 100644 --- a/tests-fuzz/Cargo.toml +++ b/tests-fuzz/Cargo.toml @@ -100,6 +100,13 @@ test = false bench = false doc = false +[[bin]] +name = "fuzz_repartition_metric_table" +path = "targets/ddl/fuzz_repartition_metric_table.rs" +test = false +bench = false +doc = false + [[bin]] name = "fuzz_alter_table" path = "targets/ddl/fuzz_alter_table.rs" diff --git a/tests-fuzz/README.md b/tests-fuzz/README.md index 6807e19a1c..cc9d7eb84e 100644 --- a/tests-fuzz/README.md +++ b/tests-fuzz/README.md @@ -66,3 +66,23 @@ GT_FUZZ_OVERRIDE_SEED=6666 GT_FUZZ_OVERRIDE_ACTIONS=175 cargo fuzz run fuzz_targ ``` For more details, visit [cargo fuzz](https://rust-fuzz.github.io/book/cargo-fuzz/tutorial.html) or run the command `cargo fuzz --help`. + +## Repartition Metric Dump Artifacts + +For `fuzz_repartition_metric_table`, dump artifacts are written under one run directory. + +- Table data snapshots: `<table_name>.table-data.csv` +- SQL traces per logical table: `<table_name>.trace.sql` +- Seed metadata: `seed.meta` + +SQL trace behavior: + +- Insert SQL is appended after successful execution with comment fields including + `started_at_ms` and `elapsed_ms`. +- Repartition events are broadcast to all logical table trace files with comment fields including + `action_idx`, `started_at_ms`, `elapsed_ms`, and SQL text. + +Run directory lifecycle: + +- On success, the run directory is cleaned up. +- On failure, the run directory is retained for CI/local diffing.
diff --git a/tests-fuzz/src/fake.rs b/tests-fuzz/src/fake.rs index aa92e0293a..8910a39206 100644 --- a/tests-fuzz/src/fake.rs +++ b/tests-fuzz/src/fake.rs @@ -65,6 +65,26 @@ where _v: PhantomData, } +pub struct ConstGenerator { + value: V, +} + +impl ConstGenerator { + pub fn new(value: V) -> Self { + Self { value } + } +} + +impl Random for ConstGenerator +where + R: Rng, + V: Clone, +{ + fn choose(&self, _rng: &mut R, amount: usize) -> Vec { + vec![self.value.clone(); amount] + } +} + pub fn random_capitalize_map(rng: &mut R, s: Ident) -> Ident { let mut v = s.value.chars().collect::>(); diff --git a/tests-fuzz/src/generator/create_expr.rs b/tests-fuzz/src/generator/create_expr.rs index fae6a95eda..261a310db2 100644 --- a/tests-fuzz/src/generator/create_expr.rs +++ b/tests-fuzz/src/generator/create_expr.rs @@ -193,6 +193,26 @@ fn generate_partition_def( } } +fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> { + if partitions <= 1 { + return None; + } + + let partition_column = Column { + name: Ident::new("host"), + column_type: ConcreteDataType::string_datatype(), + options: vec![ColumnOption::PrimaryKey], + }; + let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1); + let partitions = SimplePartitions::new(partition_column.name.clone(), bounds); + let partition_def = PartitionDef { + columns: vec![partitions.column_name.clone()], + exprs: partitions.generate().unwrap(), + }; + + Some((partition_column, partition_def)) +} + /// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type. 
#[derive(Builder)] #[builder(pattern = "owned")] @@ -201,6 +221,8 @@ pub struct CreatePhysicalTableExprGenerator { name_generator: Box>, #[builder(default = "false")] if_not_exists: bool, + #[builder(default = "0")] + partition: usize, #[builder(default, setter(into))] with_clause: HashMap, } @@ -215,25 +237,35 @@ impl Generator for CreatePhysicalTableExpr options.insert(key.clone(), Value::from(value.clone())); } + let mut columns = vec![ + Column { + name: Ident::new("ts"), + column_type: ConcreteDataType::timestamp_millisecond_datatype(), + options: vec![ColumnOption::TimeIndex], + }, + Column { + name: Ident::new("val"), + column_type: ConcreteDataType::float64_datatype(), + options: vec![], + }, + ]; + + let mut partition = None; + let mut primary_keys = vec![]; + if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) { + columns.push(partition_column); + partition = Some(partition_def); + primary_keys.push(columns.len() - 1); + } + Ok(CreateTableExpr { table_name: self.name_generator.generate(rng), - columns: vec![ - Column { - name: Ident::new("ts"), - column_type: ConcreteDataType::timestamp_millisecond_datatype(), - options: vec![ColumnOption::TimeIndex], - }, - Column { - name: Ident::new("val"), - column_type: ConcreteDataType::float64_datatype(), - options: vec![], - }, - ], + columns, if_not_exists: self.if_not_exists, - partition: None, + partition, engine: "metric".to_string(), options, - primary_keys: vec![], + primary_keys, }) } } @@ -245,6 +277,8 @@ pub struct CreateLogicalTableExprGenerator { physical_table_ctx: TableContextRef, labels: usize, if_not_exists: bool, + #[builder(default = "true")] + include_partition_column: bool, #[builder(default = "Box::new(WordGenerator)")] name_generator: Box>, } @@ -253,11 +287,11 @@ impl Generator for CreateLogicalTableExprG type Error = Error; fn generate(&self, rng: &mut R) -> Result { - // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical 
table must have two columns. + // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical table must have ts and val. ensure!( - self.physical_table_ctx.columns.len() == 2, + self.physical_table_ctx.columns.len() >= 2, error::UnexpectedSnafu { - violated: "The physical table must have two columns" + violated: "The physical table must have at least two columns" } ); @@ -265,9 +299,16 @@ impl Generator for CreateLogicalTableExprG let logical_table_name = self .physical_table_ctx .generate_unique_table_name(rng, self.name_generator.as_ref()); + let mut physical_columns = self.physical_table_ctx.columns.clone(); + if !self.include_partition_column + && let Some(partition_def) = &self.physical_table_ctx.partition + { + physical_columns.retain(|column| !partition_def.columns.contains(&column.name)); + } + let mut logical_table = CreateTableExpr { table_name: logical_table_name, - columns: self.physical_table_ctx.columns.clone(), + columns: physical_columns, if_not_exists: self.if_not_exists, partition: None, engine: "metric".to_string(), @@ -459,6 +500,58 @@ mod tests { })); } + #[test] + fn test_create_physical_table_expr_generator_with_partition() { + let mut rng = rand::rng(); + let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default() + .partition(3) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + + assert_eq!(physical_table_expr.engine, "metric"); + assert!(physical_table_expr.partition.is_some()); + assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3); + } + + #[test] + fn test_create_logical_table_expr_generator_without_partition_column() { + let mut rng = rand::rng(); + let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default() + .partition(3) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + let partition_columns = physical_table_expr + .partition + .as_ref() + .unwrap() + .columns + .clone(); + let physical_table_ctx = 
Arc::new(TableContext::from(&physical_table_expr)); + + let logical_table_expr = CreateLogicalTableExprGeneratorBuilder::default() + .physical_table_ctx(physical_table_ctx) + .labels(3) + .include_partition_column(false) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + + assert!( + logical_table_expr + .columns + .iter() + .all(|column| !partition_columns.contains(&column.name)) + ); + } + #[test] fn test_create_logical_table_expr_generator_deterministic() { let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0); diff --git a/tests-fuzz/src/ir.rs b/tests-fuzz/src/ir.rs index e8c15dcf95..ce1628cd61 100644 --- a/tests-fuzz/src/ir.rs +++ b/tests-fuzz/src/ir.rs @@ -20,6 +20,7 @@ pub(crate) mod insert_expr; pub(crate) mod partition_expr; pub(crate) mod repartition_expr; pub(crate) mod select_expr; +pub(crate) mod string_value; use core::fmt; use std::collections::HashMap; @@ -126,20 +127,7 @@ pub fn generate_partition_bounds(datatype: &ConcreteDataType, bounds: usize) -> ConcreteDataType::Int64(_) => generate_values!(i64, bounds), ConcreteDataType::Float32(_) => generate_values!(f32, bounds), ConcreteDataType::Float64(_) => generate_values!(f64, bounds), - ConcreteDataType::String(_) => { - let base = b'A'; - let range = b'z' - b'A'; - let step = range / (bounds as u8 + 1); - (1..=bounds) - .map(|i| { - Value::from( - char::from(base + step * i as u8) - .escape_default() - .to_string(), - ) - }) - .collect() - } + ConcreteDataType::String(_) => string_value::generate_partition_bounds(bounds), _ => unimplemented!("unsupported type: {datatype}"), } } @@ -157,10 +145,7 @@ pub fn generate_random_value( ConcreteDataType::Int64(_) => Value::from(rng.random::()), ConcreteDataType::Float32(_) => Value::from(rng.random::()), ConcreteDataType::Float64(_) => Value::from(rng.random::()), - ConcreteDataType::String(_) => match random_str { - Some(random) => Value::from(random.generate(rng).value), - None => Value::from(rng.random::().to_string()), 
- }, + ConcreteDataType::String(_) => string_value::generate_data_string_value(rng, random_str), ConcreteDataType::Date(_) => generate_random_date(rng), _ => unimplemented!("unsupported type: {datatype}"), @@ -341,21 +326,7 @@ pub fn generate_partition_value( } } datatypes::data_type::ConcreteDataType::String(_) => { - let upper = match first { - datatypes::value::Value::String(v) => v.as_utf8(), - _ => "", - }; - if bound_idx == 0 { - if upper <= "A" { - datatypes::value::Value::from("") - } else { - datatypes::value::Value::from("A") - } - } else if bound_idx < bounds.len() { - bounds[bound_idx - 1].clone() - } else { - last.clone() - } + string_value::generate_partition_value(bounds, bound_idx) } _ => unimplemented!("unsupported partition column type: {column_type}"), } diff --git a/tests-fuzz/src/ir/partition_expr.rs b/tests-fuzz/src/ir/partition_expr.rs index c91dd487ae..908223366c 100644 --- a/tests-fuzz/src/ir/partition_expr.rs +++ b/tests-fuzz/src/ir/partition_expr.rs @@ -20,7 +20,7 @@ use snafu::ensure; use crate::context::TableContext; use crate::error::{self, Result}; -use crate::ir::{Ident, generate_random_value}; +use crate::ir::{Ident, generate_random_value, string_value}; /// A partitioning scheme that divides a single column into multiple ranges based on provided bounds. 
/// @@ -245,6 +245,10 @@ pub fn generate_unique_bound( datatype: &ConcreteDataType, bounds: &[Value], ) -> Result { + if matches!(datatype, ConcreteDataType::String(_)) { + return string_value::generate_unique_partition_bound(rng, bounds); + } + for _ in 0..16 { let candidate = generate_random_value(rng, datatype, None); if !bounds.contains(&candidate) { diff --git a/tests-fuzz/src/ir/string_value.rs b/tests-fuzz/src/ir/string_value.rs new file mode 100644 index 0000000000..6a53aa69de --- /dev/null +++ b/tests-fuzz/src/ir/string_value.rs @@ -0,0 +1,162 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datatypes::value::Value; +use rand::Rng; + +use crate::error::{self, Result}; +use crate::generator::Random; +use crate::ir::Ident; + +const READABLE_CHARSET: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +fn readable_token(index: usize) -> String { + let base = READABLE_CHARSET.len(); + let mut n = index + 1; + let mut buf = Vec::new(); + + while n > 0 { + let rem = (n - 1) % base; + buf.push(READABLE_CHARSET[rem] as char); + n = (n - 1) / base; + } + + buf.iter().rev().collect() +} + +pub fn generate_data_string_value( + rng: &mut R, + random_str: Option<&dyn Random>, +) -> Value { + match random_str { + Some(random) => Value::from(random.generate(rng).value), + None => { + let idx = rng.random_range(0..(READABLE_CHARSET.len() * READABLE_CHARSET.len() * 4)); + Value::from(readable_token(idx)) + } + } +} + +/// Generates ordered readable string bounds for partition expressions. +pub fn generate_partition_bounds(bounds: usize) -> Vec { + let token_space = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024; + (1..=bounds) + .map(|i| { + let idx = i * token_space / (bounds + 1); + Value::from(readable_token(idx)) + }) + .collect() +} + +/// Picks a representative string value for the target partition range. +pub fn generate_partition_value(bounds: &[Value], bound_idx: usize) -> Value { + let first = bounds.first().unwrap(); + let last = bounds.last().unwrap(); + let upper = match first { + Value::String(v) => v.as_utf8(), + _ => "", + }; + + if bound_idx == 0 { + if upper <= "0" { + Value::from("") + } else { + Value::from("0") + } + } else if bound_idx < bounds.len() { + bounds[bound_idx - 1].clone() + } else { + last.clone() + } +} + +/// Generates a unique readable bound not present in existing bounds. 
+pub fn generate_unique_partition_bound(rng: &mut R, bounds: &[Value]) -> Result { + let search_space = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024; + let start = rng.random_range(0..search_space); + for offset in 0..search_space { + let idx = start + offset; + let candidate = Value::from(readable_token(idx)); + if !bounds.contains(&candidate) { + return Ok(candidate); + } + } + + error::UnexpectedSnafu { + violated: "unable to generate unique string partition bound".to_string(), + } + .fail() +} + +#[cfg(test)] +mod tests { + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + use super::*; + + #[test] + fn test_readable_token_grows_length() { + assert_eq!("0", readable_token(0)); + assert_eq!("9", readable_token(9)); + assert_eq!("A", readable_token(10)); + assert_eq!("z", readable_token(61)); + assert_eq!("00", readable_token(62)); + } + + #[test] + fn test_generate_partition_bounds_are_readable_and_unique() { + let bounds = generate_partition_bounds(8); + assert_eq!(8, bounds.len()); + + let mut values = bounds + .iter() + .map(|v| match v { + Value::String(s) => s.as_utf8().to_string(), + _ => panic!("expected string value"), + }) + .collect::>(); + let mut dedup = values.clone(); + dedup.sort(); + dedup.dedup(); + assert_eq!(values.len(), dedup.len()); + + for s in values.drain(..) 
{ + assert!(s.chars().all(|c| c.is_ascii_alphanumeric())); + } + } + + #[test] + fn test_generate_partition_value_for_string_bounds() { + let bounds = vec![Value::from("A"), Value::from("M")]; + assert_eq!(Value::from("0"), generate_partition_value(&bounds, 0)); + assert_eq!(Value::from("A"), generate_partition_value(&bounds, 1)); + assert_eq!(Value::from("M"), generate_partition_value(&bounds, 2)); + } + + #[test] + fn test_generate_unique_partition_bound_not_in_existing() { + let mut rng = ChaCha8Rng::seed_from_u64(42); + let bounds = vec![Value::from("0"), Value::from("1"), Value::from("2")]; + let candidate = generate_unique_partition_bound(&mut rng, &bounds).unwrap(); + assert!(!bounds.contains(&candidate)); + match candidate { + Value::String(s) => { + assert!(!s.as_utf8().is_empty()); + assert!(s.as_utf8().chars().all(|c| c.is_ascii_alphanumeric())); + } + _ => panic!("expected string value"), + } + } +} diff --git a/tests-fuzz/src/translator.rs b/tests-fuzz/src/translator.rs index 673b543f2c..4c5e0bb6a4 100644 --- a/tests-fuzz/src/translator.rs +++ b/tests-fuzz/src/translator.rs @@ -13,6 +13,8 @@ // limitations under the License. mod common; +/// Translator that converts insert expressions into CSV records. +pub mod csv; pub mod mysql; pub mod postgres; diff --git a/tests-fuzz/src/translator/csv.rs b/tests-fuzz/src/translator/csv.rs new file mode 100644 index 0000000000..e95956862c --- /dev/null +++ b/tests-fuzz/src/translator/csv.rs @@ -0,0 +1,121 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::Error; +use crate::ir::insert_expr::{InsertIntoExpr, RowValue}; +use crate::translator::DslTranslator; + +/// One CSV record converted from an insert row. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CsvRecord { + /// Cell values in column order. + pub values: Vec, +} + +/// CSV records converted from an insert expression. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CsvRecords { + /// Target table name from insert expression. + pub table_name: String, + /// Header values from insert columns. + pub headers: Vec, + /// Converted row records. + pub records: Vec, +} + +/// Translates `InsertIntoExpr` into CSV-writer-ready records. +pub struct InsertExprToCsvRecordsTranslator; + +impl DslTranslator for InsertExprToCsvRecordsTranslator { + type Error = Error; + + fn translate(&self, input: &InsertIntoExpr) -> Result { + let headers = input + .columns + .iter() + .map(|column| column.name.to_string()) + .collect::>(); + let records = input + .values_list + .iter() + .map(|row| CsvRecord { + values: row.iter().map(Self::format_row_value).collect(), + }) + .collect::>(); + + Ok(CsvRecords { + table_name: input.table_name.to_string(), + headers, + records, + }) + } +} + +impl InsertExprToCsvRecordsTranslator { + fn format_row_value(value: &RowValue) -> String { + match value { + RowValue::Value(datatypes::value::Value::Null) => String::new(), + RowValue::Value(v) => v.to_string(), + RowValue::Default => "DEFAULT".to_string(), + } + } +} + +#[cfg(test)] +mod tests { + use datatypes::data_type::ConcreteDataType; + + use super::InsertExprToCsvRecordsTranslator; + use crate::ir::create_expr::ColumnOption; + use crate::ir::insert_expr::{InsertIntoExpr, RowValue}; + use crate::ir::{Column, Ident}; + use crate::translator::DslTranslator; + + #[test] + fn test_translate_insert_expr_to_csv_records() { + let input = InsertIntoExpr { + 
table_name: Ident::new("metric_a"), + omit_column_list: false, + columns: vec![ + Column { + name: "host".into(), + column_type: ConcreteDataType::string_datatype(), + options: vec![ColumnOption::PrimaryKey], + }, + Column { + name: "value".into(), + column_type: ConcreteDataType::float64_datatype(), + options: vec![], + }, + ], + values_list: vec![ + vec![ + RowValue::Value(datatypes::value::Value::String("web-1".into())), + RowValue::Value(datatypes::value::Value::Int32(15)), + ], + vec![ + RowValue::Value(datatypes::value::Value::Null), + RowValue::Default, + ], + ], + }; + + let output = InsertExprToCsvRecordsTranslator.translate(&input).unwrap(); + assert_eq!(output.table_name, "metric_a"); + assert_eq!(output.headers, vec!["host", "value"]); + assert_eq!(output.records.len(), 2); + assert_eq!(output.records[0].values, vec!["web-1", "15"]); + assert_eq!(output.records[1].values, vec!["", "DEFAULT"]); + } +} diff --git a/tests-fuzz/src/utils.rs b/tests-fuzz/src/utils.rs index 0780f6c93d..d55abab3c2 100644 --- a/tests-fuzz/src/utils.rs +++ b/tests-fuzz/src/utils.rs @@ -15,6 +15,8 @@ pub mod cluster_info; pub mod config; pub mod crd; +/// CSV dump writer utilities for fuzz tests. +pub mod csv_dump_writer; pub mod health; pub mod migration; pub mod partition; @@ -22,10 +24,15 @@ pub mod pod_failure; pub mod procedure; #[cfg(feature = "unstable")] pub mod process; +pub mod retry; +/// SQL dump writer utilities for fuzz tests. 
+pub mod sql_dump_writer; pub mod wait; use std::env; +use std::str::FromStr; +use common_base::readable_size::ReadableSize; use common_telemetry::info; use common_telemetry::tracing::log::LevelFilter; use paste::paste; @@ -126,6 +133,14 @@ pub const GT_FUZZ_INPUT_MAX_COLUMNS: &str = "GT_FUZZ_INPUT_MAX_COLUMNS"; pub const GT_FUZZ_INPUT_MAX_ALTER_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_ALTER_ACTIONS"; pub const GT_FUZZ_INPUT_MAX_INSERT_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_INSERT_ACTIONS"; pub const FUZZ_OVERRIDE_PREFIX: &str = "GT_FUZZ_OVERRIDE_"; +/// Enables CSV dump generation for fuzz runs. +pub const GT_FUZZ_DUMP_TABLE_CSV: &str = "GT_FUZZ_DUMP_TABLE_CSV"; +/// Base directory for CSV dump sessions. +pub const GT_FUZZ_DUMP_DIR: &str = "GT_FUZZ_DUMP_DIR"; +/// Directory suffix used by one CSV dump session. +pub const GT_FUZZ_DUMP_SUFFIX: &str = "GT_FUZZ_DUMP_SUFFIX"; +/// Max in-memory CSV buffer size before auto flush. +pub const GT_FUZZ_DUMP_BUFFER_MAX_BYTES: &str = "GT_FUZZ_DUMP_BUFFER_MAX_BYTES"; /// Reads an override value for a fuzz parameter from env `GT_FUZZ_OVERRIDE_<NAME>`. pub fn get_fuzz_override(name: &str) -> Option @@ -137,6 +152,33 @@ where env::var(&key).ok().and_then(|v| v.parse().ok()) } +/// Returns CSV dump base directory. +pub fn get_gt_fuzz_dump_dir() -> String { + let _ = dotenv::dotenv(); + env::var(GT_FUZZ_DUMP_DIR).unwrap_or_else(|_| "/tmp/greptime-fuzz-dumps".to_string()) +} + +/// Returns CSV dump directory suffix. +pub fn get_gt_fuzz_dump_suffix() -> String { + let _ = dotenv::dotenv(); + env::var(GT_FUZZ_DUMP_SUFFIX).unwrap_or_else(|_| ".repartition-metric-csv".to_string()) +} + +/// Returns max CSV in-memory buffer size. +pub fn get_gt_fuzz_dump_buffer_max_bytes() -> usize { + let _ = dotenv::dotenv(); + env::var(GT_FUZZ_DUMP_BUFFER_MAX_BYTES) + .ok() + .and_then(|value| { + value.parse::().ok().or_else(|| { + ReadableSize::from_str(&value) + .ok() + .map(|size| size.as_bytes() as usize) + }) + }) + .unwrap_or(8 * 1024 * 1024) +} + macro_rules!
make_get_from_env_helper { ($key:expr, $default: expr) => { paste! { diff --git a/tests-fuzz/src/utils/csv_dump_writer.rs b/tests-fuzz/src/utils/csv_dump_writer.rs new file mode 100644 index 0000000000..de16a23c24 --- /dev/null +++ b/tests-fuzz/src/utils/csv_dump_writer.rs @@ -0,0 +1,383 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::fs::{File, OpenOptions, create_dir_all, remove_dir_all}; +use std::io::Write; +use std::path::{Path, PathBuf}; + +use common_telemetry::{info, warn}; +use common_time::util::current_time_millis; +use snafu::ResultExt; + +use crate::error::{self, Result}; +use crate::translator::csv::CsvRecords; +use crate::utils::{ + get_gt_fuzz_dump_buffer_max_bytes, get_gt_fuzz_dump_dir, get_gt_fuzz_dump_suffix, +}; + +/// Metadata for one CSV dump session. +#[derive(Debug, Clone)] +pub struct CsvDumpMetadata { + /// Fuzz target name. + pub target: String, + /// Seed used by current fuzz input. + pub seed: u64, + /// Repartition action count. + pub actions: usize, + /// Initial partition count. + pub partitions: usize, + /// Logical table count. + pub tables: usize, + /// Session start time in unix milliseconds. + pub started_at_unix_ms: i64, +} + +impl CsvDumpMetadata { + /// Builds dump metadata with current timestamp. 
+ pub fn new( + target: impl Into, + seed: u64, + actions: usize, + partitions: usize, + tables: usize, + ) -> Self { + Self { + target: target.into(), + seed, + actions, + partitions, + tables, + started_at_unix_ms: current_time_millis(), + } + } +} + +/// Session writer for staged CSV dump records. +#[derive(Debug)] +pub struct CsvDumpSession { + /// Session metadata. + pub metadata: CsvDumpMetadata, + /// Session directory path. + pub run_dir: PathBuf, + /// Max in-memory buffer size before auto flush. + pub max_buffer_bytes: usize, + records: Vec, + buffered_bytes: usize, + written_tables: HashSet, + full_headers_by_table: HashMap>, +} + +impl CsvDumpSession { + /// Creates session directory and writes seed metadata file. + pub fn new(metadata: CsvDumpMetadata) -> Result { + Self::new_with_buffer_limit(metadata, get_gt_fuzz_dump_buffer_max_bytes()) + } + + /// Creates session with a custom in-memory buffer limit. + pub fn new_with_buffer_limit( + metadata: CsvDumpMetadata, + max_buffer_bytes: usize, + ) -> Result { + let run_dir = build_run_dir(&metadata); + create_dir_all(&run_dir).context(error::CreateFileSnafu { + path: run_dir.to_string_lossy().to_string(), + })?; + write_seed_meta(&run_dir, &metadata)?; + info!( + "Create csv dump session, target: {}, run_dir: {}, max_buffer_bytes: {}", + metadata.target, + run_dir.display(), + max_buffer_bytes + ); + + Ok(Self { + metadata, + run_dir, + max_buffer_bytes, + records: Vec::new(), + buffered_bytes: 0, + written_tables: HashSet::new(), + full_headers_by_table: HashMap::new(), + }) + } + + /// Appends one table CSV records batch with full table headers. 
+ pub fn append(&mut self, records: CsvRecords, full_headers: Vec) -> Result<()> { + self.full_headers_by_table + .entry(records.table_name.clone()) + .or_insert(full_headers); + self.buffered_bytes += estimate_csv_records_size(&records); + self.records.push(records); + if self.buffered_bytes >= self.max_buffer_bytes { + self.flush_buffered_records()?; + } + Ok(()) + } + + /// Flushes all appended batches to CSV files. + pub fn flush_all(&mut self) -> Result<()> { + self.flush_buffered_records() + } + + /// Removes session directory after successful validation. + pub fn cleanup_on_success(&self) -> std::io::Result<()> { + match remove_dir_all(&self.run_dir) { + Ok(_) => { + info!( + "Cleanup csv dump directory on success: {}", + self.run_dir.display() + ); + Ok(()) + } + Err(err) => { + warn!( + "Cleanup csv dump directory failed: {}, error: {:?}", + self.run_dir.display(), + err + ); + Err(err) + } + } + } + + fn flush_buffered_records(&mut self) -> Result<()> { + if self.records.is_empty() { + return Ok(()); + } + for batch in &self.records { + write_batch_csv( + &self.run_dir, + batch, + &mut self.written_tables, + &self.full_headers_by_table, + )?; + } + self.records.clear(); + self.buffered_bytes = 0; + Ok(()) + } +} + +fn write_seed_meta(run_dir: &Path, metadata: &CsvDumpMetadata) -> Result<()> { + let path = run_dir.join("seed.meta"); + let mut file = File::create(&path).context(error::CreateFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + + let content = format!( + "target={}\nseed={}\nactions={}\npartitions={}\ntables={}\nstarted_at_unix_ms={}\n", + metadata.target, + metadata.seed, + metadata.actions, + metadata.partitions, + metadata.tables, + metadata.started_at_unix_ms, + ); + file.write_all(content.as_bytes()) + .context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + }) +} + +fn write_batch_csv( + run_dir: &Path, + batch: &CsvRecords, + written_tables: &mut HashSet, + full_headers_by_table: &HashMap>, +) -> 
/// Renders one CSV row from `cells`.
///
/// Every cell is passed through [`escape_csv_cell`] and the results are joined
/// with `,`. No trailing newline is appended; the caller writes the line
/// terminator itself. An empty slice yields an empty string.
fn join_line(cells: &[String]) -> String {
    cells
        .iter()
        .map(|cell| escape_csv_cell(cell))
        .collect::<Vec<_>>()
        .join(",")
}

/// Escapes a single CSV cell.
///
/// A cell containing a comma, a double quote, or a CR/LF character is wrapped
/// in double quotes, with every embedded double quote doubled. Any other cell
/// is returned unchanged.
fn escape_csv_cell(value: &str) -> String {
    let needs_quoting = value.contains([',', '"', '\n', '\r']);
    if needs_quoting {
        format!("\"{}\"", value.replace('"', "\"\""))
    } else {
        value.to_string()
    }
}
String { + raw.chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' { + ch + } else { + '_' + } + }) + .collect() +} + +fn build_run_dir(metadata: &CsvDumpMetadata) -> PathBuf { + let base = PathBuf::from(get_gt_fuzz_dump_dir()); + let suffix = get_gt_fuzz_dump_suffix(); + let name = format!( + "{}_seed_{}_actions_{}_ts_{}{}", + metadata.target, metadata.seed, metadata.actions, metadata.started_at_unix_ms, suffix + ); + base.join(name) +} + +#[cfg(test)] +mod tests { + use super::{CsvDumpMetadata, CsvDumpSession}; + use crate::translator::csv::{CsvRecord, CsvRecords}; + + #[test] + fn test_create_session_and_flush() { + let mut session = CsvDumpSession::new_with_buffer_limit( + CsvDumpMetadata::new("fuzz_case", 1, 2, 3, 4), + 1024, + ) + .unwrap(); + session + .append( + CsvRecords { + table_name: "metric-a".to_string(), + headers: vec!["host".to_string(), "value".to_string()], + records: vec![CsvRecord { + values: vec!["web-1".to_string(), "10".to_string()], + }], + }, + vec!["host".to_string(), "value".to_string()], + ) + .unwrap(); + session.flush_all().unwrap(); + + assert!(session.run_dir.exists()); + assert!(session.run_dir.join("seed.meta").exists()); + assert!(session.run_dir.join("metric-a.table-data.csv").exists()); + } + + #[test] + fn test_auto_flush_on_buffer_limit() { + let mut session = + CsvDumpSession::new_with_buffer_limit(CsvDumpMetadata::new("fuzz_case", 5, 2, 3, 4), 1) + .unwrap(); + session + .append( + CsvRecords { + table_name: "metric-b".to_string(), + headers: vec!["host".to_string()], + records: vec![CsvRecord { + values: vec!["web-2".to_string()], + }], + }, + vec!["host".to_string()], + ) + .unwrap(); + + assert!(session.run_dir.join("metric-b.table-data.csv").exists()); + assert_eq!(session.buffered_bytes, 0); + } + + #[test] + fn test_flush_with_partial_headers_uses_full_headers() { + let mut session = CsvDumpSession::new_with_buffer_limit( + CsvDumpMetadata::new("fuzz_case", 7, 2, 3, 4), + 1024, + ) + 
.unwrap(); + session + .append( + CsvRecords { + table_name: "metric-c".to_string(), + headers: vec!["host".to_string(), "value".to_string()], + records: vec![CsvRecord { + values: vec!["web-3".to_string(), "12".to_string()], + }], + }, + vec!["host".to_string(), "idc".to_string(), "value".to_string()], + ) + .unwrap(); + session.flush_all().unwrap(); + + let file = + std::fs::read_to_string(session.run_dir.join("metric-c.table-data.csv")).unwrap(); + let mut lines = file.lines(); + assert_eq!(lines.next().unwrap(), "host,idc,value"); + assert_eq!(lines.next().unwrap(), "web-3,,12"); + } +} diff --git a/tests-fuzz/src/utils/retry.rs b/tests-fuzz/src/utils/retry.rs new file mode 100644 index 0000000000..06d1ede54f --- /dev/null +++ b/tests-fuzz/src/utils/retry.rs @@ -0,0 +1,49 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::future::Future; +use std::time::Duration; + +use common_telemetry::warn; + +pub async fn retry_with_backoff( + mut operation: F, + max_attempts: usize, + init_backoff: Duration, + max_backoff: Duration, +) -> Result +where + F: FnMut() -> Fut, + Fut: Future>, + E: std::fmt::Debug, +{ + let mut backoff = init_backoff; + for attempt in 0..max_attempts { + match operation().await { + Ok(result) => return Ok(result), + Err(err) if attempt + 1 == max_attempts => return Err(err), + Err(err) => { + let current_attempt = attempt + 1; + warn!( + "Retryable operation failed, attempt: {}, max_attempts: {}, backoff: {:?}, error: {:?}", + current_attempt, max_attempts, backoff, err + ); + tokio::time::sleep(backoff).await; + backoff = std::cmp::min(backoff * 2, max_backoff); + } + } + } + + panic!("retry loop should always return") +} diff --git a/tests-fuzz/src/utils/sql_dump_writer.rs b/tests-fuzz/src/utils/sql_dump_writer.rs new file mode 100644 index 0000000000..6f098d9584 --- /dev/null +++ b/tests-fuzz/src/utils/sql_dump_writer.rs @@ -0,0 +1,267 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fs::{OpenOptions, create_dir_all}; +use std::io::Write; +use std::path::PathBuf; + +use snafu::ResultExt; + +use crate::error::{self, Result}; +use crate::utils::get_gt_fuzz_dump_buffer_max_bytes; + +/// Session writer for table-scoped SQL trace files. 
+#[derive(Debug)] +pub struct SqlDumpSession { + /// Session directory path. + pub run_dir: PathBuf, + /// Max in-memory buffer size before auto flush. + pub max_buffer_bytes: usize, + buffered_bytes: usize, + entries_by_table: HashMap>, +} + +impl SqlDumpSession { + /// Creates SQL dump session with default buffer limit. + pub fn new(run_dir: PathBuf) -> Result { + Self::new_with_buffer_limit(run_dir, get_gt_fuzz_dump_buffer_max_bytes()) + } + + /// Creates SQL dump session with custom buffer limit. + pub fn new_with_buffer_limit(run_dir: PathBuf, max_buffer_bytes: usize) -> Result { + create_dir_all(&run_dir).context(error::CreateFileSnafu { + path: run_dir.to_string_lossy().to_string(), + })?; + + Ok(Self { + run_dir, + max_buffer_bytes, + buffered_bytes: 0, + entries_by_table: HashMap::new(), + }) + } + + /// Appends one SQL statement for a logical table. + pub fn append_sql(&mut self, table: &str, sql: &str, comment: Option<&str>) -> Result<()> { + let entry = format_sql_entry(sql, comment); + self.push_entry(table, entry)?; + Ok(()) + } + + /// Broadcasts one comment event to all table trace files. + pub fn broadcast_event(&mut self, tables: I, event: &str, sql: &str) -> Result<()> + where + I: IntoIterator, + T: AsRef, + { + let entry = format_sql_entry(sql, Some(event)); + for table in tables { + self.push_entry(table.as_ref(), entry.clone())?; + } + Ok(()) + } + + /// Flushes all staged SQL traces to table-scoped files. 
/// Builds one trace entry: the normalized SQL statement, preceded by the
/// comment rendered as `-- ` lines when a comment is supplied.
fn format_sql_entry(sql: &str, comment: Option<&str>) -> String {
    let normalized_sql = normalize_sql(sql);
    match comment {
        Some(comment) => format!("{}\n{normalized_sql}", format_comment(comment)),
        None => normalized_sql,
    }
}

/// Prefixes every line of `comment` with `-- ` so a multi-line comment stays
/// a comment on each of its lines.
fn format_comment(comment: &str) -> String {
    comment
        .lines()
        .map(|line| format!("-- {line}"))
        .collect::<Vec<_>>()
        .join("\n")
}

/// Strips trailing whitespace from `sql` and makes sure it ends with `;`.
fn normalize_sql(sql: &str) -> String {
    let trimmed = sql.trim_end();
    if trimmed.ends_with(';') {
        trimmed.to_string()
    } else {
        format!("{trimmed};")
    }
}
+ fn test_append_sql_writes_table_trace_file() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-dump-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap(); + session + .append_sql( + "metric-a", + "INSERT INTO t VALUES(1)", + Some("kind=insert elapsed_ms=10"), + ) + .unwrap(); + session.flush_all().unwrap(); + + let content = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap(); + assert!(content.contains("-- kind=insert elapsed_ms=10")); + assert!(content.contains("INSERT INTO t VALUES(1);")); + } + + #[test] + fn test_broadcast_event_writes_to_all_tables() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-broadcast-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap(); + session + .broadcast_event( + ["metric-a", "metric-b"], + "repartition action_idx=3", + "ALTER TABLE t REPARTITION", + ) + .unwrap(); + session.flush_all().unwrap(); + + let content_a = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap(); + let content_b = std::fs::read_to_string(run_dir.join("metric-b.trace.sql")).unwrap(); + assert!(content_a.contains("-- repartition action_idx=3")); + assert!(content_a.contains("ALTER TABLE t REPARTITION;")); + assert!(content_b.contains("-- repartition action_idx=3")); + assert!(content_b.contains("ALTER TABLE t REPARTITION;")); + } + + #[test] + fn test_multiline_comment_is_prefixed_per_line() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-dump-comment-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap(); + session + .append_sql( + "metric-a", + "INSERT INTO t VALUES(1)", + 
Some("kind=insert\nstarted_at_ms=1 elapsed_ms=2"), + ) + .unwrap(); + session.flush_all().unwrap(); + + let content = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap(); + assert!(content.contains("-- kind=insert\n-- started_at_ms=1 elapsed_ms=2")); + } + + #[test] + fn test_auto_flush_on_buffer_limit() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-dump-limit-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1).unwrap(); + session + .append_sql("metric-a", "INSERT INTO t VALUES(1)", None) + .unwrap(); + + assert!(run_dir.join("metric-a.trace.sql").exists()); + assert_eq!(session.buffered_bytes, 0); + } +} diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs new file mode 100644 index 0000000000..7932bc7759 --- /dev/null +++ b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs @@ -0,0 +1,684 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#![no_main] + +use std::collections::{BTreeMap, HashMap}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use arbitrary::{Arbitrary, Unstructured}; +use common_telemetry::{info, warn}; +use common_time::Timestamp; +use common_time::util::current_time_millis; +use libfuzzer_sys::fuzz_target; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaChaRng; +use snafu::{ResultExt, ensure}; +use sqlx::{MySql, Pool}; +use tests_fuzz::context::{TableContext, TableContextRef}; +use tests_fuzz::error::{self, Result}; +use tests_fuzz::fake::{ + ConstGenerator, MappedGenerator, WordGenerator, merge_two_word_map_fn, random_capitalize_map, + uppercase_and_keyword_backtick_map, +}; +use tests_fuzz::generator::Generator; +use tests_fuzz::generator::create_expr::{ + CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder, +}; +use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder; +use tests_fuzz::generator::repartition_expr::{ + MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder, +}; +use tests_fuzz::ir::{ + CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value, + generate_unique_timestamp_for_mysql_with_clock, +}; +use tests_fuzz::translator::DslTranslator; +use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator; +use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator; +use tests_fuzz::translator::mysql::insert_expr::InsertIntoExprTranslator; +use tests_fuzz::translator::mysql::repartition_expr::RepartitionExprTranslator; +use tests_fuzz::utils::csv_dump_writer::{CsvDumpMetadata, CsvDumpSession}; +use tests_fuzz::utils::retry::retry_with_backoff; +use tests_fuzz::utils::sql_dump_writer::SqlDumpSession; +use tests_fuzz::utils::{ + Connections, get_fuzz_override, get_gt_fuzz_input_max_alter_actions, + get_gt_fuzz_input_max_tables, init_greptime_connections_via_env, +}; +use tests_fuzz::validator::row::count_values; +use tokio::sync::{mpsc, oneshot}; 
+ +const BARRIER_ACK_TIMEOUT_SECS: u64 = 10; +const VALIDATE_QUERY_MAX_ATTEMPTS: usize = 6; +const VALIDATE_QUERY_INIT_BACKOFF: Duration = Duration::from_millis(50); +const VALIDATE_QUERY_MAX_BACKOFF: Duration = Duration::from_millis(800); + +#[derive(Clone)] +struct FuzzContext { + greptime: Pool, +} + +impl FuzzContext { + async fn close(self) { + self.greptime.close().await; + } +} + +#[derive(Clone, Debug)] +struct FuzzInput { + seed: u64, + actions: usize, + partitions: usize, + tables: usize, +} + +fn generate_create_physical_table_expr( + partitions: usize, + rng: &mut R, +) -> Result { + CreatePhysicalTableExprGeneratorBuilder::default() + .name_generator(Box::new(ConstGenerator::new(Ident::new( + "fuzz_repartition_metric_physical", + )))) + .if_not_exists(rng.random_bool(0.5)) + .partition(partitions) + .build() + .unwrap() + .generate(rng) +} + +fn generate_create_logical_table_expr( + physical_table_ctx: TableContextRef, + include_partition_column: bool, + rng: &mut R, +) -> Result { + CreateLogicalTableExprGeneratorBuilder::default() + .name_generator(Box::new(MappedGenerator::new( + WordGenerator, + merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map), + ))) + .physical_table_ctx(physical_table_ctx) + .labels(rng.random_range(1..=5)) + .if_not_exists(rng.random_bool(0.5)) + .include_partition_column(include_partition_column) + .build() + .unwrap() + .generate(rng) +} + +fn generate_insert_expr( + rows: usize, + rng: &mut R, + table_ctx: TableContextRef, + clock: Arc>, +) -> Result { + let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock); + InsertExprGeneratorBuilder::default() + .omit_column_list(false) + .table_ctx(table_ctx) + .rows(rows) + .value_generator(Box::new(generate_random_value)) + .ts_value_generator(ts_value_generator) + .build() + .unwrap() + .generate(rng) +} + +async fn create_metric_tables( + ctx: &FuzzContext, + rng: &mut R, + partitions: usize, + table_count: usize, +) -> 
Result<( + TableContextRef, + BTreeMap, + HashMap, + String, +)> { + let create_physical_expr = generate_create_physical_table_expr(partitions, rng)?; + let translator = CreateTableExprTranslator; + let create_physical_sql = translator.translate(&create_physical_expr)?; + let result = sqlx::query(&create_physical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &create_physical_sql, + })?; + info!("Create physical table: {create_physical_sql}, result: {result:?}"); + let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr)); + ensure!( + physical_table_ctx.partition.is_some(), + error::AssertSnafu { + reason: "Physical metric table must have partition".to_string() + } + ); + + let mut logical_tables = BTreeMap::new(); + let mut create_logical_sqls = HashMap::new(); + let max_attempts = table_count * 3; + for _ in 0..max_attempts { + if logical_tables.len() >= table_count { + break; + } + + let include_partition_column = rng.random_bool(0.5); + let create_logical_expr = generate_create_logical_table_expr( + physical_table_ctx.clone(), + include_partition_column, + rng, + )?; + if logical_tables.contains_key(&create_logical_expr.table_name) { + continue; + } + + let create_logical_sql = translator.translate(&create_logical_expr)?; + let result = sqlx::query(&create_logical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &create_logical_sql, + })?; + info!("Create logical table: {create_logical_sql}, result: {result:?}"); + let logical_ctx = Arc::new(TableContext::from(&create_logical_expr)); + create_logical_sqls.insert(logical_ctx.name.to_string(), create_logical_sql); + logical_tables.insert(logical_ctx.name.clone(), logical_ctx); + } + + ensure!( + !logical_tables.is_empty(), + error::AssertSnafu { + reason: "No logical table created".to_string() + } + ); + + Ok(( + physical_table_ctx, + logical_tables, + create_logical_sqls, + create_physical_sql, + )) +} + +async fn 
execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> { + let mut delay = Duration::from_millis(100); + let mut attempt = 0; + let max_attempts = 10; + loop { + match sqlx::query(sql) + .persistent(false) + .execute(&ctx.greptime) + .await + { + Ok(_) => return Ok(()), + Err(err) => { + tokio::time::sleep(delay).await; + delay = std::cmp::min(delay * 2, Duration::from_secs(1)); + attempt += 1; + warn!("Execute insert with retry: {sql}, attempt: {attempt}, error: {err:?}"); + if attempt >= max_attempts { + return Err(err).context(error::ExecuteQuerySnafu { sql }); + } + } + } + } +} + +struct SharedState { + clock: Arc>, + inserted_rows: HashMap, + csv_dump_session: Option, + sql_dump_session: Option, + running: bool, +} + +enum WriterControl { + Barrier { + epoch: usize, + ack: oneshot::Sender<()>, + }, + Resume { + epoch: usize, + }, + Stop, +} + +fn handle_writer_control(control: WriterControl, paused: &mut bool) -> bool { + match control { + WriterControl::Barrier { epoch, ack } => { + info!("Writer received barrier control, epoch: {epoch}"); + *paused = true; + let _ = ack.send(()); + false + } + WriterControl::Resume { epoch } => { + info!("Writer received resume control, epoch: {epoch}"); + *paused = false; + false + } + WriterControl::Stop => { + info!("Writer received stop control"); + true + } + } +} + +async fn write_loop( + mut rng: R, + ctx: FuzzContext, + logical_tables: BTreeMap, + shared_state: Arc>, + mut control_rx: mpsc::UnboundedReceiver, +) -> Result<()> { + info!("Start write loop"); + let mut paused = false; + loop { + while let Ok(control) = control_rx.try_recv() { + if handle_writer_control(control, &mut paused) { + return Ok(()); + } + } + + if paused { + match control_rx.recv().await { + Some(control) => { + if handle_writer_control(control, &mut paused) { + return Ok(()); + } + } + None => return Ok(()), + } + continue; + } + + let (running, clock) = { + let state = shared_state.lock().unwrap(); + (state.running, 
state.clock.clone()) + }; + if !running { + break; + } + + for table_ctx in logical_tables.values() { + let rows = rng.random_range(1..=3); + let insert_expr = + generate_insert_expr(rows, &mut rng, table_ctx.clone(), clock.clone())?; + let translator = InsertIntoExprTranslator; + let sql = translator.translate(&insert_expr)?; + let inserted = insert_expr.values_list.len() as u64; + let csv_records = InsertExprToCsvRecordsTranslator.translate(&insert_expr)?; + let table_name = table_ctx.name.to_string(); + let full_headers = table_ctx + .columns + .iter() + .map(|column| column.name.value.clone()) + .collect::>(); + + let started_at_ms = current_time_millis(); + let now = Instant::now(); + execute_insert_with_retry(&ctx, &sql).await?; + let elapsed = now.elapsed(); + info!("Execute insert sql: {sql}, elapsed: {elapsed:?}"); + + let mut state = shared_state.lock().unwrap(); + if let Some(csv_dump_session) = state.csv_dump_session.as_mut() { + csv_dump_session.append(csv_records, full_headers)?; + } + if let Some(sql_dump_session) = state.sql_dump_session.as_mut() { + let comment = format!( + "kind=insert table={} started_at_ms={} elapsed_ms={}", + table_name, + started_at_ms, + elapsed.as_millis() + ); + sql_dump_session.append_sql(&table_name, &sql, Some(&comment))?; + } + *state.inserted_rows.entry(table_name).or_insert(0) += inserted; + } + + tokio::time::sleep(Duration::from_millis(100)).await; + } + info!("Write loop ended"); + + Ok(()) +} + +async fn validate_rows( + ctx: &FuzzContext, + logical_tables: &BTreeMap, + inserted_rows: &HashMap, +) -> Result<()> { + for table_ctx in logical_tables.values() { + let expected = *inserted_rows.get(&table_ctx.name.to_string()).unwrap_or(&0) as usize; + let count_sql = format!("SELECT COUNT(1) AS count FROM {}", table_ctx.name); + let count = retry_with_backoff( + || count_values(&ctx.greptime, &count_sql), + VALIDATE_QUERY_MAX_ATTEMPTS, + VALIDATE_QUERY_INIT_BACKOFF, + VALIDATE_QUERY_MAX_BACKOFF, + ) + .await?; + let 
distinct_count_sql = format!( + "SELECT COUNT(DISTINCT {}) AS count FROM {}", + table_ctx.timestamp_column().unwrap().name, + table_ctx.name + ); + let distinct_count = retry_with_backoff( + || count_values(&ctx.greptime, &distinct_count_sql), + VALIDATE_QUERY_MAX_ATTEMPTS, + VALIDATE_QUERY_INIT_BACKOFF, + VALIDATE_QUERY_MAX_BACKOFF, + ) + .await?; + info!( + "Validate rows for table: {}, expected: {}, count: {}, distinct_count: {}", + table_ctx.name, expected, count.count as usize, distinct_count.count as usize + ); + assert_eq!(count.count as usize, expected); + + assert_eq!(distinct_count.count as usize, expected); + } + Ok(()) +} + +fn flush_dump_sessions_and_snapshot( + shared_state: &Arc>, +) -> Result> { + let mut state = shared_state.lock().unwrap(); + if let Some(csv_dump_session) = state.csv_dump_session.as_mut() { + csv_dump_session.flush_all()?; + } + if let Some(sql_dump_session) = state.sql_dump_session.as_mut() { + sql_dump_session.flush_all()?; + } + Ok(state.inserted_rows.clone()) +} + +async fn cleanup_tables( + ctx: &FuzzContext, + physical_table_ctx: &TableContextRef, + logical_tables: &BTreeMap, +) -> Result<()> { + for table_ctx in logical_tables.values() { + let drop_logical_sql = format!("DROP TABLE {}", table_ctx.name); + let result = sqlx::query(&drop_logical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &drop_logical_sql, + })?; + info!("Drop logical table: {drop_logical_sql}, result: {result:?}"); + } + + let drop_physical_sql = format!("DROP TABLE {}", physical_table_ctx.name); + let result = sqlx::query(&drop_physical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &drop_physical_sql, + })?; + info!("Drop physical table: {drop_physical_sql}, result: {result:?}"); + Ok(()) +} + +fn repartition_operation( + table_ctx: &TableContextRef, + rng: &mut R, +) -> Result { + let split = rng.random_bool(0.5); + if table_ctx.partition.as_ref().unwrap().exprs.len() <= 2 || 
split { + let expr = SplitPartitionExprGeneratorBuilder::default() + .table_ctx(table_ctx.clone()) + .build() + .unwrap() + .generate(rng)?; + Ok(RepartitionExpr::Split(expr)) + } else { + let expr = MergePartitionExprGeneratorBuilder::default() + .table_ctx(table_ctx.clone()) + .build() + .unwrap() + .generate(rng)?; + Ok(RepartitionExpr::Merge(expr)) + } +} + +impl Arbitrary<'_> for FuzzInput { + fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result { + let seed = get_fuzz_override::("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?); + let mut rng = ChaChaRng::seed_from_u64(seed); + let partitions = + get_fuzz_override::("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8)); + let max_tables = get_gt_fuzz_input_max_tables(); + let tables = get_fuzz_override::("TABLES") + .unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables))); + let max_actions = get_gt_fuzz_input_max_alter_actions(); + let actions = get_fuzz_override::("ACTIONS") + .unwrap_or_else(|| rng.random_range(1..max_actions)); + + Ok(FuzzInput { + seed, + actions, + partitions, + tables, + }) + } +} + +async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) -> Result<()> { + info!("input: {input:?}"); + let mut rng = ChaChaRng::seed_from_u64(input.seed); + let clock = Arc::new(Mutex::new(Timestamp::current_millis())); + + let (mut physical_table_ctx, logical_tables, create_logical_sqls, create_physical_sql) = + create_metric_tables(&ctx, &mut rng, input.partitions, input.tables).await?; + + let mut inserted_rows = HashMap::with_capacity(logical_tables.len()); + for table_ctx in logical_tables.values() { + inserted_rows.insert(table_ctx.name.to_string(), 0); + } + let csv_dump_session = CsvDumpSession::new(CsvDumpMetadata::new( + "fuzz_repartition_metric_table", + input.seed, + input.actions, + input.partitions, + input.tables, + ))?; + let sql_dump_session = SqlDumpSession::new(csv_dump_session.run_dir.clone())?; + let logical_table_names = 
logical_tables + .values() + .map(|table_ctx| table_ctx.name.to_string()) + .collect::>(); + + let mut sql_dump_session = sql_dump_session; + sql_dump_session.append_sql( + &physical_table_ctx.name.to_string(), + &create_physical_sql, + Some("kind=create_physical_table"), + )?; + for table_name in &logical_table_names { + if let Some(create_sql) = create_logical_sqls.get(table_name) { + sql_dump_session.append_sql( + table_name, + create_sql, + Some("kind=create_logical_table"), + )?; + } + } + + let shared_state = Arc::new(Mutex::new(SharedState { + clock, + inserted_rows, + csv_dump_session: Some(csv_dump_session), + sql_dump_session: Some(sql_dump_session), + running: true, + })); + let writer_rng = ChaChaRng::seed_from_u64(input.seed ^ 0xA5A5_A5A5_A5A5_A5A5); + let (control_tx, control_rx) = mpsc::unbounded_channel::(); + let writer_task = tokio::spawn(write_loop( + writer_rng, + ctx.clone(), + logical_tables.clone(), + shared_state.clone(), + control_rx, + )); + tokio::time::sleep(Duration::from_millis(100)).await; + + for i in 0..input.actions { + let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len(); + info!( + "partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}", + i + 1, + input.actions, + physical_table_ctx.name, + logical_tables.len() + ); + + let repartition_expr = repartition_operation(&physical_table_ctx, &mut rng)?; + let translator = RepartitionExprTranslator; + let sql = translator.translate(&repartition_expr)?; + info!("Repartition sql: {sql}"); + let started_at_ms = current_time_millis(); + let now = Instant::now(); + let result = sqlx::query(&sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { sql: &sql })?; + let elapsed = now.elapsed(); + info!("Repartition result: {result:?}, elapsed: {elapsed:?}"); + + physical_table_ctx = Arc::new( + Arc::unwrap_or_clone(physical_table_ctx) + .repartition(repartition_expr) + .unwrap(), + ); + + let partition_entries = 
tests_fuzz::validator::partition::fetch_partitions_info_schema( + &ctx.greptime, + "public".into(), + &physical_table_ctx.name, + ) + .await?; + tests_fuzz::validator::partition::assert_partitions( + physical_table_ctx.partition.as_ref().unwrap(), + &partition_entries, + )?; + + { + let mut state = shared_state.lock().unwrap(); + if let Some(sql_dump_session) = state.sql_dump_session.as_mut() { + let repartition_comment = format!( + "kind=repartition table={} action_idx={} started_at_ms={} elapsed_ms={}", + physical_table_ctx.name, + i + 1, + started_at_ms, + elapsed.as_millis() + ); + sql_dump_session.append_sql( + &physical_table_ctx.name.to_string(), + &sql, + Some(&repartition_comment), + )?; + let event = format!( + "repartition action_idx={} started_at_ms={} elapsed_ms={} sql={}", + i + 1, + started_at_ms, + elapsed.as_millis(), + sql + ); + sql_dump_session.broadcast_event(logical_table_names.iter(), &event, &sql)?; + } + } + + let (ack_tx, ack_rx) = oneshot::channel(); + control_tx + .send(WriterControl::Barrier { + epoch: i + 1, + ack: ack_tx, + }) + .expect("barrier control send must succeed"); + tokio::time::timeout(Duration::from_secs(BARRIER_ACK_TIMEOUT_SECS), ack_rx) + .await + .expect("barrier ack timeout") + .expect("barrier ack dropped"); + + let inserted_rows_snapshot = flush_dump_sessions_and_snapshot(&shared_state)?; + info!("validate rows, epoch: {}", i + 1); + validate_rows(&ctx, &logical_tables, &inserted_rows_snapshot).await?; + + control_tx + .send(WriterControl::Resume { epoch: i + 1 }) + .expect("resume control send must succeed"); + } + + let _ = control_tx.send(WriterControl::Stop); + shared_state.lock().unwrap().running = false; + writer_task.await.unwrap().unwrap(); + let inserted_rows = flush_dump_sessions_and_snapshot(&shared_state)?; + let (mut csv_dump_session, mut sql_dump_session) = { + let mut state = shared_state.lock().unwrap(); + (state.csv_dump_session.take(), state.sql_dump_session.take()) + }; + + let run_result = async { 
+ validate_rows(&ctx, &logical_tables, &inserted_rows).await?; + cleanup_tables(&ctx, &physical_table_ctx, &logical_tables).await?; + Ok(()) + } + .await; + + if let Some(csv_dump_session) = csv_dump_session.take() { + match &run_result { + Ok(_) => { + if let Err(err) = csv_dump_session.cleanup_on_success() { + warn!( + "Cleanup csv dump directory failed, path: {}, error: {:?}", + csv_dump_session.run_dir.display(), + err + ); + } + } + Err(_) => { + warn!( + "Keep csv dump directory for failure analysis, path: {}", + csv_dump_session.run_dir.display() + ); + } + } + } + if let Some(sql_dump_session) = sql_dump_session.take() + && run_result.is_err() + { + warn!( + "Keep sql dump directory for failure analysis, path: {}", + sql_dump_session.run_dir.display() + ); + } + + ctx.close().await; + run_result +} + +fuzz_target!(|input: FuzzInput| { + common_telemetry::init_default_ut_logging(); + common_runtime::block_on_global(async { + let Connections { mysql } = init_greptime_connections_via_env().await; + let ctx = FuzzContext { + greptime: mysql.expect("mysql connection init must be succeed"), + }; + execute_repartition_metric_table(ctx, input) + .await + .unwrap_or_else(|err| panic!("fuzz test must be succeed: {err:?}")); + }) +}); From 74ff5c37eaf45484d10f702b25e2aded92aa6eba Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:25:21 +0800 Subject: [PATCH 005/195] refactor: customize standalone instance build (#7807) * refactor: customize standalone instance build Signed-off-by: luofucong * resolve PR comments Signed-off-by: luofucong --------- Signed-off-by: luofucong --- src/cmd/src/standalone.rs | 186 ++++++++++++++++++++--- tests/conf/datanode-test.toml.template | 2 +- tests/conf/frontend-test.toml.template | 4 +- tests/conf/standalone-test.toml.template | 6 +- tests/runner/src/server_mode.rs | 63 +++----- 5 files changed, 187 insertions(+), 74 deletions(-) diff --git a/src/cmd/src/standalone.rs 
b/src/cmd/src/standalone.rs index 92638d3c4a..215bea0ec5 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -32,14 +32,15 @@ use common_meta::cache::LayeredCacheRegistryBuilder; use common_meta::ddl::flow_meta::FlowMetadataAllocator; use common_meta::ddl::table_meta::TableMetadataAllocator; use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl}; -use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef}; +use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef, DdlManagerRef}; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; -use common_meta::procedure_executor::LocalProcedureExecutor; +use common_meta::node_manager::{FlownodeRef, NodeManagerRef}; +use common_meta::procedure_executor::{LocalProcedureExecutor, ProcedureExecutorRef}; use common_meta::region_keeper::MemoryRegionKeeper; use common_meta::region_registry::LeaderRegionRegistry; -use common_meta::sequence::SequenceBuilder; +use common_meta::sequence::{Sequence, SequenceBuilder}; use common_meta::wal_provider::{WalProviderRef, build_wal_provider}; use common_procedure::ProcedureManagerRef; use common_query::prelude::set_default_prefix; @@ -49,6 +50,7 @@ use common_time::timezone::set_default_timezone; use common_version::{short_version, verbose_version}; use datanode::config::DatanodeOptions; use datanode::datanode::{Datanode, DatanodeBuilder}; +use datanode::region_server::RegionServer; use flow::{ FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker, GrpcQueryHandlerWithBoxedError, @@ -58,6 +60,7 @@ use frontend::instance::StandaloneDatanodeManager; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ}; +use plugins::PluginOptions; use plugins::frontend::context::{ CatalogManagerConfigureContext, 
StandaloneCatalogManagerConfigureContext, }; @@ -130,6 +133,18 @@ impl Instance { pub fn server_addr(&self, name: &str) -> Option { self.frontend.server_handlers().addr(name) } + + /// Get the mutable Frontend component of this Standalone instance for externally modification + /// by others (might not be in this code base, so don't delete this function). + pub fn mut_frontend(&mut self) -> &mut Frontend { + &mut self.frontend + } + + /// Get the Datanode component of this Standalone instance for externally usage + /// by others (might not be in this code base, so don't delete this function). + pub fn datanode(&self) -> &Datanode { + &self.datanode + } } #[async_trait] @@ -342,9 +357,18 @@ impl StartCommand { info!("Standalone start command: {:#?}", self); info!("Standalone options: {opts:#?}"); + let (mut instance, _) = + Self::build_with(opts.component, opts.plugins, InstanceCreator::default()).await?; + instance._guard.extend(guard); + Ok(instance) + } + + pub async fn build_with( + mut opts: StandaloneOptions, + plugin_opts: Vec, + creator: InstanceCreator, + ) -> Result<(Instance, InstanceCreatorResult)> { let mut plugins = Plugins::new(); - let plugin_opts = opts.plugins; - let mut opts = opts.component; set_default_prefix(opts.default_column_prefix.as_deref()) .map_err(BoxedError::new) .context(error::BuildCliSnafu)?; @@ -462,17 +486,16 @@ impl StartCommand { .await; } - let node_manager = Arc::new(StandaloneDatanodeManager { - region_server: datanode.region_server(), - flow_server: flownode.flow_engine(), - }); + let node_manager = creator + .node_manager_creator + .create( + &kv_backend, + datanode.region_server(), + flownode.flow_engine(), + ) + .await?; - let table_id_allocator = Arc::new( - SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone()) - .initial(MIN_USER_TABLE_ID as u64) - .step(10) - .build(), - ); + let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend); let flow_id_sequence = Arc::new( 
SequenceBuilder::new(FLOW_ID_SEQ, kv_backend.clone()) .initial(MIN_USER_FLOW_ID as u64) @@ -489,7 +512,7 @@ impl StartCommand { .context(error::BuildWalProviderSnafu)?; let wal_provider = Arc::new(wal_provider); let table_metadata_allocator = Arc::new(TableMetadataAllocator::new( - table_id_allocator, + table_id_allocator.clone(), wal_provider.clone(), )); let flow_metadata_allocator = Arc::new(FlowMetadataAllocator::with_noop_peer_allocator( @@ -532,10 +555,10 @@ impl StartCommand { ddl_manager }; - let procedure_executor = Arc::new(LocalProcedureExecutor::new( - Arc::new(ddl_manager), - procedure_manager.clone(), - )); + let procedure_executor = creator + .procedure_executor_creator + .create(Arc::new(ddl_manager), procedure_manager.clone()) + .await?; let fe_instance = FrontendBuilder::new( fe_opts.clone(), @@ -568,7 +591,7 @@ impl StartCommand { kv_backend.clone(), layered_cache_registry.clone(), procedure_executor, - node_manager, + node_manager.clone(), ) .await .context(StartFlownodeSnafu)?; @@ -584,14 +607,20 @@ impl StartCommand { heartbeat_task: None, }; - Ok(Instance { + let instance = Instance { datanode, frontend, flownode, procedure_manager, wal_provider, - _guard: guard, - }) + _guard: vec![], + }; + let result = InstanceCreatorResult { + kv_backend, + node_manager, + table_id_allocator, + }; + Ok((instance, result)) } pub async fn create_table_metadata_manager( @@ -608,6 +637,115 @@ impl StartCommand { } } +#[async_trait] +pub trait NodeManagerCreator { + async fn create( + &self, + kv_backend: &KvBackendRef, + region_server: RegionServer, + flow_server: FlownodeRef, + ) -> Result; +} + +pub struct DefaultNodeManagerCreator; + +#[async_trait] +impl NodeManagerCreator for DefaultNodeManagerCreator { + async fn create( + &self, + _: &KvBackendRef, + region_server: RegionServer, + flow_server: FlownodeRef, + ) -> Result { + Ok(Arc::new(StandaloneDatanodeManager { + region_server, + flow_server, + })) + } +} + +pub trait TableIdAllocatorCreator { + fn 
create(&self, kv_backend: &KvBackendRef) -> Arc; +} + +struct DefaultTableIdAllocatorCreator; + +impl TableIdAllocatorCreator for DefaultTableIdAllocatorCreator { + fn create(&self, kv_backend: &KvBackendRef) -> Arc { + Arc::new( + SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone()) + .initial(MIN_USER_TABLE_ID as u64) + .step(10) + .build(), + ) + } +} + +#[async_trait] +pub trait ProcedureExecutorCreator { + async fn create( + &self, + ddl_manager: DdlManagerRef, + procedure_manager: ProcedureManagerRef, + ) -> Result; +} + +pub struct DefaultProcedureExecutorCreator; + +#[async_trait] +impl ProcedureExecutorCreator for DefaultProcedureExecutorCreator { + async fn create( + &self, + ddl_manager: DdlManagerRef, + procedure_manager: ProcedureManagerRef, + ) -> Result { + Ok(Arc::new(LocalProcedureExecutor::new( + ddl_manager, + procedure_manager, + ))) + } +} + +/// `InstanceCreator` is used for grouping various component creators for building the +/// Standalone instance, suitable for customizing how the instance can be built. +pub struct InstanceCreator { + node_manager_creator: Box, + table_id_allocator_creator: Box, + procedure_executor_creator: Box, +} + +impl InstanceCreator { + pub fn new( + node_manager_creator: Box, + table_id_allocator_creator: Box, + procedure_executor_creator: Box, + ) -> Self { + Self { + node_manager_creator, + table_id_allocator_creator, + procedure_executor_creator, + } + } +} + +impl Default for InstanceCreator { + fn default() -> Self { + Self { + node_manager_creator: Box::new(DefaultNodeManagerCreator), + table_id_allocator_creator: Box::new(DefaultTableIdAllocatorCreator), + procedure_executor_creator: Box::new(DefaultProcedureExecutorCreator), + } + } +} + +/// `InstanceCreatorResult` is expected to be used paired with [InstanceCreator]. +/// It stores the created and other important components for further reusing. 
+pub struct InstanceCreatorResult { + pub kv_backend: KvBackendRef, + pub node_manager: NodeManagerRef, + pub table_id_allocator: Arc, +} + #[cfg(test)] mod tests { use std::default::Default; diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template index 4cb0423c72..3ec8a2f695 100644 --- a/tests/conf/datanode-test.toml.template +++ b/tests/conf/datanode-test.toml.template @@ -28,7 +28,7 @@ type = 'File' data_home = '{data_home}' [meta_client_options] -metasrv_addrs = ['{metasrv_addr}'] +metasrv_addrs = ['{addrs.metasrv_addr}'] timeout_millis = 3000 connect_timeout_millis = 5000 tcp_nodelay = false diff --git a/tests/conf/frontend-test.toml.template b/tests/conf/frontend-test.toml.template index de4ce86adc..25d44ff6e4 100644 --- a/tests/conf/frontend-test.toml.template +++ b/tests/conf/frontend-test.toml.template @@ -1,3 +1,3 @@ [grpc] -bind_addr = "{grpc_addr}" -server_addr = "{grpc_addr}" +bind_addr = "{addrs.grpc_addr}" +server_addr = "{addrs.grpc_addr}" diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template index 509eac7ca6..50c014e991 100644 --- a/tests/conf/standalone-test.toml.template +++ b/tests/conf/standalone-test.toml.template @@ -26,12 +26,12 @@ type = 'File' data_home = '{data_home}' [grpc] -bind_addr = '{grpc_addr}' +bind_addr = '{addrs.grpc_addr}' runtime_size = 8 [mysql] enable = true -addr = "{mysql_addr}" +addr = "{addrs.mysql_addr}" runtime_size = 2 prepared_stmt_cache_size= 10000 @@ -40,7 +40,7 @@ mode = "disable" [postgres] enable = true -addr = "{postgres_addr}" +addr = "{addrs.postgres_addr}" runtime_size = 2 [procedure] diff --git a/tests/runner/src/server_mode.rs b/tests/runner/src/server_mode.rs index 172baf32ff..1f7cb72bf4 100644 --- a/tests/runner/src/server_mode.rs +++ b/tests/runner/src/server_mode.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::Path; use std::sync::{Mutex, OnceLock}; @@ -96,15 +96,7 @@ struct ConfigContext { use_etcd: bool, store_addrs: String, instance_id: usize, - // for following addrs, leave it empty if not needed - // required for datanode - metasrv_addr: String, - // for frontend and standalone - grpc_addr: String, - // for standalone - mysql_addr: String, - // for standalone - postgres_addr: String, + addrs: HashMap, // enable flat format for storage engine enable_flat_format: bool, } @@ -275,40 +267,26 @@ impl ServerMode { let procedure_dir = data_home.join("procedure").display().to_string(); // Get the required addresses based on server mode - let (metasrv_addr, grpc_addr, mysql_addr, postgres_addr) = match self { + let addrs: HashMap = match self { ServerMode::Standalone { rpc_bind_addr, mysql_addr, postgres_addr, - .. - } => ( - String::new(), - rpc_bind_addr.clone(), - mysql_addr.clone(), - postgres_addr.clone(), - ), - ServerMode::Frontend { - rpc_bind_addr, - mysql_addr, - postgres_addr, - .. - } => ( - String::new(), - rpc_bind_addr.clone(), - mysql_addr.clone(), - postgres_addr.clone(), - ), - ServerMode::Datanode { - rpc_bind_addr, - metasrv_addr, - .. - } => ( - metasrv_addr.clone(), - rpc_bind_addr.clone(), - String::new(), - String::new(), - ), - _ => (String::new(), String::new(), String::new(), String::new()), + http_addr, + } => [ + ("http_addr".to_string(), http_addr.clone()), + ("grpc_addr".to_string(), rpc_bind_addr.clone()), + ("mysql_addr".to_string(), mysql_addr.clone()), + ("postgres_addr".to_string(), postgres_addr.clone()), + ] + .into(), + ServerMode::Frontend { rpc_bind_addr, .. } => { + [("grpc_addr".to_string(), rpc_bind_addr.clone())].into() + } + ServerMode::Datanode { metasrv_addr, .. 
} => { + [("metasrv_addr".to_string(), metasrv_addr.clone())].into() + } + _ => HashMap::new(), }; let ctx = ConfigContext { @@ -326,10 +304,7 @@ impl ServerMode { .collect::>() .join(","), instance_id: id, - metasrv_addr, - grpc_addr, - mysql_addr, - postgres_addr, + addrs, enable_flat_format: db_ctx.store_config().enable_flat_format, }; From e215851c8a6487f8ba7e4e385b57f3ceaae11d86 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 13 Mar 2026 17:44:13 +0800 Subject: [PATCH 006/195] refactor: unify flush and compaction to always use FlatSource (#7799) * feat: support write flat as primary key format Signed-off-by: evenyag * feat: migrate flush to always use FlatSource Add FormatType propagation in SstWriteRequest and use it to choose Flat vs PrimaryKey write paths (write_all_flat vs write_all_flat_as_primary_key) in AccessLayer and WriteCache. Make compactor and flush derive the sst_write_format from region options or engine config. Simplify flush logic and remove the old memtable_source helper. Update tests to set default sst_write_format. Signed-off-by: evenyag * refactor: compaction use flat source Signed-off-by: evenyag * refactor: read parquet sequentially as flat batches Signed-off-by: evenyag * refactor: remove new_batch_with_binary in favor of new_record_batch_with_binary Replace PrimaryKeyWriteFormat with FlatWriteFormat in test_read_large_binary test and use new_record_batch_with_binary directly, removing the now-unused new_batch_with_binary function and its BinaryArray import. 
Signed-off-by: evenyag * test: add tests for PrimaryKeyWriteFormat::convert_flat_batch Signed-off-by: evenyag * refactor: remove Either from SstWriteRequest Signed-off-by: evenyag * fix: handle index build mode Signed-off-by: evenyag * fix: consider sparse encoding and last non null in flush Signed-off-by: evenyag * test: add unit tests for field_column_start edge cases Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/cmd/src/datanode/objbench.rs | 13 +- src/mito2/src/access_layer.rs | 22 +- src/mito2/src/cache/write_cache.rs | 50 +++-- src/mito2/src/compaction.rs | 19 +- src/mito2/src/compaction/compactor.rs | 16 +- src/mito2/src/flush.rs | 166 +++++---------- src/mito2/src/memtable/bulk.rs | 9 +- src/mito2/src/read/prune.rs | 5 - src/mito2/src/read/seq_scan.rs | 56 ----- src/mito2/src/sst.rs | 183 +--------------- src/mito2/src/sst/index.rs | 23 +- src/mito2/src/sst/parquet.rs | 256 ++++++++++++----------- src/mito2/src/sst/parquet/flat_format.rs | 105 +++++++++- src/mito2/src/sst/parquet/format.rs | 211 ++++++++++--------- src/mito2/src/sst/parquet/reader.rs | 124 +++-------- src/mito2/src/sst/parquet/writer.rs | 223 +++++++------------- src/mito2/src/test_util/sst_util.rs | 91 +++++--- 17 files changed, 668 insertions(+), 904 deletions(-) diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs index d8f53b9d71..f6d8674d4c 100644 --- a/src/cmd/src/datanode/objbench.rs +++ b/src/cmd/src/datanode/objbench.rs @@ -20,13 +20,14 @@ use clap::Parser; use colored::Colorize; use datanode::config::RegionEngineConfig; use datanode::store; -use either::Either; +use futures::stream; use mito2::access_layer::{ AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType, }; use mito2::cache::{CacheManager, CacheManagerRef}; use mito2::config::{FulltextIndexConfig, MitoConfig, Mode}; -use mito2::read::Source; +use mito2::read::FlatSource; +use mito2::sst::FormatType; use mito2::sst::file::{FileHandle, FileMeta}; use 
mito2::sst::file_purger::{FilePurger, FilePurgerRef}; use mito2::sst::index::intermediate::IntermediateManager; @@ -210,6 +211,7 @@ impl ObjbenchCommand { object_store.clone(), ) .expected_metadata(Some(region_meta.clone())) + .flat_format(true) .build() .await .map_err(|e| { @@ -231,6 +233,10 @@ impl ObjbenchCommand { let reader_build_elapsed = reader_build_start.elapsed(); let total_rows = reader.parquet_metadata().file_metadata().num_rows(); println!("{} Reader built in {:?}", "✓".green(), reader_build_elapsed); + let reader_stream = Box::pin(stream::try_unfold(reader, |mut reader| async move { + let batch = reader.next_record_batch().await?; + Ok(batch.map(|batch| (batch, reader))) + })); // Build write request let fulltext_index_config = FulltextIndexConfig { @@ -241,10 +247,11 @@ impl ObjbenchCommand { let write_req = SstWriteRequest { op_type: OperationType::Flush, metadata: region_meta, - source: Either::Left(Source::Reader(Box::new(reader))), + source: FlatSource::Stream(reader_stream), cache_manager, storage: None, max_sequence: None, + sst_write_format: FormatType::PrimaryKey, index_options: Default::default(), index_config: mito_engine_config.index.clone(), inverted_index_config: MitoConfig::default().inverted_index, diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index 92c8a3bc36..231285215e 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -17,7 +17,6 @@ use std::time::{Duration, Instant}; use async_stream::try_stream; use common_time::Timestamp; -use either::Either; use futures::{Stream, TryStreamExt}; use object_store::services::Fs; use object_store::util::{join_dir, with_instrument_layers}; @@ -37,7 +36,7 @@ use crate::error::{ CleanDirSnafu, DeleteIndexSnafu, DeleteIndexesSnafu, DeleteSstsSnafu, OpenDalSnafu, Result, }; use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED}; -use crate::read::{FlatSource, Source}; +use crate::read::FlatSource; use 
crate::region::options::IndexOptions; use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId}; use crate::sst::index::IndexerBuilderImpl; @@ -47,7 +46,7 @@ use crate::sst::location::{self, region_dir_from_table_dir}; use crate::sst::parquet::reader::ParquetReaderBuilder; use crate::sst::parquet::writer::ParquetWriter; use crate::sst::parquet::{SstInfo, WriteOptions}; -use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY}; +use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FormatType}; pub type AccessLayerRef = Arc; /// SST write results. @@ -391,15 +390,19 @@ impl AccessLayer { ) .await .with_file_cleaner(cleaner); - match request.source { - Either::Left(source) => { + match request.sst_write_format { + FormatType::PrimaryKey => { writer - .write_all(source, request.max_sequence, write_opts) + .write_all_flat_as_primary_key( + request.source, + request.max_sequence, + write_opts, + ) .await? } - Either::Right(flat_source) => { + FormatType::Flat => { writer - .write_all_flat(flat_source, request.max_sequence, write_opts) + .write_all_flat(request.source, request.max_sequence, write_opts) .await? 
} } @@ -520,11 +523,12 @@ pub enum OperationType { pub struct SstWriteRequest { pub op_type: OperationType, pub metadata: RegionMetadataRef, - pub source: Either, + pub source: FlatSource, pub cache_manager: CacheManagerRef, #[allow(dead_code)] pub storage: Option, pub max_sequence: Option, + pub sst_write_format: FormatType, /// Configs for index pub index_options: IndexOptions, diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index a28df3f54c..3d373efe91 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -244,15 +244,19 @@ impl WriteCache { .await .with_file_cleaner(cleaner); - let sst_info = match write_request.source { - either::Left(source) => { + let sst_info = match write_request.sst_write_format { + crate::sst::FormatType::PrimaryKey => { writer - .write_all(source, write_request.max_sequence, write_opts) + .write_all_flat_as_primary_key( + write_request.source, + write_request.max_sequence, + write_opts, + ) .await? } - either::Right(flat_source) => { + crate::sst::FormatType::Flat => { writer - .write_all_flat(flat_source, write_request.max_sequence, write_opts) + .write_all_flat(write_request.source, write_request.max_sequence, write_opts) .await? 
} }; @@ -509,12 +513,13 @@ mod tests { use crate::cache::test_util::{assert_parquet_metadata_equal, new_fs_store}; use crate::cache::{CacheManager, CacheStrategy}; use crate::error::InvalidBatchSnafu; - use crate::read::Source; + use crate::read::FlatSource; use crate::region::options::IndexOptions; use crate::sst::parquet::reader::ParquetReaderBuilder; use crate::test_util::TestEnv; use crate::test_util::sst_util::{ - new_batch_by_range, new_source, sst_file_handle_with_file_id, sst_region_metadata, + new_flat_source_from_record_batches, new_record_batch_by_range, + sst_file_handle_with_file_id, sst_region_metadata, }; #[tokio::test] @@ -532,21 +537,22 @@ mod tests { .create_write_cache(local_store.clone(), ReadableSize::mb(10)) .await; - // Create Source + // Create source. let metadata = Arc::new(sst_region_metadata()); let region_id = metadata.region_id; - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata, - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: Default::default(), index_options: IndexOptions::default(), index_config: Default::default(), @@ -636,19 +642,20 @@ mod tests { // Create source let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 
100, 200), ]); // Write to local cache and upload sst to mock remote store let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata, - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: cache_manager.clone(), index_options: IndexOptions::default(), index_config: Default::default(), @@ -715,9 +722,9 @@ mod tests { let metadata = Arc::new(sst_region_metadata()); // Creates a source that can return an error to abort the writer. - let source = Source::Iter(Box::new( + let source = FlatSource::Iter(Box::new( [ - Ok(new_batch_by_range(&["a", "d"], 0, 60)), + Ok(new_record_batch_by_range(&["a", "d"], 0, 60)), InvalidBatchSnafu { reason: "Abort the writer", } @@ -730,9 +737,10 @@ mod tests { let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata, - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: cache_manager.clone(), index_options: IndexOptions::default(), index_config: Default::default(), diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 6d51d1dd59..ba6957fdae 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -58,10 +58,10 @@ use crate::error::{ TimeRangePredicateOverflowSnafu, TimeoutSnafu, }; use crate::metrics::{COMPACTION_STAGE_ELAPSED, INFLIGHT_COMPACTION_COUNT}; +use crate::read::BoxedRecordBatchStream; use crate::read::projection::ProjectionMapper; use crate::read::scan_region::{PredicateGroup, ScanInput}; use crate::read::seq_scan::SeqScan; -use crate::read::{BoxedBatchReader, BoxedRecordBatchStream}; use crate::region::options::{MergeMode, RegionOptions}; use crate::region::version::VersionControlRef; use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState}; @@ -828,7 +828,7 @@ pub struct SerializedCompactionOutput { output_time_range: Option, } -/// Builders to create [BoxedBatchReader] 
for compaction. +/// Builders to create [BoxedRecordBatchStream] for compaction. struct CompactionSstReaderBuilder<'a> { metadata: RegionMetadataRef, sst_layer: AccessLayerRef, @@ -841,24 +841,17 @@ struct CompactionSstReaderBuilder<'a> { } impl CompactionSstReaderBuilder<'_> { - /// Builds [BoxedBatchReader] that reads all SST files and yields batches in primary key order. - async fn build_sst_reader(self) -> Result { - let scan_input = self.build_scan_input(false)?.with_compaction(true); - - SeqScan::new(scan_input).build_reader_for_compaction().await - } - /// Builds [BoxedRecordBatchStream] that reads all SST files and yields batches in flat format for compaction. async fn build_flat_sst_reader(self) -> Result { - let scan_input = self.build_scan_input(true)?.with_compaction(true); + let scan_input = self.build_scan_input()?.with_compaction(true); SeqScan::new(scan_input) .build_flat_reader_for_compaction() .await } - fn build_scan_input(self, flat_format: bool) -> Result { - let mapper = ProjectionMapper::all(&self.metadata, flat_format)?; + fn build_scan_input(self) -> Result { + let mapper = ProjectionMapper::all(&self.metadata, true)?; let mut scan_input = ScanInput::new(self.sst_layer, mapper) .with_files(self.inputs.to_vec()) .with_append_mode(self.append_mode) @@ -868,7 +861,7 @@ impl CompactionSstReaderBuilder<'_> { // We ignore file not found error during compaction. .with_ignore_file_not_found(true) .with_merge_mode(self.merge_mode) - .with_flat_format(flat_format); + .with_flat_format(true); // This serves as a workaround of https://github.com/GreptimeTeam/greptimedb/issues/3944 // by converting time ranges into predicate. 
diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index 1876972b0d..b03e6415e8 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -43,7 +43,7 @@ use crate::error::{ use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::metrics; -use crate::read::{FlatSource, Source}; +use crate::read::FlatSource; use crate::region::options::RegionOptions; use crate::region::version::VersionRef; use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState}; @@ -356,13 +356,8 @@ impl DefaultCompactor { time_range: output.output_time_range, merge_mode, }; - let source = if flat_format { - let reader = builder.build_flat_sst_reader().await?; - Either::Right(FlatSource::Stream(reader)) - } else { - let reader = builder.build_sst_reader().await?; - Either::Left(Source::Reader(reader)) - }; + let reader = builder.build_flat_sst_reader().await?; + let source = FlatSource::Stream(reader); let mut metrics = Metrics::new(WriteType::Compaction); let region_metadata = compaction_region.region_metadata.clone(); let sst_infos = compaction_region @@ -375,6 +370,11 @@ impl DefaultCompactor { cache_manager: compaction_region.cache_manager.clone(), storage, max_sequence: max_sequence.map(NonZero::get), + sst_write_format: if flat_format { + FormatType::Flat + } else { + FormatType::PrimaryKey + }, index_options, index_config, inverted_index_config, diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 0c16544b6e..fedac95d27 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -22,7 +22,6 @@ use std::time::Instant; use common_telemetry::{debug, error, info}; use datatypes::arrow::datatypes::SchemaRef; -use either::Either; use partition::expr::PartitionExpr; use smallvec::{SmallVec, smallvec}; use snafu::ResultExt; @@ -41,18 +40,14 @@ use crate::error::{ }; use 
crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::memtable::bulk::ENCODE_ROW_THRESHOLD; -use crate::memtable::{ - BoxedRecordBatchIterator, EncodedRange, IterBuilder, MemtableRanges, RangesOptions, -}; +use crate::memtable::{BoxedRecordBatchIterator, EncodedRange, MemtableRanges, RangesOptions}; use crate::metrics::{ FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_FAILURE_TOTAL, FLUSH_FILE_TOTAL, FLUSH_REQUESTS_TOTAL, INFLIGHT_FLUSH_COUNT, }; -use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; +use crate::read::FlatSource; use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeIterator; -use crate::read::merge::MergeReaderBuilder; -use crate::read::{FlatSource, Source}; use crate::region::options::{IndexOptions, MergeMode, RegionOptions}; use crate::region::version::{VersionControlData, VersionControlRef, VersionRef}; use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState, parse_partition_expr}; @@ -62,8 +57,10 @@ use crate::request::{ }; use crate::schedule::scheduler::{Job, SchedulerRef}; use crate::sst::file::FileMeta; -use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions}; -use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; +use crate::sst::parquet::{ + DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions, flat_format, +}; +use crate::sst::{FlatSchemaOptions, FormatType, to_flat_sst_arrow_schema}; use crate::worker::WorkerListener; /// Global write buffer (memtable) manager. @@ -480,78 +477,29 @@ impl RegionFlushTask { // the counter may have more series than the actual series count. 
series_count += memtable_series_count; - if mem_ranges.is_record_batch() { - let flush_start = Instant::now(); - let FlushFlatMemResult { - num_encoded, - num_sources, - results, - } = self - .flush_flat_mem_ranges(version, &write_opts, mem_ranges) - .await?; - encoded_part_count += num_encoded; - for (source_idx, result) in results.into_iter().enumerate() { - let (max_sequence, ssts_written, metrics) = result?; - if ssts_written.is_empty() { - // No data written. - continue; - } - - common_telemetry::debug!( - "Region {} flush one memtable {} {}/{}, metrics: {:?}", - self.region_id, - memtable_id, - source_idx, - num_sources, - metrics - ); - - flush_metrics = flush_metrics.merge(metrics); - - file_metas.extend(ssts_written.into_iter().map(|sst_info| { - flushed_bytes += sst_info.file_size; - Self::new_file_meta( - self.region_id, - max_sequence, - sst_info, - partition_expr.clone(), - ) - })); - } - - common_telemetry::debug!( - "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}", - self.region_id, - num_sources, - memtable_id, - num_mem_ranges, - num_encoded, - num_mem_rows, - flush_start.elapsed(), - compact_cost, - ); - } else { - let max_sequence = mem_ranges.max_sequence(); - let source = memtable_source(mem_ranges, &version.options).await?; - - // Flush to level 0. 
- let source = Either::Left(source); - let write_request = self.new_write_request(version, max_sequence, source); - - let mut metrics = Metrics::new(WriteType::Flush); - let ssts_written = self - .access_layer - .write_sst(write_request, &write_opts, &mut metrics) - .await?; - FLUSH_FILE_TOTAL.inc_by(ssts_written.len() as u64); + let flush_start = Instant::now(); + let FlushFlatMemResult { + num_encoded, + num_sources, + results, + } = self + .flush_flat_mem_ranges(version, &write_opts, mem_ranges) + .await?; + encoded_part_count += num_encoded; + for (source_idx, result) in results.into_iter().enumerate() { + let (max_sequence, ssts_written, metrics) = result?; if ssts_written.is_empty() { // No data written. continue; } - debug!( - "Region {} flush one memtable, num_mem_ranges: {}, num_rows: {}, metrics: {:?}", - self.region_id, num_mem_ranges, num_mem_rows, metrics + common_telemetry::debug!( + "Region {} flush one memtable {} {}/{}, metrics: {:?}", + self.region_id, + memtable_id, + source_idx, + num_sources, + metrics ); flush_metrics = flush_metrics.merge(metrics); @@ -565,7 +513,19 @@ impl RegionFlushTask { partition_expr.clone(), ) })); - }; + } + + common_telemetry::debug!( + "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}", + self.region_id, + num_sources, + memtable_id, + num_mem_ranges, + num_encoded, + num_mem_rows, + flush_start.elapsed(), + compact_cost, + ); } Ok(DoFlushMemtablesResult { @@ -587,16 +547,17 @@ impl RegionFlushTask { &version.metadata, &FlatSchemaOptions::from_encoding(version.metadata.primary_key_encoding), ); + let field_column_start = + flat_format::field_column_start(&version.metadata, batch_schema.fields().len()); let flat_sources = memtable_flat_sources( batch_schema, mem_ranges, &version.options, - version.metadata.primary_key.len(), + field_column_start, )?; let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len()); 
let num_encoded = flat_sources.encoded.len(); for (source, max_sequence) in flat_sources.sources { - let source = Either::Right(source); let write_request = self.new_write_request(version, max_sequence, source); let access_layer = self.access_layer.clone(); let write_opts = write_opts.clone(); @@ -667,8 +628,13 @@ impl RegionFlushTask { &self, version: &VersionRef, max_sequence: u64, - source: Either, + source: FlatSource, ) -> SstWriteRequest { + let flat_format = version + .options + .sst_format + .map(|f| f == FormatType::Flat) + .unwrap_or(self.engine_config.default_experimental_flat_format); SstWriteRequest { op_type: OperationType::Flush, metadata: version.metadata.clone(), @@ -676,6 +642,11 @@ impl RegionFlushTask { cache_manager: self.cache_manager.clone(), storage: version.options.storage.clone(), max_sequence: Some(max_sequence), + sst_write_format: if flat_format { + FormatType::Flat + } else { + FormatType::PrimaryKey + }, index_options: self.index_options.clone(), index_config: self.engine_config.index.clone(), inverted_index_config: self.engine_config.inverted_index.clone(), @@ -722,41 +693,6 @@ struct DoFlushMemtablesResult { flush_metrics: Metrics, } -/// Returns a [Source] for the given memtable. -async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) -> Result { - let source = if mem_ranges.ranges.len() == 1 { - let only_range = mem_ranges.ranges.into_values().next().unwrap(); - let iter = only_range.build_iter()?; - Source::Iter(iter) - } else { - // todo(hl): a workaround since sync version of MergeReader is wip. 
- let sources = mem_ranges - .ranges - .into_values() - .map(|r| r.build_iter().map(Source::Iter)) - .collect::>>()?; - let merge_reader = MergeReaderBuilder::from_sources(sources).build().await?; - let maybe_dedup = if options.append_mode { - // no dedup in append mode - Box::new(merge_reader) as _ - } else { - // dedup according to merge mode - match options.merge_mode.unwrap_or(MergeMode::LastRow) { - MergeMode::LastRow => { - Box::new(DedupReader::new(merge_reader, LastRow::new(false), None)) as _ - } - MergeMode::LastNonNull => Box::new(DedupReader::new( - merge_reader, - LastNonNull::new(false), - None, - )) as _, - } - }; - Source::Reader(maybe_dedup) - }; - Ok(source) -} - struct FlatSources { sources: SmallVec<[(FlatSource, SequenceNumber); 4]>, encoded: SmallVec<[(EncodedRange, SequenceNumber); 4]>, diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index cf2ced06fe..6056a42013 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -57,7 +57,7 @@ use crate::memtable::{ use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeIterator; use crate::region::options::MergeMode; -use crate::sst::parquet::format::FIXED_POS_COLUMN_NUM; +use crate::sst::parquet::flat_format::field_column_start; use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE}; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; @@ -1186,13 +1186,8 @@ impl MemtableCompactor { Box::new(dedup_iter) } MergeMode::LastNonNull => { - // Calculates field column start: total columns - fixed columns - field columns - // Field column count = total metadata columns - time index column - primary key columns - let field_column_count = - metadata.column_metadatas.len() - 1 - metadata.primary_key.len(); - let total_columns = arrow_schema.fields().len(); let field_column_start = - total_columns - FIXED_POS_COLUMN_NUM - field_column_count; + 
field_column_start(metadata, arrow_schema.fields().len()); let dedup_iter = FlatDedupIterator::new( merged_iter, diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index 29ded3d49a..2f9fa002d4 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -80,11 +80,6 @@ impl PruneReader { } } - pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) { - self.source = source; - self.skip_fields = skip_fields; - } - /// Merge metrics with the inner reader and return the merged metrics. pub(crate) fn metrics(&self) -> ReaderMetrics { let mut metrics = self.metrics.clone(); diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index c13b40d111..d2be17cc83 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -128,28 +128,6 @@ impl SeqScan { Ok(Box::pin(futures::stream::iter(streams).flatten())) } - /// Builds a [BoxedBatchReader] from sequential scan for compaction. - /// - /// # Panics - /// Panics if the compaction flag is not set. - pub async fn build_reader_for_compaction(&self) -> Result { - assert!(self.stream_ctx.input.compaction); - - let metrics_set = ExecutionPlanMetricsSet::new(); - let part_metrics = self.new_partition_metrics(false, &metrics_set, 0); - debug_assert_eq!(1, self.properties.partitions.len()); - let partition_ranges = &self.properties.partitions[0]; - - let reader = Self::merge_all_ranges_for_compaction( - &self.stream_ctx, - partition_ranges, - &part_metrics, - self.pruner.clone(), - ) - .await?; - Ok(Box::new(reader)) - } - /// Builds a [BoxedRecordBatchStream] from sequential scan for flat format compaction. /// /// # Panics @@ -172,40 +150,6 @@ impl SeqScan { Ok(reader) } - /// Builds a merge reader that reads all ranges. - /// Callers MUST not split ranges before calling this method. 
- async fn merge_all_ranges_for_compaction( - stream_ctx: &Arc, - partition_ranges: &[PartitionRange], - part_metrics: &PartitionMetrics, - pruner: Arc, - ) -> Result { - pruner.add_partition_ranges(partition_ranges); - let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges)); - - let mut sources = Vec::new(); - for part_range in partition_ranges { - build_sources( - stream_ctx, - part_range, - true, - part_metrics, - partition_pruner.clone(), - &mut sources, - None, - ) - .await?; - } - - common_telemetry::debug!( - "Build reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}", - stream_ctx.input.mapper.metadata().region_id, - partition_ranges.len(), - sources.len() - ); - Self::build_reader_from_sources(stream_ctx, sources, None, None).await - } - /// Builds a merge reader that reads all flat ranges. /// Callers MUST not split ranges before calling this method. async fn merge_all_flat_ranges_for_compaction( diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index 78e4c563b1..94bc1feea8 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -31,7 +31,6 @@ use store_api::storage::consts::{ OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME, }; -use crate::read::Batch; use crate::sst::parquet::flat_format::time_index_column_index; pub mod file; @@ -260,33 +259,6 @@ pub(crate) struct SeriesEstimator { } impl SeriesEstimator { - /// Updates the estimator with a new Batch. - /// - /// Since each Batch contains only one series, this increments the series count - /// and updates the last timestamp. 
- pub(crate) fn update(&mut self, batch: &Batch) { - let Some(last_ts) = batch.last_timestamp() else { - return; - }; - - // Checks if there's a boundary between the last batch and this batch - if let Some(prev_last_ts) = self.last_timestamp { - // If the first timestamp of this batch is less than the last timestamp - // we've seen, it indicates a new series - if let Some(first_ts) = batch.first_timestamp() - && first_ts.value() <= prev_last_ts - { - self.series_count += 1; - } - } else { - // First batch, counts as first series - self.series_count = 1; - } - - // Updates the last timestamp - self.last_timestamp = Some(last_ts.value()); - } - /// Updates the estimator with a new record batch in flat format. /// /// This method examines the time index column to detect series boundaries. @@ -340,43 +312,14 @@ impl SeriesEstimator { mod tests { use std::sync::Arc; - use api::v1::OpType; use datatypes::arrow::array::{ - BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder, - UInt32Array, UInt64Array, + BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array, + UInt64Array, }; use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; use datatypes::arrow::record_batch::RecordBatch; use super::*; - use crate::read::{Batch, BatchBuilder}; - - fn new_batch( - primary_key: &[u8], - timestamps: &[i64], - sequences: &[u64], - op_types: &[OpType], - ) -> Batch { - let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())); - let sequences = Arc::new(UInt64Array::from(sequences.to_vec())); - let mut op_type_builder = UInt8Builder::with_capacity(op_types.len()); - for op_type in op_types { - op_type_builder.append_value(*op_type as u8); - } - let op_types = Arc::new(UInt8Array::from( - op_types.iter().map(|op| *op as u8).collect::>(), - )); - - let mut builder = BatchBuilder::new(primary_key.to_vec()); - builder - .timestamps_array(timestamps) - .unwrap() - 
.sequences_array(sequences) - .unwrap() - .op_types_array(op_types) - .unwrap(); - builder.build().unwrap() - } fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch { // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type] @@ -411,128 +354,6 @@ mod tests { RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap() } - #[test] - fn test_series_estimator_empty_batch() { - let mut estimator = SeriesEstimator::default(); - let batch = new_batch(b"test", &[], &[], &[]); - estimator.update(&batch); - assert_eq!(0, estimator.finish()); - } - - #[test] - fn test_series_estimator_single_batch() { - let mut estimator = SeriesEstimator::default(); - let batch = new_batch( - b"test", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch); - assert_eq!(1, estimator.finish()); - } - - #[test] - fn test_series_estimator_multiple_batches_same_series() { - let mut estimator = SeriesEstimator::default(); - - // First batch with timestamps 1, 2, 3 - let batch1 = new_batch( - b"test", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - // Second batch with timestamps 4, 5, 6 (continuation) - let batch2 = new_batch( - b"test", - &[4, 5, 6], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(1, estimator.finish()); - } - - #[test] - fn test_series_estimator_new_series_detected() { - let mut estimator = SeriesEstimator::default(); - - // First batch with timestamps 1, 2, 3 - let batch1 = new_batch( - b"pk0", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - // Second batch with timestamps 2, 3, 4 (timestamp goes back, new series) - let batch2 = new_batch( - b"pk1", - &[2, 3, 4], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(2, estimator.finish()); - 
} - - #[test] - fn test_series_estimator_equal_timestamp_boundary() { - let mut estimator = SeriesEstimator::default(); - - // First batch ending at timestamp 5 - let batch1 = new_batch( - b"test", - &[1, 2, 5], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - // Second batch starting at timestamp 5 (equal, indicates new series) - let batch2 = new_batch( - b"test", - &[5, 6, 7], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(2, estimator.finish()); - } - - #[test] - fn test_series_estimator_finish_resets_state() { - let mut estimator = SeriesEstimator::default(); - - let batch1 = new_batch( - b"test", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - assert_eq!(1, estimator.finish()); - - // After finish, state should be reset - let batch2 = new_batch( - b"test", - &[4, 5, 6], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(1, estimator.finish()); - } - #[test] fn test_series_estimator_flat_empty_batch() { let mut estimator = SeriesEstimator::default(); diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 0df3229e9c..88aebfc001 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -58,7 +58,7 @@ use crate::error::{ }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::metrics::INDEX_CREATE_MEMORY_USAGE; -use crate::read::{Batch, BatchReader}; +use crate::read::Batch; use crate::region::options::IndexOptions; use crate::region::version::VersionControlRef; use crate::region::{ManifestContextRef, RegionLeaderState}; @@ -802,9 +802,9 @@ impl IndexBuildTask { if let Some(mut parquet_reader) = parquet_reader { // TODO(SNC123): optimize index batch loop { - match parquet_reader.next_batch().await { - Ok(Some(mut batch)) => { - indexer.update(&mut batch).await; + 
match parquet_reader.next_record_batch().await { + Ok(Some(batch)) => { + indexer.update_flat(&batch).await; } Ok(None) => break, Err(e) => { @@ -1227,7 +1227,9 @@ mod tests { use crate::sst::parquet::WriteOptions; use crate::test_util::memtable_util::EmptyMemtableBuilder; use crate::test_util::scheduler_util::SchedulerEnv; - use crate::test_util::sst_util::{new_batch_by_range, new_source, sst_region_metadata}; + use crate::test_util::sst_util::{ + new_flat_source_from_record_batches, new_record_batch_by_range, sst_region_metadata, + }; struct MetaConfig { with_inverted: bool, @@ -1358,19 +1360,20 @@ mod tests { env: &SchedulerEnv, build_mode: IndexBuildMode, ) -> SstInfo { - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); let mut index_config = MitoConfig::default().index; index_config.build_mode = build_mode; let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata: metadata.clone(), - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: Default::default(), index_options: IndexOptions::default(), index_config, diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index aa98b69176..1c5bfd9db0 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -110,6 +110,7 @@ mod tests { TimestampMillisecondArray, UInt8Array, UInt64Array, }; use datatypes::arrow::datatypes::{DataType, Field, Schema, UInt32Type}; + use datatypes::arrow::util::pretty::pretty_format_batches; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{FulltextAnalyzer, FulltextBackend, FulltextOptions}; use object_store::ObjectStore; @@ 
-129,7 +130,7 @@ mod tests { use crate::cache::test_util::assert_parquet_metadata_equal; use crate::cache::{CacheManager, CacheStrategy, PageKey}; use crate::config::IndexConfig; - use crate::read::{BatchBuilder, BatchReader, FlatSource}; + use crate::read::FlatSource; use crate::region::options::{IndexOptions, InvertedIndexOptions}; use crate::sst::file::{FileHandle, FileMeta, RegionFileId, RegionIndexId}; use crate::sst::file_purger::NoopFilePurger; @@ -137,19 +138,19 @@ mod tests { use crate::sst::index::fulltext_index::applier::builder::FulltextIndexApplierBuilder; use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder; use crate::sst::index::{IndexBuildType, Indexer, IndexerBuilder, IndexerBuilderImpl}; - use crate::sst::parquet::format::PrimaryKeyWriteFormat; + use crate::sst::parquet::flat_format::FlatWriteFormat; use crate::sst::parquet::reader::{ParquetReader, ParquetReaderBuilder, ReaderMetrics}; use crate::sst::parquet::writer::ParquetWriter; use crate::sst::{ DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, location, to_flat_sst_arrow_schema, }; + use crate::test_util::TestEnv; use crate::test_util::sst_util::{ - build_test_binary_test_region_metadata, new_batch_by_range, new_batch_with_binary, - new_batch_with_custom_sequence, new_primary_key, new_source, new_sparse_primary_key, - sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata, + build_test_binary_test_region_metadata, new_flat_source_from_record_batches, + new_primary_key, new_record_batch_by_range, new_record_batch_with_custom_sequence, + new_sparse_primary_key, sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata, sst_region_metadata_with_encoding, }; - use crate::test_util::{TestEnv, check_reader_result}; const FILE_DIR: &str = "/"; const REGION_ID: RegionId = RegionId::new(0, 0); @@ -191,10 +192,10 @@ mod tests { region_file_id: handle.file_id(), }; let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - 
new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. let write_opts = WriteOptions { @@ -214,7 +215,7 @@ mod tests { .await; let info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -235,14 +236,14 @@ mod tests { object_store, ); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 50), - new_batch_by_range(&["a", "d"], 50, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 150), - new_batch_by_range(&["b", "h"], 150, 200), + new_record_batch_by_range(&["a", "d"], 0, 50), + new_record_batch_by_range(&["a", "d"], 50, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 150), + new_record_batch_by_range(&["b", "h"], 150, 200), ], ) .await; @@ -254,10 +255,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. 
let write_opts = WriteOptions { @@ -279,7 +280,7 @@ mod tests { .await; let sst_info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -299,14 +300,14 @@ mod tests { .cache(cache.clone()); for _ in 0..3 { let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 50), - new_batch_by_range(&["a", "d"], 50, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 150), - new_batch_by_range(&["b", "h"], 150, 200), + new_record_batch_by_range(&["a", "d"], 0, 50), + new_record_batch_by_range(&["a", "d"], 50, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 150), + new_record_batch_by_range(&["b", "h"], 150, 200), ], ) .await; @@ -340,10 +341,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); let write_opts = WriteOptions { row_group_size: 50, @@ -366,7 +367,7 @@ mod tests { .await; let sst_info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -392,10 +393,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - 
new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. let write_opts = WriteOptions { @@ -416,7 +417,7 @@ mod tests { ) .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -436,11 +437,11 @@ mod tests { ) .predicate(predicate); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 50), - new_batch_by_range(&["a", "d"], 50, 60), + new_record_batch_by_range(&["a", "d"], 0, 50), + new_record_batch_by_range(&["a", "d"], 50, 60), ], ) .await; @@ -452,10 +453,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "z"], 0, 0), - new_batch_by_range(&["a", "z"], 100, 100), - new_batch_by_range(&["a", "z"], 200, 230), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "z"], 0, 0), + new_record_batch_by_range(&["a", "z"], 100, 100), + new_record_batch_by_range(&["a", "z"], 200, 230), ]); // Use a small row group size for test. 
let write_opts = WriteOptions { @@ -476,7 +477,7 @@ mod tests { ) .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -488,7 +489,11 @@ mod tests { object_store, ); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result(&mut reader, &[new_batch_by_range(&["a", "z"], 200, 230)]).await; + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_by_range(&["a", "z"], 200, 230)], + ) + .await; } #[tokio::test] @@ -497,10 +502,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. 
let write_opts = WriteOptions { @@ -522,7 +527,7 @@ mod tests { .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -542,7 +547,11 @@ mod tests { ) .predicate(predicate); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result(&mut reader, &[new_batch_by_range(&["b", "h"], 150, 200)]).await; + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_by_range(&["b", "h"], 150, 200)], + ) + .await; } #[tokio::test] @@ -569,7 +578,7 @@ mod tests { let writer_props = props_builder.build(); - let write_format = PrimaryKeyWriteFormat::new(metadata); + let write_format = FlatWriteFormat::new(metadata, &FlatSchemaOptions::default()); let fields: Vec<_> = write_format .arrow_schema() .fields() @@ -603,9 +612,8 @@ mod tests { ) .unwrap(); - let batch = new_batch_with_binary(&["a"], 0, 60); - let arrow_batch = write_format.convert_batch(&batch).unwrap(); - let arrays: Vec<_> = arrow_batch + let batch = new_record_batch_with_binary(&["a"], 0, 60); + let arrays: Vec<_> = batch .columns() .iter() .map(|array| { @@ -629,11 +637,11 @@ mod tests { object_store, ); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_with_binary(&["a"], 0, 50), - new_batch_with_binary(&["a"], 50, 60), + new_record_batch_with_binary(&["a"], 0, 50), + new_record_batch_with_binary(&["a"], 50, 60), ], ) .await; @@ -646,17 +654,17 @@ mod tests { let mut env = TestEnv::new().await; let object_store = env.init_object_store_manager(); let metadata = Arc::new(sst_region_metadata()); - let batches = &[ - new_batch_by_range(&["a", "d"], 0, 1000), - new_batch_by_range(&["b", "f"], 0, 1000), - new_batch_by_range(&["c", "g"], 0, 1000), - new_batch_by_range(&["b", "h"], 100, 200), - new_batch_by_range(&["b", "h"], 200, 300), - new_batch_by_range(&["b", "h"], 300, 1000), + let batches = vec![ + 
new_record_batch_by_range(&["a", "d"], 0, 1000), + new_record_batch_by_range(&["b", "f"], 0, 1000), + new_record_batch_by_range(&["c", "g"], 0, 1000), + new_record_batch_by_range(&["b", "h"], 100, 200), + new_record_batch_by_range(&["b", "h"], 200, 300), + new_record_batch_by_range(&["b", "h"], 300, 1000), ]; let total_rows: usize = batches.iter().map(|batch| batch.num_rows()).sum(); - let source = new_source(batches); + let source = new_flat_source_from_record_batches(batches); let write_opts = WriteOptions { row_group_size: 50, max_file_size: Some(1024 * 16), @@ -678,7 +686,10 @@ mod tests { ) .await; - let files = writer.write_all(source, None, &write_opts).await.unwrap(); + let files = writer + .write_all_flat_as_primary_key(source, None, &write_opts) + .await + .unwrap(); assert_eq!(2, files.len()); let mut rows_read = 0; @@ -695,7 +706,7 @@ mod tests { object_store.clone(), ); let mut reader = builder.build().await.unwrap().unwrap(); - while let Some(batch) = reader.next_batch().await.unwrap() { + while let Some(batch) = reader.next_record_batch().await.unwrap() { rows_read += batch.num_rows(); } } @@ -710,12 +721,12 @@ mod tests { let metadata = Arc::new(sst_region_metadata()); let row_group_size = 50; - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 20), - new_batch_by_range(&["b", "d"], 0, 20), - new_batch_by_range(&["c", "d"], 0, 20), - new_batch_by_range(&["c", "f"], 0, 40), - new_batch_by_range(&["c", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 20), + new_record_batch_by_range(&["b", "d"], 0, 20), + new_record_batch_by_range(&["c", "d"], 0, 20), + new_record_batch_by_range(&["c", "f"], 0, 40), + new_record_batch_by_range(&["c", "h"], 100, 200), ]); // Use a small row group size for test. 
let write_opts = WriteOptions { @@ -760,7 +771,7 @@ mod tests { .await; let info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -877,6 +888,7 @@ mod tests { handle.clone(), object_store.clone(), ) + .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -891,7 +903,11 @@ mod tests { let mut reader = ParquetReader::new(Arc::new(context), selection) .await .unwrap(); - check_reader_result(&mut reader, &[new_batch_by_range(&["b", "d"], 0, 20)]).await; + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_by_range(&["b", "d"], 0, 20)], + ) + .await; assert_eq!(metrics.filter_metrics.rg_total, 4); assert_eq!(metrics.filter_metrics.rg_minmax_filtered, 3); @@ -937,6 +953,7 @@ mod tests { handle.clone(), object_store.clone(), ) + .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -991,6 +1008,7 @@ mod tests { handle.clone(), object_store.clone(), ) + .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -1005,13 +1023,13 @@ mod tests { let mut reader = ParquetReader::new(Arc::new(context), selection) .await .unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 20), - new_batch_by_range(&["b", "d"], 0, 20), - new_batch_by_range(&["c", "d"], 0, 10), - new_batch_by_range(&["c", "d"], 10, 20), + new_record_batch_by_range(&["a", "d"], 0, 20), + new_record_batch_by_range(&["b", "d"], 0, 20), + new_record_batch_by_range(&["c", "d"], 0, 10), + new_record_batch_by_range(&["c", "d"], 10, 
20), ], ) .await; @@ -1032,37 +1050,32 @@ mod tests { assert!(cached.contains_row_group(3)); } - /// Creates a flat format RecordBatch for testing. - /// Similar to `new_batch_by_range` but returns a RecordBatch in flat format. - fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch { + fn new_record_batch_with_binary(tags: &[&str], start: usize, end: usize) -> RecordBatch { assert!(end >= start); - let metadata = Arc::new(sst_region_metadata()); + let metadata = build_test_binary_test_region_metadata(); let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); let num_rows = end - start; let mut columns = Vec::new(); - // Add primary key columns (tag_0, tag_1) as dictionary arrays let mut tag_0_builder = StringDictionaryBuilder::::new(); - let mut tag_1_builder = StringDictionaryBuilder::::new(); - for _ in 0..num_rows { tag_0_builder.append_value(tags[0]); - tag_1_builder.append_value(tags[1]); } - columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef); - columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef); - // Add field column (field_0) - let field_values: Vec = (start..end).map(|v| v as u64).collect(); - columns.push(Arc::new(UInt64Array::from(field_values))); + let values = (0..num_rows) + .map(|_| "some data".as_bytes()) + .collect::>(); + columns.push( + Arc::new(datatypes::arrow::array::BinaryArray::from_iter_values( + values, + )) as ArrayRef, + ); - // Add time index column (ts) let timestamps: Vec = (start..end).map(|v| v as i64).collect(); columns.push(Arc::new(TimestampMillisecondArray::from(timestamps))); - // Add encoded primary key column let pk = new_primary_key(tags); let mut pk_builder = BinaryDictionaryBuilder::::new(); for _ in 0..num_rows { @@ -1070,10 +1083,7 @@ mod tests { } columns.push(Arc::new(pk_builder.finish())); - // Add sequence column columns.push(Arc::new(UInt64Array::from_value(1000, num_rows))); - - // Add op_type column 
columns.push(Arc::new(UInt8Array::from_value( OpType::Put as u8, num_rows, @@ -1082,9 +1092,19 @@ mod tests { RecordBatch::try_new(flat_schema, columns).unwrap() } - /// Creates a FlatSource from flat format RecordBatches. - fn new_flat_source_from_record_batches(batches: Vec) -> FlatSource { - FlatSource::Iter(Box::new(batches.into_iter().map(Ok))) + async fn check_record_batch_reader_result( + reader: &mut ParquetReader, + expected: &[RecordBatch], + ) { + let mut actual = Vec::new(); + while let Some(batch) = reader.next_record_batch().await.unwrap() { + actual.push(batch); + } + assert_eq!( + pretty_format_batches(expected).unwrap().to_string(), + pretty_format_batches(&actual).unwrap().to_string() + ); + assert!(reader.next_record_batch().await.unwrap().is_none()); } /// Creates a flat format RecordBatch for testing with sparse primary key encoding. @@ -1333,10 +1353,11 @@ mod tests { }; let metadata = Arc::new(sst_region_metadata()); - // Create batches with sequence 0 to trigger override functionality - let batch1 = new_batch_with_custom_sequence(&["a", "d"], 0, 60, 0); - let batch2 = new_batch_with_custom_sequence(&["b", "f"], 0, 40, 0); - let source = new_source(&[batch1, batch2]); + // Create batches with sequence 0 to trigger override functionality. 
+ let source = new_flat_source_from_record_batches(vec![ + new_record_batch_with_custom_sequence(&["a", "d"], 0, 60, 0), + new_record_batch_with_custom_sequence(&["b", "f"], 0, 40, 0), + ]); let write_opts = WriteOptions { row_group_size: 50, @@ -1355,7 +1376,7 @@ mod tests { .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -1369,7 +1390,7 @@ mod tests { ); let mut reader = builder.build().await.unwrap().unwrap(); let mut normal_batches = Vec::new(); - while let Some(batch) = reader.next_batch().await.unwrap() { + while let Some(batch) = reader.next_record_batch().await.unwrap() { normal_batches.push(batch); } @@ -1391,22 +1412,19 @@ mod tests { ); let mut reader = builder.build().await.unwrap().unwrap(); let mut override_batches = Vec::new(); - while let Some(batch) = reader.next_batch().await.unwrap() { + while let Some(batch) = reader.next_record_batch().await.unwrap() { override_batches.push(batch); } // Compare the results assert_eq!(normal_batches.len(), override_batches.len()); for (normal, override_batch) in normal_batches.into_iter().zip(override_batches.iter()) { - // Create expected batch with override sequence let expected_batch = { - let num_rows = normal.num_rows(); - let mut builder = BatchBuilder::from(normal); - builder - .sequences_array(Arc::new(UInt64Array::from_value(custom_sequence, num_rows))) - .unwrap(); - - builder.build().unwrap() + let mut columns = normal.columns().to_vec(); + let num_cols = columns.len(); + columns[num_cols - 2] = + Arc::new(UInt64Array::from_value(custom_sequence, normal.num_rows())); + RecordBatch::try_new(normal.schema(), columns).unwrap() }; // Override batch should match expected batch diff --git a/src/mito2/src/sst/parquet/flat_format.rs b/src/mito2/src/sst/parquet/flat_format.rs index d6b061e468..8a59e9a97d 100644 --- a/src/mito2/src/sst/parquet/flat_format.rs +++ b/src/mito2/src/sst/parquet/flat_format.rs @@ -52,8 
+52,8 @@ use crate::error::{ NewRecordBatchSnafu, Result, }; use crate::sst::parquet::format::{ - FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray, PrimaryKeyReadFormat, ReadFormat, - StatValues, + FIXED_POS_COLUMN_NUM, FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray, + PrimaryKeyReadFormat, ReadFormat, StatValues, }; use crate::sst::{ FlatSchemaOptions, flat_sst_arrow_schema_column_num, tag_maybe_to_dictionary_field, @@ -127,6 +127,21 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize { num_columns - 1 } +/// Returns the start index of field columns in a flat batch. +/// +/// `num_columns` is the total number of columns in the flat batch schema, +/// including tag columns (if present), field columns, and fixed position columns +/// (time index, primary key, sequence, op type). +/// +/// For Dense encoding (raw PK columns included): field_column_start = primary_key.len() +/// For Sparse encoding (no raw PK columns): field_column_start = 0 +pub(crate) fn field_column_start(metadata: &RegionMetadata, num_columns: usize) -> usize { + // Calculates field column start: total columns - fixed columns - field columns + // Field column count = total metadata columns - time index column - primary key columns + let field_column_count = metadata.column_metadatas.len() - 1 - metadata.primary_key.len(); + num_columns - FIXED_POS_COLUMN_NUM - field_column_count +} + // TODO(yingwen): Add an option to skip reading internal columns if the region is // append only and doesn't use sparse encoding (We need to check the table id under // sparse encoding). 
@@ -765,3 +780,89 @@ impl FlatReadFormat { .unwrap() } } + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + use store_api::codec::PrimaryKeyEncoding; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; + use store_api::storage::RegionId; + + use super::field_column_start; + use crate::sst::{FlatSchemaOptions, flat_sst_arrow_schema_column_num}; + + /// Builds a `RegionMetadata` with the given number of tags and fields. + fn build_metadata( + num_tags: usize, + num_fields: usize, + encoding: PrimaryKeyEncoding, + ) -> RegionMetadata { + let mut builder = RegionMetadataBuilder::new(RegionId::new(0, 0)); + let mut col_id = 0u32; + + for i in 0..num_tags { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + format!("tag_{i}"), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: col_id, + }); + col_id += 1; + } + + for i in 0..num_fields { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + format!("field_{i}"), + ConcreteDataType::uint64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: col_id, + }); + col_id += 1; + } + + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts".to_string(), + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: col_id, + }); + + let primary_key: Vec = (0..num_tags as u32).collect(); + builder.primary_key(primary_key); + builder.primary_key_encoding(encoding); + builder.build().unwrap() + } + + #[test] + fn test_field_column_start() { + // (num_tags, num_fields, encoding, expected) + let cases = [ + (1, 1, PrimaryKeyEncoding::Dense, 1), + (2, 2, PrimaryKeyEncoding::Dense, 2), + (0, 2, PrimaryKeyEncoding::Dense, 0), + (2, 2, PrimaryKeyEncoding::Sparse, 0), + ]; + + for 
(num_tags, num_fields, encoding, expected) in cases { + let metadata = build_metadata(num_tags, num_fields, encoding); + let options = FlatSchemaOptions::from_encoding(encoding); + let num_columns = flat_sst_arrow_schema_column_num(&metadata, &options); + let result = field_column_start(&metadata, num_columns); + assert_eq!( + result, expected, + "num_tags={num_tags}, num_fields={num_fields}, encoding={encoding:?}" + ); + } + } +} diff --git a/src/mito2/src/sst/parquet/format.rs b/src/mito2/src/sst/parquet/format.rs index 70d026e6db..ba64eac78b 100644 --- a/src/mito2/src/sst/parquet/format.rs +++ b/src/mito2/src/sst/parquet/format.rs @@ -34,12 +34,12 @@ use api::v1::SemanticType; use common_time::Timestamp; use datafusion_common::ScalarValue; use datatypes::arrow::array::{ - ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt32Array, UInt64Array, + ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt64Array, }; use datatypes::arrow::datatypes::{SchemaRef, UInt32Type}; use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::DataType; -use datatypes::vectors::{Helper, Vector}; +use datatypes::vectors::Helper; use mito_codec::row_converter::{ CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec, build_primary_key_codec_with_fields, @@ -51,8 +51,7 @@ use store_api::metadata::{ColumnMetadata, RegionMetadataRef}; use store_api::storage::{ColumnId, SequenceNumber}; use crate::error::{ - ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu, - NewRecordBatchSnafu, Result, + ConvertVectorSnafu, DecodeSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result, }; use crate::read::{Batch, BatchBuilder, BatchColumn}; use crate::sst::file::{FileMeta, FileTimeRange}; @@ -73,7 +72,6 @@ pub(crate) const INTERNAL_COLUMN_NUM: usize = 3; /// Helper for writing the SST format with primary key. pub(crate) struct PrimaryKeyWriteFormat { - metadata: RegionMetadataRef, /// SST file schema. 
arrow_schema: SchemaRef, override_sequence: Option, @@ -84,7 +82,6 @@ impl PrimaryKeyWriteFormat { pub(crate) fn new(metadata: RegionMetadataRef) -> PrimaryKeyWriteFormat { let arrow_schema = to_sst_arrow_schema(&metadata); PrimaryKeyWriteFormat { - metadata, arrow_schema, override_sequence: None, } @@ -104,40 +101,25 @@ impl PrimaryKeyWriteFormat { &self.arrow_schema } - /// Convert `batch` to a arrow record batch to store in parquet. - pub(crate) fn convert_batch(&self, batch: &Batch) -> Result { - debug_assert_eq!( - batch.fields().len() + FIXED_POS_COLUMN_NUM, - self.arrow_schema.fields().len() - ); - let mut columns = Vec::with_capacity(batch.fields().len() + FIXED_POS_COLUMN_NUM); - // Store all fields first. - for (column, column_metadata) in batch.fields().iter().zip(self.metadata.field_columns()) { - ensure!( - column.column_id == column_metadata.column_id, - InvalidBatchSnafu { - reason: format!( - "Batch has column {} but metadata has column {}", - column.column_id, column_metadata.column_id - ), - } - ); - - columns.push(column.data.to_arrow_array()); - } - // Add time index column. - columns.push(batch.timestamps().to_arrow_array()); - // Add internal columns: primary key, sequences, op types. - columns.push(new_primary_key_array(batch.primary_key(), batch.num_rows())); + /// Convert a flat `RecordBatch` to primary-key format, retaining only + /// field columns, time index, and internal columns. + /// + /// `num_fields` is the number of field columns. The method strips + /// leading tag columns: `num_tag_columns = batch.num_columns() - num_fields - FIXED_POS_COLUMN_NUM`. 
+ pub(crate) fn convert_flat_batch( + &self, + batch: &RecordBatch, + num_fields: usize, + ) -> Result { + let num_tag_columns = batch.num_columns() - num_fields - FIXED_POS_COLUMN_NUM; + let mut columns: Vec = batch.columns()[num_tag_columns..].to_vec(); if let Some(override_sequence) = self.override_sequence { - let sequence_array = + let num_cols = columns.len(); + // sequence is at num_cols - 2 (before op_type) + columns[num_cols - 2] = Arc::new(UInt64Array::from(vec![override_sequence; batch.num_rows()])); - columns.push(sequence_array); - } else { - columns.push(batch.sequences().to_arrow_array()); } - columns.push(batch.op_types().to_arrow_array()); RecordBatch::try_new(self.arrow_schema.clone(), columns).context(NewRecordBatchSnafu) } @@ -926,15 +908,6 @@ pub(crate) fn primary_key_offsets(pk_dict_array: &PrimaryKeyArray) -> Result ArrayRef { - let values = Arc::new(BinaryArray::from_iter_values([primary_key])); - let keys = UInt32Array::from_value(0, num_rows); - - // Safety: The key index is valid. - Arc::new(DictionaryArray::new(keys, values)) -} - /// Gets the min/max time index of the row group from the parquet meta. /// It assumes the parquet is created by the mito engine. 
pub(crate) fn parquet_row_group_time_range( @@ -1017,7 +990,7 @@ mod tests { use api::v1::OpType; use datatypes::arrow::array::{ - Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt64Array, + Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array, }; use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; use datatypes::prelude::ConcreteDataType; @@ -1145,13 +1118,6 @@ mod tests { assert_eq!(&build_test_arrow_schema(), write_format.arrow_schema()); } - #[test] - fn test_new_primary_key_array() { - let array = new_primary_key_array(b"test", 3); - let expect = build_test_pk_array(&[(b"test".to_vec(), 3)]) as ArrayRef; - assert_eq!(&expect, &array); - } - fn build_test_pk_array(pk_row_nums: &[(Vec, usize)]) -> Arc { let values = Arc::new(BinaryArray::from_iter_values( pk_row_nums.iter().map(|v| &v.0), @@ -1164,49 +1130,6 @@ mod tests { Arc::new(DictionaryArray::new(keys, values)) } - #[test] - fn test_convert_batch() { - let metadata = build_test_region_metadata(); - let write_format = PrimaryKeyWriteFormat::new(metadata); - - let num_rows = 4; - let batch = new_batch(b"test", 1, 2, num_rows); - let columns: Vec = vec![ - Arc::new(Int64Array::from(vec![2; num_rows])), // field1 - Arc::new(Int64Array::from(vec![3; num_rows])), // field0 - Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts - build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key - Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // sequence - Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type - ]; - let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap(); - - let actual = write_format.convert_batch(&batch).unwrap(); - assert_eq!(expect_record, actual); - } - - #[test] - fn test_convert_batch_with_override_sequence() { - let metadata = build_test_region_metadata(); - let write_format = - 
PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(415411)); - - let num_rows = 4; - let batch = new_batch(b"test", 1, 2, num_rows); - let columns: Vec = vec![ - Arc::new(Int64Array::from(vec![2; num_rows])), // field1 - Arc::new(Int64Array::from(vec![3; num_rows])), // field0 - Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts - build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key - Arc::new(UInt64Array::from(vec![415411; num_rows])), // sequence - Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type - ]; - let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap(); - - let actual = write_format.convert_batch(&batch).unwrap(); - assert_eq!(expect_record, actual); - } - #[test] fn test_projection_indices() { let metadata = build_test_region_metadata(); @@ -1867,4 +1790,100 @@ mod tests { let result = format.convert_batch(record_batch.clone(), None).unwrap(); assert_eq!(record_batch, result); } + + #[test] + fn test_convert_flat_batch() { + let metadata = build_test_region_metadata(); + let write_format = PrimaryKeyWriteFormat::new(metadata); + + let num_rows = 4; + // Build a flat record batch: tag0, tag1, field1, field0, ts, __primary_key, __sequence, __op_type + let flat_columns: Vec = input_columns_for_flat_batch(num_rows); + let flat_batch = RecordBatch::try_new(build_test_flat_sst_schema(), flat_columns).unwrap(); + + // num_fields = 2 (field1, field0) + let result = write_format.convert_flat_batch(&flat_batch, 2).unwrap(); + + // Expected: tag columns stripped, only field1, field0, ts, __primary_key, __sequence, __op_type + let expected_columns: Vec = vec![ + Arc::new(Int64Array::from(vec![2; num_rows])), // field1 + Arc::new(Int64Array::from(vec![3; num_rows])), // field0 + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts + build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key + Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; 
num_rows])), // __sequence + Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type + ]; + let expected = RecordBatch::try_new(build_test_arrow_schema(), expected_columns).unwrap(); + + assert_eq!(expected, result); + } + + #[test] + fn test_convert_flat_batch_with_override_sequence() { + let metadata = build_test_region_metadata(); + let write_format = PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(999)); + + let num_rows = 4; + let flat_columns: Vec = input_columns_for_flat_batch(num_rows); + let flat_batch = RecordBatch::try_new(build_test_flat_sst_schema(), flat_columns).unwrap(); + + let result = write_format.convert_flat_batch(&flat_batch, 2).unwrap(); + + let expected_columns: Vec = vec![ + Arc::new(Int64Array::from(vec![2; num_rows])), // field1 + Arc::new(Int64Array::from(vec![3; num_rows])), // field0 + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts + build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key + Arc::new(UInt64Array::from(vec![999; num_rows])), // overridden __sequence + Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type + ]; + let expected = RecordBatch::try_new(build_test_arrow_schema(), expected_columns).unwrap(); + + assert_eq!(expected, result); + } + + #[test] + fn test_convert_flat_batch_no_tags() { + // Test with a region that has no primary key columns (no tags to strip). 
+ let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field0", + ConcreteDataType::int64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }); + let metadata = Arc::new(builder.build().unwrap()); + let write_format = PrimaryKeyWriteFormat::new(metadata); + + let num_rows = 3; + // No tag columns, so flat batch is: field0, ts, __primary_key, __sequence, __op_type + let sst_schema = write_format.arrow_schema().clone(); + let columns: Vec = vec![ + Arc::new(Int64Array::from(vec![10; num_rows])), // field0 + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), // ts + build_test_pk_array(&[(b"".to_vec(), num_rows)]), // __primary_key + Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // __sequence + Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type + ]; + let flat_batch = RecordBatch::try_new(sst_schema.clone(), columns.clone()).unwrap(); + + // num_fields = 1, num_tag_columns = 5 - 1 - 4 = 0, so nothing is stripped + let result = write_format.convert_flat_batch(&flat_batch, 1).unwrap(); + let expected = RecordBatch::try_new(sst_schema, columns).unwrap(); + + assert_eq!(expected, result); + } } diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 500f32ae91..4d7122ccc6 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -21,9 +21,8 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use api::v1::SemanticType; -use async_trait::async_trait; use common_recordbatch::filter::SimpleFilterEvaluator; -use common_telemetry::{debug, tracing, warn}; +use common_telemetry::{tracing, warn}; use 
datafusion_expr::Expr; use datatypes::arrow::array::ArrayRef; use datatypes::arrow::datatypes::Field; @@ -57,7 +56,7 @@ use crate::metrics::{ READ_ROWS_TOTAL, READ_STAGE_ELAPSED, }; use crate::read::flat_projection::CompactionProjectionMapper; -use crate::read::prune::{PruneReader, Source}; +use crate::read::prune::FlatPruneReader; use crate::read::{Batch, BatchReader}; use crate::sst::file::FileHandle; use crate::sst::index::bloom_filter::applier::{ @@ -303,7 +302,8 @@ impl ParquetReaderBuilder { pub async fn build(&self) -> Result> { let mut metrics = ReaderMetrics::default(); - let Some((context, selection)) = self.build_reader_input(&mut metrics).await? else { + let Some((context, selection)) = self.build_reader_input_inner(&mut metrics, true).await? + else { return Ok(None); }; ParquetReader::new(Arc::new(context), selection) @@ -325,12 +325,14 @@ impl ParquetReaderBuilder { &self, metrics: &mut ReaderMetrics, ) -> Result> { - self.build_reader_input_inner(metrics).await + self.build_reader_input_inner(metrics, self.flat_format) + .await } async fn build_reader_input_inner( &self, metrics: &mut ReaderMetrics, + flat_format: bool, ) -> Result> { let start = Instant::now(); @@ -373,7 +375,7 @@ impl ParquetReaderBuilder { // before compat handling. let compaction_projection_mapper = if self.compaction && !is_same_region_partition - && self.flat_format + && flat_format && region_meta.primary_key_encoding == PrimaryKeyEncoding::Sparse { Some(CompactionProjectionMapper::try_new(®ion_meta)?) 
@@ -385,7 +387,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(column_ids), - self.flat_format, + flat_format, Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -401,7 +403,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(&column_ids), - self.flat_format, + flat_format, Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -1751,24 +1753,6 @@ impl RowGroupReaderBuilder { } } -/// The state of a [ParquetReader]. -enum ReaderState { - /// The reader is reading a row group. - Readable(PruneReader), - /// The reader is exhausted. - Exhausted(ReaderMetrics), -} - -impl ReaderState { - /// Returns the metrics of the reader. - fn metrics(&self) -> ReaderMetrics { - match self { - ReaderState::Readable(reader) => reader.metrics(), - ReaderState::Exhausted(m) => m.clone(), - } - } -} - /// The filter to evaluate or the prune result of the default value. pub(crate) enum MaybeFilter { /// The filter to evaluate. @@ -1879,13 +1863,12 @@ pub struct ParquetReader { /// Row group selection to read. selection: RowGroupSelection, /// Reader of current row group. - reader_state: ReaderState, + reader: Option, /// Metrics for tracking row group fetch operations. fetch_metrics: ParquetFetchMetrics, } -#[async_trait] -impl BatchReader for ParquetReader { +impl ParquetReader { #[tracing::instrument( skip_all, fields( @@ -1893,18 +1876,20 @@ impl BatchReader for ParquetReader { file_id = %self.context.reader_builder().file_handle.file_id() ) )] - async fn next_batch(&mut self) -> Result> { - let ReaderState::Readable(reader) = &mut self.reader_state else { - return Ok(None); - }; + pub async fn next_record_batch(&mut self) -> Result> { + loop { + if let Some(reader) = &mut self.reader { + if let Some(batch) = reader.next_batch()? 
{ + return Ok(Some(batch)); + } + self.reader = None; + continue; + } - // We don't collect the elapsed time if the reader returns an error. - if let Some(batch) = reader.next_batch().await? { - return Ok(Some(batch)); - } + let Some((row_group_idx, row_selection)) = self.selection.pop_first() else { + return Ok(None); + }; - // No more items in current row group, reads next row group. - while let Some((row_group_idx, row_selection)) = self.selection.pop_first() { let parquet_reader = self .context .reader_builder() @@ -1915,54 +1900,14 @@ impl BatchReader for ParquetReader { ) .await?; - // Resets the parquet reader. - // Compute skip_fields for this row group let skip_fields = self.context.should_skip_fields(row_group_idx); - reader.reset_source( - Source::RowGroup(RowGroupReader::new(self.context.clone(), parquet_reader)), + self.reader = Some(FlatPruneReader::new_with_row_group_reader( + self.context.clone(), + FlatRowGroupReader::new(self.context.clone(), parquet_reader), skip_fields, - ); - if let Some(batch) = reader.next_batch().await? { - return Ok(Some(batch)); - } + )); } - - // The reader is exhausted. - self.reader_state = ReaderState::Exhausted(reader.metrics().clone()); - Ok(None) } -} - -impl Drop for ParquetReader { - fn drop(&mut self) { - let metrics = self.reader_state.metrics(); - debug!( - "Read parquet {} {}, range: {:?}, {}/{} row groups, metrics: {:?}", - self.context.reader_builder().file_handle.region_id(), - self.context.reader_builder().file_handle.file_id(), - self.context.reader_builder().file_handle.time_range(), - metrics.filter_metrics.rg_total - - metrics.filter_metrics.rg_inverted_filtered - - metrics.filter_metrics.rg_minmax_filtered - - metrics.filter_metrics.rg_fulltext_filtered - - metrics.filter_metrics.rg_bloom_filtered, - metrics.filter_metrics.rg_total, - metrics - ); - - // Report metrics. 
- READ_STAGE_ELAPSED - .with_label_values(&["build_parquet_reader"]) - .observe(metrics.build_cost.as_secs_f64()); - READ_STAGE_ELAPSED - .with_label_values(&["scan_row_groups"]) - .observe(metrics.scan_cost.as_secs_f64()); - metrics.observe_rows("parquet_reader"); - metrics.filter_metrics.observe(); - } -} - -impl ParquetReader { /// Creates a new reader. #[tracing::instrument( skip_all, @@ -1975,28 +1920,27 @@ impl ParquetReader { context: FileRangeContextRef, mut selection: RowGroupSelection, ) -> Result { + debug_assert!(context.read_format().as_flat().is_some()); let fetch_metrics = ParquetFetchMetrics::default(); - // No more items in current row group, reads next row group. - let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() { + let reader = if let Some((row_group_idx, row_selection)) = selection.pop_first() { let parquet_reader = context .reader_builder() .build(row_group_idx, Some(row_selection), Some(&fetch_metrics)) .await?; - // Compute skip_fields once for this row group let skip_fields = context.should_skip_fields(row_group_idx); - ReaderState::Readable(PruneReader::new_with_row_group_reader( + Some(FlatPruneReader::new_with_row_group_reader( context.clone(), - RowGroupReader::new(context.clone(), parquet_reader), + FlatRowGroupReader::new(context.clone(), parquet_reader), skip_fields, )) } else { - ReaderState::Exhausted(ReaderMetrics::default()) + None }; Ok(ParquetReader { context, selection, - reader_state, + reader, fetch_metrics, }) } diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index b207f11ef8..4e75073e26 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -50,7 +50,7 @@ use crate::config::{IndexBuildMode, IndexConfig}; use crate::error::{ InvalidMetadataSnafu, OpenDalSnafu, Result, UnexpectedSnafu, WriteParquetSnafu, }; -use crate::read::{Batch, FlatSource, Source}; +use crate::read::FlatSource; use 
crate::sst::file::RegionFileId; use crate::sst::index::{IndexOutput, Indexer, IndexerBuilder}; use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index}; @@ -60,6 +60,35 @@ use crate::sst::{ DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator, }; +/// Converts a flat RecordBatch for writing to parquet. +enum FlatBatchConverter { + /// Write as-is in flat format. + Flat(FlatWriteFormat), + /// Convert flat batch to primary-key format by stripping tag columns. + PrimaryKey { + format: PrimaryKeyWriteFormat, + num_fields: usize, + }, +} + +impl FlatBatchConverter { + fn arrow_schema(&self) -> &SchemaRef { + match self { + FlatBatchConverter::Flat(f) => f.arrow_schema(), + FlatBatchConverter::PrimaryKey { format, .. } => format.arrow_schema(), + } + } + + fn convert_batch(&self, batch: &RecordBatch) -> Result { + match self { + FlatBatchConverter::Flat(f) => f.convert_batch(batch), + FlatBatchConverter::PrimaryKey { format, num_fields } => { + format.convert_flat_batch(batch, *num_fields) + } + } + } +} + /// Parquet SST writer. pub struct ParquetWriter<'a, F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> { /// Path provider that creates SST and index file paths according to file id. @@ -240,81 +269,6 @@ where Ok(()) } - /// Iterates source and writes all rows to Parquet file. - /// - /// Returns the [SstInfo] if the SST is written. - pub async fn write_all( - &mut self, - source: Source, - override_sequence: Option, // override the `sequence` field from `Source` - opts: &WriteOptions, - ) -> Result { - let res = self - .write_all_without_cleaning(source, override_sequence, opts) - .await; - if res.is_err() { - // Clean tmp files explicitly on failure. 
- let file_id = self.current_file; - if let Some(cleaner) = &self.file_cleaner { - cleaner.clean_by_file_id(file_id).await; - } - } - res - } - - async fn write_all_without_cleaning( - &mut self, - mut source: Source, - override_sequence: Option, // override the `sequence` field from `Source` - opts: &WriteOptions, - ) -> Result { - let mut results = smallvec![]; - let write_format = PrimaryKeyWriteFormat::new(self.metadata.clone()) - .with_override_sequence(override_sequence); - let mut stats = SourceStats::default(); - - while let Some(res) = self - .write_next_batch(&mut source, &write_format, opts) - .await - .transpose() - { - match res { - Ok(mut batch) => { - stats.update(&batch); - let start = Instant::now(); - // safety: self.current_indexer must be set when first batch has been written. - match self.index_config.build_mode { - IndexBuildMode::Sync => { - self.current_indexer - .as_mut() - .unwrap() - .update(&mut batch) - .await; - } - IndexBuildMode::Async => {} - } - self.metrics.update_index += start.elapsed(); - if let Some(max_file_size) = opts.max_file_size - && self.bytes_written.load(Ordering::Relaxed) > max_file_size - { - self.finish_current_file(&mut results, &mut stats).await?; - } - } - Err(e) => { - if let Some(indexer) = &mut self.current_indexer { - indexer.abort().await; - } - return Err(e); - } - } - } - - self.finish_current_file(&mut results, &mut stats).await?; - - // object_store.write will make sure all bytes are written or an error is raised. - Ok(results) - } - /// Iterates FlatSource and writes all RecordBatch in flat format to Parquet file. /// /// Returns the [SstInfo] if the SST is written. 
@@ -324,11 +278,15 @@ where override_sequence: Option, opts: &WriteOptions, ) -> Result { - let res = self - .write_all_flat_without_cleaning(source, override_sequence, opts) - .await; + let converter = FlatBatchConverter::Flat( + FlatWriteFormat::new( + self.metadata.clone(), + &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding), + ) + .with_override_sequence(override_sequence), + ); + let res = self.write_all_flat_inner(source, &converter, opts).await; if res.is_err() { - // Clean tmp files explicitly on failure. let file_id = self.current_file; if let Some(cleaner) = &self.file_cleaner { cleaner.clean_by_file_id(file_id).await; @@ -337,36 +295,58 @@ where res } - async fn write_all_flat_without_cleaning( + /// Iterates FlatSource and writes all RecordBatch in primary-key format to Parquet file. + /// + /// Returns the [SstInfo] if the SST is written. + pub async fn write_all_flat_as_primary_key( &mut self, - mut source: FlatSource, + source: FlatSource, override_sequence: Option, opts: &WriteOptions, + ) -> Result { + let num_fields = self.metadata.field_columns().count(); + let converter = FlatBatchConverter::PrimaryKey { + format: PrimaryKeyWriteFormat::new(self.metadata.clone()) + .with_override_sequence(override_sequence), + num_fields, + }; + let res = self.write_all_flat_inner(source, &converter, opts).await; + if res.is_err() { + let file_id = self.current_file; + if let Some(cleaner) = &self.file_cleaner { + cleaner.clean_by_file_id(file_id).await; + } + } + res + } + + async fn write_all_flat_inner( + &mut self, + mut source: FlatSource, + converter: &FlatBatchConverter, + opts: &WriteOptions, ) -> Result { let mut results = smallvec![]; - let flat_format = FlatWriteFormat::new( - self.metadata.clone(), - &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding), - ) - .with_override_sequence(override_sequence); let mut stats = SourceStats::default(); while let Some(record_batch) = self - .write_next_flat_batch(&mut source, 
&flat_format, opts) + .write_next_flat_batch(&mut source, converter, opts) .await .transpose() { match record_batch { Ok(batch) => { stats.update_flat(&batch)?; - let start = Instant::now(); - // safety: self.current_indexer must be set when first batch has been written. - self.current_indexer - .as_mut() - .unwrap() - .update_flat(&batch) - .await; - self.metrics.update_index += start.elapsed(); + if matches!(self.index_config.build_mode, IndexBuildMode::Sync) { + let start = Instant::now(); + // safety: self.current_indexer must be set when first batch has been written. + self.current_indexer + .as_mut() + .unwrap() + .update_flat(&batch) + .await; + self.metrics.update_index += start.elapsed(); + } if let Some(max_file_size) = opts.max_file_size && self.bytes_written.load(Ordering::Relaxed) > max_file_size { @@ -411,34 +391,10 @@ where .set_column_compression(op_type_col, Compression::UNCOMPRESSED) } - async fn write_next_batch( - &mut self, - source: &mut Source, - write_format: &PrimaryKeyWriteFormat, - opts: &WriteOptions, - ) -> Result> { - let start = Instant::now(); - let Some(batch) = source.next_batch().await? else { - return Ok(None); - }; - self.metrics.iter_source += start.elapsed(); - - let arrow_batch = write_format.convert_batch(&batch)?; - - let start = Instant::now(); - self.maybe_init_writer(write_format.arrow_schema(), opts) - .await? 
- .write(&arrow_batch) - .await - .context(WriteParquetSnafu)?; - self.metrics.write_batch += start.elapsed(); - Ok(Some(batch)) - } - async fn write_next_flat_batch( &mut self, source: &mut FlatSource, - flat_format: &FlatWriteFormat, + converter: &FlatBatchConverter, opts: &WriteOptions, ) -> Result> { let start = Instant::now(); @@ -447,15 +403,16 @@ where }; self.metrics.iter_source += start.elapsed(); - let arrow_batch = flat_format.convert_batch(&record_batch)?; + let arrow_batch = converter.convert_batch(&record_batch)?; let start = Instant::now(); - self.maybe_init_writer(flat_format.arrow_schema(), opts) + self.maybe_init_writer(converter.arrow_schema(), opts) .await? .write(&arrow_batch) .await .context(WriteParquetSnafu)?; self.metrics.write_batch += start.elapsed(); + // Return original flat batch for stats/indexer which use flat layout. Ok(Some(record_batch)) } @@ -515,26 +472,6 @@ struct SourceStats { } impl SourceStats { - fn update(&mut self, batch: &Batch) { - if batch.is_empty() { - return; - } - - self.num_rows += batch.num_rows(); - self.series_estimator.update(batch); - // Safety: batch is not empty. 
- let (min_in_batch, max_in_batch) = ( - batch.first_timestamp().unwrap(), - batch.last_timestamp().unwrap(), - ); - if let Some(time_range) = &mut self.time_range { - time_range.0 = time_range.0.min(min_in_batch); - time_range.1 = time_range.1.max(max_in_batch); - } else { - self.time_range = Some((min_in_batch, max_in_batch)); - } - } - fn update_flat(&mut self, record_batch: &RecordBatch) -> Result<()> { if record_batch.num_rows() == 0 { return Ok(()); diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index 389d9bf107..e9515030c0 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs @@ -18,7 +18,11 @@ use std::sync::Arc; use api::v1::{OpType, SemanticType}; use common_time::Timestamp; -use datatypes::arrow::array::{BinaryArray, TimestampMillisecondArray, UInt8Array, UInt64Array}; +use datatypes::arrow::array::{ + ArrayRef, BinaryDictionaryBuilder, RecordBatch, StringDictionaryBuilder, + TimestampMillisecondArray, UInt8Array, UInt64Array, +}; +use datatypes::arrow::datatypes::UInt32Type; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, SkippingIndexOptions}; use datatypes::value::ValueRef; @@ -32,8 +36,9 @@ use store_api::metric_engine_consts::{ use store_api::storage::consts::ReservedColumnId; use store_api::storage::{FileId, RegionId}; -use crate::read::{Batch, BatchBuilder, Source}; +use crate::read::{Batch, FlatSource, Source}; use crate::sst::file::{FileHandle, FileMeta}; +use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; use crate::test_util::{VecBatchReader, new_batch_builder, new_noop_file_purger}; /// Test region id. @@ -246,34 +251,68 @@ pub fn new_batch_by_range(tags: &[&str], start: usize, end: usize) -> Batch { new_batch_with_custom_sequence(tags, start, end, 1000) } -pub fn new_batch_with_binary(tags: &[&str], start: usize, end: usize) -> Batch { +/// Creates a flat format RecordBatch for testing. 
+/// Similar to `new_batch_by_range` but returns a RecordBatch in flat format. +pub fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch { + new_record_batch_with_custom_sequence(tags, start, end, 1000) +} + +/// Creates a flat format RecordBatch for testing with a custom sequence. +pub fn new_record_batch_with_custom_sequence( + tags: &[&str], + start: usize, + end: usize, + sequence: u64, +) -> RecordBatch { assert!(end >= start); + let metadata = Arc::new(sst_region_metadata()); + let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); + + let num_rows = end - start; + let mut columns = Vec::new(); + + // Add primary key columns (tag_0, tag_1) as dictionary arrays + let mut tag_0_builder = StringDictionaryBuilder::::new(); + let mut tag_1_builder = StringDictionaryBuilder::::new(); + + for _ in 0..num_rows { + tag_0_builder.append_value(tags[0]); + tag_1_builder.append_value(tags[1]); + } + + columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef); + columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef); + + // Add field column (field_0) + let field_values: Vec = (start..end).map(|v| v as u64).collect(); + columns.push(Arc::new(UInt64Array::from(field_values))); + + // Add time index column (ts) + let timestamps: Vec = (start..end).map(|v| v as i64).collect(); + columns.push(Arc::new(TimestampMillisecondArray::from(timestamps))); + + // Add encoded primary key column let pk = new_primary_key(tags); - let timestamps: Vec<_> = (start..end).map(|v| v as i64).collect(); - let sequences = vec![1000; end - start]; - let op_types = vec![OpType::Put; end - start]; + let mut pk_builder = BinaryDictionaryBuilder::::new(); + for _ in 0..num_rows { + pk_builder.append(&pk).unwrap(); + } + columns.push(Arc::new(pk_builder.finish())); - let field: Vec<_> = (start..end) - .map(|_v| "some data".as_bytes().to_vec()) - .collect(); + // Add sequence column + 
columns.push(Arc::new(UInt64Array::from_value(sequence, num_rows))); - let mut builder = BatchBuilder::new(pk); - builder - .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values( - timestamps.iter().copied(), - ))) - .unwrap() - .sequences_array(Arc::new(UInt64Array::from_iter_values( - sequences.iter().copied(), - ))) - .unwrap() - .op_types_array(Arc::new(UInt8Array::from_iter_values( - op_types.iter().map(|v| *v as u8), - ))) - .unwrap() - .push_field_array(1, Arc::new(BinaryArray::from_iter_values(field))) - .unwrap(); - builder.build().unwrap() + // Add op_type column + columns.push(Arc::new(UInt8Array::from_value( + OpType::Put as u8, + num_rows, + ))); + RecordBatch::try_new(flat_schema, columns).unwrap() +} + +/// Creates a FlatSource from flat format RecordBatches. +pub fn new_flat_source_from_record_batches(batches: Vec) -> FlatSource { + FlatSource::Iter(Box::new(batches.into_iter().map(Ok))) } /// Creates a new region metadata for testing SSTs with binary datatype. 
From 0dfbba0b3f4333a20c9a861c6339d085b988be7b Mon Sep 17 00:00:00 2001 From: liyang Date: Fri, 13 Mar 2026 20:42:15 +0800 Subject: [PATCH 007/195] ci: upload artifacts use s3 proxy (#7800) * ci: upload artifacts use s3 proxy Signed-off-by: liyang * update echo context Signed-off-by: liyang --------- Signed-off-by: liyang --- .../actions/release-cn-artifacts/action.yaml | 32 +++++----------- .github/scripts/upload-artifacts-to-s3.sh | 38 ++++++++++++------- .github/workflows/dev-build.yml | 7 ++-- .github/workflows/nightly-build.yml | 7 ++-- .github/workflows/release.yml | 7 ++-- 5 files changed, 44 insertions(+), 47 deletions(-) diff --git a/.github/actions/release-cn-artifacts/action.yaml b/.github/actions/release-cn-artifacts/action.yaml index 2825d3f5d0..fe78d5a760 100644 --- a/.github/actions/release-cn-artifacts/action.yaml +++ b/.github/actions/release-cn-artifacts/action.yaml @@ -37,17 +37,14 @@ inputs: description: Whether to push the latest tag of the image required: false default: 'true' - aws-cn-s3-bucket: - description: S3 bucket to store released artifacts in CN region + proxy-url: + description: The url of the S3 proxy server required: true - aws-cn-access-key-id: - description: AWS access key id in CN region + proxy-username: + description: The username of the S3 proxy required: true - aws-cn-secret-access-key: - description: AWS secret access key in CN region - required: true - aws-cn-region: - description: AWS region in CN + proxy-password: + description: The password of the S3 proxy required: true upload-to-s3: description: Upload to S3 @@ -77,21 +74,13 @@ runs: with: path: ${{ inputs.artifacts-dir }} - - name: Install s5cmd - shell: bash - run: | - wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz - tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz - sudo mv s5cmd /usr/local/bin/ - sudo chmod +x /usr/local/bin/s5cmd - - name: Release artifacts to cn region uses: nick-invision/retry@v2 if: ${{ inputs.upload-to-s3 == 
'true' }} env: - AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }} - AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }} - AWS_REGION: ${{ inputs.aws-cn-region }} + PROXY_URL: ${{ inputs.proxy-url }} + PROXY_USERNAME: ${{ inputs.proxy-username }} + PROXY_PASSWORD: ${{ inputs.proxy-password }} UPDATE_VERSION_INFO: ${{ inputs.update-version-info }} with: max_attempts: ${{ inputs.upload-max-retry-times }} @@ -99,8 +88,7 @@ runs: command: | ./.github/scripts/upload-artifacts-to-s3.sh \ ${{ inputs.artifacts-dir }} \ - ${{ inputs.version }} \ - ${{ inputs.aws-cn-s3-bucket }} + ${{ inputs.version }} - name: Push greptimedb image from Dockerhub to ACR shell: bash diff --git a/.github/scripts/upload-artifacts-to-s3.sh b/.github/scripts/upload-artifacts-to-s3.sh index 75c8f8d932..310575c069 100755 --- a/.github/scripts/upload-artifacts-to-s3.sh +++ b/.github/scripts/upload-artifacts-to-s3.sh @@ -5,16 +5,15 @@ set -o pipefail ARTIFACTS_DIR=$1 VERSION=$2 -AWS_S3_BUCKET=$3 RELEASE_DIRS="releases/greptimedb" GREPTIMEDB_REPO="GreptimeTeam/greptimedb" # Check if necessary variables are set. function check_vars() { - for var in AWS_S3_BUCKET VERSION ARTIFACTS_DIR; do + for var in VERSION ARTIFACTS_DIR; do if [ -z "${!var}" ]; then echo "$var is not set or empty." 
- echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi done @@ -33,8 +32,13 @@ function upload_artifacts() { # ├── greptime-darwin-amd64-v0.2.0.sha256sum # └── greptime-darwin-amd64-v0.2.0.tar.gz find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do - s5cmd cp \ - "$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")" + filename=$(basename "$file") + TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION/$filename" + + curl -X PUT \ + -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ + -F "file=@$file" \ + "$TARGET_URL" done } @@ -45,16 +49,24 @@ function update_version_info() { if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "Updating latest-version.txt" echo "$VERSION" > latest-version.txt - s5cmd cp \ - latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt" + TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-version.txt" + + curl -X PUT \ + -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ + -F "file=@latest-version.txt" \ + "$TARGET_URL" fi # If it's the nightly release, update latest-nightly-version.txt. 
if [[ "$VERSION" == *"nightly"* ]]; then echo "Updating latest-nightly-version.txt" echo "$VERSION" > latest-nightly-version.txt - s5cmd cp \ - latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt" + + TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-nightly-version.txt" + curl -X PUT \ + -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ + -F "file=@latest-nightly-version.txt" \ + "$TARGET_URL" fi fi } @@ -93,10 +105,10 @@ function main() { } # Usage example: -# AWS_ACCESS_KEY_ID= \ -# AWS_SECRET_ACCESS_KEY= \ -# AWS_DEFAULT_REGION= \ +# PROXY_URL= \ +# PROXY_USERNAME= \ +# PROXY_PASSWORD= \ # UPDATE_VERSION_INFO=true \ # DOWNLOAD_ARTIFACTS_FROM_GITHUB=false \ -# ./upload-artifacts-to-s3.sh +# ./upload-artifacts-to-s3.sh main diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml index 021867e4ed..d03fbeff14 100644 --- a/.github/workflows/dev-build.yml +++ b/.github/workflows/dev-build.yml @@ -285,10 +285,9 @@ jobs: dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }} dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }} version: ${{ needs.allocate-runners.outputs.version }} - aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }} - aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }} - aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }} - aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }} + proxy-url: ${{ secrets.PROXY_URL }} + proxy-username: ${{ secrets.PROXY_USERNAME }} + proxy-password: ${{ secrets.PROXY_PASSWORD }} upload-to-s3: ${{ inputs.upload_artifacts_to_s3 }} dev-mode: true # Only build the standard images(exclude centos images). push-latest-tag: false # Don't push the latest tag to registry. 
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 9eaa38c789..14ebb6e715 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -236,10 +236,9 @@ jobs: dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }} dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }} version: ${{ needs.allocate-runners.outputs.version }} - aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }} - aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }} - aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }} - aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }} + proxy-url: ${{ secrets.PROXY_URL }} + proxy-username: ${{ secrets.PROXY_USERNAME }} + proxy-password: ${{ secrets.PROXY_PASSWORD }} upload-to-s3: false dev-mode: false update-version-info: false # Don't update version info in S3. diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3b0eb2d68c..9f8f2d9703 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -358,10 +358,9 @@ jobs: dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }} dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }} version: ${{ needs.allocate-runners.outputs.version }} - aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }} - aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }} - aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }} - aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }} + proxy-url: ${{ secrets.PROXY_URL }} + proxy-username: ${{ secrets.PROXY_USERNAME }} + proxy-password: ${{ secrets.PROXY_PASSWORD }} dev-mode: false upload-to-s3: true update-version-info: true From 306e8398cf441ab9041da8297144000eca4657b6 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 16 Mar 2026 11:01:02 +0800 Subject: [PATCH 008/195] fix: correct unicode representation for jsonb_to_string (#7810) * fix: correct unicode representation for jsonb_to_string * refactor: correct function name and behavior * fix: fix json_to_string and 
provide tests --- .../src/scalars/json/json_to_string.rs | 3 +- src/datatypes/src/types/json_type.rs | 146 ++++++++---------- .../standalone/common/types/json/json.result | 64 ++++---- .../standalone/common/types/json/json.sql | 30 ++-- 4 files changed, 119 insertions(+), 124 deletions(-) diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs index 6c0cc260b2..6364dff4de 100644 --- a/src/common/function/src/scalars/json/json_to_string.rs +++ b/src/common/function/src/scalars/json/json_to_string.rs @@ -19,6 +19,7 @@ use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder}; use datafusion_common::arrow::datatypes::DataType; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datatypes::types::jsonb_to_string; use crate::function::{Function, extract_args}; @@ -74,7 +75,7 @@ impl Function for JsonToStringFunction { for i in 0..size { let json = jsons.is_valid(i).then(|| jsons.value(i)); let result = json - .map(|json| jsonb::from_slice(json).map(|x| x.to_string())) + .map(jsonb_to_string) .transpose() .map_err(|e| DataFusionError::Execution(format!("invalid json binary: {e}")))?; diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 61586fc460..912bbfca54 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -396,7 +396,7 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { match jsonb::from_slice(val) { Ok(jsonb_value) => { let serialized = jsonb_value.to_string(); - Ok(serialized) + fix_unicode_point(&serialized) } Err(e) => InvalidJsonbSnafu { error: e }.fail(), } @@ -405,18 +405,12 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { /// Converts a json type value to serde_json::Value pub fn jsonb_to_serde_json(val: &[u8]) -> Result { let json_string = jsonb_to_string(val)?; - jsonb_string_to_serde_value(&json_string) + 
serde_json::Value::from_str(&json_string).context(DeserializeSnafu { json: json_string }) } -/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort -/// fallback for Rust-style Unicode escape sequences. +/// Normalizes a JSON string by converting Rust-style Unicode escape sequences to JSON-compatible format. /// -/// This function is intended to be used on JSON strings produced from the internal -/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls -/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is -/// returned as-is. -/// -/// If the initial parse fails, the input is scanned for Rust-style Unicode code +/// The input is scanned for Rust-style Unicode code /// point escapes of the form `\\u{H...}` (a backslash, `u`, an opening brace, /// followed by 1–6 hexadecimal digits, and a closing brace). Each such escape is /// converted into JSON-compatible UTF‑16 escape sequences: @@ -427,59 +421,44 @@ pub fn jsonb_to_serde_json(val: &[u8]) -> Result { /// the code point is encoded as a UTF‑16 surrogate pair and emitted as two consecutive /// `\\uXXXX` sequences (as JSON format required). /// -/// After this normalization, the function retries parsing the resulting string as -/// JSON and returns the deserialized value or a `DeserializeSnafu` error if it -/// still cannot be parsed. -fn jsonb_string_to_serde_value(json: &str) -> Result { - match serde_json::Value::from_str(json) { - Ok(v) => Ok(v), - Err(e) => { - // If above deserialization is failed, the JSON string might contain some Rust chars - // that are somehow incorrectly represented as Unicode code point literal. For example, - // "\u{fe0f}". We have to convert them to JSON compatible format, like "\uFE0F", then - // try to deserialize the JSON string again. 
- if !e.is_syntax() || !e.to_string().contains("invalid escape") { - return Err(e).context(DeserializeSnafu { json }); - } +/// After this normalization, the function returns the normalized string +fn fix_unicode_point(json: &str) -> Result { + static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { + // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits + // inside braces. + Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) + }); - static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { - // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits - // inside braces. - Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) - }); + let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { + // Extract the hex payload (without braces) and parse to a code point. + let hex = &caps[1]; + let Ok(code) = u32::from_str_radix(hex, 16) else { + // On parse failure, leave the original escape sequence unchanged. + return caps[0].to_string(); + }; - let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { - // Extract the hex payload (without braces) and parse to a code point. - let hex = &caps[1]; - let Ok(code) = u32::from_str_radix(hex, 16) else { - // On parse failure, leave the original escape sequence unchanged. - return caps[0].to_string(); - }; + if code <= 0xFFFF { + // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. + format!("\\u{:04X}", code) + } else if code > 0x10FFFF { + // Beyond max Unicode code point + caps[0].to_string() + } else { + // Supplementary planes: JSON needs UTF-16 surrogate pairs. + // Convert the code point to a 20-bit value. + let code = code - 0x10000; - if code <= 0xFFFF { - // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. 
- format!("\\u{:04X}", code) - } else if code > 0x10FFFF { - // Beyond max Unicode code point - caps[0].to_string() - } else { - // Supplementary planes: JSON needs UTF-16 surrogate pairs. - // Convert the code point to a 20-bit value. - let code = code - 0x10000; + // High surrogate: top 10 bits, offset by 0xD800. + let high = 0xD800 + ((code >> 10) & 0x3FF); - // High surrogate: top 10 bits, offset by 0xD800. - let high = 0xD800 + ((code >> 10) & 0x3FF); + // Low surrogate: bottom 10 bits, offset by 0xDC00. + let low = 0xDC00 + (code & 0x3FF); - // Low surrogate: bottom 10 bits, offset by 0xDC00. - let low = 0xDC00 + (code & 0x3FF); - - // Emit two \uXXXX escapes in sequence. - format!("\\u{:04X}\\u{:04X}", high, low) - } - }); - serde_json::Value::from_str(&v).context(DeserializeSnafu { json }) + // Emit two \uXXXX escapes in sequence. + format!("\\u{:04X}\\u{:04X}", high, low) } - } + }); + Ok(v.to_string()) } /// Parses a string to a json type value @@ -495,45 +474,54 @@ mod tests { use crate::json::JsonStructureSettings; #[test] - fn test_jsonb_string_to_serde_value() -> Result<()> { + fn test_fix_unicode_point() -> Result<()> { let valid_cases = vec![ - (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#), + (r#"{"data": "simple ascii"}"#, r#"{"data": "simple ascii"}"#), ( - r#"{"data": "Greek sigma: \u{03a3}"}"#, - r#"{"data":"Greek sigma: Σ"}"#, + r#"{"data":"Greek sigma: \u{03a3}"}"#, + r#"{"data":"Greek sigma: \u03A3"}"#, ), ( - r#"{"data": "Joker card: \u{1f0df}"}"#, - r#"{"data":"Joker card: 🃟"}"#, + r#"{"data":"Joker card: \u{1f0df}"}"#, + r#"{"data":"Joker card: \uD83C\uDCDF"}"#, ), ( - r#"{"data": "BMP boundary: \u{ffff}"}"#, - r#"{"data":"BMP boundary: ￿"}"#, + r#"{"data":"BMP boundary: \u{ffff}"}"#, + r#"{"data":"BMP boundary: \uFFFF"}"#, ), ( - r#"{"data": "Supplementary min: \u{10000}"}"#, - r#"{"data":"Supplementary min: 𐀀"}"#, + r#"{"data":"Supplementary min: \u{10000}"}"#, + r#"{"data":"Supplementary min: \uD800\uDC00"}"#, ), ( - 
r#"{"data": "Supplementary max: \u{10ffff}"}"#, - r#"{"data":"Supplementary max: 􏿿"}"#, + r#"{"data":"Supplementary max: \u{10ffff}"}"#, + r#"{"data":"Supplementary max: \uDBFF\uDFFF"}"#, ), ]; for (input, expect) in valid_cases { - let v = jsonb_string_to_serde_value(input)?; - assert_eq!(v.to_string(), expect); + let v = fix_unicode_point(input)?; + assert_eq!(v, expect); } - let invalid_cases = vec![ - r#"{"data": "Invalid hex: \u{gggg}"}"#, - r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#, - r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit - r#"{"data": "Empty braces: \u{}"}"#, + let invalid_escape_cases = vec![ + ( + r#"{"data": "Invalid hex: \u{gggg}"}"#, + r#"{"data": "Invalid hex: \u{gggg}"}"#, + ), + ( + r#"{"data": "Empty braces: \u{}"}"#, + r#"{"data": "Empty braces: \u{}"}"#, + ), + ( + r#"{"data": "Out of range: \u{1100000}"}"#, + r#"{"data": "Out of range: \u{1100000}"}"#, + ), ]; - for input in invalid_cases { - let result = jsonb_string_to_serde_value(input); - assert!(result.is_err()); + for (input, expect) in invalid_escape_cases { + let v = fix_unicode_point(input)?; + assert_eq!(v, expect); } + Ok(()) } diff --git a/tests/cases/standalone/common/types/json/json.result b/tests/cases/standalone/common/types/json/json.result index 8c4755f4ae..8fad9632b1 100644 --- a/tests/cases/standalone/common/types/json/json.result +++ b/tests/cases/standalone/common/types/json/json.result @@ -37,22 +37,23 @@ INSERT INTO jsons VALUES('[null]', 0), } ] } -}}', 11); +}}', 11), +('{"a":"abc\u2028tom"}', 12); -Affected Rows: 12 +Affected Rows: 13 -INSERT INTO jsons VALUES(parse_json('[null]'), 12), -(parse_json('[true]'), 13), -(parse_json('[false]'), 14), -(parse_json('[0]'), 15), -(parse_json('["foo"]'), 16), -(parse_json('[]'), 17), -(parse_json('{}'), 18), -(parse_json('[0,1]'), 19), -(parse_json('{"foo":"bar"}'), 20), -(parse_json('{"a":null,"foo":"bar"}'), 21), -(parse_json('[-1]'), 22), -(parse_json('[-2147483648]'), 23), +INSERT INTO jsons 
VALUES(parse_json('[null]'), 1000), +(parse_json('[true]'), 1001), +(parse_json('[false]'), 1002), +(parse_json('[0]'), 1003), +(parse_json('["foo"]'), 1004), +(parse_json('[]'), 1005), +(parse_json('{}'), 1006), +(parse_json('[0,1]'), 1007), +(parse_json('{"foo":"bar"}'), 1008), +(parse_json('{"a":null,"foo":"bar"}'), 1009), +(parse_json('[-1]'), 1010), +(parse_json('[-2147483648]'), 1011), (parse_json('{"entities": { "description": { "urls": [ @@ -76,9 +77,10 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12), } ] } - }}'), 24); + }}'), 1012), +(parse_json('{"a":"abc\u2028tom"}'), 1013); -Affected Rows: 13 +Affected Rows: 14 SELECT json_to_string(j), t FROM jsons; @@ -97,25 +99,27 @@ SELECT json_to_string(j), t FROM jsons; | {"a":null,"foo":"bar"} | 1970-01-01T00:00:00.009 | | [-1] | 1970-01-01T00:00:00.010 | | {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.011 | -| [null] | 1970-01-01T00:00:00.012 | -| [true] | 1970-01-01T00:00:00.013 | -| [false] | 1970-01-01T00:00:00.014 | -| [0] | 1970-01-01T00:00:00.015 | -| ["foo"] | 1970-01-01T00:00:00.016 | -| [] | 1970-01-01T00:00:00.017 | -| {} | 1970-01-01T00:00:00.018 | -| [0,1] | 1970-01-01T00:00:00.019 | -| {"foo":"bar"} | 1970-01-01T00:00:00.020 | -| {"a":null,"foo":"bar"} | 1970-01-01T00:00:00.021 | -| [-1] | 1970-01-01T00:00:00.022 | -| [-2147483648] | 1970-01-01T00:00:00.023 | -| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.024 | +| 
{"a":"abc\u2028tom"} | 1970-01-01T00:00:00.012 | +| [null] | 1970-01-01T00:00:01 | +| [true] | 1970-01-01T00:00:01.001 | +| [false] | 1970-01-01T00:00:01.002 | +| [0] | 1970-01-01T00:00:01.003 | +| ["foo"] | 1970-01-01T00:00:01.004 | +| [] | 1970-01-01T00:00:01.005 | +| {} | 1970-01-01T00:00:01.006 | +| [0,1] | 1970-01-01T00:00:01.007 | +| {"foo":"bar"} | 1970-01-01T00:00:01.008 | +| {"a":null,"foo":"bar"} | 1970-01-01T00:00:01.009 | +| [-1] | 1970-01-01T00:00:01.010 | +| [-2147483648] | 1970-01-01T00:00:01.011 | +| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:01.012 | +| {"a":"abc\u2028tom"} | 1970-01-01T00:00:01.013 | +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+ --Insert invalid json strings-- DELETE FROM jsons; -Affected Rows: 25 +Affected Rows: 27 INSERT INTO jsons VALUES(parse_json('{"a":1, "b":2, "c":3'), 4); diff --git a/tests/cases/standalone/common/types/json/json.sql b/tests/cases/standalone/common/types/json/json.sql index 868edc59e8..5a521ee1c6 100644 --- a/tests/cases/standalone/common/types/json/json.sql +++ b/tests/cases/standalone/common/types/json/json.sql @@ -35,20 +35,21 @@ INSERT INTO jsons VALUES('[null]', 0), } ] } -}}', 11); +}}', 11), +('{"a":"abc\u2028tom"}', 12); -INSERT INTO jsons VALUES(parse_json('[null]'), 12), -(parse_json('[true]'), 13), -(parse_json('[false]'), 14), -(parse_json('[0]'), 15), -(parse_json('["foo"]'), 16), -(parse_json('[]'), 17), 
-(parse_json('{}'), 18), -(parse_json('[0,1]'), 19), -(parse_json('{"foo":"bar"}'), 20), -(parse_json('{"a":null,"foo":"bar"}'), 21), -(parse_json('[-1]'), 22), -(parse_json('[-2147483648]'), 23), +INSERT INTO jsons VALUES(parse_json('[null]'), 1000), +(parse_json('[true]'), 1001), +(parse_json('[false]'), 1002), +(parse_json('[0]'), 1003), +(parse_json('["foo"]'), 1004), +(parse_json('[]'), 1005), +(parse_json('{}'), 1006), +(parse_json('[0,1]'), 1007), +(parse_json('{"foo":"bar"}'), 1008), +(parse_json('{"a":null,"foo":"bar"}'), 1009), +(parse_json('[-1]'), 1010), +(parse_json('[-2147483648]'), 1011), (parse_json('{"entities": { "description": { "urls": [ @@ -72,7 +73,8 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12), } ] } - }}'), 24); + }}'), 1012), +(parse_json('{"a":"abc\u2028tom"}'), 1013); SELECT json_to_string(j), t FROM jsons; From c6f1ef8aecfd78044fbadb88d1de70a7f1a94b39 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Sun, 15 Mar 2026 20:52:27 -0700 Subject: [PATCH 009/195] feat: track unlimited usage in memory manager (#7811) * feat: track unlimited usage in memory manager Signed-off-by: jeremyhi * chore: by gemini comment Signed-off-by: jeremyhi * chore: remove unused import Signed-off-by: jeremyhi --------- Signed-off-by: jeremyhi --- Cargo.lock | 1 - src/common/memory-manager/Cargo.toml | 1 - src/common/memory-manager/src/guard.rs | 131 +++++++++++++++-------- src/common/memory-manager/src/manager.rs | 113 +++++++++++++++---- src/common/memory-manager/src/tests.rs | 18 ++-- 5 files changed, 192 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94f7a3eca1..1f65f1289c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2488,7 +2488,6 @@ version = "1.0.0-rc.2" dependencies = [ "common-error", "common-macro", - "common-telemetry", "humantime", "serde", "snafu 0.8.6", diff --git a/src/common/memory-manager/Cargo.toml b/src/common/memory-manager/Cargo.toml index a6be50f774..6686c98167 100644 --- a/src/common/memory-manager/Cargo.toml 
+++ b/src/common/memory-manager/Cargo.toml @@ -10,7 +10,6 @@ workspace = true [dependencies] common-error = { workspace = true } common-macro = { workspace = true } -common-telemetry = { workspace = true } humantime = { workspace = true } serde = { workspace = true } snafu = { workspace = true } diff --git a/src/common/memory-manager/src/guard.rs b/src/common/memory-manager/src/guard.rs index 770b6dec24..ad3111581b 100644 --- a/src/common/memory-manager/src/guard.rs +++ b/src/common/memory-manager/src/guard.rs @@ -14,14 +14,13 @@ use std::{fmt, mem}; -use common_telemetry::debug; use snafu::ensure; use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use crate::error::{ MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result, }; -use crate::manager::{MemoryMetrics, MemoryQuota}; +use crate::manager::{MemoryMetrics, MemoryQuota, UnlimitedMemoryQuota}; use crate::policy::OnExhaustedPolicy; /// Guard representing a slice of reserved memory. @@ -30,31 +29,57 @@ pub struct MemoryGuard { } pub(crate) enum GuardState { - Unlimited, + Released, + Unlimited { + quota: UnlimitedMemoryQuota, + granted_bytes: u64, + }, Limited { - permit: OwnedSemaphorePermit, quota: MemoryQuota, + permit: OwnedSemaphorePermit, }, } +impl GuardState { + fn release(self) { + match self { + GuardState::Released => {} + GuardState::Unlimited { + quota, + granted_bytes, + } => { + quota.sub_in_use(granted_bytes); + } + GuardState::Limited { quota, permit } => { + quota.release_permit(permit); + } + } + } +} + impl MemoryGuard { - pub(crate) fn unlimited() -> Self { + pub(crate) fn unlimited(quota: UnlimitedMemoryQuota, bytes: u64) -> Self { + quota.add_in_use(bytes); Self { - state: GuardState::Unlimited, + state: GuardState::Unlimited { + quota, + granted_bytes: bytes, + }, } } - pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota) -> Self { + pub(crate) fn limited(quota: MemoryQuota, permit: OwnedSemaphorePermit) -> Self { Self { - state: 
GuardState::Limited { permit, quota }, + state: GuardState::Limited { quota, permit }, } } /// Returns granted quota in bytes. pub fn granted_bytes(&self) -> u64 { match &self.state { - GuardState::Unlimited => 0, - GuardState::Limited { permit, quota } => { + GuardState::Released => 0, + GuardState::Unlimited { granted_bytes, .. } => *granted_bytes, + GuardState::Limited { quota, permit } => { quota.permits_to_bytes(permit.num_permits() as u32) } } @@ -68,13 +93,24 @@ impl MemoryGuard { /// - Returns error if requested bytes would exceed the manager's total limit /// - Returns error if the semaphore is unexpectedly closed pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> { - match &mut self.state { - GuardState::Unlimited => Ok(()), - GuardState::Limited { permit, quota } => { - if bytes == 0 { - return Ok(()); - } + if bytes == 0 { + return Ok(()); + } + match &mut self.state { + GuardState::Released => { + debug_assert!(false, "released memory guard state should not be reused"); + Ok(()) + } + GuardState::Unlimited { + quota, + granted_bytes, + } => { + quota.add_in_use(bytes); + *granted_bytes = granted_bytes.saturating_add(bytes); + Ok(()) + } + GuardState::Limited { quota, permit } => { let additional_permits = quota.bytes_to_permits(bytes); let current_permits = permit.num_permits() as u32; @@ -95,7 +131,6 @@ impl MemoryGuard { permit.merge(additional_permit); quota.update_in_use_metric(); - debug!("Acquired additional {} bytes", bytes); Ok(()) } } @@ -106,13 +141,24 @@ impl MemoryGuard { /// On success, merges the new memory into this guard and returns true. /// On failure, returns false and leaves this guard unchanged. 
pub fn try_acquire_additional(&mut self, bytes: u64) -> bool { - match &mut self.state { - GuardState::Unlimited => true, - GuardState::Limited { permit, quota } => { - if bytes == 0 { - return true; - } + if bytes == 0 { + return true; + } + match &mut self.state { + GuardState::Released => { + debug_assert!(false, "released memory guard state should not be reused"); + false + } + GuardState::Unlimited { + quota, + granted_bytes, + } => { + quota.add_in_use(bytes); + *granted_bytes = granted_bytes.saturating_add(bytes); + true + } + GuardState::Limited { quota, permit } => { let additional_permits = quota.bytes_to_permits(bytes); match quota @@ -123,7 +169,6 @@ impl MemoryGuard { Ok(additional_permit) => { permit.merge(additional_permit); quota.update_in_use_metric(); - debug!("Acquired additional {} bytes", bytes); true } Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { @@ -168,7 +213,8 @@ impl MemoryGuard { MemoryLimitExceededSnafu { requested_bytes: bytes, limit_bytes: match &self.state { - GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds + GuardState::Released => 0, + GuardState::Unlimited { .. } => 0, // unreachable: unlimited mode always succeeds GuardState::Limited { quota, .. } => { quota.permits_to_bytes(quota.limit_permits) } @@ -184,22 +230,30 @@ impl MemoryGuard { /// /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted. 
pub fn release_partial(&mut self, bytes: u64) -> bool { + if bytes == 0 { + return true; + } + match &mut self.state { - GuardState::Unlimited => true, - GuardState::Limited { permit, quota } => { - if bytes == 0 { - return true; + GuardState::Released => true, + GuardState::Unlimited { + quota, + granted_bytes, + } => { + if bytes > *granted_bytes { + return false; } + quota.sub_in_use(bytes); + *granted_bytes = granted_bytes.saturating_sub(bytes); + true + } + GuardState::Limited { quota, permit } => { let release_permits = quota.bytes_to_permits(bytes); match permit.split(release_permits as usize) { Some(released_permit) => { - let released_bytes = - quota.permits_to_bytes(released_permit.num_permits() as u32); - drop(released_permit); - quota.update_in_use_metric(); - debug!("Released {} bytes from memory guard", released_bytes); + quota.release_permit(released_permit); true } None => false, @@ -211,14 +265,7 @@ impl MemoryGuard { impl Drop for MemoryGuard { fn drop(&mut self) { - if let GuardState::Limited { permit, quota } = - mem::replace(&mut self.state, GuardState::Unlimited) - { - let bytes = quota.permits_to_bytes(permit.num_permits() as u32); - drop(permit); - quota.update_in_use_metric(); - debug!("Released memory: {} bytes", bytes); - } + mem::replace(&mut self.state, GuardState::Released).release(); } } diff --git a/src/common/memory-manager/src/manager.rs b/src/common/memory-manager/src/manager.rs index 50360d2a31..8cca5f220c 100644 --- a/src/common/memory-manager/src/manager.rs +++ b/src/common/memory-manager/src/manager.rs @@ -13,9 +13,10 @@ // limitations under the License. 
use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use snafu::ensure; -use tokio::sync::{Semaphore, TryAcquireError}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError}; use crate::error::{ MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result, @@ -34,7 +35,7 @@ pub trait MemoryMetrics: Clone + Send + Sync + 'static { /// Generic memory manager for quota-controlled operations. #[derive(Clone)] pub struct MemoryManager { - quota: Option>, + quota: MemoryQuotaState, } impl Default for MemoryManager { @@ -51,6 +52,18 @@ pub(crate) struct MemoryQuota { pub(crate) metrics: M, } +#[derive(Clone)] +pub(crate) struct UnlimitedMemoryQuota { + pub(crate) current_bytes: Arc, + pub(crate) metrics: M, +} + +#[derive(Clone)] +pub(crate) enum MemoryQuotaState { + Unlimited(UnlimitedMemoryQuota), + Limited(MemoryQuota), +} + impl MemoryManager { /// Creates a new memory manager with the given limit in bytes. /// `limit_bytes = 0` disables the limit. @@ -62,7 +75,12 @@ impl MemoryManager { pub fn with_granularity(limit_bytes: u64, granularity: PermitGranularity, metrics: M) -> Self { if limit_bytes == 0 { metrics.set_limit(0); - return Self { quota: None }; + return Self { + quota: MemoryQuotaState::Unlimited(UnlimitedMemoryQuota { + current_bytes: Arc::new(AtomicU64::new(0)), + metrics, + }), + }; } let limit_permits = granularity.bytes_to_permits(limit_bytes); @@ -70,7 +88,7 @@ impl MemoryManager { metrics.set_limit(limit_aligned_bytes as i64); Self { - quota: Some(MemoryQuota { + quota: MemoryQuotaState::Limited(MemoryQuota { semaphore: Arc::new(Semaphore::new(limit_permits as usize)), limit_permits, granularity, @@ -81,26 +99,30 @@ impl MemoryManager { /// Returns the configured limit in bytes (0 if unlimited). 
pub fn limit_bytes(&self) -> u64 { - self.quota - .as_ref() - .map(|quota| quota.permits_to_bytes(quota.limit_permits)) - .unwrap_or(0) + match &self.quota { + MemoryQuotaState::Unlimited(_) => 0, + MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.limit_permits), + } } /// Returns currently used bytes. pub fn used_bytes(&self) -> u64 { - self.quota - .as_ref() - .map(|quota| quota.permits_to_bytes(quota.used_permits())) - .unwrap_or(0) + match &self.quota { + MemoryQuotaState::Unlimited(quota) => quota.current_bytes.load(Ordering::Acquire), + MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.used_permits()), + } } /// Returns available bytes. + /// + /// Unlimited managers report `u64::MAX`. pub fn available_bytes(&self) -> u64 { - self.quota - .as_ref() - .map(|quota| quota.permits_to_bytes(quota.available_permits_clamped())) - .unwrap_or(0) + match &self.quota { + MemoryQuotaState::Unlimited(_) => u64::MAX, + MemoryQuotaState::Limited(quota) => { + quota.permits_to_bytes(quota.available_permits_clamped()) + } + } } /// Acquires memory, waiting if necessary until enough is available. @@ -110,8 +132,8 @@ impl MemoryManager { /// - Returns error if the semaphore is unexpectedly closed pub async fn acquire(&self, bytes: u64) -> Result> { match &self.quota { - None => Ok(MemoryGuard::unlimited()), - Some(quota) => { + MemoryQuotaState::Unlimited(quota) => Ok(MemoryGuard::unlimited(quota.clone(), bytes)), + MemoryQuotaState::Limited(quota) => { let permits = quota.bytes_to_permits(bytes); ensure!( @@ -129,7 +151,7 @@ impl MemoryManager { .await .map_err(|_| MemorySemaphoreClosedSnafu.build())?; quota.update_in_use_metric(); - Ok(MemoryGuard::limited(permit, quota.clone())) + Ok(MemoryGuard::limited(quota.clone(), permit)) } } } @@ -137,14 +159,16 @@ impl MemoryManager { /// Tries to acquire memory. Returns Some(guard) on success, None if insufficient. 
pub fn try_acquire(&self, bytes: u64) -> Option> { match &self.quota { - None => Some(MemoryGuard::unlimited()), - Some(quota) => { + MemoryQuotaState::Unlimited(quota) => { + Some(MemoryGuard::unlimited(quota.clone(), bytes)) + } + MemoryQuotaState::Limited(quota) => { let permits = quota.bytes_to_permits(bytes); match quota.semaphore.clone().try_acquire_many_owned(permits) { Ok(permit) => { quota.update_in_use_metric(); - Some(MemoryGuard::limited(permit, quota.clone())) + Some(MemoryGuard::limited(quota.clone(), permit)) } Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { quota.metrics.inc_rejected("try_acquire"); @@ -219,4 +243,49 @@ impl MemoryQuota { let bytes = self.permits_to_bytes(self.used_permits()); self.metrics.set_in_use(bytes as i64); } + + pub(crate) fn release_permit(&self, permit: OwnedSemaphorePermit) { + drop(permit); + self.update_in_use_metric(); + } +} + +impl UnlimitedMemoryQuota { + pub(crate) fn add_in_use(&self, bytes: u64) { + if bytes == 0 { + return; + } + + let previous = self + .current_bytes + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { + Some(current.saturating_add(bytes)) + }) + .unwrap(); + let new_total = previous.saturating_add(bytes); + debug_assert!( + new_total >= previous, + "unlimited memory usage counter overflowed" + ); + self.metrics.set_in_use(new_total as i64); + } + + pub(crate) fn sub_in_use(&self, bytes: u64) { + if bytes == 0 { + return; + } + + let previous = self + .current_bytes + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { + Some(current.saturating_sub(bytes)) + }) + .unwrap(); + debug_assert!( + previous >= bytes, + "unlimited memory usage counter underflowed: current={previous}, release={bytes}" + ); + let new_total = previous.saturating_sub(bytes); + self.metrics.set_in_use(new_total as i64); + } } diff --git a/src/common/memory-manager/src/tests.rs b/src/common/memory-manager/src/tests.rs index 886eef9dac..fe02703f0b 100644 --- 
a/src/common/memory-manager/src/tests.rs +++ b/src/common/memory-manager/src/tests.rs @@ -24,7 +24,9 @@ fn test_try_acquire_unlimited() { let manager = MemoryManager::new(0, NoOpMetrics); let guard = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap(); assert_eq!(manager.limit_bytes(), 0); - assert_eq!(guard.granted_bytes(), 0); + assert_eq!(manager.available_bytes(), u64::MAX); + assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES); } #[test] @@ -136,7 +138,10 @@ fn test_request_additional_unlimited() { // Should always succeed with unlimited manager assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES)); - assert_eq!(guard.granted_bytes(), 0); + assert_eq!(guard.granted_bytes(), 105 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 105 * PERMIT_GRANULARITY_BYTES); + + drop(guard); assert_eq!(manager.used_bytes(), 0); } @@ -187,9 +192,10 @@ fn test_early_release_partial_unlimited() { let manager = MemoryManager::new(0, NoOpMetrics); let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap(); - // Unlimited guard - release should succeed (no-op) + // Unlimited guard should track and release exact bytes. 
assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES)); - assert_eq!(guard.granted_bytes(), 0); + assert_eq!(guard.granted_bytes(), 50 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 50 * PERMIT_GRANULARITY_BYTES); } #[test] @@ -406,6 +412,6 @@ async fn test_acquire_additional_unlimited() { .acquire_additional(1000 * PERMIT_GRANULARITY_BYTES) .await .unwrap(); - assert_eq!(guard.granted_bytes(), 0); - assert_eq!(manager.used_bytes(), 0); + assert_eq!(guard.granted_bytes(), 1000 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 1000 * PERMIT_GRANULARITY_BYTES); } From b007f8598633e2ad6a5ec9b226e1dbe02a6b083d Mon Sep 17 00:00:00 2001 From: maximk777 Date: Mon, 16 Mar 2026 12:10:33 +0500 Subject: [PATCH 010/195] feat(http): improve error logging with client IP (#7503) * feat(http): improve error logging with client IP - Add logging to ErrorResponse::from_error_message() - Add middleware to log HTTP errors with client IP Closes #7328 Signed-off-by: maximk777 * fix(http): address review comments for error logging Restore rich Debug logging in from_error(), add URI/method/matched path to client IP middleware, and only log when client address is available. 
Signed-off-by: evenyag --------- Signed-off-by: maximk777 Signed-off-by: evenyag Co-authored-by: evenyag --- src/servers/src/http.rs | 8 +- src/servers/src/http/client_ip.rs | 109 ++++++++++++++++++++ src/servers/src/http/result/error_result.rs | 13 ++- 3 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 src/servers/src/http/client_ip.rs diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index ffd0745041..506a240cac 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -112,8 +112,8 @@ pub mod utils; use result::HttpOutputWriter; pub(crate) use timeout::DynamicTimeoutLayer; +mod client_ip; use crate::prom_remote_write::validation::PromValidationMode; - mod hints; mod read_preference; #[cfg(any(test, feature = "testing"))] @@ -883,6 +883,7 @@ impl HttpServer { authorize::check_http_auth, )) .layer(middleware::from_fn(hints::extract_hints)) + .layer(middleware::from_fn(client_ip::log_error_with_client_ip)) .layer(middleware::from_fn( read_preference::extract_read_preference, )), @@ -1247,7 +1248,10 @@ impl Server for HttpServer { error!(e; "Failed to set TCP_NODELAY on incoming connection"); } }); - let serve = axum::serve(listener, app.into_make_service()); + let serve = axum::serve( + listener, + app.into_make_service_with_connect_info::(), + ); // FIXME(yingwen): Support keepalive. // See: diff --git a/src/servers/src/http/client_ip.rs b/src/servers/src/http/client_ip.rs new file mode 100644 index 0000000000..70df554ebb --- /dev/null +++ b/src/servers/src/http/client_ip.rs @@ -0,0 +1,109 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::net::SocketAddr; + +use axum::body::Body; +use axum::extract::{ConnectInfo, MatchedPath}; +use axum::http::Request; +use axum::middleware::Next; +use axum::response::Response; +use common_telemetry::warn; + +/// Middleware that logs HTTP error responses (4xx/5xx) with client IP address. +/// +/// Extracts client address from [`ConnectInfo`] if available. +pub async fn log_error_with_client_ip(req: Request, next: Next) -> Response { + let request_info = req + .extensions() + .get::>() + .map(|c| c.0) + .map(|addr| { + let method = req.method().clone(); + let uri = req.uri().clone(); + let matched_path = req.extensions().get::().cloned(); + (addr, method, uri, matched_path) + }); + + let response = next.run(req).await; + + if (response.status().is_client_error() || response.status().is_server_error()) + && let Some((addr, method, uri, matched_path)) = request_info + { + warn!( + "HTTP error response {} for {} {} (matched: {}) from client {}", + response.status(), + method, + uri, + matched_path + .as_ref() + .map(|p| p.as_str()) + .unwrap_or(""), + addr + ); + } + + response +} + +#[cfg(test)] +mod tests { + use axum::Router; + use axum::routing::get; + use http::StatusCode; + use tower::ServiceExt; + + use super::*; + + #[tokio::test] + async fn test_middleware_passes_error_response() { + async fn not_found_handler() -> StatusCode { + StatusCode::NOT_FOUND + } + + let app = Router::new() + .route("/not-found", get(not_found_handler)) + .layer(axum::middleware::from_fn(log_error_with_client_ip)); + + let response = app + .oneshot( + 
Request::builder() + .uri("/not-found") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::NOT_FOUND); + } + + #[tokio::test] + async fn test_middleware_passes_success_response() { + async fn ok_handler() -> StatusCode { + StatusCode::OK + } + + let app = Router::new() + .route("/ok", get(ok_handler)) + .layer(axum::middleware::from_fn(log_error_with_client_ip)); + + let response = app + .oneshot(Request::builder().uri("/ok").body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + } +} diff --git a/src/servers/src/http/result/error_result.rs b/src/servers/src/http/result/error_result.rs index 7b70066b68..9bd6e1a7a3 100644 --- a/src/servers/src/http/result/error_result.rs +++ b/src/servers/src/http/result/error_result.rs @@ -32,17 +32,24 @@ pub struct ErrorResponse { impl ErrorResponse { pub fn from_error(error: impl ErrorExt) -> Self { let code = error.status_code(); - if code.should_log_error() { error!(error; "Failed to handle HTTP request"); } else { debug!("Failed to handle HTTP request, err: {:?}", error); } - - Self::from_error_message(code, error.output_msg()) + ErrorResponse { + code: code as u32, + error: error.output_msg(), + execution_time_ms: 0, + } } pub fn from_error_message(code: StatusCode, msg: String) -> Self { + if code.should_log_error() { + error!("Failed to handle HTTP request: {}", msg); + } else { + debug!("Failed to handle HTTP request: {}", msg); + } ErrorResponse { code: code as u32, error: msg, From be4a7a6d371f29377bcc8acfa2c1f1a24b31d7e1 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Mon, 16 Mar 2026 15:49:31 +0800 Subject: [PATCH 011/195] refactor: remove Memtable::iter (#7809) * refactor: remove Memtable::iter Signed-off-by: Lei, HUANG * fix: review comments Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- src/mito2/benches/memtable_bench.rs | 20 ++++- 
src/mito2/benches/simple_bulk_memtable.rs | 8 +- src/mito2/src/memtable.rs | 30 +++---- src/mito2/src/memtable/bulk.rs | 10 --- src/mito2/src/memtable/partition_tree.rs | 85 +++++++----------- .../src/memtable/simple_bulk_memtable.rs | 68 +++++++++----- .../simple_bulk_memtable/test_only.rs | 88 +------------------ src/mito2/src/memtable/time_partition.rs | 62 +++++++++++-- src/mito2/src/memtable/time_series.rs | 55 ++++-------- src/mito2/src/test_util/memtable_util.rs | 10 --- 10 files changed, 184 insertions(+), 252 deletions(-) diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs index ebe994f861..df991f6f92 100644 --- a/src/mito2/benches/memtable_bench.rs +++ b/src/mito2/benches/memtable_bench.rs @@ -28,7 +28,7 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter; use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig}; use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable}; use mito2::memtable::time_series::TimeSeriesMemtable; -use mito2::memtable::{KeyValues, Memtable, RangesOptions}; +use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions}; use mito2::read::flat_merge::FlatMergeIterator; use mito2::read::scan_region::PredicateGroup; use mito2::region::options::MergeMode; @@ -105,7 +105,11 @@ fn full_scan(c: &mut Criterion) { } b.iter(|| { - let iter = memtable.iter(None, None, None).unwrap(); + let iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); for batch in iter { let _batch = batch.unwrap(); } @@ -145,7 +149,17 @@ fn filter_1_host(c: &mut Criterion) { let predicate = generator.random_host_filter(); b.iter(|| { - let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap(); + let iter = memtable + .ranges( + None, + RangesOptions { + predicate: PredicateGroup::new(&metadata, predicate.exprs()).unwrap(), + ..Default::default() + }, + ) + .unwrap() + .build(None) + .unwrap(); for batch in iter { let _batch = 
batch.unwrap(); } diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs index 0277397768..05035734de 100644 --- a/src/mito2/benches/simple_bulk_memtable.rs +++ b/src/mito2/benches/simple_bulk_memtable.rs @@ -21,7 +21,7 @@ use criterion::{Criterion, criterion_group, criterion_main}; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable; -use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions}; +use mito2::memtable::{IterBuilder, KeyValues, Memtable, MemtableRanges, RangesOptions}; use mito2::read; use mito2::read::Source; use mito2::read::dedup::DedupReader; @@ -156,7 +156,11 @@ async fn flush(mem: &SimpleBulkMemtable) { } async fn flush_original(mem: &SimpleBulkMemtable) { - let iter = mem.iter(None, None, None).unwrap(); + let iter = mem + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); for b in iter { black_box(b.unwrap()); } diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index c39bbfa346..7494ec68ed 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -28,6 +28,7 @@ use mito_codec::key_values::KeyValue; pub use mito_codec::key_values::KeyValues; use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec}; use serde::{Deserialize, Serialize}; +use snafu::ensure; use store_api::metadata::RegionMetadataRef; use store_api::storage::{ColumnId, SequenceNumber, SequenceRange}; @@ -231,10 +232,17 @@ impl MemtableRanges { impl IterBuilder for MemtableRanges { fn build(&self, _metrics: Option) -> Result { - UnsupportedOperationSnafu { - err_msg: "MemtableRanges does not support build iterator", - } - .fail() + ensure!( + self.ranges.len() == 1, + UnsupportedOperationSnafu { + err_msg: format!( + "Building an iterator from MemtableRanges expects 1 range, but got {}", + self.ranges.len() + ), + } + ); + + 
self.ranges.values().next().unwrap().build_iter() } fn is_record_batch(&self) -> bool { @@ -256,20 +264,6 @@ pub trait Memtable: Send + Sync + fmt::Debug { /// Writes an encoded batch of into memtable. fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>; - /// Scans the memtable. - /// `projection` selects columns to read, `None` means reading all columns. - /// `filters` are the predicates to be pushed down to memtable. - /// - /// # Note - /// This method should only be used for tests. - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - predicate: Option, - sequence: Option, - ) -> Result; - /// Returns the ranges in the memtable. /// /// The returned map contains the range id and the range after applying the predicate. diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index 6056a42013..4dad4fb885 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -462,16 +462,6 @@ impl Memtable for BulkMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - _projection: Option<&[ColumnId]>, - _predicate: Option, - _sequence: Option, - ) -> Result { - todo!() - } - fn ranges( &self, projection: Option<&[ColumnId]>, diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs index febae46784..662bfd99f6 100644 --- a/src/mito2/src/memtable/partition_tree.rs +++ b/src/mito2/src/memtable/partition_tree.rs @@ -177,16 +177,6 @@ impl Memtable for PartitionTreeMemtable { .fail() } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - predicate: Option, - sequence: Option, - ) -> Result { - self.tree.read(projection, predicate, sequence, None) - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -396,8 +386,6 @@ mod tests { use api::v1::{Mutation, OpType, Rows, SemanticType}; use common_query::prelude::{greptime_timestamp, greptime_value}; use 
common_time::Timestamp; - use datafusion_common::Column; - use datafusion_expr::{BinaryExpr, Expr, Literal, Operator}; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::Vector; use datatypes::scalars::ScalarVector; @@ -548,7 +536,10 @@ mod tests { let expect = (0..100).collect::>(); let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1); memtable.write(&kvs).unwrap(); - let iter = memtable.iter(Some(&[3]), None, None).unwrap(); + let ranges = memtable + .ranges(Some(&[3]), RangesOptions::default()) + .unwrap(); + let iter = ranges.build(None).unwrap(); let mut v0_all = vec![]; for res in iter { @@ -625,41 +616,6 @@ mod tests { assert_eq!(expect, read); } - #[test] - fn test_memtable_filter() { - let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![0, 1], false)); - // Try to build a memtable via the builder. - let memtable = PartitionTreeMemtableBuilder::new( - PartitionTreeConfig { - index_max_keys_per_shard: 40, - ..Default::default() - }, - None, - ) - .build(1, &metadata); - - for i in 0..100 { - let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect(); - let kvs = - memtable_util::build_key_values(&metadata, "hello".to_string(), i, ×tamps, 1); - memtable.write(&kvs).unwrap(); - } - - for i in 0..100 { - let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect(); - let expr = Expr::BinaryExpr(BinaryExpr { - left: Box::new(Expr::Column(Column::from_name("k1"))), - op: Operator::Eq, - right: Box::new((i as u32).lit()), - }); - let iter = memtable - .iter(None, Some(Predicate::new(vec![expr])), None) - .unwrap(); - let read = collect_iter_timestamps(iter); - assert_eq!(timestamps, read); - } - } - #[test] fn test_deserialize_config() { let config = PartitionTreeConfig { @@ -811,7 +767,11 @@ mod tests { )) .unwrap(); - let mut reader = new_memtable.iter(None, None, None).unwrap(); + let mut reader = new_memtable + .ranges(None, RangesOptions::default()) + .unwrap() + 
.build(None) + .unwrap(); let batch = reader.next().unwrap().unwrap(); let pk = codec.decode(batch.primary_key()).unwrap().into_dense(); if let Value::String(s) = &pk[2] { @@ -916,7 +876,14 @@ mod tests { .unwrap(); memtable.freeze().unwrap(); assert_eq!( - collect_kvs(memtable.iter(None, None, None).unwrap(), &metadata), + collect_kvs( + memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(), + &metadata + ), ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect() ); let forked = memtable.fork(2, &metadata); @@ -925,7 +892,14 @@ mod tests { forked.write(&key_values(&metadata, keys.iter())).unwrap(); forked.freeze().unwrap(); assert_eq!( - collect_kvs(forked.iter(None, None, None).unwrap(), &metadata), + collect_kvs( + forked + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(), + &metadata + ), keys.iter() .map(|c| (c.to_string(), c.to_string())) .collect() @@ -936,7 +910,14 @@ mod tests { let keys = ["g", "e", "a", "f", "b", "c", "h"]; forked2.write(&key_values(&metadata, keys.iter())).unwrap(); - let kvs = collect_kvs(forked2.iter(None, None, None).unwrap(), &metadata); + let kvs = collect_kvs( + forked2 + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(), + &metadata, + ); let expected = keys .iter() .map(|c| (c.to_string(), c.to_string())) diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs index 4dcaa2bac0..6d91f00361 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable.rs @@ -213,22 +213,6 @@ impl Memtable for SimpleBulkMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - _predicate: Option, - sequence: Option, - ) -> error::Result { - let iter = self.create_iter(projection, sequence)?.build(None)?; - if self.merge_mode == MergeMode::LastNonNull { - let iter = LastNonNullIter::new(iter); - 
Ok(Box::new(iter)) - } else { - Ok(Box::new(iter)) - } - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -526,7 +510,11 @@ mod tests { )) .unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(2, batch.num_rows()); assert_eq!(2, batch.fields().len()); @@ -551,7 +539,11 @@ mod tests { )) .unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); assert_eq!(2, batch.fields().len()); @@ -565,7 +557,11 @@ mod tests { // Only project column 2 (f1) let projection = vec![2]; - let mut iter = memtable.iter(Some(&projection), None, None).unwrap(); + let mut iter = memtable + .ranges(Some(&projection), RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); @@ -592,7 +588,11 @@ mod tests { OpType::Put, )) .unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); // deduped to 1 row @@ -611,7 +611,11 @@ mod tests { let kv = kvs.iter().next().unwrap(); memtable.write_one(kv).unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); } @@ -745,7 +749,11 @@ mod tests { }; memtable.write_bulk(part).unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + 
.unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(2, batch.num_rows()); @@ -764,7 +772,11 @@ mod tests { OpType::Put, ); memtable.write(&kvs).unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(3, batch.num_rows()); assert_eq!( @@ -854,7 +866,15 @@ mod tests { // Filter with sequence 0 should only return first write let mut iter = memtable - .iter(None, None, Some(SequenceRange::LtEq { max: 0 })) + .ranges( + None, + RangesOptions { + sequence: Some(SequenceRange::LtEq { max: 0 }), + ..Default::default() + }, + ) + .unwrap() + .build(None) .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); diff --git a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs index b71a86c554..08edebdbb2 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs @@ -12,98 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashSet; -use std::time::Instant; - use store_api::metadata::RegionMetadataRef; -use store_api::storage::{ColumnId, SequenceRange}; -use crate::error; -use crate::memtable::simple_bulk_memtable::{Iter, SimpleBulkMemtable}; -use crate::memtable::time_series::Values; -use crate::memtable::{BoxedBatchIterator, IterBuilder, MemScanMetrics}; -use crate::read::dedup::LastNonNullIter; -use crate::region::options::MergeMode; +use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable; impl SimpleBulkMemtable { pub fn region_metadata(&self) -> RegionMetadataRef { self.region_metadata.clone() } - - pub(crate) fn create_iter( - &self, - projection: Option<&[ColumnId]>, - sequence: Option, - ) -> error::Result { - let mut series = self.series.write().unwrap(); - - let values = if series.is_empty() { - None - } else { - Some(series.compact(&self.region_metadata)?.clone()) - }; - let projection = self.build_projection(projection); - Ok(BatchIterBuilderDeprecated { - region_metadata: self.region_metadata.clone(), - values, - projection, - dedup: self.dedup, - sequence, - merge_mode: self.merge_mode, - }) - } -} - -#[derive(Clone)] -pub(crate) struct BatchIterBuilderDeprecated { - region_metadata: RegionMetadataRef, - values: Option, - projection: HashSet, - sequence: Option, - dedup: bool, - merge_mode: MergeMode, -} - -impl IterBuilder for BatchIterBuilderDeprecated { - fn build(&self, metrics: Option) -> error::Result { - let start_time = Instant::now(); - let Some(values) = self.values.clone() else { - return Ok(Box::new(Iter { batch: None })); - }; - - let maybe_batch = values - .to_batch( - &[], - &self.region_metadata, - &self.projection, - self.sequence, - self.dedup, - self.merge_mode, - ) - .map(Some) - .transpose(); - - // Collect metrics from the batch - if let Some(metrics) = metrics { - let (num_rows, num_batches) = match &maybe_batch { - Some(Ok(batch)) => (batch.num_rows(), 1), - _ => (0, 0), - }; - let inner = 
crate::memtable::MemScanMetricsData { - total_series: 1, - num_rows, - num_batches, - scan_cost: start_time.elapsed(), - }; - metrics.merge_inner(&inner); - } - - let iter = Iter { batch: maybe_batch }; - - if self.merge_mode == MergeMode::LastNonNull { - Ok(Box::new(LastNonNullIter::new(iter))) - } else { - Ok(Box::new(iter)) - } - } } diff --git a/src/mito2/src/memtable/time_partition.rs b/src/mito2/src/memtable/time_partition.rs index 6f11c813cb..ee695aceb8 100644 --- a/src/mito2/src/memtable/time_partition.rs +++ b/src/mito2/src/memtable/time_partition.rs @@ -827,6 +827,7 @@ mod tests { use super::*; use crate::memtable::partition_tree::PartitionTreeMemtableBuilder; use crate::memtable::time_series::TimeSeriesMemtableBuilder; + use crate::memtable::{IterBuilder, RangesOptions}; use crate::test_util::memtable_util::{self, collect_iter_timestamps}; #[test] @@ -852,7 +853,11 @@ mod tests { partitions.list_memtables(&mut memtables); assert_eq!(0, memtables[0].id()); - let iter = memtables[0].iter(None, None, None).unwrap(); + let iter = memtables[0] + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 3000, 5000, 6000, 7000], &timestamps[..]); } @@ -890,7 +895,11 @@ mod tests { let mut memtables = Vec::new(); partitions.list_memtables(&mut memtables); - let iter = memtables[0].iter(None, None, None).unwrap(); + let iter = memtables[0] + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[0, 2000, 3000, 4000, 5000, 7000], &timestamps[..]); let parts = partitions.list_partitions(); @@ -943,7 +952,12 @@ mod tests { let partitions = new_multi_partitions(&metadata); let parts = partitions.list_partitions(); - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps
= collect_iter_timestamps(iter); assert_eq!(0, parts[0].memtable.id()); assert_eq!( @@ -955,7 +969,12 @@ mod tests { parts[0].time_range.max_timestamp ); assert_eq!(&[0, 2000, 3000, 4000], &timestamps[..]); - let iter = parts[1].memtable.iter(None, None, None).unwrap(); + let iter = parts[1] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); assert_eq!(1, parts[1].memtable.id()); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[5000, 7000], &timestamps[..]); @@ -1273,7 +1292,12 @@ mod tests { let parts = partitions.list_partitions(); assert_eq!(1, parts.len()); - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 2000, 3000], &timestamps[..]); @@ -1284,11 +1308,21 @@ mod tests { let parts = partitions.list_partitions(); assert_eq!(2, parts.len()); // Check first partition [0, 5000) - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 2000, 3000, 4000], &timestamps[..]); // Check second partition [5000, 10000) - let iter = parts[1].memtable.iter(None, None, None).unwrap(); + let iter = parts[1] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[5000, 6000], &timestamps[..]); @@ -1301,7 +1335,12 @@ mod tests { assert_eq!(3, parts.len()); // Check new partition [10000, 15000) - let iter = parts[2].memtable.iter(None, None, None).unwrap(); + let iter = parts[2] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[11000, 12000], &timestamps[..]); @@ -1314,7
+1353,12 @@ mod tests { let parts = partitions.list_partitions(); assert_eq!(1, parts.len()); - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 5000, 9000], &timestamps[..]); } diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index 271a9343eb..97f5f3c9ce 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs @@ -267,39 +267,6 @@ impl Memtable for TimeSeriesMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - filters: Option, - sequence: Option, - ) -> Result { - let projection = if let Some(projection) = projection { - projection.iter().copied().collect() - } else { - self.region_metadata - .field_columns() - .map(|c| c.column_id) - .collect() - }; - - let iter = self.series_set.iter_series( - projection, - filters, - self.dedup, - self.merge_mode, - sequence, - None, - )?; - - if self.merge_mode == MergeMode::LastNonNull { - let iter = LastNonNullIter::new(iter); - Ok(Box::new(iter)) - } else { - Ok(Box::new(iter)) - } - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -1798,7 +1765,9 @@ mod tests { *expected_ts.entry(ts).or_default() += if dedup { 1 } else { 2 }; } - let iter = memtable.iter(None, None, None).unwrap(); + let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); + let range = ranges.ranges.into_values().next().unwrap(); + let iter = range.build_iter().unwrap(); let mut read = HashMap::new(); for ts in iter @@ -1838,7 +1807,11 @@ mod tests { let memtable = TimeSeriesMemtable::new(schema, 42, None, true, MergeMode::LastRow); memtable.write(&kvs).unwrap(); - let iter = memtable.iter(Some(&[3]), None, None).unwrap(); + let iter = memtable + .ranges(Some(&[3]), RangesOptions::default()) + .unwrap() +
.build(None) + .unwrap(); let mut v0_all = vec![]; @@ -1917,7 +1890,11 @@ mod tests { barrier.wait(); for _ in 0..10 { - let iter = memtable.iter(None, None, None).unwrap(); + let iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); for batch_result in iter { let _ = batch_result.unwrap(); } @@ -1936,7 +1913,11 @@ mod tests { handle.join().unwrap(); } - let iter = memtable.iter(None, None, None).unwrap(); + let iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let mut series_count = 0; let mut row_count = 0; diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs index 7ddac4ee0d..58ea49fa41 100644 --- a/src/mito2/src/test_util/memtable_util.rs +++ b/src/mito2/src/test_util/memtable_util.rs @@ -83,16 +83,6 @@ impl Memtable for EmptyMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - _projection: Option<&[ColumnId]>, - _filters: Option, - _sequence: Option, - ) -> Result { - Ok(Box::new(std::iter::empty())) - } - fn ranges( &self, _projection: Option<&[ColumnId]>, From dd82fcac00856a6dc3317fa4920b490bba959b84 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 16 Mar 2026 17:56:34 +0800 Subject: [PATCH 012/195] chore: update visibility of BatchToRecordBatchAdapter::new (#7817) --- src/mito2/src/read/batch_adapter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mito2/src/read/batch_adapter.rs b/src/mito2/src/read/batch_adapter.rs index 461dbeba69..4698229c5b 100644 --- a/src/mito2/src/read/batch_adapter.rs +++ b/src/mito2/src/read/batch_adapter.rs @@ -59,7 +59,7 @@ impl BatchToRecordBatchAdapter { /// - `metadata`: region metadata describing the schema. /// - `codec`: codec for decoding the encoded primary key bytes. /// - `read_column_ids`: projected column ids to read. 
- pub(crate) fn new( + pub fn new( iter: BoxedBatchIterator, metadata: RegionMetadataRef, codec: Arc, From 5a37e58b4f4c4475e251d15e57436bb78acfe167 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Tue, 17 Mar 2026 11:53:20 +0800 Subject: [PATCH 013/195] feat(mito2): add partition range cache infrastructure (#7798) * feat: add partition range cache infra Signed-off-by: evenyag * refactor: optimize scan request fingerprint cloning Signed-off-by: evenyag * refactor: merge loops Signed-off-by: evenyag * chore: more docs Signed-off-by: evenyag * chore: update estimated size method and comment Signed-off-by: evenyag * chore: fix clippy Signed-off-by: evenyag * feat: only cache when we scan files Signed-off-by: evenyag * fix: address PR review comments for partition range cache - Remove TimeSeriesDistribution from fingerprint as it only affects yield order - Disable range cache when dyn filters are present since they change at runtime Signed-off-by: evenyag * chore: fmt code Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/mito2/src/cache.rs | 134 ++++++++++++++++ src/mito2/src/read.rs | 1 + src/mito2/src/read/range_cache.rs | 252 ++++++++++++++++++++++++++++++ src/mito2/src/read/scan_region.rs | 243 +++++++++++++++++++++++++++- src/mito2/src/region/options.rs | 2 +- 5 files changed, 629 insertions(+), 3 deletions(-) create mode 100644 src/mito2/src/read/range_cache.rs diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index 3ad71d2a61..e232489768 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -49,6 +49,7 @@ use crate::cache::write_cache::WriteCacheRef; use crate::memtable::record_batch_estimated_size; use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS}; use crate::read::Batch; +use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue}; use crate::sst::file::{RegionFileId, RegionIndexId}; use crate::sst::parquet::reader::MetadataCacheMetrics; @@ -64,6 +65,8 @@ const FILE_TYPE: &str = "file"; const 
INDEX_TYPE: &str = "index"; /// Metrics type key for selector result cache. const SELECTOR_RESULT_TYPE: &str = "selector_result"; +/// Metrics type key for range scan result cache. +const RANGE_RESULT_TYPE: &str = "range_result"; /// Cache strategies that may only enable a subset of caches. #[derive(Clone)] @@ -223,6 +226,32 @@ impl CacheStrategy { } } + /// Calls [CacheManager::get_range_result()]. + /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled]. + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn get_range_result( + &self, + key: &RangeScanCacheKey, + ) -> Option> { + match self { + CacheStrategy::EnableAll(cache_manager) => cache_manager.get_range_result(key), + CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None, + } + } + + /// Calls [CacheManager::put_range_result()]. + /// It does nothing if the strategy isn't [CacheStrategy::EnableAll]. + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn put_range_result( + &self, + key: RangeScanCacheKey, + result: Arc, + ) { + if let CacheStrategy::EnableAll(cache_manager) = self { + cache_manager.put_range_result(key, result); + } + } + /// Calls [CacheManager::write_cache()]. /// It returns None if the strategy is [CacheStrategy::Disabled]. pub fn write_cache(&self) -> Option<&WriteCacheRef> { @@ -324,6 +353,9 @@ pub struct CacheManager { puffin_metadata_cache: Option, /// Cache for time series selectors. selector_result_cache: Option, + /// Cache for range scan outputs in flat format. + #[cfg_attr(not(test), allow(dead_code))] + range_result_cache: Option, /// Cache for index result. index_result_cache: Option, } @@ -512,6 +544,32 @@ impl CacheManager { } } + /// Gets cached result for range scan. 
+ #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn get_range_result( + &self, + key: &RangeScanCacheKey, + ) -> Option> { + self.range_result_cache + .as_ref() + .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE)) + } + + /// Puts range scan result into the cache. + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn put_range_result( + &self, + key: RangeScanCacheKey, + result: Arc, + ) { + if let Some(cache) = &self.range_result_cache { + CACHE_BYTES + .with_label_values(&[RANGE_RESULT_TYPE]) + .add(range_result_cache_weight(&key, &result).into()); + cache.insert(key, result); + } + } + /// Gets the write cache. pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> { self.write_cache.as_ref() @@ -562,6 +620,7 @@ pub struct CacheManagerBuilder { puffin_metadata_size: u64, write_cache: Option, selector_result_cache_size: u64, + range_result_cache_size: u64, } impl CacheManagerBuilder { @@ -625,6 +684,12 @@ impl CacheManagerBuilder { self } + /// Sets range result cache size. + pub fn range_result_cache_size(mut self, bytes: u64) -> Self { + self.range_result_cache_size = bytes; + self + } + /// Builds the [CacheManager]. 
pub fn build(self) -> CacheManager { fn to_str(cause: RemovalCause) -> &'static str { @@ -712,6 +777,21 @@ impl CacheManagerBuilder { }) .build() }); + let range_result_cache = (self.range_result_cache_size != 0).then(|| { + Cache::builder() + .max_capacity(self.range_result_cache_size) + .weigher(range_result_cache_weight) + .eviction_listener(|k, v, cause| { + let size = range_result_cache_weight(&k, &v); + CACHE_BYTES + .with_label_values(&[RANGE_RESULT_TYPE]) + .sub(size.into()); + CACHE_EVICTION + .with_label_values(&[RANGE_RESULT_TYPE, to_str(cause)]) + .inc(); + }) + .build() + }); CacheManager { sst_meta_cache, vector_cache, @@ -723,6 +803,7 @@ impl CacheManagerBuilder { vector_index_cache, puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)), selector_result_cache, + range_result_cache, index_result_cache, } } @@ -746,6 +827,10 @@ fn selector_result_cache_weight(k: &SelectorResultKey, v: &Arc) -> u32 { + (k.estimated_size() + v.estimated_size()) as u32 +} + /// Updates cache hit/miss metrics. fn update_hit_miss(value: Option, cache_type: &str) -> Option { if value.is_some() { @@ -902,6 +987,8 @@ type VectorCache = Cache<(ConcreteDataType, Value), VectorRef>; type PageCache = Cache>; /// Maps (file id, row group id, time series row selector) to [SelectorResultValue]. type SelectorResultCache = Cache>; +/// Maps partition-range scan key to cached flat batches. 
+type RangeResultCache = Cache>; #[cfg(test)] mod tests { @@ -916,6 +1003,9 @@ mod tests { use crate::cache::index::bloom_filter_index::Tag; use crate::cache::index::result_cache::PredicateKey; use crate::cache::test_util::parquet_meta; + use crate::read::range_cache::{ + RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder, + }; use crate::sst::parquet::row_selection::RowGroupSelection; #[tokio::test] @@ -1028,6 +1118,50 @@ mod tests { assert!(cache.get_selector_result(&key).is_some()); } + #[test] + fn test_range_result_cache() { + let cache = Arc::new( + CacheManager::builder() + .range_result_cache_size(1024 * 1024) + .build(), + ); + + let key = RangeScanCacheKey { + region_id: RegionId::new(1, 1), + row_groups: vec![(FileId::random(), 0)], + scan: ScanRequestFingerprintBuilder { + read_column_ids: vec![], + read_column_types: vec![], + filters: vec!["tag_0 = 1".to_string()], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: true, + merge_mode: crate::region::options::MergeMode::LastRow, + partition_expr_version: 0, + } + .build(), + }; + let value = Arc::new(RangeScanCacheValue::new(Vec::new())); + + assert!(cache.get_range_result(&key).is_none()); + cache.put_range_result(key.clone(), value.clone()); + assert!(cache.get_range_result(&key).is_some()); + + let enable_all = CacheStrategy::EnableAll(cache.clone()); + assert!(enable_all.get_range_result(&key).is_some()); + + let compaction = CacheStrategy::Compaction(cache.clone()); + assert!(compaction.get_range_result(&key).is_none()); + compaction.put_range_result(key.clone(), value.clone()); + assert!(cache.get_range_result(&key).is_some()); + + let disabled = CacheStrategy::Disabled; + assert!(disabled.get_range_result(&key).is_none()); + disabled.put_range_result(key.clone(), value); + assert!(cache.get_range_result(&key).is_some()); + } + #[tokio::test] async fn test_evict_puffin_cache_clears_all_entries() { use std::collections::{BTreeMap, 
HashMap}; diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs index 5fbd63ce8b..240a99c247 100644 --- a/src/mito2/src/read.rs +++ b/src/mito2/src/read.rs @@ -27,6 +27,7 @@ pub mod projection; pub(crate) mod prune; pub(crate) mod pruner; pub mod range; +pub(crate) mod range_cache; pub mod scan_region; pub mod scan_util; pub(crate) mod seq_scan; diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs new file mode 100644 index 0000000000..5b90e68bae --- /dev/null +++ b/src/mito2/src/read/range_cache.rs @@ -0,0 +1,252 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Utilities for the partition range scan result cache. + +use std::mem; +use std::sync::Arc; + +use datatypes::arrow::record_batch::RecordBatch; +use datatypes::prelude::ConcreteDataType; +use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector}; + +use crate::memtable::record_batch_estimated_size; +use crate::region::options::MergeMode; + +/// Fingerprint of the scan request fields that affect partition range cache reuse. +/// +/// It records a normalized view of the projected columns and filters, plus +/// scan options that can change the returned rows. Schema-dependent metadata +/// and the partition expression version are included so cached results are not +/// reused across incompatible schema or partitioning changes. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct ScanRequestFingerprint { + /// Projection and filters without the time index and partition exprs. + inner: Arc, + /// Filters with the time index column. + time_filters: Option>>, + series_row_selector: Option, + append_mode: bool, + filter_deleted: bool, + merge_mode: MergeMode, + /// We keep the partition expr version to ensure we won't reuse the fingerprint after we change the partition expr. + /// We store the version instead of the whole partition expr or partition expr filters. + partition_expr_version: u64, +} + +#[derive(Debug)] +pub(crate) struct ScanRequestFingerprintBuilder { + pub(crate) read_column_ids: Vec, + pub(crate) read_column_types: Vec>, + pub(crate) filters: Vec, + pub(crate) time_filters: Vec, + pub(crate) series_row_selector: Option, + pub(crate) append_mode: bool, + pub(crate) filter_deleted: bool, + pub(crate) merge_mode: MergeMode, + pub(crate) partition_expr_version: u64, +} + +impl ScanRequestFingerprintBuilder { + pub(crate) fn build(self) -> ScanRequestFingerprint { + let Self { + read_column_ids, + read_column_types, + filters, + time_filters, + series_row_selector, + append_mode, + filter_deleted, + merge_mode, + partition_expr_version, + } = self; + + ScanRequestFingerprint { + inner: Arc::new(SharedScanRequestFingerprint { + read_column_ids, + read_column_types, + filters, + }), + time_filters: (!time_filters.is_empty()).then(|| Arc::new(time_filters)), + series_row_selector, + append_mode, + filter_deleted, + merge_mode, + partition_expr_version, + } + } +} + +/// Non-copiable struct of the fingerprint. +#[derive(Debug, PartialEq, Eq, Hash)] +struct SharedScanRequestFingerprint { + /// Column ids of the projection. + read_column_ids: Vec, + /// Column types of the projection. + /// We keep this to ensure we won't reuse the fingerprint after a schema change. + read_column_types: Vec>, + /// Filters without the time index column and region partition exprs. 
+ filters: Vec, +} + +impl ScanRequestFingerprint { + #[cfg(test)] + pub(crate) fn read_column_ids(&self) -> &[ColumnId] { + &self.inner.read_column_ids + } + + #[cfg(test)] + pub(crate) fn read_column_types(&self) -> &[Option] { + &self.inner.read_column_types + } + + #[cfg(test)] + pub(crate) fn filters(&self) -> &[String] { + &self.inner.filters + } + + #[cfg(test)] + pub(crate) fn time_filters(&self) -> &[String] { + self.time_filters + .as_deref() + .map(Vec::as_slice) + .unwrap_or(&[]) + } + + #[cfg(test)] + pub(crate) fn without_time_filters(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + time_filters: None, + series_row_selector: self.series_row_selector, + append_mode: self.append_mode, + filter_deleted: self.filter_deleted, + merge_mode: self.merge_mode, + partition_expr_version: self.partition_expr_version, + } + } + + pub(crate) fn estimated_size(&self) -> usize { + mem::size_of::() + + self.inner.read_column_ids.capacity() * mem::size_of::() + + self.inner.read_column_types.capacity() * mem::size_of::>() + + self.inner.filters.capacity() * mem::size_of::() + + self + .inner + .filters + .iter() + .map(|filter| filter.capacity()) + .sum::() + + self.time_filters.as_ref().map_or(0, |filters| { + mem::size_of::>() + + filters.capacity() * mem::size_of::() + + filters + .iter() + .map(|filter| filter.capacity()) + .sum::() + }) + } +} + +/// Cache key for range scan outputs. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct RangeScanCacheKey { + pub(crate) region_id: RegionId, + /// Sorted (file_id, row_group_index) pairs that uniquely identify the covered data. + pub(crate) row_groups: Vec<(FileId, i64)>, + pub(crate) scan: ScanRequestFingerprint, +} + +impl RangeScanCacheKey { + pub(crate) fn estimated_size(&self) -> usize { + mem::size_of::() + + self.row_groups.capacity() * mem::size_of::<(FileId, i64)>() + + self.scan.estimated_size() + } +} + +/// Cached result for one range scan. 
+pub(crate) struct RangeScanCacheValue { + pub(crate) batches: Vec, +} + +impl RangeScanCacheValue { + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn new(batches: Vec) -> Self { + Self { batches } + } + + pub(crate) fn estimated_size(&self) -> usize { + mem::size_of::() + + self.batches.capacity() * mem::size_of::() + + self + .batches + .iter() + .map(record_batch_estimated_size) + .sum::() + } +} + +#[cfg(test)] +mod tests { + use store_api::storage::TimeSeriesRowSelector; + + use super::*; + + #[test] + fn normalizes_and_clears_time_filters() { + let normalized = ScanRequestFingerprintBuilder { + read_column_ids: vec![1, 2], + read_column_types: vec![None, None], + filters: vec!["k0 = 'foo'".to_string()], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: true, + merge_mode: MergeMode::LastRow, + partition_expr_version: 0, + } + .build(); + + assert!(normalized.time_filters().is_empty()); + + let fingerprint = ScanRequestFingerprintBuilder { + read_column_ids: vec![1, 2], + read_column_types: vec![None, None], + filters: vec!["k0 = 'foo'".to_string()], + time_filters: vec!["ts >= 1000".to_string()], + series_row_selector: Some(TimeSeriesRowSelector::LastRow), + append_mode: false, + filter_deleted: true, + merge_mode: MergeMode::LastRow, + partition_expr_version: 7, + } + .build(); + + let reset = fingerprint.without_time_filters(); + + assert_eq!(reset.read_column_ids(), fingerprint.read_column_ids()); + assert_eq!(reset.read_column_types(), fingerprint.read_column_types()); + assert_eq!(reset.filters(), fingerprint.filters()); + assert!(reset.time_filters().is_empty()); + assert_eq!(reset.series_row_selector, fingerprint.series_row_selector); + assert_eq!(reset.append_mode, fingerprint.append_mode); + assert_eq!(reset.filter_deleted, fingerprint.filter_deleted); + assert_eq!(reset.merge_mode, fingerprint.merge_mode); + assert_eq!( + reset.partition_expr_version, + fingerprint.partition_expr_version + ); + } 
+} diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 5d934afd2d..5cb2d75e25 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -55,6 +55,7 @@ use crate::metrics::READ_SST_COUNT; use crate::read::compat::{self, CompatBatch, FlatCompatBatch, PrimaryKeyCompatBatch}; use crate::read::projection::ProjectionMapper; use crate::read::range::{FileRangeBuilder, MemRangeBuilder, RangeMeta, RowGroupIndex}; +use crate::read::range_cache::ScanRequestFingerprint; use crate::read::seq_scan::SeqScan; use crate::read::series_scan::SeriesScan; use crate::read::stream::ScanBatchStream; @@ -815,7 +816,7 @@ pub struct ScanInput { /// But this read columns might also include non-projected columns needed for filtering. pub(crate) read_column_ids: Vec, /// Time range filter for time index. - time_range: Option, + pub(crate) time_range: Option, /// Predicate to push down. pub(crate) predicate: PredicateGroup, /// Region partition expr applied at read time. @@ -1417,6 +1418,92 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { } } +/// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible +/// for partition range caching. 
+#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option { + let eligible = input.flat_format + && !input.compaction + && !input.files.is_empty() + && matches!(input.cache_strategy, CacheStrategy::EnableAll(_)); + + if !eligible { + return None; + } + + let metadata = input.region_metadata(); + let tag_names: HashSet<&str> = metadata + .column_metadatas + .iter() + .filter(|col| col.semantic_type == SemanticType::Tag) + .map(|col| col.column_schema.name.as_str()) + .collect(); + + let time_index_name = metadata.time_index_column().column_schema.name.clone(); + + let exprs = input + .predicate_group() + .predicate_without_region() + .map(|predicate| predicate.exprs()) + .unwrap_or_default(); + + let mut filters = Vec::new(); + let mut time_filters = Vec::new(); + let mut has_tag_filter = false; + let mut columns = HashSet::new(); + + for expr in exprs { + columns.clear(); + let is_time_only = match expr_to_columns(expr, &mut columns) { + Ok(()) if !columns.is_empty() => { + has_tag_filter |= columns + .iter() + .any(|col| tag_names.contains(col.name.as_str())); + columns.iter().all(|col| col.name == time_index_name) + } + _ => false, + }; + + if is_time_only { + time_filters.push(expr.to_string()); + } else { + filters.push(expr.to_string()); + } + } + + if !has_tag_filter { + // We only cache requests that have tag filters to avoid caching all series. + return None; + } + + // Ensure the filters are sorted for consistent fingerprinting. 
+ filters.sort_unstable(); + time_filters.sort_unstable(); + + Some( + crate::read::range_cache::ScanRequestFingerprintBuilder { + read_column_ids: input.read_column_ids.clone(), + read_column_types: input + .read_column_ids + .iter() + .map(|id| { + metadata + .column_by_id(*id) + .map(|col| col.column_schema.data_type.clone()) + }) + .collect(), + filters, + time_filters, + series_row_selector: input.series_row_selector, + append_mode: input.append_mode, + filter_deleted: input.filter_deleted, + merge_mode: input.merge_mode, + partition_expr_version: metadata.partition_expr_version, + } + .build(), + ) +} + /// Context shared by different streams from a scanner. /// It contains the input and ranges to scan. pub struct StreamContext { @@ -1763,10 +1850,15 @@ mod tests { use datafusion::physical_plan::expressions::lit as physical_lit; use datafusion_expr::{col, lit}; - use store_api::storage::ScanRequest; + use datatypes::value::Value; + use partition::expr::col as partition_col; + use store_api::metadata::RegionMetadataBuilder; + use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector}; use super::*; + use crate::cache::CacheManager; use crate::memtable::time_partition::TimePartitions; + use crate::read::range_cache::ScanRequestFingerprintBuilder; use crate::region::options::RegionOptions; use crate::region::version::VersionBuilder; use crate::sst::FormatType; @@ -1804,6 +1896,26 @@ mod tests { ) } + async fn new_scan_input(metadata: RegionMetadataRef, filters: Vec) -> ScanInput { + let env = SchedulerEnv::new().await; + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); + let file = FileHandle::new( + crate::sst::file::FileMeta::default(), + Arc::new(crate::sst::file_purger::NoopFilePurger), + ); + + ScanInput::new(env.access_layer.clone(), mapper) + .with_predicate(predicate) + .with_cache(CacheStrategy::EnableAll(Arc::new( + 
CacheManager::builder() + .range_result_cache_size(1024) + .build(), + ))) + .with_flat_format(true) + .with_files(vec![file]) + } + #[tokio::test] async fn test_build_read_column_ids_includes_filters() { let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); @@ -1923,6 +2035,133 @@ mod tests { assert!(scan_region.use_flat_format()); } + #[tokio::test] + async fn test_build_scan_fingerprint_for_eligible_scan() { + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let input = new_scan_input( + metadata.clone(), + vec![ + col("ts").gt_eq(lit(1000)), + col("k0").eq(lit("foo")), + col("v0").gt(lit(1)), + ], + ) + .await + .with_distribution(Some(TimeSeriesDistribution::PerSeries)) + .with_series_row_selector(Some(TimeSeriesRowSelector::LastRow)) + .with_merge_mode(MergeMode::LastNonNull) + .with_filter_deleted(false); + + let fingerprint = build_scan_fingerprint(&input).unwrap(); + + let expected = ScanRequestFingerprintBuilder { + read_column_ids: input.read_column_ids.clone(), + read_column_types: vec![ + metadata + .column_by_id(0) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(2) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(3) + .map(|col| col.column_schema.data_type.clone()), + ], + filters: vec![ + col("k0").eq(lit("foo")).to_string(), + col("v0").gt(lit(1)).to_string(), + ], + time_filters: vec![col("ts").gt_eq(lit(1000)).to_string()], + series_row_selector: Some(TimeSeriesRowSelector::LastRow), + append_mode: false, + filter_deleted: false, + merge_mode: MergeMode::LastNonNull, + partition_expr_version: 0, + } + .build(); + assert_eq!(expected, fingerprint); + } + + #[tokio::test] + async fn test_build_scan_fingerprint_requires_tag_filter() { + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let input = new_scan_input( + metadata, + vec![col("ts").gt_eq(lit(1000)), col("v0").gt(lit(1))], + ) + .await; + + 
assert!(build_scan_fingerprint(&input).is_none()); + } + + #[tokio::test] + async fn test_build_scan_fingerprint_respects_scan_eligibility() { + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let filters = vec![col("k0").eq(lit("foo"))]; + + let disabled = ScanInput::new( + SchedulerEnv::new().await.access_layer.clone(), + ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(), + ) + .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap()) + .with_flat_format(true); + assert!(build_scan_fingerprint(&disabled).is_none()); + + let non_flat = new_scan_input(metadata.clone(), filters.clone()) + .await + .with_flat_format(false); + assert!(build_scan_fingerprint(&non_flat).is_none()); + + let compaction = new_scan_input(metadata.clone(), filters.clone()) + .await + .with_compaction(true); + assert!(build_scan_fingerprint(&compaction).is_none()); + + // No files to read. + let no_files = new_scan_input(metadata, filters).await.with_files(vec![]); + assert!(build_scan_fingerprint(&no_files).is_none()); + } + + #[tokio::test] + async fn test_build_scan_fingerprint_tracks_schema_and_partition_expr_changes() { + let base = metadata_with_primary_key(vec![0, 1], false); + let mut builder = RegionMetadataBuilder::from_existing(base); + let partition_expr = partition_col("k0") + .gt_eq(Value::String("foo".into())) + .as_json_str() + .unwrap(); + builder.partition_expr_json(Some(partition_expr)); + let metadata = Arc::new(builder.build_without_validation().unwrap()); + + let input = new_scan_input(metadata.clone(), vec![col("k0").eq(lit("foo"))]).await; + let fingerprint = build_scan_fingerprint(&input).unwrap(); + + let expected = ScanRequestFingerprintBuilder { + read_column_ids: input.read_column_ids.clone(), + read_column_types: vec![ + metadata + .column_by_id(0) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(2) + .map(|col| col.column_schema.data_type.clone()), + metadata + 
.column_by_id(3) + .map(|col| col.column_schema.data_type.clone()), + ], + filters: vec![col("k0").eq(lit("foo")).to_string()], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: true, + merge_mode: MergeMode::LastRow, + partition_expr_version: metadata.partition_expr_version, + } + .build(); + assert_eq!(expected, fingerprint); + assert_ne!(0, metadata.partition_expr_version); + } + #[test] fn test_update_dyn_filters_with_empty_base_predicates() { let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); diff --git a/src/mito2/src/region/options.rs b/src/mito2/src/region/options.rs index 0fe0a8f12a..fcf68a9216 100644 --- a/src/mito2/src/region/options.rs +++ b/src/mito2/src/region/options.rs @@ -50,7 +50,7 @@ pub(crate) fn parse_wal_options( } /// Mode to handle duplicate rows while merging. -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, EnumString)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, EnumString)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum MergeMode { From e0aadffb911cece5988bf981a126b2a744337490 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Tue, 17 Mar 2026 15:55:48 +0800 Subject: [PATCH 014/195] feat: add flat last row reader to the final stream (#7818) Signed-off-by: evenyag --- src/mito2/src/engine/row_selector_test.rs | 25 +++++++++++---- src/mito2/src/read/last_row.rs | 38 ++++++++++++++++++++++- src/mito2/src/read/seq_scan.rs | 9 +++++- 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/src/mito2/src/engine/row_selector_test.rs b/src/mito2/src/engine/row_selector_test.rs index 317ede5a97..d79152e57f 100644 --- a/src/mito2/src/engine/row_selector_test.rs +++ b/src/mito2/src/engine/row_selector_test.rs @@ -24,7 +24,7 @@ use crate::test_util::{ CreateRequestBuilder, TestEnv, build_rows_for_key, flush_region, put_rows, rows_schema, }; -async fn test_last_row(append_mode: bool) 
{ +async fn test_last_row(append_mode: bool, flat_format: bool) { let mut env = TestEnv::new().await; let engine = env.create_engine(MitoConfig::default()).await; let region_id = RegionId::new(1, 1); @@ -39,9 +39,12 @@ async fn test_last_row(append_mode: bool) { env.get_kv_backend(), ) .await; - let request = CreateRequestBuilder::new() - .insert_option("append_mode", &append_mode.to_string()) - .build(); + let mut request_builder = + CreateRequestBuilder::new().insert_option("append_mode", &append_mode.to_string()); + if flat_format { + request_builder = request_builder.insert_option("sst_format", "flat"); + } + let request = request_builder.build(); let column_schemas = rows_schema(&request); engine .handle_request(region_id, RegionRequest::Create(request)) @@ -106,10 +109,20 @@ async fn test_last_row(append_mode: bool) { #[tokio::test] async fn test_last_row_append_mode_disabled() { - test_last_row(false).await; + test_last_row(false, false).await; } #[tokio::test] async fn test_last_row_append_mode_enabled() { - test_last_row(true).await; + test_last_row(true, false).await; +} + +#[tokio::test] +async fn test_last_row_flat_format_append_mode_disabled() { + test_last_row(false, true).await; +} + +#[tokio::test] +async fn test_last_row_flat_format_append_mode_enabled() { + test_last_row(true, true).await; } diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index c2336f218d..0c13c120a0 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -21,6 +21,7 @@ use datatypes::arrow::array::{Array, BinaryArray}; use datatypes::arrow::compute::concat_batches; use datatypes::arrow::record_batch::RecordBatch; use datatypes::vectors::UInt32Vector; +use futures::{Stream, TryStreamExt}; use snafu::ResultExt; use store_api::storage::{FileId, TimeSeriesRowSelector}; @@ -30,7 +31,7 @@ use crate::cache::{ }; use crate::error::{ComputeArrowSnafu, Result}; use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice; 
-use crate::read::{Batch, BatchReader, BoxedBatchReader}; +use crate::read::{Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream}; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; use crate::sst::parquet::flat_format::{primary_key_column_index, time_index_column_index}; use crate::sst::parquet::format::{PrimaryKeyArray, primary_key_offsets}; @@ -610,6 +611,41 @@ impl FlatLastTimestampSelector { } } +/// Reader that keeps only the last row of each time series from a flat RecordBatch stream. +/// Assumes input is sorted, deduped, and contains no delete operations. +pub(crate) struct FlatLastRowReader { + stream: BoxedRecordBatchStream, + selector: FlatLastTimestampSelector, + pending: BatchBuffer, +} + +impl FlatLastRowReader { + /// Creates a new `FlatLastRowReader`. + pub(crate) fn new(stream: BoxedRecordBatchStream) -> Self { + Self { + stream, + selector: FlatLastTimestampSelector::default(), + pending: BatchBuffer::new(), + } + } + + /// Converts the reader into a stream of RecordBatches. + pub(crate) fn into_stream(mut self) -> impl Stream> { + async_stream::try_stream! { + while let Some(batch) = self.stream.try_next().await? { + self.selector.on_next(batch, &mut self.pending)?; + if self.pending.is_full() { + yield self.pending.concat()?; + } + } + self.selector.finish(&mut self.pending)?; + if !self.pending.is_empty() { + yield self.pending.concat()?; + } + } + } +} + /// Gets the primary key bytes at `index` from the primary key dictionary column. 
fn primary_key_bytes_at(batch: &RecordBatch, pk_col_idx: usize, index: usize) -> &[u8] { let pk_dict = batch diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index d2be17cc83..a1b3b8f350 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -39,7 +39,7 @@ use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, Un use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeReader; -use crate::read::last_row::LastRowReader; +use crate::read::last_row::{FlatLastRowReader, LastRowReader}; use crate::read::merge::MergeReaderBuilder; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::range::RangeMeta; @@ -289,6 +289,13 @@ impl SeqScan { Box::pin(reader.into_stream()) as _ }; + let reader = match &stream_ctx.input.series_row_selector { + Some(TimeSeriesRowSelector::LastRow) => { + Box::pin(FlatLastRowReader::new(reader).into_stream()) as _ + } + None => reader, + }; + Ok(reader) } From dc98e0215bd19312f136dfecd5f3d64fc26023b7 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Tue, 17 Mar 2026 19:28:06 +0800 Subject: [PATCH 015/195] feat(metric-engine): support bulk inserts with put fallback (#7792) * feat(metric-engine): support bulk inserts Implement `RegionRequest::BulkInserts` to support efficient columnar data ingestion in the metric engine. Key changes: - Implement `bulk_insert_region` to handle logical-to-physical region mapping and dispatch writes. - Add `batch_modifier` for `RecordBatch` transformations, specifically for `__tsid` generation and sparse primary key encoding. - Integrate `BulkInserts` into the `MetricEngine` request handling logic. - Provide a row-based fallback mechanism if the underlying storage doesn't support bulk writes. 
Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: ### Update `bulk_insert.rs` to Support Partition Expression Version - **Enhancements**: - Added support for `partition_expr_version` in `RegionBulkInsertsRequest` and `RegionPutRequest`. - Modified the handling of `partition_expr_version` to be dynamically set from the `request` object. Files affected: - `src/metric-engine/src/engine/bulk_insert.rs` Signed-off-by: Lei, HUANG * fix: cargo lock revert Signed-off-by: Lei, HUANG * add doc for conversions Signed-off-by: Lei, HUANG * chore: simplify test Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: ### Refactor `bulk_insert.rs` in `metric-engine` - **Refactor Functionality**: - Replaced `resolve_tag_columns` with `resolve_tag_columns_from_metadata` to streamline tag column resolution. - Moved logic for resolving tag columns directly into `resolve_tag_columns_from_metadata`, removing the need for an external function call. - **Enhancements**: - Improved error handling and context provision for missing physical regions and columns. - Optimized tag column sorting and index management within the batch processing logic. Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: ### Refactor `record_batch_to_rows` Function in `bulk_insert.rs` - Simplified the `record_batch_to_rows` function by removing the `logical_metadata` parameter and directly validating column types within the function. - Enhanced error handling for timestamp, value, and tag columns by checking their data types and providing detailed error messages. - Replaced the use of `Helper::try_into_vector` with direct downcasting to `TimestampMillisecondArray`, `Float64Array`, and `StringArray` for improved type safety and clarity. - Updated the construction of `api::v1::Rows` to directly handle null values and construct `api::v1::Value` objects accordingly. 
Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: ## Commit Message Refactor `bulk_insert.rs` to optimize state access - Moved the state read operation inside a new block to limit its scope and improve code clarity. - Adjusted logic for processing `tag_columns` and `non_tag_indices` to work within the new block structure. Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: ### Refactor `compute_tsid_array` Function - **Refactored `compute_tsid_array` function**: Modified the function signature to accept `tag_arrays` as a parameter instead of building it internally. This change affects the following files: - `src/metric-engine/src/batch_modifier.rs` - **Updated test cases**: Adjusted test cases to accommodate the new `compute_tsid_array` function signature by passing `tag_arrays` explicitly. Signed-off-by: Lei, HUANG * docs: add doc for bulk_insert_region Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: ### Commit Message Refactor `bulk_insert.rs` in `metric-engine`: - Removed error handling for unsupported status codes in `write_data` method. - Eliminated `record_batch_to_rows` function, simplifying the data insertion process. - Streamlined the `write_data` method by removing fallback logic for unsupported operations. Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: - **Optimize Primary Key Construction**: Refactored `modify_batch_sparse` in `batch_modifier.rs` to use `BinaryBuilder` for more efficient primary key construction. - **Add Fallback for Unsupported Bulk Inserts**: Updated `bulk_insert.rs` to handle unsupported bulk inserts by converting record batches to rows and using `RegionPutRequest`. - **Implement Record Batch to Rows Conversion**: Added `record_batch_to_rows` function in `bulk_insert.rs` to convert `RecordBatch` to `api::v1::Rows` for fallback operations. 
Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: Add test for handling null values in `record_batch_to_rows` - Added a new test `test_record_batch_to_rows_with_null_values` in `bulk_insert.rs` to verify the handling of null values in the `record_batch_to_rows` function. - The test checks the conversion of a `RecordBatch` with null values in various fields to ensure correct row creation and schema handling. Signed-off-by: Lei, HUANG * feat/metric-engine-bulk-insert: Add fallback path for unsupported status and improve error context handling - **`bulk_insert.rs`**: - Added a fallback path for `PartitionTreeMemtable` in case of unsupported status code. - Enhanced error handling by using `with_context` for better error messages when timestamp and value columns are not found in `RecordBatch`. Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- Cargo.lock | 1 + src/metric-engine/Cargo.toml | 1 + src/metric-engine/src/batch_modifier.rs | 426 +++++++++++ src/metric-engine/src/engine.rs | 6 +- src/metric-engine/src/engine/bulk_insert.rs | 783 ++++++++++++++++++++ src/metric-engine/src/engine/put.rs | 2 +- src/metric-engine/src/lib.rs | 1 + 7 files changed, 1216 insertions(+), 4 deletions(-) create mode 100644 src/metric-engine/src/batch_modifier.rs create mode 100644 src/metric-engine/src/engine/bulk_insert.rs diff --git a/Cargo.lock b/Cargo.lock index 1f65f1289c..605b037fc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7886,6 +7886,7 @@ dependencies = [ "common-base", "common-error", "common-function", + "common-grpc", "common-macro", "common-meta", "common-query", diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml index 567210b952..5b561997ab 100644 --- a/src/metric-engine/Cargo.toml +++ b/src/metric-engine/Cargo.toml @@ -17,6 +17,7 @@ bytes.workspace = true fxhash = "0.2" common-base.workspace = true common-error.workspace = true +common-grpc.workspace = true common-macro.workspace = true common-query.workspace = true 
common-recordbatch.workspace = true diff --git a/src/metric-engine/src/batch_modifier.rs b/src/metric-engine/src/batch_modifier.rs new file mode 100644 index 0000000000..8a5774889b --- /dev/null +++ b/src/metric-engine/src/batch_modifier.rs @@ -0,0 +1,426 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::hash::Hasher; +use std::sync::Arc; + +use datatypes::arrow::array::{Array, BinaryBuilder, StringArray, UInt64Array}; +use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use datatypes::arrow::record_batch::RecordBatch; +use datatypes::value::ValueRef; +use fxhash::FxHasher; +use mito_codec::row_converter::SparsePrimaryKeyCodec; +use snafu::ResultExt; +use store_api::storage::ColumnId; +use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId}; + +use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu}; + +/// Info about a tag column for TSID computation and sparse primary key encoding. +#[allow(dead_code)] +pub(crate) struct TagColumnInfo { + /// Column name (used for label-name hash). + pub name: String, + /// Column index in the RecordBatch. + pub index: usize, + /// Column ID in the physical region. + pub column_id: ColumnId, +} + +/// Computes `__tsid` values for each row. 
+#[allow(dead_code)] +pub(crate) fn compute_tsid_array( + batch: &RecordBatch, + sorted_tag_columns: &[TagColumnInfo], + tag_arrays: &[&StringArray], +) -> UInt64Array { + let num_rows = batch.num_rows(); + + let label_name_hash = { + let mut hasher = FxHasher::default(); + for tag_col in sorted_tag_columns { + hasher.write(tag_col.name.as_bytes()); + hasher.write_u8(0xff); + } + hasher.finish() + }; + + let mut tsid_values = Vec::with_capacity(num_rows); + for row in 0..num_rows { + let has_null = tag_arrays.iter().any(|arr| arr.is_null(row)); + + let tsid = if !has_null { + let mut hasher = FxHasher::default(); + hasher.write_u64(label_name_hash); + for arr in tag_arrays { + hasher.write(arr.value(row).as_bytes()); + hasher.write_u8(0xff); + } + hasher.finish() + } else { + let mut name_hasher = FxHasher::default(); + for (tc, arr) in sorted_tag_columns.iter().zip(tag_arrays.iter()) { + if !arr.is_null(row) { + name_hasher.write(tc.name.as_bytes()); + name_hasher.write_u8(0xff); + } + } + let row_label_hash = name_hasher.finish(); + + let mut val_hasher = FxHasher::default(); + val_hasher.write_u64(row_label_hash); + for arr in tag_arrays { + if !arr.is_null(row) { + val_hasher.write(arr.value(row).as_bytes()); + val_hasher.write_u8(0xff); + } + } + val_hasher.finish() + }; + + tsid_values.push(tsid); + } + + UInt64Array::from(tsid_values) +} + +fn build_tag_arrays<'a>( + batch: &'a RecordBatch, + sorted_tag_columns: &[TagColumnInfo], +) -> Vec<&'a StringArray> { + sorted_tag_columns + .iter() + .map(|tc| { + batch + .column(tc.index) + .as_any() + .downcast_ref::() + .expect("tag column must be utf8") + }) + .collect() +} + +/// Modifies a RecordBatch for sparse primary key encoding. 
+#[allow(dead_code)] +pub(crate) fn modify_batch_sparse( + batch: RecordBatch, + table_id: u32, + sorted_tag_columns: &[TagColumnInfo], + non_tag_column_indices: &[usize], +) -> Result { + let num_rows = batch.num_rows(); + let codec = SparsePrimaryKeyCodec::schemaless(); + let tag_arrays: Vec<&StringArray> = build_tag_arrays(&batch, sorted_tag_columns); + let tsid_array = compute_tsid_array(&batch, sorted_tag_columns, &tag_arrays); + + let mut pk_builder = BinaryBuilder::with_capacity(num_rows, 0); + let mut buffer = Vec::new(); + for row in 0..num_rows { + buffer.clear(); + let internal = [ + (ReservedColumnId::table_id(), ValueRef::UInt32(table_id)), + ( + ReservedColumnId::tsid(), + ValueRef::UInt64(tsid_array.value(row)), + ), + ]; + codec + .encode_to_vec(internal.into_iter(), &mut buffer) + .context(EncodePrimaryKeySnafu)?; + + let tags = sorted_tag_columns + .iter() + .zip(tag_arrays.iter()) + .filter(|(_, arr)| !arr.is_null(row)) + .map(|(tc, arr)| (tc.column_id, ValueRef::String(arr.value(row)))); + codec + .encode_to_vec(tags, &mut buffer) + .context(EncodePrimaryKeySnafu)?; + + pk_builder.append_value(&buffer); + } + + let pk_array = pk_builder.finish(); + + let mut fields = vec![Arc::new(Field::new( + PRIMARY_KEY_COLUMN_NAME, + DataType::Binary, + false, + ))]; + let mut columns: Vec> = vec![Arc::new(pk_array)]; + + for &idx in non_tag_column_indices { + fields.push(batch.schema().fields()[idx].clone()); + columns.push(batch.column(idx).clone()); + } + + let new_schema = Arc::new(ArrowSchema::new(fields)); + RecordBatch::try_new(new_schema, columns).map_err(|e| { + UnexpectedRequestSnafu { + reason: format!("Failed to build modified sparse RecordBatch: {e}"), + } + .build() + }) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; + use datatypes::arrow::array::{BinaryArray, Int64Array, StringArray}; + use 
datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use datatypes::arrow::record_batch::RecordBatch; + use store_api::codec::PrimaryKeyEncoding; + use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME; + + use super::*; + use crate::row_modifier::{RowModifier, RowsIter, TableIdInput}; + + fn build_sparse_test_batch() -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("greptime_timestamp", DataType::Int64, false), + Field::new("greptime_value", DataType::Float64, true), + Field::new("namespace", DataType::Utf8, true), + Field::new("host", DataType::Utf8, true), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1000])), + Arc::new(datatypes::arrow::array::Float64Array::from(vec![42.0])), + Arc::new(StringArray::from(vec!["greptimedb"])), + Arc::new(StringArray::from(vec!["127.0.0.1"])), + ], + ) + .unwrap() + } + + fn sparse_tag_columns() -> Vec { + vec![ + TagColumnInfo { + name: "host".to_string(), + index: 3, + column_id: 3, + }, + TagColumnInfo { + name: "namespace".to_string(), + index: 2, + column_id: 2, + }, + ] + } + + #[test] + fn test_compute_tsid_basic() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("namespace", DataType::Utf8, true), + Field::new("host", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec!["greptimedb"])), + Arc::new(StringArray::from(vec!["127.0.0.1"])), + ], + ) + .unwrap(); + + let tag_columns: Vec = vec![ + TagColumnInfo { + name: "host".to_string(), + index: 1, + column_id: 2, + }, + TagColumnInfo { + name: "namespace".to_string(), + index: 0, + column_id: 1, + }, + ]; + let tag_arrays = build_tag_arrays(&batch, &tag_columns); + let tsid_array = compute_tsid_array(&batch, &tag_columns, &tag_arrays); + + assert_eq!(tsid_array.value(0), 2721566936019240841); + } + + #[test] + fn test_compute_tsid_with_nulls() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("a", 
DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ])); + let batch_no_null = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["A"])), + Arc::new(StringArray::from(vec!["B"])), + ], + ) + .unwrap(); + let tag_cols_2: Vec = vec![ + TagColumnInfo { + name: "a".to_string(), + index: 0, + column_id: 1, + }, + TagColumnInfo { + name: "b".to_string(), + index: 1, + column_id: 2, + }, + ]; + let tag_arrays_2 = build_tag_arrays(&batch_no_null, &tag_cols_2); + let tsid_no_null = compute_tsid_array(&batch_no_null, &tag_cols_2, &tag_arrays_2); + + let schema3 = Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8, true), + ])); + let batch_with_null = RecordBatch::try_new( + schema3, + vec![ + Arc::new(StringArray::from(vec!["A"])), + Arc::new(StringArray::from(vec!["B"])), + Arc::new(StringArray::from(vec![None as Option<&str>])), + ], + ) + .unwrap(); + let tag_cols_3: Vec = vec![ + TagColumnInfo { + name: "a".to_string(), + index: 0, + column_id: 1, + }, + TagColumnInfo { + name: "b".to_string(), + index: 1, + column_id: 2, + }, + TagColumnInfo { + name: "c".to_string(), + index: 2, + column_id: 3, + }, + ]; + let tag_arrays_3 = build_tag_arrays(&batch_with_null, &tag_cols_3); + let tsid_with_null = compute_tsid_array(&batch_with_null, &tag_cols_3, &tag_arrays_3); + + assert_eq!(tsid_no_null.value(0), tsid_with_null.value(0)); + } + + #[test] + fn test_modify_batch_sparse() { + let batch = build_sparse_test_batch(); + let tag_columns = sparse_tag_columns(); + let non_tag_indices = vec![0, 1]; + let table_id: u32 = 1025; + + let modified = + modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap(); + + assert_eq!(modified.num_columns(), 3); + assert_eq!(modified.schema().field(0).name(), PRIMARY_KEY_COLUMN_NAME); + assert_eq!(modified.schema().field(1).name(), "greptime_timestamp"); + 
assert_eq!(modified.schema().field(2).name(), "greptime_value"); + } + + #[test] + fn test_modify_batch_sparse_matches_row_modifier() { + let batch = build_sparse_test_batch(); + let tag_columns = sparse_tag_columns(); + let non_tag_indices = vec![0, 1]; + let table_id: u32 = 1025; + let modified = + modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap(); + + let name_to_column_id: HashMap = [ + ("greptime_timestamp".to_string(), 0), + ("greptime_value".to_string(), 1), + ("namespace".to_string(), 2), + ("host".to_string(), 3), + ] + .into_iter() + .collect(); + + let rows = Rows { + schema: vec![ + ColumnSchema { + column_name: "greptime_timestamp".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "greptime_value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "namespace".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "host".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ], + rows: vec![Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(1000)), + }, + Value { + value_data: Some(ValueData::F64Value(42.0)), + }, + Value { + value_data: Some(ValueData::StringValue("greptimedb".to_string())), + }, + Value { + value_data: Some(ValueData::StringValue("127.0.0.1".to_string())), + }, + ], + }], + }; + + let row_iter = RowsIter::new(rows, &name_to_column_id); + let rows = RowModifier::default() + .modify_rows( + row_iter, + TableIdInput::Single(table_id), + PrimaryKeyEncoding::Sparse, + ) + .unwrap(); + let ValueData::BinaryValue(expected_pk) = + 
rows.rows[0].values[0].value_data.clone().unwrap() + else { + panic!("expected binary primary key"); + }; + + let actual_array = modified + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(actual_array.value(0), expected_pk.as_slice()); + } +} diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index 7a1efedac4..ba90ca960d 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -13,6 +13,7 @@ // limitations under the License. mod alter; +mod bulk_insert; mod catchup; mod close; mod create; @@ -288,9 +289,8 @@ impl RegionEngine for MetricEngine { debug_assert_eq!(region_id, resp_region_id); return response; } - RegionRequest::BulkInserts(_) => { - // todo(hl): find a way to support bulk inserts in metric engine. - UnsupportedRegionRequestSnafu { request }.fail() + RegionRequest::BulkInserts(bulk) => { + self.inner.bulk_insert_region(region_id, bulk).await } }; diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs new file mode 100644 index 0000000000..2a3c26c80c --- /dev/null +++ b/src/metric-engine/src/engine/bulk_insert.rs @@ -0,0 +1,783 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashSet; + +use api::v1::{ArrowIpc, ColumnDataType, SemanticType}; +use bytes::Bytes; +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_grpc::flight::{FlightEncoder, FlightMessage}; +use common_query::prelude::{greptime_timestamp, greptime_value}; +use datatypes::arrow::array::{Array, Float64Array, StringArray, TimestampMillisecondArray}; +use datatypes::arrow::record_batch::RecordBatch; +use snafu::{OptionExt, ensure}; +use store_api::codec::PrimaryKeyEncoding; +use store_api::metadata::RegionMetadataRef; +use store_api::region_request::{ + AffectedRows, RegionBulkInsertsRequest, RegionPutRequest, RegionRequest, +}; +use store_api::storage::RegionId; + +use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse}; +use crate::engine::MetricEngineInner; +use crate::error; +use crate::error::Result; + +impl MetricEngineInner { + /// Bulk-inserts logical rows into a metric region. + /// + /// This method accepts a `RegionBulkInsertsRequest` whose payload is a logical + /// `RecordBatch` (timestamp, value and tag columns) for the given logical `region_id`. + /// + /// The transformed batch is encoded to Arrow IPC and forwarded as a `BulkInserts` + /// request to the data region, along with the original `partition_expr_version`. + /// If the data region reports `StatusCode::Unsupported` for bulk inserts, the request + /// is transparently retried as a `Put` by converting the original logical batch into + /// `api::v1::Rows`, so callers observe the same semantics as `put_region`. + /// + /// Returns the number of affected rows, or `0` if the input batch is empty. 
+ pub async fn bulk_insert_region( + &self, + region_id: RegionId, + request: RegionBulkInsertsRequest, + ) -> Result { + ensure!( + !self.is_physical_region(region_id), + error::UnsupportedRegionRequestSnafu { + request: RegionRequest::BulkInserts(request), + } + ); + + let (physical_region_id, data_region_id, primary_key_encoding) = + self.find_data_region_meta(region_id)?; + + if primary_key_encoding != PrimaryKeyEncoding::Sparse { + return error::UnsupportedRegionRequestSnafu { + request: RegionRequest::BulkInserts(request), + } + .fail(); + } + + let batch = request.payload; + if batch.num_rows() == 0 { + return Ok(0); + } + + let logical_metadata = self + .logical_region_metadata(physical_region_id, region_id) + .await?; + let (tag_columns, non_tag_indices) = self.resolve_tag_columns_from_metadata( + region_id, + data_region_id, + &batch, + &logical_metadata, + )?; + let modified_batch = modify_batch_sparse( + batch.clone(), + region_id.table_id(), + &tag_columns, + &non_tag_indices, + )?; + let (schema, data_header, payload) = record_batch_to_ipc(&modified_batch)?; + + let partition_expr_version = request.partition_expr_version; + let request = RegionBulkInsertsRequest { + region_id: data_region_id, + payload: modified_batch, + raw_data: ArrowIpc { + schema, + data_header, + payload, + }, + partition_expr_version, + }; + match self + .data_region + .write_data(data_region_id, RegionRequest::BulkInserts(request)) + .await + { + Ok(affected_rows) => Ok(affected_rows), + Err(err) if err.status_code() == StatusCode::Unsupported => { + // todo(hl): fallback path for PartitionTreeMemtable, remove this once we remove it + let rows = record_batch_to_rows(&batch, region_id)?; + self.put_region( + region_id, + RegionPutRequest { + rows, + hint: None, + partition_expr_version, + }, + ) + .await + } + Err(err) => Err(err), + } + } + + fn resolve_tag_columns_from_metadata( + &self, + logical_region_id: RegionId, + data_region_id: RegionId, + batch: &RecordBatch, + 
logical_metadata: &RegionMetadataRef, + ) -> Result<(Vec, Vec)> { + let tag_names: HashSet<&str> = logical_metadata + .column_metadatas + .iter() + .filter_map(|column| { + if column.semantic_type == SemanticType::Tag { + Some(column.column_schema.name.as_str()) + } else { + None + } + }) + .collect(); + + let mut tag_columns = Vec::new(); + let mut non_tag_indices = Vec::new(); + { + let state = self.state.read().unwrap(); + let physical_columns = state + .physical_region_states() + .get(&data_region_id) + .context(error::PhysicalRegionNotFoundSnafu { + region_id: data_region_id, + })? + .physical_columns(); + + for (index, field) in batch.schema().fields().iter().enumerate() { + let name = field.name(); + let column_id = + *physical_columns + .get(name) + .with_context(|| error::ColumnNotFoundSnafu { + name: name.clone(), + region_id: logical_region_id, + })?; + if tag_names.contains(name.as_str()) { + tag_columns.push(TagColumnInfo { + name: name.clone(), + index, + column_id, + }); + } else { + non_tag_indices.push(index); + } + } + } + + tag_columns.sort_by(|a, b| a.name.cmp(&b.name)); + Ok((tag_columns, non_tag_indices)) + } +} + +fn record_batch_to_rows(batch: &RecordBatch, logical_region_id: RegionId) -> Result { + let schema_ref = batch.schema(); + let fields = schema_ref.fields(); + + let mut ts_idx = None; + let mut val_idx = None; + let mut tag_indices = Vec::new(); + + for (idx, field) in fields.iter().enumerate() { + if field.name() == greptime_timestamp() { + ts_idx = Some(idx); + if !matches!( + field.data_type(), + datatypes::arrow::datatypes::DataType::Timestamp( + datatypes::arrow::datatypes::TimeUnit::Millisecond, + _ + ) + ) { + return error::UnexpectedRequestSnafu { + reason: format!( + "Timestamp column '{}' in region {:?} has incompatible type: {:?}", + field.name(), + logical_region_id, + field.data_type() + ), + } + .fail(); + } + } else if field.name() == greptime_value() { + val_idx = Some(idx); + if !matches!( + field.data_type(), + 
datatypes::arrow::datatypes::DataType::Float64 + ) { + return error::UnexpectedRequestSnafu { + reason: format!( + "Value column '{}' in region {:?} has incompatible type: {:?}", + field.name(), + logical_region_id, + field.data_type() + ), + } + .fail(); + } + } else { + if !matches!( + field.data_type(), + datatypes::arrow::datatypes::DataType::Utf8 + ) { + return error::UnexpectedRequestSnafu { + reason: format!( + "Tag column '{}' in region {:?} must be Utf8, found: {:?}", + field.name(), + logical_region_id, + field.data_type() + ), + } + .fail(); + } + tag_indices.push(idx); + } + } + + let ts_idx = ts_idx.with_context(|| error::UnexpectedRequestSnafu { + reason: format!( + "Timestamp column '{}' not found in RecordBatch for region {:?}", + greptime_timestamp(), + logical_region_id + ), + })?; + let val_idx = val_idx.with_context(|| error::UnexpectedRequestSnafu { + reason: format!( + "Value column '{}' not found in RecordBatch for region {:?}", + greptime_value(), + logical_region_id + ), + })?; + + let mut schema = Vec::with_capacity(2 + tag_indices.len()); + schema.push(api::v1::ColumnSchema { + column_name: greptime_timestamp().to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + datatype_extension: None, + options: None, + }); + schema.push(api::v1::ColumnSchema { + column_name: greptime_value().to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }); + for &idx in &tag_indices { + let field = &fields[idx]; + schema.push(api::v1::ColumnSchema { + column_name: field.name().clone(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + datatype_extension: None, + options: None, + }); + } + + let ts_array = batch + .column(ts_idx) + .as_any() + .downcast_ref::() + .expect("validated as TimestampMillisecond"); + let val_array = batch + .column(val_idx) + .as_any() 
+ .downcast_ref::() + .expect("validated as Float64"); + let tag_arrays: Vec<&StringArray> = tag_indices + .iter() + .map(|&idx| { + batch + .column(idx) + .as_any() + .downcast_ref::() + .expect("validated as Utf8") + }) + .collect(); + + let num_rows = batch.num_rows(); + let mut rows = Vec::with_capacity(num_rows); + for row_idx in 0..num_rows { + let mut values = Vec::with_capacity(2 + tag_arrays.len()); + + if ts_array.is_null(row_idx) { + values.push(api::v1::Value { value_data: None }); + } else { + values.push(api::v1::Value { + value_data: Some(api::v1::value::ValueData::TimestampMillisecondValue( + ts_array.value(row_idx), + )), + }); + } + + if val_array.is_null(row_idx) { + values.push(api::v1::Value { value_data: None }); + } else { + values.push(api::v1::Value { + value_data: Some(api::v1::value::ValueData::F64Value( + val_array.value(row_idx), + )), + }); + } + + for arr in &tag_arrays { + if arr.is_null(row_idx) { + values.push(api::v1::Value { value_data: None }); + } else { + values.push(api::v1::Value { + value_data: Some(api::v1::value::ValueData::StringValue( + arr.value(row_idx).to_string(), + )), + }); + } + } + + rows.push(api::v1::Row { values }); + } + + Ok(api::v1::Rows { schema, rows }) +} + +fn record_batch_to_ipc(record_batch: &RecordBatch) -> Result<(Bytes, Bytes, Bytes)> { + let mut encoder = FlightEncoder::default(); + let schema = encoder.encode_schema(record_batch.schema().as_ref()); + let mut iter = encoder + .encode(FlightMessage::RecordBatch(record_batch.clone())) + .into_iter(); + + let Some(flight_data) = iter.next() else { + return error::UnexpectedRequestSnafu { + reason: "Failed to encode empty flight data", + } + .fail(); + }; + ensure!( + iter.next().is_none(), + error::UnexpectedRequestSnafu { + reason: "Bulk insert RecordBatch with dictionary arrays is unsupported".to_string(), + } + ); + + Ok(( + schema.data_header, + flight_data.data_header, + flight_data.data_body, + )) +} + +#[cfg(test)] +mod tests { + use 
std::assert_matches::assert_matches; + use std::sync::Arc; + + use api::v1::ArrowIpc; + use common_error::ext::ErrorExt; + use common_query::prelude::{greptime_timestamp, greptime_value}; + use common_recordbatch::RecordBatches; + use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; + use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use datatypes::arrow::record_batch::RecordBatch; + use store_api::metric_engine_consts::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING; + use store_api::path_utils::table_dir; + use store_api::region_engine::RegionEngine; + use store_api::region_request::{RegionBulkInsertsRequest, RegionPutRequest, RegionRequest}; + use store_api::storage::{RegionId, ScanRequest}; + + use super::record_batch_to_ipc; + use crate::error::Error; + use crate::test_util::{self, TestEnv}; + + fn build_logical_batch(start: usize, rows: usize) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("job", DataType::Utf8, true), + ])); + + let mut ts = Vec::with_capacity(rows); + let mut values = Vec::with_capacity(rows); + let mut tags = Vec::with_capacity(rows); + for i in start..start + rows { + ts.push(i as i64); + values.push(i as f64); + tags.push("tag_0".to_string()); + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(ts)), + Arc::new(Float64Array::from(values)), + Arc::new(StringArray::from(tags)), + ], + ) + .unwrap() + } + + fn build_bulk_request(logical_region_id: RegionId, batch: RecordBatch) -> RegionRequest { + let (schema, data_header, payload) = record_batch_to_ipc(&batch).unwrap(); + RegionRequest::BulkInserts(RegionBulkInsertsRequest { + region_id: logical_region_id, + payload: batch, + raw_data: ArrowIpc { + schema, + data_header, + payload, + }, + 
partition_expr_version: None, + }) + } + + async fn init_dense_metric_region(env: &TestEnv) -> RegionId { + let physical_region_id = env.default_physical_region_id(); + env.create_physical_region( + physical_region_id, + &TestEnv::default_table_dir(), + vec![( + MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING.to_string(), + "dense".to_string(), + )], + ) + .await; + + let logical_region_id = env.default_logical_region_id(); + let request = test_util::create_logical_region_request( + &["job"], + physical_region_id, + &table_dir("test", logical_region_id.table_id()), + ); + env.metric() + .handle_request(logical_region_id, RegionRequest::Create(request)) + .await + .unwrap(); + logical_region_id + } + + #[tokio::test] + async fn test_bulk_insert_empty_batch_returns_zero() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let batch = build_logical_batch(0, 0); + let request = RegionRequest::BulkInserts(RegionBulkInsertsRequest { + region_id: logical_region_id, + payload: batch, + raw_data: ArrowIpc::default(), + partition_expr_version: None, + }); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 0); + } + + #[tokio::test] + async fn test_bulk_insert_physical_region_rejected() { + let env = TestEnv::new().await; + env.init_metric_region().await; + + let physical_region_id = env.default_physical_region_id(); + let batch = build_logical_batch(0, 2); + let request = build_bulk_request(physical_region_id, batch); + + let err = env + .metric() + .handle_request(physical_region_id, request) + .await + .unwrap_err(); + let Some(err) = err.as_any().downcast_ref::() else { + panic!("unexpected error type"); + }; + assert_matches!(err, Error::UnsupportedRegionRequest { .. 
}); + } + + #[tokio::test] + async fn test_bulk_insert_unknown_column_errors() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("nonexistent_column", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(vec![0i64])), + Arc::new(Float64Array::from(vec![1.0])), + Arc::new(StringArray::from(vec!["val"])), + ], + ) + .unwrap(); + + let request = build_bulk_request(logical_region_id, batch); + let err = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap_err(); + let Some(err) = err.as_any().downcast_ref::() else { + panic!("unexpected error type"); + }; + assert_matches!(err, Error::ColumnNotFound { .. 
}); + } + + #[tokio::test] + async fn test_bulk_insert_multiple_tag_columns() { + let env = TestEnv::new().await; + let physical_region_id = env.default_physical_region_id(); + env.create_physical_region(physical_region_id, &TestEnv::default_table_dir(), vec![]) + .await; + let logical_region_id = env.default_logical_region_id(); + let request = test_util::create_logical_region_request( + &["host", "region"], + physical_region_id, + &table_dir("test", logical_region_id.table_id()), + ); + env.metric() + .handle_request(logical_region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("host", DataType::Utf8, true), + Field::new("region", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(vec![0i64, 1, 2])), + Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0])), + Arc::new(StringArray::from(vec!["h1", "h2", "h1"])), + Arc::new(StringArray::from(vec!["us-east", "us-west", "eu-west"])), + ], + ) + .unwrap(); + + let request = build_bulk_request(logical_region_id, batch); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 3); + + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 3); + } + + #[tokio::test] + async fn test_bulk_insert_accumulates_rows() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 3)); + let response = env + .metric() + 
.handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 3); + + let request = build_bulk_request(logical_region_id, build_logical_batch(3, 5)); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 5); + + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 8); + } + + #[tokio::test] + async fn test_bulk_insert_sparse_encoding() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 4)); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 4); + + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 4); + } + + #[tokio::test] + async fn test_bulk_insert_dense_encoding_rejected() { + let env = TestEnv::new().await; + let logical_region_id = init_dense_metric_region(&env).await; + + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 2)); + let err = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap_err(); + let Some(err) = err.as_any().downcast_ref::() else { + panic!("unexpected error type"); + }; + assert_matches!(err, Error::UnsupportedRegionRequest { .. 
}); + } + + #[tokio::test] + async fn test_bulk_insert_matches_put() { + let env_put = TestEnv::new().await; + env_put.init_metric_region().await; + let logical_region_id = env_put.default_logical_region_id(); + let schema = test_util::row_schema_with_tags(&["job"]); + let rows = test_util::build_rows(1, 5); + env_put + .metric() + .handle_request( + logical_region_id, + RegionRequest::Put(RegionPutRequest { + rows: api::v1::Rows { schema, rows }, + hint: None, + partition_expr_version: None, + }), + ) + .await + .unwrap(); + let put_stream = env_put + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let put_batches = RecordBatches::try_collect(put_stream).await.unwrap(); + let put_output = put_batches.pretty_print().unwrap(); + + let env_bulk = TestEnv::new().await; + env_bulk.init_metric_region().await; + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 5)); + env_bulk + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + let bulk_stream = env_bulk + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let bulk_batches = RecordBatches::try_collect(bulk_stream).await.unwrap(); + let bulk_output = bulk_batches.pretty_print().unwrap(); + + assert_eq!(put_output, bulk_output); + } + + #[test] + fn test_record_batch_to_rows_with_null_values() { + use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; + use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use datatypes::arrow::record_batch::RecordBatch; + use store_api::storage::RegionId; + + use crate::engine::bulk_insert::record_batch_to_rows; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("job", DataType::Utf8, true), + Field::new("host", 
DataType::Utf8, true), + ])); + + let ts_array = TimestampMillisecondArray::from(vec![Some(1000), None, Some(3000)]); + let val_array = Float64Array::from(vec![Some(1.0), Some(2.0), None]); + let job_array = StringArray::from(vec![Some("job1"), None, Some("job3")]); + let host_array = StringArray::from(vec![None, Some("host2"), Some("host3")]); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(ts_array), + Arc::new(val_array), + Arc::new(job_array), + Arc::new(host_array), + ], + ) + .unwrap(); + + let region_id = RegionId::new(1, 1); + let rows = record_batch_to_rows(&batch, region_id).unwrap(); + + assert_eq!(rows.rows.len(), 3); + assert_eq!(rows.schema.len(), 4); + + // Row 0: all non-null except host + assert!(rows.rows[0].values[0].value_data.is_some()); + assert!(rows.rows[0].values[1].value_data.is_some()); + assert!(rows.rows[0].values[2].value_data.is_some()); + assert!(rows.rows[0].values[3].value_data.is_none()); + + // Row 1: null timestamp, null job + assert!(rows.rows[1].values[0].value_data.is_none()); + assert!(rows.rows[1].values[1].value_data.is_some()); + assert!(rows.rows[1].values[2].value_data.is_none()); + assert!(rows.rows[1].values[3].value_data.is_some()); + + // Row 2: null value + assert!(rows.rows[2].values[0].value_data.is_some()); + assert!(rows.rows[2].values[1].value_data.is_none()); + assert!(rows.rows[2].values[2].value_data.is_some()); + assert!(rows.rows[2].values[3].value_data.is_some()); + } +} diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs index 9251605aea..edae0d2bb4 100644 --- a/src/metric-engine/src/engine/put.rs +++ b/src/metric-engine/src/engine/put.rs @@ -460,7 +460,7 @@ impl MetricEngineInner { .await } - fn find_data_region_meta( + pub(crate) fn find_data_region_meta( &self, logical_region_id: RegionId, ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> { diff --git a/src/metric-engine/src/lib.rs b/src/metric-engine/src/lib.rs index 30daa80b91..b93029f2f4 
100644 --- a/src/metric-engine/src/lib.rs +++ b/src/metric-engine/src/lib.rs @@ -52,6 +52,7 @@ #![feature(assert_matches)] +mod batch_modifier; pub mod config; mod data_region; pub mod engine; From cc441b564238562b25767be31c5d93d86c3fdc00 Mon Sep 17 00:00:00 2001 From: ZonaHe Date: Wed, 18 Mar 2026 02:25:14 +0800 Subject: [PATCH 016/195] feat: update dashboard to v0.12.0 (#7823) Co-authored-by: sunchanglong --- src/servers/dashboard/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index 03ee1a5314..87a1cf595a 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.11.13 +v0.12.0 From f2bccbd96adadff6d1e07f62f5e4c467c4b7d8ae Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Thu, 19 Mar 2026 08:37:40 +0800 Subject: [PATCH 017/195] docs: flow inc query rfc (#7816) * docs: flow inc query rfc Signed-off-by: discord9 * chore: typo Signed-off-by: discord9 * chore Signed-off-by: discord9 * docs: clarify flow incremental stale recovery Clarify that flush-boundary invalidation is part of IncrementalQueryStale, and document the in-memory checkpoint plus cold-start full snapshot recovery model. 
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus Signed-off-by: discord9 --------- Signed-off-by: discord9 Co-authored-by: Sisyphus --- docs/rfcs/2026-03-16-flow-inc-query.md | 190 +++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 docs/rfcs/2026-03-16-flow-inc-query.md diff --git a/docs/rfcs/2026-03-16-flow-inc-query.md b/docs/rfcs/2026-03-16-flow-inc-query.md new file mode 100644 index 0000000000..8041d37d2b --- /dev/null +++ b/docs/rfcs/2026-03-16-flow-inc-query.md @@ -0,0 +1,190 @@ +--- +Feature Name: Flow Batching Sequence-Based Incremental Query Plan (Lite) +Tracking Issue: TBD +Date: 2026-03-16 +Author: @discord9 +--- + +# Summary + +This RFC proposes a correctness-first incremental query mode for Flow batching. +Flow queries can read only `seq > checkpoint` and advance checkpoints using per-region correctness watermarks. +When incremental reads are stale or correctness cannot be proven, Flow falls back to full recomputation. + +# Motivation + +Flow batching currently recomputes data it has already processed within the same time window on every run, so an incremental query mode can improve Flow performance. + +# Goals + +1. Add opt-in incremental reads (`seq > given_seq`) for Flow. +2. Return per-region correctness watermarks for checkpoint advancement. +3. Keep existing query behavior unchanged unless explicitly enabled. +4. Define deterministic fallback for stale or unprovable incremental reads. + +# Non-Goals + +1. No business-schema changes (no synthetic watermark columns in result rows). +2. No global throughput optimization in v1 (correctness first). +3. No observational watermark output when correctness is unprovable. + +# Proposal + +## 1) Query options + +Introduce three `QueryContext` extension keys: + +- `flow.incremental_after_seqs` +- `flow.incremental_mode` +- `flow.return_region_seq` + +These options are opt-in and only affect Flow incremental execution paths. 
+ +## 2) Scan mapping + +When incremental mode is enabled: + +- map `after_seq` to `memtable_min_sequence` (exclusive lower bound) +- keep existing snapshot upper-bound behavior (`memtable_max_sequence`) + +Important limitation in v1: + +- incremental filtering is correctness-proven only for memtable rows +- SST files do not preserve detailed row-level sequence metadata; they only expose coarser file-level sequence information +- therefore `seq > checkpoint` must not assume precise incremental pruning across memtable->SST flush boundaries + +If required incremental parameters are missing or invalid, return argument error. + +## 3) Stale protection + +Add dedicated stale error: + +- `IncrementalQueryStale { region_id, given_seq, min_readable_seq }` + +Behavior: + +- if `given_seq < min_readable_seq`, return stale error +- if `given_seq == min_readable_seq`, query is valid and reads `seq > given_seq` +- if `given_seq > min_readable_seq`, query is also valid and reads `seq > given_seq` + +`IncrementalQueryStale` also covers the case where rows newer than the checkpoint have crossed a memtable->SST flush boundary and sequence-precise incremental exclusion can no longer be proven. +In other words, the flush-boundary case is not a separate fallback category in v1; it is one concrete way an incremental cursor becomes stale. + +## 4) Watermark return + +Extend query metrics with optional per-region watermark map: + +- `region_latest_sequences: Vec<(region_id: u64, latest_sequence: u64)>` + +Rules: + +- only terminal metrics of successful query can advance checkpoints +- for multi-region query, watermark must be complete map or absent +- if correctness is unprovable, business rows may return but watermark is absent + +## 5) Flow state machine + +Checkpoint and watermark state are kept only in flownode memory in v1; they are not persisted as durable flow metadata. +Cold start or flownode restart therefore always re-enters through a full snapshot read. 
+Only after that full query succeeds with a complete correctness watermark may Flow switch back to incremental mode. + +Flow starts in full mode, then transitions: + +1. Full query succeeds with correctness watermark -> enter incremental mode +2. Incremental query succeeds with correctness watermark -> advance checkpoint +3. Incremental stale/failure -> fallback to full mode +4. Full query without correctness watermark -> remain in full mode + +```mermaid +stateDiagram-v2 + [*] --> FullSnapshot: Flow starts + + state FullSnapshot { + [*] --> RunFull + RunFull --> RunFull: Full query succeeds but watermark is unprovable<br/>no region_latest_sequences returned + } + + FullSnapshot --> Incremental: Full query succeeds and correctness watermark is returned<br/>(checkpoint updated) + + state Incremental { + [*] --> RunInc + RunInc --> RunInc: Incremental succeeds<br/>(checkpoint advances) + } + + Incremental --> FullSnapshot: IncrementalQueryStale<br/>(cursor too old, fallback required) + Incremental --> FullSnapshot: Incremental fails<br/>and fallback policy is triggered + + FullSnapshot --> [*]: Flow stops + Incremental --> [*]: Flow stops +``` + +### Fallback Policy + +Fallback to full mode is deterministic and is triggered by any of the following: + +1. `IncrementalQueryStale` is returned. +2. Incremental query fails with execution errors. +3. Incremental query succeeds but watermark is absent or incomplete for participating regions. + +Policy behavior: + +1. Do not advance any checkpoint in the failed/incomplete round. +2. Switch to full mode for the affected flow/window in the next round. +3. Return to incremental mode only after a full query succeeds with a complete correctness watermark map. + +### Persistence and recovery model + +The v1 design is intentionally correctness-first and keeps the progress cursor lightweight: + +1. Watermarks/checkpoints live only in flownode memory; v1 does not persist them separately. +2. On cold start, the flow re-establishes progress by running a successful full-query snapshot read, then resumes incremental mode only after that round returns a complete correctness watermark map. +3. Sequence-precise incremental correctness is currently limited to rows still visible in memtables. +4. Once relevant rows have been flushed into SST, the system cannot use `seq > checkpoint` alone to prove precise incremental exclusion, because SST lacks detailed row-level sequence metadata. +5. In that case the correct behavior is to fall back to full recomputation, not to continue a best-effort incremental scan. + +# Distributed and Compatibility Requirements + +1. Distributed path must preserve region-level snapshot/read-bound semantics end-to-end. +2. `snapshot_seqs` transport and `flow.*` options must both be carried correctly. + - `snapshot_seqs` means the per-region snapshot upper-bound map: `region_id -> sequence`. +3. New metrics fields must be backward-compatible (old clients ignore unknown fields). + +# Rollout Plan + +## Phase 1 (MVP, correctness first) + +1. 
Add extension constants and parsing. +2. Add incremental scan mapping and stale detection. +3. Add watermark metrics field and terminal-watermark checkpoint update path. +4. Complete standalone and distributed passthrough. + +## Phase 2 (performance and observability) + +1. Improve batching key strategy with sequence/watermark context. +2. Optimize watermark serialization overhead. +3. Add metrics: incremental hit rate, fallback rate, fallback window size. + +# Testing Plan + +1. Unit tests for incremental bounds and stale detection. +2. Query-path tests for extension mapping and watermark semantics. +3. Flow integration tests for full->incremental->fallback transitions. +4. Distributed tests for end-to-end snapshot/watermark propagation. +5. Compatibility tests for old/new client-server combinations. + +# Risks + +1. Boundary semantic mismatch (`<` vs `<=`) may cause correctness bugs. +2. Incomplete distributed propagation can silently invalidate watermark safety. +3. Frequent fallback can reduce throughput before phase-2 optimizations. +4. Memtable->SST flushes may force more full recomputation than expected until finer-grained SST sequence tracking exists. + +# Alternatives + +1. Put watermark into business rows (rejected: schema pollution). +2. Add new dedicated Flight message type in v1 (deferred to reduce scope). + +# Conclusion + +This plan enables a practical, correctness-first incremental path for Flow batching. +It reuses existing sequence scan capability, adds strict stale handling, and advances checkpoints only from correctness-proven per-region watermarks. 
From 2af39519445d7a8ac1169c42fc190b036ea44c75 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 19 Mar 2026 11:09:47 +0800 Subject: [PATCH 018/195] feat: cache decoded region metadata alone with parquet metadata (#7813) * cache decoded region metadata Signed-off-by: Ruihang Xia * fix: account for decoded sst metadata cache weight * take optional pre-exist metadata Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/datatypes/src/schema.rs | 33 +- src/datatypes/src/schema/column_schema.rs | 30 +- src/mito2/src/access_layer.rs | 2 + src/mito2/src/cache.rs | 382 ++++++++++++++++++---- src/mito2/src/cache/file_cache.rs | 30 +- src/mito2/src/cache/test_util.rs | 39 ++- src/mito2/src/cache/write_cache.rs | 8 +- src/mito2/src/region/opener.rs | 34 +- src/mito2/src/sst/parquet.rs | 6 +- src/mito2/src/sst/parquet/reader.rs | 55 +--- src/store-api/src/metadata.rs | 27 +- 11 files changed, 526 insertions(+), 120 deletions(-) diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index 9070e2babe..50f2dba270 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -16,8 +16,8 @@ mod column_schema; pub mod constraint; use std::collections::HashMap; -use std::fmt; use std::sync::Arc; +use std::{fmt, mem}; use arrow::datatypes::{Field, Schema as ArrowSchema}; use datafusion_common::DFSchemaRef; @@ -177,6 +177,26 @@ impl Schema { &self.arrow_schema.metadata } + /// Returns the estimated memory footprint of this schema. 
+ pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) + + mem::size_of::() * self.column_schemas.capacity() + + self + .column_schemas + .iter() + .map(|column_schema| { + column_schema.estimated_size() - mem::size_of::() + }) + .sum::() + + mem::size_of::<(String, usize)>() * self.name_to_index.capacity() + + self + .name_to_index + .keys() + .map(|name| name.capacity()) + .sum::() + + arrow_schema_size(self.arrow_schema.as_ref()) + } + /// Generate a new projected schema /// /// # Panic @@ -213,6 +233,17 @@ impl Schema { } } +fn arrow_schema_size(schema: &ArrowSchema) -> usize { + mem::size_of_val(schema) + + schema.fields.size() + + mem::size_of::<(String, String)>() * schema.metadata.capacity() + + schema + .metadata + .iter() + .map(|(key, value)| key.capacity() + value.capacity()) + .sum::() +} + #[derive(Default)] pub struct SchemaBuilder { column_schemas: Vec, diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs index 183cf05da8..2479f4fc41 100644 --- a/src/datatypes/src/schema/column_schema.rs +++ b/src/datatypes/src/schema/column_schema.rs @@ -13,8 +13,8 @@ // limitations under the License. use std::collections::HashMap; -use std::fmt; use std::str::FromStr; +use std::{fmt, mem}; use arrow::datatypes::Field; use arrow_schema::extension::{ @@ -178,6 +178,19 @@ impl ColumnSchema { self } + /// Returns the estimated memory footprint of this schema. + pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) - mem::size_of_val(&self.data_type) + + self.data_type.as_arrow_type().size() + + self.name.capacity() + + self + .default_constraint + .as_ref() + .map(column_default_constraint_size) + .unwrap_or_default() + + metadata_size(&self.metadata) + } + /// Set the inverted index for the column. /// Similar to [with_inverted_index] but don't take the ownership. 
/// @@ -493,6 +506,21 @@ impl ColumnSchema { } } +fn metadata_size(metadata: &Metadata) -> usize { + mem::size_of::<(String, String)>() * metadata.capacity() + + metadata + .iter() + .map(|(key, value)| key.capacity() + value.capacity()) + .sum::() +} + +fn column_default_constraint_size(default_constraint: &ColumnDefaultConstraint) -> usize { + match default_constraint { + ColumnDefaultConstraint::Function(expr) => expr.capacity(), + ColumnDefaultConstraint::Value(value) => value.as_value_ref().data_size(), + } +} + /// Column extended type set in column schema's metadata. #[derive(Debug, Clone, PartialEq, Eq)] pub enum ColumnExtType { diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index 231285215e..33180ebf46 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -338,6 +338,7 @@ impl AccessLayer { metrics: &mut Metrics, ) -> Result { let region_id = request.metadata.region_id; + let region_metadata = request.metadata.clone(); let cache_manager = request.cache_manager.clone(); let sst_info = if let Some(write_cache) = cache_manager.write_cache() { @@ -415,6 +416,7 @@ impl AccessLayer { cache_manager.put_parquet_meta_data( RegionFileId::new(region_id, sst.file_id), parquet_metadata.clone(), + Some(region_metadata.clone()), ) } } diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index e232489768..c9a8b99166 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -28,6 +28,7 @@ use std::ops::Range; use std::sync::Arc; use bytes::Bytes; +use common_telemetry::warn; use datatypes::arrow::record_batch::RecordBatch; use datatypes::value::Value; use datatypes::vectors::VectorRef; @@ -36,8 +37,10 @@ use index::result_cache::IndexResultCache; use moka::notification::RemovalCause; use moka::sync::Cache; use object_store::ObjectStore; -use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; +use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData}; use 
puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef}; +use snafu::{OptionExt, ResultExt}; +use store_api::metadata::RegionMetadataRef; use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector}; use crate::cache::cache_size::parquet_meta_size; @@ -46,11 +49,13 @@ use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCache #[cfg(feature = "vector_index")] use crate::cache::index::vector_index::{VectorIndexCache, VectorIndexCacheRef}; use crate::cache::write_cache::WriteCacheRef; +use crate::error::{InvalidMetadataSnafu, InvalidParquetSnafu, Result}; use crate::memtable::record_batch_estimated_size; use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS}; use crate::read::Batch; use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue}; use crate::sst::file::{RegionFileId, RegionIndexId}; +use crate::sst::parquet::PARQUET_METADATA_KEY; use crate::sst::parquet::reader::MetadataCacheMetrics; /// Metrics type key for sst meta. @@ -68,6 +73,106 @@ const SELECTOR_RESULT_TYPE: &str = "selector_result"; /// Metrics type key for range scan result cache. const RANGE_RESULT_TYPE: &str = "range_result"; +/// Cached SST metadata combines the parquet footer with the decoded region metadata. +/// +/// The cached parquet footer strips the `greptime:metadata` JSON payload and stores the decoded +/// [RegionMetadata] separately so readers can skip repeated deserialization work. 
+#[derive(Debug)] +pub(crate) struct CachedSstMeta { + parquet_metadata: Arc, + region_metadata: RegionMetadataRef, + region_metadata_weight: usize, +} + +impl CachedSstMeta { + pub(crate) fn try_new(file_path: &str, parquet_metadata: ParquetMetaData) -> Result { + Self::try_new_with_region_metadata(file_path, parquet_metadata, None) + } + + pub(crate) fn try_new_with_region_metadata( + file_path: &str, + parquet_metadata: ParquetMetaData, + region_metadata: Option, + ) -> Result { + let file_metadata = parquet_metadata.file_metadata(); + let key_values = file_metadata + .key_value_metadata() + .context(InvalidParquetSnafu { + file: file_path, + reason: "missing key value meta", + })?; + let meta_value = key_values + .iter() + .find(|kv| kv.key == PARQUET_METADATA_KEY) + .with_context(|| InvalidParquetSnafu { + file: file_path, + reason: format!("key {} not found", PARQUET_METADATA_KEY), + })?; + let json = meta_value + .value + .as_ref() + .with_context(|| InvalidParquetSnafu { + file: file_path, + reason: format!("No value for key {}", PARQUET_METADATA_KEY), + })?; + let region_metadata = match region_metadata { + Some(region_metadata) => region_metadata, + None => Arc::new( + store_api::metadata::RegionMetadata::from_json(json) + .context(InvalidMetadataSnafu)?, + ), + }; + // Keep the previous JSON-byte floor and charge the decoded structures as well. 
+ let region_metadata_weight = region_metadata.estimated_size().max(json.len()); + let parquet_metadata = Arc::new(strip_region_metadata_from_parquet(parquet_metadata)); + + Ok(Self { + parquet_metadata, + region_metadata, + region_metadata_weight, + }) + } + + pub(crate) fn parquet_metadata(&self) -> Arc { + self.parquet_metadata.clone() + } + + pub(crate) fn region_metadata(&self) -> RegionMetadataRef { + self.region_metadata.clone() + } +} + +fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> ParquetMetaData { + let file_metadata = parquet_metadata.file_metadata(); + let filtered_key_values = file_metadata.key_value_metadata().and_then(|key_values| { + let filtered = key_values + .iter() + .filter(|kv| kv.key != PARQUET_METADATA_KEY) + .cloned() + .collect::>(); + (!filtered.is_empty()).then_some(filtered) + }); + let stripped_file_metadata = FileMetaData::new( + file_metadata.version(), + file_metadata.num_rows(), + file_metadata.created_by().map(ToString::to_string), + filtered_key_values, + file_metadata.schema_descr_ptr(), + file_metadata.column_orders().cloned(), + ); + + let mut builder = parquet_metadata.into_builder(); + let row_groups = builder.take_row_groups(); + let column_index = builder.take_column_index(); + let offset_index = builder.take_offset_index(); + + parquet::file::metadata::ParquetMetaDataBuilder::new(stripped_file_metadata) + .set_row_groups(row_groups) + .set_column_index(column_index) + .set_offset_index(offset_index) + .build() +} + /// Cache strategies that may only enable a subset of caches. #[derive(Clone)] pub enum CacheStrategy { @@ -84,18 +189,17 @@ pub enum CacheStrategy { } impl CacheStrategy { - /// Gets parquet metadata with cache metrics tracking. - /// Returns the metadata and updates the provided metrics. - pub(crate) async fn get_parquet_meta_data( + /// Gets fused SST metadata with cache metrics tracking. 
+ pub(crate) async fn get_sst_meta_data( &self, file_id: RegionFileId, metrics: &mut MetadataCacheMetrics, page_index_policy: PageIndexPolicy, - ) -> Option> { + ) -> Option> { match self { CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { cache_manager - .get_parquet_meta_data(file_id, metrics, page_index_policy) + .get_sst_meta_data(file_id, metrics, page_index_policy) .await } CacheStrategy::Disabled => { @@ -105,30 +209,48 @@ impl CacheStrategy { } } - /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()]. - pub fn get_parquet_meta_data_from_mem_cache( + /// Calls [CacheManager::get_sst_meta_data_from_mem_cache()]. + pub(crate) fn get_sst_meta_data_from_mem_cache( &self, file_id: RegionFileId, - ) -> Option> { + ) -> Option> { match self { - CacheStrategy::EnableAll(cache_manager) => { - cache_manager.get_parquet_meta_data_from_mem_cache(file_id) - } - CacheStrategy::Compaction(cache_manager) => { - cache_manager.get_parquet_meta_data_from_mem_cache(file_id) + CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { + cache_manager.get_sst_meta_data_from_mem_cache(file_id) } CacheStrategy::Disabled => None, } } - /// Calls [CacheManager::put_parquet_meta_data()]. - pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc) { + /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()]. + pub fn get_parquet_meta_data_from_mem_cache( + &self, + file_id: RegionFileId, + ) -> Option> { + self.get_sst_meta_data_from_mem_cache(file_id) + .map(|metadata| metadata.parquet_metadata()) + } + + /// Calls [CacheManager::put_sst_meta_data()]. 
+ pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc) { match self { - CacheStrategy::EnableAll(cache_manager) => { - cache_manager.put_parquet_meta_data(file_id, metadata); + CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { + cache_manager.put_sst_meta_data(file_id, metadata); } - CacheStrategy::Compaction(cache_manager) => { - cache_manager.put_parquet_meta_data(file_id, metadata); + CacheStrategy::Disabled => {} + } + } + + /// Calls [CacheManager::put_parquet_meta_data()]. + pub fn put_parquet_meta_data( + &self, + file_id: RegionFileId, + metadata: Arc, + region_metadata: Option, + ) { + match self { + CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { + cache_manager.put_parquet_meta_data(file_id, metadata, region_metadata); } CacheStrategy::Disabled => {} } @@ -368,6 +490,35 @@ impl CacheManager { CacheManagerBuilder::default() } + /// Gets fused SST metadata with metrics tracking. + /// Tries in-memory cache first, then file cache, updating metrics accordingly. + pub(crate) async fn get_sst_meta_data( + &self, + file_id: RegionFileId, + metrics: &mut MetadataCacheMetrics, + page_index_policy: PageIndexPolicy, + ) -> Option> { + if let Some(metadata) = self.get_sst_meta_data_from_mem_cache(file_id) { + metrics.mem_cache_hit += 1; + return Some(metadata); + } + + let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet); + if let Some(write_cache) = &self.write_cache + && let Some(metadata) = write_cache + .file_cache() + .get_sst_meta_data(key, metrics, page_index_policy) + .await + { + metrics.file_cache_hit += 1; + self.put_sst_meta_data(file_id, metadata.clone()); + return Some(metadata); + } + + metrics.cache_miss += 1; + None + } + /// Gets cached [ParquetMetaData] with metrics tracking. /// Tries in-memory cache first, then file cache, updating metrics accordingly. 
pub(crate) async fn get_parquet_meta_data( @@ -376,29 +527,21 @@ impl CacheManager { metrics: &mut MetadataCacheMetrics, page_index_policy: PageIndexPolicy, ) -> Option> { - // Try to get metadata from sst meta cache - if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) { - metrics.mem_cache_hit += 1; - return Some(metadata); - } + self.get_sst_meta_data(file_id, metrics, page_index_policy) + .await + .map(|metadata| metadata.parquet_metadata()) + } - // Try to get metadata from write cache - let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet); - if let Some(write_cache) = &self.write_cache - && let Some(metadata) = write_cache - .file_cache() - .get_parquet_meta_data(key, metrics, page_index_policy) - .await - { - metrics.file_cache_hit += 1; - let metadata = Arc::new(metadata); - // Put metadata into sst meta cache - self.put_parquet_meta_data(file_id, metadata.clone()); - return Some(metadata); - }; - metrics.cache_miss += 1; - - None + /// Gets cached fused SST metadata from in-memory cache. + /// This method does not perform I/O. + pub(crate) fn get_sst_meta_data_from_mem_cache( + &self, + file_id: RegionFileId, + ) -> Option> { + self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| { + let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id())); + update_hit_miss(value, SST_META_TYPE) + }) } /// Gets cached [ParquetMetaData] from in-memory cache. @@ -407,15 +550,12 @@ impl CacheManager { &self, file_id: RegionFileId, ) -> Option> { - // Try to get metadata from sst meta cache - self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| { - let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id())); - update_hit_miss(value, SST_META_TYPE) - }) + self.get_sst_meta_data_from_mem_cache(file_id) + .map(|metadata| metadata.parquet_metadata()) } - /// Puts [ParquetMetaData] into the cache. 
- pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc) { + /// Puts fused SST metadata into the cache. + pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc) { if let Some(cache) = &self.sst_meta_cache { let key = SstMetaKey(file_id.region_id(), file_id.file_id()); CACHE_BYTES @@ -425,6 +565,34 @@ impl CacheManager { } } + /// Puts [ParquetMetaData] into the cache. + pub fn put_parquet_meta_data( + &self, + file_id: RegionFileId, + metadata: Arc, + region_metadata: Option, + ) { + if self.sst_meta_cache.is_some() { + let file_path = format!( + "region_id={}, file_id={}", + file_id.region_id(), + file_id.file_id() + ); + match CachedSstMeta::try_new_with_region_metadata( + &file_path, + Arc::unwrap_or_clone(metadata), + region_metadata, + ) { + Ok(metadata) => self.put_sst_meta_data(file_id, Arc::new(metadata)), + Err(err) => warn!( + err; "Failed to decode region metadata while caching parquet metadata, region_id: {}, file_id: {}", + file_id.region_id(), + file_id.file_id() + ), + } + } + } + /// Removes [ParquetMetaData] from the cache. pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) { if let Some(cache) = &self.sst_meta_cache { @@ -809,9 +977,9 @@ impl CacheManagerBuilder { } } -fn meta_cache_weight(k: &SstMetaKey, v: &Arc) -> u32 { +fn meta_cache_weight(k: &SstMetaKey, v: &Arc) -> u32 { // We ignore the size of `Arc`. - (k.estimated_size() + parquet_meta_size(v)) as u32 + (k.estimated_size() + parquet_meta_size(&v.parquet_metadata) + v.region_metadata_weight) as u32 } fn vector_cache_weight(_k: &(ConcreteDataType, Value), v: &VectorRef) -> u32 { @@ -977,8 +1145,8 @@ impl SelectorResultValue { } } -/// Maps (region id, file id) to [ParquetMetaData]. -type SstMetaCache = Cache>; +/// Maps (region id, file id) to fused SST metadata. +type SstMetaCache = Cache>; /// Maps [Value] to a vector that holds this value repeatedly. /// /// e.g. 
`"hello" => ["hello", "hello", "hello"]` @@ -994,15 +1162,20 @@ type RangeResultCache = Cache>; mod tests { use std::sync::Arc; + use api::v1::SemanticType; use api::v1::index::{BloomFilterMeta, InvertedIndexMetas}; + use datatypes::schema::ColumnSchema; use datatypes::vectors::Int64Vector; use puffin::file_metadata::FileMetadata; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; use store_api::storage::ColumnId; use super::*; use crate::cache::index::bloom_filter_index::Tag; use crate::cache::index::result_cache::PredicateKey; - use crate::cache::test_util::parquet_meta; + use crate::cache::test_util::{ + parquet_meta, sst_parquet_meta, sst_parquet_meta_with_region_metadata, + }; use crate::read::range_cache::{ RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder, }; @@ -1019,7 +1192,7 @@ mod tests { let file_id = RegionFileId::new(region_id, FileId::random()); let metadata = parquet_meta(); let mut metrics = MetadataCacheMetrics::default(); - cache.put_parquet_meta_data(file_id, metadata); + cache.put_parquet_meta_data(file_id, metadata, None); assert!( cache .get_parquet_meta_data(file_id, &mut metrics, Default::default()) @@ -1056,13 +1229,23 @@ mod tests { .await .is_none() ); - let metadata = parquet_meta(); - cache.put_parquet_meta_data(file_id, metadata); + let (metadata, region_metadata) = sst_parquet_meta(); + cache.put_parquet_meta_data(file_id, metadata, None); + let cached = cache + .get_sst_meta_data(file_id, &mut metrics, Default::default()) + .await + .unwrap(); + assert_eq!(region_metadata, cached.region_metadata()); assert!( - cache - .get_parquet_meta_data(file_id, &mut metrics, Default::default()) - .await - .is_some() + cached + .parquet_metadata() + .file_metadata() + .key_value_metadata() + .is_none_or(|key_values| { + key_values + .iter() + .all(|key_value| key_value.key != PARQUET_METADATA_KEY) + }) ); cache.remove_parquet_meta_data(file_id); assert!( @@ -1073,6 +1256,42 @@ mod tests { ); } 
+ #[tokio::test] + async fn test_parquet_meta_cache_with_provided_region_metadata() { + let cache = CacheManager::builder().sst_meta_cache_size(2000).build(); + let mut metrics = MetadataCacheMetrics::default(); + let region_id = RegionId::new(1, 1); + let file_id = RegionFileId::new(region_id, FileId::random()); + let (metadata, region_metadata) = sst_parquet_meta(); + + cache.put_parquet_meta_data(file_id, metadata, Some(region_metadata.clone())); + + let cached = cache + .get_sst_meta_data(file_id, &mut metrics, Default::default()) + .await + .unwrap(); + assert!(Arc::ptr_eq(®ion_metadata, &cached.region_metadata())); + } + + #[test] + fn test_meta_cache_weight_accounts_for_decoded_region_metadata() { + let region_metadata = Arc::new(wide_region_metadata(128)); + let json_len = region_metadata.to_json().unwrap().len(); + let metadata = sst_parquet_meta_with_region_metadata(region_metadata.clone()); + let cached = Arc::new( + CachedSstMeta::try_new("test.parquet", Arc::unwrap_or_clone(metadata)).unwrap(), + ); + let key = SstMetaKey(region_metadata.region_id, FileId::random()); + + assert!(cached.region_metadata_weight > json_len); + assert_eq!( + meta_cache_weight(&key, &cached) as usize, + key.estimated_size() + + parquet_meta_size(&cached.parquet_metadata) + + cached.region_metadata_weight + ); + } + #[test] fn test_repeated_vector_cache() { let cache = CacheManager::builder().vector_cache_size(4096).build(); @@ -1256,4 +1475,45 @@ mod tests { assert!(result_cache.get(&predicate, index_id.file_id()).is_none()); assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none()); } + + fn wide_region_metadata(column_count: u32) -> RegionMetadata { + let region_id = RegionId::new(1024, 7); + let mut builder = RegionMetadataBuilder::new(region_id); + let mut primary_key = Vec::new(); + + for column_id in 0..column_count { + let semantic_type = if column_id < 32 { + primary_key.push(column_id); + SemanticType::Tag + } else { + SemanticType::Field + }; + let mut 
column_schema = ColumnSchema::new( + format!("wide_column_{column_id}"), + ConcreteDataType::string_datatype(), + true, + ); + column_schema + .mut_metadata() + .insert(format!("cache_key_{column_id}"), "cache_value".repeat(4)); + builder.push_column_metadata(ColumnMetadata { + column_schema, + semantic_type, + column_id, + }); + } + + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: column_count, + }); + builder.primary_key(primary_key); + + builder.build().unwrap() + } } diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs index 32a276d0e4..278838b369 100644 --- a/src/mito2/src/cache/file_cache.rs +++ b/src/mito2/src/cache/file_cache.rs @@ -34,7 +34,7 @@ use store_api::storage::{FileId, RegionId}; use tokio::sync::mpsc::{Sender, UnboundedReceiver}; use crate::access_layer::TempFileCleaner; -use crate::cache::{FILE_TYPE, INDEX_TYPE}; +use crate::cache::{CachedSstMeta, FILE_TYPE, INDEX_TYPE}; use crate::error::{self, OpenDalSnafu, Result}; use crate::metrics::{ CACHE_BYTES, CACHE_HIT, CACHE_MISS, WRITE_CACHE_DOWNLOAD_BYTES_TOTAL, @@ -612,6 +612,34 @@ impl FileCache { } } + /// Get fused SST metadata from the file cache. + /// If the file is not in the cache, or metadata loading/decoding fails, return None. 
+ pub(crate) async fn get_sst_meta_data( + &self, + key: IndexKey, + cache_metrics: &mut MetadataCacheMetrics, + page_index_policy: PageIndexPolicy, + ) -> Option> { + let file_path = self.inner.cache_file_path(key); + self.get_parquet_meta_data(key, cache_metrics, page_index_policy) + .await + .and_then( + |metadata| match CachedSstMeta::try_new(&file_path, metadata) { + Ok(metadata) => Some(Arc::new(metadata)), + Err(err) => { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + warn!( + err; "Failed to decode cached parquet metadata for key {:?}", + key + ); + None + } + }, + ) + } + async fn get_reader(&self, file_path: &str) -> object_store::Result> { if self.inner.local_store.exists(file_path).await? { Ok(Some(self.inner.local_store.reader(file_path).await?)) diff --git a/src/mito2/src/cache/test_util.rs b/src/mito2/src/cache/test_util.rs index 65ad9d87eb..ef3d8e9315 100644 --- a/src/mito2/src/cache/test_util.rs +++ b/src/mito2/src/cache/test_util.rs @@ -23,8 +23,13 @@ use object_store::ObjectStore; use object_store::services::Fs; use parquet::arrow::ArrowWriter; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use parquet::file::metadata::ParquetMetaData; +use parquet::file::metadata::{KeyValue, ParquetMetaData}; +use parquet::file::properties::WriterProperties; use parquet::file::statistics::Statistics; +use store_api::metadata::RegionMetadataRef; + +use crate::sst::parquet::PARQUET_METADATA_KEY; +use crate::test_util::sst_util::sst_region_metadata; /// Returns a parquet meta data. pub(crate) fn parquet_meta() -> Arc { @@ -33,13 +38,43 @@ pub(crate) fn parquet_meta() -> Arc { builder.metadata().clone() } +/// Returns parquet metadata for an SST parquet file and its decoded region metadata. 
+pub(crate) fn sst_parquet_meta() -> (Arc, RegionMetadataRef) { + let region_metadata = Arc::new(sst_region_metadata()); + let file_data = parquet_file_data_with_region_metadata(®ion_metadata); + let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap(); + (builder.metadata().clone(), region_metadata) +} + +/// Returns parquet metadata for an SST parquet file with custom region metadata. +pub(crate) fn sst_parquet_meta_with_region_metadata( + region_metadata: RegionMetadataRef, +) -> Arc { + let file_data = parquet_file_data_with_region_metadata(®ion_metadata); + let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap(); + builder.metadata().clone() +} + /// Write a test parquet file to a buffer fn parquet_file_data() -> Vec { + parquet_file_data_inner(None) +} + +fn parquet_file_data_with_region_metadata(region_metadata: &RegionMetadataRef) -> Vec { + let json = region_metadata.to_json().unwrap(); + let key_value = KeyValue::new(PARQUET_METADATA_KEY.to_string(), json); + parquet_file_data_inner(Some(vec![key_value])) +} + +fn parquet_file_data_inner(key_value_metadata: Option>) -> Vec { let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap(); let mut buffer = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), None).unwrap(); + let props = WriterProperties::builder() + .set_key_value_metadata(key_value_metadata) + .build(); + let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), Some(props)).unwrap(); writer.write(&to_write).unwrap(); writer.close().unwrap(); diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index 3d373efe91..e2483ed4e4 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -693,9 +693,15 @@ mod tests { .cache(CacheStrategy::EnableAll(cache_manager.clone())) 
.page_index_policy(PageIndexPolicy::Optional); let reader = builder.build().await.unwrap().unwrap(); + let cached_write_parquet_metadata = crate::cache::CachedSstMeta::try_new( + "test.sst", + Arc::unwrap_or_clone(write_parquet_metadata), + ) + .unwrap() + .parquet_metadata(); // Check parquet metadata - assert_parquet_metadata_equal(write_parquet_metadata, reader.parquet_metadata()); + assert_parquet_metadata_equal(cached_write_parquet_metadata, reader.parquet_metadata()); } #[tokio::test] diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 014c50820f..d089493f81 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -1043,7 +1043,7 @@ async fn preload_parquet_meta_cache_for_files( let loader = MetadataLoader::new(object_store.clone(), &file_path, file_size); match loader.load(&mut cache_metrics).await { Ok(metadata) => { - cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata)); + cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata), None); loaded += 1; } Err(err) => { @@ -1153,6 +1153,8 @@ mod tests { use object_store::ObjectStore; use object_store::services::{Fs, Memory}; use parquet::arrow::ArrowWriter; + use parquet::file::metadata::KeyValue; + use parquet::file::properties::WriterProperties; use store_api::region_request::PathType; use store_api::storage::{FileId, RegionId}; @@ -1161,7 +1163,27 @@ mod tests { use crate::cache::file_cache::{FileType, IndexKey}; use crate::sst::file::{FileHandle, FileMeta}; use crate::sst::file_purger::NoopFilePurger; + use crate::sst::parquet::PARQUET_METADATA_KEY; use crate::test_util::TestEnv; + use crate::test_util::sst_util::sst_region_metadata; + + fn sst_parquet_bytes(batch: &RecordBatch) -> Vec { + let key_value_meta = KeyValue::new( + PARQUET_METADATA_KEY.to_string(), + sst_region_metadata().to_json().unwrap(), + ); + let props = WriterProperties::builder() + .set_key_value_metadata(Some(vec![key_value_meta])) + .build(); + + let mut 
parquet_bytes = Vec::new(); + let mut writer = + ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), Some(props)).unwrap(); + writer.write(batch).unwrap(); + writer.close().unwrap(); + + parquet_bytes + } #[tokio::test] async fn test_preload_parquet_meta_cache_uses_file_cache() { @@ -1183,10 +1205,7 @@ mod tests { let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; let batch = RecordBatch::try_from_iter([("col", col)]).unwrap(); - let mut parquet_bytes = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); + let parquet_bytes = sst_parquet_bytes(&batch); let file_size = parquet_bytes.len() as u64; let file_meta = FileMeta { @@ -1334,10 +1353,7 @@ mod tests { let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; let batch = RecordBatch::try_from_iter([("col", col)]).unwrap(); - let mut parquet_bytes = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); + let parquet_bytes = sst_parquet_bytes(&batch); // file_size is 0 when it's missing/defaulted in manifests; MetadataLoader::load will stat // the local filesystem to retrieve it. 
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 1c5bfd9db0..26bed76fd6 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -383,8 +383,12 @@ mod tests { .page_index_policy(PageIndexPolicy::Optional); let reader = builder.build().await.unwrap().unwrap(); let reader_metadata = reader.parquet_metadata(); + let cached_writer_metadata = + crate::cache::CachedSstMeta::try_new("test.sst", Arc::unwrap_or_clone(writer_metadata)) + .unwrap() + .parquet_metadata(); - assert_parquet_metadata_equal(writer_metadata, reader_metadata); + assert_parquet_metadata_equal(cached_writer_metadata, reader_metadata); } #[tokio::test] diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 4d7122ccc6..855204b80e 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -34,22 +34,21 @@ use mito_codec::row_converter::build_primary_key_codec; use object_store::ObjectStore; use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection}; use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels}; -use parquet::file::metadata::{KeyValue, PageIndexPolicy, ParquetMetaData}; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use partition::expr::PartitionExpr; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use store_api::codec::PrimaryKeyEncoding; use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; use store_api::region_request::PathType; use store_api::storage::{ColumnId, FileId}; use table::predicate::Predicate; -use crate::cache::CacheStrategy; use crate::cache::index::result_cache::PredicateKey; +use crate::cache::{CacheStrategy, CachedSstMeta}; #[cfg(feature = "vector_index")] use crate::error::ApplyVectorIndexSnafu; use crate::error::{ - ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadDataPartSnafu, - ReadParquetSnafu, Result, SerializePartitionExprSnafu, + 
ArrowReaderSnafu, ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu, }; use crate::metrics::{ PRECISE_FILTER_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL, @@ -70,6 +69,7 @@ use crate::sst::index::inverted_index::applier::{ }; #[cfg(feature = "vector_index")] use crate::sst::index::vector_index::applier::VectorIndexApplierRef; +use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; use crate::sst::parquet::file_range::{ FileRangeContext, FileRangeContextRef, PartitionFilterContext, PreFilterMode, RangeBase, row_group_contains_delete, @@ -79,7 +79,6 @@ use crate::sst::parquet::metadata::MetadataLoader; use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics}; use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::stats::RowGroupPruningStats; -use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY}; use crate::sst::tag_maybe_to_dictionary_field; const INDEX_TYPE_FULLTEXT: &str = "fulltext"; @@ -340,7 +339,7 @@ impl ParquetReaderBuilder { let file_size = self.file_handle.meta_ref().file_size; // Loads parquet metadata of the file. - let (parquet_meta, cache_miss) = self + let (sst_meta, cache_miss) = self .read_parquet_metadata( &file_path, file_size, @@ -348,9 +347,8 @@ impl ParquetReaderBuilder { self.page_index_policy, ) .await?; - // Decodes region metadata. - let key_value_meta = parquet_meta.file_metadata().key_value_metadata(); - let region_meta = Arc::new(Self::get_region_metadata(&file_path, key_value_meta)?); + let parquet_meta = sst_meta.parquet_metadata(); + let region_meta = sst_meta.region_metadata(); let region_partition_expr_str = self .expected_metadata .as_ref() @@ -601,42 +599,15 @@ impl ParquetReaderBuilder { })) } - /// Decodes region metadata from key value. 
- fn get_region_metadata( - file_path: &str, - key_value_meta: Option<&Vec>, - ) -> Result { - let key_values = key_value_meta.context(InvalidParquetSnafu { - file: file_path, - reason: "missing key value meta", - })?; - let meta_value = key_values - .iter() - .find(|kv| kv.key == PARQUET_METADATA_KEY) - .with_context(|| InvalidParquetSnafu { - file: file_path, - reason: format!("key {} not found", PARQUET_METADATA_KEY), - })?; - let json = meta_value - .value - .as_ref() - .with_context(|| InvalidParquetSnafu { - file: file_path, - reason: format!("No value for key {}", PARQUET_METADATA_KEY), - })?; - - RegionMetadata::from_json(json).context(InvalidMetadataSnafu) - } - /// Reads parquet metadata of specific file. - /// Returns (metadata, cache_miss_flag). + /// Returns (fused metadata, cache_miss_flag). async fn read_parquet_metadata( &self, file_path: &str, file_size: u64, cache_metrics: &mut MetadataCacheMetrics, page_index_policy: PageIndexPolicy, - ) -> Result<(Arc, bool)> { + ) -> Result<(Arc, bool)> { let start = Instant::now(); let _t = READ_STAGE_ELAPSED .with_label_values(&["read_parquet_metadata"]) @@ -646,7 +617,7 @@ impl ParquetReaderBuilder { // Tries to get from cache with metrics tracking. if let Some(metadata) = self .cache_strategy - .get_parquet_meta_data(file_id, cache_metrics, page_index_policy) + .get_sst_meta_data(file_id, cache_metrics, page_index_policy) .await { cache_metrics.metadata_load_cost += start.elapsed(); @@ -659,10 +630,10 @@ impl ParquetReaderBuilder { metadata_loader.with_page_index_policy(page_index_policy); let metadata = metadata_loader.load(cache_metrics).await?; - let metadata = Arc::new(metadata); + let metadata = Arc::new(CachedSstMeta::try_new(file_path, metadata)?); // Cache the metadata. 
self.cache_strategy - .put_parquet_meta_data(file_id, metadata.clone()); + .put_sst_meta_data(file_id, metadata.clone()); cache_metrics.metadata_load_cost += start.elapsed(); Ok((metadata, true)) diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs index d571a5392f..0c663bccc0 100644 --- a/src/store-api/src/metadata.rs +++ b/src/store-api/src/metadata.rs @@ -18,8 +18,8 @@ use std::any::Any; use std::collections::{HashMap, HashSet}; -use std::fmt; use std::sync::Arc; +use std::{fmt, mem}; use api::v1::SemanticType; use api::v1::column_def::try_as_column_schema; @@ -99,6 +99,12 @@ impl ColumnMetadata { pub fn is_same_datatype(&self, other: &Self) -> bool { self.column_schema.data_type == other.column_schema.data_type } + + /// Returns the estimated memory footprint of this metadata. + pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) - mem::size_of_val(&self.column_schema) + + self.column_schema.estimated_size() + } } #[cfg_attr(doc, aquamarine::aquamarine)] @@ -226,6 +232,25 @@ impl RegionMetadata { serde_json::from_str(s).context(SerdeJsonSnafu) } + /// Returns the estimated memory footprint of this metadata. + pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) + + mem::size_of::() * self.column_metadatas.capacity() + + self + .column_metadatas + .iter() + .map(|column| column.estimated_size() - mem::size_of::()) + .sum::() + + mem::size_of::() * self.primary_key.capacity() + + mem::size_of::<(ColumnId, usize)>() * self.id_to_index.capacity() + + self.schema.estimated_size() + + self + .partition_expr + .as_ref() + .map(|expr| expr.capacity()) + .unwrap_or_default() + } + /// Encode the metadata to a JSON string. 
pub fn to_json(&self) -> Result { serde_json::to_string(&self).context(SerdeJsonSnafu) From 16fcbb27298f106c0142ac3c2ba7c4865da6f0f5 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Thu, 19 Mar 2026 14:26:41 -0700 Subject: [PATCH 019/195] feat: export import v2 pr1 (#7785) * feat: v2 schema handling Signed-off-by: jeremyhi * feat: impl m1.5 ddl export/import and schema tests Signed-off-by: jeremyhi * chore: git ignore update Signed-off-by: jeremyhi * chore: add license header Signed-off-by: jeremyhi * chore: make fmt-check happy Signed-off-by: jeremyhi * fix: Run imported DDL against the intended schema Signed-off-by: jeremyhi * fix: Canonicalize schema names after case-insensitive check Signed-off-by: jeremyhi * fix: escape sql funcs Signed-off-by: jeremyhi * fix: Fixed by carrying explicit execution_schema in DdlStatement instead of parsing schema from SQL Signed-off-by: jeremyhi * fix: Fixed by encoding schema names as safe path segments in shared DDL path helpers Signed-off-by: jeremyhi * refactor(cli): make export/import v2 schema recovery DDL-only Signed-off-by: jeremyhi * chore: by clippy Signed-off-by: jeremyhi * chore: follow our styling Signed-off-by: jeremyhi * fix(cli): reject remote snapshot URIs with empty root Signed-off-by: jeremyhi * fix(cli): dedupe schema filters after canonicalization Signed-off-by: jeremyhi * fix(cli): schema-scoped detection to cover external tables Signed-off-by: jeremyhi --------- Signed-off-by: jeremyhi --- .gitignore | 3 + Cargo.lock | 1 + docs/rfcs/2025-12-30-export-import-v2.md | 11 +- src/cli/Cargo.toml | 3 +- src/cli/src/data.rs | 16 + src/cli/src/data/export.rs | 10 +- src/cli/src/data/export_v2.rs | 49 ++ src/cli/src/data/export_v2/command.rs | 496 +++++++++++++++++ src/cli/src/data/export_v2/error.rs | 181 +++++++ src/cli/src/data/export_v2/extractor.rs | 254 +++++++++ src/cli/src/data/export_v2/manifest.rs | 381 +++++++++++++ src/cli/src/data/export_v2/schema.rs | 98 ++++ src/cli/src/data/export_v2/tests.rs | 341 
++++++++++++ src/cli/src/data/import.rs | 11 +- src/cli/src/data/import_v2.rs | 41 ++ src/cli/src/data/import_v2/command.rs | 542 +++++++++++++++++++ src/cli/src/data/import_v2/error.rs | 82 +++ src/cli/src/data/import_v2/executor.rs | 122 +++++ src/cli/src/data/path.rs | 76 +++ src/cli/src/data/snapshot_storage.rs | 649 +++++++++++++++++++++++ src/cli/src/data/sql.rs | 40 ++ src/cli/src/database.rs | 21 +- src/cli/src/lib.rs | 2 +- 23 files changed, 3412 insertions(+), 18 deletions(-) create mode 100644 src/cli/src/data/export_v2.rs create mode 100644 src/cli/src/data/export_v2/command.rs create mode 100644 src/cli/src/data/export_v2/error.rs create mode 100644 src/cli/src/data/export_v2/extractor.rs create mode 100644 src/cli/src/data/export_v2/manifest.rs create mode 100644 src/cli/src/data/export_v2/schema.rs create mode 100644 src/cli/src/data/export_v2/tests.rs create mode 100644 src/cli/src/data/import_v2.rs create mode 100644 src/cli/src/data/import_v2/command.rs create mode 100644 src/cli/src/data/import_v2/error.rs create mode 100644 src/cli/src/data/import_v2/executor.rs create mode 100644 src/cli/src/data/path.rs create mode 100644 src/cli/src/data/snapshot_storage.rs create mode 100644 src/cli/src/data/sql.rs diff --git a/.gitignore b/.gitignore index 862eb8c5b4..87412d570c 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,6 @@ CLAUDE.md # AGENTS.md AGENTS.md + +# local design docs +docs/specs/ diff --git a/Cargo.lock b/Cargo.lock index 605b037fc9..1b2a44d0e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1946,6 +1946,7 @@ dependencies = [ "tokio", "tracing-appender", "url", + "uuid", ] [[package]] diff --git a/docs/rfcs/2025-12-30-export-import-v2.md b/docs/rfcs/2025-12-30-export-import-v2.md index 197eb7cc9d..6bc8428300 100644 --- a/docs/rfcs/2025-12-30-export-import-v2.md +++ b/docs/rfcs/2025-12-30-export-import-v2.md @@ -67,6 +67,7 @@ snapshot-20250101/ - Self-contained (all information needed for restore) - Immutable (content never changes after 
creation) - Verifiable (checksums at file, chunk, and snapshot levels) +- Schema-only snapshots contain only `manifest.json` and `schema/`; `data/` is absent, `chunks` is empty, and later data append is rejected (use `--force` to recreate) ### Chunk @@ -116,6 +117,8 @@ greptime export create \ --schema-only \ --to s3://my-bucket/snapshots/prod-schema-only +Schema-only snapshots cannot be resumed with data; use `--force` to recreate. + # Export with specific format (default: parquet) greptime export create \ --format csv \ @@ -173,7 +176,9 @@ The manifest is a JSON file containing snapshot metadata and chunk index: - `snapshot_id`: Unique identifier (UUID) - `catalog`, `schemas`: Catalog and schema list - `time_range`: Overall time range covered +- `schema_only`: Whether the snapshot contains schema only - `chunks[]`: Array of chunk metadata +- `format`: Data format for exported files - `checksum`: Snapshot-level SHA256 checksum **Chunk metadata structure**: @@ -182,7 +187,7 @@ Each chunk entry in the manifest contains: - `id`: Chunk identifier (sequential number) - `time_range`: Start and end timestamps -- `status`: Export status (Pending, Completed, Failed) +- `status`: Export status (Pending, InProgress, Completed, Failed) - `files`: List of data files in the chunk directory - `checksum`: Chunk-level checksum for integrity verification @@ -292,9 +297,9 @@ Checksums are verified during import before data is written to the database. 
**Resume capability**: -- Manifest tracks chunk status (Pending, Completed, Failed) +- Manifest tracks chunk status (Pending, InProgress, Completed, Failed) - Export/import automatically resumes when executed on existing snapshot -- Skips completed chunks, retries failed chunks, processes pending chunks +- Skips completed chunks, retries failed/in-progress chunks, processes pending chunks - Works across process restarts - Use `--force` (export only) to delete existing snapshot and start over diff --git a/src/cli/Cargo.toml b/src/cli/Cargo.toml index 46e79efd00..1eb2736007 100644 --- a/src/cli/Cargo.toml +++ b/src/cli/Cargo.toml @@ -65,6 +65,8 @@ store-api.workspace = true table.workspace = true tokio.workspace = true tracing-appender.workspace = true +url.workspace = true +uuid.workspace = true [dev-dependencies] common-meta = { workspace = true, features = ["testing"] } @@ -72,4 +74,3 @@ common-test-util.workspace = true common-version.workspace = true serde.workspace = true tempfile.workspace = true -url.workspace = true diff --git a/src/cli/src/data.rs b/src/cli/src/data.rs index 5966040a3b..114886542e 100644 --- a/src/cli/src/data.rs +++ b/src/cli/src/data.rs @@ -13,7 +13,12 @@ // limitations under the License. mod export; +pub mod export_v2; mod import; +pub mod import_v2; +pub(crate) mod path; +pub mod snapshot_storage; +pub(crate) mod sql; mod storage_export; use clap::Subcommand; @@ -22,15 +27,24 @@ use common_error::ext::BoxedError; use crate::Tool; use crate::data::export::ExportCommand; +use crate::data::export_v2::ExportV2Command; use crate::data::import::ImportCommand; +use crate::data::import_v2::ImportV2Command; pub(crate) const COPY_PATH_PLACEHOLDER: &str = ""; /// Command for data operations including exporting data from and importing data into GreptimeDB. #[derive(Subcommand)] pub enum DataCommand { + /// Export data (V1 - legacy). Export(ExportCommand), + /// Import data (V1 - legacy). 
Import(ImportCommand), + /// Export V2 - JSON-based schema export with manifest support. + #[clap(subcommand)] + ExportV2(ExportV2Command), + /// Import V2 - Import from V2 snapshot. + ImportV2(ImportV2Command), } impl DataCommand { @@ -38,6 +52,8 @@ impl DataCommand { match self { DataCommand::Export(cmd) => cmd.build().await, DataCommand::Import(cmd) => cmd.build().await, + DataCommand::ExportV2(cmd) => cmd.build().await, + DataCommand::ImportV2(cmd) => cmd.build().await, } } } diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs index 1cdb159336..b5d547d4f3 100644 --- a/src/cli/src/data/export.rs +++ b/src/cli/src/data/export.rs @@ -107,13 +107,16 @@ pub struct ExportCommand { #[clap(long, value_parser = humantime::parse_duration)] timeout: Option, - /// The proxy server address to connect, if set, will override the system proxy. + /// The proxy server address to connect. /// - /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set. + /// If set, it overrides the system proxy unless `--no-proxy` is specified. + /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used. #[clap(long)] proxy: Option, - /// Disable proxy server, if set, will not use any proxy. + /// Disable all proxy usage (ignores `--proxy` and system proxy). + /// + /// When set and `--proxy` is not provided, this explicitly disables system proxy. #[clap(long)] no_proxy: bool, @@ -173,6 +176,7 @@ impl ExportCommand { // Treats `None` as `0s` to disable server-side default timeout. self.timeout.unwrap_or_default(), proxy, + self.no_proxy, ); Ok(Box::new(Export { diff --git a/src/cli/src/data/export_v2.rs b/src/cli/src/data/export_v2.rs new file mode 100644 index 0000000000..91020d2f2e --- /dev/null +++ b/src/cli/src/data/export_v2.rs @@ -0,0 +1,49 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Export V2 module. +//! +//! This module provides the V2 implementation of database export functionality, +//! featuring: +//! - JSON-based schema export (version-agnostic) +//! - Manifest-based snapshot management +//! - Support for multiple storage backends (S3, OSS, GCS, Azure Blob, local FS) +//! - Resume capability for interrupted exports +//! +//! # Example +//! +//! ```bash +//! # Export schema only +//! greptime cli data export-v2 create \ +//! --addr 127.0.0.1:4000 \ +//! --to file:///tmp/snapshot \ +//! --schema-only +//! +//! # Export with time range (M2) +//! greptime cli data export-v2 create \ +//! --addr 127.0.0.1:4000 \ +//! --to s3://bucket/snapshots/prod-20250101 \ +//! --start-time 2025-01-01T00:00:00Z \ +//! --end-time 2025-01-31T23:59:59Z +//! ``` + +mod command; +pub mod error; +pub mod extractor; +pub mod manifest; +pub mod schema; +pub use command::ExportV2Command; + +#[cfg(test)] +mod tests; diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs new file mode 100644 index 0000000000..341436fe0f --- /dev/null +++ b/src/cli/src/data/export_v2/command.rs @@ -0,0 +1,496 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Export V2 CLI commands. + +use std::collections::HashSet; +use std::time::Duration; + +use async_trait::async_trait; +use clap::{Parser, Subcommand}; +use common_error::ext::BoxedError; +use common_telemetry::info; +use serde_json::Value; +use snafu::{OptionExt, ResultExt}; + +use crate::Tool; +use crate::common::ObjectStoreConfig; +use crate::data::export_v2::error::{ + CannotResumeSchemaOnlySnafu, DataExportNotImplementedSnafu, DatabaseSnafu, EmptyResultSnafu, + ManifestVersionMismatchSnafu, Result, UnexpectedValueTypeSnafu, +}; +use crate::data::export_v2::extractor::SchemaExtractor; +use crate::data::export_v2::manifest::{DataFormat, MANIFEST_VERSION, Manifest}; +use crate::data::path::ddl_path_for_schema; +use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri}; +use crate::data::sql::{escape_sql_identifier, escape_sql_literal}; +use crate::database::{DatabaseClient, parse_proxy_opts}; + +/// Export V2 commands. +#[derive(Debug, Subcommand)] +pub enum ExportV2Command { + /// Create a new snapshot. + Create(ExportCreateCommand), +} + +impl ExportV2Command { + pub async fn build(&self) -> std::result::Result, BoxedError> { + match self { + ExportV2Command::Create(cmd) => cmd.build().await, + } + } +} + +/// Create a new snapshot. +#[derive(Debug, Parser)] +pub struct ExportCreateCommand { + /// Server address to connect (e.g., 127.0.0.1:4000). + #[clap(long)] + addr: String, + + /// Target storage location (e.g., s3://bucket/path, file:///tmp/backup). + #[clap(long)] + to: String, + + /// Catalog name. 
+ #[clap(long, default_value = "greptime")] + catalog: String, + + /// Schema list to export (default: all non-system schemas). + /// Can be specified multiple times or comma-separated. + #[clap(long, value_delimiter = ',')] + schemas: Vec, + + /// Export schema only, no data. + #[clap(long)] + schema_only: bool, + + /// Time range start (ISO 8601 format, e.g., 2024-01-01T00:00:00Z). + #[clap(long)] + start_time: Option, + + /// Time range end (ISO 8601 format, e.g., 2024-12-31T23:59:59Z). + #[clap(long)] + end_time: Option, + + /// Data format: parquet, csv, json. + #[clap(long, value_enum, default_value = "parquet")] + format: DataFormat, + + /// Delete existing snapshot and recreate. + #[clap(long)] + force: bool, + + /// Concurrency level (for future use). + #[clap(long, default_value = "1")] + parallelism: usize, + + /// Basic authentication (user:password). + #[clap(long)] + auth_basic: Option, + + /// Request timeout. + #[clap(long, value_parser = humantime::parse_duration)] + timeout: Option, + + /// Proxy server address. + /// + /// If set, it overrides the system proxy unless `--no-proxy` is specified. + /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used. + #[clap(long)] + proxy: Option, + + /// Disable all proxy usage (ignores `--proxy` and system proxy). + /// + /// When set and `--proxy` is not provided, this explicitly disables system proxy. + #[clap(long)] + no_proxy: bool, + + /// Object store configuration for remote storage backends. 
+ #[clap(flatten)] + storage: ObjectStoreConfig, +} + +impl ExportCreateCommand { + pub async fn build(&self) -> std::result::Result, BoxedError> { + // Validate URI format + validate_uri(&self.to).map_err(BoxedError::new)?; + + if !self.schema_only { + return DataExportNotImplementedSnafu + .fail() + .map_err(BoxedError::new); + } + + // Parse schemas (empty vec means all schemas) + let schemas = if self.schemas.is_empty() { + None + } else { + Some(self.schemas.clone()) + }; + + // Build storage + let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?; + + // Build database client + let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?; + let database_client = DatabaseClient::new( + self.addr.clone(), + self.catalog.clone(), + self.auth_basic.clone(), + self.timeout.unwrap_or(Duration::from_secs(60)), + proxy, + self.no_proxy, + ); + + Ok(Box::new(ExportCreate { + catalog: self.catalog.clone(), + schemas, + schema_only: self.schema_only, + _format: self.format, + force: self.force, + _parallelism: self.parallelism, + storage: Box::new(storage), + database_client, + })) + } +} + +/// Export tool implementation. +pub struct ExportCreate { + catalog: String, + schemas: Option>, + schema_only: bool, + _format: DataFormat, + force: bool, + _parallelism: usize, + storage: Box, + database_client: DatabaseClient, +} + +#[async_trait] +impl Tool for ExportCreate { + async fn do_work(&self) -> std::result::Result<(), BoxedError> { + self.run().await.map_err(BoxedError::new) + } +} + +impl ExportCreate { + async fn run(&self) -> Result<()> { + // 1. 
Check if snapshot exists + let exists = self.storage.exists().await?; + + if exists { + if self.force { + info!("Deleting existing snapshot (--force)"); + self.storage.delete_snapshot().await?; + } else { + // Resume mode - read existing manifest + let manifest = self.storage.read_manifest().await?; + + // Check version compatibility + if manifest.version != MANIFEST_VERSION { + return ManifestVersionMismatchSnafu { + expected: MANIFEST_VERSION, + found: manifest.version, + } + .fail(); + } + + // Cannot resume schema-only with data export + if manifest.schema_only && !self.schema_only { + return CannotResumeSchemaOnlySnafu.fail(); + } + + info!( + "Resuming existing snapshot: {} (completed: {}/{} chunks)", + manifest.snapshot_id, + manifest.completed_count(), + manifest.chunks.len() + ); + + // For M1, we only handle schema-only exports + // M2 will add chunk resume logic + if manifest.is_complete() { + info!("Snapshot is already complete"); + return Ok(()); + } + + // TODO: Resume data export in M2 + info!("Data export resume not yet implemented (M2)"); + return Ok(()); + } + } + + // 2. Get schema list + let extractor = SchemaExtractor::new(&self.database_client, &self.catalog); + let schema_snapshot = extractor.extract(self.schemas.as_deref()).await?; + + let schema_names: Vec = schema_snapshot + .schemas + .iter() + .map(|s| s.name.clone()) + .collect(); + info!("Exporting schemas: {:?}", schema_names); + + // 3. Create manifest + let manifest = Manifest::new_schema_only(self.catalog.clone(), schema_names.clone()); + + // 4. Write schema files + self.storage.write_schema(&schema_snapshot).await?; + info!("Exported {} schemas", schema_snapshot.schemas.len()); + + // 5. Export DDL files for import recovery. 
+ let ddl_by_schema = self.build_ddl_by_schema(&schema_names).await?; + for (schema, ddl) in ddl_by_schema { + let ddl_path = ddl_path_for_schema(&schema); + self.storage.write_text(&ddl_path, &ddl).await?; + info!("Exported DDL for schema {} to {}", schema, ddl_path); + } + + // 6. Write manifest last. + // + // The manifest is the snapshot commit point: only write it after the schema + // index and all DDL files are durable, so a crash cannot leave a "valid" + // snapshot that is missing required schema artifacts. + self.storage.write_manifest(&manifest).await?; + info!("Snapshot created: {}", manifest.snapshot_id); + + Ok(()) + } + + async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result> { + let mut schemas = schema_names.to_vec(); + schemas.sort(); + + let mut ddl_by_schema = Vec::with_capacity(schemas.len()); + for schema in schemas { + let create_database = self.show_create("DATABASE", &schema, None).await?; + + let (mut physical_tables, mut tables, mut views) = + self.get_schema_objects(&schema).await?; + physical_tables.sort(); + let mut physical_ddls = Vec::with_capacity(physical_tables.len()); + for table in physical_tables { + physical_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?); + } + + tables.sort(); + let mut table_ddls = Vec::with_capacity(tables.len()); + for table in tables { + table_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?); + } + + views.sort(); + let mut view_ddls = Vec::with_capacity(views.len()); + for view in views { + view_ddls.push(self.show_create("VIEW", &schema, Some(&view)).await?); + } + + let ddl = build_schema_ddl( + &schema, + create_database, + physical_ddls, + table_ddls, + view_ddls, + ); + ddl_by_schema.push((schema, ddl)); + } + + Ok(ddl_by_schema) + } + + async fn get_schema_objects( + &self, + schema: &str, + ) -> Result<(Vec, Vec, Vec)> { + let physical_tables = self.get_metric_physical_tables(schema).await?; + let physical_set: HashSet<&str> = 
physical_tables.iter().map(String::as_str).collect(); + let sql = format!( + "SELECT table_name, table_type FROM information_schema.tables \ + WHERE table_catalog = '{}' AND table_schema = '{}' \ + AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')", + escape_sql_literal(&self.catalog), + escape_sql_literal(schema) + ); + let records: Option>> = self + .database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + + let mut tables = Vec::new(); + let mut views = Vec::new(); + if let Some(rows) = records { + for row in rows { + let name = match row.first() { + Some(Value::String(name)) => name.clone(), + _ => return UnexpectedValueTypeSnafu.fail(), + }; + let table_type = match row.get(1) { + Some(Value::String(table_type)) => table_type.as_str(), + _ => return UnexpectedValueTypeSnafu.fail(), + }; + if !physical_set.contains(name.as_str()) { + if table_type == "VIEW" { + views.push(name); + } else { + tables.push(name); + } + } + } + } + + Ok((physical_tables, tables, views)) + } + + async fn get_metric_physical_tables(&self, schema: &str) -> Result> { + let sql = format!( + "SELECT DISTINCT table_name FROM information_schema.columns \ + WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'", + escape_sql_literal(&self.catalog), + escape_sql_literal(schema) + ); + let records: Option>> = self + .database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + + let mut tables = HashSet::new(); + if let Some(rows) = records { + for row in rows { + let name = match row.first() { + Some(Value::String(name)) => name.clone(), + _ => return UnexpectedValueTypeSnafu.fail(), + }; + tables.insert(name); + } + } + + Ok(tables.into_iter().collect()) + } + + async fn show_create( + &self, + show_type: &str, + schema: &str, + table: Option<&str>, + ) -> Result { + let sql = match table { + Some(table) => format!( + r#"SHOW CREATE {} "{}"."{}"."{}""#, + show_type, + escape_sql_identifier(&self.catalog), + 
escape_sql_identifier(schema), + escape_sql_identifier(table) + ), + None => format!( + r#"SHOW CREATE {} "{}"."{}""#, + show_type, + escape_sql_identifier(&self.catalog), + escape_sql_identifier(schema) + ), + }; + + let records: Option>> = self + .database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + let rows = records.context(EmptyResultSnafu)?; + let row = rows.first().context(EmptyResultSnafu)?; + let Some(Value::String(create)) = row.get(1) else { + return UnexpectedValueTypeSnafu.fail(); + }; + + Ok(format!("{};\n", create)) + } +} + +fn build_schema_ddl( + schema: &str, + create_database: String, + physical_tables: Vec, + tables: Vec, + views: Vec, +) -> String { + let mut ddl = String::new(); + ddl.push_str(&format!("-- Schema: {}\n", schema)); + ddl.push_str(&create_database); + for stmt in physical_tables { + ddl.push_str(&stmt); + } + for stmt in tables { + ddl.push_str(&stmt); + } + for stmt in views { + ddl.push_str(&stmt); + } + ddl.push('\n'); + ddl +} + +#[cfg(test)] +mod tests { + use clap::Parser; + + use super::*; + use crate::data::path::ddl_path_for_schema; + + #[test] + fn test_ddl_path_for_schema() { + assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql"); + assert_eq!( + ddl_path_for_schema("../evil"), + "schema/ddl/%2E%2E%2Fevil.sql" + ); + } + + #[test] + fn test_build_schema_ddl_order() { + let ddl = build_schema_ddl( + "public", + "CREATE DATABASE public;\n".to_string(), + vec!["PHYSICAL;\n".to_string()], + vec!["TABLE;\n".to_string()], + vec!["VIEW;\n".to_string()], + ); + + let db_pos = ddl.find("CREATE DATABASE").unwrap(); + let physical_pos = ddl.find("PHYSICAL;").unwrap(); + let table_pos = ddl.find("TABLE;").unwrap(); + let view_pos = ddl.find("VIEW;").unwrap(); + assert!(db_pos < physical_pos); + assert!(physical_pos < table_pos); + assert!(table_pos < view_pos); + } + + #[tokio::test] + async fn test_build_rejects_non_schema_only_export() { + let cmd = ExportCreateCommand::parse_from([ + 
"export-v2-create", + "--addr", + "127.0.0.1:4000", + "--to", + "file:///tmp/export-v2-test", + ]); + + let result = cmd.build().await; + assert!(result.is_err()); + let error = result.err().unwrap().to_string(); + + assert!(error.contains("Data export is not implemented yet")); + } +} diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs new file mode 100644 index 0000000000..2db71d5326 --- /dev/null +++ b/src/cli/src/data/export_v2/error.rs @@ -0,0 +1,181 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; + +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use snafu::{Location, Snafu}; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display("Invalid URI '{}': {}", uri, reason))] + InvalidUri { + uri: String, + reason: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unsupported storage scheme: {}", scheme))] + UnsupportedScheme { + scheme: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Storage operation '{}' failed", operation))] + StorageOperation { + operation: String, + #[snafu(source)] + error: object_store::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse manifest"))] + ManifestParse { + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to serialize manifest"))] + ManifestSerialize { + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to decode text file as UTF-8"))] + TextDecode { + #[snafu(source)] + error: std::string::FromUtf8Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Cannot resume schema-only snapshot with data export. Use --force to recreate." + ))] + CannotResumeSchemaOnly { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Data export is not implemented yet. Use --schema-only to create a schema snapshot." 
+ ))] + DataExportNotImplemented { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Empty result from query"))] + EmptyResult { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unexpected value type in query result"))] + UnexpectedValueType { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Database error"))] + Database { + #[snafu(source)] + error: crate::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Snapshot not found at '{}'", uri))] + SnapshotNotFound { + uri: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Schema '{}' not found in catalog '{}'", schema, catalog))] + SchemaNotFound { + catalog: String, + schema: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse URL"))] + UrlParse { + #[snafu(source)] + error: url::ParseError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to build object store"))] + BuildObjectStore { + #[snafu(source)] + error: object_store::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))] + ManifestVersionMismatch { + expected: u32, + found: u32, + #[snafu(implicit)] + location: Location, + }, +} + +pub type Result = std::result::Result; + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + match self { + Error::InvalidUri { .. } + | Error::UnsupportedScheme { .. } + | Error::CannotResumeSchemaOnly { .. } + | Error::DataExportNotImplemented { .. } + | Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments, + + Error::StorageOperation { .. } + | Error::ManifestParse { .. } + | Error::ManifestSerialize { .. } + | Error::TextDecode { .. } + | Error::BuildObjectStore { .. } => StatusCode::StorageUnavailable, + + Error::EmptyResult { .. } + | Error::UnexpectedValueType { .. } + | Error::UrlParse { .. 
} => StatusCode::Internal, + + Error::Database { error, .. } => error.status_code(), + + Error::SnapshotNotFound { .. } => StatusCode::InvalidArguments, + Error::SchemaNotFound { .. } => StatusCode::DatabaseNotFound, + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/cli/src/data/export_v2/extractor.rs b/src/cli/src/data/export_v2/extractor.rs new file mode 100644 index 0000000000..ae15b199af --- /dev/null +++ b/src/cli/src/data/export_v2/extractor.rs @@ -0,0 +1,254 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Schema extraction from information_schema. +//! +//! For V2 DDL-only snapshots, extractor only persists the schema index. + +use std::collections::{HashMap, HashSet}; + +use serde_json::Value; +use snafu::ResultExt; + +use crate::data::export_v2::error::{ + DatabaseSnafu, EmptyResultSnafu, Result, SchemaNotFoundSnafu, UnexpectedValueTypeSnafu, +}; +use crate::data::export_v2::schema::{SchemaDefinition, SchemaSnapshot}; +use crate::data::sql::escape_sql_literal; +use crate::database::DatabaseClient; + +/// System schemas that should be excluded from export. +const SYSTEM_SCHEMAS: &[&str] = &["information_schema", "pg_catalog"]; + +/// Extracts schema definitions from information_schema. +pub struct SchemaExtractor<'a> { + client: &'a DatabaseClient, + catalog: &'a str, +} + +impl<'a> SchemaExtractor<'a> { + /// Creates a new schema extractor. 
+ pub fn new(client: &'a DatabaseClient, catalog: &'a str) -> Self { + Self { client, catalog } + } + + /// Extracts the schema index for the given schemas. + /// + /// If `schemas` is None, extracts all non-system schemas. + pub async fn extract(&self, schemas: Option<&[String]>) -> Result { + let mut snapshot = SchemaSnapshot::new(); + + let schema_names = match schemas { + Some(names) => self.validate_schemas(names).await?, + None => self.get_all_schemas().await?, + }; + + for schema_name in &schema_names { + let schema_def = self.extract_schema_definition(schema_name).await?; + snapshot.add_schema(schema_def); + } + + Ok(snapshot) + } + + /// Gets all non-system schemas in the catalog. + async fn get_all_schemas(&self) -> Result> { + let sql = format!( + "SELECT schema_name FROM information_schema.schemata \ + WHERE catalog_name = '{}'", + escape_sql_literal(self.catalog) + ); + + let records = self.query(&sql).await?; + let mut schemas = Vec::new(); + + for row in records { + let name = extract_string(&row, 0)?; + if !SYSTEM_SCHEMAS.contains(&name.as_str()) { + schemas.push(name); + } + } + + Ok(schemas) + } + + /// Validates that all specified schemas exist. + async fn validate_schemas(&self, schemas: &[String]) -> Result> { + let all_schemas = self.get_all_schemas().await?; + dedupe_canonicalized_schemas(schemas, &all_schemas, self.catalog) + } + + /// Extracts schema (database) definition. 
+ async fn extract_schema_definition(&self, schema: &str) -> Result { + let sql = format!( + "SELECT schema_name, options FROM information_schema.schemata \ + WHERE catalog_name = '{}' AND schema_name = '{}'", + escape_sql_literal(self.catalog), + escape_sql_literal(schema) + ); + + let records = self.query(&sql).await?; + if records.is_empty() { + return SchemaNotFoundSnafu { + catalog: self.catalog, + schema, + } + .fail(); + } + + let name = extract_string(&records[0], 0)?; + let options = extract_optional_string(&records[0], 1) + .map(|opts| parse_options(&opts)) + .unwrap_or_default(); + + Ok(SchemaDefinition { + catalog: self.catalog.to_string(), + name, + options, + }) + } + + /// Executes a SQL query and returns the results. + async fn query(&self, sql: &str) -> Result>> { + self.client + .sql_in_public(sql) + .await + .context(DatabaseSnafu)? + .ok_or_else(|| EmptyResultSnafu.build()) + } +} + +/// Extracts a string value from a row. +fn extract_string(row: &[Value], index: usize) -> Result { + match row.get(index) { + Some(Value::String(s)) => Ok(s.clone()), + Some(Value::Null) => UnexpectedValueTypeSnafu.fail(), + _ => UnexpectedValueTypeSnafu.fail(), + } +} + +/// Extracts an optional string value from a row. +fn extract_optional_string(row: &[Value], index: usize) -> Option { + match row.get(index) { + Some(Value::String(s)) if !s.is_empty() => Some(s.clone()), + _ => None, + } +} + +/// Parses options string into a HashMap. 
+fn parse_options(options_str: &str) -> HashMap { + if let Ok(map) = serde_json::from_str::>(options_str) { + return map; + } + + let mut options = HashMap::new(); + for line in options_str.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + + if let Some((key, value)) = parse_quoted_option_line(trimmed) { + options.insert(key, value); + continue; + } + + for part in trimmed.split_whitespace() { + if let Some((key, value)) = part.split_once('=') { + options.insert(key.to_string(), value.to_string()); + } + } + } + options +} + +fn parse_quoted_option_line(line: &str) -> Option<(String, String)> { + let key = line.strip_prefix('\'')?; + let (key, rest) = key.split_once("'='")?; + let value = rest.strip_suffix('\'')?; + Some((key.to_string(), value.to_string())) +} + +fn dedupe_canonicalized_schemas( + requested: &[String], + available: &[String], + catalog: &str, +) -> Result> { + let mut canonicalized = Vec::new(); + let mut seen = HashSet::new(); + + for schema in requested { + let Some(canonical) = available.iter().find(|s| s.eq_ignore_ascii_case(schema)) else { + return SchemaNotFoundSnafu { catalog, schema }.fail(); + }; + + if seen.insert(canonical.to_ascii_lowercase()) { + canonicalized.push(canonical.clone()); + } + } + + Ok(canonicalized) +} + +#[cfg(test)] +mod tests { + use serde_json::Value; + + use super::*; + + #[test] + fn test_parse_options_json() { + let opts = r#"{"ttl": "30d", "custom": "value"}"#; + let parsed = parse_options(opts); + assert_eq!(parsed.get("ttl"), Some(&"30d".to_string())); + assert_eq!(parsed.get("custom"), Some(&"value".to_string())); + } + + #[test] + fn test_parse_options_key_value() { + let opts = "ttl=30d custom=value"; + let parsed = parse_options(opts); + assert_eq!(parsed.get("ttl"), Some(&"30d".to_string())); + assert_eq!(parsed.get("custom"), Some(&"value".to_string())); + } + + #[test] + fn test_parse_options_schema_display_format() { + let opts = "'ttl'='30d'\n'custom'='value with 
spaces'\n"; + let parsed = parse_options(opts); + assert_eq!(parsed.get("ttl"), Some(&"30d".to_string())); + assert_eq!(parsed.get("custom"), Some(&"value with spaces".to_string())); + } + + #[test] + fn test_extract_string_rejects_null() { + let row = vec![Value::Null]; + assert!(extract_string(&row, 0).is_err()); + } + + #[test] + fn test_dedupe_canonicalized_schemas() { + let available = vec!["public".to_string(), "test_db".to_string()]; + let requested = vec![ + "PUBLIC".to_string(), + "public".to_string(), + "Test_Db".to_string(), + ]; + + let canonicalized = dedupe_canonicalized_schemas(&requested, &available, "greptime") + .expect("schemas should be canonicalized"); + + assert_eq!(canonicalized, vec!["public", "test_db"]); + } +} diff --git a/src/cli/src/data/export_v2/manifest.rs b/src/cli/src/data/export_v2/manifest.rs new file mode 100644 index 0000000000..0ebf753fa4 --- /dev/null +++ b/src/cli/src/data/export_v2/manifest.rs @@ -0,0 +1,381 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Manifest data structures for Export/Import V2. + +use std::{fmt, str}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Current manifest format version. +pub const MANIFEST_VERSION: u32 = 1; + +/// Manifest file name within snapshot directory. +pub const MANIFEST_FILE: &str = "manifest.json"; + +/// Time range for data export (half-open interval: [start, end)). 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct TimeRange { + /// Start time (inclusive). None means earliest available data. + #[serde(skip_serializing_if = "Option::is_none")] + pub start: Option>, + /// End time (exclusive). None means current time. + #[serde(skip_serializing_if = "Option::is_none")] + pub end: Option>, +} + +impl TimeRange { + /// Creates a new time range with specified bounds. + pub fn new(start: Option>, end: Option>) -> Self { + Self { start, end } + } + + /// Creates an unbounded time range (all data). + pub fn unbounded() -> Self { + Self { + start: None, + end: None, + } + } + + /// Returns true if this time range is unbounded. + pub fn is_unbounded(&self) -> bool { + self.start.is_none() && self.end.is_none() + } +} + +impl Default for TimeRange { + fn default() -> Self { + Self::unbounded() + } +} + +/// Status of a chunk during export/import. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum ChunkStatus { + /// Chunk is pending export. + #[default] + Pending, + /// Chunk export is in progress. + InProgress, + /// Chunk export completed successfully. + Completed, + /// Chunk export failed. + Failed, +} + +/// Metadata for a single chunk of exported data. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChunkMeta { + /// Chunk identifier (sequential number starting from 1). + pub id: u32, + /// Time range covered by this chunk. + pub time_range: TimeRange, + /// Export status. + pub status: ChunkStatus, + /// List of data files in this chunk (relative paths from snapshot root). + #[serde(default)] + pub files: Vec, + /// SHA256 checksum of all files in this chunk (aggregated). + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum: Option, + /// Error message if status is Failed. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +impl ChunkMeta { + /// Creates a new pending chunk with the given id and time range. + pub fn new(id: u32, time_range: TimeRange) -> Self { + Self { + id, + time_range, + status: ChunkStatus::Pending, + files: vec![], + checksum: None, + error: None, + } + } + + /// Marks this chunk as in progress. + pub fn mark_in_progress(&mut self) { + self.status = ChunkStatus::InProgress; + self.error = None; + } + + /// Marks this chunk as completed with the given files and checksum. + pub fn mark_completed(&mut self, files: Vec, checksum: Option) { + self.status = ChunkStatus::Completed; + self.files = files; + self.checksum = checksum; + self.error = None; + } + + /// Marks this chunk as failed with the given error message. + pub fn mark_failed(&mut self, error: String) { + self.status = ChunkStatus::Failed; + self.error = Some(error); + } +} + +/// Supported data formats for export. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default, clap::ValueEnum)] +#[serde(rename_all = "lowercase")] +#[value(rename_all = "lowercase")] +pub enum DataFormat { + /// Apache Parquet format (default, recommended for production). + #[default] + Parquet, + /// CSV format (human-readable). + Csv, + /// JSON format (structured text). + Json, +} + +impl fmt::Display for DataFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DataFormat::Parquet => write!(f, "parquet"), + DataFormat::Csv => write!(f, "csv"), + DataFormat::Json => write!(f, "json"), + } + } +} + +impl str::FromStr for DataFormat { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "parquet" => Ok(DataFormat::Parquet), + "csv" => Ok(DataFormat::Csv), + "json" => Ok(DataFormat::Json), + _ => Err(format!( + "invalid format '{}': expected one of parquet, csv, json", + s + )), + } + } +} + +/// Snapshot manifest containing all metadata. 
+/// +/// The manifest is stored as `manifest.json` in the snapshot root directory. +/// It contains: +/// - Snapshot identification (UUID, timestamps) +/// - Scope (catalog, schemas, time range) +/// - Export configuration (format, schema_only) +/// - Chunk metadata for resume support +/// - Integrity checksums +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Manifest { + /// Manifest format version for compatibility checking. + pub version: u32, + /// Unique snapshot identifier. + pub snapshot_id: Uuid, + /// Catalog name. + pub catalog: String, + /// List of schemas included in this snapshot. + pub schemas: Vec, + /// Overall time range covered by this snapshot. + pub time_range: TimeRange, + /// Whether this is a schema-only snapshot (no data). + pub schema_only: bool, + /// Data format used for export. + pub format: DataFormat, + /// Chunk metadata (empty for schema-only snapshots). + #[serde(default)] + pub chunks: Vec, + /// Snapshot-level SHA256 checksum (aggregated from all chunks). + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum: Option, + /// Creation timestamp. + pub created_at: DateTime, + /// Last updated timestamp. + pub updated_at: DateTime, +} + +impl Manifest { + /// Creates a new manifest for schema-only export. + pub fn new_schema_only(catalog: String, schemas: Vec) -> Self { + let now = Utc::now(); + Self { + version: MANIFEST_VERSION, + snapshot_id: Uuid::new_v4(), + catalog, + schemas, + time_range: TimeRange::unbounded(), + schema_only: true, + format: DataFormat::Parquet, + chunks: vec![], + checksum: None, + created_at: now, + updated_at: now, + } + } + + /// Creates a new manifest for full export with time range and format. 
+ pub fn new_full( + catalog: String, + schemas: Vec, + time_range: TimeRange, + format: DataFormat, + ) -> Self { + let now = Utc::now(); + Self { + version: MANIFEST_VERSION, + snapshot_id: Uuid::new_v4(), + catalog, + schemas, + time_range, + schema_only: false, + format, + chunks: vec![], + checksum: None, + created_at: now, + updated_at: now, + } + } + + /// Returns true if all chunks are completed (or if schema-only). + pub fn is_complete(&self) -> bool { + self.schema_only + || (!self.chunks.is_empty() + && self + .chunks + .iter() + .all(|c| c.status == ChunkStatus::Completed)) + } + + /// Returns the number of pending chunks. + pub fn pending_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Pending) + .count() + } + + /// Returns the number of in-progress chunks. + pub fn in_progress_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::InProgress) + .count() + } + + /// Returns the number of completed chunks. + pub fn completed_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Completed) + .count() + } + + /// Returns the number of failed chunks. + pub fn failed_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Failed) + .count() + } + + /// Updates the `updated_at` timestamp to now. + pub fn touch(&mut self) { + self.updated_at = Utc::now(); + } + + /// Adds a chunk to the manifest. + pub fn add_chunk(&mut self, chunk: ChunkMeta) { + self.chunks.push(chunk); + self.touch(); + } + + /// Updates a chunk by id. 
+ pub fn update_chunk(&mut self, id: u32, updater: impl FnOnce(&mut ChunkMeta)) { + if let Some(chunk) = self.chunks.iter_mut().find(|c| c.id == id) { + updater(chunk); + self.touch(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_time_range_serialization() { + let range = TimeRange::unbounded(); + let json = serde_json::to_string(&range).unwrap(); + assert_eq!(json, "{}"); + + let range: TimeRange = serde_json::from_str("{}").unwrap(); + assert!(range.is_unbounded()); + } + + #[test] + fn test_manifest_schema_only() { + let manifest = + Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]); + + assert_eq!(manifest.version, MANIFEST_VERSION); + assert!(manifest.schema_only); + assert!(manifest.chunks.is_empty()); + assert!(manifest.is_complete()); + } + + #[test] + fn test_manifest_full() { + let manifest = Manifest::new_full( + "greptime".to_string(), + vec!["public".to_string()], + TimeRange::unbounded(), + DataFormat::Parquet, + ); + + assert!(!manifest.schema_only); + assert!(manifest.chunks.is_empty()); + assert!(!manifest.is_complete()); + } + + #[test] + fn test_data_format_parsing() { + assert_eq!( + "parquet".parse::().unwrap(), + DataFormat::Parquet + ); + assert_eq!("CSV".parse::().unwrap(), DataFormat::Csv); + assert_eq!("JSON".parse::().unwrap(), DataFormat::Json); + assert!("invalid".parse::().is_err()); + } + + #[test] + fn test_chunk_status_transitions() { + let mut chunk = ChunkMeta::new(1, TimeRange::unbounded()); + assert_eq!(chunk.status, ChunkStatus::Pending); + + chunk.mark_in_progress(); + assert_eq!(chunk.status, ChunkStatus::InProgress); + + chunk.mark_completed( + vec!["file1.parquet".to_string()], + Some("abc123".to_string()), + ); + assert_eq!(chunk.status, ChunkStatus::Completed); + assert_eq!(chunk.files.len(), 1); + } +} diff --git a/src/cli/src/data/export_v2/schema.rs b/src/cli/src/data/export_v2/schema.rs new file mode 100644 index 0000000000..1aab6ac900 --- /dev/null +++ 
b/src/cli/src/data/export_v2/schema.rs @@ -0,0 +1,98 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Minimal schema index structures for Export/Import V2. +//! +//! The canonical schema representation is the per-schema DDL file under +//! `schema/ddl/`. `schemas.json` only records which schemas exist in a snapshot. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +/// Schema directory name within snapshot. +pub const SCHEMA_DIR: &str = "schema"; + +/// DDL directory name within schema directory. +pub const DDL_DIR: &str = "ddl"; + +/// Schema definition file name. +pub const SCHEMAS_FILE: &str = "schemas.json"; + +/// Schema (database) definition. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct SchemaDefinition { + /// Catalog name. + pub catalog: String, + /// Schema (database) name. + pub name: String, + /// Schema options (if any). + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + pub options: HashMap, +} + +/// Minimal schema index stored in a snapshot. +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct SchemaSnapshot { + /// Schema (database) definitions. + pub schemas: Vec, +} + +impl SchemaSnapshot { + /// Creates an empty schema snapshot. + pub fn new() -> Self { + Self::default() + } + + /// Adds a schema definition. 
+ pub fn add_schema(&mut self, schema: SchemaDefinition) { + self.schemas.push(schema); + } + + /// Filters the snapshot to only include specified schemas. + pub fn filter_schemas(&self, schemas: &[String]) -> Self { + Self { + schemas: self + .schemas + .iter() + .filter(|s| schemas.contains(&s.name)) + .cloned() + .collect(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_schema_snapshot_filter() { + let mut snapshot = SchemaSnapshot::new(); + snapshot.add_schema(SchemaDefinition { + catalog: "greptime".to_string(), + name: "public".to_string(), + options: HashMap::new(), + }); + snapshot.add_schema(SchemaDefinition { + catalog: "greptime".to_string(), + name: "private".to_string(), + options: HashMap::new(), + }); + + let filtered = snapshot.filter_schemas(&["public".to_string()]); + assert_eq!(filtered.schemas.len(), 1); + assert_eq!(filtered.schemas[0].name, "public"); + } +} diff --git a/src/cli/src/data/export_v2/tests.rs b/src/cli/src/data/export_v2/tests.rs new file mode 100644 index 0000000000..bd28801a0d --- /dev/null +++ b/src/cli/src/data/export_v2/tests.rs @@ -0,0 +1,341 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::env; +use std::time::Duration; + +use clap::Parser; +use common_error::ext::BoxedError; +use snafu::ResultExt; +use tempfile::tempdir; +use url::Url; + +use super::command::ExportCreateCommand; +use crate::common::ObjectStoreConfig; +use crate::data::import_v2::ImportV2Command; +use crate::data::snapshot_storage::OpenDalStorage; +use crate::database::DatabaseClient; +use crate::error::{FileIoSnafu, InvalidArgumentsSnafu, OtherSnafu, Result}; + +#[tokio::test] +#[ignore] +async fn export_import_v2_schema_parity_e2e() -> Result<()> { + let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string()); + let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string()); + let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok(); + let schema = "test_db_schema_parity"; + + let database_client = DatabaseClient::new( + addr.clone(), + catalog.clone(), + auth_basic.clone(), + Duration::from_secs(60), + None, + false, + ); + + database_client + .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) + .await?; + database_client + .sql_in_public(&format!("CREATE DATABASE {schema}")) + .await?; + database_client + .sql( + "CREATE TABLE metrics (\ + ts TIMESTAMP TIME INDEX, \ + host STRING PRIMARY KEY, \ + cpu DOUBLE DEFAULT 0.0, \ + region_name STRING \ + ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')", + schema, + ) + .await?; + database_client + .sql( + "CREATE TABLE logs (\ + ts TIMESTAMP TIME INDEX, \ + app STRING PRIMARY KEY, \ + msg STRING NOT NULL COMMENT 'log message' \ + ) ENGINE = mito", + schema, + ) + .await?; + database_client + .sql( + "CREATE TABLE metrics_physical (\ + ts TIMESTAMP TIME INDEX, \ + host STRING, \ + region_name STRING, \ + cpu DOUBLE DEFAULT 0.0, \ + PRIMARY KEY (host, region_name) \ + ) ENGINE = metric WITH (physical_metric_table='true')", + schema, + ) + .await?; + database_client + .sql( + "CREATE TABLE metrics_logical (\ + ts TIMESTAMP TIME INDEX, \ + host STRING, \ + 
region_name STRING, \ + cpu DOUBLE DEFAULT 0.0, \ + PRIMARY KEY (host, region_name) \ + ) ENGINE = metric WITH (on_physical_table='metrics_physical')", + schema, + ) + .await?; + database_client + .sql( + "CREATE VIEW metrics_view AS SELECT * FROM metrics WHERE cpu > 0.5", + schema, + ) + .await?; + + let src_dir = tempdir().context(FileIoSnafu)?; + let src_uri = Url::from_directory_path(src_dir.path()) + .map_err(|_| { + InvalidArgumentsSnafu { + msg: "invalid temp dir path".to_string(), + } + .build() + })? + .to_string(); + + let mut export_args = vec![ + "export-v2-create", + "--addr", + &addr, + "--to", + &src_uri, + "--catalog", + &catalog, + "--schemas", + schema, + "--schema-only", + ]; + if let Some(auth) = &auth_basic { + export_args.push("--auth-basic"); + export_args.push(auth); + } + let export_cmd = ExportCreateCommand::parse_from(export_args); + export_cmd + .build() + .await + .context(OtherSnafu)? + .do_work() + .await + .context(OtherSnafu)?; + + database_client + .sql_in_public(&format!("DROP DATABASE {schema}")) + .await?; + + let mut import_args = vec![ + "import-v2", + "--addr", + &addr, + "--from", + &src_uri, + "--catalog", + &catalog, + "--schemas", + schema, + ]; + if let Some(auth) = &auth_basic { + import_args.push("--auth-basic"); + import_args.push(auth); + } + let import_cmd = ImportV2Command::parse_from(import_args); + import_cmd + .build() + .await + .context(OtherSnafu)? + .do_work() + .await + .context(OtherSnafu)?; + + let dst_dir = tempdir().context(FileIoSnafu)?; + let dst_uri = Url::from_directory_path(dst_dir.path()) + .map_err(|_| { + InvalidArgumentsSnafu { + msg: "invalid temp dir path".to_string(), + } + .build() + })? 
+ .to_string(); + + let mut export_args = vec![ + "export-v2-create", + "--addr", + &addr, + "--to", + &dst_uri, + "--catalog", + &catalog, + "--schemas", + schema, + "--schema-only", + ]; + if let Some(auth) = &auth_basic { + export_args.push("--auth-basic"); + export_args.push(auth); + } + let export_cmd = ExportCreateCommand::parse_from(export_args); + export_cmd + .build() + .await + .context(OtherSnafu)? + .do_work() + .await + .context(OtherSnafu)?; + + let storage_config = ObjectStoreConfig::default(); + let src_storage = OpenDalStorage::from_uri(&src_uri, &storage_config) + .map_err(BoxedError::new) + .context(OtherSnafu)?; + let dst_storage = OpenDalStorage::from_uri(&dst_uri, &storage_config) + .map_err(BoxedError::new) + .context(OtherSnafu)?; + + let src_schema_snapshot = src_storage + .read_schema() + .await + .map_err(BoxedError::new) + .context(OtherSnafu)?; + let dst_schema_snapshot = dst_storage + .read_schema() + .await + .map_err(BoxedError::new) + .context(OtherSnafu)?; + assert_eq!(src_schema_snapshot, dst_schema_snapshot); + + database_client + .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) + .await?; + + Ok(()) +} + +#[tokio::test] +#[ignore] +async fn import_v2_ddl_dry_run_e2e() -> Result<()> { + let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string()); + let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string()); + let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok(); + let schema = "test_db_ddl_dry_run"; + + let database_client = DatabaseClient::new( + addr.clone(), + catalog.clone(), + auth_basic.clone(), + Duration::from_secs(60), + None, + false, + ); + + database_client + .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) + .await?; + database_client + .sql_in_public(&format!("CREATE DATABASE {schema}")) + .await?; + database_client + .sql( + "CREATE TABLE metrics (\ + ts TIMESTAMP TIME INDEX, \ + host STRING PRIMARY KEY, \ + cpu DOUBLE DEFAULT 0.0, \ + 
region_name STRING \ + ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')", + schema, + ) + .await?; + database_client + .sql( + "CREATE TABLE logs (\ + ts TIMESTAMP TIME INDEX, \ + app STRING PRIMARY KEY, \ + msg STRING NOT NULL COMMENT 'log message' \ + ) ENGINE = mito", + schema, + ) + .await?; + + let src_dir = tempdir().context(FileIoSnafu)?; + let src_uri = Url::from_directory_path(src_dir.path()) + .map_err(|_| { + InvalidArgumentsSnafu { + msg: "invalid temp dir path".to_string(), + } + .build() + })? + .to_string(); + + let mut export_args = vec![ + "export-v2-create", + "--addr", + &addr, + "--to", + &src_uri, + "--catalog", + &catalog, + "--schemas", + schema, + "--schema-only", + ]; + if let Some(auth) = &auth_basic { + export_args.push("--auth-basic"); + export_args.push(auth); + } + let export_cmd = ExportCreateCommand::parse_from(export_args); + export_cmd + .build() + .await + .context(OtherSnafu)? + .do_work() + .await + .context(OtherSnafu)?; + + let mut import_args = vec![ + "import-v2", + "--addr", + &addr, + "--from", + &src_uri, + "--catalog", + &catalog, + "--schemas", + schema, + "--dry-run", + ]; + if let Some(auth) = &auth_basic { + import_args.push("--auth-basic"); + import_args.push(auth); + } + let import_cmd = ImportV2Command::parse_from(import_args); + import_cmd + .build() + .await + .context(OtherSnafu)? + .do_work() + .await + .context(OtherSnafu)?; + + database_client + .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) + .await?; + + Ok(()) +} diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs index ffe8b62c7e..f5c234f1a7 100644 --- a/src/cli/src/data/import.rs +++ b/src/cli/src/data/import.rs @@ -81,13 +81,16 @@ pub struct ImportCommand { #[clap(long, value_parser = humantime::parse_duration)] timeout: Option, - /// The proxy server address to connect, if set, will override the system proxy. + /// The proxy server address to connect. 
/// - /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set. + /// If set, it overrides the system proxy unless `--no-proxy` is specified. + /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used. #[clap(long)] proxy: Option, - /// Disable proxy server, if set, will not use any proxy. + /// Disable all proxy usage (ignores `--proxy` and system proxy). + /// + /// When set and `--proxy` is not provided, this explicitly disables system proxy. #[clap(long, default_value = "false")] no_proxy: bool, } @@ -104,6 +107,7 @@ impl ImportCommand { // Treats `None` as `0s` to disable server-side default timeout. self.timeout.unwrap_or_default(), proxy, + self.no_proxy, ); Ok(Box::new(Import { @@ -314,6 +318,7 @@ mod tests { None, Duration::from_secs(0), None, + false, ), input_dir: input_dir.to_string(), parallelism: 1, diff --git a/src/cli/src/data/import_v2.rs b/src/cli/src/data/import_v2.rs new file mode 100644 index 0000000000..772e18cc93 --- /dev/null +++ b/src/cli/src/data/import_v2.rs @@ -0,0 +1,41 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Import V2 module. +//! +//! This module provides the V2 implementation of database import functionality, +//! featuring: +//! - DDL-based schema import +//! - Dry-run mode for verification +//! +//! # Example +//! +//! ```bash +//! # Dry-run import (verify without executing) +//! greptime cli data import-v2 \ +//! 
--addr 127.0.0.1:4000 \ +//! --from file:///tmp/snapshot \ +//! --dry-run +//! +//! # Actual import +//! greptime cli data import-v2 \ +//! --addr 127.0.0.1:4000 \ +//! --from s3://bucket/snapshots/prod-20250101 +//! ``` + +mod command; +pub mod error; +pub mod executor; + +pub use command::ImportV2Command; diff --git a/src/cli/src/data/import_v2/command.rs b/src/cli/src/data/import_v2/command.rs new file mode 100644 index 0000000000..544763d92b --- /dev/null +++ b/src/cli/src/data/import_v2/command.rs @@ -0,0 +1,542 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Import V2 CLI command. + +use std::collections::HashSet; +use std::time::Duration; + +use async_trait::async_trait; +use clap::Parser; +use common_error::ext::BoxedError; +use common_telemetry::info; +use snafu::ResultExt; + +use crate::Tool; +use crate::common::ObjectStoreConfig; +use crate::data::export_v2::manifest::MANIFEST_VERSION; +use crate::data::import_v2::error::{ + ManifestVersionMismatchSnafu, Result, SchemaNotInSnapshotSnafu, SnapshotStorageSnafu, +}; +use crate::data::import_v2::executor::{DdlExecutor, DdlStatement}; +use crate::data::path::ddl_path_for_schema; +use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri}; +use crate::database::{DatabaseClient, parse_proxy_opts}; + +/// Import from a snapshot. +#[derive(Debug, Parser)] +pub struct ImportV2Command { + /// Server address to connect (e.g., 127.0.0.1:4000). 
+ #[clap(long)] + addr: String, + + /// Source snapshot location (e.g., s3://bucket/path, file:///tmp/backup). + #[clap(long)] + from: String, + + /// Target catalog name. + #[clap(long, default_value = "greptime")] + catalog: String, + + /// Schema list to import (default: all in snapshot). + /// Can be specified multiple times or comma-separated. + #[clap(long, value_delimiter = ',')] + schemas: Vec, + + /// Verify without importing (dry-run). + #[clap(long)] + dry_run: bool, + + /// Concurrency level (for future use). + #[clap(long, default_value = "1")] + parallelism: usize, + + /// Basic authentication (user:password). + #[clap(long)] + auth_basic: Option, + + /// Request timeout. + #[clap(long, value_parser = humantime::parse_duration)] + timeout: Option, + + /// Proxy server address. + /// + /// If set, it overrides the system proxy unless `--no-proxy` is specified. + /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used. + #[clap(long)] + proxy: Option, + + /// Disable all proxy usage (ignores `--proxy` and system proxy). + /// + /// When set and `--proxy` is not provided, this explicitly disables system proxy. + #[clap(long)] + no_proxy: bool, + + /// Object store configuration for remote storage backends. 
+ #[clap(flatten)] + storage: ObjectStoreConfig, +} + +impl ImportV2Command { + pub async fn build(&self) -> std::result::Result, BoxedError> { + // Validate URI format + validate_uri(&self.from) + .context(SnapshotStorageSnafu) + .map_err(BoxedError::new)?; + + // Parse schemas (empty vec means all schemas) + let schemas = if self.schemas.is_empty() { + None + } else { + Some(self.schemas.clone()) + }; + + // Build storage + let storage = OpenDalStorage::from_uri(&self.from, &self.storage) + .context(SnapshotStorageSnafu) + .map_err(BoxedError::new)?; + + // Build database client + let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?; + let database_client = DatabaseClient::new( + self.addr.clone(), + self.catalog.clone(), + self.auth_basic.clone(), + self.timeout.unwrap_or(Duration::from_secs(60)), + proxy, + self.no_proxy, + ); + + Ok(Box::new(Import { + schemas, + dry_run: self.dry_run, + _parallelism: self.parallelism, + storage: Box::new(storage), + database_client, + })) + } +} + +/// Import tool implementation. +pub struct Import { + schemas: Option>, + dry_run: bool, + _parallelism: usize, + storage: Box, + database_client: DatabaseClient, +} + +#[async_trait] +impl Tool for Import { + async fn do_work(&self) -> std::result::Result<(), BoxedError> { + self.run().await.map_err(BoxedError::new) + } +} + +impl Import { + async fn run(&self) -> Result<()> { + // 1. Read manifest + let manifest = self + .storage + .read_manifest() + .await + .context(SnapshotStorageSnafu)?; + + info!( + "Loading snapshot: {} (version: {}, schema_only: {})", + manifest.snapshot_id, manifest.version, manifest.schema_only + ); + + // Check version compatibility + if manifest.version != MANIFEST_VERSION { + return ManifestVersionMismatchSnafu { + expected: MANIFEST_VERSION, + found: manifest.version, + } + .fail(); + } + + info!("Snapshot contains {} schema(s)", manifest.schemas.len()); + + // 2. 
Determine schemas to import + let schemas_to_import = match &self.schemas { + Some(filter) => canonicalize_schema_filter(filter, &manifest.schemas)?, + None => manifest.schemas.clone(), + }; + + info!("Importing schemas: {:?}", schemas_to_import); + + // 3. Read DDL statements + let ddl_statements = self.read_ddl_statements(&schemas_to_import).await?; + + info!("Generated {} DDL statements", ddl_statements.len()); + + // 4. Dry-run mode: print DDL and exit + if self.dry_run { + info!("Dry-run mode - DDL statements to execute:"); + println!(); + for (i, stmt) in ddl_statements.iter().enumerate() { + println!("-- Statement {}", i + 1); + println!("{};", stmt.sql); + println!(); + } + return Ok(()); + } + + // 5. Execute DDL + let executor = DdlExecutor::new(&self.database_client); + executor.execute_strict(&ddl_statements).await?; + + info!( + "Import completed: {} DDL statements executed", + ddl_statements.len() + ); + + // 6. Data import would happen here for non-schema-only snapshots (M2/M3) + if !manifest.schema_only && !manifest.chunks.is_empty() { + info!( + "Data import not yet implemented (M3). 
{} chunks pending.", + manifest.chunks.len() + ); + } + + Ok(()) + } + + async fn read_ddl_statements(&self, schemas: &[String]) -> Result> { + let mut statements = Vec::new(); + for schema in schemas { + let path = ddl_path_for_schema(schema); + let content = self + .storage + .read_text(&path) + .await + .context(SnapshotStorageSnafu)?; + statements.extend( + parse_ddl_statements(&content) + .into_iter() + .map(|sql| ddl_statement_for_schema(schema, sql)), + ); + } + + Ok(statements) + } +} + +fn parse_ddl_statements(content: &str) -> Vec { + let mut statements = Vec::new(); + let mut current = String::new(); + let mut chars = content.chars().peekable(); + let mut in_single_quote = false; + let mut in_double_quote = false; + let mut in_line_comment = false; + let mut in_block_comment = false; + + while let Some(ch) = chars.next() { + if in_line_comment { + if ch == '\n' { + in_line_comment = false; + current.push('\n'); + } + continue; + } + + if in_block_comment { + if ch == '*' && chars.peek() == Some(&'/') { + chars.next(); + in_block_comment = false; + } + continue; + } + + if in_single_quote { + current.push(ch); + if ch == '\'' { + if chars.peek() == Some(&'\'') { + current.push(chars.next().expect("peeked quote must exist")); + } else { + in_single_quote = false; + } + } + continue; + } + + if in_double_quote { + current.push(ch); + if ch == '"' { + if chars.peek() == Some(&'"') { + current.push(chars.next().expect("peeked quote must exist")); + } else { + in_double_quote = false; + } + } + continue; + } + + match ch { + '-' if chars.peek() == Some(&'-') => { + chars.next(); + in_line_comment = true; + } + '/' if chars.peek() == Some(&'*') => { + chars.next(); + in_block_comment = true; + } + '\'' => { + in_single_quote = true; + current.push(ch); + } + '"' => { + in_double_quote = true; + current.push(ch); + } + ';' => { + let statement = current.trim(); + if !statement.is_empty() { + statements.push(statement.to_string()); + } + current.clear(); + } + _ 
=> current.push(ch), + } + } + + let statement = current.trim(); + if !statement.is_empty() { + statements.push(statement.to_string()); + } + + statements +} + +fn ddl_statement_for_schema(schema: &str, sql: String) -> DdlStatement { + if is_schema_scoped_statement(&sql) { + DdlStatement::with_execution_schema(sql, schema.to_string()) + } else { + DdlStatement::new(sql) + } +} + +fn is_schema_scoped_statement(sql: &str) -> bool { + let trimmed = sql.trim_start(); + if !starts_with_keyword(trimmed, "CREATE") { + return false; + } + + let Some(rest) = trimmed.get("CREATE".len()..) else { + return false; + }; + let mut rest = rest.trim_start(); + if starts_with_keyword(rest, "OR") { + let Some(next) = rest.get("OR".len()..) else { + return false; + }; + rest = next.trim_start(); + if !starts_with_keyword(rest, "REPLACE") { + return false; + } + let Some(next) = rest.get("REPLACE".len()..) else { + return false; + }; + rest = next.trim_start(); + } + + if starts_with_keyword(rest, "EXTERNAL") { + let Some(next) = rest.get("EXTERNAL".len()..) 
else { + return false; + }; + rest = next.trim_start(); + } + + starts_with_keyword(rest, "TABLE") || starts_with_keyword(rest, "VIEW") +} + +fn starts_with_keyword(input: &str, keyword: &str) -> bool { + input + .get(0..keyword.len()) + .map(|s| s.eq_ignore_ascii_case(keyword)) + .unwrap_or(false) + && input + .as_bytes() + .get(keyword.len()) + .map(|b| !b.is_ascii_alphanumeric() && *b != b'_') + .unwrap_or(true) +} + +fn canonicalize_schema_filter( + filter: &[String], + manifest_schemas: &[String], +) -> Result> { + let mut canonicalized = Vec::new(); + let mut seen = HashSet::new(); + + for schema in filter { + let canonical = manifest_schemas + .iter() + .find(|candidate| candidate.eq_ignore_ascii_case(schema)) + .cloned() + .ok_or_else(|| { + SchemaNotInSnapshotSnafu { + schema: schema.clone(), + } + .build() + })?; + + if seen.insert(canonical.to_ascii_lowercase()) { + canonicalized.push(canonical); + } + } + + Ok(canonicalized) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_ddl_statements() { + let content = r#" +-- Schema: public +CREATE DATABASE public; +CREATE TABLE t (ts TIMESTAMP TIME INDEX, host STRING, PRIMARY KEY (host)) ENGINE=mito; + +-- comment +CREATE VIEW v AS SELECT * FROM t; +"#; + let statements = parse_ddl_statements(content); + assert_eq!(statements.len(), 3); + assert!(statements[0].starts_with("CREATE DATABASE public")); + assert!(statements[1].starts_with("CREATE TABLE t")); + assert!(statements[2].starts_with("CREATE VIEW v")); + } + + #[test] + fn test_parse_ddl_statements_preserves_semicolons_in_string_literals() { + let content = r#" +CREATE TABLE t ( + host STRING DEFAULT 'a;b' +); +CREATE VIEW v AS SELECT ';' AS marker; +"#; + + let statements = parse_ddl_statements(content); + + assert_eq!(statements.len(), 2); + assert!(statements[0].contains("'a;b'")); + assert!(statements[1].contains("';' AS marker")); + } + + #[test] + fn test_parse_ddl_statements_handles_comments_without_splitting() { + let 
content = r#" +-- leading comment +CREATE TABLE t (ts TIMESTAMP TIME INDEX); /* block; comment */ +CREATE VIEW v AS SELECT 1; +"#; + + let statements = parse_ddl_statements(content); + + assert_eq!(statements.len(), 2); + assert!(statements[0].starts_with("CREATE TABLE t")); + assert!(statements[1].starts_with("CREATE VIEW v")); + } + + #[test] + fn test_canonicalize_schema_filter_uses_manifest_casing() { + let filter = vec!["TEST_DB".to_string(), "PUBLIC".to_string()]; + let manifest_schemas = vec!["test_db".to_string(), "public".to_string()]; + + let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap(); + + assert_eq!(canonicalized, vec!["test_db", "public"]); + } + + #[test] + fn test_canonicalize_schema_filter_dedupes_case_insensitive_matches() { + let filter = vec![ + "TEST_DB".to_string(), + "test_db".to_string(), + "PUBLIC".to_string(), + "public".to_string(), + ]; + let manifest_schemas = vec!["test_db".to_string(), "public".to_string()]; + + let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap(); + + assert_eq!(canonicalized, vec!["test_db", "public"]); + } + + #[test] + fn test_canonicalize_schema_filter_rejects_missing_schema() { + let filter = vec!["missing".to_string()]; + let manifest_schemas = vec!["test_db".to_string()]; + + let error = canonicalize_schema_filter(&filter, &manifest_schemas) + .expect_err("missing schema should fail") + .to_string(); + + assert!(error.contains("missing")); + } + + #[test] + fn test_ddl_statement_for_schema_create_table_uses_execution_schema() { + let stmt = ddl_statement_for_schema( + "test_db", + "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX) ENGINE=mito".to_string(), + ); + assert_eq!(stmt.execution_schema.as_deref(), Some("test_db")); + } + + #[test] + fn test_ddl_statement_for_schema_create_view_uses_execution_schema() { + let stmt = ddl_statement_for_schema( + "test_db", + "CREATE VIEW metrics_view AS SELECT * FROM metrics".to_string(), + ); + 
assert_eq!(stmt.execution_schema.as_deref(), Some("test_db")); + } + + #[test] + fn test_ddl_statement_for_schema_create_or_replace_view_uses_execution_schema() { + let stmt = ddl_statement_for_schema( + "test_db", + "CREATE OR REPLACE VIEW metrics_view AS SELECT * FROM metrics".to_string(), + ); + assert_eq!(stmt.execution_schema.as_deref(), Some("test_db")); + } + + #[test] + fn test_ddl_statement_for_schema_create_external_table_uses_execution_schema() { + let stmt = ddl_statement_for_schema( + "test_db", + "CREATE EXTERNAL TABLE IF NOT EXISTS ext_metrics (ts TIMESTAMP TIME INDEX) ENGINE=file" + .to_string(), + ); + assert_eq!(stmt.execution_schema.as_deref(), Some("test_db")); + } + + #[test] + fn test_ddl_statement_for_schema_create_database_uses_public_context() { + let stmt = ddl_statement_for_schema("test_db", "CREATE DATABASE test_db".to_string()); + assert_eq!(stmt.execution_schema, None); + } + + #[test] + fn test_starts_with_keyword_requires_word_boundary() { + assert!(starts_with_keyword("CREATE TABLE t", "CREATE")); + assert!(!starts_with_keyword("CREATED TABLE t", "CREATE")); + assert!(!starts_with_keyword("TABLESPACE foo", "TABLE")); + } +} diff --git a/src/cli/src/data/import_v2/error.rs b/src/cli/src/data/import_v2/error.rs new file mode 100644 index 0000000000..5ae3db1583 --- /dev/null +++ b/src/cli/src/data/import_v2/error.rs @@ -0,0 +1,82 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; + +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use snafu::{Location, Snafu}; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display("Snapshot not found at '{}'", uri))] + SnapshotNotFound { + uri: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))] + ManifestVersionMismatch { + expected: u32, + found: u32, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Schema '{}' not found in snapshot", schema))] + SchemaNotInSnapshot { + schema: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Snapshot storage error"))] + SnapshotStorage { + #[snafu(source)] + error: crate::data::export_v2::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Database error"))] + Database { + #[snafu(source)] + error: crate::error::Error, + #[snafu(implicit)] + location: Location, + }, +} + +pub type Result = std::result::Result; + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + match self { + Error::SnapshotNotFound { .. } | Error::SchemaNotInSnapshot { .. } => { + StatusCode::InvalidArguments + } + Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments, + Error::Database { error, .. } => error.status_code(), + Error::SnapshotStorage { error, .. } => error.status_code(), + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/cli/src/data/import_v2/executor.rs b/src/cli/src/data/import_v2/executor.rs new file mode 100644 index 0000000000..3f2bf66ae6 --- /dev/null +++ b/src/cli/src/data/import_v2/executor.rs @@ -0,0 +1,122 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! DDL execution for import. + +use common_telemetry::info; +use snafu::ResultExt; + +use crate::data::import_v2::error::{DatabaseSnafu, Result}; +use crate::database::DatabaseClient; + +/// A DDL statement with an explicit execution schema context. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DdlStatement { + pub sql: String, + pub execution_schema: Option, +} + +impl DdlStatement { + pub fn new(sql: String) -> Self { + Self { + sql, + execution_schema: None, + } + } + + pub fn with_execution_schema(sql: String, schema: String) -> Self { + Self { + sql, + execution_schema: Some(schema), + } + } +} + +/// Executes DDL statements against the database. +pub struct DdlExecutor<'a> { + client: &'a DatabaseClient, +} + +impl<'a> DdlExecutor<'a> { + /// Creates a new DDL executor. + pub fn new(client: &'a DatabaseClient) -> Self { + Self { client } + } + + /// Executes a list of DDL statements, stopping on first error. 
+ pub async fn execute_strict(&self, statements: &[DdlStatement]) -> Result<()> { + let total = statements.len(); + + for (i, stmt) in statements.iter().enumerate() { + let preview = preview_sql(&stmt.sql); + + info!("Executing DDL ({}/{}): {}", i + 1, total, preview); + + if let Some(schema) = stmt.execution_schema.as_deref() { + self.client + .sql(&stmt.sql, schema) + .await + .context(DatabaseSnafu)?; + } else { + self.client + .sql_in_public(&stmt.sql) + .await + .context(DatabaseSnafu)?; + } + } + + Ok(()) + } +} + +fn preview_sql(sql: &str) -> String { + let mut chars = sql.chars(); + let preview: String = chars.by_ref().take(80).collect(); + if chars.next().is_some() { + format!("{preview}...") + } else { + preview + } +} +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_statement_without_execution_schema_uses_public() { + let stmt = DdlStatement::new("CREATE DATABASE IF NOT EXISTS test_db".to_string()); + assert_eq!(stmt.execution_schema, None); + } + + #[test] + fn test_statement_with_execution_schema_preserves_context() { + let stmt = DdlStatement::with_execution_schema( + r#"CREATE TABLE IF NOT EXISTS "my""schema"."metrics" (ts TIMESTAMP TIME INDEX)"# + .to_string(), + r#"my"schema"#.to_string(), + ); + assert_eq!(stmt.execution_schema.as_deref(), Some(r#"my"schema"#)); + } + + #[test] + fn test_preview_sql_truncates_at_char_boundary() { + let sql = format!( + "CREATE TABLE {} (ts TIMESTAMP TIME INDEX)", + "测".repeat(100) + ); + let preview = preview_sql(&sql); + assert!(preview.ends_with("...")); + assert!(preview.is_char_boundary(preview.len())); + } +} diff --git a/src/cli/src/data/path.rs b/src/cli/src/data/path.rs new file mode 100644 index 0000000000..2e0f5d3f1a --- /dev/null +++ b/src/cli/src/data/path.rs @@ -0,0 +1,76 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared path helpers for export/import data files. + +use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR}; + +pub(crate) fn ddl_path_for_schema(schema: &str) -> String { + format!( + "{}/{}/{}.sql", + SCHEMA_DIR, + DDL_DIR, + encode_path_segment(schema) + ) +} + +pub(crate) fn encode_path_segment(value: &str) -> String { + let mut encoded = String::with_capacity(value.len()); + for byte in value.bytes() { + match byte { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' => { + encoded.push(byte as char); + } + _ => { + encoded.push('%'); + encoded.push(hex_char(byte >> 4)); + encoded.push(hex_char(byte & 0x0F)); + } + } + } + encoded +} + +fn hex_char(nibble: u8) -> char { + match nibble { + 0..=9 => (b'0' + nibble) as char, + 10..=15 => (b'A' + (nibble - 10)) as char, + _ => unreachable!("nibble must be in 0..=15"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encode_path_segment_preserves_safe_ascii() { + assert_eq!(encode_path_segment("test_db"), "test_db"); + } + + #[test] + fn test_encode_path_segment_escapes_path_traversal_chars() { + assert_eq!(encode_path_segment("../evil"), "%2E%2E%2Fevil"); + assert_eq!(encode_path_segment(r"..\\evil"), "%2E%2E%5C%5Cevil"); + } + + #[test] + fn test_ddl_path_for_schema_encodes_schema_segment() { + assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql"); + assert_eq!( + ddl_path_for_schema("../evil"), + "schema/ddl/%2E%2E%2Fevil.sql" + ); + } +} diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs new file mode 
100644 index 0000000000..b6ff1c9222 --- /dev/null +++ b/src/cli/src/data/snapshot_storage.rs @@ -0,0 +1,649 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Storage abstraction for Export/Import V2. +//! +//! This module provides a unified interface for reading and writing snapshot data +//! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem). + +use async_trait::async_trait; +use object_store::services::{Azblob, Fs, Gcs, Oss, S3}; +use object_store::util::{with_instrument_layers, with_retry_layers}; +use object_store::{AzblobConnection, GcsConnection, ObjectStore, OssConnection, S3Connection}; +use snafu::ResultExt; +use url::Url; + +use crate::common::ObjectStoreConfig; +use crate::data::export_v2::error::{ + BuildObjectStoreSnafu, InvalidUriSnafu, ManifestParseSnafu, ManifestSerializeSnafu, Result, + SnapshotNotFoundSnafu, StorageOperationSnafu, TextDecodeSnafu, UnsupportedSchemeSnafu, + UrlParseSnafu, +}; +use crate::data::export_v2::manifest::{MANIFEST_FILE, Manifest}; +#[cfg(test)] +use crate::data::export_v2::schema::SchemaDefinition; +use crate::data::export_v2::schema::{SCHEMA_DIR, SCHEMAS_FILE, SchemaSnapshot}; + +struct RemoteLocation { + bucket_or_container: String, + root: String, +} + +/// URI schemes supported for snapshot storage. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StorageScheme { + /// Amazon S3. + S3, + /// Alibaba Cloud OSS. 
+ Oss, + /// Google Cloud Storage. + Gcs, + /// Azure Blob Storage. + Azblob, + /// Local filesystem (file://). + File, +} + +impl StorageScheme { + /// Parses storage scheme from URI. + pub fn from_uri(uri: &str) -> Result { + let url = Url::parse(uri).context(UrlParseSnafu)?; + + match url.scheme() { + "s3" => Ok(Self::S3), + "oss" => Ok(Self::Oss), + "gs" | "gcs" => Ok(Self::Gcs), + "azblob" => Ok(Self::Azblob), + "file" => Ok(Self::File), + scheme => UnsupportedSchemeSnafu { scheme }.fail(), + } + } +} + +/// Extracts bucket/container and root path from a URI. +fn extract_remote_location(uri: &str) -> Result { + let url = Url::parse(uri).context(UrlParseSnafu)?; + let bucket_or_container = url.host_str().unwrap_or("").to_string(); + if bucket_or_container.is_empty() { + return InvalidUriSnafu { + uri, + reason: "URI must include bucket/container in host", + } + .fail(); + } + + let root = url.path().trim_start_matches('/').to_string(); + if root.is_empty() { + return InvalidUriSnafu { + uri, + reason: "snapshot URI must include a non-empty path after the bucket/container", + } + .fail(); + } + + Ok(RemoteLocation { + bucket_or_container, + root, + }) +} + +/// Validates that a URI has a proper scheme. +/// +/// Rejects bare paths (e.g., `/tmp/backup`, `./backup`) because: +/// - Schema export (CLI) and data export (server) run in different processes +/// - Using bare paths would split the snapshot across machines +/// +/// Supported URI schemes: +/// - `s3://bucket/path` - Amazon S3 +/// - `oss://bucket/path` - Alibaba Cloud OSS +/// - `gs://bucket/path` - Google Cloud Storage +/// - `azblob://container/path` - Azure Blob Storage +/// - `file:///absolute/path` - Local filesystem +pub fn validate_uri(uri: &str) -> Result { + // Must have a scheme + if !uri.contains("://") { + return InvalidUriSnafu { + uri, + reason: "URI must have a scheme (e.g., s3://, file://). 
Bare paths are not supported.", + } + .fail(); + } + + StorageScheme::from_uri(uri) +} + +fn schema_index_path() -> String { + format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE) +} + +/// Extracts the absolute filesystem path from a file:// URI. +fn extract_file_path_from_uri(uri: &str) -> Result { + let url = Url::parse(uri).context(UrlParseSnafu)?; + + match url.host_str() { + Some(host) if !host.is_empty() && host != "localhost" => InvalidUriSnafu { + uri, + reason: "file:// URI must use an absolute path like file:///tmp/backup", + } + .fail(), + _ => Ok(url.path().to_string()), + } +} + +async fn ensure_snapshot_exists(storage: &OpenDalStorage) -> Result<()> { + if storage.exists().await? { + Ok(()) + } else { + SnapshotNotFoundSnafu { + uri: storage.target_uri.as_str(), + } + .fail() + } +} + +/// Snapshot storage abstraction. +/// +/// Provides operations for reading and writing snapshot data to various storage backends. +#[async_trait] +pub trait SnapshotStorage: Send + Sync { + /// Checks if a snapshot exists at this location (manifest.json exists). + async fn exists(&self) -> Result; + + /// Reads the manifest file. + async fn read_manifest(&self) -> Result; + + /// Writes the manifest file. + async fn write_manifest(&self, manifest: &Manifest) -> Result<()>; + + /// Writes the schema index to schema/schemas.json. + async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()>; + + /// Writes a text file to a relative path under the snapshot root. + async fn write_text(&self, path: &str, content: &str) -> Result<()>; + + /// Reads a text file from a relative path under the snapshot root. + async fn read_text(&self, path: &str) -> Result; + + /// Deletes the entire snapshot (for --force). + async fn delete_snapshot(&self) -> Result<()>; +} + +/// OpenDAL-based implementation of SnapshotStorage. 
+pub struct OpenDalStorage { + object_store: ObjectStore, + target_uri: String, +} + +impl OpenDalStorage { + fn new_operator_rooted(object_store: ObjectStore, target_uri: &str) -> Self { + Self { + object_store, + target_uri: target_uri.to_string(), + } + } + + fn finish_local_store(object_store: ObjectStore) -> ObjectStore { + with_instrument_layers(object_store, false) + } + + fn finish_remote_store(object_store: ObjectStore) -> ObjectStore { + with_instrument_layers(with_retry_layers(object_store), false) + } + + fn ensure_backend_enabled(uri: &str, enabled: bool, reason: &'static str) -> Result<()> { + if enabled { + Ok(()) + } else { + InvalidUriSnafu { uri, reason }.fail() + } + } + + fn validate_remote_config( + uri: &str, + backend: &str, + result: std::result::Result<(), E>, + ) -> Result<()> { + result.map_err(|error| { + InvalidUriSnafu { + uri, + reason: format!("invalid {} config: {}", backend, error), + } + .build() + }) + } + + /// Creates a new storage from a file:// URI. + pub fn from_file_uri(uri: &str) -> Result { + let path = extract_file_path_from_uri(uri)?; + + let builder = Fs::default().root(&path); + let object_store = ObjectStore::new(builder) + .context(BuildObjectStoreSnafu)? 
+ .finish(); + Ok(Self::new_operator_rooted( + Self::finish_local_store(object_store), + uri, + )) + } + + fn from_file_uri_with_config(uri: &str, storage: &ObjectStoreConfig) -> Result { + if storage.enable_s3 || storage.enable_oss || storage.enable_gcs || storage.enable_azblob { + return InvalidUriSnafu { + uri, + reason: "file:// cannot be used with remote storage flags", + } + .fail(); + } + + Self::from_file_uri(uri) + } + + fn from_s3_uri(uri: &str, storage: &ObjectStoreConfig) -> Result { + Self::ensure_backend_enabled( + uri, + storage.enable_s3, + "s3:// requires --s3 and related options", + )?; + + let location = extract_remote_location(uri)?; + let mut config = storage.s3.clone(); + config.s3_bucket = location.bucket_or_container; + config.s3_root = location.root; + Self::validate_remote_config(uri, "s3", config.validate())?; + + let conn: S3Connection = config.into(); + let object_store = ObjectStore::new(S3::from(&conn)) + .context(BuildObjectStoreSnafu)? + .finish(); + Ok(Self::new_operator_rooted( + Self::finish_remote_store(object_store), + uri, + )) + } + + fn from_oss_uri(uri: &str, storage: &ObjectStoreConfig) -> Result { + Self::ensure_backend_enabled( + uri, + storage.enable_oss, + "oss:// requires --oss and related options", + )?; + + let location = extract_remote_location(uri)?; + let mut config = storage.oss.clone(); + config.oss_bucket = location.bucket_or_container; + config.oss_root = location.root; + Self::validate_remote_config(uri, "oss", config.validate())?; + + let conn: OssConnection = config.into(); + let object_store = ObjectStore::new(Oss::from(&conn)) + .context(BuildObjectStoreSnafu)? 
+ .finish(); + Ok(Self::new_operator_rooted( + Self::finish_remote_store(object_store), + uri, + )) + } + + fn from_gcs_uri(uri: &str, storage: &ObjectStoreConfig) -> Result { + Self::ensure_backend_enabled( + uri, + storage.enable_gcs, + "gs:// or gcs:// requires --gcs and related options", + )?; + + let location = extract_remote_location(uri)?; + let mut config = storage.gcs.clone(); + config.gcs_bucket = location.bucket_or_container; + config.gcs_root = location.root; + Self::validate_remote_config(uri, "gcs", config.validate())?; + + let conn: GcsConnection = config.into(); + let object_store = ObjectStore::new(Gcs::from(&conn)) + .context(BuildObjectStoreSnafu)? + .finish(); + Ok(Self::new_operator_rooted( + Self::finish_remote_store(object_store), + uri, + )) + } + + fn from_azblob_uri(uri: &str, storage: &ObjectStoreConfig) -> Result { + Self::ensure_backend_enabled( + uri, + storage.enable_azblob, + "azblob:// requires --azblob and related options", + )?; + + let location = extract_remote_location(uri)?; + let mut config = storage.azblob.clone(); + config.azblob_container = location.bucket_or_container; + config.azblob_root = location.root; + Self::validate_remote_config(uri, "azblob", config.validate())?; + + let conn: AzblobConnection = config.into(); + let object_store = ObjectStore::new(Azblob::from(&conn)) + .context(BuildObjectStoreSnafu)? + .finish(); + Ok(Self::new_operator_rooted( + Self::finish_remote_store(object_store), + uri, + )) + } + + /// Creates a new storage from a URI and object store config. + pub fn from_uri(uri: &str, storage: &ObjectStoreConfig) -> Result { + match StorageScheme::from_uri(uri)? 
{ + StorageScheme::File => Self::from_file_uri_with_config(uri, storage), + StorageScheme::S3 => Self::from_s3_uri(uri, storage), + StorageScheme::Oss => Self::from_oss_uri(uri, storage), + StorageScheme::Gcs => Self::from_gcs_uri(uri, storage), + StorageScheme::Azblob => Self::from_azblob_uri(uri, storage), + } + } + + /// Reads a file as bytes. + async fn read_file(&self, path: &str) -> Result> { + let data = self + .object_store + .read(path) + .await + .context(StorageOperationSnafu { + operation: format!("read {}", path), + })?; + Ok(data.to_vec()) + } + + /// Writes bytes to a file. + async fn write_file(&self, path: &str, data: Vec) -> Result<()> { + self.object_store + .write(path, data) + .await + .map(|_| ()) + .context(StorageOperationSnafu { + operation: format!("write {}", path), + }) + } + + /// Checks if a file exists using stat. + async fn file_exists(&self, path: &str) -> Result { + match self.object_store.stat(path).await { + Ok(_) => Ok(true), + Err(e) if e.kind() == object_store::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e).context(StorageOperationSnafu { + operation: format!("check exists {}", path), + }), + } + } + + #[cfg(test)] + pub async fn read_schema(&self) -> Result { + let schemas_path = schema_index_path(); + let schemas: Vec = if self.file_exists(&schemas_path).await? { + let data = self.read_file(&schemas_path).await?; + serde_json::from_slice(&data).context(ManifestParseSnafu)? 
+ } else { + vec![] + }; + + Ok(SchemaSnapshot { schemas }) + } +} + +#[async_trait] +impl SnapshotStorage for OpenDalStorage { + async fn exists(&self) -> Result { + self.file_exists(MANIFEST_FILE).await + } + + async fn read_manifest(&self) -> Result { + ensure_snapshot_exists(self).await?; + + let data = self.read_file(MANIFEST_FILE).await?; + serde_json::from_slice(&data).context(ManifestParseSnafu) + } + + async fn write_manifest(&self, manifest: &Manifest) -> Result<()> { + let data = serde_json::to_vec_pretty(manifest).context(ManifestSerializeSnafu)?; + self.write_file(MANIFEST_FILE, data).await + } + + async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()> { + let schemas_path = schema_index_path(); + let schemas_data = + serde_json::to_vec_pretty(&schema.schemas).context(ManifestSerializeSnafu)?; + self.write_file(&schemas_path, schemas_data).await + } + + async fn write_text(&self, path: &str, content: &str) -> Result<()> { + self.write_file(path, content.as_bytes().to_vec()).await + } + + async fn read_text(&self, path: &str) -> Result { + let data = self.read_file(path).await?; + String::from_utf8(data).context(TextDecodeSnafu) + } + + async fn delete_snapshot(&self) -> Result<()> { + self.object_store + .remove_all("/") + .await + .context(StorageOperationSnafu { + operation: "delete snapshot", + }) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use object_store::ObjectStore; + use object_store::services::Fs; + use tempfile::tempdir; + use url::Url; + + use super::*; + use crate::data::export_v2::manifest::{DataFormat, TimeRange}; + use crate::data::export_v2::schema::SchemaDefinition; + + fn make_storage_with_rooted_fs(dir: &std::path::Path) -> OpenDalStorage { + let object_store = ObjectStore::new(Fs::default().root(dir.to_str().unwrap())) + .unwrap() + .finish(); + OpenDalStorage::new_operator_rooted( + OpenDalStorage::finish_local_store(object_store), + Url::from_directory_path(dir).unwrap().as_ref(), + ) + } 
+ + #[test] + fn test_validate_uri_valid() { + assert_eq!(validate_uri("s3://bucket/path").unwrap(), StorageScheme::S3); + assert_eq!( + validate_uri("oss://bucket/path").unwrap(), + StorageScheme::Oss + ); + assert_eq!( + validate_uri("gs://bucket/path").unwrap(), + StorageScheme::Gcs + ); + assert_eq!( + validate_uri("gcs://bucket/path").unwrap(), + StorageScheme::Gcs + ); + assert_eq!( + validate_uri("azblob://container/path").unwrap(), + StorageScheme::Azblob + ); + assert_eq!( + validate_uri("file:///tmp/backup").unwrap(), + StorageScheme::File + ); + } + + #[test] + fn test_validate_uri_invalid() { + // Bare paths should be rejected + assert!(validate_uri("/tmp/backup").is_err()); + assert!(validate_uri("./backup").is_err()); + assert!(validate_uri("backup").is_err()); + + // Unknown schemes + assert!(validate_uri("ftp://server/path").is_err()); + } + + #[test] + fn test_extract_remote_location_requires_non_empty_root() { + assert!(extract_remote_location("s3://bucket").is_err()); + assert!(extract_remote_location("s3://bucket/").is_err()); + assert!(extract_remote_location("oss://bucket").is_err()); + assert!(extract_remote_location("gs://bucket").is_err()); + assert!(extract_remote_location("azblob://container").is_err()); + } + + #[test] + fn test_extract_path_from_uri() { + assert_eq!( + extract_file_path_from_uri("file:///tmp/backup").unwrap(), + "/tmp/backup" + ); + assert_eq!( + extract_file_path_from_uri("file://localhost/tmp/backup").unwrap(), + "/tmp/backup" + ); + } + + #[test] + fn test_extract_file_path_from_uri_rejects_file_host() { + assert!(extract_file_path_from_uri("file://tmp/backup").is_err()); + } + + #[tokio::test] + async fn test_read_manifest_reports_requested_uri() { + let dir = tempdir().unwrap(); + let uri = Url::from_directory_path(dir.path()).unwrap().to_string(); + let storage = OpenDalStorage::from_file_uri(&uri).unwrap(); + + let error = storage.read_manifest().await.unwrap_err().to_string(); + + 
assert!(error.contains(uri.as_str())); + } + + #[tokio::test] + async fn test_manifest_round_trip() { + let dir = tempdir().unwrap(); + let storage = make_storage_with_rooted_fs(dir.path()); + + let manifest = Manifest::new_full( + "greptime".to_string(), + vec!["public".to_string()], + TimeRange::unbounded(), + DataFormat::Parquet, + ); + + storage.write_manifest(&manifest).await.unwrap(); + let loaded = storage.read_manifest().await.unwrap(); + + assert_eq!(loaded.catalog, manifest.catalog); + assert_eq!(loaded.schemas, manifest.schemas); + assert_eq!(loaded.schema_only, manifest.schema_only); + assert_eq!(loaded.format, manifest.format); + assert_eq!(loaded.snapshot_id, manifest.snapshot_id); + } + + #[tokio::test] + async fn test_schema_round_trip() { + let dir = tempdir().unwrap(); + let storage = make_storage_with_rooted_fs(dir.path()); + + let mut snapshot = SchemaSnapshot::new(); + snapshot.add_schema(SchemaDefinition { + catalog: "greptime".to_string(), + name: "test_db".to_string(), + options: HashMap::from([("ttl".to_string(), "7d".to_string())]), + }); + + storage.write_schema(&snapshot).await.unwrap(); + let loaded = storage.read_schema().await.unwrap(); + + assert_eq!(loaded, snapshot); + } + + #[tokio::test] + async fn test_text_round_trip() { + let dir = tempdir().unwrap(); + let storage = make_storage_with_rooted_fs(dir.path()); + let content = "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX);"; + + storage + .write_text("schema/ddl/public.sql", content) + .await + .unwrap(); + let loaded = storage.read_text("schema/ddl/public.sql").await.unwrap(); + + assert_eq!(loaded, content); + } + + #[tokio::test] + async fn test_read_text_rejects_invalid_utf8() { + let dir = tempdir().unwrap(); + let storage = make_storage_with_rooted_fs(dir.path()); + + storage + .write_file("schema/ddl/public.sql", vec![0xff, 0xfe, 0xfd]) + .await + .unwrap(); + + let error = storage + .read_text("schema/ddl/public.sql") + .await + .unwrap_err(); + 
assert!(error.to_string().contains("UTF-8")); + } + + #[tokio::test] + async fn test_exists_follows_manifest_presence() { + let dir = tempdir().unwrap(); + let storage = make_storage_with_rooted_fs(dir.path()); + + assert!(!storage.exists().await.unwrap()); + + storage + .write_manifest(&Manifest::new_schema_only( + "greptime".to_string(), + vec!["public".to_string()], + )) + .await + .unwrap(); + + assert!(storage.exists().await.unwrap()); + } + + #[tokio::test] + async fn test_delete_snapshot_only_removes_rooted_contents() { + let parent = tempdir().unwrap(); + let snapshot_root = parent.path().join("snapshot"); + let sibling = parent.path().join("sibling"); + std::fs::create_dir_all(&snapshot_root).unwrap(); + std::fs::create_dir_all(&sibling).unwrap(); + std::fs::write(snapshot_root.join("manifest.json"), b"{}").unwrap(); + std::fs::write(sibling.join("keep.txt"), b"keep").unwrap(); + + let storage = make_storage_with_rooted_fs(&snapshot_root); + storage.delete_snapshot().await.unwrap(); + + assert!(!snapshot_root.join("manifest.json").exists()); + assert!(sibling.join("keep.txt").exists()); + } +} diff --git a/src/cli/src/data/sql.rs b/src/cli/src/data/sql.rs new file mode 100644 index 0000000000..7de4206b26 --- /dev/null +++ b/src/cli/src/data/sql.rs @@ -0,0 +1,40 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared SQL escaping helpers for CLI-generated statements. 
+ +pub(crate) fn escape_sql_literal(value: &str) -> String { + value.replace('\'', "''") +} + +pub(crate) fn escape_sql_identifier(value: &str) -> String { + value.replace('"', "\"\"") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_escape_sql_literal_escapes_single_quotes() { + assert_eq!(escape_sql_literal("test_db"), "test_db"); + assert_eq!(escape_sql_literal("te'st"), "te''st"); + } + + #[test] + fn test_escape_sql_identifier_escapes_double_quotes() { + assert_eq!(escape_sql_identifier("test_db"), "test_db"); + assert_eq!(escape_sql_identifier(r#"te"st"#), r#"te""st"#); + } +} diff --git a/src/cli/src/database.rs b/src/cli/src/database.rs index db98c38e38..fa3f6faefb 100644 --- a/src/cli/src/database.rs +++ b/src/cli/src/database.rs @@ -36,6 +36,7 @@ pub struct DatabaseClient { auth_header: Option, timeout: Duration, proxy: Option, + no_proxy: bool, } pub fn parse_proxy_opts( @@ -61,6 +62,7 @@ impl DatabaseClient { auth_basic: Option, timeout: Duration, proxy: Option, + no_proxy: bool, ) -> Self { let auth_header = if let Some(basic) = auth_basic { let encoded = general_purpose::STANDARD.encode(basic); @@ -69,7 +71,9 @@ impl DatabaseClient { None }; - if let Some(ref proxy) = proxy { + if no_proxy { + common_telemetry::info!("Proxy disabled"); + } else if let Some(ref proxy) = proxy { common_telemetry::info!("Using proxy: {:?}", proxy); } else { common_telemetry::info!("Using system proxy(if any)"); @@ -81,6 +85,7 @@ impl DatabaseClient { auth_header, timeout, proxy, + no_proxy, } } @@ -95,12 +100,14 @@ impl DatabaseClient { ("db", format!("{}-{}", self.catalog, schema)), ("sql", sql.to_string()), ]; - let client = self - .proxy - .clone() - .map(|proxy| reqwest::Client::builder().proxy(proxy).build()) - .unwrap_or_else(|| Ok(reqwest::Client::new())) - .context(BuildClientSnafu)?; + let mut builder = reqwest::Client::builder(); + if let Some(proxy) = self.proxy.clone() { + builder = builder.proxy(proxy); + } + if self.no_proxy { + builder 
= builder.no_proxy(); + } + let client = builder.build().context(BuildClientSnafu)?; let mut request = client .post(&url) .form(&params) diff --git a/src/cli/src/lib.rs b/src/cli/src/lib.rs index acf5df4086..4305da9c8f 100644 --- a/src/cli/src/lib.rs +++ b/src/cli/src/lib.rs @@ -29,7 +29,7 @@ pub use database::DatabaseClient; use error::Result; pub use crate::bench::BenchTableMetadataCommand; -pub use crate::data::DataCommand; +pub use crate::data::{DataCommand, export_v2, import_v2}; pub use crate::metadata::MetadataCommand; #[async_trait] From f034255fe6d7ce9d3b81e08c7a91e7f960dda96c Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Fri, 20 Mar 2026 06:40:52 +0800 Subject: [PATCH 020/195] perf: support group accumulators for state wrapper (#7826) * perf: support group accumulators for state wrapper * new tests and avoid clone Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/common/function/src/aggrs/aggr_wrapper.rs | 153 +++++++++++++++++- .../function/src/aggrs/aggr_wrapper/tests.rs | 126 ++++++++++++++- 2 files changed, 270 insertions(+), 9 deletions(-) diff --git a/src/common/function/src/aggrs/aggr_wrapper.rs b/src/common/function/src/aggrs/aggr_wrapper.rs index 3780d39582..6242ab9454 100644 --- a/src/common/function/src/aggrs/aggr_wrapper.rs +++ b/src/common/function/src/aggrs/aggr_wrapper.rs @@ -25,7 +25,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; -use arrow::array::StructArray; +use arrow::array::{ArrayRef, BooleanArray, StructArray}; use arrow_schema::{FieldRef, Fields}; use common_telemetry::debug; use datafusion::functions_aggregate::all_default_aggregate_functions; @@ -38,8 +38,8 @@ use datafusion_common::{Column, ScalarValue}; use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams}; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ - Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, Expr, ExprSchemable, LogicalPlan, - Signature, + Accumulator, Aggregate, AggregateUDF, 
AggregateUDFImpl, EmitTo, Expr, ExprSchemable, + GroupsAccumulator, LogicalPlan, Signature, }; use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use datatypes::arrow::datatypes::{DataType, Field}; @@ -322,6 +322,14 @@ impl StateWrapper { ); }) } + + fn fix_inner_acc_args<'b>( + &self, + mut acc_args: datafusion_expr::function::AccumulatorArgs<'b>, + ) -> datafusion_common::Result> { + acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?; + Ok(acc_args) + } } impl AggregateUDFImpl for StateWrapper { @@ -331,15 +339,32 @@ impl AggregateUDFImpl for StateWrapper { ) -> datafusion_common::Result> { // fix and recover proper acc args for the original aggregate function. let state_type = acc_args.return_type().clone(); - let inner = { - let mut new_acc_args = acc_args.clone(); - new_acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?; - self.inner.accumulator(new_acc_args)? - }; + let inner = self.inner.accumulator(self.fix_inner_acc_args(acc_args)?)?; Ok(Box::new(StateAccum::new(inner, state_type)?)) } + fn groups_accumulator_supported( + &self, + acc_args: datafusion_expr::function::AccumulatorArgs, + ) -> bool { + self.fix_inner_acc_args(acc_args) + .map(|args| self.inner.inner().groups_accumulator_supported(args)) + .unwrap_or(false) + } + + fn create_groups_accumulator( + &self, + acc_args: datafusion_expr::function::AccumulatorArgs, + ) -> datafusion_common::Result> { + let state_type = acc_args.return_type().clone(); + let inner = self + .inner + .inner() + .create_groups_accumulator(self.fix_inner_acc_args(acc_args)?)?; + Ok(Box::new(StateGroupsAccum::new(inner, state_type)?)) + } + fn as_any(&self) -> &dyn std::any::Any { self } @@ -462,6 +487,118 @@ pub struct StateAccum { state_fields: Fields, } +pub struct StateGroupsAccum { + inner: Box, + state_fields: Fields, +} + +impl StateGroupsAccum { + fn new( + inner: Box, + state_type: DataType, + ) -> datafusion_common::Result { + let DataType::Struct(fields) = state_type else 
{ + return Err(datafusion_common::DataFusionError::Internal(format!( + "Expected a struct type for state, got: {:?}", + state_type + ))); + }; + Ok(Self { + inner, + state_fields: fields, + }) + } + + fn wrap_state_arrays(&self, arrays: Vec) -> datafusion_common::Result { + let array_type = arrays + .iter() + .map(|array| array.data_type().clone()) + .collect::>(); + let expected_type = self + .state_fields + .iter() + .map(|field| field.data_type().clone()) + .collect::>(); + if array_type != expected_type { + debug!( + "State mismatch, expected: {}, got: {} for expected fields: {:?} and given array types: {:?}", + self.state_fields.len(), + arrays.len(), + self.state_fields, + array_type, + ); + let guess_schema = arrays + .iter() + .enumerate() + .map(|(index, array)| { + Field::new( + format!("col_{index}[mismatch_state]").as_str(), + array.data_type().clone(), + true, + ) + }) + .collect::(); + let array = StructArray::try_new(guess_schema, arrays, None)?; + return Ok(Arc::new(array)); + } + + Ok(Arc::new(StructArray::try_new( + self.state_fields.clone(), + arrays, + None, + )?)) + } +} + +impl GroupsAccumulator for StateGroupsAccum { + fn update_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> datafusion_common::Result<()> { + self.inner + .update_batch(values, group_indices, opt_filter, total_num_groups) + } + + fn merge_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> datafusion_common::Result<()> { + self.inner + .merge_batch(values, group_indices, opt_filter, total_num_groups) + } + + fn evaluate(&mut self, emit_to: EmitTo) -> datafusion_common::Result { + let state = self.inner.state(emit_to)?; + self.wrap_state_arrays(state) + } + + fn state(&mut self, emit_to: EmitTo) -> datafusion_common::Result> { + self.inner.state(emit_to) + } + + fn convert_to_state( + &self, + 
values: &[ArrayRef], + opt_filter: Option<&BooleanArray>, + ) -> datafusion_common::Result> { + self.inner.convert_to_state(values, opt_filter) + } + + fn supports_convert_to_state(&self) -> bool { + self.inner.supports_convert_to_state() + } + + fn size(&self) -> usize { + self.inner.size() + } +} + impl StateAccum { pub fn new( inner: Box, diff --git a/src/common/function/src/aggrs/aggr_wrapper/tests.rs b/src/common/function/src/aggrs/aggr_wrapper/tests.rs index 8821b9fd24..de3a77df6b 100644 --- a/src/common/function/src/aggrs/aggr_wrapper/tests.rs +++ b/src/common/function/src/aggrs/aggr_wrapper/tests.rs @@ -40,10 +40,13 @@ use datafusion_common::arrow::array::AsArray; use datafusion_common::arrow::datatypes::{Float64Type, UInt64Type}; use datafusion_common::{Column, TableReference}; use datafusion_expr::expr::{AggregateFunction, NullTreatment}; +use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::{ - Aggregate, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, TableScan, lit, + Aggregate, AggregateUDFImpl, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, + TableScan, lit, }; use datafusion_physical_expr::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datatypes::arrow_array::StringArray; use futures::{Stream, StreamExt as _}; @@ -256,6 +259,38 @@ fn dummy_table_scan_with_ts() -> LogicalPlan { ) } +fn create_avg_state_groups_accumulator() -> Box { + let state_wrapper = StateWrapper::new((*avg_udaf()).clone()).unwrap(); + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "number", + DataType::Float64, + true, + )])); + let expr = col("number", &schema).unwrap(); + let expr_field = expr.return_field(&schema).unwrap(); + let return_field = Arc::new(Field::new( + "__avg_state(number)", + state_wrapper.return_type(&[DataType::Float64]).unwrap(), + true, + )); + let exprs = [expr]; + let expr_fields = 
[expr_field]; + let acc_args = AccumulatorArgs { + return_field, + schema: &schema, + ignore_nulls: false, + order_bys: &[], + is_reversed: false, + name: "__avg_state(number)", + is_distinct: false, + exprs: &exprs, + expr_fields: &expr_fields, + }; + + assert!(state_wrapper.groups_accumulator_supported(acc_args.clone())); + state_wrapper.create_groups_accumulator(acc_args).unwrap() +} + #[tokio::test] async fn test_sum_udaf() { let ctx = SessionContext::new(); @@ -796,6 +831,95 @@ async fn test_last_value_order_by_udaf() { assert_eq!(merge_eval_res, ScalarValue::Int64(Some(4))); } +#[test] +fn test_avg_state_groups_accumulator_evaluate() { + let mut state_accum = create_avg_state_groups_accumulator(); + let values = vec![Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + Some(3.0), + Some(4.0), + Some(5.0), + ])) as ArrayRef]; + let group_indices = vec![0, 1, 0, 0, 1, 2]; + + state_accum + .update_batch(&values, &group_indices, None, 3) + .unwrap(); + + let result = state_accum.evaluate(EmitTo::All).unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert_eq!( + result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(), + &UInt64Array::from(vec![2, 2, 1]) + ); + assert_eq!( + result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(), + &Float64Array::from(vec![4.0, 6.0, 5.0]) + ); +} + +#[test] +fn test_avg_state_groups_accumulator_state_merge_evaluate() { + let mut source_accum = create_avg_state_groups_accumulator(); + let source_values = vec![Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + Some(3.0), + Some(4.0), + Some(5.0), + ])) as ArrayRef]; + let source_group_indices = vec![0, 1, 0, 0, 1, 2]; + + source_accum + .update_batch(&source_values, &source_group_indices, None, 3) + .unwrap(); + let source_state = source_accum.state(EmitTo::All).unwrap(); + + let mut merged_accum = create_avg_state_groups_accumulator(); + let merged_values = + 
vec![Arc::new(Float64Array::from(vec![Some(10.0), Some(20.0), Some(30.0)])) as ArrayRef]; + let merged_group_indices = vec![0, 1, 2]; + + merged_accum + .update_batch(&merged_values, &merged_group_indices, None, 3) + .unwrap(); + merged_accum + .merge_batch(&source_state, &[1, 2, 0], None, 3) + .unwrap(); + + let result = merged_accum.evaluate(EmitTo::All).unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert_eq!( + result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(), + &UInt64Array::from(vec![2, 3, 3]) + ); + assert_eq!( + result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(), + &Float64Array::from(vec![15.0, 24.0, 36.0]) + ); +} + /// For testing whether the UDAF state fields are correctly implemented. /// esp. for our own custom UDAF's state fields. /// By compare eval results before and after split to state/merge functions. From d14817bfa6ecf3a7f6a4cf98817c1afd42a2a8c5 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 20 Mar 2026 11:58:39 +0800 Subject: [PATCH 021/195] fix: resolve optimization issue for extended query (#7824) * fix: resolve optimization issue for extended query * fix: type cast from subquery * chore: update error information in sqlness * chore: switch to released pgwire * refactor: remove optimize function completely * chore: add more tests * test: attempt to fix the fuzz issue * fix: try to resolve the test issue --- Cargo.lock | 6 +- .../information_schema/region_peers.rs | 2 +- src/query/src/datafusion.rs | 56 ++++--------------- src/query/src/planner.rs | 42 +++++++++++++- src/servers/Cargo.toml | 2 +- tests-fuzz/src/utils/partition.rs | 2 +- .../migration/fuzz_migrate_mito_regions.rs | 11 +++- .../common/prepare/mysql_prepare.result | 21 ++++++- .../common/prepare/mysql_prepare.sql | 9 +++ 9 files changed, 94 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1b2a44d0e4..073ae03525 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7301,7 +7301,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -9620,9 +9620,9 @@ dependencies = [ [[package]] name = "pgwire" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d5e5a60d3f6e40c91f6a2a7f8d09665e636272bd5611977253559b6651aabb" +checksum = "f2a798d130b8975a566c2cf6d8955746e1f09a9ee2c3ff2e6020a2c6528c5bd1" dependencies = [ "async-trait", "base64 0.22.1", diff --git a/src/catalog/src/system_schema/information_schema/region_peers.rs b/src/catalog/src/system_schema/information_schema/region_peers.rs index 5bc91d207e..b1438ef53d 100644 --- a/src/catalog/src/system_schema/information_schema/region_peers.rs +++ b/src/catalog/src/system_schema/information_schema/region_peers.rs @@ -267,7 +267,7 @@ impl InformationSchemaRegionPeersBuilder { ]; if !predicates.eval(&row) { - return; + continue; } self.table_catalogs.push(Some(table_catalog)); diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index dc84c4afac..e2e577debf 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -354,25 +354,6 @@ impl DatafusionQueryEngine { Ok(physical_plan) } - #[tracing::instrument(skip_all)] - pub fn optimize( - &self, - context: &QueryEngineContext, - plan: &LogicalPlan, - ) -> Result { - let _timer = metrics::OPTIMIZE_LOGICAL_ELAPSED.start_timer(); - - // Optimized by extension rules - let optimized_plan = self - .state - .optimize_by_extension_rules(plan.clone(), context)?; - - // Optimized by datafusion optimizer - let optimized_plan = self.state.session_state().optimize(&optimized_plan)?; - - Ok(optimized_plan) - } - #[tracing::instrument(skip_all)] fn optimize_physical_plan( &self, @@ -444,32 +425,17 @@ impl QueryEngine for DatafusionQueryEngine { async fn describe( &self, plan: LogicalPlan, - 
query_ctx: QueryContextRef, + _query_ctx: QueryContextRef, ) -> Result { - let ctx = self.engine_context(query_ctx); - if let Ok(optimised_plan) = self.optimize(&ctx, &plan) { - let schema = optimised_plan - .schema() - .clone() - .try_into() - .context(ConvertSchemaSnafu)?; - Ok(DescribeResult { - schema, - logical_plan: optimised_plan, - }) - } else { - // Table's like those in information_schema cannot be optimized when - // it contains parameters. So we fallback to original plans. - let schema = plan - .schema() - .clone() - .try_into() - .context(ConvertSchemaSnafu)?; - Ok(DescribeResult { - schema, - logical_plan: plan, - }) - } + let schema = plan + .schema() + .clone() + .try_into() + .context(ConvertSchemaSnafu)?; + Ok(DescribeResult { + schema, + logical_plan: plan, + }) } async fn execute(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result { @@ -924,7 +890,7 @@ mod tests { ) ); assert_eq!( - "Limit: skip=0, fetch=20\n Aggregate: groupBy=[[]], aggr=[[sum(CAST(numbers.number AS UInt64))]]\n TableScan: numbers projection=[number]", + "Limit: skip=0, fetch=20\n Projection: sum(numbers.number)\n Aggregate: groupBy=[[]], aggr=[[sum(numbers.number)]]\n TableScan: numbers", format!("{}", logical_plan.display_indent()) ); } diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs index 44c9bc3956..f522dc567a 100644 --- a/src/query/src/planner.rs +++ b/src/query/src/planner.rs @@ -28,6 +28,7 @@ use datafusion::execution::context::SessionState; use datafusion::sql::planner::PlannerContext; use datafusion_common::ToDFSchema; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_expr::expr::{Exists, InSubquery}; use datafusion_expr::{ Analyze, Explain, ExplainFormat, Expr as DfExpr, LogicalPlan, LogicalPlanBuilder, PlanType, ToStringifiedPlan, col, @@ -424,9 +425,20 @@ impl DfLogicalPlanner { let mut placeholder_types = HashMap::new(); let mut casted_placeholders = HashSet::new(); + Self::extract_from_plan(plan, &mut 
placeholder_types, &mut casted_placeholders)?; + + Ok(placeholder_types) + } + + fn extract_from_plan( + plan: &LogicalPlan, + placeholder_types: &mut HashMap>, + casted_placeholders: &mut HashSet, + ) -> Result<()> { plan.apply(|node| { for expr in node.expressions() { let _ = expr.apply(|e| { + // Handle casted placeholders if let DfExpr::Cast(cast) = e && let DfExpr::Placeholder(ph) = &*cast.expr { @@ -434,6 +446,7 @@ impl DfLogicalPlanner { casted_placeholders.insert(ph.id.clone()); } + // Handle bare (non-casted) placeholders if let DfExpr::Placeholder(ph) = e && !casted_placeholders.contains(&ph.id) && !placeholder_types.contains_key(&ph.id) @@ -441,13 +454,26 @@ impl DfLogicalPlanner { placeholder_types.insert(ph.id.clone(), None); } + // Recurse into subquery plans embedded in expressions + match e { + DfExpr::Exists(Exists { subquery, .. }) + | DfExpr::InSubquery(InSubquery { subquery, .. }) + | DfExpr::ScalarSubquery(subquery) => { + Self::extract_from_plan( + &subquery.subquery, + placeholder_types, + casted_placeholders, + )?; + } + _ => {} + } + Ok(TreeNodeRecursion::Continue) }); } Ok(TreeNodeRecursion::Continue) })?; - - Ok(placeholder_types) + Ok(()) } /// Gets inferred parameter types from a logical plan. 
@@ -619,4 +645,16 @@ mod tests { assert_eq!(type_2, &Some(DataType::Utf8)); assert_eq!(type_3, &Some(DataType::Int32)); } + + #[tokio::test] + async fn test_get_inferred_parameter_types_subquery() { + let plan = parse_sql_to_plan( + r#"SELECT * FROM test WHERE id = (SELECT id FROM test CROSS JOIN (SELECT parse_ident($1::TEXT) AS parts) p LIMIT 1)"#, + ).await; + let types = DfLogicalPlanner::get_inferred_parameter_types(&plan).unwrap(); + + assert_eq!(types.len(), 1); + let type_1 = types.get("$1").unwrap(); + assert_eq!(type_1, &Some(DataType::Utf8)); + } } diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index e75192c9ba..8b64a256e7 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -89,7 +89,7 @@ operator.workspace = true otel-arrow-rust.workspace = true parking_lot.workspace = true pg_interval = { version = "0.5.2", package = "pg_interval_2" } -pgwire = { version = "0.38", default-features = false, features = [ +pgwire = { version = "0.38.1", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } diff --git a/tests-fuzz/src/utils/partition.rs b/tests-fuzz/src/utils/partition.rs index d3dc30061d..89a684326b 100644 --- a/tests-fuzz/src/utils/partition.rs +++ b/tests-fuzz/src/utils/partition.rs @@ -36,7 +36,7 @@ pub struct PartitionCount { } pub async fn count_partitions(db: &MySqlPool, datanode_id: u64) -> Result { - let sql = "select count(1) as count from information_schema.region_peers where peer_id == ?"; + let sql = "select count(1) as count from information_schema.region_peers where peer_id = ?"; sqlx::query_as::<_, PartitionCount>(sql) .bind(datanode_id) .fetch_one(db) diff --git a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs index c8ebbb54af..17cbfb9251 100644 --- a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs +++ b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs @@ -261,13 +261,18 @@ async fn 
migrate_regions(ctx: &FuzzContext, migrations: &[Migration]) -> Result< { let output = procedure_state(&greptime, &procedure_id).await; info!("Checking procedure: {procedure_id}, output: {output}"); - (fetch_partition(&greptime, region_id).await.unwrap(), output) + (fetch_partition(&greptime, region_id).await.ok(), output) } }) }, |(partition, output)| { - info!("Region: {region_id}, datanode: {}", partition.datanode_id); - partition.datanode_id == migration.to_peer && output.contains("Done") + if let Some(partition) = partition { + info!("Region: {region_id}, datanode: {}", partition.datanode_id); + partition.datanode_id == migration.to_peer && output.contains("Done") + } else { + info!("Region: {region_id}, partition not found yet"); + false + } }, Duration::from_secs(5), ) diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.result b/tests/cases/standalone/common/prepare/mysql_prepare.result index abc267b50e..5ef242a891 100644 --- a/tests/cases/standalone/common/prepare/mysql_prepare.result +++ b/tests/cases/standalone/common/prepare/mysql_prepare.result @@ -42,7 +42,7 @@ affected_rows: 0 -- SQLNESS PROTOCOL MYSQL EXECUTE stmt USING 'a'; -Failed to execute query, err: MySqlError { ERROR 1815 (HY000): (EngineExecuteQuery): Cast error: Cannot cast string 'a' to value of Int32 type } +Failed to execute query, err: MySqlError { ERROR 1210 (HY000): (InvalidArguments): Invalid request parameter: Unable to convert a to datatype Int32(Int32Type) } -- SQLNESS PROTOCOL MYSQL DEALLOCATE stmt; @@ -124,6 +124,25 @@ DEALLOCATE stmt; affected_rows: 0 +-- SQLNESS PROTOCOL MYSQL +PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?'; + +affected_rows: 0 + +-- SQLNESS PROTOCOL MYSQL +EXECUTE stmt USING 'cake'; + ++------------+--------------+ +| table_name | table_schema | ++------------+--------------+ +| cake | public | ++------------+--------------+ + +-- SQLNESS PROTOCOL MYSQL +DEALLOCATE stmt; + +affected_rows: 0 
+ -- SQLNESS PROTOCOL MYSQL DROP TABLE cake; diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.sql b/tests/cases/standalone/common/prepare/mysql_prepare.sql index 8e80a0a867..e96e945f88 100644 --- a/tests/cases/standalone/common/prepare/mysql_prepare.sql +++ b/tests/cases/standalone/common/prepare/mysql_prepare.sql @@ -72,5 +72,14 @@ EXECUTE stmt USING 'happy', 42, 0; -- SQLNESS PROTOCOL MYSQL DEALLOCATE stmt; +-- SQLNESS PROTOCOL MYSQL +PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?'; + +-- SQLNESS PROTOCOL MYSQL +EXECUTE stmt USING 'cake'; + +-- SQLNESS PROTOCOL MYSQL +DEALLOCATE stmt; + -- SQLNESS PROTOCOL MYSQL DROP TABLE cake; From 805536aed1fc17ba9ea83f522a9413030972ae46 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Fri, 20 Mar 2026 01:19:41 -0700 Subject: [PATCH 022/195] fix: windows file path (#7839) Signed-off-by: jeremyhi --- src/cli/src/data/snapshot_storage.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs index b6ff1c9222..50c8734a67 100644 --- a/src/cli/src/data/snapshot_storage.rs +++ b/src/cli/src/data/snapshot_storage.rs @@ -137,7 +137,16 @@ fn extract_file_path_from_uri(uri: &str) -> Result { reason: "file:// URI must use an absolute path like file:///tmp/backup", } .fail(), - _ => Ok(url.path().to_string()), + _ => url + .to_file_path() + .map(|path| path.to_string_lossy().into_owned()) + .map_err(|_| { + InvalidUriSnafu { + uri, + reason: "file:// URI must use a valid absolute filesystem path", + } + .build() + }), } } @@ -447,6 +456,7 @@ impl SnapshotStorage for OpenDalStorage { #[cfg(test)] mod tests { use std::collections::HashMap; + use std::path::Path; use object_store::ObjectStore; use object_store::services::Fs; @@ -512,8 +522,9 @@ mod tests { assert!(extract_remote_location("azblob://container").is_err()); } + #[cfg(not(windows))] #[test] - fn 
test_extract_path_from_uri() { + fn test_extract_path_from_uri_unix_examples() { assert_eq!( extract_file_path_from_uri("file:///tmp/backup").unwrap(), "/tmp/backup" @@ -529,6 +540,15 @@ mod tests { assert!(extract_file_path_from_uri("file://tmp/backup").is_err()); } + #[test] + fn test_extract_file_path_from_uri_round_trips_directory_url() { + let dir = tempdir().unwrap(); + let uri = Url::from_directory_path(dir.path()).unwrap().to_string(); + let path = extract_file_path_from_uri(&uri).unwrap(); + + assert_eq!(Path::new(&path), dir.path()); + } + #[tokio::test] async fn test_read_manifest_reports_requested_uri() { let dir = tempdir().unwrap(); From 72f289df503d9c4496d383362f35de152775e489 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Mon, 23 Mar 2026 15:12:39 +0800 Subject: [PATCH 023/195] chore: remove GrpcQueryHandler::put_record_batch (#7844) chore: remove GrpcQueryHandler::put_record_batch, we should use GrpcQueryHandler::handle_put_record_batch_stream instead Signed-off-by: Lei, HUANG --- src/frontend/src/instance/grpc.rs | 57 --------------------------- src/servers/src/query_handler/grpc.rs | 11 +----- src/servers/tests/mod.rs | 10 ----- 3 files changed, 1 insertion(+), 77 deletions(-) diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs index c4191145f8..70ff50fadc 100644 --- a/src/frontend/src/instance/grpc.rs +++ b/src/frontend/src/instance/grpc.rs @@ -27,7 +27,6 @@ use api::v1::{ use async_stream::try_stream; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; -use common_base::AffectedRows; use common_error::ext::BoxedError; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; @@ -260,62 +259,6 @@ impl GrpcQueryHandler for Instance { .context(server_error::ExecuteGrpcQuerySnafu) } - async fn put_record_batch( - &self, - request: servers::grpc::flight::PutRecordBatchRequest, - table_ref: &mut Option, - ctx: 
QueryContextRef, - ) -> server_error::Result { - let result: Result = async { - let table = if let Some(table) = table_ref { - table.clone() - } else { - let table = self - .catalog_manager() - .table( - &request.table_name.catalog_name, - &request.table_name.schema_name, - &request.table_name.table_name, - None, - ) - .await - .context(CatalogSnafu)? - .with_context(|| TableNotFoundSnafu { - table_name: request.table_name.to_string(), - })?; - *table_ref = Some(table.clone()); - table - }; - - let interceptor_ref = self.plugins.get::>(); - let interceptor = interceptor_ref.as_ref(); - interceptor.pre_bulk_insert(table.clone(), ctx.clone())?; - - self.plugins - .get::() - .as_ref() - .check_permission(ctx.current_user(), PermissionReq::BulkInsert) - .context(PermissionSnafu)?; - - // do we check limit for bulk insert? - - self.inserter - .handle_bulk_insert( - table, - request.flight_data, - request.record_batch, - request.schema_bytes, - ) - .await - .context(TableOperationSnafu) - } - .await; - - result - .map_err(BoxedError::new) - .context(server_error::ExecuteGrpcRequestSnafu) - } - fn handle_put_record_batch_stream( &self, stream: servers::grpc::flight::PutRecordBatchRequestStream, diff --git a/src/servers/src/query_handler/grpc.rs b/src/servers/src/query_handler/grpc.rs index 67d8b3890e..d66a76464e 100644 --- a/src/servers/src/query_handler/grpc.rs +++ b/src/servers/src/query_handler/grpc.rs @@ -17,15 +17,13 @@ use std::sync::Arc; use api::v1::greptime_request::Request; use async_trait::async_trait; -use common_base::AffectedRows; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; use futures::Stream; use session::context::QueryContextRef; -use table::TableRef; use crate::error::Result; -use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream}; +use crate::grpc::flight::PutRecordBatchRequestStream; pub type ServerGrpcQueryHandlerRef = Arc; @@ -35,13 +33,6 @@ pub type RawRecordBatch = bytes::Bytes; pub trait 
GrpcQueryHandler { async fn do_query(&self, query: Request, ctx: QueryContextRef) -> Result; - async fn put_record_batch( - &self, - request: PutRecordBatchRequest, - table_ref: &mut Option, - ctx: QueryContextRef, - ) -> Result; - fn handle_put_record_batch_stream( &self, stream: PutRecordBatchRequestStream, diff --git a/src/servers/tests/mod.rs b/src/servers/tests/mod.rs index e3f8f8fc79..c4f83c5e6c 100644 --- a/src/servers/tests/mod.rs +++ b/src/servers/tests/mod.rs @@ -18,7 +18,6 @@ use api::v1::greptime_request::Request; use api::v1::query_request::Query; use async_trait::async_trait; use catalog::memory::MemoryCatalogManager; -use common_base::AffectedRows; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; @@ -149,15 +148,6 @@ impl GrpcQueryHandler for DummyInstance { Ok(output) } - async fn put_record_batch( - &self, - _request: servers::grpc::flight::PutRecordBatchRequest, - _table_ref: &mut Option, - _ctx: QueryContextRef, - ) -> Result { - unimplemented!() - } - fn handle_put_record_batch_stream( &self, _stream: servers::grpc::flight::PutRecordBatchRequestStream, From 78742820891c245e277260fd3f62bd478d6fdc34 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Tue, 24 Mar 2026 03:39:57 +0800 Subject: [PATCH 024/195] feat(mito): flat scan for time series memtable (#7814) * feat/flat-for-time-series: ### Commit Message Enhance `TimeSeriesMemtable` with Record Batch Support - **`time_series.rs`**: - Introduced `BatchToRecordBatchContext` to facilitate conversion of batch iterators to record batch iterators. - Added `build_record_batch` method in `TimeSeriesIterBuilder` to support record batch creation. - Implemented multiple test cases to validate the functionality of record batch creation, including tests for projections, deduplication, sequence filtering, and data correctness. 
Signed-off-by: Lei, HUANG * feat/flat-for-time-series: Refactor `TimeSeriesMemtable` and `TimeSeriesIterBuilder` - Renamed `adapter_context` to `batch_to_record_batch` in `TimeSeriesMemtable` for clarity. - Simplified `MemtableRangeContext` initialization by removing the `batch_to_record_batch` parameter. - Added `is_record_batch` method to `TimeSeriesIterBuilder` to indicate record batch status. Signed-off-by: Lei, HUANG * feat/flat-for-time-series: ### Add Time Range Filtering and Predicate Group Enhancements - **`memtable.rs`**: Updated `IterBuilder` to include `time_range` parameter in `build_record_batch` method, enhancing record batch iteration with time range filtering. - **`time_series.rs`**: Modified `TimeSeriesIterBuilder` to use `PredicateGroup` instead of `Predicate`, and integrated `PruneTimeIterator` for time-based filtering. - **`memtable_util.rs`**: Removed unused `Predicate` import, reflecting changes in predicate handling. Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- src/mito2/src/memtable.rs | 6 +- src/mito2/src/memtable/bulk.rs | 4 + src/mito2/src/memtable/time_series.rs | 323 +++++++++++++++++++++-- src/mito2/src/test_util/memtable_util.rs | 1 - 4 files changed, 310 insertions(+), 24 deletions(-) diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index 7494ec68ed..3ebfdd3628 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -537,11 +537,15 @@ pub trait IterBuilder: Send + Sync { } /// Returns the record batch iterator to read the range. + /// ## Note + /// Implementations should ensure the iterator yields data within given time range. 
fn build_record_batch( &self, + time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { let _metrics = metrics; + let _ = time_range; UnsupportedOperationSnafu { err_msg: "Record batch iterator is not supported by this memtable", } @@ -700,7 +704,7 @@ impl MemtableRange { metrics: Option, ) -> Result { if self.context.builder.is_record_batch() { - return self.context.builder.build_record_batch(metrics); + return self.context.builder.build_record_batch(time_range, metrics); } if let Some(context) = self.context.batch_to_record_batch.as_ref() { diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index 4dad4fb885..e649681b76 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -34,6 +34,7 @@ fn env_usize(name: &str, default: usize) -> usize { .unwrap_or(default) } +use common_time::Timestamp; use datatypes::arrow::datatypes::SchemaRef; use mito_codec::key_values::KeyValue; use rayon::prelude::*; @@ -792,6 +793,7 @@ impl IterBuilder for BulkRangeIterBuilder { fn build_record_batch( &self, + _time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { let series_count = self.part.estimated_series_count(); @@ -825,6 +827,7 @@ impl IterBuilder for MultiBulkRangeIterBuilder { fn build_record_batch( &self, + _time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { self.part @@ -864,6 +867,7 @@ impl IterBuilder for EncodedBulkRangeIterBuilder { fn build_record_batch( &self, + _time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { if let Some(iter) = self diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index 97f5f3c9ce..d3d00d0703 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs @@ -51,15 +51,18 @@ use crate::memtable::bulk::part::BulkPart; use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable; use crate::memtable::stats::WriteMetrics; use 
crate::memtable::{ - AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues, - MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext, - MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection, + AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, BoxedRecordBatchIterator, + IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, + MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions, + read_column_ids_from_projection, }; use crate::metrics::{ MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL, READ_STAGE_ELAPSED, }; use crate::read::dedup::LastNonNullIter; +use crate::read::prune::PruneTimeIterator; +use crate::read::scan_region::PredicateGroup; use crate::read::{Batch, BatchBuilder, BatchColumn}; use crate::region::options::MergeMode; @@ -283,25 +286,20 @@ impl Memtable for TimeSeriesMemtable { .map(|c| c.column_id) .collect() }; - let builder = Box::new(TimeSeriesIterBuilder { - series_set: self.series_set.clone(), - projection, - predicate: predicate.predicate().cloned(), - dedup: self.dedup, - merge_mode: self.merge_mode, - sequence, - }); - let adapter_context = Arc::new(BatchToRecordBatchContext::new( + let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new( self.region_metadata.clone(), read_column_ids, )); - let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch( - self.id, - builder, - predicate, - Some(adapter_context), - )); - + let builder = Box::new(TimeSeriesIterBuilder { + series_set: self.series_set.clone(), + projection, + predicate: predicate.clone(), + dedup: self.dedup, + merge_mode: self.merge_mode, + sequence, + batch_to_record_batch, + }); + let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate)); let range_stats = self.stats(); let range = MemtableRange::new(context, range_stats); 
Ok(MemtableRanges { @@ -443,7 +441,7 @@ impl SeriesSet { fn iter_series( &self, projection: HashSet, - predicate: Option, + predicate: PredicateGroup, dedup: bool, merge_mode: MergeMode, sequence: Option, @@ -460,7 +458,7 @@ impl SeriesSet { self.region_metadata.clone(), self.series.clone(), projection, - predicate, + predicate.predicate().cloned(), primary_key_schema, primary_key_datatypes, self.codec.clone(), @@ -1245,10 +1243,11 @@ impl From for Values { struct TimeSeriesIterBuilder { series_set: SeriesSet, projection: HashSet, - predicate: Option, + predicate: PredicateGroup, dedup: bool, sequence: Option, merge_mode: MergeMode, + batch_to_record_batch: Arc, } impl IterBuilder for TimeSeriesIterBuilder { @@ -1268,6 +1267,25 @@ impl IterBuilder for TimeSeriesIterBuilder { Ok(Box::new(iter)) } } + + fn is_record_batch(&self) -> bool { + true + } + + fn build_record_batch( + &self, + time_range: Option<(Timestamp, Timestamp)>, + metrics: Option, + ) -> Result { + let iter = self.build(metrics)?; + let iter: BoxedBatchIterator = if let Some(time_range) = time_range { + let time_filters = self.predicate.time_filters(); + Box::new(PruneTimeIterator::new(iter, time_range, time_filters)) + } else { + iter + }; + Ok(self.batch_to_record_batch.adapt_iter(iter)) + } } #[cfg(test)] @@ -2014,4 +2032,265 @@ mod tests { all_timestamps.sort(); assert_eq!(vec![3, 4, 5, 6, 7], all_timestamps); } + + /// Helper to create a TimeSeriesIterBuilder from a memtable and schema. 
+ fn build_iter_builder( + schema: &RegionMetadataRef, + memtable: &TimeSeriesMemtable, + projection: Option<&[ColumnId]>, + dedup: bool, + merge_mode: MergeMode, + sequence: Option, + ) -> TimeSeriesIterBuilder { + let read_column_ids = read_column_ids_from_projection(schema, projection); + let field_projection = if let Some(projection) = projection { + projection.iter().copied().collect() + } else { + schema.field_columns().map(|c| c.column_id).collect() + }; + let adapter_context = Arc::new(BatchToRecordBatchContext::new( + schema.clone(), + read_column_ids, + )); + TimeSeriesIterBuilder { + series_set: memtable.series_set.clone(), + projection: field_projection, + predicate: PredicateGroup::default(), + dedup, + merge_mode, + sequence, + batch_to_record_batch: adapter_context, + } + } + + #[test] + fn test_iter_builder_build_record_batch_basic() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "hello".to_string(), 42, 10); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let mut iter = builder.build_record_batch(None, None).unwrap(); + let rb = iter.next().transpose().unwrap().unwrap(); + assert_eq!(10, rb.num_rows()); + + let rb_schema = rb.schema(); + let col_names: Vec<_> = rb_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + assert_eq!( + col_names, + vec![ + "k0", + "k1", + "v0", + "v1", + "ts", + "__primary_key", + "__sequence", + "__op_type", + ] + ); + + assert!(iter.next().is_none()); + } + + #[test] + fn test_iter_builder_build_record_batch_with_projection() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "test".to_string(), 1, 5); + memtable.write(&kvs).unwrap(); + + // Project only field v0 (column_id=3) 
and ts (column_id=2). + let projection = vec![2, 3]; + let builder = build_iter_builder( + &schema, + &memtable, + Some(&projection), + true, + MergeMode::LastRow, + None, + ); + + let mut iter = builder.build_record_batch(None, None).unwrap(); + let rb = iter.next().transpose().unwrap().unwrap(); + assert_eq!(5, rb.num_rows()); + + let rb_schema = rb.schema(); + let col_names: Vec<_> = rb_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + // Only projected columns + internal columns. + assert_eq!( + col_names, + vec!["v0", "ts", "__primary_key", "__sequence", "__op_type",] + ); + + assert!(iter.next().is_none()); + } + + #[test] + fn test_iter_builder_build_record_batch_multiple_series() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs_a = build_key_values(&schema, "aaa".to_string(), 1, 3); + let kvs_b = build_key_values(&schema, "bbb".to_string(), 2, 4); + memtable.write(&kvs_a).unwrap(); + memtable.write(&kvs_b).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let iter = builder.build_record_batch(None, None).unwrap(); + let mut total_rows = 0; + for rb in iter { + let rb = rb.unwrap(); + total_rows += rb.num_rows(); + assert_eq!(8, rb.num_columns()); + } + assert_eq!(7, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_dedup() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + // Write same data twice — dedup should keep only one copy per timestamp. 
+ let kvs = build_key_values(&schema, "dup".to_string(), 10, 5); + memtable.write(&kvs).unwrap(); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(5, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_no_dedup() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, false, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "dup".to_string(), 10, 5); + memtable.write(&kvs).unwrap(); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, false, MergeMode::LastRow, None); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(10, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_with_sequence_filter() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + // build_key_values creates a mutation with base sequence=0. + // Each row gets sequence = base + row_index, so 5 rows get sequences 0,1,2,3,4. + let kvs = build_key_values(&schema, "seq".to_string(), 1, 5); + memtable.write(&kvs).unwrap(); + + // Filter to sequence > 4 — should yield no rows. + let builder = build_iter_builder( + &schema, + &memtable, + None, + true, + MergeMode::LastRow, + Some(SequenceRange::Gt { min: 4 }), + ); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(0, total_rows); + + // Filter to sequence <= 2 — should yield 3 rows (sequences 0, 1, 2). 
+ let builder = build_iter_builder( + &schema, + &memtable, + None, + true, + MergeMode::LastRow, + Some(SequenceRange::LtEq { max: 2 }), + ); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(3, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_data_correctness() { + use datatypes::arrow::array::{ + Float64Array, Int64Array, TimestampMillisecondArray, UInt8Array, + }; + + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "check".to_string(), 7, 3); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let mut iter = builder.build_record_batch(None, None).unwrap(); + let rb = iter.next().transpose().unwrap().unwrap(); + assert_eq!(3, rb.num_rows()); + + // Verify timestamp values. + let ts_col = rb + .column_by_name("ts") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let timestamps: Vec<_> = (0..ts_col.len()).map(|i| ts_col.value(i)).collect(); + assert_eq!(vec![0, 1, 2], timestamps); + + // Verify field v0 values. + let v0_col = rb + .column_by_name("v0") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let v0_values: Vec<_> = (0..v0_col.len()).map(|i| v0_col.value(i)).collect(); + assert_eq!(vec![0, 1, 2], v0_values); + + // Verify field v1 values. + let v1_col = rb + .column_by_name("v1") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let v1_values: Vec<_> = (0..v1_col.len()).map(|i| v1_col.value(i)).collect(); + assert_eq!(vec![0.0, 1.0, 2.0], v1_values); + + // Verify op_type is all Put (1). 
+ let op_col = rb + .column_by_name("__op_type") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..op_col.len() { + assert_eq!(OpType::Put as u8, op_col.value(i)); + } + + assert!(iter.next().is_none()); + } } diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs index 58ea49fa41..8917875250 100644 --- a/src/mito2/src/test_util/memtable_util.rs +++ b/src/mito2/src/test_util/memtable_util.rs @@ -31,7 +31,6 @@ use store_api::metadata::{ ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, }; use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange}; -use table::predicate::Predicate; use crate::error::Result; use crate::memtable::bulk::part::BulkPart; From f999d5e70e3076d7c45223613a7d6465bfa07c3e Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 24 Mar 2026 08:11:37 +0800 Subject: [PATCH 025/195] feat: avoid some vector-array conversions on flat projection (#7804) * perf(mito2): optimize flat projection conversion * shrink the diff size Signed-off-by: Ruihang Xia * apply gemini's sugg Signed-off-by: Ruihang Xia * nit Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/mito2/src/read/flat_projection.rs | 78 +++++++++++++++++++++++++-- src/mito2/src/read/projection.rs | 23 ++++---- src/mito2/src/read/stream.rs | 5 +- 3 files changed, 89 insertions(+), 17 deletions(-) diff --git a/src/mito2/src/read/flat_projection.rs b/src/mito2/src/read/flat_projection.rs index 3e0f1169df..02b4c6b3c1 100644 --- a/src/mito2/src/read/flat_projection.rs +++ b/src/mito2/src/read/flat_projection.rs @@ -18,18 +18,21 @@ use std::sync::Arc; use api::v1::SemanticType; use common_error::ext::BoxedError; -use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu}; +use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu}; use common_recordbatch::{DfRecordBatch, RecordBatch}; -use datatypes::arrow::datatypes::Field; +use 
datatypes::arrow::array::Array; +use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field}; use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::schema::{Schema, SchemaRef}; +use datatypes::value::Value; use datatypes::vectors::Helper; use snafu::{OptionExt, ResultExt}; use store_api::metadata::{RegionMetadata, RegionMetadataRef}; use store_api::storage::ColumnId; +use crate::cache::CacheStrategy; use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result}; -use crate::read::projection::read_column_ids_from_projection; +use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache}; use crate::sst::parquet::flat_format::sst_column_id_indices; use crate::sst::parquet::format::FormatProjection; use crate::sst::{ @@ -248,12 +251,55 @@ impl FlatProjectionMapper { pub(crate) fn convert( &self, batch: &datatypes::arrow::record_batch::RecordBatch, + cache_strategy: &CacheStrategy, ) -> common_recordbatch::error::Result { if self.is_empty_projection { return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows()); } - let columns = self.project_vectors(batch)?; - RecordBatch::new(self.output_schema.clone(), columns) + // Construct output record batch directly from Arrow arrays to avoid + // Arrow -> Vector -> Arrow roundtrips in the hot path. + let mut arrays = Vec::with_capacity(self.output_schema.num_columns()); + for (output_idx, index) in self.batch_indices.iter().enumerate() { + let mut array = batch.column(*index).clone(); + // Cast dictionary values to the target type. + if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() { + // When a string dictionary column contains only a single value, reuse a cached + // repeated vector to avoid repeatedly expanding the dictionary. 
+ if let Some(dict_array) = single_value_string_dictionary( + &array, + &self.output_schema.column_schemas()[output_idx].data_type, + value_type.as_ref(), + ) { + let dict_values = dict_array.values(); + let value = if dict_values.is_null(0) { + Value::Null + } else { + Value::from(datatypes::arrow_array::string_array_value(dict_values, 0)) + }; + + let repeated = repeated_vector_with_cache( + &self.output_schema.column_schemas()[output_idx].data_type, + &value, + batch.num_rows(), + cache_strategy, + )?; + array = repeated.to_arrow_array(); + } else { + let casted = datatypes::arrow::compute::cast(&array, value_type) + .context(ArrowComputeSnafu)?; + array = casted; + } + } + arrays.push(array); + } + + let df_record_batch = + DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays) + .context(NewDfRecordBatchSnafu)?; + Ok(RecordBatch::from_df_record_batch( + self.output_schema.clone(), + df_record_batch, + )) } /// Projects columns from the input batch and converts them into vectors. @@ -281,6 +327,28 @@ impl FlatProjectionMapper { } } +fn single_value_string_dictionary<'a>( + array: &'a Arc, + output_type: &ConcreteDataType, + value_type: &ArrowDataType, +) -> Option<&'a datatypes::arrow::array::DictionaryArray> { + if !matches!( + value_type, + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View + ) || !output_type.is_string() + { + return None; + } + + let dict_array = array + .as_any() + .downcast_ref::>()?; + + (dict_array.values().len() == 1 && dict_array.null_count() == 0).then_some(dict_array) +} + /// Returns ids and datatypes of columns of the output batch after applying the `projection`. /// /// It adds the time index column if it doesn't present in the projection. 
diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs index 2c000e7bdc..b5b6904521 100644 --- a/src/mito2/src/read/projection.rs +++ b/src/mito2/src/read/projection.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use api::v1::SemanticType; use common_error::ext::BoxedError; use common_recordbatch::RecordBatch; -use common_recordbatch::error::ExternalSnafu; +use common_recordbatch::error::{DataTypesSnafu, ExternalSnafu}; use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::schema::{Schema, SchemaRef}; use datatypes::value::Value; @@ -37,7 +37,7 @@ use crate::read::Batch; use crate::read::flat_projection::FlatProjectionMapper; /// Only cache vector when its length `<=` this value. -const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384; +pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384; /// Wrapper enum for different projection mapper implementations. pub enum ProjectionMapper { @@ -423,7 +423,7 @@ enum BatchIndex { } /// Gets a vector with repeated values from specific cache or creates a new one. -fn repeated_vector_with_cache( +pub(crate) fn repeated_vector_with_cache( data_type: &ConcreteDataType, value: &Value, num_rows: usize, @@ -450,7 +450,7 @@ fn repeated_vector_with_cache( } /// Returns a vector with repeated values. -fn new_repeated_vector( +pub(crate) fn new_repeated_vector( data_type: &ConcreteDataType, value: &Value, num_rows: usize, @@ -458,8 +458,7 @@ fn new_repeated_vector( let mut mutable_vector = data_type.create_mutable_vector(1); mutable_vector .try_push_value_ref(&value.as_value_ref()) - .map_err(BoxedError::new) - .context(ExternalSnafu)?; + .context(DataTypesSnafu)?; // This requires an additional allocation. 
let base_vector = mutable_vector.to_vector(); Ok(base_vector.replicate(&[num_rows])) @@ -809,6 +808,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; let mapper = ProjectionMapper::all(&metadata, true).unwrap(); assert_eq!([0, 1, 2, 3, 4], mapper.column_ids()); assert_eq!( @@ -823,7 +823,7 @@ mod tests { ); let batch = new_flat_batch(Some(0), &[(1, 1), (2, 2)], &[(3, 3), (4, 4)], 3); - let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap(); + let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap(); let expect = "\ +---------------------+----+----+----+----+ | ts | k0 | k1 | v0 | v1 | @@ -843,6 +843,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; // Columns v1, k0 let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap(); assert_eq!([4, 1], mapper.column_ids()); @@ -856,7 +857,7 @@ mod tests { ); let batch = new_flat_batch(None, &[(1, 1)], &[(4, 4)], 3); - let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap(); + let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap(); let expect = "\ +----+----+ | v1 | k0 | @@ -876,6 +877,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; // Output columns v1, k0. Read also includes v0. 
let mapper = ProjectionMapper::new_with_read_columns( &metadata, @@ -887,7 +889,7 @@ mod tests { assert_eq!([4, 1, 3], mapper.column_ids()); let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3); - let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap(); + let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap(); let expect = "\ +----+----+ | v1 | k0 | @@ -907,6 +909,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; // Empty projection let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap(); assert_eq!([0], mapper.column_ids()); // Should still read the time index column @@ -918,7 +921,7 @@ mod tests { ); let batch = new_flat_batch(Some(0), &[], &[], 3); - let record_batch = flat_mapper.convert(&batch).unwrap(); + let record_batch = flat_mapper.convert(&batch, &cache).unwrap(); assert_eq!(3, record_batch.num_rows()); assert_eq!(0, record_batch.num_columns()); assert!(record_batch.schema.is_empty()); diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs index dd85616241..80002147ea 100644 --- a/src/mito2/src/read/stream.rs +++ b/src/mito2/src/read/stream.rs @@ -99,7 +99,8 @@ impl ConvertBatchStream { let mapper = self.projection_mapper.as_flat().unwrap(); for batch in flat_batch.batches { - self.pending.push_back(mapper.convert(&batch)?); + self.pending + .push_back(mapper.convert(&batch, &self.cache_strategy)?); } } } @@ -114,7 +115,7 @@ impl ConvertBatchStream { // Safety: Only flat format returns this batch. 
let mapper = self.projection_mapper.as_flat().unwrap(); - mapper.convert(&df_record_batch) + mapper.convert(&df_record_batch, &self.cache_strategy) } } } From 223f6cfdf727f9b7622126d24a98ac19bec61353 Mon Sep 17 00:00:00 2001 From: dennis zhuang Date: Tue, 24 Mar 2026 10:05:16 +0800 Subject: [PATCH 026/195] feat: supports sst_format for x-greptime-hints and database options (#7843) Signed-off-by: Dennis Zhuang --- src/table/src/requests.rs | 9 ++- tests-integration/tests/http.rs | 38 +++++++++ .../common/alter/alter_database.result | 79 +++++++++++++++++++ .../common/alter/alter_database.sql | 22 +++++- 4 files changed, 144 insertions(+), 4 deletions(-) diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs index 43fc36644b..15b4278f51 100644 --- a/src/table/src/requests.rs +++ b/src/table/src/requests.rs @@ -36,8 +36,9 @@ use store_api::metric_engine_consts::{ LOGICAL_TABLE_METADATA_KEY, PHYSICAL_TABLE_METADATA_KEY, is_metric_engine_option_key, }; use store_api::mito_engine_options::{ - APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, TWCS_FALLBACK_TO_LOCAL, - TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, is_mito_engine_option_key, + APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, SST_FORMAT_KEY, + TWCS_FALLBACK_TO_LOCAL, TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, + is_mito_engine_option_key, }; use store_api::region_request::{SetRegionOption, UnsetRegionOption}; @@ -56,13 +57,14 @@ pub const TABLE_DATA_MODEL_TRACE_V1: &str = "greptime_trace_v1"; pub const OTLP_METRIC_COMPAT_KEY: &str = "otlp_metric_compat"; pub const OTLP_METRIC_COMPAT_PROM: &str = "prom"; -pub const VALID_TABLE_OPTION_KEYS: [&str; 12] = [ +pub const VALID_TABLE_OPTION_KEYS: [&str; 13] = [ // common keys: WRITE_BUFFER_SIZE_KEY, TTL_KEY, STORAGE_KEY, COMMENT_KEY, SKIP_WAL_KEY, + SST_FORMAT_KEY, // file engine keys: FILE_TABLE_LOCATION_KEY, FILE_TABLE_FORMAT_KEY, @@ -94,6 +96,7 @@ static VALID_DB_OPT_KEYS: 
Lazy> = Lazy::new(|| { set.insert(TWCS_TIME_WINDOW); set.insert(TWCS_TRIGGER_FILE_NUM); set.insert(TWCS_MAX_OUTPUT_FILE_SIZE); + set.insert(SST_FORMAT_KEY); set }); diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index c259d3ff24..65e56fa15e 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -148,6 +148,7 @@ macro_rules! http_tests { test_jaeger_query_api_for_trace_v1, test_influxdb_write, + test_influxdb_write_with_hints, test_http_memory_limit, ); )* @@ -3638,6 +3639,43 @@ transform: guard.remove_all().await; } +pub async fn test_influxdb_write_with_hints(storage_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_http_app_with_frontend(storage_type, "test_influxdb_write_with_hints").await; + + let client = TestClient::new(app).await; + + let result = client + .post("/v1/influxdb/write?db=public") + .header("x-greptime-hints", "sst_format=flat,ttl=30d,skip_wal=true") + .body("sst_fmt_table,host=host1 cpu=1.2 1664370459457010101") + .send() + .await; + assert_eq!(result.status(), 204); + + let res = client + .get("/v1/sql?sql=show create table sst_fmt_table") + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let resp = res.text().await; + assert!( + resp.contains("sst_format = 'flat'"), + "expected sst_format = 'flat' in SHOW CREATE TABLE output, got: {resp}" + ); + assert!( + resp.contains("ttl = '30days'"), + "expected ttl = '30days' in SHOW CREATE TABLE output, got: {resp}" + ); + assert!( + resp.contains("skip_wal = 'true'"), + "expected skip_wal = 'true' in SHOW CREATE TABLE output, got: {resp}" + ); + + guard.remove_all().await; +} + /// Test one-to-many VRL pipeline expansion. /// This test verifies that a VRL processor can return an array, which results in /// multiple output rows from a single input row. 
diff --git a/tests/cases/standalone/common/alter/alter_database.result b/tests/cases/standalone/common/alter/alter_database.result index 911ef5ddfc..2fccce10de 100644 --- a/tests/cases/standalone/common/alter/alter_database.result +++ b/tests/cases/standalone/common/alter/alter_database.result @@ -314,6 +314,85 @@ SHOW CREATE DATABASE alter_database; | | ) | +----------------+----------------------------------------------+ +-- Test sst_format option +ALTER DATABASE alter_database SET 'sst_format'='flat'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs', | +| | sst_format = 'flat' | +| | ) | ++----------------+----------------------------------------------+ + +USE alter_database; + +Affected Rows: 0 + +CREATE TABLE monitor(ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +SHOW CREATE TABLE monitor; + ++---------+----------------------------------------+ +| Table | Create Table | ++---------+----------------------------------------+ +| monitor | CREATE TABLE IF NOT EXISTS "monitor" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | sst_format = 'flat' | +| | ) | ++---------+----------------------------------------+ + +USE public; + +Affected Rows: 0 + +ALTER DATABASE alter_database SET 'sst_format'='primary_key'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 
'compaction.type' = 'twcs', | +| | sst_format = 'primary_key' | +| | ) | ++----------------+----------------------------------------------+ + +ALTER DATABASE alter_database UNSET 'sst_format'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+----------------------------------------------+ + DROP DATABASE alter_database; Affected Rows: 0 diff --git a/tests/cases/standalone/common/alter/alter_database.sql b/tests/cases/standalone/common/alter/alter_database.sql index 1b2f75637a..33b309153e 100644 --- a/tests/cases/standalone/common/alter/alter_database.sql +++ b/tests/cases/standalone/common/alter/alter_database.sql @@ -90,5 +90,25 @@ ALTER DATABASE alter_database UNSET 'ttl'; SHOW CREATE DATABASE alter_database; -DROP DATABASE alter_database; +-- Test sst_format option +ALTER DATABASE alter_database SET 'sst_format'='flat'; +SHOW CREATE DATABASE alter_database; + +USE alter_database; + +CREATE TABLE monitor(ts TIMESTAMP TIME INDEX); + +SHOW CREATE TABLE monitor; + +USE public; + +ALTER DATABASE alter_database SET 'sst_format'='primary_key'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database UNSET 'sst_format'; + +SHOW CREATE DATABASE alter_database; + +DROP DATABASE alter_database; From 7afe16ddf75d8857ad75d98108e4740c87eac966 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:15:06 +0800 Subject: [PATCH 027/195] chore(deps): bump rustls-webpki from 0.103.3 to 0.103.10 (#7847) Bumps [rustls-webpki](https://github.com/rustls/webpki) from 0.103.3 to 0.103.10. 
- [Release notes](https://github.com/rustls/webpki/releases) - [Commits](https://github.com/rustls/webpki/compare/v/0.103.3...v/0.103.10) --- updated-dependencies: - dependency-name: rustls-webpki dependency-version: 0.103.10 dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 073ae03525..2e419019c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -11635,9 +11635,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ "ring", "rustls-pki-types", From 6bebf93caf18022e985da867be3d703e67bb002c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:15:27 +0800 Subject: [PATCH 028/195] chore(deps): bump tar from 0.4.44 to 0.4.45 (#7846) Bumps [tar](https://github.com/alexcrichton/tar-rs) from 0.4.44 to 0.4.45. - [Commits](https://github.com/alexcrichton/tar-rs/compare/0.4.44...0.4.45) --- updated-dependencies: - dependency-name: tar dependency-version: 0.4.45 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2e419019c7..32f9aa27d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13404,9 +13404,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" dependencies = [ "filetime", "libc", From 5231ee40c8666561732a63cb043c3a4c08cd50c9 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Tue, 24 Mar 2026 11:57:18 +0800 Subject: [PATCH 029/195] feat: add parquet pk prefilter helpers (#7850) * feat: extract parquet pk prefilter helpers Signed-off-by: evenyag * chore: fmt code Signed-off-by: evenyag * chore: fix warnings Signed-off-by: evenyag * chore: update todo Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/mito2/src/sst/parquet.rs | 1 + src/mito2/src/sst/parquet/prefilter.rs | 528 +++++++++++++++++++++++++ 2 files changed, 529 insertions(+) create mode 100644 src/mito2/src/sst/parquet/prefilter.rs diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 26bed76fd6..fb8e1d1fc2 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -29,6 +29,7 @@ pub mod flat_format; pub mod format; pub(crate) mod helper; pub(crate) mod metadata; +pub mod prefilter; pub mod reader; pub mod row_group; pub mod row_selection; diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs new file mode 100644 index 0000000000..5de2e3512f --- /dev/null +++ b/src/mito2/src/sst/parquet/prefilter.rs @@ -0,0 +1,528 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Helpers for parquet prefiltering. + +use std::ops::Range; + +use api::v1::SemanticType; +use common_recordbatch::filter::SimpleFilterEvaluator; +use datatypes::arrow::array::{BinaryArray, BooleanArray}; +use datatypes::arrow::record_batch::RecordBatch; +use mito_codec::primary_key_filter::is_partition_column; +use mito_codec::row_converter::PrimaryKeyFilter; +use snafu::{OptionExt, ResultExt}; +use store_api::metadata::{RegionMetadata, RegionMetadataRef}; + +use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu}; +use crate::sst::parquet::flat_format::primary_key_column_index; +use crate::sst::parquet::format::PrimaryKeyArray; + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn matching_row_ranges_by_primary_key( + input: &RecordBatch, + pk_filter: &mut dyn PrimaryKeyFilter, +) -> Result>> { + let primary_key_index = primary_key_column_index(input.num_columns()); + let pk_dict_array = input + .column(primary_key_index) + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key column is not a dictionary array", + })?; + let pk_values = pk_dict_array + .values() + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key values are not binary array", + })?; + let keys = pk_dict_array.keys(); + let key_values = keys.values(); + + if key_values.is_empty() { + return Ok(std::iter::once(0..input.num_rows()).collect()); + } + + let mut matched_row_ranges: Vec> = Vec::new(); + let 
mut start = 0; + while start < key_values.len() { + let key = key_values[start]; + let mut end = start + 1; + while end < key_values.len() && key_values[end] == key { + end += 1; + } + + if pk_filter.matches(pk_values.value(key as usize)) { + if let Some(last) = matched_row_ranges.last_mut() + && last.end == start + { + last.end = end; + } else { + matched_row_ranges.push(start..end); + } + } + + start = end; + } + + Ok(matched_row_ranges) +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn prefilter_flat_batch_by_primary_key( + input: RecordBatch, + pk_filter: &mut dyn PrimaryKeyFilter, +) -> Result> { + if input.num_rows() == 0 { + return Ok(Some(input)); + } + + let matched_row_ranges = matching_row_ranges_by_primary_key(&input, pk_filter)?; + if matched_row_ranges.is_empty() { + return Ok(None); + } + + if matched_row_ranges.len() == 1 + && matched_row_ranges[0].start == 0 + && matched_row_ranges[0].end == input.num_rows() + { + return Ok(Some(input)); + } + + if matched_row_ranges.len() == 1 { + let span = &matched_row_ranges[0]; + return Ok(Some(input.slice(span.start, span.end - span.start))); + } + + let mut mask = vec![false; input.num_rows()]; + for span in matched_row_ranges { + mask[span].fill(true); + } + + let filtered = + datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask)) + .context(ComputeArrowSnafu)?; + if filtered.num_rows() == 0 { + Ok(None) + } else { + Ok(Some(filtered)) + } +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn retain_usable_primary_key_filters( + sst_metadata: &RegionMetadataRef, + expected_metadata: Option<&RegionMetadata>, + filters: &mut Vec, +) { + filters.retain(|filter| is_usable_primary_key_filter(sst_metadata, expected_metadata, filter)); +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn is_usable_primary_key_filter( + sst_metadata: &RegionMetadataRef, + expected_metadata: Option<&RegionMetadata>, + filter: &SimpleFilterEvaluator, +) -> bool { + // 
TODO(yingwen): The primary key filter always skips the partition column. Consider using a flag + // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable. + if is_partition_column(filter.column_name()) { + return false; + } + + let sst_column = match expected_metadata { + Some(expected_metadata) => { + let Some(expected_column) = expected_metadata.column_by_name(filter.column_name()) + else { + return false; + }; + let Some(sst_column) = sst_metadata.column_by_id(expected_column.column_id) else { + return false; + }; + + if sst_column.column_schema.name != expected_column.column_schema.name + || sst_column.semantic_type != expected_column.semantic_type + || sst_column.column_schema.data_type != expected_column.column_schema.data_type + { + return false; + } + + sst_column + } + None => { + let Some(sst_column) = sst_metadata.column_by_name(filter.column_name()) else { + return false; + }; + sst_column + } + }; + + sst_column.semantic_type == SemanticType::Tag + && sst_metadata + .primary_key_index(sst_column.column_id) + .is_some() +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) struct CachedPrimaryKeyFilter { + inner: Box, + last_primary_key: Vec, + last_match: Option, +} + +impl CachedPrimaryKeyFilter { + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn new(inner: Box) -> Self { + Self { + inner, + last_primary_key: Vec::new(), + last_match: None, + } + } +} + +impl PrimaryKeyFilter for CachedPrimaryKeyFilter { + fn matches(&mut self, pk: &[u8]) -> bool { + if let Some(last_match) = self.last_match + && self.last_primary_key == pk + { + return last_match; + } + + let matched = self.inner.matches(pk); + self.last_primary_key.clear(); + self.last_primary_key.extend_from_slice(pk); + self.last_match = Some(matched); + matched + } +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result> { + let primary_key_index = 
primary_key_column_index(batch.num_columns()); + let pk_dict_array = batch + .column(primary_key_index) + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key column is not a dictionary array", + })?; + let pk_values = pk_dict_array + .values() + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key values are not binary array", + })?; + let keys = pk_dict_array.keys(); + if keys.is_empty() { + return Ok(None); + } + + let first_key = keys.value(0); + if first_key != keys.value(keys.len() - 1) { + return Ok(None); + } + + Ok(Some(pk_values.value(first_key as usize))) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use api::v1::SemanticType; + use common_recordbatch::filter::SimpleFilterEvaluator; + use datafusion_expr::{col, lit}; + use datatypes::arrow::array::{ + ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array, + UInt64Array, + }; + use datatypes::arrow::datatypes::{Schema, UInt32Type}; + use datatypes::arrow::record_batch::RecordBatch; + use datatypes::prelude::ConcreteDataType; + use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec}; + use store_api::codec::PrimaryKeyEncoding; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; + use store_api::storage::ColumnSchema; + + use super::*; + use crate::sst::internal_fields; + use crate::sst::parquet::format::ReadFormat; + use crate::test_util::sst_util::{ + new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding, + }; + + fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec { + exprs + .iter() + .filter_map(SimpleFilterEvaluator::try_new) + .collect() + } + + fn expected_metadata_with_reused_tag_name( + old_metadata: &RegionMetadata, + ) -> Arc { + let mut builder = RegionMetadataBuilder::new(old_metadata.region_id); + builder + .push_column_metadata(ColumnMetadata { + column_schema: 
ColumnSchema::new( + "tag_0".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 10, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "tag_1".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field_0".to_string(), + ConcreteDataType::uint64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts".to_string(), + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 3, + }) + .primary_key(vec![10, 1]); + + Arc::new(builder.build().unwrap()) + } + + fn new_raw_batch_with_metadata( + metadata: Arc, + primary_keys: &[&[u8]], + field_values: &[u64], + ) -> RecordBatch { + assert_eq!(primary_keys.len(), field_values.len()); + + let arrow_schema = metadata.schema.arrow_schema(); + let field_column = arrow_schema + .field(arrow_schema.index_of("field_0").unwrap()) + .clone(); + let time_index_column = arrow_schema + .field(arrow_schema.index_of("ts").unwrap()) + .clone(); + let mut fields = vec![field_column, time_index_column]; + fields.extend( + internal_fields() + .into_iter() + .map(|field| field.as_ref().clone()), + ); + let schema = Arc::new(Schema::new(fields)); + + let mut dict_values = Vec::new(); + let mut keys = Vec::with_capacity(primary_keys.len()); + for pk in primary_keys { + let key = dict_values + .iter() + .position(|existing: &&[u8]| existing == pk) + .unwrap_or_else(|| { + dict_values.push(*pk); + dict_values.len() - 1 + }); + keys.push(key as u32); + } + + let pk_array: ArrayRef = Arc::new(DictionaryArray::::new( + UInt32Array::from(keys), + Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())), + )); + + 
RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(field_values.to_vec())), + Arc::new(TimestampMillisecondArray::from_iter_values( + 0..primary_keys.len() as i64, + )), + pk_array, + Arc::new(UInt64Array::from(vec![1; primary_keys.len()])), + Arc::new(UInt8Array::from(vec![1; primary_keys.len()])), + ], + ) + .unwrap() + } + + fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch { + new_raw_batch_with_metadata(Arc::new(sst_region_metadata()), primary_keys, field_values) + } + + fn field_values(batch: &RecordBatch) -> Vec { + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + } + + #[test] + fn test_retain_usable_primary_key_filters_skips_non_tag_filters() { + let metadata = Arc::new(sst_region_metadata()); + let mut filters = + new_test_filters(&[col("field_0").eq(lit(1_u64)), col("ts").gt(lit(0_i64))]); + + retain_usable_primary_key_filters(&metadata, None, &mut filters); + + assert!(filters.is_empty()); + } + + #[test] + fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() { + let metadata = Arc::new(sst_region_metadata()); + let expected_metadata = expected_metadata_with_reused_tag_name(&metadata); + let mut filters = new_test_filters(&[col("tag_0").eq(lit("b"))]); + + retain_usable_primary_key_filters( + &metadata, + Some(expected_metadata.as_ref()), + &mut filters, + ); + + assert!(filters.is_empty()); + } + + #[test] + fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() { + let metadata = Arc::new(sst_region_metadata_with_encoding( + PrimaryKeyEncoding::Sparse, + )); + let read_format = ReadFormat::new_flat( + metadata.clone(), + metadata.column_metadatas.iter().map(|c| c.column_id), + None, + "test", + true, + ) + .unwrap(); + assert!(read_format.as_flat().is_some()); + + let filter = SimpleFilterEvaluator::try_new(&col("tag_0").eq(lit("b"))).unwrap(); + assert!(is_usable_primary_key_filter(&metadata, None, &filter)); + } + + #[test] + 
fn test_prefilter_primary_key_drops_single_dictionary_batch() { + let metadata = Arc::new(sst_region_metadata()); + let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))])); + let mut primary_key_filter = + build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters); + let pk_a = new_primary_key(&["a", "x"]); + let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); + + let filtered = + prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap(); + + assert!(filtered.is_none()); + } + + #[test] + fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() { + let metadata = Arc::new(sst_region_metadata()); + let filters = Arc::new(new_test_filters(&[col("tag_0") + .eq(lit("a")) + .or(col("tag_0").eq(lit("c")))])); + let mut primary_key_filter = + build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters); + let pk_a = new_primary_key(&["a", "x"]); + let pk_b = new_primary_key(&["b", "x"]); + let pk_c = new_primary_key(&["c", "x"]); + let pk_d = new_primary_key(&["d", "x"]); + let batch = new_raw_batch( + &[ + pk_a.as_slice(), + pk_a.as_slice(), + pk_b.as_slice(), + pk_b.as_slice(), + pk_c.as_slice(), + pk_c.as_slice(), + pk_d.as_slice(), + pk_d.as_slice(), + ], + &[10, 11, 12, 13, 14, 15, 16, 17], + ); + + let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()) + .unwrap() + .unwrap(); + + assert_eq!(filtered.num_rows(), 4); + assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]); + } + + struct CountingPrimaryKeyFilter { + hits: Arc, + expected: Vec, + } + + impl PrimaryKeyFilter for CountingPrimaryKeyFilter { + fn matches(&mut self, pk: &[u8]) -> bool { + self.hits.fetch_add(1, Ordering::Relaxed); + pk == self.expected.as_slice() + } + } + + #[test] + fn test_cached_primary_key_filter_reuses_previous_result() { + let expected = new_primary_key(&["a", "x"]); + let hits = Arc::new(AtomicUsize::new(0)); + let mut filter = 
CachedPrimaryKeyFilter::new(Box::new(CountingPrimaryKeyFilter { + hits: Arc::clone(&hits), + expected: expected.clone(), + })); + + assert!(filter.matches(expected.as_slice())); + assert!(filter.matches(expected.as_slice())); + assert!(!filter.matches(new_primary_key(&["b", "x"]).as_slice())); + + assert_eq!(hits.load(Ordering::Relaxed), 2); + } + + #[test] + fn test_batch_single_primary_key() { + let pk_a = new_primary_key(&["a", "x"]); + let pk_b = new_primary_key(&["b", "x"]); + + let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); + assert_eq!( + batch_single_primary_key(&batch).unwrap(), + Some(pk_a.as_slice()) + ); + + let batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]); + assert_eq!(batch_single_primary_key(&batch).unwrap(), None); + } +} From 9bd983ea4063191679f82eda1523839746cb6aa4 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Tue, 24 Mar 2026 12:24:15 +0800 Subject: [PATCH 030/195] fix: prevent stale in-flight cache refill after invalidation in CacheContainer (#7825) * fix: prevent stale cache refill after invalidate Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu * feat: introduce `get_latest` Signed-off-by: WenyXu * chore: styling Signed-off-by: WenyXu * fix: enforce construction-time cache init strategy Make cache initialization behavior explicit via InitStrategy selected at construction and document dirty-vs-checked semantics. Keep latest-read call compatibility while partition manager uses strategy-driven get paths. 
Signed-off-by: WenyXu * test: rename get_by_ref freshness test Signed-off-by: WenyXu * feat: use `InitStrategy::VersionChecked` for table route cache Signed-off-by: WenyXu * chore: apply suggestions Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- src/catalog/src/kvbackend/table_cache.rs | 8 +- src/common/meta/Cargo.toml | 5 +- src/common/meta/src/cache/container.rs | 288 +++++++++++++++--- .../meta/src/cache/flow/table_flownode.rs | 24 +- src/common/meta/src/cache/table/schema.rs | 8 +- src/common/meta/src/cache/table/table_info.rs | 8 +- src/common/meta/src/cache/table/table_name.rs | 8 +- .../meta/src/cache/table/table_route.rs | 18 +- .../meta/src/cache/table/table_schema.rs | 2 +- src/common/meta/src/cache/table/view_info.rs | 8 +- src/common/meta/src/error.rs | 16 +- src/partition/src/cache.rs | 8 +- 12 files changed, 313 insertions(+), 88 deletions(-) diff --git a/src/catalog/src/kvbackend/table_cache.rs b/src/catalog/src/kvbackend/table_cache.rs index ea328c3e17..42b3fbc74b 100644 --- a/src/catalog/src/kvbackend/table_cache.rs +++ b/src/catalog/src/kvbackend/table_cache.rs @@ -65,11 +65,13 @@ fn init_factory( fn invalidator<'a>( cache: &'a Cache, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, MetaResult<()>> { Box::pin(async move { - if let CacheIdent::TableName(table_name) = ident { - cache.invalidate(table_name).await + for ident in idents { + if let CacheIdent::TableName(table_name) = ident { + cache.invalidate(table_name).await + } } Ok(()) }) diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index ec000c710d..f5ca9d2c09 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true testing = [] pg_kvbackend = [ "dep:tokio-postgres", - "dep:backon", "dep:deadpool-postgres", "dep:deadpool", "dep:tokio-postgres-rustls", @@ -16,7 +15,7 @@ 
pg_kvbackend = [ "dep:rustls-native-certs", "dep:rustls", ] -mysql_kvbackend = ["dep:sqlx", "dep:backon"] +mysql_kvbackend = ["dep:sqlx"] enterprise = ["prost-types"] [lints] @@ -28,7 +27,7 @@ api.workspace = true async-recursion = "1.0" async-stream.workspace = true async-trait.workspace = true -backon = { workspace = true, optional = true } +backon.workspace = true base64.workspace = true bytes.workspace = true chrono.workspace = true diff --git a/src/common/meta/src/cache/container.rs b/src/common/meta/src/cache/container.rs index 0510476d15..e3a3e13a76 100644 --- a/src/common/meta/src/cache/container.rs +++ b/src/common/meta/src/cache/container.rs @@ -15,10 +15,14 @@ use std::borrow::Borrow; use std::hash::Hash; use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::Duration; -use futures::future::{BoxFuture, join_all}; +use backon::{BackoffBuilder, ExponentialBuilder}; +use futures::future::BoxFuture; use moka::future::Cache; use snafu::{OptionExt, ResultExt}; +use tokio::time::sleep; use crate::cache_invalidator::{CacheInvalidator, Context}; use crate::error::{self, Error, Result}; @@ -29,12 +33,29 @@ use crate::metrics; pub type TokenFilter = Box bool + Send + Sync>; /// Invalidates cached values by [CacheToken]s. -pub type Invalidator = - Box Fn(&'a Cache, &'a CacheToken) -> BoxFuture<'a, Result<()>> + Send + Sync>; +pub type Invalidator = Box< + dyn for<'a> Fn(&'a Cache, &'a [&CacheToken]) -> BoxFuture<'a, Result<()>> + Send + Sync, +>; /// Initializes value (i.e., fetches from remote). pub type Initializer = Arc BoxFuture<'_, Result>> + Send + Sync>; +#[derive(Debug, Clone, Copy)] +/// Initialization strategy for cache-miss loading. +/// +/// This strategy is selected when building [CacheContainer] and remains immutable +/// for the lifetime of the container instance. +pub enum InitStrategy { + /// Fast path: load once without version conflict retry. 
+ /// + /// Under concurrent invalidation, callers may observe stale/dirty value. + Unchecked, + /// Strict path: retry load when version changes during initialization. + /// + /// This avoids returning dirty value under invalidate/load races. + VersionChecked, +} + /// [CacheContainer] provides ability to: /// - Cache value loaded by [Initializer]. /// - Invalidate caches by [Invalidator]. @@ -44,6 +65,16 @@ pub struct CacheContainer { invalidator: Invalidator, initializer: Initializer, token_filter: fn(&CacheToken) -> bool, + version: Arc, + init_strategy: InitStrategy, +} + +fn latest_get_backoff() -> impl Iterator { + ExponentialBuilder::default() + .with_min_delay(Duration::from_millis(10)) + .with_max_delay(Duration::from_millis(100)) + .with_max_times(3) + .build() } impl CacheContainer @@ -52,13 +83,37 @@ where V: Send + Sync, CacheToken: Send + Sync, { - /// Constructs an [CacheContainer]. + /// Constructs an [CacheContainer] with [InitStrategy::Unchecked]. + /// + /// This keeps the historical behavior and can return stale/dirty value under + /// concurrent invalidation. pub fn new( name: String, cache: Cache, invalidator: Invalidator, initializer: Initializer, token_filter: fn(&CacheToken) -> bool, + ) -> Self { + Self::with_strategy( + name, + cache, + invalidator, + initializer, + token_filter, + InitStrategy::Unchecked, + ) + } + + /// Constructs an [CacheContainer] with explicit [InitStrategy]. + /// + /// The strategy is fixed at construction time and cannot be changed later. 
+ pub fn with_strategy( + name: String, + cache: Cache, + invalidator: Invalidator, + initializer: Initializer, + token_filter: fn(&CacheToken) -> bool, + init_strategy: InitStrategy, ) -> Self { Self { name, @@ -66,6 +121,8 @@ where invalidator, initializer, token_filter, + version: Arc::new(AtomicUsize::new(0)), + init_strategy, } } @@ -75,6 +132,67 @@ where } } +impl CacheContainer { + fn inc_version(&self) { + self.version.fetch_add(1, Ordering::Relaxed); + } +} + +async fn init<'a, K, V>(init: Initializer, key: K, cache_name: &'a str) -> Result +where + K: Send + Sync + 'a, + V: Send + 'a, +{ + metrics::CACHE_CONTAINER_CACHE_MISS + .with_label_values(&[cache_name]) + .inc(); + let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE + .with_label_values(&[cache_name]) + .start_timer(); + init(&key) + .await + .transpose() + .context(error::ValueNotExistSnafu)? +} + +async fn init_with_retry<'a, K, V>( + init: Initializer, + key: K, + mut backoff: impl Iterator + 'a, + version: Arc, + cache_name: &'a str, +) -> Result +where + K: Send + Sync + 'a, + V: Send + 'a, +{ + let mut attempts = 1usize; + loop { + let pre_version = version.load(Ordering::Relaxed); + metrics::CACHE_CONTAINER_CACHE_MISS + .with_label_values(&[cache_name]) + .inc(); + let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE + .with_label_values(&[cache_name]) + .start_timer(); + let value = init(&key) + .await + .transpose() + .context(error::ValueNotExistSnafu)??; + + if pre_version == version.load(Ordering::Relaxed) { + return Ok(value); + } + + if let Some(duration) = backoff.next() { + sleep(duration).await; + attempts += 1; + } else { + return error::GetLatestCacheRetryExceededSnafu { attempts }.fail(); + } + } +} + #[async_trait::async_trait] impl CacheInvalidator for CacheContainer where @@ -82,14 +200,15 @@ where V: Send + Sync, { async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> { - let tasks = caches + let idents = caches .iter() .filter(|token| 
(self.token_filter)(token)) - .map(|token| (self.invalidator)(&self.cache, token)); - join_all(tasks) - .await - .into_iter() - .collect::>>()?; + .collect::>(); + if !idents.is_empty() { + self.inc_version(); + (self.invalidator)(&self.cache, &idents).await?; + } + Ok(()) } } @@ -99,27 +218,39 @@ where K: Copy + Hash + Eq + Send + Sync + 'static, V: Clone + Send + Sync + 'static, { - /// Returns a _clone_ of the value corresponding to the key. + /// Returns a value from cache for copyable keys. + /// + /// With [InitStrategy::Unchecked], this method prioritizes latency and may + /// return stale/dirty value. With [InitStrategy::VersionChecked], this method + /// retries initialization on version change and avoids dirty returns. pub async fn get(&self, key: K) -> Result> { metrics::CACHE_CONTAINER_CACHE_GET .with_label_values(&[&self.name]) .inc(); - let moved_init = self.initializer.clone(); - let moved_key = key; - let init = async move { - metrics::CACHE_CONTAINER_CACHE_MISS - .with_label_values(&[&self.name]) - .inc(); - let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE - .with_label_values(&[&self.name]) - .start_timer(); - moved_init(&moved_key) - .await - .transpose() - .context(error::ValueNotExistSnafu)? + + let result = match self.init_strategy { + InitStrategy::Unchecked => { + self.cache + .try_get_with(key, init(self.initializer.clone(), key, &self.name)) + .await + } + InitStrategy::VersionChecked => { + self.cache + .try_get_with( + key, + init_with_retry( + self.initializer.clone(), + key, + latest_get_backoff(), + self.version.clone(), + &self.name, + ), + ) + .await + } }; - match self.cache.try_get_with(key, init).await { + match result { Ok(value) => Ok(Some(value)), Err(err) => match err.as_ref() { Error::ValueNotExist { .. } => Ok(None), @@ -136,14 +267,15 @@ where { /// Invalidates cache by [CacheToken]. 
pub async fn invalidate(&self, caches: &[CacheToken]) -> Result<()> { - let tasks = caches + let idents = caches .iter() .filter(|token| (self.token_filter)(token)) - .map(|token| (self.invalidator)(&self.cache, token)); - join_all(tasks) - .await - .into_iter() - .collect::>>()?; + .collect::>(); + if !idents.is_empty() { + self.inc_version(); + (self.invalidator)(&self.cache, &idents).await?; + } + Ok(()) } @@ -156,7 +288,11 @@ where self.cache.contains_key(key) } - /// Returns a _clone_ of the value corresponding to the key. + /// Returns a value from cache by key reference. + /// + /// With [InitStrategy::Unchecked], this method prioritizes latency and may + /// return stale/dirty value. With [InitStrategy::VersionChecked], this method + /// retries initialization on version change and avoids dirty returns. pub async fn get_by_ref(&self, key: &Q) -> Result> where K: Borrow, @@ -165,24 +301,32 @@ where metrics::CACHE_CONTAINER_CACHE_GET .with_label_values(&[&self.name]) .inc(); - let moved_init = self.initializer.clone(); - let moved_key = key.to_owned(); - - let init = async move { - metrics::CACHE_CONTAINER_CACHE_MISS - .with_label_values(&[&self.name]) - .inc(); - let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE - .with_label_values(&[&self.name]) - .start_timer(); - - moved_init(&moved_key) - .await - .transpose() - .context(error::ValueNotExistSnafu)? + let result = match self.init_strategy { + InitStrategy::Unchecked => { + self.cache + .try_get_with_by_ref( + key, + init(self.initializer.clone(), key.to_owned(), &self.name), + ) + .await + } + InitStrategy::VersionChecked => { + self.cache + .try_get_with_by_ref( + key, + init_with_retry( + self.initializer.clone(), + key.to_owned(), + latest_get_backoff(), + self.version.clone(), + &self.name, + ), + ) + .await + } }; - match self.cache.try_get_with_by_ref(key, init).await { + match result { Ok(value) => Ok(Some(value)), Err(err) => match err.as_ref() { Error::ValueNotExist { .. 
} => Ok(None), @@ -296,9 +440,11 @@ mod tests { moved_counter.fetch_add(1, Ordering::Relaxed); Box::pin(async { Ok(Some("hi".to_string())) }) }); - let invalidator: Invalidator = Box::new(|cache, key| { + let invalidator: Invalidator = Box::new(|cache, keys| { Box::pin(async move { - cache.invalidate(key).await; + for key in keys { + cache.invalidate(*key).await; + } Ok(()) }) }); @@ -323,4 +469,46 @@ mod tests { assert_eq!(value, "hi"); assert_eq!(counter.load(Ordering::Relaxed), 2); } + + #[tokio::test(flavor = "multi_thread")] + async fn test_get_by_ref_returns_fresh_value_after_invalidate() { + let cache: Cache = CacheBuilder::new(128).build(); + let counter = Arc::new(AtomicI32::new(0)); + let moved_counter = counter.clone(); + let init: Initializer = Arc::new(move |_| { + let counter = moved_counter.clone(); + Box::pin(async move { + let n = counter.fetch_add(1, Ordering::Relaxed) + 1; + sleep(Duration::from_millis(100)).await; + Ok(Some(format!("v{n}"))) + }) + }); + let invalidator: Invalidator = Box::new(|cache, keys| { + Box::pin(async move { + for key in keys { + cache.invalidate(*key).await; + } + Ok(()) + }) + }); + + let adv_cache = Arc::new(CacheContainer::with_strategy( + "test".to_string(), + cache, + invalidator, + init, + always_true_filter, + InitStrategy::VersionChecked, + )); + + let moved_cache = adv_cache.clone(); + let get_task = tokio::spawn(async move { moved_cache.get_by_ref("foo").await }); + + sleep(Duration::from_millis(50)).await; + adv_cache.invalidate(&["foo".to_string()]).await.unwrap(); + + let value = get_task.await.unwrap().unwrap().unwrap(); + assert_eq!(value, "v2"); + assert_eq!(counter.load(Ordering::Relaxed), 2); + } } diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs index a7777f3361..ebe3664202 100644 --- a/src/common/meta/src/cache/flow/table_flownode.rs +++ b/src/common/meta/src/cache/flow/table_flownode.rs @@ -170,20 +170,22 @@ async fn handle_drop_flow( 
fn invalidator<'a>( cache: &'a Cache, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - match ident { - CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await, - CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await, - CacheIdent::FlowNodeAddressChange(node_id) => { - info!( - "Invalidate flow node cache for node_id in table_flownode: {}", - node_id - ); - cache.invalidate_all(); + for ident in idents { + match ident { + CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await, + CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await, + CacheIdent::FlowNodeAddressChange(node_id) => { + info!( + "Invalidate flow node cache for node_id in table_flownode: {}", + node_id + ); + cache.invalidate_all(); + } + _ => {} } - _ => {} } Ok(()) }) diff --git a/src/common/meta/src/cache/table/schema.rs b/src/common/meta/src/cache/table/schema.rs index bcf81d4fe6..bd9e8e6dc1 100644 --- a/src/common/meta/src/cache/table/schema.rs +++ b/src/common/meta/src/cache/table/schema.rs @@ -58,11 +58,13 @@ fn init_factory(schema_manager: SchemaManager) -> Initializer( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, crate::error::Result<()>> { Box::pin(async move { - if let CacheIdent::SchemaName(schema_name) = ident { - cache.invalidate(schema_name).await + for ident in idents { + if let CacheIdent::SchemaName(schema_name) = ident { + cache.invalidate(schema_name).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_info.rs b/src/common/meta/src/cache/table/table_info.rs index b853d908e8..97af5bcdb7 100644 --- a/src/common/meta/src/cache/table/table_info.rs +++ b/src/common/meta/src/cache/table/table_info.rs @@ -61,11 +61,13 @@ fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], 
) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - if let CacheIdent::TableId(table_id) = ident { - cache.invalidate(table_id).await + for ident in idents { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_name.rs b/src/common/meta/src/cache/table/table_name.rs index 540da5e5f4..927a5b3480 100644 --- a/src/common/meta/src/cache/table/table_name.rs +++ b/src/common/meta/src/cache/table/table_name.rs @@ -71,11 +71,13 @@ fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer( cache: &'a Cache, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - if let CacheIdent::TableName(table_name) = ident { - cache.invalidate(table_name).await + for ident in idents { + if let CacheIdent::TableName(table_name) = ident { + cache.invalidate(table_name).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_route.rs b/src/common/meta/src/cache/table/table_route.rs index 47abdaa728..be820b0c52 100644 --- a/src/common/meta/src/cache/table/table_route.rs +++ b/src/common/meta/src/cache/table/table_route.rs @@ -19,6 +19,7 @@ use moka::future::Cache; use snafu::OptionExt; use store_api::storage::TableId; +use crate::cache::container::InitStrategy; use crate::cache::{CacheContainer, Initializer}; use crate::error; use crate::error::Result; @@ -65,7 +66,14 @@ pub fn new_table_route_cache( let table_info_manager = Arc::new(TableRouteManager::new(kv_backend)); let init = init_factory(table_info_manager); - CacheContainer::new(name, cache, Box::new(invalidator), init, filter) + CacheContainer::with_strategy( + name, + cache, + Box::new(invalidator), + init, + filter, + InitStrategy::VersionChecked, + ) } fn init_factory( @@ -92,11 +100,13 @@ fn init_factory( fn invalidator<'a>( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async 
move { - if let CacheIdent::TableId(table_id) = ident { - cache.invalidate(table_id).await + for ident in idents { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_schema.rs b/src/common/meta/src/cache/table/table_schema.rs index 99ece65683..33b1773f45 100644 --- a/src/common/meta/src/cache/table/table_schema.rs +++ b/src/common/meta/src/cache/table/table_schema.rs @@ -65,7 +65,7 @@ fn init_factory(table_info_manager: TableInfoManager) -> Initializer( _cache: &'a Cache>, - _ident: &'a CacheIdent, + _idents: &'a [&CacheIdent], ) -> BoxFuture<'a, error::Result<()>> { Box::pin(std::future::ready(Ok(()))) } diff --git a/src/common/meta/src/cache/table/view_info.rs b/src/common/meta/src/cache/table/view_info.rs index 6a85493d42..d0e1058a7e 100644 --- a/src/common/meta/src/cache/table/view_info.rs +++ b/src/common/meta/src/cache/table/view_info.rs @@ -60,11 +60,13 @@ fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - if let CacheIdent::TableId(view_id) = ident { - cache.invalidate(view_id).await + for ident in idents { + if let CacheIdent::TableId(view_id) = ident { + cache.invalidate(view_id).await + } } Ok(()) }) diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs index c6613af828..b9fcbd6188 100644 --- a/src/common/meta/src/error.rs +++ b/src/common/meta/src/error.rs @@ -714,6 +714,16 @@ pub enum Error { #[snafu(display("Failed to get cache"))] GetCache { source: Arc }, + #[snafu(display( + "Failed to get latest cache value after {} attempts due to concurrent invalidation", + attempts + ))] + GetLatestCacheRetryExceeded { + attempts: usize, + #[snafu(implicit)] + location: Location, + }, + #[cfg(feature = "pg_kvbackend")] #[snafu(display("Failed to execute via Postgres, sql: {}", sql))] 
PostgresExecution { @@ -1063,6 +1073,7 @@ impl ErrorExt for Error { | ConnectEtcd { .. } | MoveValues { .. } | GetCache { .. } + | GetLatestCacheRetryExceeded { .. } | SerializeToJson { .. } | DeserializeFromJson { .. } => StatusCode::Internal, @@ -1243,7 +1254,10 @@ impl Error { /// Determine whether it is a retry later type through [StatusCode] pub fn is_retry_later(&self) -> bool { - matches!(self, Error::RetryLater { .. }) + matches!( + self, + Error::RetryLater { .. } | Error::GetLatestCacheRetryExceeded { .. } + ) } /// Determine whether it needs to clean poisons. diff --git a/src/partition/src/cache.rs b/src/partition/src/cache.rs index a886e1e08d..4066b69aa3 100644 --- a/src/partition/src/cache.rs +++ b/src/partition/src/cache.rs @@ -121,10 +121,12 @@ pub fn new_partition_info_cache( CacheContainer::new( name, cache, - Box::new(|cache, ident| { + Box::new(|cache, idents| { Box::pin(async move { - if let CacheIdent::TableId(table_id) = ident { - cache.invalidate(table_id).await + for ident in idents { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } } Ok(()) }) From 30e895abbef7ec63be7afd8dfdecf448ce88453e Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:24:52 +0800 Subject: [PATCH 031/195] fix: prom cast to f64 (#7840) * fix: cast to f64 Signed-off-by: discord9 * test: div case Signed-off-by: discord9 * test: int test Signed-off-by: discord9 * chore: sqlness update Signed-off-by: discord9 * chore: test Signed-off-by: discord9 * chore: update test Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/query/src/promql/planner.rs | 55 ++-- tests-integration/src/tests/promql_test.rs | 238 +++++++++++++++++- .../explain/step_aggr_advance.result | 90 +++---- .../promql/anon_promql_ratio_repro.result | 106 ++++++++ .../common/promql/anon_promql_ratio_repro.sql | 63 +++++ .../standalone/common/tql/tql-cte.result | 4 +- 6 files changed, 494 insertions(+), 
62 deletions(-) create mode 100644 tests/cases/standalone/common/promql/anon_promql_ratio_repro.result create mode 100644 tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index 427644e26a..b6f4f2d28f 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -3323,28 +3323,55 @@ impl PromPlanner { fn prom_token_to_binary_expr_builder( token: TokenType, ) -> Result Result>> { + let cast_float = |expr| { + if matches!( + &expr, + DfExpr::Cast(Cast { + data_type: ArrowDataType::Float64, + .. + }) + ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _)) + { + expr + } else { + DfExpr::Cast(Cast { + expr: Box::new(expr), + data_type: ArrowDataType::Float64, + }) + } + }; match token.id() { - token::T_ADD => Ok(Box::new(|lhs, rhs| Ok(lhs + rhs))), - token::T_SUB => Ok(Box::new(|lhs, rhs| Ok(lhs - rhs))), - token::T_MUL => Ok(Box::new(|lhs, rhs| Ok(lhs * rhs))), - token::T_DIV => Ok(Box::new(|lhs, rhs| Ok(lhs / rhs))), - token::T_MOD => Ok(Box::new(|lhs: DfExpr, rhs| Ok(lhs % rhs))), + token::T_ADD => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) + cast_float(rhs)) + })), + token::T_SUB => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) - cast_float(rhs)) + })), + token::T_MUL => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) * cast_float(rhs)) + })), + token::T_DIV => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) / cast_float(rhs)) + })), + token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| { + Ok(cast_float(lhs) % cast_float(rhs)) + })), token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))), token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))), token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))), token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))), token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))), token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))), - token::T_POW => 
Ok(Box::new(|lhs, rhs| { + token::T_POW => Ok(Box::new(move |lhs, rhs| { Ok(DfExpr::ScalarFunction(ScalarFunction { func: datafusion_functions::math::power(), - args: vec![lhs, rhs], + args: vec![cast_float(lhs), cast_float(rhs)], })) })), - token::T_ATAN2 => Ok(Box::new(|lhs, rhs| { + token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| { Ok(DfExpr::ScalarFunction(ScalarFunction { func: datafusion_functions::math::atan2(), - args: vec![lhs, rhs], + args: vec![cast_float(lhs), cast_float(rhs)], })) })), _ => UnexpectedTokenSnafu { token }.fail(), @@ -5169,7 +5196,7 @@ mod test { .unwrap(); let expected = String::from( - "Projection: rhs.tag_0, rhs.timestamp, lhs.field_0 + rhs.field_0 AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\ + "Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\ \n Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ @@ -5224,7 +5251,7 @@ mod test { async fn binary_op_literal_column() { let query = r#"1 + some_metric{tag_0="bar"}"#; let expected = String::from( - "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + some_metric.field_0 AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\ + "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\ \n PromInstantManipulate: 
range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ @@ -5262,7 +5289,7 @@ mod test { async fn bool_with_additional_arithmetic() { let query = "some_metric + (1 == bool 2)"; let expected = String::from( - "Projection: some_metric.tag_0, some_metric.timestamp, some_metric.field_0 + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\ + "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ @@ -5372,7 +5399,7 @@ mod test { PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state()) .await .unwrap(); - let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\ + 
let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\ \n Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\ \n SubqueryAlias: http_server_requests_seconds_sum\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\ @@ -5763,7 +5790,7 @@ mod test { let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric"; let expected = String::from( - "Projection: some_metric.tag_0, some_metric.timestamp, greptime_private.some_alt_metric.field_0 / some_metric.field_0 AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\ + "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\ \n Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n 
PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ diff --git a/tests-integration/src/tests/promql_test.rs b/tests-integration/src/tests/promql_test.rs index 7fbce91ea6..ede4663118 100644 --- a/tests-integration/src/tests/promql_test.rs +++ b/tests-integration/src/tests/promql_test.rs @@ -15,7 +15,9 @@ use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use common_query::Output; +use common_query::{Output, OutputData}; +use common_recordbatch::util::collect_batches; +use datatypes::arrow::array::{Float64Array, Int64Array}; use frontend::instance::Instance; use query::parser::{PromQuery, QueryLanguageParser, QueryStatement}; use rstest::rstest; @@ -151,6 +153,103 @@ async fn create_insert_tql_assert( check_unordered_output_stream(query_output, expected).await; } +async fn execute_all(instance: &Arc, sql: &str, query_ctx: Arc) { + instance + .do_query(sql, query_ctx) + .await + .into_iter() + .for_each(|v| { + let _ = v.unwrap(); + }); +} + +#[allow(clippy::too_many_arguments)] +async fn promql_query_as_batches( + ins: Arc, + promql: &str, + alias: Option, + query_ctx: Arc, + start: SystemTime, + end: SystemTime, + interval: Duration, + lookback: Duration, +) -> common_recordbatch::RecordBatches { + let output = promql_query( + ins, promql, alias, query_ctx, start, end, interval, lookback, + ) + .await + .unwrap(); + match output.data { + OutputData::Stream(stream) => collect_batches(stream).await.unwrap(), + OutputData::RecordBatches(recordbatches) => recordbatches, + _ => unreachable!(), + } +} + +const ANON_PROMQL_RATIO_REPRO_DB: &str = "repro_db"; + +const ANON_PROMQL_RATIO_REPRO_CREATE: &str = r#" +CREATE TABLE phy ( + t TIMESTAMP TIME INDEX, + v DOUBLE +) ENGINE=metric WITH ("physical_metric_table" = ""); + +CREATE TABLE metric_a ( + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + l5 STRING NULL, + t TIMESTAMP 
NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l1, l2, l3, l4, l5) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +CREATE TABLE metric_b ( + l6 STRING NULL, + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l6, l1, l2, l3, l4) +) ENGINE=metric WITH (on_physical_table = 'phy'); +"#; + +const ANON_PROMQL_RATIO_REPRO_INSERT: &str = r#" +INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES + ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120); + +INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES + ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2); +"#; + +const ANON_PROMQL_RATIO_REPRO_NUMERATOR: &str = r#"count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50))"#; + +const ANON_PROMQL_RATIO_REPRO_DENOMINATOR: &str = + r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))"#; + +const ANON_PROMQL_RATIO_REPRO_WHOLE: &str = r#"(count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))) * 100"#; + +const 
ANON_PROMQL_RATIO_REPRO_SCALAR_DIV: &str = + r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m])) / 2"#; + #[apply(both_instances_cases)] async fn sql_insert_tql_query_ceil(instance: Arc) { let instance = instance.frontend(); @@ -709,3 +808,140 @@ async fn cross_schema_query(instance: Arc) { check_unordered_output_stream(query_output, expected).await; } + +#[apply(both_instances_cases)] +async fn anon_promql_ratio_repro(instance: Arc) { + let ins = instance.frontend(); + + execute_all( + &ins, + &format!("CREATE DATABASE {ANON_PROMQL_RATIO_REPRO_DB}"), + QueryContext::arc(), + ) + .await; + + let repro_ctx: Arc = + QueryContext::with_db_name(Some(ANON_PROMQL_RATIO_REPRO_DB)).into(); + execute_all(&ins, ANON_PROMQL_RATIO_REPRO_CREATE, repro_ctx.clone()).await; + execute_all(&ins, ANON_PROMQL_RATIO_REPRO_INSERT, repro_ctx).await; + + let start = UNIX_EPOCH.checked_add(Duration::from_secs(180)).unwrap(); + let end = UNIX_EPOCH.checked_add(Duration::from_secs(360)).unwrap(); + let interval = Duration::from_secs(180); + let lookback = Duration::from_secs(1); + + let numerator = promql_query_as_batches( + ins.clone(), + ANON_PROMQL_RATIO_REPRO_NUMERATOR, + Some("num".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + let denominator = promql_query_as_batches( + ins.clone(), + ANON_PROMQL_RATIO_REPRO_DENOMINATOR, + Some("den".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + let whole = promql_query_as_batches( + ins.clone(), + ANON_PROMQL_RATIO_REPRO_WHOLE, + Some("pct".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + let scalar_div = promql_query_as_batches( + ins, + ANON_PROMQL_RATIO_REPRO_SCALAR_DIV, + Some("half_den".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + + let numerator = numerator.iter().collect::>(); + let denominator = denominator.iter().collect::>(); 
+ let whole = whole.iter().collect::>(); + let scalar_div = scalar_div.iter().collect::>(); + + let numerator_values = numerator[0] + .column_by_name("num") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let denominator_values = denominator[0] + .column_by_name("den") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let percentage_values = whole[0] + .column_by_name("pct") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let scalar_div_values = scalar_div[0] + .column_by_name("half_den") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(numerator_values.len(), 1, "{}", numerator[0].pretty_print()); + assert_eq!( + denominator_values.len(), + 1, + "{}", + denominator[0].pretty_print() + ); + assert_eq!(percentage_values.len(), 1, "{}", whole[0].pretty_print()); + assert_eq!( + scalar_div_values.len(), + 1, + "{}", + scalar_div[0].pretty_print() + ); + + assert_eq!( + numerator_values.value(0), + 1, + "{}", + numerator[0].pretty_print() + ); + assert_eq!( + denominator_values.value(0), + 3, + "{}", + denominator[0].pretty_print() + ); + assert!( + (scalar_div_values.value(0) - 1.5).abs() < 1e-9, + "{}", + scalar_div[0].pretty_print() + ); + + let expected = 100.0 / 3.0; + assert!( + (percentage_values.value(0) - expected).abs() < 1e-9, + "{}", + whole[0].pretty_print() + ); +} diff --git a/tests/cases/distributed/explain/step_aggr_advance.result b/tests/cases/distributed/explain/step_aggr_advance.result index 4bd83b7afa..5938fa202d 100644 --- a/tests/cases/distributed/explain/step_aggr_advance.result +++ b/tests/cases/distributed/explain/step_aggr_advance.result @@ -442,54 +442,54 @@ Affected Rows: 0 -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m])); 
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) | -| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, 
aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp | -| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | SubqueryAlias: aggr_optimize_not | -| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | -| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| | SubqueryAlias: 
aggr_optimize_not_count | -| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | -| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | -| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c | -| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | -| | TableScan: aggr_optimize_not_count | -| | ]] | -| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / 
sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, CAST(aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) / 
CAST(aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) | +| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp | +| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | SubqueryAlias: aggr_optimize_not | +| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | +| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | 
PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| | SubqueryAlias: aggr_optimize_not_count | +| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | +| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | +| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c | +| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | +| | 
MergeScan [is_placeholder=false, remote_input=[ | +| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | +| | TableScan: aggr_optimize_not_count | +| | ]] | +| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | | | REDACTED -| | CoalescePartitionsExec | -| | AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | -| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL | -| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] | -| | PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] | -| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] | -| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as 
c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | +| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL | +| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] | +| | PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] | +| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] | +| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] | | | MergeScanExec: REDACTED -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | -| | CooperativeExec | +| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | +| | CooperativeExec | | | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| | | 
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result new file mode 100644 index 0000000000..ab3c4db715 --- /dev/null +++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result @@ -0,0 +1,106 @@ +CREATE TABLE phy ( + t TIMESTAMP TIME INDEX, + v DOUBLE +) ENGINE=metric WITH ("physical_metric_table" = ""); + +Affected Rows: 0 + +CREATE TABLE metric_a ( + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + l5 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l1, l2, l3, l4, l5) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +Affected Rows: 0 + +CREATE TABLE metric_b ( + l6 STRING NULL, + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l6, l1, l2, l3, l4) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +Affected Rows: 0 + +INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES + ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 
'v5b', 180000, 30), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120); + +Affected Rows: 9 + +INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES + ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2); + +Affected Rows: 6 + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)); + ++---------------------+-------------------------------------------------------------------+ +| t | count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) | ++---------------------+-------------------------------------------------------------------+ +| 1970-01-01T00:03:00 | 1 | ++---------------------+-------------------------------------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])); + ++---------------------+---------------------------------------------+ +| t | count(prom_rate(t_range,v,t,Int64(180000))) | ++---------------------+---------------------------------------------+ +| 1970-01-01T00:03:00 | 3 | ++---------------------+---------------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2; + ++---------------------+----------------------------------------------------------+ +| t | count(prom_rate(t_range,v,t,Int64(180000))) / Float64(2) | ++---------------------+----------------------------------------------------------+ +| 1970-01-01T00:03:00 | 1.5 | 
++---------------------+----------------------------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100; + ++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| t | metric_b.count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) / metric_a.count(prom_rate(t_range,v,t,Int64(180000))) * Float64(100) | ++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:03:00 | 33.33333333333333 | ++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ + +DROP TABLE metric_a; + +Affected Rows: 0 + +DROP TABLE metric_b; + +Affected Rows: 0 + +DROP TABLE phy; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql new file mode 100644 index 0000000000..946d4f93a1 --- /dev/null +++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql @@ -0,0 +1,63 @@ +CREATE TABLE phy ( + t TIMESTAMP TIME INDEX, + v DOUBLE +) ENGINE=metric WITH ("physical_metric_table" = ""); + +CREATE TABLE metric_a ( + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + l5 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l1, l2, l3, l4, l5) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +CREATE TABLE metric_b ( + l6 STRING NULL, + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + t 
TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l6, l1, l2, l3, l4) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES + ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120); + +INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES + ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2; + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100; + +DROP TABLE metric_a; +DROP TABLE metric_b; +DROP TABLE phy; diff --git a/tests/cases/standalone/common/tql/tql-cte.result b/tests/cases/standalone/common/tql/tql-cte.result index a8c0c45d5d..e8278e80bd 100644 --- a/tests/cases/standalone/common/tql/tql-cte.result +++ b/tests/cases/standalone/common/tql/tql-cte.result 
@@ -427,8 +427,8 @@ SELECT min(val) as min_computed, max(val) as max_computed FROM computed; | | Aggregate: groupBy=[[]], aggr=[[min(computed.val), max(computed.val)]] | | | SubqueryAlias: computed | | | Projection: metric.ts AS ts, val * Float64(2) + Float64(1) AS val | -| | Projection: metric.ts, val * Float64(2) + Float64(1) AS val * Float64(2) + Float64(1) | -| | Projection: metric.ts, metric.val * Float64(2) AS val * Float64(2) | +| | Projection: metric.ts, CAST(val * Float64(2) AS Float64) + Float64(1) AS val * Float64(2) + Float64(1) | +| | Projection: metric.ts, CAST(metric.val AS Float64) * Float64(2) AS val * Float64(2) | | | PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts] | | | PromSeriesDivide: tags=[] | | | Filter: metric.ts >= TimestampMillisecond(-299999, None) AND metric.ts <= TimestampMillisecond(40000, None) | From 187b8d3798b69d4881d2bb667e4facd16729b115 Mon Sep 17 00:00:00 2001 From: liyang Date: Tue, 24 Mar 2026 17:19:18 +0800 Subject: [PATCH 032/195] ci: remove redundant directory level when uploading artifacts to S3 (#7852) Signed-off-by: liyang --- .github/scripts/upload-artifacts-to-s3.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/upload-artifacts-to-s3.sh b/.github/scripts/upload-artifacts-to-s3.sh index 310575c069..1ddf32044b 100755 --- a/.github/scripts/upload-artifacts-to-s3.sh +++ b/.github/scripts/upload-artifacts-to-s3.sh @@ -33,7 +33,7 @@ function upload_artifacts() { # └── greptime-darwin-amd64-v0.2.0.tar.gz find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do filename=$(basename "$file") - TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION/$filename" + TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION" curl -X PUT \ -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ @@ -49,7 +49,7 @@ function update_version_info() { if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "Updating latest-version.txt" echo 
"$VERSION" > latest-version.txt - TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-version.txt" + TARGET_URL="$PROXY_URL/$RELEASE_DIRS" curl -X PUT \ -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ @@ -62,7 +62,7 @@ function update_version_info() { echo "Updating latest-nightly-version.txt" echo "$VERSION" > latest-nightly-version.txt - TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-nightly-version.txt" + TARGET_URL="$PROXY_URL/$RELEASE_DIRS" curl -X PUT \ -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ -F "file=@latest-nightly-version.txt" \ From 0e22d6a72b7ee66b5e3c284a47da97ec6af2837e Mon Sep 17 00:00:00 2001 From: Yingwen Date: Tue, 24 Mar 2026 18:01:13 +0800 Subject: [PATCH 033/195] feat: implement partition range cache stream (#7842) * feat: add cache stream helpers, key construction, config wiring, and metrics for partition range cache Add range result cache size config field and wire it through cache builder chains. Implement cache key building (build_range_cache_key), stream replay/store helpers (cached_flat_range_stream, cache_flat_range_stream), dictionary compaction (compact_pk_dictionary), and partition range row group collection. Add range cache metrics (size, hit, miss) to ScanMetricsSet and PartitionMetrics. Move fingerprint tests from scan_region to range_cache module. These functions are not yet wired into scan execution. 
Signed-off-by: evenyag * feat: add benchmark for cache stream Signed-off-by: evenyag * refactor: move bench_util to test_util Signed-off-by: evenyag * feat: share dict Signed-off-by: evenyag * test: test ptr_eq Signed-off-by: evenyag * chore: fmt code Signed-off-by: evenyag * refactor: simplify value array handling Signed-off-by: evenyag * chore: add todo for estimate size Signed-off-by: evenyag * feat: simplify size calculation Signed-off-by: evenyag * chore: remove one test Signed-off-by: evenyag * test: update config test Signed-off-by: evenyag * chore: address review comment Only ignore exprs that can extract time ranges Signed-off-by: evenyag * test: fix tests Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/mito2/Cargo.toml | 5 + src/mito2/benches/bench_cache_stream.rs | 126 +++++ src/mito2/benches/memtable_bench.rs | 245 +------- src/mito2/src/cache.rs | 13 +- src/mito2/src/config.rs | 4 + src/mito2/src/memtable/bulk/part.rs | 11 +- src/mito2/src/memtable/bulk/part_reader.rs | 2 +- src/mito2/src/read.rs | 3 + src/mito2/src/read/range_cache.rs | 628 ++++++++++++++++++++- src/mito2/src/read/scan_region.rs | 39 +- src/mito2/src/read/scan_util.rs | 40 ++ src/mito2/src/test_util.rs | 1 + src/mito2/src/test_util/bench_util.rs | 259 +++++++++ src/mito2/src/test_util/memtable_util.rs | 2 +- src/mito2/src/worker.rs | 2 + src/table/src/predicate.rs | 2 +- tests-integration/tests/http.rs | 1 + 17 files changed, 1113 insertions(+), 270 deletions(-) create mode 100644 src/mito2/benches/bench_cache_stream.rs create mode 100644 src/mito2/src/test_util/bench_util.rs diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 1d7cf7b6d7..a78bf079b0 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -108,6 +108,11 @@ name = "memtable_bench" harness = false required-features = ["test"] +[[bench]] +name = "bench_cache_stream" +harness = false +required-features = ["test"] + [[bench]] name = "bench_filter_time_partition" harness = false diff --git 
a/src/mito2/benches/bench_cache_stream.rs b/src/mito2/benches/bench_cache_stream.rs new file mode 100644 index 0000000000..f2314f2ccb --- /dev/null +++ b/src/mito2/benches/bench_cache_stream.rs @@ -0,0 +1,126 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Benchmarks for `cache_flat_range_stream` overhead. +//! +//! Compares consuming batches from a plain stream vs through the caching wrapper +//! that clones batches for the range cache. +//! +//! Run with: +//! ```sh +//! cargo bench -p mito2 --features test --bench bench_cache_stream +//! 
``` + +use std::collections::VecDeque; +use std::sync::Arc; + +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::TryStreamExt; +use mito_codec::row_converter::DensePrimaryKeyCodec; +use mito2::memtable::bulk::context::BulkIterContext; +use mito2::memtable::bulk::part::{BulkPartConverter, BulkPartEncoder}; +use mito2::memtable::bulk::part_reader::EncodedBulkPartIter; +use mito2::read::range_cache::bench_cache_flat_range_stream; +use mito2::sst::parquet::DEFAULT_ROW_GROUP_SIZE; +use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; +use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata}; + +fn cache_flat_range_stream_bench(c: &mut Criterion) { + let metadata = Arc::new(cpu_metadata()); + let region_id = metadata.region_id; + let start_sec = 1710043200; + // 2000 hosts × 51 steps = 102,000 rows ≈ DEFAULT_ROW_GROUP_SIZE + let num_hosts = 2000; + let end_sec = start_sec + 510; + let generator = CpuDataGenerator::new(metadata.clone(), num_hosts, start_sec, end_sec); + + // Build a BulkPart from all the generated data + let schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); + let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata)); + + let mut converter = BulkPartConverter::new( + &metadata, + schema, + DEFAULT_ROW_GROUP_SIZE, + codec, + true, // store_pk_columns + ); + for kvs in generator.iter() { + converter.append_key_values(&kvs).unwrap(); + } + let bulk_part = converter.convert().unwrap(); + + // Encode to parquet + let encoder = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE).unwrap(); + let encoded_part = encoder.encode_part(&bulk_part).unwrap().unwrap(); + + // Decode all record batches + let num_row_groups = encoded_part.metadata().parquet_metadata.num_row_groups(); + let context = Arc::new( + BulkIterContext::new( + metadata.clone(), + None, // No projection + None, // No predicate + false, + ) + .unwrap(), + ); + let row_groups: VecDeque = (0..num_row_groups).collect(); + 
+ let rt = tokio::runtime::Runtime::new().unwrap(); + + let mut group = c.benchmark_group("cache_flat_range_stream"); + group.sample_size(10); + + group.bench_function("baseline_iter_stream", |b| { + b.iter(|| { + rt.block_on(async { + let iter = EncodedBulkPartIter::try_new( + &encoded_part, + context.clone(), + row_groups.clone(), + None, + None, + ) + .unwrap(); + let stream: mito2::read::BoxedRecordBatchStream = + Box::pin(futures::stream::iter(iter)); + let mut stream = stream; + while let Some(_batch) = stream.try_next().await.unwrap() {} + }); + }); + }); + + group.bench_function("cache_flat_range_stream", |b| { + b.iter(|| { + rt.block_on(async { + let iter = EncodedBulkPartIter::try_new( + &encoded_part, + context.clone(), + row_groups.clone(), + None, + None, + ) + .unwrap(); + let stream: mito2::read::BoxedRecordBatchStream = + Box::pin(futures::stream::iter(iter)); + let mut stream = bench_cache_flat_range_stream(stream, 64 * 1024 * 1024, region_id); + while let Some(_batch) = stream.try_next().await.unwrap() {} + }); + }); + }); +} + +criterion_group!(benches, cache_flat_range_stream_bench); +criterion_main!(benches); diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs index df991f6f92..8336625e3c 100644 --- a/src/mito2/benches/memtable_bench.rs +++ b/src/mito2/benches/memtable_bench.rs @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. +//! Benchmarks for memtable operations: writes, full scans, filtered scans, +//! bulk part conversion, record batch iteration with filters, and flat merge. +//! +//! Run with: +//! ```sh +//! cargo bench -p mito2 --features test --bench memtable_bench +//! 
``` + use std::sync::Arc; -use api::v1::value::ValueData; -use api::v1::{Row, Rows, SemanticType}; use criterion::{Criterion, criterion_group, criterion_main}; -use datafusion_common::Column; -use datafusion_expr::{Expr, lit}; -use datatypes::data_type::ConcreteDataType; -use datatypes::schema::ColumnSchema; use mito_codec::row_converter::DensePrimaryKeyCodec; use mito2::memtable::bulk::context::BulkIterContext; use mito2::memtable::bulk::part::BulkPartConverter; @@ -28,20 +30,13 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter; use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig}; use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable}; use mito2::memtable::time_series::TimeSeriesMemtable; -use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions}; +use mito2::memtable::{IterBuilder, Memtable, RangesOptions}; use mito2::read::flat_merge::FlatMergeIterator; use mito2::read::scan_region::PredicateGroup; use mito2::region::options::MergeMode; use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; -use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema}; -use rand::Rng; -use rand::rngs::ThreadRng; -use rand::seq::IndexedRandom; -use store_api::metadata::{ - ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, -}; -use store_api::storage::RegionId; -use table::predicate::Predicate; +use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata}; +use mito2::test_util::memtable_util; /// Writes rows. 
fn write_rows(c: &mut Criterion) { @@ -216,224 +211,6 @@ fn filter_1_host(c: &mut Criterion) { }); } -struct Host { - hostname: String, - region: String, - datacenter: String, - rack: String, - os: String, - arch: String, - team: String, - service: String, - service_version: String, - service_environment: String, -} - -impl Host { - fn random_with_id(id: usize) -> Host { - let mut rng = rand::rng(); - let region = format!("ap-southeast-{}", rng.random_range(0..10)); - let datacenter = format!( - "{}{}", - region, - ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap() - ); - Host { - hostname: format!("host_{id}"), - region, - datacenter, - rack: rng.random_range(0..100).to_string(), - os: "Ubuntu16.04LTS".to_string(), - arch: "x86".to_string(), - team: "CHI".to_string(), - service: rng.random_range(0..100).to_string(), - service_version: rng.random_range(0..10).to_string(), - service_environment: "test".to_string(), - } - } - - fn fill_values(&self, values: &mut Vec) { - let tags = [ - api::v1::Value { - value_data: Some(ValueData::StringValue(self.hostname.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.region.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.datacenter.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.rack.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.os.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.arch.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.team.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.service.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.service_version.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.service_environment.clone())), - }, - ]; - for tag in tags { - values.push(tag); - } - } -} - -struct CpuDataGenerator { - 
metadata: RegionMetadataRef, - column_schemas: Vec, - hosts: Vec, - start_sec: i64, - end_sec: i64, -} - -impl CpuDataGenerator { - fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self { - let column_schemas = region_metadata_to_row_schema(&metadata); - Self { - metadata, - column_schemas, - hosts: Self::generate_hosts(num_hosts), - start_sec, - end_sec, - } - } - - fn iter(&self) -> impl Iterator + '_ { - // point per 10s. - (self.start_sec..self.end_sec) - .step_by(10) - .enumerate() - .map(|(seq, ts)| self.build_key_values(seq, ts)) - } - - fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues { - let rows = self - .hosts - .iter() - .map(|host| { - let mut rng = rand::rng(); - let mut values = Vec::with_capacity(21); - values.push(api::v1::Value { - value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)), - }); - host.fill_values(&mut values); - for _ in 0..10 { - values.push(api::v1::Value { - value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))), - }); - } - Row { values } - }) - .collect(); - let mutation = api::v1::Mutation { - op_type: api::v1::OpType::Put as i32, - sequence: seq as u64, - rows: Some(Rows { - schema: self.column_schemas.clone(), - rows, - }), - write_hint: None, - }; - - KeyValues::new(&self.metadata, mutation).unwrap() - } - - fn random_host_filter(&self) -> Predicate { - let host = self.random_hostname(); - let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host)); - Predicate::new(vec![expr]) - } - - fn random_host_filter_exprs(&self) -> Vec { - let host = self.random_hostname(); - vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))] - } - - fn random_hostname(&self) -> String { - let mut rng = rand::rng(); - self.hosts.choose(&mut rng).unwrap().hostname.clone() - } - - fn random_f64(rng: &mut ThreadRng) -> f64 { - let base: u32 = rng.random_range(30..95); - base as f64 - } - - fn generate_hosts(num_hosts: usize) -> Vec { - 
(0..num_hosts).map(Host::random_with_id).collect() - } -} - -/// Creates a metadata for TSBS cpu-like table. -fn cpu_metadata() -> RegionMetadata { - let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); - builder.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - semantic_type: SemanticType::Timestamp, - column_id: 0, - }); - let mut column_id = 1; - let tags = [ - "hostname", - "region", - "datacenter", - "rack", - "os", - "arch", - "team", - "service", - "service_version", - "service_environment", - ]; - for tag in tags { - builder.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true), - semantic_type: SemanticType::Tag, - column_id, - }); - column_id += 1; - } - let fields = [ - "usage_user", - "usage_system", - "usage_idle", - "usage_nice", - "usage_iowait", - "usage_irq", - "usage_softirq", - "usage_steal", - "usage_guest", - "usage_guest_nice", - ]; - for field in fields { - builder.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true), - semantic_type: SemanticType::Field, - column_id, - }); - column_id += 1; - } - builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - builder.build().unwrap() -} - fn bulk_part_converter(c: &mut Criterion) { let metadata = Arc::new(cpu_metadata()); let start_sec = 1710043200; diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index c9a8b99166..35db74eee6 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -350,7 +350,7 @@ impl CacheStrategy { /// Calls [CacheManager::get_range_result()]. /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled]. 
- #[cfg_attr(not(test), allow(dead_code))] + #[allow(dead_code)] pub(crate) fn get_range_result( &self, key: &RangeScanCacheKey, @@ -363,7 +363,6 @@ impl CacheStrategy { /// Calls [CacheManager::put_range_result()]. /// It does nothing if the strategy isn't [CacheStrategy::EnableAll]. - #[cfg_attr(not(test), allow(dead_code))] pub(crate) fn put_range_result( &self, key: RangeScanCacheKey, @@ -476,7 +475,6 @@ pub struct CacheManager { /// Cache for time series selectors. selector_result_cache: Option, /// Cache for range scan outputs in flat format. - #[cfg_attr(not(test), allow(dead_code))] range_result_cache: Option, /// Cache for index result. index_result_cache: Option, @@ -713,7 +711,7 @@ impl CacheManager { } /// Gets cached result for range scan. - #[cfg_attr(not(test), allow(dead_code))] + #[allow(dead_code)] pub(crate) fn get_range_result( &self, key: &RangeScanCacheKey, @@ -723,8 +721,7 @@ impl CacheManager { .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE)) } - /// Puts range scan result into the cache. - #[cfg_attr(not(test), allow(dead_code))] + /// Puts range scan result into cache. 
pub(crate) fn put_range_result( &self, key: RangeScanCacheKey, @@ -949,7 +946,7 @@ impl CacheManagerBuilder { Cache::builder() .max_capacity(self.range_result_cache_size) .weigher(range_result_cache_weight) - .eviction_listener(|k, v, cause| { + .eviction_listener(move |k, v, cause| { let size = range_result_cache_weight(&k, &v); CACHE_BYTES .with_label_values(&[RANGE_RESULT_TYPE]) @@ -1361,7 +1358,7 @@ mod tests { } .build(), }; - let value = Arc::new(RangeScanCacheValue::new(Vec::new())); + let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0)); assert!(cache.get_range_result(&key).is_none()); cache.put_range_result(key.clone(), value.clone()); diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 602f5508ba..0eee067ab6 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -116,6 +116,8 @@ pub struct MitoConfig { pub page_cache_size: ReadableSize, /// Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache. pub selector_result_cache_size: ReadableSize, + /// Cache size for flat range scan results. Setting it to 0 to disable the cache. + pub range_result_cache_size: ReadableSize, /// Whether to enable the write cache. pub enable_write_cache: bool, /// File system path for write cache dir's root, defaults to `{data_home}`. 
@@ -200,6 +202,7 @@ impl Default for MitoConfig { vector_cache_size: ReadableSize::mb(512), page_cache_size: ReadableSize::mb(512), selector_result_cache_size: ReadableSize::mb(512), + range_result_cache_size: ReadableSize::mb(512), enable_write_cache: false, write_cache_path: String::new(), write_cache_size: ReadableSize::gb(5), @@ -336,6 +339,7 @@ impl MitoConfig { self.vector_cache_size = mem_cache_size; self.page_cache_size = page_cache_size; self.selector_result_cache_size = mem_cache_size; + self.range_result_cache_size = mem_cache_size; self.index.adjust_buffer_and_cache_size(sys_memory); } diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs index 71e49776c0..bf345c038e 100644 --- a/src/mito2/src/memtable/bulk/part.rs +++ b/src/mito2/src/memtable/bulk/part.rs @@ -967,7 +967,7 @@ impl EncodedBulkPart { Self { data, metadata } } - pub(crate) fn metadata(&self) -> &BulkPartMeta { + pub fn metadata(&self) -> &BulkPartMeta { &self.metadata } @@ -977,7 +977,7 @@ impl EncodedBulkPart { } /// Returns the encoded data. - pub(crate) fn data(&self) -> &Bytes { + pub fn data(&self) -> &Bytes { &self.data } @@ -1121,10 +1121,7 @@ pub struct BulkPartEncoder { } impl BulkPartEncoder { - pub(crate) fn new( - metadata: RegionMetadataRef, - row_group_size: usize, - ) -> Result { + pub fn new(metadata: RegionMetadataRef, row_group_size: usize) -> Result { // TODO(yingwen): Skip arrow schema if needed. let json = metadata.to_json().context(InvalidMetadataSnafu)?; let key_value_meta = @@ -1216,7 +1213,7 @@ impl BulkPartEncoder { } /// Encodes bulk part to a [EncodedBulkPart], returns the encoded data. 
- fn encode_part(&self, part: &BulkPart) -> Result> { + pub fn encode_part(&self, part: &BulkPart) -> Result> { if part.batch.num_rows() == 0 { return Ok(None); } diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs index 1e9d955321..904aae8c90 100644 --- a/src/mito2/src/memtable/bulk/part_reader.rs +++ b/src/mito2/src/memtable/bulk/part_reader.rs @@ -50,7 +50,7 @@ pub struct EncodedBulkPartIter { impl EncodedBulkPartIter { /// Creates a new [BulkPartIter]. - pub(crate) fn try_new( + pub fn try_new( encoded_part: &EncodedBulkPart, context: BulkIterContextRef, mut row_groups_to_read: VecDeque, diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs index 240a99c247..84931b9f37 100644 --- a/src/mito2/src/read.rs +++ b/src/mito2/src/read.rs @@ -27,6 +27,9 @@ pub mod projection; pub(crate) mod prune; pub(crate) mod pruner; pub mod range; +#[cfg(feature = "test")] +pub mod range_cache; +#[cfg(not(feature = "test"))] pub(crate) mod range_cache; pub mod scan_region; pub mod scan_util; diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs index 5b90e68bae..5fc8931691 100644 --- a/src/mito2/src/read/range_cache.rs +++ b/src/mito2/src/read/range_cache.rs @@ -17,12 +17,23 @@ use std::mem; use std::sync::Arc; +use async_stream::try_stream; +use common_time::range::TimestampRange; +use datatypes::arrow::array::{Array, AsArray, DictionaryArray}; +use datatypes::arrow::datatypes::UInt32Type; use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::ConcreteDataType; +use futures::TryStreamExt; +use store_api::region_engine::PartitionRange; use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector}; -use crate::memtable::record_batch_estimated_size; +use crate::cache::CacheStrategy; +use crate::read::BoxedRecordBatchStream; +use crate::read::scan_region::StreamContext; +use crate::read::scan_util::PartitionMetrics; use crate::region::options::MergeMode; +use 
crate::sst::file::FileTimeRange; +use crate::sst::parquet::flat_format::primary_key_column_index; /// Fingerprint of the scan request fields that affect partition range cache reuse. /// @@ -124,7 +135,6 @@ impl ScanRequestFingerprint { .unwrap_or(&[]) } - #[cfg(test)] pub(crate) fn without_time_filters(&self) -> Self { Self { inner: Arc::clone(&self.inner), @@ -163,7 +173,7 @@ impl ScanRequestFingerprint { #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(crate) struct RangeScanCacheKey { pub(crate) region_id: RegionId, - /// Sorted (file_id, row_group_index) pairs that uniquely identify the covered data. + /// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers. pub(crate) row_groups: Vec<(FileId, i64)>, pub(crate) scan: ScanRequestFingerprint, } @@ -179,30 +189,458 @@ impl RangeScanCacheKey { /// Cached result for one range scan. pub(crate) struct RangeScanCacheValue { pub(crate) batches: Vec, + /// Precomputed size of all batches, accounting for shared dictionary values. + estimated_batches_size: usize, } impl RangeScanCacheValue { - #[cfg_attr(not(test), allow(dead_code))] - pub(crate) fn new(batches: Vec) -> Self { - Self { batches } + pub(crate) fn new(batches: Vec, estimated_batches_size: usize) -> Self { + Self { + batches, + estimated_batches_size, + } } pub(crate) fn estimated_size(&self) -> usize { mem::size_of::() + self.batches.capacity() * mem::size_of::() - + self - .batches - .iter() - .map(record_batch_estimated_size) - .sum::() + + self.estimated_batches_size } } +/// Row groups and whether all sources are file-only for a partition range. +#[allow(dead_code)] +pub(crate) struct PartitionRangeRowGroups { + /// Sorted (file_id, row_group_index) pairs. + pub(crate) row_groups: Vec<(FileId, i64)>, + pub(crate) only_file_sources: bool, +} + +/// Collects (file_id, row_group_index) pairs from a partition range's row group indices. 
+#[allow(dead_code)] +pub(crate) fn collect_partition_range_row_groups( + stream_ctx: &StreamContext, + part_range: &PartitionRange, +) -> PartitionRangeRowGroups { + let range_meta = &stream_ctx.ranges[part_range.identifier]; + let mut row_groups = Vec::new(); + let mut only_file_sources = true; + + for index in &range_meta.row_group_indices { + if stream_ctx.is_file_range_index(*index) { + let file_id = stream_ctx.input.file_from_index(*index).file_id().file_id(); + row_groups.push((file_id, index.row_group_index)); + } else { + only_file_sources = false; + } + } + + row_groups.sort_unstable_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()).then(a.1.cmp(&b.1))); + + PartitionRangeRowGroups { + row_groups, + only_file_sources, + } +} + +/// Builds a cache key for the given partition range if it is eligible for caching. +#[allow(dead_code)] +pub(crate) fn build_range_cache_key( + stream_ctx: &StreamContext, + part_range: &PartitionRange, +) -> Option { + let fingerprint = stream_ctx.scan_fingerprint.as_ref()?; + + // Dyn filters can change at runtime, so we can't cache when they're present. 
+ let has_dyn_filters = stream_ctx + .input + .predicate_group() + .predicate_without_region() + .is_some_and(|p| !p.dyn_filters().is_empty()); + if has_dyn_filters { + return None; + } + + let rg = collect_partition_range_row_groups(stream_ctx, part_range); + if !rg.only_file_sources || rg.row_groups.is_empty() { + return None; + } + + let range_meta = &stream_ctx.ranges[part_range.identifier]; + let scan = if query_time_range_covers_partition_range( + stream_ctx.input.time_range.as_ref(), + range_meta.time_range, + ) { + fingerprint.without_time_filters() + } else { + fingerprint.clone() + }; + + Some(RangeScanCacheKey { + region_id: stream_ctx.input.region_metadata().region_id, + row_groups: rg.row_groups, + scan, + }) +} + +#[allow(dead_code)] +fn query_time_range_covers_partition_range( + query_time_range: Option<&TimestampRange>, + partition_time_range: FileTimeRange, +) -> bool { + let Some(query_time_range) = query_time_range else { + return true; + }; + + let (part_start, part_end) = partition_time_range; + query_time_range.contains(&part_start) && query_time_range.contains(&part_end) +} + +/// Returns a stream that replays cached record batches. +#[allow(dead_code)] +pub(crate) fn cached_flat_range_stream(value: Arc) -> BoxedRecordBatchStream { + Box::pin(futures::stream::iter( + value.batches.clone().into_iter().map(Ok), + )) +} + +/// Returns true if two primary key dictionary arrays share the same underlying +/// values buffers by pointer comparison. +/// +/// The primary key column is always `DictionaryArray` with `Binary` values. 
+fn pk_values_ptr_eq(a: &DictionaryArray, b: &DictionaryArray) -> bool { + let a = a.values().as_binary::(); + let b = b.values().as_binary::(); + let values_eq = a.values().ptr_eq(b.values()) && a.offsets().ptr_eq(b.offsets()); + match (a.nulls(), b.nulls()) { + (Some(a), Some(b)) => values_eq && a.inner().ptr_eq(b.inner()), + (None, None) => values_eq, + _ => false, + } +} + +/// Buffers record batches for caching, tracking memory size while deduplicating +/// shared dictionary values across batches. +/// +/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK +/// column's dictionary values are pointer-equal across batches, we assume all +/// dictionary columns share their values and deduct the total dictionary values size. +struct CacheBatchBuffer { + batches: Vec, + /// Running total of batch memory. + total_size: usize, + /// The first batch's PK dictionary array, for pointer comparison. + /// `None` if no dictionary PK column exists or no batch has been added yet. + first_pk_dict: Option>, + /// Sum of `get_array_memory_size()` of all dictionary value arrays from the first batch. + total_dict_values_size: usize, + /// Whether the PK dictionary is still shared across all batches seen so far. 
+ shared: bool, +} + +impl CacheBatchBuffer { + fn new() -> Self { + Self { + batches: Vec::new(), + total_size: 0, + first_pk_dict: None, + total_dict_values_size: 0, + shared: true, + } + } + + fn push(&mut self, batch: RecordBatch) { + if self.batches.is_empty() { + self.init_first_batch(&batch); + } else { + self.add_subsequent_batch(&batch); + } + self.batches.push(batch); + } + + fn init_first_batch(&mut self, batch: &RecordBatch) { + self.total_size += batch.get_array_memory_size(); + + let pk_col_idx = primary_key_column_index(batch.num_columns()); + let mut total_dict_values_size = 0; + for col_idx in 0..batch.num_columns() { + let col = batch.column(col_idx); + if let Some(dict) = col.as_any().downcast_ref::>() { + total_dict_values_size += dict.values().get_array_memory_size(); + if col_idx == pk_col_idx { + self.first_pk_dict = Some(dict.clone()); + } + } + } + self.total_dict_values_size = total_dict_values_size; + } + + fn add_subsequent_batch(&mut self, batch: &RecordBatch) { + let batch_size = batch.get_array_memory_size(); + + if self.shared + && let Some(first_pk_dict) = &self.first_pk_dict + { + let pk_col_idx = primary_key_column_index(batch.num_columns()); + let col = batch.column(pk_col_idx); + if let Some(dict) = col.as_any().downcast_ref::>() + && pk_values_ptr_eq(first_pk_dict, dict) + { + // PK dict is shared, deduct all dict values sizes. + self.total_size += batch_size - self.total_dict_values_size; + return; + } + // Dictionary diverged. + self.shared = false; + } + + self.total_size += batch_size; + } + + fn estimated_batches_size(&self) -> usize { + self.total_size + } + + fn into_batches(self) -> Vec { + self.batches + } +} + +/// Wraps a stream to cache its output for future range cache hits. 
+#[allow(dead_code)] +pub(crate) fn cache_flat_range_stream( + mut stream: BoxedRecordBatchStream, + cache_strategy: CacheStrategy, + key: RangeScanCacheKey, + part_metrics: PartitionMetrics, +) -> BoxedRecordBatchStream { + Box::pin(try_stream! { + let mut buffer = CacheBatchBuffer::new(); + while let Some(batch) = stream.try_next().await? { + buffer.push(batch.clone()); + yield batch; + } + + let estimated_size = buffer.estimated_batches_size(); + let batches = buffer.into_batches(); + let value = Arc::new(RangeScanCacheValue::new(batches, estimated_size)); + part_metrics.inc_range_cache_size(key.estimated_size() + value.estimated_size()); + cache_strategy.put_range_result(key, value); + }) +} + +/// Creates a `cache_flat_range_stream` with dummy internals for benchmarking. +/// +/// This avoids exposing `RangeScanCacheKey`, `ScanRequestFingerprint`, and +/// `PartitionMetrics` publicly. +#[cfg(feature = "test")] +pub fn bench_cache_flat_range_stream( + stream: BoxedRecordBatchStream, + cache_size_bytes: u64, + region_id: RegionId, +) -> BoxedRecordBatchStream { + use std::time::Instant; + + use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; + + use crate::region::options::MergeMode; + + let cache_manager = Arc::new( + crate::cache::CacheManager::builder() + .range_result_cache_size(cache_size_bytes) + .build(), + ); + let cache_strategy = CacheStrategy::EnableAll(cache_manager); + + let fingerprint = ScanRequestFingerprintBuilder { + read_column_ids: vec![], + read_column_types: vec![], + filters: vec![], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: false, + merge_mode: MergeMode::LastRow, + partition_expr_version: 0, + } + .build(); + + let key = RangeScanCacheKey { + region_id, + row_groups: vec![], + scan: fingerprint, + }; + + let metrics_set = ExecutionPlanMetricsSet::new(); + let part_metrics = + PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set); + + 
cache_flat_range_stream(stream, cache_strategy, key, part_metrics) +} + #[cfg(test)] mod tests { - use store_api::storage::TimeSeriesRowSelector; + use std::sync::Arc; + use std::time::Instant; + + use common_time::Timestamp; + use common_time::range::TimestampRange; + use common_time::timestamp::TimeUnit; + use datafusion_common::ScalarValue; + use datafusion_expr::{Expr, col, lit}; + use smallvec::smallvec; + use store_api::storage::FileId; use super::*; + use crate::cache::CacheManager; + use crate::read::projection::ProjectionMapper; + use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex}; + use crate::read::scan_region::{PredicateGroup, ScanInput}; + use crate::test_util::memtable_util::metadata_with_primary_key; + use crate::test_util::scheduler_util::SchedulerEnv; + use crate::test_util::sst_util::sst_file_handle_with_file_id; + + fn test_cache_strategy() -> CacheStrategy { + CacheStrategy::EnableAll(Arc::new( + CacheManager::builder() + .range_result_cache_size(1024) + .build(), + )) + } + + async fn new_stream_context( + filters: Vec, + query_time_range: Option, + partition_time_range: FileTimeRange, + ) -> (StreamContext, PartitionRange) { + let env = SchedulerEnv::new().await; + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); + let file_id = FileId::random(); + let file = sst_file_handle_with_file_id( + file_id, + partition_time_range.0.value(), + partition_time_range.1.value(), + ); + let input = ScanInput::new(env.access_layer.clone(), mapper) + .with_predicate(predicate) + .with_time_range(query_time_range) + .with_files(vec![file]) + .with_cache(test_cache_strategy()) + .with_flat_format(true); + let range_meta = RangeMeta { + time_range: partition_time_range, + indices: smallvec![SourceIndex { + index: 0, + num_row_groups: 1, + }], + row_group_indices: 
smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }], + num_rows: 10, + }; + let partition_range = range_meta.new_partition_range(0); + let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input); + let stream_ctx = StreamContext { + input, + ranges: vec![range_meta], + scan_fingerprint, + query_start: Instant::now(), + }; + + (stream_ctx, partition_range) + } + + /// Helper to create a timestamp millisecond literal. + fn ts_lit(val: i64) -> Expr { + lit(ScalarValue::TimestampMillisecond(Some(val), None)) + } + + #[tokio::test] + async fn strips_time_only_filters_when_query_covers_partition_range() { + let (stream_ctx, part_range) = new_stream_context( + vec![ + col("ts").gt_eq(ts_lit(1000)), + col("ts").lt(ts_lit(2001)), + col("ts").is_not_null(), + col("k0").eq(lit("foo")), + ], + TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond), + ( + Timestamp::new_millisecond(1000), + Timestamp::new_millisecond(2000), + ), + ) + .await; + + let key = build_range_cache_key(&stream_ctx, &part_range).unwrap(); + + // Range-reducible time filters should be cleared when query covers partition range. + assert!(key.scan.time_filters().is_empty()); + // Non-range time predicates stay in filters. + let mut expected_filters = [ + col("k0").eq(lit("foo")).to_string(), + col("ts").is_not_null().to_string(), + ]; + expected_filters.sort_unstable(); + assert_eq!(key.scan.filters(), expected_filters.as_slice()); + } + + #[tokio::test] + async fn preserves_time_filters_when_query_does_not_cover_partition_range() { + let (stream_ctx, part_range) = new_stream_context( + vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))], + TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond), + ( + Timestamp::new_millisecond(1000), + Timestamp::new_millisecond(2000), + ), + ) + .await; + + let key = build_range_cache_key(&stream_ctx, &part_range).unwrap(); + + // Time filters should be preserved when query does not cover partition range. 
+ assert_eq!( + key.scan.time_filters(), + [col("ts").gt_eq(ts_lit(1000)).to_string()].as_slice() + ); + assert_eq!( + key.scan.filters(), + [col("k0").eq(lit("foo")).to_string()].as_slice() + ); + } + + #[tokio::test] + async fn strips_time_only_filters_when_query_has_no_time_range_limit() { + let (stream_ctx, part_range) = new_stream_context( + vec![ + col("ts").gt_eq(ts_lit(1000)), + col("ts").is_not_null(), + col("k0").eq(lit("foo")), + ], + None, + ( + Timestamp::new_millisecond(1000), + Timestamp::new_millisecond(2000), + ), + ) + .await; + + let key = build_range_cache_key(&stream_ctx, &part_range).unwrap(); + + // Range-reducible time filters should be cleared when query has no time range limit. + assert!(key.scan.time_filters().is_empty()); + // Non-range time predicates stay in filters. + let mut expected_filters = [ + col("k0").eq(lit("foo")).to_string(), + col("ts").is_not_null().to_string(), + ]; + expected_filters.sort_unstable(); + assert_eq!(key.scan.filters(), expected_filters.as_slice()); + } #[test] fn normalizes_and_clears_time_filters() { @@ -249,4 +687,170 @@ mod tests { fingerprint.partition_expr_version ); } + + /// Creates a test schema with 5 columns where the primary key dictionary column + /// is at index 2 (`num_columns - 3`), matching the flat format layout. + /// + /// Layout: `[field0: Int64, field1: Int64, pk: Dictionary, ts: Int64, seq: Int64]` + fn dict_test_schema() -> Arc { + use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; + Arc::new(Schema::new(vec![ + Field::new("field0", ArrowDataType::Int64, false), + Field::new("field1", ArrowDataType::Int64, false), + Field::new( + "pk", + ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt32), + Box::new(ArrowDataType::Binary), + ), + false, + ), + Field::new("ts", ArrowDataType::Int64, false), + Field::new("seq", ArrowDataType::Int64, false), + ])) + } + + /// Helper to create a record batch with a dictionary column at the primary key position. 
+ fn make_dict_batch( + schema: Arc, + dict_values: &datatypes::arrow::array::BinaryArray, + keys: &[u32], + int_values: &[i64], + ) -> RecordBatch { + use datatypes::arrow::array::{Int64Array, UInt32Array}; + + let key_array = UInt32Array::from(keys.to_vec()); + let dict_array: DictionaryArray = + DictionaryArray::new(key_array, Arc::new(dict_values.clone())); + let int_array = Int64Array::from(int_values.to_vec()); + let zeros = Int64Array::from(vec![0i64; int_values.len()]); + RecordBatch::try_new( + schema, + vec![ + Arc::new(zeros.clone()), + Arc::new(int_array), + Arc::new(dict_array), + Arc::new(zeros.clone()), + Arc::new(zeros), + ], + ) + .unwrap() + } + + /// Computes the total `get_array_memory_size()` of all dictionary value arrays in a batch. + fn compute_total_dict_values_size(batch: &RecordBatch) -> usize { + batch + .columns() + .iter() + .filter_map(|col| { + col.as_any() + .downcast_ref::>() + .map(|dict| dict.values().get_array_memory_size()) + }) + .sum() + } + + #[test] + fn cache_batch_buffer_empty() { + let buffer = CacheBatchBuffer::new(); + assert_eq!(buffer.estimated_batches_size(), 0); + assert!(buffer.into_batches().is_empty()); + } + + #[test] + fn cache_batch_buffer_single_batch() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]); + let batch = make_dict_batch(schema, &dict_values, &[0, 1, 2], &[10, 20, 30]); + + let full_size = batch.get_array_memory_size(); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch); + assert_eq!(buffer.estimated_batches_size(), full_size); + assert_eq!(buffer.into_batches().len(), 1); + } + + #[test] + fn cache_batch_buffer_shared_dictionary() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]); + + // Two batches sharing the same dictionary values array. 
+ let batch1 = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]); + let batch2 = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]); + + let batch1_full = batch1.get_array_memory_size(); + let batch2_full = batch2.get_array_memory_size(); + + // The total dictionary values size that should be deduplicated for the second batch. + let dict_values_size = compute_total_dict_values_size(&batch2); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch1); + buffer.push(batch2); + + // Second batch's dict values should not be counted again. + assert_eq!( + buffer.estimated_batches_size(), + batch1_full + batch2_full - dict_values_size + ); + assert_eq!(buffer.into_batches().len(), 2); + } + + #[test] + fn cache_batch_buffer_non_shared_dictionary() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let dict_values1 = BinaryArray::from_vec(vec![b"a", b"b"]); + let dict_values2 = BinaryArray::from_vec(vec![b"x", b"y"]); + + let batch1 = make_dict_batch(schema.clone(), &dict_values1, &[0, 1], &[10, 20]); + let batch2 = make_dict_batch(schema, &dict_values2, &[0, 1], &[30, 40]); + + let batch1_full = batch1.get_array_memory_size(); + let batch2_full = batch2.get_array_memory_size(); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch1); + buffer.push(batch2); + + // Different dictionaries: full size for both. 
+ assert_eq!(buffer.estimated_batches_size(), batch1_full + batch2_full); + } + + #[test] + fn cache_batch_buffer_shared_then_diverged() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]); + let different_values = BinaryArray::from_vec(vec![b"x", b"y"]); + + let batch1 = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]); + let batch2 = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]); + let batch3 = make_dict_batch(schema, &different_values, &[0], &[3]); + + let size1 = batch1.get_array_memory_size(); + let size2 = batch2.get_array_memory_size(); + let size3 = batch3.get_array_memory_size(); + + let dict_values_size = compute_total_dict_values_size(&batch2); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch1); + buffer.push(batch2); + buffer.push(batch3); + + // batch2 shares dict with batch1 (dedup), batch3 does not (full size). + assert_eq!( + buffer.estimated_batches_size(), + size1 + (size2 - dict_values_size) + size3 + ); + } } diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 5cb2d75e25..e7cae7e7b8 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -40,7 +40,7 @@ use store_api::region_engine::{PartitionRange, RegionScannerRef}; use store_api::storage::{ ColumnId, RegionId, ScanRequest, SequenceRange, TimeSeriesDistribution, TimeSeriesRowSelector, }; -use table::predicate::{Predicate, build_time_range_predicate}; +use table::predicate::{Predicate, build_time_range_predicate, extract_time_range_from_expr}; use tokio::sync::{Semaphore, mpsc}; use tokio_stream::wrappers::ReceiverStream; @@ -1420,7 +1420,6 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { /// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible /// for partition range caching. 
-#[cfg_attr(not(test), allow(dead_code))] pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option { let eligible = input.flat_format && !input.compaction @@ -1439,7 +1438,14 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option Option false, }; - if is_time_only { + if is_time_only + && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some() + { + // Range-reducible time predicates can be safely dropped from the + // cache key when the query time range covers the partition range. time_filters.push(expr.to_string()); } else { + // Non-time filters and non-range time predicates (those that + // extract_time_range_from_expr cannot convert to a TimestampRange) + // always stay in the cache key. filters.push(expr.to_string()); } } @@ -1511,6 +1524,10 @@ pub struct StreamContext { pub input: ScanInput, /// Metadata for partition ranges. pub(crate) ranges: Vec, + /// Precomputed scan fingerprint for partition range caching. + /// `None` when the scan is not eligible for caching. + #[allow(dead_code)] + pub(crate) scan_fingerprint: Option, // Metrics: /// The start time of the query. 
@@ -1523,10 +1540,12 @@ impl StreamContext { let query_start = input.query_start.unwrap_or_else(Instant::now); let ranges = RangeMeta::seq_scan_ranges(&input); READ_SST_COUNT.observe(input.num_files() as f64); + let scan_fingerprint = build_scan_fingerprint(&input); Self { input, ranges, + scan_fingerprint, query_start, } } @@ -1536,10 +1555,12 @@ impl StreamContext { let query_start = input.query_start.unwrap_or_else(Instant::now); let ranges = RangeMeta::unordered_scan_ranges(&input); READ_SST_COUNT.observe(input.num_files() as f64); + let scan_fingerprint = build_scan_fingerprint(&input); Self { input, ranges, + scan_fingerprint, query_start, } } @@ -1849,6 +1870,7 @@ mod tests { use std::sync::Arc; use datafusion::physical_plan::expressions::lit as physical_lit; + use datafusion_common::ScalarValue; use datafusion_expr::{col, lit}; use datatypes::value::Value; use partition::expr::col as partition_col; @@ -2035,13 +2057,18 @@ mod tests { assert!(scan_region.use_flat_format()); } + /// Helper to create a timestamp millisecond literal. 
+ fn ts_lit(val: i64) -> datafusion_expr::Expr { + lit(ScalarValue::TimestampMillisecond(Some(val), None)) + } + #[tokio::test] async fn test_build_scan_fingerprint_for_eligible_scan() { let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); let input = new_scan_input( metadata.clone(), vec![ - col("ts").gt_eq(lit(1000)), + col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo")), col("v0").gt(lit(1)), ], @@ -2071,7 +2098,7 @@ mod tests { col("k0").eq(lit("foo")).to_string(), col("v0").gt(lit(1)).to_string(), ], - time_filters: vec![col("ts").gt_eq(lit(1000)).to_string()], + time_filters: vec![col("ts").gt_eq(ts_lit(1000)).to_string()], series_row_selector: Some(TimeSeriesRowSelector::LastRow), append_mode: false, filter_deleted: false, diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 0ee6a4437d..6f68616709 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -247,6 +247,12 @@ pub(crate) struct ScanMetricsSet { num_range_builders: isize, /// Peak number of file range builders. num_peak_range_builders: isize, + /// Total bytes added to the range cache during this scan. + range_cache_size: usize, + /// Number of range cache hits during this scan. + range_cache_hit: usize, + /// Number of range cache misses during this scan. + range_cache_miss: usize, } /// Wrapper for file metrics that compares by total cost in reverse order. 
@@ -345,6 +351,9 @@ impl fmt::Debug for ScanMetricsSet { build_ranges_peak_mem_size, num_range_builders: _, num_peak_range_builders, + range_cache_size, + range_cache_hit, + range_cache_miss, } = self; // Write core metrics @@ -590,6 +599,16 @@ impl fmt::Debug for ScanMetricsSet { write!(f, "}}")?; } + if *range_cache_size > 0 { + write!(f, ", \"range_cache_size\":{range_cache_size}")?; + } + if *range_cache_hit > 0 { + write!(f, ", \"range_cache_hit\":{range_cache_hit}")?; + } + if *range_cache_miss > 0 { + write!(f, ", \"range_cache_miss\":{range_cache_miss}")?; + } + write!( f, ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \ @@ -1097,6 +1116,27 @@ impl PartitionMetrics { pub(crate) fn dedup_metrics_reporter(&self) -> Arc { self.0.clone() } + + /// Increments the total bytes added to the range cache. + #[allow(dead_code)] + pub(crate) fn inc_range_cache_size(&self, size: usize) { + let mut metrics = self.0.metrics.lock().unwrap(); + metrics.range_cache_size += size; + } + + /// Increments the range cache hit counter. + #[allow(dead_code)] + pub(crate) fn inc_range_cache_hit(&self) { + let mut metrics = self.0.metrics.lock().unwrap(); + metrics.range_cache_hit += 1; + } + + /// Increments the range cache miss counter. + #[allow(dead_code)] + pub(crate) fn inc_range_cache_miss(&self) { + let mut metrics = self.0.metrics.lock().unwrap(); + metrics.range_cache_miss += 1; + } } impl fmt::Debug for PartitionMetrics { diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index 842689bba6..350195bfa9 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -15,6 +15,7 @@ //! Utilities for testing. 
pub mod batch_util; +pub mod bench_util; pub mod memtable_util; pub mod scheduler_util; pub mod sst_util; diff --git a/src/mito2/src/test_util/bench_util.rs b/src/mito2/src/test_util/bench_util.rs new file mode 100644 index 0000000000..8f182e4157 --- /dev/null +++ b/src/mito2/src/test_util/bench_util.rs @@ -0,0 +1,259 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared utilities for mito2 benchmarks. +//! +//! Provides a TSBS cpu-like data generator ([`CpuDataGenerator`]) and schema +//! ([`cpu_metadata`]) used by multiple benchmark binaries in this directory. 
+ +use api::v1::value::ValueData; +use api::v1::{Row, Rows, SemanticType}; +use datafusion_common::Column; +use datafusion_expr::{Expr, lit}; +use datatypes::data_type::ConcreteDataType; +use datatypes::schema::ColumnSchema; +use rand::Rng; +use rand::rngs::ThreadRng; +use rand::seq::IndexedRandom; +use store_api::metadata::{ + ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, +}; +use store_api::storage::RegionId; +use table::predicate::Predicate; + +use crate::memtable::KeyValues; +use crate::test_util::memtable_util::region_metadata_to_row_schema; + +pub struct Host { + pub hostname: String, + pub region: String, + pub datacenter: String, + pub rack: String, + pub os: String, + pub arch: String, + pub team: String, + pub service: String, + pub service_version: String, + pub service_environment: String, +} + +impl Host { + pub fn random_with_id(id: usize) -> Host { + let mut rng = rand::rng(); + let region = format!("ap-southeast-{}", rng.random_range(0..10)); + let datacenter = format!( + "{}{}", + region, + ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap() + ); + Host { + hostname: format!("host_{id}"), + region, + datacenter, + rack: rng.random_range(0..100).to_string(), + os: "Ubuntu16.04LTS".to_string(), + arch: "x86".to_string(), + team: "CHI".to_string(), + service: rng.random_range(0..100).to_string(), + service_version: rng.random_range(0..10).to_string(), + service_environment: "test".to_string(), + } + } + + pub fn fill_values(&self, values: &mut Vec) { + let tags = [ + api::v1::Value { + value_data: Some(ValueData::StringValue(self.hostname.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.region.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.datacenter.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.rack.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.os.clone())), + }, + api::v1::Value { + 
value_data: Some(ValueData::StringValue(self.arch.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.team.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.service.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.service_version.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.service_environment.clone())), + }, + ]; + for tag in tags { + values.push(tag); + } + } +} + +pub struct CpuDataGenerator { + pub metadata: RegionMetadataRef, + column_schemas: Vec, + hosts: Vec, + start_sec: i64, + end_sec: i64, +} + +impl CpuDataGenerator { + pub fn new( + metadata: RegionMetadataRef, + num_hosts: usize, + start_sec: i64, + end_sec: i64, + ) -> Self { + let column_schemas = region_metadata_to_row_schema(&metadata); + Self { + metadata, + column_schemas, + hosts: Self::generate_hosts(num_hosts), + start_sec, + end_sec, + } + } + + pub fn iter(&self) -> impl Iterator + '_ { + // point per 10s. 
+ (self.start_sec..self.end_sec) + .step_by(10) + .enumerate() + .map(|(seq, ts)| self.build_key_values(seq, ts)) + } + + pub fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues { + let rows = self + .hosts + .iter() + .map(|host| { + let mut rng = rand::rng(); + let mut values = Vec::with_capacity(21); + values.push(api::v1::Value { + value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)), + }); + host.fill_values(&mut values); + for _ in 0..10 { + values.push(api::v1::Value { + value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))), + }); + } + Row { values } + }) + .collect(); + let mutation = api::v1::Mutation { + op_type: api::v1::OpType::Put as i32, + sequence: seq as u64, + rows: Some(Rows { + schema: self.column_schemas.clone(), + rows, + }), + write_hint: None, + }; + + KeyValues::new(&self.metadata, mutation).unwrap() + } + + pub fn random_host_filter(&self) -> Predicate { + let host = self.random_hostname(); + let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host)); + Predicate::new(vec![expr]) + } + + pub fn random_host_filter_exprs(&self) -> Vec { + let host = self.random_hostname(); + vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))] + } + + pub fn random_hostname(&self) -> String { + let mut rng = rand::rng(); + self.hosts.choose(&mut rng).unwrap().hostname.clone() + } + + pub fn random_f64(rng: &mut ThreadRng) -> f64 { + let base: u32 = rng.random_range(30..95); + base as f64 + } + + pub fn generate_hosts(num_hosts: usize) -> Vec { + (0..num_hosts).map(Host::random_with_id).collect() + } +} + +/// Creates a metadata for TSBS cpu-like table. 
+pub fn cpu_metadata() -> RegionMetadata { + let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 0, + }); + let mut column_id = 1; + let tags = [ + "hostname", + "region", + "datacenter", + "rack", + "os", + "arch", + "team", + "service", + "service_version", + "service_environment", + ]; + for tag in tags { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true), + semantic_type: SemanticType::Tag, + column_id, + }); + column_id += 1; + } + let fields = [ + "usage_user", + "usage_system", + "usage_idle", + "usage_nice", + "usage_iowait", + "usage_irq", + "usage_softirq", + "usage_steal", + "usage_guest", + "usage_guest_nice", + ]; + for field in fields { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true), + semantic_type: SemanticType::Field, + column_id, + }); + column_id += 1; + } + builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + builder.build().unwrap() +} diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs index 8917875250..25ab9bb8b4 100644 --- a/src/mito2/src/test_util/memtable_util.rs +++ b/src/mito2/src/test_util/memtable_util.rs @@ -30,7 +30,7 @@ use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortFi use store_api::metadata::{ ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, }; -use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange}; +use store_api::storage::{ColumnId, RegionId, SequenceNumber}; use crate::error::Result; use crate::memtable::bulk::part::BulkPart; diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 
71896b3d5d..fd5ad82f3f 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -207,6 +207,7 @@ impl WorkerGroup { .vector_cache_size(config.vector_cache_size.as_bytes()) .page_cache_size(config.page_cache_size.as_bytes()) .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) + .range_result_cache_size(config.range_result_cache_size.as_bytes()) .index_metadata_size(config.index.metadata_cache_size.as_bytes()) .index_content_size(config.index.content_cache_size.as_bytes()) .index_content_page_size(config.index.content_cache_page_size.as_bytes()) @@ -421,6 +422,7 @@ impl WorkerGroup { .vector_cache_size(config.vector_cache_size.as_bytes()) .page_cache_size(config.page_cache_size.as_bytes()) .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) + .range_result_cache_size(config.range_result_cache_size.as_bytes()) .write_cache(write_cache) .build(), ); diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs index f9be7be16e..2c9ac41560 100644 --- a/src/table/src/predicate.rs +++ b/src/table/src/predicate.rs @@ -203,7 +203,7 @@ pub fn build_time_range_predicate( /// Extract time range filter from `WHERE`/`IN (...)`/`BETWEEN` clauses. /// Return None if no time range can be found in expr. 
-fn extract_time_range_from_expr( +pub fn extract_time_range_from_expr( ts_col_name: &str, ts_col_unit: TimeUnit, expr: &Expr, diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 65e56fa15e..7ae59ae9fc 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1642,6 +1642,7 @@ fn drop_lines_with_inconsistent_results(input: String) -> String { "metadata_cache_size =", "content_cache_size =", "result_cache_size =", + "range_result_cache_size =", "name =", "recovery_parallelism =", "max_background_index_builds =", From c8c2e09eedd5a2f42acd599d76d4301e29abae53 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Tue, 24 Mar 2026 18:21:31 +0800 Subject: [PATCH 034/195] refactor: move election trait and implementations to the `common-meta` crate (#7820) * refactor: move election impl to common-meta Signed-off-by: shuiyisong * fix: adding back comment Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/{meta-srv => common/meta}/src/election.rs | 76 ++++++++++++++++++- .../meta}/src/election/etcd.rs | 14 ++-- .../meta}/src/election/rds.rs | 4 +- .../meta}/src/election/rds/mysql.rs | 42 +++++----- .../meta}/src/election/rds/postgres.rs | 45 ++++++----- src/common/meta/src/error.rs | 70 +++++++++++++++-- src/common/meta/src/lib.rs | 1 + src/meta-srv/src/bootstrap.rs | 21 ++--- src/meta-srv/src/cluster.rs | 4 +- src/meta-srv/src/lib.rs | 1 - src/meta-srv/src/metasrv.rs | 74 +----------------- src/meta-srv/src/service/admin/leader.rs | 2 +- src/meta-srv/src/service/cluster.rs | 5 +- src/meta-srv/src/service/heartbeat.rs | 4 +- 14 files changed, 218 insertions(+), 145 deletions(-) rename src/{meta-srv => common/meta}/src/election.rs (67%) rename src/{meta-srv => common/meta}/src/election/etcd.rs (94%) rename src/{meta-srv => common/meta}/src/election/rds.rs (96%) rename src/{meta-srv => common/meta}/src/election/rds/mysql.rs (97%) rename src/{meta-srv => 
common/meta}/src/election/rds/postgres.rs (97%) diff --git a/src/meta-srv/src/election.rs b/src/common/meta/src/election.rs similarity index 67% rename from src/meta-srv/src/election.rs rename to src/common/meta/src/election.rs index 2d2826b286..12173beda8 100644 --- a/src/meta-srv/src/election.rs +++ b/src/common/meta/src/election.rs @@ -21,15 +21,85 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use common_telemetry::{error, info, warn}; +use serde::{Deserialize, Serialize}; use tokio::sync::broadcast::error::RecvError; use tokio::sync::broadcast::{self, Receiver, Sender}; use crate::error::Result; -use crate::metasrv::MetasrvNodeInfo; -pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600; +pub const CANDIDATE_LEASE_SECS: u64 = 600; const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2; +/// The value of the leader. It is used to store the leader's address. +pub struct LeaderValue(pub String); + +impl> From for LeaderValue { + fn from(value: T) -> Self { + let string = String::from_utf8_lossy(value.as_ref()); + Self(string.to_string()) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetasrvNodeInfo { + // The metasrv's address + pub addr: String, + // The node build version + pub version: String, + // The node build git commit hash + pub git_commit: String, + // The node start timestamp in milliseconds + pub start_time_ms: u64, + // The node total cpu millicores + #[serde(default)] + pub total_cpu_millicores: i64, + // The node total memory bytes + #[serde(default)] + pub total_memory_bytes: i64, + /// The node build cpu usage millicores + #[serde(default)] + pub cpu_usage_millicores: i64, + /// The node build memory usage bytes + #[serde(default)] + pub memory_usage_bytes: i64, + // The node hostname + #[serde(default)] + pub hostname: String, +} + +// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto. 
+#[allow(deprecated)] +impl From for api::v1::meta::MetasrvNodeInfo { + fn from(node_info: MetasrvNodeInfo) -> Self { + Self { + peer: Some(api::v1::meta::Peer { + addr: node_info.addr, + ..Default::default() + }), + // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version. + // New code should use the fields in `info.NodeInfo` instead. + version: node_info.version.clone(), + git_commit: node_info.git_commit.clone(), + start_time_ms: node_info.start_time_ms, + cpus: node_info.total_cpu_millicores as u32, + memory_bytes: node_info.total_memory_bytes as u64, + // The canonical location for node information. + info: Some(api::v1::meta::NodeInfo { + version: node_info.version, + git_commit: node_info.git_commit, + start_time_ms: node_info.start_time_ms, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, + cpus: node_info.total_cpu_millicores as u32, + memory_bytes: node_info.total_memory_bytes as u64, + hostname: node_info.hostname, + }), + } + } +} + /// Messages sent when the leader changes. 
#[derive(Debug, Clone)] pub enum LeaderChangeMessage { @@ -168,3 +238,5 @@ pub trait Election: Send + Sync { fn subscribe_leader_change(&self) -> Receiver; } + +pub type ElectionRef = Arc>; diff --git a/src/meta-srv/src/election/etcd.rs b/src/common/meta/src/election/etcd.rs similarity index 94% rename from src/meta-srv/src/election/etcd.rs rename to src/common/meta/src/election/etcd.rs index 883f723d74..affad31ef4 100644 --- a/src/meta-srv/src/election/etcd.rs +++ b/src/common/meta/src/election/etcd.rs @@ -16,8 +16,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS}; -use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use etcd_client::{ Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions, @@ -27,13 +25,15 @@ use tokio::sync::broadcast; use tokio::sync::broadcast::Receiver; use tokio::time::{MissedTickBehavior, timeout}; +use crate::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS}; use crate::election::{ - CANDIDATE_LEASE_SECS, Election, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, LeaderKey, - listen_leader_change, send_leader_change_and_set_flags, + CANDIDATE_LEASE_SECS, Election, ElectionRef, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, + LeaderKey, LeaderValue, MetasrvNodeInfo, listen_leader_change, + send_leader_change_and_set_flags, }; use crate::error; use crate::error::Result; -use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo}; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; impl LeaderKey for EtcdLeaderKey { fn name(&self) -> &[u8] { @@ -253,7 +253,7 @@ impl Election for EtcdElection { .leader(self.election_key()) .await .context(error::EtcdFailedSnafu)?; - let leader_value = res.kv().context(error::NoLeaderSnafu)?.value(); + let leader_value = 
res.kv().context(error::ElectionNoLeaderSnafu)?.value(); Ok(leader_value.into()) } } @@ -279,7 +279,7 @@ impl EtcdElection { ensure!( res.ttl() > 0, error::UnexpectedSnafu { - violated: "Failed to refresh the lease", + err_msg: "Failed to refresh the lease".to_string(), } ); diff --git a/src/meta-srv/src/election/rds.rs b/src/common/meta/src/election/rds.rs similarity index 96% rename from src/meta-srv/src/election/rds.rs rename to src/common/meta/src/election/rds.rs index 16e113415a..6ee529ee02 100644 --- a/src/meta-srv/src/election/rds.rs +++ b/src/common/meta/src/election/rds.rs @@ -36,7 +36,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> { .split(LEASE_SEP) .collect_tuple() .with_context(|| UnexpectedSnafu { - violated: format!( + err_msg: format!( "Invalid value {}, expect node info || {} || expire time", value, LEASE_SEP ), @@ -45,7 +45,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> { let expire_time = match Timestamp::from_str(expire_time, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", expire_time), + err_msg: format!("Invalid timestamp: {}", expire_time), } .fail()?, }; diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/common/meta/src/election/rds/mysql.rs similarity index 97% rename from src/meta-srv/src/election/rds/mysql.rs rename to src/common/meta/src/election/rds/mysql.rs index 20051a2610..80f3d8ca7c 100644 --- a/src/meta-srv/src/election/rds/mysql.rs +++ b/src/common/meta/src/election/rds/mysql.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use common_time::Timestamp; use snafu::{OptionExt, ResultExt, ensure}; @@ -29,14 +28,15 @@ use tokio::time::MissedTickBehavior; use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time}; use 
crate::election::{ - Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags, + Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change, + send_leader_change_and_set_flags, }; use crate::error::{ AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu, - LeaderLeaseChangedSnafu, LeaderLeaseExpiredSnafu, MySqlExecutionSnafu, NoLeaderSnafu, Result, - SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu, + ElectionLeaderLeaseChangedSnafu, ElectionLeaderLeaseExpiredSnafu, ElectionNoLeaderSnafu, + MySqlExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu, }; -use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo}; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; struct ElectionSqlFactory<'a> { table_name: &'a str, @@ -592,7 +592,7 @@ impl Election for MySqlElection { ensure!( lease.expire_time > lease.current, UnexpectedSnafu { - violated: format!( + err_msg: format!( "Candidate lease expired at {:?} (current time: {:?}), key: {:?}", lease.expire_time, lease.current, @@ -667,10 +667,10 @@ impl Election for MySqlElection { let client = self.client.lock().await; let mut executor = Executor::Default(client); if let Some(lease) = self.get_value_with_lease(&key, &mut executor).await? 
{ - ensure!(lease.expire_time > lease.current, NoLeaderSnafu); + ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu); Ok(lease.leader_value.as_bytes().into()) } else { - NoLeaderSnafu.fail() + ElectionNoLeaderSnafu.fail() } } } @@ -705,7 +705,7 @@ impl MySqlElection { let current_time = match Timestamp::from_str(&current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -740,7 +740,7 @@ impl MySqlElection { current = match Timestamp::from_str(current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -777,7 +777,7 @@ impl MySqlElection { ensure!( res == 1, UnexpectedSnafu { - violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)), + err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)), } ); @@ -920,9 +920,12 @@ impl MySqlElection { /// will be released. /// - **Case 2**: If all checks pass, the function returns without performing any actions. 
fn lease_check(&self, lease: &Option) -> Result { - let lease = lease.as_ref().context(NoLeaderSnafu)?; + let lease = lease.as_ref().context(ElectionNoLeaderSnafu)?; // Case 1: Lease expired - ensure!(lease.expire_time > lease.current, LeaderLeaseExpiredSnafu); + ensure!( + lease.expire_time > lease.current, + ElectionLeaderLeaseExpiredSnafu + ); // Case 2: Everything is fine Ok(lease.clone()) } @@ -960,7 +963,7 @@ impl MySqlElection { let remote_lease = self.get_value_with_lease(&key, &mut executor).await?; ensure!( expected_lease.map(|lease| lease.origin) == remote_lease.map(|lease| lease.origin), - LeaderLeaseChangedSnafu + ElectionLeaderLeaseChangedSnafu ); self.delete_value(&key, &mut executor).await?; self.put_value_with_lease( @@ -987,12 +990,11 @@ mod tests { use std::assert_matches::assert_matches; use std::env; - use common_meta::maybe_skip_mysql_integration_test; use common_telemetry::init_default_ut_logging; + use sqlx::MySqlPool; use super::*; - use crate::error; - use crate::utils::mysql::create_mysql_pool; + use crate::{error, maybe_skip_mysql_integration_test}; async fn create_mysql_client( table_name: Option<&str>, @@ -1003,11 +1005,11 @@ mod tests { let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default(); if endpoint.is_empty() { return UnexpectedSnafu { - violated: "MySQL endpoint is empty".to_string(), + err_msg: "MySQL endpoint is empty".to_string(), } .fail(); } - let pool = create_mysql_pool(&[endpoint], None).await.unwrap(); + let pool = MySqlPool::connect(&endpoint).await.unwrap(); let mut client = ElectionMysqlClient::new( pool, execution_timeout, @@ -1302,7 +1304,7 @@ mod tests { let err = elected(&leader_mysql_election, table_name, Some(incorrect_lease)) .await .unwrap_err(); - assert_matches!(err, error::Error::LeaderLeaseChanged { .. }); + assert_matches!(err, error::Error::ElectionLeaderLeaseChanged { .. 
}); let lease = get_lease(&leader_mysql_election).await; assert!(lease.is_none()); drop_table(&leader_mysql_election.client, table_name).await; diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/common/meta/src/election/rds/postgres.rs similarity index 97% rename from src/meta-srv/src/election/rds/postgres.rs rename to src/common/meta/src/election/rds/postgres.rs index c21efd780b..01910335a0 100644 --- a/src/meta-srv/src/election/rds/postgres.rs +++ b/src/common/meta/src/election/rds/postgres.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use common_time::Timestamp; use deadpool_postgres::{Manager, Pool}; @@ -28,13 +27,15 @@ use tokio_postgres::types::ToSql; use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time}; use crate::election::{ - Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags, + Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change, + send_leader_change_and_set_flags, }; use crate::error::{ - DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu, - Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu, + DeserializeFromJsonSnafu, ElectionNoLeaderSnafu, GetPostgresClientSnafu, + PostgresExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, + UnexpectedSnafu, }; -use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo}; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; struct ElectionSqlFactory<'a> { lock_id: u64, @@ -404,13 +405,13 @@ impl Election for PgElection { .get_value_with_lease(&key) .await? 
.context(UnexpectedSnafu { - violated: format!("Failed to get lease for key: {:?}", key), + err_msg: format!("Failed to get lease for key: {:?}", key), })?; ensure!( lease.expire_time > lease.current, UnexpectedSnafu { - violated: format!( + err_msg: format!( "Candidate lease expired at {:?} (current time {:?}), key: {:?}", lease.expire_time, lease.current, key ), @@ -464,11 +465,11 @@ impl Election for PgElection { .query(&self.sql_set.campaign, &[]) .await?; let row = res.first().context(UnexpectedSnafu { - violated: "Failed to get the result of acquiring advisory lock", + err_msg: "Failed to get the result of acquiring advisory lock".to_string(), })?; let is_leader = row.try_get(0).map_err(|_| { UnexpectedSnafu { - violated: "Failed to get the result of get lock", + err_msg: "Failed to get the result of get lock".to_string(), } .build() })?; @@ -500,10 +501,10 @@ impl Election for PgElection { } else { let key = self.election_key(); if let Some(lease) = self.get_value_with_lease(&key).await? 
{ - ensure!(lease.expire_time > lease.current, NoLeaderSnafu); + ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu); Ok(lease.leader_value.as_bytes().into()) } else { - NoLeaderSnafu.fail() + ElectionNoLeaderSnafu.fail() } } } @@ -537,7 +538,7 @@ impl PgElection { let current_time = match Timestamp::from_str(current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -576,7 +577,7 @@ impl PgElection { current = match Timestamp::from_str(current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -613,7 +614,7 @@ impl PgElection { ensure!( res == 1, UnexpectedSnafu { - violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)), + err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)), } ); @@ -742,9 +743,9 @@ impl PgElection { let lease = self .get_value_with_lease(&key) .await? 
- .context(NoLeaderSnafu)?; + .context(ElectionNoLeaderSnafu)?; // Case 2 - ensure!(lease.expire_time > lease.current, NoLeaderSnafu); + ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu); // Case 3 Ok(()) } @@ -831,11 +832,11 @@ mod tests { use std::assert_matches::assert_matches; use std::env; - use common_meta::maybe_skip_postgres_integration_test; + use deadpool_postgres::{Config, Runtime}; + use tokio_postgres::NoTls; use super::*; - use crate::error; - use crate::utils::postgres::create_postgres_pool; + use crate::{error, maybe_skip_postgres_integration_test}; async fn create_postgres_client( table_name: Option<&str>, @@ -846,11 +847,13 @@ mod tests { let endpoint = env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default(); if endpoint.is_empty() { return UnexpectedSnafu { - violated: "Postgres endpoint is empty".to_string(), + err_msg: "Postgres endpoint is empty".to_string(), } .fail(); } - let pool = create_postgres_pool(&[endpoint], None, None).await.unwrap(); + let mut cfg = Config::new(); + cfg.url = Some(endpoint); + let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).unwrap(); let mut pg_client = ElectionPgClient::new( pool, execution_timeout, diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs index b9fcbd6188..05b5af393b 100644 --- a/src/common/meta/src/error.rs +++ b/src/common/meta/src/error.rs @@ -338,6 +338,24 @@ pub enum Error { location: Location, }, + #[snafu(display("Metasrv election has no leader at this moment"))] + ElectionNoLeader { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Metasrv election leader lease expired"))] + ElectionLeaderLeaseExpired { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Metasrv election leader lease changed during election"))] + ElectionLeaderLeaseChanged { + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Table already exists, table: {}", table_name))] TableAlreadyExists { table_name: String, @@ -751,6 +769,15 @@ 
pub enum Error { location: Location, }, + #[cfg(feature = "pg_kvbackend")] + #[snafu(display("Failed to get Postgres client"))] + GetPostgresClient { + #[snafu(source)] + error: deadpool::managed::PoolError, + #[snafu(implicit)] + location: Location, + }, + #[cfg(feature = "pg_kvbackend")] #[snafu(display("Failed to {} Postgres transaction", operation))] PostgresTransaction { @@ -805,6 +832,24 @@ pub enum Error { location: Location, }, + #[cfg(feature = "mysql_kvbackend")] + #[snafu(display("Failed to decode sql value"))] + DecodeSqlValue { + #[snafu(source)] + error: sqlx::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[cfg(feature = "mysql_kvbackend")] + #[snafu(display("Failed to acquire mysql client from pool"))] + AcquireMySqlClient { + #[snafu(source)] + error: sqlx::Error, + #[snafu(implicit)] + location: Location, + }, + #[cfg(feature = "mysql_kvbackend")] #[snafu(display("Failed to {} MySql transaction", operation))] MySqlTransaction { @@ -822,6 +867,15 @@ pub enum Error { location: Location, }, + #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] + #[snafu(display("Sql execution timeout, sql: {}, duration: {:?}", sql, duration))] + SqlExecutionTimeout { + sql: String, + duration: std::time::Duration, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Datanode table info not found, table id: {}, datanode id: {}", table_id, @@ -1075,7 +1129,10 @@ impl ErrorExt for Error { | GetCache { .. } | GetLatestCacheRetryExceeded { .. } | SerializeToJson { .. } - | DeserializeFromJson { .. } => StatusCode::Internal, + | DeserializeFromJson { .. } + | ElectionNoLeader { .. } + | ElectionLeaderLeaseExpired { .. } + | ElectionLeaderLeaseChanged { .. } => StatusCode::Internal, NoLeader { .. } => StatusCode::TableUnavailable, ValueNotExist { .. } @@ -1198,15 +1255,18 @@ impl ErrorExt for Error { PostgresExecution { .. } | CreatePostgresPool { .. } | GetPostgresConnection { .. } + | GetPostgresClient { .. 
} | PostgresTransaction { .. } | PostgresTlsConfig { .. } | InvalidTlsConfig { .. } => StatusCode::Internal, #[cfg(feature = "mysql_kvbackend")] - MySqlExecution { .. } | CreateMySqlPool { .. } | MySqlTransaction { .. } => { - StatusCode::Internal - } + MySqlExecution { .. } + | CreateMySqlPool { .. } + | DecodeSqlValue { .. } + | AcquireMySqlClient { .. } + | MySqlTransaction { .. } => StatusCode::Internal, #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] - RdsTransactionRetryFailed { .. } => StatusCode::Internal, + RdsTransactionRetryFailed { .. } | SqlExecutionTimeout { .. } => StatusCode::Internal, DatanodeTableInfoNotFound { .. } => StatusCode::Internal, } } diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs index 93cd229b16..36aae1026e 100644 --- a/src/common/meta/src/lib.rs +++ b/src/common/meta/src/lib.rs @@ -22,6 +22,7 @@ pub mod datanode; pub mod ddl; pub mod ddl_manager; pub mod distributed_time_constants; +pub mod election; pub mod error; pub mod flow_name; pub mod heartbeat; diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index 2cfe7d2f7d..eadb7cdc75 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -24,6 +24,8 @@ use common_base::Plugins; use common_config::Configurable; #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] use common_meta::distributed_time_constants::META_LEASE_SECS; +use common_meta::election::CANDIDATE_LEASE_SECS; +use common_meta::election::etcd::EtcdElection; use common_meta::kv_backend::chroot::ChrootKvBackend; use common_meta::kv_backend::etcd::EtcdStore; use common_meta::kv_backend::memory::MemoryKvBackend; @@ -42,9 +44,6 @@ use tonic::codec::CompressionEncoding; use tonic::transport::server::{Router, TcpIncoming}; use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef}; -#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] -use crate::election::CANDIDATE_LEASE_SECS; -use 
crate::election::etcd::EtcdElection; use crate::error::OtherSnafu; use crate::metasrv::builder::MetasrvBuilder; use crate::metasrv::{ @@ -281,7 +280,8 @@ pub async fn metasrv_builder( etcd_client, opts.store_key_prefix.clone(), ) - .await?; + .await + .context(error::KvBackendSnafu)?; (kv_backend, Some(election)) } @@ -290,10 +290,10 @@ pub async fn metasrv_builder( use std::time::Duration; use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS; + use common_meta::election::rds::postgres::{ElectionPgClient, PgElection}; use common_meta::kv_backend::rds::PgStore; use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod}; - use crate::election::rds::postgres::{ElectionPgClient, PgElection}; use crate::utils::postgres::create_postgres_pool; let candidate_lease_ttl = Duration::from_secs(CANDIDATE_LEASE_SECS); @@ -321,7 +321,8 @@ pub async fn metasrv_builder( execution_timeout, idle_session_timeout, statement_timeout, - )?; + ) + .context(error::KvBackendSnafu)?; let election = PgElection::with_pg_client( opts.grpc.server_addr.clone(), election_client, @@ -332,7 +333,8 @@ pub async fn metasrv_builder( &opts.meta_table_name, opts.meta_election_lock_id, ) - .await?; + .await + .context(error::KvBackendSnafu)?; let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone()) .await?; @@ -352,9 +354,9 @@ pub async fn metasrv_builder( (None, BackendImpl::MysqlStore) => { use std::time::Duration; + use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection}; use common_meta::kv_backend::rds::MySqlStore; - use crate::election::rds::mysql::{ElectionMysqlClient, MySqlElection}; use crate::utils::mysql::create_mysql_pool; let pool = create_mysql_pool(&opts.store_addrs, opts.backend_tls.as_ref()).await?; @@ -389,7 +391,8 @@ pub async fn metasrv_builder( meta_lease_ttl, &election_table_name, ) - .await?; + .await + .context(error::KvBackendSnafu)?; (kv_backend, Some(election)) } }; diff --git a/src/meta-srv/src/cluster.rs 
b/src/meta-srv/src/cluster.rs index 35b15b3b29..ef3ba07702 100644 --- a/src/meta-srv/src/cluster.rs +++ b/src/meta-srv/src/cluster.rs @@ -247,7 +247,7 @@ impl MetaPeerClient { // Safety: when self.is_leader() == false, election must not empty. let election = self.election.as_ref().unwrap(); - let leader_addr = election.leader().await?.0; + let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0; let channel = self .channel_manager @@ -279,7 +279,7 @@ impl MetaPeerClient { // Safety: when self.is_leader() == false, election must not empty. let election = self.election.as_ref().unwrap(); - let leader_addr = election.leader().await?.0; + let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0; let channel = self .channel_manager diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs index c67bc32b40..0e87d4421a 100644 --- a/src/meta-srv/src/lib.rs +++ b/src/meta-srv/src/lib.rs @@ -21,7 +21,6 @@ pub mod bootstrap; pub mod cache_invalidator; pub mod cluster; pub mod discovery; -pub mod election; pub mod error; pub mod events; mod failure_detector; diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 165efd0555..a1515d897e 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -32,6 +32,8 @@ use common_meta::ddl_manager::DdlManagerRef; use common_meta::distributed_time_constants::{ self, BASE_HEARTBEAT_INTERVAL, default_distributed_time_constants, frontend_heartbeat_interval, }; +use common_meta::election::LeaderChangeMessage; +pub use common_meta::election::{ElectionRef, MetasrvNodeInfo}; use common_meta::key::TableMetadataManagerRef; use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef}; @@ -64,7 +66,6 @@ use tokio::sync::broadcast::error::RecvError; use crate::cluster::MetaPeerClientRef; use crate::discovery; -use crate::election::{Election, LeaderChangeMessage}; use 
crate::error::{ self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu, StartTelemetryTaskSnafu, StopProcedureManagerSnafu, @@ -459,76 +460,6 @@ impl Context { } } -/// The value of the leader. It is used to store the leader's address. -pub struct LeaderValue(pub String); - -impl> From for LeaderValue { - fn from(value: T) -> Self { - let string = String::from_utf8_lossy(value.as_ref()); - Self(string.to_string()) - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetasrvNodeInfo { - // The metasrv's address - pub addr: String, - // The node build version - pub version: String, - // The node build git commit hash - pub git_commit: String, - // The node start timestamp in milliseconds - pub start_time_ms: u64, - // The node total cpu millicores - #[serde(default)] - pub total_cpu_millicores: i64, - // The node total memory bytes - #[serde(default)] - pub total_memory_bytes: i64, - /// The node build cpu usage millicores - #[serde(default)] - pub cpu_usage_millicores: i64, - /// The node build memory usage bytes - #[serde(default)] - pub memory_usage_bytes: i64, - // The node hostname - #[serde(default)] - pub hostname: String, -} - -// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto. -#[allow(deprecated)] -impl From for api::v1::meta::MetasrvNodeInfo { - fn from(node_info: MetasrvNodeInfo) -> Self { - Self { - peer: Some(api::v1::meta::Peer { - addr: node_info.addr, - ..Default::default() - }), - // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version. - // New code should use the fields in `info.NodeInfo` instead. 
- version: node_info.version.clone(), - git_commit: node_info.git_commit.clone(), - start_time_ms: node_info.start_time_ms, - cpus: node_info.total_cpu_millicores as u32, - memory_bytes: node_info.total_memory_bytes as u64, - // The canonical location for node information. - info: Some(api::v1::meta::NodeInfo { - version: node_info.version, - git_commit: node_info.git_commit, - start_time_ms: node_info.start_time_ms, - total_cpu_millicores: node_info.total_cpu_millicores, - total_memory_bytes: node_info.total_memory_bytes, - cpu_usage_millicores: node_info.cpu_usage_millicores, - memory_usage_bytes: node_info.memory_usage_bytes, - cpus: node_info.total_cpu_millicores as u32, - memory_bytes: node_info.total_memory_bytes as u64, - hostname: node_info.hostname, - }), - } - } -} - #[derive(Clone, Copy)] pub enum SelectTarget { Datanode, @@ -552,7 +483,6 @@ pub struct SelectorContext { pub type SelectorRef = Arc>>; pub type RegionStatAwareSelectorRef = Arc>>; -pub type ElectionRef = Arc>; pub struct MetaStateHandler { subscribe_manager: Option, diff --git a/src/meta-srv/src/service/admin/leader.rs b/src/meta-srv/src/service/admin/leader.rs index 1fadb4a3ef..17329e7b47 100644 --- a/src/meta-srv/src/service/admin/leader.rs +++ b/src/meta-srv/src/service/admin/leader.rs @@ -32,7 +32,7 @@ pub struct LeaderHandler { impl LeaderHandler { async fn get_leader(&self) -> Result> { if let Some(election) = &self.election { - let leader_addr = election.leader().await?.0; + let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0; return Ok(Some(leader_addr)); } Ok(None) diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs index 5c0ae4c71f..366a8aa5fb 100644 --- a/src/meta-srv/src/service/cluster.rs +++ b/src/meta-srv/src/service/cluster.rs @@ -63,7 +63,10 @@ impl cluster_server::Cluster for Metasrv { let leader_addr = &self.options().grpc.server_addr; let (leader, followers) = match self.election() { Some(election) => { - let 
nodes = election.all_candidates().await?; + let nodes = election + .all_candidates() + .await + .context(error::KvBackendSnafu)?; let followers = nodes .into_iter() .filter(|node_info| &node_info.addr != leader_addr) diff --git a/src/meta-srv/src/service/heartbeat.rs b/src/meta-srv/src/service/heartbeat.rs index e09073546a..238ed99df2 100644 --- a/src/meta-srv/src/service/heartbeat.rs +++ b/src/meta-srv/src/service/heartbeat.rs @@ -23,7 +23,7 @@ use api::v1::meta::{ use common_telemetry::{debug, error, info, warn}; use futures::StreamExt; use once_cell::sync::OnceCell; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; use tokio::sync::mpsc; use tokio::sync::mpsc::Sender; use tokio_stream::wrappers::ReceiverStream; @@ -148,7 +148,7 @@ async fn handle_ask_leader(_req: AskLeaderRequest, ctx: Context) -> Result ctx.server_addr, From 13cdfa9b59a2d2b7a1f166c3993f50597807a368 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 24 Mar 2026 20:16:38 +0800 Subject: [PATCH 035/195] fix: update 8-bit int to smallint in postgres (#7854) --- src/servers/src/postgres/types.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs index a95890e78c..d4d15ef64a 100644 --- a/src/servers/src/postgres/types.rs +++ b/src/servers/src/postgres/types.rs @@ -235,7 +235,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result { match origin { &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN), &ConcreteDataType::Boolean(_) => Ok(Type::BOOL), - &ConcreteDataType::Int8(_) => Ok(Type::CHAR), + &ConcreteDataType::Int8(_) => Ok(Type::INT2), &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2), &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4), &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8), @@ -253,7 +253,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result { ConcreteDataType::List(list) => match 
list.item_type() { &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN), &ConcreteDataType::Boolean(_) => Ok(Type::BOOL_ARRAY), - &ConcreteDataType::Int8(_) => Ok(Type::CHAR_ARRAY), + &ConcreteDataType::Int8(_) => Ok(Type::INT2_ARRAY), &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2_ARRAY), &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4_ARRAY), &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8_ARRAY), @@ -1151,7 +1151,7 @@ mod test { let pg_field_info = vec![ FieldInfo::new("nulls".into(), None, None, Type::UNKNOWN, FieldFormat::Text), FieldInfo::new("bools".into(), None, None, Type::BOOL, FieldFormat::Text), - FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text), + FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text), FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text), @@ -1230,7 +1230,7 @@ mod test { Type::NUMERIC, FieldFormat::Text, ), - FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text), + FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text), FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text), From 04aa84af62640df1e2480ca671e3468649b99df2 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Wed, 25 Mar 2026 11:10:19 +0800 Subject: [PATCH 036/195] feat: use ArrowReaderBuilder instead of the RowGroups API (#7853) * feat: use ArrowReaderBuilder instead of the RowGroups API Signed-off-by: evenyag * refactor: make row_group_idx required Signed-off-by: evenyag * chore: remove unsed variant Signed-off-by: evenyag * fix: collect total_fetch_elapsed metrics Signed-off-by: 
evenyag --------- Signed-off-by: evenyag --- src/mito2/src/error.rs | 10 - src/mito2/src/memtable/bulk.rs | 1 + src/mito2/src/memtable/bulk/chunk_reader.rs | 65 +++ src/mito2/src/memtable/bulk/part_reader.rs | 1 - .../src/memtable/bulk/row_group_reader.rs | 152 +----- src/mito2/src/read/last_row.rs | 8 +- src/mito2/src/read/prune.rs | 21 +- src/mito2/src/read/scan_util.rs | 2 +- src/mito2/src/sst/parquet.rs | 1 + src/mito2/src/sst/parquet/async_reader.rs | 221 ++++++++ src/mito2/src/sst/parquet/reader.rs | 176 ++++--- src/mito2/src/sst/parquet/row_group.rs | 470 +----------------- 12 files changed, 423 insertions(+), 705 deletions(-) create mode 100644 src/mito2/src/memtable/bulk/chunk_reader.rs create mode 100644 src/mito2/src/sst/parquet/async_reader.rs diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 923d8a2713..c6b69fe607 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -616,15 +616,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to read arrow record batch from parquet file {}", path))] - ArrowReader { - path: String, - #[snafu(source)] - error: ArrowError, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Column not found, column: {column}"))] ColumnNotFound { column: String, @@ -1349,7 +1340,6 @@ impl ErrorExt for Error { RegionState { .. } | UpdateManifest { .. } => StatusCode::RegionNotReady, JsonOptions { .. } => StatusCode::InvalidArguments, EmptyRegionDir { .. } | EmptyManifestDir { .. } => StatusCode::RegionNotFound, - ArrowReader { .. } => StatusCode::StorageUnavailable, ConvertValue { source, .. } => source.status_code(), ApplyBloomFilterIndex { source, .. } => source.status_code(), InvalidPartitionExpr { source, .. } => source.status_code(), diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index e649681b76..502b61759d 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -14,6 +14,7 @@ //! 
Memtable implementation for bulk load +pub(crate) mod chunk_reader; #[allow(unused)] pub mod context; #[allow(unused)] diff --git a/src/mito2/src/memtable/bulk/chunk_reader.rs b/src/mito2/src/memtable/bulk/chunk_reader.rs new file mode 100644 index 0000000000..e632cd1b37 --- /dev/null +++ b/src/mito2/src/memtable/bulk/chunk_reader.rs @@ -0,0 +1,65 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! ChunkReader implementation for in-memory parquet bytes. + +use std::io::Cursor; + +use bytes::Bytes; +use parquet::errors::{ParquetError, Result}; +use parquet::file::reader::{ChunkReader, Length}; + +/// A [ChunkReader] implementation for in-memory parquet bytes. +/// +/// This provides byte access to parquet data stored in memory (Bytes), +/// used for reading parquet data from bulk memtable. +#[derive(Clone)] +pub struct MemtableChunkReader { + /// The in-memory parquet data. + data: Bytes, +} + +impl MemtableChunkReader { + /// Creates a new [MemtableChunkReader] from the given bytes. 
+ pub fn new(data: Bytes) -> Self { + Self { data } + } +} + +impl Length for MemtableChunkReader { + fn len(&self) -> u64 { + self.data.len() as u64 + } +} + +impl ChunkReader for MemtableChunkReader { + type T = Cursor; + + fn get_read(&self, start: u64) -> Result { + let start = start as usize; + if start > self.data.len() { + return Err(ParquetError::IndexOutOfBound(start, self.data.len())); + } + Ok(Cursor::new(self.data.slice(start..))) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + let start = start as usize; + let end = start + length; + if end > self.data.len() { + return Err(ParquetError::IndexOutOfBound(end, self.data.len())); + } + Ok(self.data.slice(start..end)) + } +} diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs index 904aae8c90..edb9ff52d9 100644 --- a/src/mito2/src/memtable/bulk/part_reader.rs +++ b/src/mito2/src/memtable/bulk/part_reader.rs @@ -30,7 +30,6 @@ use crate::memtable::{MemScanMetrics, MemScanMetricsData}; use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED}; use crate::sst::parquet::file_range::{PreFilterMode, TagDecodeState}; use crate::sst::parquet::flat_format::sequence_column_index; -use crate::sst::parquet::reader::RowGroupReaderContext; /// Iterator for reading data inside a bulk part. pub struct EncodedBulkPartIter { diff --git a/src/mito2/src/memtable/bulk/row_group_reader.rs b/src/mito2/src/memtable/bulk/row_group_reader.rs index fccd22db10..40a5b2f85d 100644 --- a/src/mito2/src/memtable/bulk/row_group_reader.rs +++ b/src/mito2/src/memtable/bulk/row_group_reader.rs @@ -12,124 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::ops::Range; use std::sync::Arc; use bytes::Bytes; -use datatypes::arrow::array::RecordBatch; -use datatypes::arrow::error::ArrowError; -use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection}; -use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels}; -use parquet::column::page::{PageIterator, PageReader}; -use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, + ParquetRecordBatchReaderBuilder, RowSelection, +}; +use parquet::file::metadata::ParquetMetaData; use snafu::ResultExt; use crate::error; use crate::error::ReadDataPartSnafu; +use crate::memtable::bulk::chunk_reader::MemtableChunkReader; use crate::memtable::bulk::context::BulkIterContextRef; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; -use crate::sst::parquet::format::ReadFormat; -use crate::sst::parquet::reader::RowGroupReaderContext; -use crate::sst::parquet::row_group::{ColumnChunkIterator, RowGroupBase}; - -/// Helper for reading specific row group inside Memtable Parquet parts. -// This is similar to [mito2::sst::parquet::row_group::InMemoryRowGroup] since -// it's a workaround for lacking of keyword generics. -pub struct MemtableRowGroupPageFetcher<'a> { - /// Shared structs for reading row group. - base: RowGroupBase<'a>, - bytes: Bytes, -} - -impl<'a> MemtableRowGroupPageFetcher<'a> { - pub(crate) fn create( - row_group_idx: usize, - parquet_meta: &'a ParquetMetaData, - bytes: Bytes, - ) -> Self { - Self { - // the cached `column_uncompressed_pages` would never be used in Memtable readers. - base: RowGroupBase::new(parquet_meta, row_group_idx), - bytes, - } - } - - /// Fetches column pages from memory file. 
- pub(crate) fn fetch(&mut self, projection: &ProjectionMask, selection: Option<&RowSelection>) { - if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) { - // Selection provided. - let (fetch_ranges, page_start_offsets) = - self.base - .calc_sparse_read_ranges(projection, offset_index, selection); - if fetch_ranges.is_empty() { - return; - } - let chunk_data = self.fetch_bytes(&fetch_ranges); - - self.base - .assign_sparse_chunk(projection, chunk_data, page_start_offsets); - } else { - let fetch_ranges = self.base.calc_dense_read_ranges(projection); - if fetch_ranges.is_empty() { - // Nothing to fetch. - return; - } - let chunk_data = self.fetch_bytes(&fetch_ranges); - self.base.assign_dense_chunk(projection, chunk_data); - } - } - - fn fetch_bytes(&self, ranges: &[Range]) -> Vec { - ranges - .iter() - .map(|range| self.bytes.slice(range.start as usize..range.end as usize)) - .collect() - } - - /// Creates a page reader to read column at `i`. - fn column_page_reader(&self, i: usize) -> parquet::errors::Result> { - let reader = self.base.column_reader(i)?; - Ok(Box::new(reader)) - } -} - -impl RowGroups for MemtableRowGroupPageFetcher<'_> { - fn num_rows(&self) -> usize { - self.base.row_count - } - - fn column_chunks(&self, i: usize) -> parquet::errors::Result> { - Ok(Box::new(ColumnChunkIterator { - reader: Some(self.column_page_reader(i)), - })) - } - - fn row_groups(&self) -> Box + '_> { - Box::new(std::iter::once(self.base.row_group_metadata())) - } - - fn metadata(&self) -> &ParquetMetaData { - self.base.parquet_metadata() - } -} - -impl RowGroupReaderContext for BulkIterContextRef { - fn map_result( - &self, - result: Result, ArrowError>, - ) -> error::Result> { - result.context(error::DecodeArrowRowGroupSnafu) - } - - fn read_format(&self) -> &ReadFormat { - self.as_ref().read_format() - } -} pub(crate) struct MemtableRowGroupReaderBuilder { projection: ProjectionMask, parquet_metadata: Arc, - field_levels: FieldLevels, + 
arrow_metadata: ArrowReaderMetadata, data: Bytes, } @@ -140,15 +43,16 @@ impl MemtableRowGroupReaderBuilder { parquet_metadata: Arc, data: Bytes, ) -> error::Result { - let parquet_schema_desc = parquet_metadata.file_metadata().schema_descr(); - let hint = Some(context.read_format().arrow_schema().fields()); - let field_levels = - parquet_to_arrow_field_levels(parquet_schema_desc, projection.clone(), hint) + // Create ArrowReaderMetadata for building the reader. + let arrow_reader_options = + ArrowReaderOptions::new().with_schema(context.read_format().arrow_schema().clone()); + let arrow_metadata = + ArrowReaderMetadata::try_new(parquet_metadata.clone(), arrow_reader_options) .context(ReadDataPartSnafu)?; Ok(Self { projection, parquet_metadata, - field_levels, + arrow_metadata, data, }) } @@ -159,23 +63,21 @@ impl MemtableRowGroupReaderBuilder { row_group_idx: usize, row_selection: Option, ) -> error::Result { - let mut row_group = MemtableRowGroupPageFetcher::create( - row_group_idx, - &self.parquet_metadata, - self.data.clone(), - ); - // Fetches data from memory part. Currently, row selection is not supported. - row_group.fetch(&self.projection, row_selection.as_ref()); + let chunk_reader = MemtableChunkReader::new(self.data.clone()); - // Builds the parquet reader. - // Now the row selection is None. - ParquetRecordBatchReader::try_new_with_row_groups( - &self.field_levels, - &row_group, - DEFAULT_READ_BATCH_SIZE, - row_selection, + let mut builder = ParquetRecordBatchReaderBuilder::new_with_metadata( + chunk_reader, + self.arrow_metadata.clone(), ) - .context(ReadDataPartSnafu) + .with_row_groups(vec![row_group_idx]) + .with_projection(self.projection.clone()) + .with_batch_size(DEFAULT_READ_BATCH_SIZE); + + if let Some(selection) = row_selection { + builder = builder.with_row_selection(selection); + } + + builder.build().context(ReadDataPartSnafu) } /// Computes whether to skip field filters for a specific row group based on PreFilterMode. 
diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index 0c13c120a0..1dc4102311 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -333,10 +333,10 @@ impl FlatRowGroupLastRowCachedReader { } /// Returns the next RecordBatch. - pub(crate) fn next_batch(&mut self) -> Result> { + pub(crate) async fn next_batch(&mut self) -> Result> { match self { FlatRowGroupLastRowCachedReader::Hit(r) => r.next_batch(), - FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch(), + FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch().await, } } @@ -466,12 +466,12 @@ impl FlatRowGroupLastRowReader { Ok(Some(merged)) } - fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { if self.pending.is_full() { return self.flush_pending(); } - while let Some(batch) = self.reader.next_batch()? { + while let Some(batch) = self.reader.next_batch().await? { self.selector.on_next(batch, &mut self.pending)?; if self.pending.is_full() { return self.flush_pending(); diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index 2f9fa002d4..6766bf3f38 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -247,10 +247,10 @@ pub enum FlatSource { } impl FlatSource { - fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { match self { - FlatSource::RowGroup(r) => r.next_batch(), - FlatSource::LastRow(r) => r.next_batch(), + FlatSource::RowGroup(r) => r.next_batch().await, + FlatSource::LastRow(r) => r.next_batch().await, } } } @@ -297,13 +297,16 @@ impl FlatPruneReader { self.metrics.clone() } - pub(crate) fn next_batch(&mut self) -> Result> { - while let Some(record_batch) = { + pub(crate) async fn next_batch(&mut self) -> Result> { + loop { let start = std::time::Instant::now(); - let batch = self.source.next_batch()?; + let batch = self.source.next_batch().await?; self.metrics.scan_cost += start.elapsed(); - batch - } { + + let 
Some(record_batch) = batch else { + return Ok(None); + }; + // Update metrics for the received batch self.metrics.num_rows += record_batch.num_rows(); self.metrics.num_batches += 1; @@ -317,8 +320,6 @@ impl FlatPruneReader { } } } - - Ok(None) } /// Prunes batches by the pushed down predicate and returns RecordBatch. diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 6f68616709..9bf1c17276 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -1533,7 +1533,7 @@ pub fn build_flat_file_range_scan_stream( .transpose()?; let mapper = range.compaction_projection_mapper(); - while let Some(record_batch) = reader.next_batch()? { + while let Some(record_batch) = reader.next_batch().await? { let record_batch = if let Some(mapper) = mapper { let batch = mapper.project(record_batch)?; batch diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index fb8e1d1fc2..79a08a209d 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -24,6 +24,7 @@ use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; use crate::sst::file::FileTimeRange; use crate::sst::index::IndexOutput; +pub(crate) mod async_reader; pub mod file_range; pub mod flat_format; pub mod format; diff --git a/src/mito2/src/sst/parquet/async_reader.rs b/src/mito2/src/sst/parquet/async_reader.rs new file mode 100644 index 0000000000..a060fd367d --- /dev/null +++ b/src/mito2/src/sst/parquet/async_reader.rs @@ -0,0 +1,221 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Async file reader implementation for SST parquet files. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use futures::FutureExt; +use futures::future::BoxFuture; +use object_store::ObjectStore; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::errors::{ParquetError, Result as ParquetResult}; +use parquet::file::metadata::ParquetMetaData; + +use crate::cache::file_cache::{FileType, IndexKey}; +use crate::cache::{CacheStrategy, PageKey, PageValue}; +use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES}; +use crate::sst::file::RegionFileId; +use crate::sst::parquet::helper::fetch_byte_ranges; +use crate::sst::parquet::row_group::{ParquetFetchMetrics, compute_total_range_size}; + +/// An [AsyncFileReader] implementation for SST parquet files. +/// +/// This reader provides async byte access to parquet data in object storage, +/// with caching support (page cache and write cache). +pub struct SstAsyncFileReader { + /// Region file ID for cache key. + region_file_id: RegionFileId, + /// Path to the parquet file in object storage. + file_path: String, + /// Object store for reading data. + object_store: ObjectStore, + /// Cache strategy for reading pages. + cache_strategy: CacheStrategy, + /// Cached parquet metadata. + metadata: Arc, + /// Row group index for cache key. + row_group_idx: usize, + /// Optional metrics for tracking fetch operations. + fetch_metrics: Option, +} + +impl SstAsyncFileReader { + /// Creates a new [SstAsyncFileReader]. + pub fn new( + region_file_id: RegionFileId, + file_path: String, + object_store: ObjectStore, + cache_strategy: CacheStrategy, + metadata: Arc, + row_group_idx: usize, + ) -> Self { + Self { + region_file_id, + file_path, + object_store, + cache_strategy, + metadata, + row_group_idx, + fetch_metrics: None, + } + } + + /// Sets the fetch metrics. 
+ pub fn with_fetch_metrics(mut self, metrics: Option) -> Self { + self.fetch_metrics = metrics; + self + } + + /// Fetches byte ranges from page cache, write cache, or object store. + async fn fetch_bytes_with_cache(&self, ranges: Vec>) -> ParquetResult> { + let fetch_start = self + .fetch_metrics + .as_ref() + .map(|_| std::time::Instant::now()); + let _timer = READ_STAGE_FETCH_PAGES.start_timer(); + + let page_key = PageKey::new( + self.region_file_id.file_id(), + self.row_group_idx, + ranges.clone(), + ); + + // Check page cache first. + if let Some(pages) = self.cache_strategy.get_pages(&page_key) { + if let Some(metrics) = &self.fetch_metrics { + let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.page_cache_hit += 1; + metrics_data.pages_to_fetch_mem += ranges.len(); + metrics_data.page_size_to_fetch_mem += total_size; + metrics_data.page_size_needed += total_size; + if let Some(start) = fetch_start { + metrics_data.total_fetch_elapsed += start.elapsed(); + } + } + return Ok(pages.compressed.clone()); + } + + // Calculate total range size for metrics. + let (total_range_size, unaligned_size) = compute_total_range_size(&ranges); + + // Check write cache. 
+ let key = IndexKey::new( + self.region_file_id.region_id(), + self.region_file_id.file_id(), + FileType::Parquet, + ); + let fetch_write_cache_start = self + .fetch_metrics + .as_ref() + .map(|_| std::time::Instant::now()); + let write_cache_result = self.fetch_ranges_from_write_cache(key, &ranges).await; + + let pages = match write_cache_result { + Some(data) => { + if let Some(metrics) = &self.fetch_metrics { + let elapsed = fetch_write_cache_start + .map(|start| start.elapsed()) + .unwrap_or_default(); + let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.write_cache_fetch_elapsed += elapsed; + metrics_data.write_cache_hit += 1; + metrics_data.pages_to_fetch_write_cache += ranges.len(); + metrics_data.page_size_to_fetch_write_cache += unaligned_size; + metrics_data.page_size_needed += range_size_needed; + } + data + } + None => { + // Fetch data from object store. + let _timer = READ_STAGE_ELAPSED + .with_label_values(&["cache_miss_read"]) + .start_timer(); + + let start = self + .fetch_metrics + .as_ref() + .map(|_| std::time::Instant::now()); + let data = fetch_byte_ranges(&self.file_path, self.object_store.clone(), &ranges) + .await + .map_err(|e| ParquetError::External(Box::new(e)))?; + + if let Some(metrics) = &self.fetch_metrics { + let elapsed = start.map(|start| start.elapsed()).unwrap_or_default(); + let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.store_fetch_elapsed += elapsed; + metrics_data.cache_miss += 1; + metrics_data.pages_to_fetch_store += ranges.len(); + metrics_data.page_size_to_fetch_store += unaligned_size; + metrics_data.page_size_needed += range_size_needed; + } + data + } + }; + + // Put pages back to the cache. 
+ let page_value = PageValue::new(pages.clone(), total_range_size); + self.cache_strategy + .put_pages(page_key, Arc::new(page_value)); + + if let (Some(metrics), Some(start)) = (&self.fetch_metrics, fetch_start) { + metrics.data.lock().unwrap().total_fetch_elapsed += start.elapsed(); + } + + Ok(pages) + } + + /// Fetches data from write cache. + /// Returns `None` if the data is not in the cache. + async fn fetch_ranges_from_write_cache( + &self, + key: IndexKey, + ranges: &[Range], + ) -> Option> { + if let Some(cache) = self.cache_strategy.write_cache() { + return cache.file_cache().read_ranges(key, ranges).await; + } + None + } +} + +impl AsyncFileReader for SstAsyncFileReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, ParquetResult> { + async move { + let mut result = self.fetch_bytes_with_cache(vec![range]).await?; + Ok(result.pop().unwrap_or_default()) + } + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, ParquetResult>> { + async move { self.fetch_bytes_with_cache(ranges).await }.boxed() + } + + fn get_metadata( + &mut self, + _options: Option<&parquet::arrow::arrow_reader::ArrowReaderOptions>, + ) -> BoxFuture<'_, ParquetResult>> { + // Metadata is already cached, return it immediately. 
+ std::future::ready(Ok(self.metadata.clone())).boxed() + } +} diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 855204b80e..f152c97075 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -26,14 +26,15 @@ use common_telemetry::{tracing, warn}; use datafusion_expr::Expr; use datatypes::arrow::array::ArrayRef; use datatypes::arrow::datatypes::Field; -use datatypes::arrow::error::ArrowError; use datatypes::arrow::record_batch::RecordBatch; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::DataType; +use futures::StreamExt; use mito_codec::row_converter::build_primary_key_codec; use object_store::ObjectStore; -use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection}; -use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels}; +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions, RowSelection}; +use parquet::arrow::async_reader::{ParquetRecordBatchStream, ParquetRecordBatchStreamBuilder}; use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use partition::expr::PartitionExpr; use snafu::ResultExt; @@ -47,9 +48,7 @@ use crate::cache::index::result_cache::PredicateKey; use crate::cache::{CacheStrategy, CachedSstMeta}; #[cfg(feature = "vector_index")] use crate::error::ApplyVectorIndexSnafu; -use crate::error::{ - ArrowReaderSnafu, ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu, -}; +use crate::error::{ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu}; use crate::metrics::{ PRECISE_FILTER_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_TOTAL, READ_STAGE_ELAPSED, @@ -70,13 +69,14 @@ use crate::sst::index::inverted_index::applier::{ #[cfg(feature = "vector_index")] use crate::sst::index::vector_index::applier::VectorIndexApplierRef; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; +use 
crate::sst::parquet::async_reader::SstAsyncFileReader; use crate::sst::parquet::file_range::{ FileRangeContext, FileRangeContextRef, PartitionFilterContext, PreFilterMode, RangeBase, row_group_contains_delete, }; use crate::sst::parquet::format::{ReadFormat, need_override_sequence}; use crate::sst::parquet::metadata::MetadataLoader; -use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics}; +use crate::sst::parquet::row_group::ParquetFetchMetrics; use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::stats::RowGroupPruningStats; use crate::sst::tag_maybe_to_dictionary_field; @@ -415,6 +415,12 @@ impl ParquetReaderBuilder { .set_override_sequence(self.file_handle.meta_ref().sequence.map(|x| x.get())); } + // Computes the projection mask. + let parquet_schema_desc = parquet_meta.file_metadata().schema_descr(); + let indices = read_format.projection_indices(); + // Now we assumes we don't have nested schemas. + // TODO(yingwen): Revisit this if we introduce nested types such as JSON type. + let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied()); let selection = self .row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics) .await; @@ -446,26 +452,20 @@ impl ParquetReaderBuilder { .map(|meta| meta.schema.clone()) .unwrap_or_else(|| region_meta.schema.clone()); - // Computes the projection mask. - let parquet_schema_desc = parquet_meta.file_metadata().schema_descr(); - let indices = read_format.projection_indices(); - // Now we assumes we don't have nested schemas. - // TODO(yingwen): Revisit this if we introduce nested types such as JSON type. - let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied()); - - // Computes the field levels. 
- let hint = Some(read_format.arrow_schema().fields()); - let field_levels = - parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint) + // Create ArrowReaderMetadata for async stream building. + let arrow_reader_options = + ArrowReaderOptions::new().with_schema(read_format.arrow_schema().clone()); + let arrow_metadata = + ArrowReaderMetadata::try_new(parquet_meta.clone(), arrow_reader_options) .context(ReadDataPartSnafu)?; let reader_builder = RowGroupReaderBuilder { file_handle: self.file_handle.clone(), file_path, parquet_meta, + arrow_metadata, object_store: self.object_store.clone(), projection: projection_mask, - field_levels, cache_strategy: self.cache_strategy.clone(), }; @@ -1640,7 +1640,7 @@ impl ReaderMetrics { } } -/// Builder to build a [ParquetRecordBatchReader] for a row group. +/// Builder to build a [ParquetRecordBatchStream] for a row group. pub(crate) struct RowGroupReaderBuilder { /// SST file to read. /// @@ -1650,12 +1650,12 @@ pub(crate) struct RowGroupReaderBuilder { file_path: String, /// Metadata of the parquet file. parquet_meta: Arc, + /// Arrow reader metadata for building async stream. + arrow_metadata: ArrowReaderMetadata, /// Object store as an Operator. object_store: ObjectStore, /// Projection mask. projection: ProjectionMask, - /// Field levels to read. - field_levels: FieldLevels, /// Cache. cache_strategy: CacheStrategy, } @@ -1679,48 +1679,43 @@ impl RowGroupReaderBuilder { &self.cache_strategy } - /// Builds a [ParquetRecordBatchReader] to read the row group at `row_group_idx`. + /// Builds a [ParquetRecordBatchStream] to read the row group at `row_group_idx`. 
pub(crate) async fn build( &self, row_group_idx: usize, row_selection: Option, fetch_metrics: Option<&ParquetFetchMetrics>, - ) -> Result { - let fetch_start = Instant::now(); - - let mut row_group = InMemoryRowGroup::create( - self.file_handle.region_id(), - self.file_handle.file_id().file_id(), - &self.parquet_meta, - row_group_idx, - self.cache_strategy.clone(), - &self.file_path, + ) -> Result> { + // Create async file reader with caching support. + let async_reader = SstAsyncFileReader::new( + self.file_handle.file_id(), + self.file_path.clone(), self.object_store.clone(), - ); - // Fetches data into memory. - row_group - .fetch(&self.projection, row_selection.as_ref(), fetch_metrics) - .await - .context(ReadParquetSnafu { - path: &self.file_path, - })?; + self.cache_strategy.clone(), + self.parquet_meta.clone(), + row_group_idx, + ) + .with_fetch_metrics(fetch_metrics.cloned()); - // Record total fetch elapsed time. - if let Some(metrics) = fetch_metrics { - metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed(); + // Build the async stream using ArrowReaderBuilder API. + let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata( + async_reader, + self.arrow_metadata.clone(), + ); + builder = builder + .with_row_groups(vec![row_group_idx]) + .with_projection(self.projection.clone()) + .with_batch_size(DEFAULT_READ_BATCH_SIZE); + + if let Some(selection) = row_selection { + builder = builder.with_row_selection(selection); } - // Builds the parquet reader. - // Now the row selection is None. 
- ParquetRecordBatchReader::try_new_with_row_groups( - &self.field_levels, - &row_group, - DEFAULT_READ_BATCH_SIZE, - row_selection, - ) - .context(ReadParquetSnafu { + let stream = builder.build().context(ReadParquetSnafu { path: &self.file_path, - }) + })?; + + Ok(stream) } } @@ -1850,7 +1845,7 @@ impl ParquetReader { pub async fn next_record_batch(&mut self) -> Result> { loop { if let Some(reader) = &mut self.reader { - if let Some(batch) = reader.next_batch()? { + if let Some(batch) = reader.next_batch().await? { return Ok(Some(batch)); } self.reader = None; @@ -1929,27 +1924,19 @@ impl ParquetReader { /// RowGroupReaderContext represents the fields that cannot be shared /// between different `RowGroupReader`s. pub(crate) trait RowGroupReaderContext: Send { - fn map_result( - &self, - result: std::result::Result, ArrowError>, - ) -> Result>; - fn read_format(&self) -> &ReadFormat; + + fn file_path(&self) -> &str; } impl RowGroupReaderContext for FileRangeContextRef { - fn map_result( - &self, - result: std::result::Result, ArrowError>, - ) -> Result> { - result.context(ArrowReaderSnafu { - path: self.file_path(), - }) - } - fn read_format(&self) -> &ReadFormat { self.as_ref().read_format() } + + fn file_path(&self) -> &str { + self.as_ref().file_path() + } } /// [RowGroupReader] that reads from [FileRange]. @@ -1957,8 +1944,11 @@ pub(crate) type RowGroupReader = RowGroupReaderBase; impl RowGroupReader { /// Creates a new reader from file range. - pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self { - Self::create(context, reader) + pub(crate) fn new( + context: FileRangeContextRef, + stream: ParquetRecordBatchStream, + ) -> Self { + Self::create(context, stream) } } @@ -1966,8 +1956,8 @@ impl RowGroupReader { pub(crate) struct RowGroupReaderBase { /// Context of [RowGroupReader] so adapts to different underlying implementation. context: T, - /// Inner parquet reader. 
- reader: ParquetRecordBatchReader, + /// Inner parquet record batch stream. + stream: ParquetRecordBatchStream, /// Buffered batches to return. batches: VecDeque, /// Local scan metrics. @@ -1981,7 +1971,7 @@ where T: RowGroupReaderContext, { /// Creates a new reader to read the primary key format. - pub(crate) fn create(context: T, reader: ParquetRecordBatchReader) -> Self { + pub(crate) fn create(context: T, stream: ParquetRecordBatchStream) -> Self { // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE. let override_sequence = context .read_format() @@ -1990,7 +1980,7 @@ where Self { context, - reader, + stream, batches: VecDeque::new(), metrics: ReaderMetrics::default(), override_sequence, @@ -2007,13 +1997,18 @@ where self.context.read_format() } - /// Tries to fetch next [RecordBatch] from the reader. - fn fetch_next_record_batch(&mut self) -> Result> { - self.context.map_result(self.reader.next().transpose()) + /// Tries to fetch next [RecordBatch] from the stream asynchronously. + async fn fetch_next_record_batch(&mut self) -> Result> { + match self.stream.next().await.transpose() { + Ok(batch) => Ok(batch), + Err(e) => Err(e).context(ReadParquetSnafu { + path: self.context.file_path(), + }), + } } /// Returns the next [Batch]. - pub(crate) fn next_inner(&mut self) -> Result> { + pub(crate) async fn next_inner(&mut self) -> Result> { let scan_start = Instant::now(); if let Some(batch) = self.batches.pop_front() { self.metrics.num_rows += batch.num_rows(); @@ -2023,7 +2018,7 @@ where // We need to fetch next record batch and convert it to batches. while self.batches.is_empty() { - let Some(record_batch) = self.fetch_next_record_batch()? else { + let Some(record_batch) = self.fetch_next_record_batch().await? 
else { self.metrics.scan_cost += scan_start.elapsed(); return Ok(None); }; @@ -2051,10 +2046,10 @@ where #[async_trait::async_trait] impl BatchReader for RowGroupReaderBase where - T: RowGroupReaderContext, + T: RowGroupReaderContext + Send + Sync, { async fn next_batch(&mut self) -> Result> { - self.next_inner() + self.next_inner().await } } @@ -2062,15 +2057,18 @@ where pub(crate) struct FlatRowGroupReader { /// Context for file ranges. context: FileRangeContextRef, - /// Inner parquet reader. - reader: ParquetRecordBatchReader, + /// Inner parquet record batch stream. + stream: ParquetRecordBatchStream, /// Cached sequence array to override sequences. override_sequence: Option, } impl FlatRowGroupReader { /// Creates a new flat reader from file range. - pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self { + pub(crate) fn new( + context: FileRangeContextRef, + stream: ParquetRecordBatchStream, + ) -> Self { // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE. let override_sequence = context .read_format() @@ -2078,16 +2076,16 @@ impl FlatRowGroupReader { Self { context, - reader, + stream, override_sequence, } } /// Returns the next RecordBatch. - pub(crate) fn next_batch(&mut self) -> Result> { - match self.reader.next() { + pub(crate) async fn next_batch(&mut self) -> Result> { + match self.stream.next().await { Some(batch_result) => { - let record_batch = batch_result.context(ArrowReaderSnafu { + let record_batch = batch_result.context(ReadParquetSnafu { path: self.context.file_path(), })?; diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs index 8f3f6c5f62..38ef62c6b8 100644 --- a/src/mito2/src/sst/parquet/row_group.rs +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -12,28 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! 
Ports private structs from [parquet crate](https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/arrow/async_reader/mod.rs#L644-L650). +//! Parquet row group reading utilities. use std::ops::Range; use std::sync::Arc; -use bytes::{Buf, Bytes}; -use object_store::ObjectStore; -use parquet::arrow::ProjectionMask; -use parquet::arrow::arrow_reader::{RowGroups, RowSelection}; -use parquet::column::page::{PageIterator, PageReader}; -use parquet::errors::{ParquetError, Result}; -use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; -use parquet::file::page_index::offset_index::OffsetIndexMetaData; -use parquet::file::reader::{ChunkReader, Length}; -use parquet::file::serialized_reader::SerializedPageReader; -use store_api::storage::{FileId, RegionId}; -use tokio::task::yield_now; - -use crate::cache::file_cache::{FileType, IndexKey}; -use crate::cache::{CacheStrategy, PageKey, PageValue}; -use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES}; -use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges}; +use crate::sst::parquet::helper::MERGE_GAP; /// Inner data for ParquetFetchMetrics. #[derive(Default, Debug, Clone)] @@ -74,9 +58,9 @@ impl ParquetFetchMetricsData { } /// Metrics for tracking page/row group fetch operations. -#[derive(Default)] +#[derive(Default, Clone)] pub struct ParquetFetchMetrics { - pub data: std::sync::Mutex, + pub data: Arc>, } impl std::fmt::Debug for ParquetFetchMetrics { @@ -204,363 +188,12 @@ impl ParquetFetchMetrics { } } -pub(crate) struct RowGroupBase<'a> { - parquet_metadata: &'a ParquetMetaData, - row_group_idx: usize, - pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>, - /// Compressed page of each column. 
- column_chunks: Vec>>, - pub(crate) row_count: usize, -} - -impl<'a> RowGroupBase<'a> { - pub(crate) fn new(parquet_meta: &'a ParquetMetaData, row_group_idx: usize) -> Self { - let metadata = parquet_meta.row_group(row_group_idx); - // `offset_index` is always `None` if we don't set - // [with_page_index()](https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index) - // to `true`. - let offset_index = parquet_meta - .offset_index() - // filter out empty offset indexes (old versions specified Some(vec![]) when no present) - .filter(|index| !index.is_empty()) - .map(|x| x[row_group_idx].as_slice()); - - Self { - parquet_metadata: parquet_meta, - row_group_idx, - offset_index, - column_chunks: vec![None; metadata.columns().len()], - row_count: metadata.num_rows() as usize, - } - } - - pub(crate) fn calc_sparse_read_ranges( - &self, - projection: &ProjectionMask, - offset_index: &[OffsetIndexMetaData], - selection: &RowSelection, - ) -> (Vec>, Vec>) { - // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the - // `RowSelection` - let mut page_start_offsets: Vec> = vec![]; - let ranges = self - .column_chunks - .iter() - .zip(self.row_group_metadata().columns()) - .enumerate() - .filter(|&(idx, (chunk, _chunk_meta))| chunk.is_none() && projection.leaf_included(idx)) - .flat_map(|(idx, (_chunk, chunk_meta))| { - // If the first page does not start at the beginning of the column, - // then we need to also fetch a dictionary page. 
- let mut ranges = vec![]; - let (start, _len) = chunk_meta.byte_range(); - match offset_index[idx].page_locations.first() { - Some(first) if first.offset as u64 != start => { - ranges.push(start..first.offset as u64); - } - _ => (), - } - - ranges.extend( - selection - .scan_ranges(&offset_index[idx].page_locations) - .iter() - .map(|range| range.start..range.end), - ); - page_start_offsets.push(ranges.iter().map(|range| range.start as usize).collect()); - - ranges - }) - .collect::>(); - (ranges, page_start_offsets) - } - - pub(crate) fn assign_sparse_chunk( - &mut self, - projection: &ProjectionMask, - data: Vec, - page_start_offsets: Vec>, - ) { - let mut page_start_offsets = page_start_offsets.into_iter(); - let mut chunk_data = data.into_iter(); - - for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { - if chunk.is_some() || !projection.leaf_included(idx) { - continue; - } - - if let Some(offsets) = page_start_offsets.next() { - let mut chunks = Vec::with_capacity(offsets.len()); - for _ in 0..offsets.len() { - chunks.push(chunk_data.next().unwrap()); - } - - let column = self - .parquet_metadata - .row_group(self.row_group_idx) - .column(idx); - *chunk = Some(Arc::new(ColumnChunkData::Sparse { - length: column.byte_range().1 as usize, - data: offsets.into_iter().zip(chunks).collect(), - })) - } - } - } - - pub(crate) fn calc_dense_read_ranges(&self, projection: &ProjectionMask) -> Vec> { - self.column_chunks - .iter() - .enumerate() - .filter(|&(idx, chunk)| chunk.is_none() && projection.leaf_included(idx)) - .map(|(idx, _chunk)| { - let column = self.row_group_metadata().column(idx); - let (start, length) = column.byte_range(); - start..(start + length) - }) - .collect::>() - } - - /// Assigns compressed chunk binary data to [RowGroupBase::column_chunks] - /// and returns the chunk offset and binary data assigned. 
- pub(crate) fn assign_dense_chunk( - &mut self, - projection: &ProjectionMask, - chunk_data: Vec, - ) { - let mut chunk_data = chunk_data.into_iter(); - - for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { - if chunk.is_some() || !projection.leaf_included(idx) { - continue; - } - - // Get the fetched page. - let Some(data) = chunk_data.next() else { - continue; - }; - - let column = self - .parquet_metadata - .row_group(self.row_group_idx) - .column(idx); - *chunk = Some(Arc::new(ColumnChunkData::Dense { - offset: column.byte_range().0 as usize, - data, - })); - } - } - - /// Create [PageReader] from [RowGroupBase::column_chunks] - pub(crate) fn column_reader( - &self, - col_idx: usize, - ) -> Result> { - let page_reader = match &self.column_chunks[col_idx] { - None => { - return Err(ParquetError::General(format!( - "Invalid column index {col_idx}, column was not fetched" - ))); - } - Some(data) => { - let page_locations = self - .offset_index - // filter out empty offset indexes (old versions specified Some(vec![]) when no present) - .filter(|index| !index.is_empty()) - .map(|index| index[col_idx].page_locations.clone()); - SerializedPageReader::new( - data.clone(), - self.row_group_metadata().column(col_idx), - self.row_count, - page_locations, - )? - } - }; - - Ok(page_reader) - } - - pub(crate) fn parquet_metadata(&self) -> &ParquetMetaData { - self.parquet_metadata - } - - pub(crate) fn row_group_metadata(&self) -> &RowGroupMetaData { - self.parquet_metadata().row_group(self.row_group_idx) - } -} - -/// An in-memory collection of column chunks -pub struct InMemoryRowGroup<'a> { - region_id: RegionId, - file_id: FileId, - row_group_idx: usize, - cache_strategy: CacheStrategy, - file_path: &'a str, - /// Object store. - object_store: ObjectStore, - base: RowGroupBase<'a>, -} - -impl<'a> InMemoryRowGroup<'a> { - /// Creates a new [InMemoryRowGroup] by `row_group_idx`. - /// - /// # Panics - /// Panics if the `row_group_idx` is invalid. 
- pub fn create( - region_id: RegionId, - file_id: FileId, - parquet_meta: &'a ParquetMetaData, - row_group_idx: usize, - cache_strategy: CacheStrategy, - file_path: &'a str, - object_store: ObjectStore, - ) -> Self { - Self { - region_id, - file_id, - row_group_idx, - cache_strategy, - file_path, - object_store, - base: RowGroupBase::new(parquet_meta, row_group_idx), - } - } - - /// Fetches the necessary column data into memory - pub async fn fetch( - &mut self, - projection: &ProjectionMask, - selection: Option<&RowSelection>, - metrics: Option<&ParquetFetchMetrics>, - ) -> Result<()> { - if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) { - let (fetch_ranges, page_start_offsets) = - self.base - .calc_sparse_read_ranges(projection, offset_index, selection); - - let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?; - // Assign sparse chunk data to base. - self.base - .assign_sparse_chunk(projection, chunk_data, page_start_offsets); - } else { - // Release the CPU to avoid blocking the runtime. Since `fetch_pages_from_cache` - // is a synchronous, CPU-bound operation. - yield_now().await; - - // Calculate ranges to read. - let fetch_ranges = self.base.calc_dense_read_ranges(projection); - - if fetch_ranges.is_empty() { - // Nothing to fetch. - return Ok(()); - } - - // Fetch data with ranges - let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?; - - // Assigns fetched data to base. - self.base.assign_dense_chunk(projection, chunk_data); - } - - Ok(()) - } - - /// Try to fetch data from the memory cache or the WriteCache, - /// if not in WriteCache, fetch data from object store directly. - async fn fetch_bytes( - &self, - ranges: &[Range], - metrics: Option<&ParquetFetchMetrics>, - ) -> Result> { - // Now fetch page timer includes the whole time to read pages. 
- let _timer = READ_STAGE_FETCH_PAGES.start_timer(); - - let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec()); - if let Some(pages) = self.cache_strategy.get_pages(&page_key) { - if let Some(metrics) = metrics { - let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - let mut metrics_data = metrics.data.lock().unwrap(); - metrics_data.page_cache_hit += 1; - metrics_data.pages_to_fetch_mem += ranges.len(); - metrics_data.page_size_to_fetch_mem += total_size; - metrics_data.page_size_needed += total_size; - } - return Ok(pages.compressed.clone()); - } - - // Calculate total range size for metrics. - let (total_range_size, unaligned_size) = compute_total_range_size(ranges); - - let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet); - let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now()); - let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await; - let pages = match write_cache_result { - Some(data) => { - if let Some(metrics) = metrics { - let elapsed = fetch_write_cache_start - .map(|start| start.elapsed()) - .unwrap_or_default(); - let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - let mut metrics_data = metrics.data.lock().unwrap(); - metrics_data.write_cache_fetch_elapsed += elapsed; - metrics_data.write_cache_hit += 1; - metrics_data.pages_to_fetch_write_cache += ranges.len(); - metrics_data.page_size_to_fetch_write_cache += unaligned_size; - metrics_data.page_size_needed += range_size_needed; - } - data - } - None => { - // Fetch data from object store. 
- let _timer = READ_STAGE_ELAPSED - .with_label_values(&["cache_miss_read"]) - .start_timer(); - - let start = metrics.map(|_| std::time::Instant::now()); - let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges) - .await - .map_err(|e| ParquetError::External(Box::new(e)))?; - if let Some(metrics) = metrics { - let elapsed = start.map(|start| start.elapsed()).unwrap_or_default(); - let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - let mut metrics_data = metrics.data.lock().unwrap(); - metrics_data.store_fetch_elapsed += elapsed; - metrics_data.cache_miss += 1; - metrics_data.pages_to_fetch_store += ranges.len(); - metrics_data.page_size_to_fetch_store += unaligned_size; - metrics_data.page_size_needed += range_size_needed; - } - data - } - }; - - // Put pages back to the cache. - let page_value = PageValue::new(pages.clone(), total_range_size); - self.cache_strategy - .put_pages(page_key, Arc::new(page_value)); - - Ok(pages) - } - - /// Fetches data from write cache. - /// Returns `None` if the data is not in the cache. - async fn fetch_ranges_from_write_cache( - &self, - key: IndexKey, - ranges: &[Range], - ) -> Option> { - if let Some(cache) = self.cache_strategy.write_cache() { - return cache.file_cache().read_ranges(key, ranges).await; - } - None - } -} - /// Computes the max possible buffer size to read the given `ranges`. 
/// Returns (aligned_size, unaligned_size) where: /// - aligned_size: total size aligned to pooled buffer size /// - unaligned_size: actual total size without alignment // See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192 -fn compute_total_range_size(ranges: &[Range]) -> (u64, u64) { +pub(crate) fn compute_total_range_size(ranges: &[Range]) -> (u64, u64) { if ranges.is_empty() { return (0, 0); } @@ -602,96 +235,3 @@ fn align_to_pooled_buf_size(size: u64) -> u64 { const POOLED_BUF_SIZE: u64 = 2 * 1024 * 1024; size.div_ceil(POOLED_BUF_SIZE) * POOLED_BUF_SIZE } - -impl RowGroups for InMemoryRowGroup<'_> { - fn num_rows(&self) -> usize { - self.base.row_count - } - - fn column_chunks(&self, i: usize) -> Result> { - // Creates a page reader to read column at `i`. - let page_reader = self.base.column_reader(i)?; - - Ok(Box::new(ColumnChunkIterator { - reader: Some(Ok(Box::new(page_reader))), - })) - } - - fn row_groups(&self) -> Box + '_> { - Box::new(std::iter::once(self.base.row_group_metadata())) - } - - fn metadata(&self) -> &ParquetMetaData { - self.base.parquet_metadata() - } -} - -/// An in-memory column chunk -#[derive(Clone)] -pub(crate) enum ColumnChunkData { - /// Column chunk data representing only a subset of data pages - Sparse { - /// Length of the full column chunk - length: usize, - /// Set of data pages included in this sparse chunk. Each element is a tuple - /// of (page offset, page data) - data: Vec<(usize, Bytes)>, - }, - /// Full column chunk and its offset - Dense { offset: usize, data: Bytes }, -} - -impl ColumnChunkData { - fn get(&self, start: u64) -> Result { - match &self { - ColumnChunkData::Sparse { data, .. 
} => data - .binary_search_by_key(&start, |(offset, _)| *offset as u64) - .map(|idx| data[idx].1.clone()) - .map_err(|_| { - ParquetError::General(format!( - "Invalid offset in sparse column chunk data: {start}" - )) - }), - ColumnChunkData::Dense { offset, data } => { - let start = start as usize - *offset; - Ok(data.slice(start..)) - } - } - } -} - -impl Length for ColumnChunkData { - fn len(&self) -> u64 { - match &self { - ColumnChunkData::Sparse { length, .. } => *length as u64, - ColumnChunkData::Dense { data, .. } => data.len() as u64, - } - } -} - -impl ChunkReader for ColumnChunkData { - type T = bytes::buf::Reader; - - fn get_read(&self, start: u64) -> Result { - Ok(self.get(start)?.reader()) - } - - fn get_bytes(&self, start: u64, length: usize) -> Result { - Ok(self.get(start)?.slice(..length)) - } -} - -/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`] -pub(crate) struct ColumnChunkIterator { - pub(crate) reader: Option>>, -} - -impl Iterator for ColumnChunkIterator { - type Item = Result>; - - fn next(&mut self) -> Option { - self.reader.take() - } -} - -impl PageIterator for ColumnChunkIterator {} From 35c5a4adb7c390969d8d42f1ec23300ad14dc90b Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:26:27 +0800 Subject: [PATCH 037/195] fix(mito2): accept post-truncate flush for skip-wal tables (#7858) Allow flush edits with equal entry ids when flushed sequence advances, so close-time flush after truncate still succeeds for skip-wal regions while stale pre-truncate flushes are rejected. Add a regression test for create->truncate->write->close timing. 
Signed-off-by: Lei, HUANG --- src/mito2/src/engine/skip_wal_test.rs | 77 ++++++++++++++++++++++++++- src/mito2/src/region.rs | 17 +++++- 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/src/mito2/src/engine/skip_wal_test.rs b/src/mito2/src/engine/skip_wal_test.rs index d1b38c47fb..c59be6ba2c 100644 --- a/src/mito2/src/engine/skip_wal_test.rs +++ b/src/mito2/src/engine/skip_wal_test.rs @@ -15,7 +15,9 @@ use api::v1::Rows; use common_wal::options::{WAL_OPTIONS_KEY, WalOptions}; use store_api::region_engine::{RegionEngine, RegionRole}; -use store_api::region_request::{RegionCloseRequest, RegionRequest}; +use store_api::region_request::{ + RegionCloseRequest, RegionOpenRequest, RegionRequest, RegionTruncateRequest, +}; use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; @@ -168,3 +170,76 @@ async fn test_close_follower_region_skip_wal() { let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); assert_eq!(0, total_rows); } + +#[tokio::test] +async fn test_close_region_after_truncate_skip_wal() { + common_telemetry::init_default_ut_logging(); + let mut env = TestEnv::with_prefix("close-truncate-skip-wal").await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let mut request = CreateRequestBuilder::new().build(); + let wal_options = WalOptions::Noop; + request.options.insert( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&wal_options).unwrap(), + ); + + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::Truncate(RegionTruncateRequest::All), + ) + .await + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + let version_data = region.version_control.current(); + assert_eq!( + version_data.version.truncated_entry_id, + Some(version_data.last_entry_id) + ); + + let rows = Rows { + schema: rows_schema(&request), + rows: build_rows(0, 
3), + }; + put_rows(&engine, region_id, rows).await; + + let region = engine.get_region(region_id).unwrap(); + assert!(!region.version().memtables.is_empty()); + + engine + .handle_request(region_id, RegionRequest::Close(RegionCloseRequest {})) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir: request.table_dir, + path_type: store_api::region_request::PathType::Bare, + options: request.options, + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + let stream = engine + .scan_to_stream(region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = common_recordbatch::RecordBatches::try_collect(stream) + .await + .unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(3, total_rows); +} diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index de8927c4de..3020c9ecf4 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -973,8 +973,23 @@ impl ManifestContext { // This is an edit from flush. if let Some(flushed_entry_id) = edit.flushed_entry_id { + // A flush edit is valid after truncate in two cases: + // 1. `flushed_entry_id` moves past `truncated_entry_id`, meaning it definitely + // flushed data newer than the truncate point. + // 2. `flushed_entry_id` equals `truncated_entry_id`, but `flushed_sequence` + // increases. This happens in skip-WAL tables where entry id can stay at 0, + // while sequence still advances for post-truncate writes. + // + // We still reject stale flushes from before truncate: + // if entry id is equal and sequence does not advance, the flush is outdated. 
+ let is_newer_entry = truncated_entry_id < flushed_entry_id; + let is_same_entry_with_newer_sequence = truncated_entry_id == flushed_entry_id + && edit.flushed_sequence.is_some_and(|flushed_sequence| { + manifest.flushed_sequence < flushed_sequence + }); + ensure!( - truncated_entry_id < flushed_entry_id, + is_newer_entry || is_same_entry_with_newer_sequence, RegionTruncatedSnafu { region_id: manifest.metadata.region_id, } From ec9d57cecc098b72a2382dd4de3817bb18fbdc12 Mon Sep 17 00:00:00 2001 From: Boudewijn van Groos Date: Wed, 25 Mar 2026 18:58:45 +0100 Subject: [PATCH 038/195] fix: nested views not working (#7857) Signed-off-by: Boudewijn van Groos --- src/catalog/src/table_source.rs | 6 +++- .../standalone/common/view/create.result | 31 ++++++++++++++++++- tests/cases/standalone/common/view/create.sql | 8 ++++- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs index 132e02fe14..8aabf64e99 100644 --- a/src/catalog/src/table_source.rs +++ b/src/catalog/src/table_source.rs @@ -151,7 +151,11 @@ impl DfTableSourceProvider { let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone())); let logical_plan = self .plan_decoder - .decode(Bytes::from(view_info.view_info.clone()), catalog_list, true) + .decode( + Bytes::from(view_info.view_info.clone()), + catalog_list, + false, + ) .await .context(DecodePlanSnafu { name: &table.table_info().name, diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result index 1c6e0ee50b..76b9838628 100644 --- a/tests/cases/standalone/common/view/create.result +++ b/tests/cases/standalone/common/view/create.result @@ -30,6 +30,10 @@ CREATE VIEW test_view as SELECT * FROM public.numbers; Affected Rows: 0 +CREATE VIEW test_view2 as SELECT * FROM test_view; + +Affected Rows: 0 + --- View already exists ---- CREATE VIEW test_view as SELECT * FROM public.numbers; @@ -51,6 +55,7 @@ SHOW TABLES; 
| numbers | | test_table | | test_view | +| test_view2 | +------------------+ SHOW FULL TABLES; @@ -61,6 +66,7 @@ SHOW FULL TABLES; | numbers | LOCAL TEMPORARY | | test_table | BASE TABLE | | test_view | VIEW | +| test_view2 | VIEW | +------------------+-----------------+ -- psql: \dv @@ -124,17 +130,19 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE; |greptime|information_schema|tables|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|public|test_table|BASETABLE|ID|ID|ID|ID|ID|ID|mito|ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N| |greptime|public|test_view|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N| +|greptime|public|test_view2|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N| |greptime|information_schema|views|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| +++++++++++++++++++++++++ -- SQLNESS REPLACE (\s\d+\s) ID -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME -SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW'; +SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME; +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+ | table_catalog | table_schema | table_name | table_type | table_id | data_length | max_data_length | index_length | max_index_length | avg_row_length | engine | version | row_format | table_rows | data_free | auto_increment | create_time | update_time | check_time | table_collation | checksum | create_options | table_comment | temporary | 
+---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+ | greptime | public | test_view | VIEW |ID |ID |ID |ID |ID |ID | |ID | Fixed |ID |ID |ID |DATETIME |DATETIME | | utf8_bin |ID | | | N | +| greptime | public | test_view2 | VIEW |ID |ID |ID |ID |ID |ID | |ID | Fixed |ID |ID |ID |DATETIME |DATETIME | | utf8_bin |ID | | | N | +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+ SHOW COLUMNS FROM test_view; @@ -169,10 +177,31 @@ SELECT * FROM test_view LIMIT 10; | 9 | +--------+ +SELECT * FROM test_view2 LIMIT 10; + ++--------+ +| number | ++--------+ +| 0 | +| 1 | +| 2 | +| 3 | +| 4 | +| 5 | +| 6 | +| 7 | +| 8 | +| 9 | ++--------+ + DROP VIEW test_view; Affected Rows: 0 +DROP VIEW test_view2; + +Affected Rows: 0 + DROP TABLE test_table; Affected Rows: 0 diff --git a/tests/cases/standalone/common/view/create.sql b/tests/cases/standalone/common/view/create.sql index b82704d3a9..91149f44f4 100644 --- a/tests/cases/standalone/common/view/create.sql +++ b/tests/cases/standalone/common/view/create.sql @@ -16,6 +16,8 @@ CREATE OR REPLACE VIEW test_table as SELECT * FROM public.numbers; CREATE VIEW test_view as SELECT * FROM public.numbers; +CREATE VIEW test_view2 as SELECT * FROM test_view; + --- View already exists ---- CREATE VIEW test_view as SELECT * FROM public.numbers; @@ -48,7 +50,7 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE; -- SQLNESS 
REPLACE (\s\d+\s) ID -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME -SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW'; +SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME; SHOW COLUMNS FROM test_view; @@ -58,8 +60,12 @@ SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'test_view'; SELECT * FROM test_view LIMIT 10; +SELECT * FROM test_view2 LIMIT 10; + DROP VIEW test_view; +DROP VIEW test_view2; + DROP TABLE test_table; SELECT * FROM test_view LIMIT 10; From 59dd4186297f4cbc026fbe43c43289b8477f68e9 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 26 Mar 2026 08:08:38 +0800 Subject: [PATCH 039/195] feat: simplify nested aggr inside count query (#7859) * as optimizer rule Signed-off-by: Ruihang Xia * dump changes Signed-off-by: Ruihang Xia * perf: tighten count-count optimizer rewrite * extend inner op set Signed-off-by: Ruihang Xia * simplify and more coverage Signed-off-by: Ruihang Xia * remove prom-non-null Signed-off-by: Ruihang Xia * preserve value column through pruning Signed-off-by: Ruihang Xia * more sqlness cases Signed-off-by: Ruihang Xia * rename Signed-off-by: Ruihang Xia * enforce is not null before inner aggr Signed-off-by: Ruihang Xia * finalize Signed-off-by: Ruihang Xia * update sqlness result Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/query/src/optimizer.rs | 1 + src/query/src/optimizer/count_nest_aggr.rs | 346 ++++++++++++++++++ src/query/src/planner.rs | 210 ++++++++++- src/query/src/promql/planner.rs | 170 +++++++++ src/query/src/query_engine/state.rs | 2 + .../standalone/common/promql/scalar.result | 130 ++++++- .../cases/standalone/common/promql/scalar.sql | 56 ++- .../tql-explain-analyze/tsid_column.result | 59 ++- .../tql-explain-analyze/tsid_column.sql | 10 +- 9 files changed, 973 insertions(+), 11 deletions(-) create mode 100644 src/query/src/optimizer/count_nest_aggr.rs diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs 
index 4259b587ba..aaac1e3124 100644 --- a/src/query/src/optimizer.rs +++ b/src/query/src/optimizer.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod constant_term; +pub mod count_nest_aggr; pub mod count_wildcard; pub mod parallelize_scan; pub mod pass_distribution; diff --git a/src/query/src/optimizer/count_nest_aggr.rs b/src/query/src/optimizer/count_nest_aggr.rs new file mode 100644 index 0000000000..89ba426074 --- /dev/null +++ b/src/query/src/optimizer/count_nest_aggr.rs @@ -0,0 +1,346 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::config::ConfigOptions; +use datafusion::functions_aggregate::count::count_udaf; +use datafusion::logical_expr::{Extension, LogicalPlan, LogicalPlanBuilder, Sort}; +use datafusion_common::Result; +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_expr::{Expr, UserDefinedLogicalNodeCore, lit}; +use promql::extension_plan::{InstantManipulate, SeriesDivide, SeriesNormalize}; +use store_api::metric_engine_consts::DATA_SCHEMA_TSID_COLUMN_NAME; + +use crate::QueryEngineContext; +use crate::optimizer::ExtensionAnalyzerRule; + +/// Rewrites `count(() by (...))` into a presence-based +/// group count. 
+/// +/// This stays intentionally narrow: +/// - the outer aggregate must be plain `count` +/// - the inner aggregate must be a plain aggregate whose result existence is equivalent to input +/// group existence +/// - the inner input must be the direct instant-vector-selector plan +/// - the outer count must only group by the evaluation timestamp +#[derive(Debug)] +pub struct CountNestAggrRule; + +impl ExtensionAnalyzerRule for CountNestAggrRule { + fn analyze( + &self, + plan: LogicalPlan, + _ctx: &QueryEngineContext, + _config: &ConfigOptions, + ) -> Result { + plan.transform_down(&Self::rewrite_plan).map(|x| x.data) + } +} + +impl CountNestAggrRule { + fn rewrite_plan(plan: LogicalPlan) -> Result> { + let LogicalPlan::Sort(sort) = plan else { + return Ok(Transformed::no(plan)); + }; + + if let Some(rewritten) = Self::try_rewrite_sort(&sort)? { + Ok(Transformed::yes(rewritten)) + } else { + Ok(Transformed::no(LogicalPlan::Sort(sort))) + } + } + + fn try_rewrite_sort(sort: &Sort) -> Result> { + if sort.fetch.is_some() { + return Ok(None); + } + + let LogicalPlan::Aggregate(outer_agg) = sort.input.as_ref() else { + return Ok(None); + }; + if outer_agg.group_expr.len() != 1 || outer_agg.aggr_expr.len() != 1 { + return Ok(None); + } + let outer_time_expr = outer_agg.group_expr[0].clone(); + let outer_count_arg = + match Self::aggregate_if(&outer_agg.aggr_expr[0], |name| name == "count") { + Some((_, arg)) => arg, + None => return Ok(None), + }; + + let LogicalPlan::Sort(inner_sort) = outer_agg.input.as_ref() else { + return Ok(None); + }; + if inner_sort.fetch.is_some() { + return Ok(None); + } + + let LogicalPlan::Aggregate(inner_agg) = inner_sort.input.as_ref() else { + return Ok(None); + }; + if inner_agg.aggr_expr.len() != 1 || inner_agg.group_expr.is_empty() { + return Ok(None); + } + let (inner_is_count, inner_value_expr) = + match Self::aggregate_if(&inner_agg.aggr_expr[0], |name| { + Self::is_supported_inner_aggregate(name) + }) { + Some((name, arg)) => 
(name == "count", arg), + None => return Ok(None), + }; + let Expr::Column(_) = inner_value_expr else { + return Ok(None); + }; + + let Expr::Column(outer_count_column) = outer_count_arg else { + return Ok(None); + }; + let inner_output_field = inner_agg.schema.field(inner_agg.group_expr.len()); + if outer_count_column.name != *inner_output_field.name() { + return Ok(None); + } + + if !Self::is_projection_chain_to_instant(inner_agg.input.as_ref()) { + return Ok(None); + } + + if !inner_agg + .group_expr + .iter() + .all(|expr| matches!(expr, Expr::Column(_))) + { + return Ok(None); + } + + let Some(time_expr_pos) = inner_agg + .group_expr + .iter() + .position(|expr| expr == &outer_time_expr) + else { + return Ok(None); + }; + + let mut presence_group_exprs = Vec::with_capacity(inner_agg.group_expr.len()); + presence_group_exprs.push(outer_time_expr.clone()); + presence_group_exprs.extend( + inner_agg + .group_expr + .iter() + .enumerate() + .filter(|(idx, _)| *idx != time_expr_pos) + .map(|(_, expr)| expr.clone()), + ); + + let mut required_input_columns = + Self::collect_required_input_columns(&presence_group_exprs, inner_value_expr); + required_input_columns.extend(Self::collect_required_instant_columns( + inner_agg.input.as_ref(), + )); + let presence_source = Self::rebuild_projection_chain_to_instant( + inner_agg.input.as_ref(), + &required_input_columns, + )?; + + let outer_value_name = outer_agg + .schema + .field(outer_agg.group_expr.len()) + .name() + .clone(); + let mut presence_input = LogicalPlanBuilder::from(presence_source); + if !inner_is_count { + presence_input = presence_input.filter(inner_value_expr.clone().is_not_null())?; + } + let presence_input = presence_input + .project(presence_group_exprs.clone())? + .distinct()? + .build()?; + + let rewritten = LogicalPlanBuilder::from(presence_input) + .aggregate( + outer_agg.group_expr.clone(), + vec![count_udaf().call(vec![lit(1_i64)]).alias(outer_value_name)], + )? + .sort(sort.expr.clone())? 
+ .build()?; + + Ok(Some(rewritten)) + } + + fn collect_required_input_columns(group_exprs: &[Expr], value_expr: &Expr) -> HashSet { + let mut required = HashSet::new(); + + for expr in group_exprs { + if let Expr::Column(column) = expr { + required.insert(column.name.clone()); + } + } + if let Expr::Column(column) = value_expr { + // Keep the value column in the pruned instant input so `InstantManipulate` + // can still perform stale-NaN filtering before we project down to keys. + required.insert(column.name.clone()); + } + + required + } + + fn collect_required_instant_columns(plan: &LogicalPlan) -> HashSet { + let mut required = HashSet::new(); + Self::collect_required_instant_columns_into(plan, &mut required); + required + } + + fn collect_required_instant_columns_into(plan: &LogicalPlan, required: &mut HashSet) { + match plan { + LogicalPlan::Projection(projection) => { + Self::collect_required_instant_columns_into(projection.input.as_ref(), required); + } + LogicalPlan::Extension(extension) => { + for expr in extension.node.expressions() { + if let Expr::Column(column) = expr { + required.insert(column.name); + } + } + + if extension.node.as_any().is::() + && extension.node.inputs()[0] + .schema() + .fields() + .iter() + .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME) + { + required.insert(DATA_SCHEMA_TSID_COLUMN_NAME.to_string()); + } + + if let Some(input) = extension.node.inputs().into_iter().next() { + Self::collect_required_instant_columns_into(input, required); + } + } + _ => {} + } + } + + fn aggregate_if(expr: &Expr, accept_name: F) -> Option<(&str, &Expr)> + where + F: FnOnce(&str) -> bool, + { + let Expr::AggregateFunction(func) = expr else { + return None; + }; + let name = func.func.name(); + if !accept_name(name) + || func.params.filter.is_some() + || func.params.distinct + || !func.params.order_by.is_empty() + || func.params.args.len() != 1 + { + return None; + } + + Some((name, &func.params.args[0])) + } + + fn 
is_supported_inner_aggregate(name: &str) -> bool { + matches!( + name, + "count" | "sum" | "avg" | "min" | "max" | "stddev_pop" | "var_pop" + ) + } + + fn is_projection_chain_to_instant(plan: &LogicalPlan) -> bool { + let mut current = plan; + loop { + match current { + LogicalPlan::Projection(projection) => current = projection.input.as_ref(), + LogicalPlan::Extension(ext) => { + return ext.node.as_any().is::(); + } + _ => return false, + } + } + } + + fn rebuild_projection_chain_to_instant( + plan: &LogicalPlan, + required_columns: &HashSet, + ) -> Result { + match plan { + LogicalPlan::Projection(projection) => { + let input = Self::rebuild_projection_chain_to_instant( + projection.input.as_ref(), + required_columns, + )?; + LogicalPlanBuilder::from(input) + .project(projection.expr.clone())? + .build() + } + LogicalPlan::Extension(extension) => { + if let Some(instant) = extension.node.as_any().downcast_ref::() { + let input = + Self::prune_instant_input(extension.node.inputs()[0], required_columns)?; + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(instant.with_exprs_and_inputs(vec![], vec![input])?), + })); + } + + Ok(plan.clone()) + } + _ => Ok(plan.clone()), + } + } + + fn prune_instant_input( + plan: &LogicalPlan, + required_columns: &HashSet, + ) -> Result { + match plan { + LogicalPlan::Extension(extension) => { + if let Some(normalize) = extension.node.as_any().downcast_ref::() { + let input = + Self::prune_instant_input(extension.node.inputs()[0], required_columns)?; + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(normalize.with_exprs_and_inputs(vec![], vec![input])?), + })); + } + + if let Some(divide) = extension.node.as_any().downcast_ref::() { + let divide_input = extension.node.inputs()[0].clone(); + + let projection_exprs = divide_input + .schema() + .fields() + .iter() + .filter(|field| required_columns.contains(field.name())) + .map(|field| { + 
Expr::Column(datafusion_common::Column::from_name(field.name().clone())) + }) + .collect::>(); + let projected_input = LogicalPlanBuilder::from(divide_input) + .project(projection_exprs)? + .build()?; + + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new( + divide.with_exprs_and_inputs(vec![], vec![projected_input])?, + ), + })); + } + + Ok(plan.clone()) + } + _ => Ok(plan.clone()), + } + } +} diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs index f522dc567a..6b206b9d8d 100644 --- a/src/query/src/planner.rs +++ b/src/query/src/planner.rs @@ -278,17 +278,22 @@ impl DfLogicalPlanner { let table_provider = DfTableSourceProvider::new( self.engine_state.catalog_manager().clone(), self.engine_state.disallow_cross_catalog_query(), - query_ctx, + query_ctx.clone(), plan_decoder, self.session_state .config_options() .sql_parser .enable_ident_normalization, ); - PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state) + let plan = PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state) .await .map_err(BoxedError::new) - .context(QueryPlanSnafu) + .context(QueryPlanSnafu)?; + + let context = QueryEngineContext::new(self.session_state.clone(), query_ctx); + Ok(self + .engine_state + .optimize_by_extension_rules(plan, &context)?) 
} #[tracing::instrument(skip_all)] @@ -571,15 +576,22 @@ mod tests { use std::sync::Arc; use arrow_schema::DataType; + use catalog::RegisterTableRequest; + use catalog::memory::MemoryCatalogManager; + use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; use session::context::QueryContext; + use store_api::metric_engine_consts::{ + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY, + METRIC_ENGINE_NAME, + }; use table::metadata::{TableInfoBuilder, TableMetaBuilder}; use table::test_util::EmptyTable; use super::*; - use crate::QueryEngineRef; - use crate::parser::QueryLanguageParser; + use crate::parser::{PromQuery, QueryLanguageParser}; + use crate::{QueryEngineFactory, QueryEngineRef}; async fn create_test_engine() -> QueryEngineRef { let columns = vec![ @@ -600,6 +612,109 @@ mod tests { crate::tests::new_query_engine_with_table(table) } + fn create_promql_test_engine() -> QueryEngineRef { + let catalog_manager = MemoryCatalogManager::with_default_setup(); + let physical_table_name = "phy"; + let physical_table_id = 999u32; + + let physical_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new( + DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(), + ConcreteDataType::uint32_datatype(), + false, + ), + ColumnSchema::new( + DATA_SCHEMA_TSID_COLUMN_NAME.to_string(), + ConcreteDataType::uint64_datatype(), + false, + ), + ColumnSchema::new("tag_0", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("tag_1", ConcreteDataType::string_datatype(), false), + ColumnSchema::new( + "timestamp", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true), + ])); + let physical_meta = TableMetaBuilder::empty() + .schema(physical_schema) + .primary_key_indices(vec![0, 1, 2, 3]) + .value_indices(vec![4, 5]) + 
.engine(METRIC_ENGINE_NAME.to_string()) + .next_column_id(1024) + .build() + .unwrap(); + let physical_info = TableInfoBuilder::default() + .table_id(physical_table_id) + .name(physical_table_name) + .meta(physical_meta) + .build() + .unwrap(); + catalog_manager + .register_table_sync(RegisterTableRequest { + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + table_name: physical_table_name.to_string(), + table_id: physical_table_id, + table: EmptyTable::from_table_info(&physical_info), + }) + .unwrap(); + + let mut options = table::requests::TableOptions::default(); + options.extra_options.insert( + LOGICAL_TABLE_METADATA_KEY.to_string(), + physical_table_name.to_string(), + ); + let logical_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("tag_0", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("tag_1", ConcreteDataType::string_datatype(), false), + ColumnSchema::new( + "timestamp", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true), + ])); + let logical_meta = TableMetaBuilder::empty() + .schema(logical_schema) + .primary_key_indices(vec![0, 1]) + .value_indices(vec![3]) + .engine(METRIC_ENGINE_NAME.to_string()) + .options(options) + .next_column_id(1024) + .build() + .unwrap(); + let logical_info = TableInfoBuilder::default() + .table_id(1024) + .name("some_metric") + .meta(logical_meta) + .build() + .unwrap(); + catalog_manager + .register_table_sync(RegisterTableRequest { + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + table_name: "some_metric".to_string(), + table_id: 1024, + table: EmptyTable::from_table_info(&logical_info), + }) + .unwrap(); + + QueryEngineFactory::new( + catalog_manager, + None, + None, + None, + None, + false, + crate::options::QueryOptions::default(), + ) + .query_engine() + } + async fn parse_sql_to_plan(sql: &str) -> 
LogicalPlan { let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap(); let engine = create_test_engine().await; @@ -610,6 +725,25 @@ mod tests { .unwrap() } + async fn parse_promql_to_plan(query: &str) -> LogicalPlan { + let engine = create_promql_test_engine(); + let query_ctx = QueryContext::arc(); + let stmt = QueryLanguageParser::parse_promql( + &PromQuery { + query: query.to_string(), + start: "0".to_string(), + end: "10".to_string(), + step: "5s".to_string(), + lookback: "300s".to_string(), + alias: None, + }, + &query_ctx, + ) + .unwrap(); + + engine.planner().plan(&stmt, query_ctx).await.unwrap() + } + #[tokio::test] async fn test_extract_placeholder_cast_types_multiple() { let plan = parse_sql_to_plan( @@ -646,6 +780,72 @@ mod tests { assert_eq!(type_3, &Some(DataType::Int32)); } + #[tokio::test] + async fn test_plan_pql_applies_extension_rules() { + for inner_agg in ["count", "sum", "avg", "min", "max", "stddev", "stdvar"] { + let plan = parse_promql_to_plan(&format!( + "sum(irate(some_metric[1h])) / scalar(count({inner_agg}(some_metric) by (tag_0)))" + )) + .await; + let plan_str = plan.display_indent_schema().to_string(); + assert!(plan_str.contains("Distinct:"), "{inner_agg}: {plan_str}"); + } + } + + #[tokio::test] + async fn test_plan_pql_filters_null_only_groups_for_non_count_inner_aggs() { + let count_plan = parse_promql_to_plan("scalar(count(count(some_metric) by (tag_0)))").await; + let count_plan_str = count_plan.display_indent_schema().to_string(); + assert!( + !count_plan_str.contains("field_0 IS NOT NULL"), + "{count_plan_str}" + ); + + for inner_agg in ["sum", "avg", "min", "max", "stddev", "stdvar"] { + let plan = parse_promql_to_plan(&format!( + "scalar(count({inner_agg}(some_metric) by (tag_0)))" + )) + .await; + let plan_str = plan.display_indent_schema().to_string(); + assert!( + plan_str.contains("field_0 IS NOT NULL"), + "{inner_agg}: {plan_str}" + ); + } + } + + #[tokio::test] + async fn 
test_plan_pql_skips_extension_rules_for_non_direct_or_unsupported_inner_agg() { + for query in [ + "sum(irate(some_metric[1h])) / scalar(count(sum(irate(some_metric[1h])) by (tag_0)))", + "sum(irate(some_metric[1h])) / scalar(count(group(some_metric) by (tag_0)))", + ] { + let plan = parse_promql_to_plan(query).await; + let plan_str = plan.display_indent_schema().to_string(); + assert!(!plan_str.contains("Distinct:"), "{query}: {plan_str}"); + } + } + + #[tokio::test] + async fn test_plan_sql_does_not_apply_nested_count_rule() { + let plan = parse_sql_to_plan( + "SELECT id, count(inner_count) \ + FROM ( \ + SELECT id, count(name) AS inner_count \ + FROM test \ + GROUP BY id \ + ORDER BY id \ + LIMIT 1000000 \ + ) t \ + GROUP BY id \ + ORDER BY id", + ) + .await; + + let plan_str = plan.display_indent_schema().to_string(); + assert!(!plan_str.contains("Distinct:"), "{plan_str}"); + } + #[tokio::test] async fn test_get_inferred_parameter_types_subquery() { let plan = parse_sql_to_plan( diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index b6f4f2d28f..23d654d2b6 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -4056,6 +4056,7 @@ mod test { use table::test_util::EmptyTable; use super::*; + use crate::QueryEngineContext; use crate::options::QueryOptions; use crate::parser::QueryLanguageParser; @@ -4073,6 +4074,64 @@ mod test { ) } + async fn build_optimized_promql_plan( + table_provider: DfTableSourceProvider, + eval_stmt: &EvalStmt, + ) -> LogicalPlan { + let state = build_query_engine_state(); + let raw_plan = PromPlanner::stmt_to_plan(table_provider, eval_stmt, &state) + .await + .unwrap(); + let context = QueryEngineContext::new(state.session_state(), QueryContext::arc()); + state + .optimize_by_extension_rules(raw_plan, &context) + .unwrap() + } + + async fn build_optimized_tsid_plan( + query: &str, + num_tag: usize, + num_field: usize, + end_secs: u64, + lookback_secs: u64, + ) -> String { + let 
eval_stmt = EvalStmt { + expr: parser::parse(query).unwrap(), + start: UNIX_EPOCH, + end: UNIX_EPOCH + .checked_add(Duration::from_secs(end_secs)) + .unwrap(), + interval: Duration::from_secs(5), + lookback_delta: Duration::from_secs(lookback_secs), + }; + let table_provider = build_test_table_provider_with_tsid( + &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())], + num_tag, + num_field, + ) + .await; + + build_optimized_promql_plan(table_provider, &eval_stmt) + .await + .display_indent_schema() + .to_string() + } + + async fn assert_nested_count_rewrite_applies(query: &str, expected_outer_agg: &str) { + let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await; + + assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]")); + assert!(plan_str.contains("Projection: some_metric.timestamp, some_metric.tag_0")); + assert!(plan_str.contains("Distinct:")); + assert!(plan_str.contains(expected_outer_agg), "{plan_str}"); + assert!(!plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]")); + } + + async fn assert_nested_count_rewrite_missing(query: &str, num_tag: usize, lookback_secs: u64) { + let plan_str = build_optimized_tsid_plan(query, num_tag, 1, 100_000, lookback_secs).await; + assert!(!plan_str.contains("Distinct:"), "{plan_str}"); + } + async fn build_test_table_provider( table_name_tuples: &[(String, String)], num_tag: usize, @@ -4685,6 +4744,117 @@ mod test { ); } + #[tokio::test] + async fn scalar_count_count_range_keeps_full_window() { + let plan_str = build_optimized_tsid_plan( + "scalar(count(count(some_metric) by (tag_0)))", + 1, + 1, + 100_000, + 1, + ) + .await; + assert!(plan_str.contains("ScalarCalculate: tags=[]")); + assert!(plan_str.contains("PromInstantManipulate: range=[0..100000000]")); + assert!(!plan_str.contains("PromInstantManipulate: range=[99999000..99999000]")); + } + + #[tokio::test] + async fn scalar_count_count_rewrite_applies_inside_binary_expr_for_tsid_input() { + let plan_str = 
build_optimized_tsid_plan( + "sum(irate(some_metric[1h])) / scalar(count(count(some_metric) by (tag_0)))", + 2, + 1, + 10, + 300, + ) + .await; + assert!(plan_str.contains("Distinct:"), "{plan_str}"); + } + + #[tokio::test] + async fn nested_count_rewrite_keeps_full_series_key_with_tsid_input() { + assert_nested_count_rewrite_applies( + "count(count(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(count(some_metric.field_0))]]" + ) + .await; + } + + #[tokio::test] + async fn nested_sum_count_rewrite_keeps_full_series_key_with_tsid_input() { + assert_nested_count_rewrite_applies( + "count(sum(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(sum(some_metric.field_0))]]" + ) + .await; + } + + #[tokio::test] + async fn nested_supported_inner_aggs_rewrite_apply_for_tsid_input() { + for (query, expected_outer_agg) in [ + ( + "count(avg(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(avg(some_metric.field_0))]]", + ), + ( + "count(min(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(min(some_metric.field_0))]]", + ), + ( + "count(max(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(max(some_metric.field_0))]]", + ), + ( + "count(stddev(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(stddev_pop(some_metric.field_0))]]", + ), + ( + "count(stdvar(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(var_pop(some_metric.field_0))]]", + ), + ] { + assert_nested_count_rewrite_applies(query, expected_outer_agg).await; + } + } + + #[tokio::test] + async fn nested_non_count_inner_aggs_rewrite_filter_null_values_for_tsid_input() { + let count_plan = + 
build_optimized_tsid_plan("count(count(some_metric) by (tag_0))", 2, 1, 100_000, 1) + .await; + assert!( + !count_plan.contains("some_metric.field_0 IS NOT NULL"), + "{count_plan}" + ); + + for query in [ + "count(sum(some_metric) by (tag_0))", + "count(avg(some_metric) by (tag_0))", + "count(min(some_metric) by (tag_0))", + "count(max(some_metric) by (tag_0))", + "count(stddev(some_metric) by (tag_0))", + "count(stdvar(some_metric) by (tag_0))", + ] { + let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await; + assert!( + plan_str.contains("Filter: some_metric.field_0 IS NOT NULL"), + "{query}: {plan_str}" + ); + } + } + + #[tokio::test] + async fn nested_unsupported_or_non_direct_inner_aggs_do_not_rewrite() { + assert_nested_count_rewrite_missing("count(group(some_metric) by (tag_0))", 2, 1).await; + assert_nested_count_rewrite_missing( + "count(sum(irate(some_metric[1h])) by (tag_0))", + 2, + 300, + ) + .await; + } + #[tokio::test] async fn physical_table_name_is_not_leaked_in_plan() { let prom_expr = parser::parse("some_metric").unwrap(); diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index a45fc4c896..f696c8b53e 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -60,6 +60,7 @@ use crate::dist_plan::{ use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES}; use crate::optimizer::ExtensionAnalyzerRule; use crate::optimizer::constant_term::MatchesConstantTermOptimizer; +use crate::optimizer::count_nest_aggr::CountNestAggrRule; use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule; use crate::optimizer::parallelize_scan::ParallelizeScan; use crate::optimizer::pass_distribution::PassDistribution; @@ -146,6 +147,7 @@ impl QueryEngineState { // The [`TypeConversionRule`] must be at first extension_rules.insert(0, Arc::new(TypeConversionRule) as _); + extension_rules.push(Arc::new(CountNestAggrRule) as _); // Apply the 
datafusion rules let mut analyzer = Analyzer::new(); diff --git a/tests/cases/standalone/common/promql/scalar.result b/tests/cases/standalone/common/promql/scalar.result index c5c3e5ebd1..c3292b4f5c 100644 --- a/tests/cases/standalone/common/promql/scalar.result +++ b/tests/cases/standalone/common/promql/scalar.result @@ -136,6 +136,42 @@ TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host))); | 1970-01-01T00:00:15 | 2.0 | +---------------------+--------------------------------+ +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host))); + ++---------------------+------------------------------+ +| ts | scalar(count(sum(host.val))) | ++---------------------+------------------------------+ +| 1970-01-01T00:00:00 | 2.0 | +| 1970-01-01T00:00:05 | 2.0 | +| 1970-01-01T00:00:10 | 2.0 | +| 1970-01-01T00:00:15 | 2.0 | ++---------------------+------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host))); + ++---------------------+------------------------------+ +| ts | scalar(count(avg(host.val))) | ++---------------------+------------------------------+ +| 1970-01-01T00:00:00 | 2.0 | +| 1970-01-01T00:00:05 | 2.0 | +| 1970-01-01T00:00:10 | 2.0 | +| 1970-01-01T00:00:15 | 2.0 | ++---------------------+------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host))); + ++---------------------+-------------------------------------+ +| ts | scalar(count(stddev_pop(host.val))) | ++---------------------+-------------------------------------+ +| 1970-01-01T00:00:00 | 2.0 | +| 1970-01-01T00:00:05 | 2.0 | +| 1970-01-01T00:00:10 | 2.0 | +| 1970-01-01T00:00:15 | 2.0 | ++---------------------+-------------------------------------+ + -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"})); @@ -516,7 +552,99 @@ TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6); | 
1970-01-01T00:00:15 | 6.0 | host1 | +---------------------+---------------------------------------------------------+-------+ -Drop table host; +DROP TABLE host; + +Affected Rows: 0 + +CREATE TABLE presence_metric ( + ts timestamp(3) time index, + instance STRING, + cpu STRING, + shard STRING, + val DOUBLE, + PRIMARY KEY (instance, cpu, shard), +); + +Affected Rows: 0 + +INSERT INTO TABLE presence_metric VALUES + (0, 'i1', 'cpu0', 'a', 1.0), + (0, 'i1', 'cpu0', 'b', 2.0), + (0, 'i1', 'cpu1', 'a', 10.0), + (0, 'i1', 'cpu2', 'a', 20.0), + (0, 'i2', 'cpu9', 'a', 100.0), + (200000, 'i1', 'cpu0', 'a', 'NAN'::DOUBLE), + (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE), + (200000, 'i1', 'cpu1', 'a', 11.0), + (200000, 'i1', 'cpu2', 'a', NULL), + (200000, 'i2', 'cpu9', 'a', 101.0), + (400000, 'i1', 'cpu1', 'a', 12.0), + (400000, 'i2', 'cpu9', 'a', 102.0), + (600000, 'i1', 'cpu0', 'a', 7.0), + (600000, 'i1', 'cpu0', 'b', 8.0), + (600000, 'i2', 'cpu9', 'a', 103.0); + +Affected Rows: 15 + +-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2` +-- still leaves a zero-valued row in `count(...) by (cpu)`. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu); + ++------+---------------------+----------------------------+ +| cpu | ts | count(presence_metric.val) | ++------+---------------------+----------------------------+ +| cpu0 | 1970-01-01T00:00:00 | 2 | +| cpu0 | 1970-01-01T00:10:00 | 2 | +| cpu1 | 1970-01-01T00:00:00 | 1 | +| cpu1 | 1970-01-01T00:03:20 | 1 | +| cpu1 | 1970-01-01T00:06:40 | 1 | +| cpu1 | 1970-01-01T00:10:00 | 1 | +| cpu2 | 1970-01-01T00:00:00 | 1 | +| cpu2 | 1970-01-01T00:03:20 | 0 | +| cpu2 | 1970-01-01T00:06:40 | 0 | ++------+---------------------+----------------------------+ + +-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning. 
+-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu))); + ++---------------------+-------------------------------------------+ +| ts | scalar(count(count(presence_metric.val))) | ++---------------------+-------------------------------------------+ +| 1970-01-01T00:00:00 | 3.0 | +| 1970-01-01T00:03:20 | 2.0 | +| 1970-01-01T00:06:40 | 2.0 | +| 1970-01-01T00:10:00 | 2.0 | ++---------------------+-------------------------------------------+ + +-- Non-count inner aggregates must drop NULL-only groups before the outer count. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu))); + ++---------------------+-----------------------------------------+ +| ts | scalar(count(sum(presence_metric.val))) | ++---------------------+-----------------------------------------+ +| 1970-01-01T00:00:00 | 3.0 | +| 1970-01-01T00:03:20 | 1.0 | +| 1970-01-01T00:06:40 | 1.0 | +| 1970-01-01T00:10:00 | 2.0 | ++---------------------+-----------------------------------------+ + +-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN. 
+-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance)); + ++---------------------+-------------------------------------------+ +| ts | scalar(count(count(presence_metric.val))) | ++---------------------+-------------------------------------------+ +| 1970-01-01T00:00:00 | NaN | +| 1970-01-01T00:03:20 | NaN | +| 1970-01-01T00:06:40 | NaN | +| 1970-01-01T00:10:00 | NaN | ++---------------------+-------------------------------------------+ + +DROP TABLE presence_metric; Affected Rows: 0 diff --git a/tests/cases/standalone/common/promql/scalar.sql b/tests/cases/standalone/common/promql/scalar.sql index b4007bbf15..662f9665fe 100644 --- a/tests/cases/standalone/common/promql/scalar.sql +++ b/tests/cases/standalone/common/promql/scalar.sql @@ -43,6 +43,15 @@ TQL EVAL (0, 15, '5s') scalar(host{host="host1"}) + host; -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host))); +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host))); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host))); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host))); + -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"})); @@ -149,4 +158,49 @@ TQL EVAL (0, 15, '5s') clamp(clamp_min(host{host="host1"}, 1), 0, 12); -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6); -Drop table host; +DROP TABLE host; + +CREATE TABLE presence_metric ( + ts timestamp(3) time index, + instance STRING, + cpu STRING, + shard STRING, + val DOUBLE, + PRIMARY KEY (instance, cpu, shard), +); + +INSERT INTO TABLE presence_metric VALUES + (0, 'i1', 'cpu0', 'a', 1.0), + (0, 'i1', 'cpu0', 'b', 2.0), + (0, 'i1', 'cpu1', 'a', 10.0), + (0, 'i1', 'cpu2', 'a', 20.0), + (0, 'i2', 'cpu9', 'a', 100.0), + (200000, 'i1', 'cpu0', 'a', 
'NAN'::DOUBLE), + (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE), + (200000, 'i1', 'cpu1', 'a', 11.0), + (200000, 'i1', 'cpu2', 'a', NULL), + (200000, 'i2', 'cpu9', 'a', 101.0), + (400000, 'i1', 'cpu1', 'a', 12.0), + (400000, 'i2', 'cpu9', 'a', 102.0), + (600000, 'i1', 'cpu0', 'a', 7.0), + (600000, 'i1', 'cpu0', 'b', 8.0), + (600000, 'i2', 'cpu9', 'a', 103.0); + +-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2` +-- still leaves a zero-valued row in `count(...) by (cpu)`. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu); + +-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu))); + +-- Non-count inner aggregates must drop NULL-only groups before the outer count. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu))); + +-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN. 
+-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance)); + +DROP TABLE presence_metric; diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.result b/tests/cases/standalone/tql-explain-analyze/tsid_column.result index 84544b1655..4a7a875060 100644 --- a/tests/cases/standalone/tql-explain-analyze/tsid_column.result +++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.result @@ -112,10 +112,63 @@ TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(count(tsid |_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED |_|_|_RepartitionExec: partitioning=REDACTED |_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED -|_|_|_ProjectionExec: expr=[ts@1 as ts, count(tsid_metric.val)@2 as count(tsid_metric.val)] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[job@0 as job, ts@1 as ts], aggr=[count(tsid_metric.val)] REDACTED +|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED |_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[job@1 as job, ts@2 as ts], aggr=[count(tsid_metric.val)] REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED +|_|_|_ProjectionExec: expr=[ts@3 as ts, job@1 as job] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED +|_|_|_ProjectionExec: expr=[val@1 as val, job@3 as job, __tsid@2 as __tsid, ts@0 as ts] REDACTED +|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED +|_|_|_| +| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: 
expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED +|_|_|_FilterExec: prom_irate(ts_range,val)@1 IS NOT NULL REDACTED +|_|_|_ProjectionExec: expr=[ts@2 as ts, prom_irate(ts_range@3, val@0) as prom_irate(ts_range,val)] REDACTED +|_|_|_PromRangeManipulateExec: req range=[0..10000], interval=[5000], eval range=[3600000], time index=[ts] REDACTED +|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[ts], filter NaN: [true] REDACTED +|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED +|_|_|_ProjectionExec: expr=[val@1 as val, __tsid@2 as __tsid, ts@0 as ts] REDACTED +|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED +|_|_|_| +|_|_| Total rows: 2_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job))); + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_ProjectionExec: expr=[ts@1 as ts, sum(prom_irate(ts_range,val))@2 / scalar(count(sum(tsid_metric.val)))@0 as lhs.sum(prom_irate(ts_range,val)) / rhs.scalar(count(sum(tsid_metric.val)))] REDACTED +|_|_|_REDACTED +|_|_|_ScalarCalculateExec: tags=[] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], 
preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED +|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED +|_|_|_ProjectionExec: expr=[ts@1 as ts, job@0 as job] REDACTED +|_|_|_FilterExec: val@0 IS NOT NULL, projection=[job@1, ts@2] REDACTED |_|_|_ProjectionExec: expr=[val@0 as val, job@1 as job, ts@3 as ts] REDACTED |_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED |_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql index 7b3de23f33..dedce2dfb1 100644 --- a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql +++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql @@ -51,6 +51,14 @@ TQL ANALYZE (0, 10, '5s') sum by (job, instance) (tsid_metric); -- SQLNESS REPLACE (Hash.*) REDACTED TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(count(tsid_metric) by (job))); +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job))); + DROP TABLE tsid_metric; DROP TABLE tsid_physical; - From 73b48b14c187638cd35224cd581f26d60ab7dc93 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 26 
Mar 2026 11:10:39 +0800 Subject: [PATCH 040/195] feat: update postgres ParameterDescription size limit (#7861) * feat: update postgres ParameterDescription size limit * chore: don't log io error --- Cargo.lock | 6 +++--- src/query/src/planner.rs | 12 ++++++++++++ src/servers/Cargo.toml | 2 +- src/servers/src/postgres/handler.rs | 7 +++++-- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 32f9aa27d4..2ba96d0801 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -9620,9 +9620,9 @@ dependencies = [ [[package]] name = "pgwire" -version = "0.38.1" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2a798d130b8975a566c2cf6d8955746e1f09a9ee2c3ff2e6020a2c6528c5bd1" +checksum = "3a1bdf05fc8231cc5024572fe056e3ce34eb6b9b755ba7aba110e1c64119cec3" dependencies = [ "async-trait", "base64 0.22.1", diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs index 6b206b9d8d..278058974a 100644 --- a/src/query/src/planner.rs +++ b/src/query/src/planner.rs @@ -857,4 +857,16 @@ mod tests { let type_1 = types.get("$1").unwrap(); assert_eq!(type_1, &Some(DataType::Utf8)); } + + #[tokio::test] + async fn test_get_inferred_parameter_types_insert() { + let plan = parse_sql_to_plan("INSERT INTO test (id, name) VALUES ($1, $2), ($3, $4)").await; + let types = DfLogicalPlanner::get_inferred_parameter_types(&plan).unwrap(); + + assert_eq!(types.len(), 4); + assert_eq!(types.get("$1"), Some(&Some(DataType::Int32))); + assert_eq!(types.get("$2"), Some(&Some(DataType::Utf8))); + assert_eq!(types.get("$3"), Some(&Some(DataType::Int32))); + assert_eq!(types.get("$4"), Some(&Some(DataType::Utf8))); + } } diff --git a/src/servers/Cargo.toml 
b/src/servers/Cargo.toml index 8b64a256e7..8e84ef77d6 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -89,7 +89,7 @@ operator.workspace = true otel-arrow-rust.workspace = true parking_lot.workspace = true pg_interval = { version = "0.5.2", package = "pg_interval_2" } -pgwire = { version = "0.38.1", default-features = false, features = [ +pgwire = { version = "0.38.2", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index daf4bfc646..7e9b75c036 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -20,7 +20,7 @@ use async_trait::async_trait; use common_query::{Output, OutputData}; use common_recordbatch::RecordBatch; use common_recordbatch::error::Result as RecordBatchResult; -use common_telemetry::{debug, tracing}; +use common_telemetry::{debug, info, tracing}; use datafusion::sql::sqlparser::ast::{CopyOption, CopyTarget, Statement as SqlParserStatement}; use datafusion_common::ParamValues; use datafusion_pg_catalog::sql::PostgresCompatibilityParser; @@ -628,7 +628,10 @@ impl ErrorHandler for PostgresServerHandlerInner { where C: ClientInfo, { - debug!("Postgres interface error {}", error) + match error { + PgWireError::IoError(e) => debug!("Postgres client disconnected: {}", e), + _ => info!("Postgres interface error: {}", error), + } } } From 8d40b129f1fd6259267d8bd4d1e266d87933509b Mon Sep 17 00:00:00 2001 From: fys <40801205+fengys1996@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:14:59 +0800 Subject: [PATCH 041/195] chore: remove unused rexpect dev-dependency (#7865) * chore: remove unused rexpect dev-dependency * fix: taplo fmt --- Cargo.lock | 45 +-------------------------------------------- src/cmd/Cargo.toml | 24 +++++++++--------------- 2 files changed, 10 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2ba96d0801..74440726d8 100644 --- a/Cargo.lock 
+++ b/Cargo.lock @@ -2085,7 +2085,6 @@ dependencies = [ "rand 0.9.1", "regex", "reqwest", - "rexpect", "serde", "serde_json", "servers", @@ -2154,12 +2153,6 @@ dependencies = [ "unicode-width 0.2.1", ] -[[package]] -name = "comma" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" - [[package]] name = "common-base" version = "1.0.0-rc.2" @@ -7724,15 +7717,6 @@ dependencies = [ "libc", ] -[[package]] -name = "memoffset" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" -dependencies = [ - "autocfg", -] - [[package]] name = "memoffset" version = "0.7.1" @@ -8400,20 +8384,6 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" -[[package]] -name = "nix" -version = "0.25.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" -dependencies = [ - "autocfg", - "bitflags 1.3.2", - "cfg-if", - "libc", - "memoffset 0.6.5", - "pin-utils", -] - [[package]] name = "nix" version = "0.26.4" @@ -8423,7 +8393,7 @@ dependencies = [ "bitflags 1.3.2", "cfg-if", "libc", - "memoffset 0.7.1", + "memoffset", "pin-utils", ] @@ -11219,19 +11189,6 @@ dependencies = [ "webpki-roots 1.0.1", ] -[[package]] -name = "rexpect" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ff60778f96fb5a48adbe421d21bf6578ed58c0872d712e7e08593c195adff8" -dependencies = [ - "comma", - "nix 0.25.1", - "regex", - "tempfile", - "thiserror 1.0.69", -] - [[package]] name = "rgb" version = "0.8.50" diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 74309e2024..d547ec6e81 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -30,10 
+30,6 @@ base64.workspace = true cache.workspace = true catalog.workspace = true chrono.workspace = true -datafusion-physical-plan.workspace = true -datafusion.workspace = true -datafusion-common.workspace = true -either = "1.15" clap.workspace = true cli.workspace = true client.workspace = true @@ -51,14 +47,19 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry = { workspace = true, features = [ "deadlock_detection", ] } common-time.workspace = true common-version.workspace = true common-wal.workspace = true +datafusion.workspace = true +datafusion-common.workspace = true +datafusion-physical-plan.workspace = true datanode.workspace = true datatypes.workspace = true +either = "1.15" etcd-client.workspace = true flow.workspace = true frontend = { workspace = true, default-features = false } @@ -81,21 +82,20 @@ query.workspace = true rand.workspace = true regex.workspace = true reqwest.workspace = true -standalone.workspace = true serde.workspace = true serde_json.workspace = true servers.workspace = true session.workspace = true similar-asserts.workspace = true snafu.workspace = true -common-stat.workspace = true +sqlparser.workspace = true +standalone.workspace = true store-api.workspace = true table.workspace = true tokio.workspace = true toml.workspace = true tonic.workspace = true tracing-appender.workspace = true -sqlparser.workspace = true [target.'cfg(unix)'.dependencies] pprof = { version = "0.14", features = [ @@ -110,14 +110,8 @@ api.workspace = true client = { workspace = true, features = ["testing"] } common-test-util.workspace = true common-version.workspace = true +file-engine.workspace = true +mito2.workspace = true serde.workspace = true temp-env = "0.3" tempfile.workspace = true -file-engine.workspace = true -mito2.workspace = true - -[target.'cfg(not(windows))'.dev-dependencies] -rexpect = "0.5" - 
-[package.metadata.cargo-udeps.ignore] -development = ["rexpect"] From 8058ce7cf22215f6bcdc17b7bbd1d858618170d1 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Wed, 25 Mar 2026 20:25:50 -0700 Subject: [PATCH 042/195] refactor: simplify scan memory tracking (#7827) * refactor: simplify scan memory tracking Signed-off-by: jeremyhi * chore: make confg-docs Signed-off-by: jeremyhi * chore: by codex review comment Signed-off-by: jeremyhi * feat: track_with_policy Signed-off-by: jeremyhi * chore: minor change Signed-off-by: jeremyhi * chore: mem granularity mb to kb Signed-off-by: jeremyhi * chore: by review comment Signed-off-by: jeremyhi * chore: by scan_memory_on_exhausted comment Signed-off-by: jeremyhi * fix: by review comment Signed-off-by: jeremyhi * chore: typo Signed-off-by: jeremyhi --------- Signed-off-by: jeremyhi --- Cargo.lock | 1 + config/config.md | 10 +- config/datanode.example.toml | 15 +- config/standalone.example.toml | 15 +- src/common/recordbatch/Cargo.toml | 1 + src/common/recordbatch/src/lib.rs | 807 +++++++++++++++-------------- src/datanode/src/datanode.rs | 3 - src/mito2/src/config.rs | 6 + src/mito2/src/engine.rs | 35 +- src/query/src/dummy_catalog.rs | 4 +- src/store-api/src/region_engine.rs | 6 +- src/table/src/table/scan.rs | 14 +- tests-integration/tests/http.rs | 1 + 13 files changed, 469 insertions(+), 449 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74440726d8..b3000970b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2657,6 +2657,7 @@ dependencies = [ "common-base", "common-error", "common-macro", + "common-memory-manager", "common-telemetry", "common-time", "criterion 0.7.0", diff --git a/config/config.md b/config/config.md index 2ac11dd6e6..4861675217 100644 --- a/config/config.md +++ b/config/config.md @@ -18,7 +18,7 @@ | `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.
Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | -| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.
NOTE: This setting affects scan_memory_limit's privileged tier allocation.
When set, 70% of queries get privileged memory access (full scan_memory_limit).
The remaining 30% get standard tier access (70% of scan_memory_limit). | +| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. | | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. | | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | @@ -160,7 +160,8 @@ | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | -| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit.
NOTE: Works with max_concurrent_queries for tiered memory allocation.
- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. | +| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit. | +| `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.
"fail" (default) fails fast and is the recommended option for most users.
"wait" / "wait(<duration>)" waits for memory to become available. This is mainly
for advanced tuning in bursty workloads where temporary contention is common and
higher latency is acceptable.
"wait" means "wait(10s)", not unlimited waiting. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | | `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | @@ -440,7 +441,7 @@ | `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.
It will block the datanode start if it can't receive leases in the heartbeat from metasrv. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | -| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.
NOTE: This setting affects scan_memory_limit's privileged tier allocation.
When set, 70% of queries get privileged memory access (full scan_memory_limit).
The remaining 30% get standard tier access (70% of scan_memory_limit). | +| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. | | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. | | `http` | -- | -- | The HTTP server options. | | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. | @@ -552,7 +553,8 @@ | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | -| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit.
NOTE: Works with max_concurrent_queries for tiered memory allocation.
- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. | +| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit. | +| `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.
"fail" (default) fails fast and is the recommended option for most users.
"wait" / "wait(<duration>)" waits for memory to become available. This is mainly
for advanced tuning in bursty workloads where temporary contention is common and
higher latency is acceptable.
"wait" means "wait(10s)", not unlimited waiting. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | | `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 2631a089e1..833a567d74 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -17,10 +17,7 @@ init_regions_in_background = false ## Parallelism of initializing regions. init_regions_parallelism = 16 -## The maximum current queries allowed to be executed. Zero means unlimited. -## NOTE: This setting affects scan_memory_limit's privileged tier allocation. -## When set, 70% of queries get privileged memory access (full scan_memory_limit). -## The remaining 30% get standard tier access (70% of scan_memory_limit). +## The maximum concurrent queries allowed to be executed. Zero means unlimited. max_concurrent_queries = 0 ## Enable telemetry to collect anonymous usage data. Enabled by default. @@ -535,10 +532,14 @@ allow_stale_entries = false ## Memory limit for table scans across all queries. ## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%"). ## Setting it to 0 disables the limit. -## NOTE: Works with max_concurrent_queries for tiered memory allocation. -## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access. -## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. scan_memory_limit = "50%" +## Controls what happens when a scan cannot get memory immediately. +## "fail" (default) fails fast and is the recommended option for most users. +## "wait" / "wait()" waits for memory to become available. This is mainly +## for advanced tuning in bursty workloads where temporary contention is common and +## higher latency is acceptable. 
+## "wait" means "wait(10s)", not unlimited waiting. +scan_memory_on_exhausted = "fail" ## Minimum time interval between two compactions. ## To align with the old behavior, the default value is 0 (no restrictions). diff --git a/config/standalone.example.toml b/config/standalone.example.toml index ef96406316..94c5feebf1 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -23,10 +23,7 @@ init_regions_in_background = false ## Parallelism of initializing regions. init_regions_parallelism = 16 -## The maximum current queries allowed to be executed. Zero means unlimited. -## NOTE: This setting affects scan_memory_limit's privileged tier allocation. -## When set, 70% of queries get privileged memory access (full scan_memory_limit). -## The remaining 30% get standard tier access (70% of scan_memory_limit). +## The maximum concurrent queries allowed to be executed. Zero means unlimited. max_concurrent_queries = 0 ## Enable telemetry to collect anonymous usage data. Enabled by default. @@ -627,10 +624,14 @@ allow_stale_entries = false ## Memory limit for table scans across all queries. ## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%"). ## Setting it to 0 disables the limit. -## NOTE: Works with max_concurrent_queries for tiered memory allocation. -## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access. -## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. scan_memory_limit = "50%" +## Controls what happens when a scan cannot get memory immediately. +## "fail" (default) fails fast and is the recommended option for most users. +## "wait" / "wait()" waits for memory to become available. This is mainly +## for advanced tuning in bursty workloads where temporary contention is common and +## higher latency is acceptable. +## "wait" means "wait(10s)", not unlimited waiting. 
+scan_memory_on_exhausted = "fail" ## Minimum time interval between two compactions. ## To align with the old behavior, the default value is 0 (no restrictions). diff --git a/src/common/recordbatch/Cargo.toml b/src/common/recordbatch/Cargo.toml index 5887dc31c5..efc6b6f60e 100644 --- a/src/common/recordbatch/Cargo.toml +++ b/src/common/recordbatch/Cargo.toml @@ -12,6 +12,7 @@ arc-swap = "1.6" common-base.workspace = true common-error.workspace = true common-macro.workspace = true +common-memory-manager.workspace = true common-telemetry.workspace = true common-time.workspace = true datafusion.workspace = true diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 85e0d5c496..0a2d697407 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -22,13 +22,17 @@ pub mod recordbatch; pub mod util; use std::fmt; +use std::future::Future; use std::pin::Pin; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use adapter::RecordBatchMetrics; use arc_swap::ArcSwapOption; use common_base::readable_size::ReadableSize; +use common_error::ext::BoxedError; +use common_memory_manager::{ + MemoryGuard, MemoryManager, MemoryMetrics, OnExhaustedPolicy, PermitGranularity, +}; use common_telemetry::tracing::Span; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; use datatypes::arrow::array::{ArrayRef, AsArray, StringBuilder}; @@ -42,7 +46,7 @@ use error::Result; use futures::task::{Context, Poll}; use futures::{Stream, TryStreamExt}; pub use recordbatch::RecordBatch; -use snafu::{ResultExt, ensure}; +use snafu::{IntoError, ResultExt, ensure}; use crate::error::NewDfRecordBatchSnafu; @@ -416,205 +420,93 @@ impl> + Unpin> Stream for RecordBatchStream } } -/// Memory permit for a stream, providing privileged access or rate limiting. -/// -/// The permit tracks whether this stream has privileged Top-K status. 
-/// When dropped, it automatically releases any privileged slot it holds. -pub struct MemoryPermit { - tracker: QueryMemoryTracker, - is_privileged: AtomicBool, -} - -impl MemoryPermit { - /// Check if this permit currently has privileged status. - pub fn is_privileged(&self) -> bool { - self.is_privileged.load(Ordering::Acquire) - } - - /// Ensure this permit has privileged status by acquiring a slot if available. - /// Returns true if privileged (either already privileged or just acquired privilege). - fn ensure_privileged(&self) -> bool { - if self.is_privileged.load(Ordering::Acquire) { - return true; - } - - // Try to claim a privileged slot - self.tracker - .privileged_count - .fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| { - if count < self.tracker.privileged_slots { - Some(count + 1) - } else { - None - } - }) - .map(|_| { - self.is_privileged.store(true, Ordering::Release); - true - }) - .unwrap_or(false) - } - - /// Track additional memory usage with this permit. - /// Returns error if limit is exceeded. - /// - /// # Arguments - /// * `additional` - Additional memory size to track in bytes - /// * `stream_tracked` - Total memory already tracked by this stream - /// - /// # Behavior - /// - Privileged streams: Can push global memory usage up to full limit - /// - Standard-tier streams: Can push global memory usage up to limit * standard_tier_memory_fraction (default: 0.7) - /// - Standard-tier streams automatically attempt to acquire privilege if slots become available - /// - The configured limit is absolute hard limit - no stream can exceed it - pub fn track(&self, additional: usize, stream_tracked: usize) -> Result<()> { - // Ensure privileged status if possible - let is_privileged = self.ensure_privileged(); - - self.tracker - .track_internal(additional, is_privileged, stream_tracked) - } - - /// Release tracked memory. 
- /// - /// # Arguments - /// * `amount` - Amount of memory to release in bytes - pub fn release(&self, amount: usize) { - self.tracker.release(amount); - } -} - -impl Drop for MemoryPermit { - fn drop(&mut self) { - // Release privileged slot if we had one - if self.is_privileged.load(Ordering::Acquire) { - self.tracker - .privileged_count - .fetch_sub(1, Ordering::Release); - } - } -} - /// Memory tracker for RecordBatch streams. Clone to share the same limit across queries. /// -/// Implements a two-tier memory allocation strategy: -/// - **Privileged tier**: First N streams (default: 20) can use up to the full memory limit -/// - **Standard tier**: Remaining streams are restricted to a fraction of the limit (default: 70%) -/// - Privilege is granted on a first-come-first-served basis -/// - The configured limit is an absolute hard cap - no stream can exceed it +/// Each stream acquires quota independently from this tracker. #[derive(Clone)] pub struct QueryMemoryTracker { - current: Arc, - limit: usize, - standard_tier_memory_fraction: f64, - privileged_count: Arc, - privileged_slots: usize, - on_update: Option>, - on_reject: Option>, + manager: MemoryManager, + metrics: CallbackMemoryMetrics, + on_exhausted_policy: OnExhaustedPolicy, } impl fmt::Debug for QueryMemoryTracker { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("QueryMemoryTracker") - .field("current", &self.current.load(Ordering::Acquire)) - .field("limit", &self.limit) - .field( - "standard_tier_memory_fraction", - &self.standard_tier_memory_fraction, - ) - .field( - "privileged_count", - &self.privileged_count.load(Ordering::Acquire), - ) - .field("privileged_slots", &self.privileged_slots) - .field("on_update", &self.on_update.is_some()) - .field("on_reject", &self.on_reject.is_some()) + .field("current", &self.current()) + .field("limit", &self.limit()) + .field("on_exhausted_policy", &self.on_exhausted_policy) + .field("on_update", &self.metrics.has_on_update()) + 
.field("on_reject", &self.metrics.has_on_reject()) .finish() } } impl QueryMemoryTracker { - // Default privileged slots when max_concurrent_queries is 0. - const DEFAULT_PRIVILEGED_SLOTS: usize = 20; - // Ratio for privileged tier: 70% queries get privileged access, standard tier uses 70% memory. - const DEFAULT_PRIVILEGED_TIER_RATIO: f64 = 0.7; - - /// Create a new memory tracker with the given limit and max_concurrent_queries. - /// Calculates privileged slots as 70% of max_concurrent_queries (or 20 if max_concurrent_queries is 0). - /// - /// # Arguments - /// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited. - /// * `max_concurrent_queries` - Maximum number of concurrent queries (0 = unlimited). - pub fn new(limit: usize, max_concurrent_queries: usize) -> Self { - let privileged_slots = Self::calculate_privileged_slots(max_concurrent_queries); - Self::with_privileged_slots(limit, privileged_slots) - } - - /// Create a new memory tracker with custom privileged slots limit. - pub fn with_privileged_slots(limit: usize, privileged_slots: usize) -> Self { - Self::with_config(limit, privileged_slots, Self::DEFAULT_PRIVILEGED_TIER_RATIO) - } - - /// Create a new memory tracker with full configuration. - /// - /// # Arguments - /// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited. - /// * `privileged_slots` - Maximum number of streams that can get privileged status. - /// * `standard_tier_memory_fraction` - Memory fraction for standard-tier streams (range: [0.0, 1.0]). - /// - /// # Panics - /// Panics if `standard_tier_memory_fraction` is not in the range [0.0, 1.0]. - pub fn with_config( + /// Create a builder for a query memory tracker. 
+ pub fn builder( limit: usize, - privileged_slots: usize, - standard_tier_memory_fraction: f64, - ) -> Self { - assert!( - (0.0..=1.0).contains(&standard_tier_memory_fraction), - "standard_tier_memory_fraction must be in [0.0, 1.0], got {}", - standard_tier_memory_fraction - ); - - Self { - current: Arc::new(AtomicUsize::new(0)), + on_exhausted_policy: OnExhaustedPolicy, + ) -> QueryMemoryTrackerBuilder { + QueryMemoryTrackerBuilder { limit, - standard_tier_memory_fraction, - privileged_count: Arc::new(AtomicUsize::new(0)), - privileged_slots, + on_exhausted_policy, on_update: None, on_reject: None, } } - /// Register a new permit for memory tracking. - /// The first `privileged_slots` permits get privileged status automatically. - /// The returned permit can be shared across multiple streams of the same query. - pub fn register_permit(&self) -> MemoryPermit { - // Try to claim a privileged slot - let is_privileged = self - .privileged_count - .fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| { - if count < self.privileged_slots { - Some(count + 1) - } else { - None - } - }) - .is_ok(); - - MemoryPermit { + fn new_stream_tracker(&self) -> StreamMemoryTracker { + StreamMemoryTracker { tracker: self.clone(), - is_privileged: AtomicBool::new(is_privileged), + guard: self.manager.try_acquire(0).unwrap(), + tracked_bytes: 0, } } + /// Get the current memory usage in bytes. 
+ pub fn current(&self) -> usize { + self.manager.used_bytes() as usize + } + fn limit(&self) -> usize { + self.manager.limit_bytes() as usize + } + + fn reject_error( + &self, + current: usize, + additional: usize, + stream_tracked: usize, + ) -> error::Error { + let limit = self.limit(); + let msg = format!( + "{} requested, {} used globally ({}%), {} used by this stream, hard limit: {}", + ReadableSize(additional as u64), + ReadableSize(current as u64), + if limit > 0 { current * 100 / limit } else { 0 }, + ReadableSize(stream_tracked as u64), + ReadableSize(limit as u64) + ); + error::ExceedMemoryLimitSnafu { msg }.build() + } +} + +/// Builder for constructing a [`QueryMemoryTracker`] with optional callbacks. +pub struct QueryMemoryTrackerBuilder { + limit: usize, + on_exhausted_policy: OnExhaustedPolicy, + on_update: Option, + on_reject: Option, +} + +impl QueryMemoryTrackerBuilder { /// Set a callback to be called whenever the usage changes successfully. /// The callback receives the new total usage in bytes. /// /// # Note - /// The callback is called after both successful `track()` and `release()` operations. - /// It is called even when `limit == 0` (unlimited mode) to track actual usage. - pub fn with_on_update(mut self, on_update: F) -> Self + /// The callback is called after both successful `track()` and stream drop. + /// Usage is exact in unlimited mode and 1KB-aligned in limited mode. + pub fn on_update(mut self, on_update: F) -> Self where F: Fn(usize) + Send + Sync + 'static, { @@ -627,7 +519,7 @@ impl QueryMemoryTracker { /// # Note /// This is only called when `track()` fails due to exceeding the limit. /// It is never called when `limit == 0` (unlimited mode). - pub fn with_on_reject(mut self, on_reject: F) -> Self + pub fn on_reject(mut self, on_reject: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -635,105 +527,130 @@ impl QueryMemoryTracker { self } - /// Get the current memory usage in bytes. 
- pub fn current(&self) -> usize { - self.current.load(Ordering::Acquire) - } + /// Build a [`QueryMemoryTracker`] from this builder. + pub fn build(self) -> QueryMemoryTracker { + let metrics = CallbackMemoryMetrics::new(self.on_update, self.on_reject); + let manager = MemoryManager::with_granularity( + self.limit as u64, + PermitGranularity::Kilobyte, + metrics.clone(), + ); - fn calculate_privileged_slots(max_concurrent_queries: usize) -> usize { - if max_concurrent_queries == 0 { - Self::DEFAULT_PRIVILEGED_SLOTS + QueryMemoryTracker { + manager, + metrics, + on_exhausted_policy: self.on_exhausted_policy, + } + } +} + +struct StreamMemoryTracker { + tracker: QueryMemoryTracker, + guard: MemoryGuard, + tracked_bytes: usize, +} + +type MemoryAcquireResult = std::result::Result<(), common_memory_manager::Error>; + +impl StreamMemoryTracker { + fn try_track(&mut self, additional: usize) -> Result<()> { + if self.guard.try_acquire_additional(additional as u64) { + self.tracked_bytes = self.tracked_bytes.saturating_add(additional); + Ok(()) } else { - ((max_concurrent_queries as f64 * Self::DEFAULT_PRIVILEGED_TIER_RATIO) as usize).max(1) + Err(self.reject_error(additional)) } } - /// Internal method to track additional memory usage. - /// - /// Called by `MemoryPermit::track()`. Use `MemoryPermit::track()` instead of calling this directly. 
- fn track_internal( - &self, - additional: usize, - is_privileged: bool, - stream_tracked: usize, - ) -> Result<()> { - // Calculate effective global limit based on stream privilege - // Privileged streams: can push global usage up to full limit - // Standard-tier streams: can only push global usage up to fraction of limit - let effective_limit = if is_privileged { - self.limit - } else { - (self.limit as f64 * self.standard_tier_memory_fraction) as usize - }; - - let mut new_total = 0; + async fn track_with_policy(mut self, additional: usize) -> (Self, MemoryAcquireResult) { let result = self - .current - .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { - new_total = current.saturating_add(additional); + .guard + .acquire_additional_with_policy(additional as u64, self.tracker.on_exhausted_policy) + .await; + if result.is_ok() { + self.tracked_bytes = self.tracked_bytes.saturating_add(additional); + } + (self, result) + } - if self.limit == 0 { - // Unlimited mode - return Some(new_total); - } + fn reject_error(&self, additional: usize) -> error::Error { + let current = self.tracker.current(); + self.tracker + .reject_error(current, additional, self.tracked_bytes) + } - // Check if new global total exceeds effective limit - // The configured limit is absolute hard limit - no stream can exceed it - if new_total <= effective_limit { - Some(new_total) - } else { - None - } - }); - - match result { - Ok(_) => { - if let Some(callback) = &self.on_update { - callback(new_total); - } - Ok(()) + fn wait_error(&self, additional: usize, source: common_memory_manager::Error) -> error::Error { + match source { + common_memory_manager::Error::MemoryLimitExceeded { .. } => { + self.reject_error(additional) } - Err(current) => { - if let Some(callback) = &self.on_reject { - callback(); - } + common_memory_manager::Error::MemoryAcquireTimeout { waited, .. 
} => { + let current = self.tracker.current(); + let limit = self.tracker.limit(); let msg = format!( - "{} requested, {} used globally ({}%), {} used by this stream (privileged: {}), effective limit: {} ({}%), hard limit: {}", + "timed out waiting {:?} for {}, {} used globally ({}%), {} used by this stream, hard limit: {}", + waited, ReadableSize(additional as u64), ReadableSize(current as u64), - if self.limit > 0 { - current * 100 / self.limit - } else { - 0 - }, - ReadableSize(stream_tracked as u64), - is_privileged, - ReadableSize(effective_limit as u64), - if self.limit > 0 { - effective_limit * 100 / self.limit - } else { - 0 - }, - ReadableSize(self.limit as u64) + if limit > 0 { current * 100 / limit } else { 0 }, + ReadableSize(self.tracked_bytes as u64), + ReadableSize(limit as u64) ); - error::ExceedMemoryLimitSnafu { msg }.fail() + error::ExceedMemoryLimitSnafu { msg }.build() } + error => error::ExternalSnafu.into_error(BoxedError::new(error)), + } + } +} + +type PendingTrackFuture = Pin< + Box + Send>, +>; + +#[derive(Clone)] +struct CallbackMemoryMetrics { + inner: Arc, +} + +type UpdateCallback = Arc; +type RejectCallback = Arc; + +struct CallbackMemoryMetricsInner { + on_update: Option, + on_reject: Option, +} + +impl CallbackMemoryMetrics { + fn new(on_update: Option, on_reject: Option) -> Self { + Self { + inner: Arc::new(CallbackMemoryMetricsInner { + on_update, + on_reject, + }), } } - /// Release tracked memory. 
- /// - /// # Arguments - /// * `amount` - Amount of memory to release in bytes - pub fn release(&self, amount: usize) { - if let Ok(old_value) = - self.current - .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { - Some(current.saturating_sub(amount)) - }) - && let Some(callback) = &self.on_update - { - callback(old_value.saturating_sub(amount)); + fn has_on_update(&self) -> bool { + self.inner.on_update.is_some() + } + + fn has_on_reject(&self) -> bool { + self.inner.on_reject.is_some() + } +} + +impl MemoryMetrics for CallbackMemoryMetrics { + fn set_limit(&self, _: i64) {} + + fn set_in_use(&self, bytes: i64) { + if let Some(callback) = &self.inner.on_update { + callback(bytes.max(0) as usize); + } + } + + fn inc_rejected(&self, _: &str) { + if let Some(callback) = &self.inner.on_reject { + callback(); } } } @@ -741,38 +658,107 @@ impl QueryMemoryTracker { /// A wrapper stream that tracks memory usage of RecordBatches. pub struct MemoryTrackedStream { inner: SendableRecordBatchStream, - permit: Arc, - // Total tracked size, released when stream drops. - total_tracked: usize, + tracker: Option, + // Waiting stores a batch that has already been pulled from the inner stream but has not yet + // acquired additional quota. This keeps `poll_next()` non-blocking and allows bounded waits, + // at the cost of temporarily holding one untracked batch per blocked stream in memory. 
+ waiting: Option, } impl MemoryTrackedStream { - pub fn new(inner: SendableRecordBatchStream, permit: Arc) -> Self { + pub fn new(inner: SendableRecordBatchStream, tracker: QueryMemoryTracker) -> Self { Self { inner, - permit, - total_tracked: 0, + tracker: Some(tracker.new_stream_tracker()), + waiting: None, } } + + fn ready_tracker_mut(&mut self) -> &mut StreamMemoryTracker { + debug_assert!( + self.waiting.is_none(), + "a ready tracker must not coexist with a waiting future" + ); + self.tracker.as_mut().unwrap() + } + + fn enter_waiting(&mut self, batch: RecordBatch, additional: usize) { + debug_assert!( + self.waiting.is_none(), + "enter_waiting should only be called from the ready state" + ); + debug_assert!( + self.tracker.is_some(), + "enter_waiting requires a tracker in the ready state" + ); + let tracker = self.tracker.take().unwrap(); + self.waiting = Some(Self::start_waiting(tracker, batch, additional)); + } + + fn start_waiting( + tracker: StreamMemoryTracker, + batch: RecordBatch, + additional: usize, + ) -> PendingTrackFuture { + Box::pin(async move { + let (tracker, result) = tracker.track_with_policy(additional).await; + (tracker, batch, additional, result) + }) + } + + fn poll_waiting(&mut self, cx: &mut Context<'_>) -> Poll>> { + let future = self.waiting.as_mut().unwrap(); + match future.as_mut().poll(cx) { + Poll::Ready((tracker, batch, additional, result)) => { + let output = match result { + Ok(()) => Ok(batch), + Err(error) => Err(tracker.wait_error(additional, error)), + }; + self.waiting = None; + self.tracker = Some(tracker); + Poll::Ready(Some(output)) + } + Poll::Pending => Poll::Pending, + } + } + + fn poll_batch( + &mut self, + batch: RecordBatch, + cx: &mut Context<'_>, + ) -> Poll>> { + let additional = batch.buffer_memory_size(); + let tracker = self.ready_tracker_mut(); + + if let Err(error) = tracker.try_track(additional) { + match tracker.tracker.on_exhausted_policy { + OnExhaustedPolicy::Fail => return 
Poll::Ready(Some(Err(error))), + // `Wait` is a deliberate tradeoff: the batch has already been materialized, so we + // keep it in memory while waiting for quota instead of failing immediately. Under + // contention, real memory usage can therefore exceed `scan_memory_limit` by up to + // one buffered batch per blocked stream. + OnExhaustedPolicy::Wait { .. } => { + self.enter_waiting(batch, additional); + return self.poll_waiting(cx); + } + } + } + + Poll::Ready(Some(Ok(batch))) + } } impl Stream for MemoryTrackedStream { type Item = Result; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.waiting.is_some() { + return self.poll_waiting(cx); + } + match Pin::new(&mut self.inner).poll_next(cx) { - Poll::Ready(Some(Ok(batch))) => { - let additional = batch.buffer_memory_size(); - - if let Err(e) = self.permit.track(additional, self.total_tracked) { - return Poll::Ready(Some(Err(e))); - } - - self.total_tracked += additional; - - Poll::Ready(Some(Ok(batch))) - } - Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))), + Poll::Ready(Some(Ok(batch))) => self.poll_batch(batch, cx), + Poll::Ready(Some(Err(error))) => Poll::Ready(Some(Err(error))), Poll::Ready(None) => Poll::Ready(None), Poll::Pending => Poll::Pending, } @@ -783,14 +769,6 @@ impl Stream for MemoryTrackedStream { } } -impl Drop for MemoryTrackedStream { - fn drop(&mut self) { - if self.total_tracked > 0 { - self.permit.release(self.total_tracked); - } - } -} - impl RecordBatchStream for MemoryTrackedStream { fn schema(&self) -> SchemaRef { self.inner.schema() @@ -808,13 +786,34 @@ impl RecordBatchStream for MemoryTrackedStream { #[cfg(test)] mod tests { use std::sync::Arc; + use std::time::Duration; + use common_memory_manager::{OnExhaustedPolicy, PermitGranularity}; use datatypes::prelude::{ConcreteDataType, VectorRef}; use datatypes::schema::{ColumnSchema, Schema}; use datatypes::vectors::{BooleanVector, Int32Vector, StringVector}; + use futures::StreamExt; + use 
tokio::time::{sleep, timeout}; use super::*; + fn large_string_batch(bytes: usize) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ColumnSchema::new( + "payload", + ConcreteDataType::string_datatype(), + false, + )])); + let payload = "x".repeat(bytes); + let vector: VectorRef = Arc::new(StringVector::from(vec![payload])); + RecordBatch::new(schema, vec![vector]).unwrap() + } + + fn aligned_tracked_bytes(bytes: usize) -> usize { + PermitGranularity::Kilobyte + .permits_to_bytes(PermitGranularity::Kilobyte.bytes_to_permits(bytes as u64)) + as usize + } + #[test] fn test_recordbatches_try_from_columns() { let schema = Arc::new(Schema::new(vec![ColumnSchema::new( @@ -896,156 +895,168 @@ mod tests { assert_eq!(collected[1], batch2); } + const MB: usize = 1024 * 1024; + #[test] fn test_query_memory_tracker_basic() { - let tracker = Arc::new(QueryMemoryTracker::new(1000, 0)); + let tracker = + Arc::new(QueryMemoryTracker::builder(10 * MB, OnExhaustedPolicy::Fail).build()); - // Register first stream - should get privileged status - let permit1 = tracker.register_permit(); - assert!(permit1.is_privileged()); + let mut stream1 = tracker.new_stream_tracker(); + assert!(stream1.try_track(5 * MB).is_ok()); + assert_eq!(tracker.current(), 5 * MB); - // Privileged stream can use up to limit - assert!(permit1.track(500, 0).is_ok()); - assert_eq!(tracker.current(), 500); + let mut stream2 = tracker.new_stream_tracker(); + assert!(stream2.try_track(4 * MB).is_ok()); + assert_eq!(tracker.current(), 9 * MB); - // Register second stream - also privileged - let permit2 = tracker.register_permit(); - assert!(permit2.is_privileged()); - // Can add more but cannot exceed hard limit (1000) - assert!(permit2.track(400, 0).is_ok()); - assert_eq!(tracker.current(), 900); - - permit1.release(500); - permit2.release(400); + drop(stream1); + drop(stream2); assert_eq!(tracker.current(), 0); } #[test] - fn test_query_memory_tracker_privileged_limit() { - // Privileged slots = 2 for easy 
testing - // Limit: 1000, standard-tier fraction: 0.7 (default) - // Privileged can push global to 1000, standard-tier can push global to 700 - let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 2)); + fn test_query_memory_tracker_shared_global_limit() { + let tracker = + Arc::new(QueryMemoryTracker::builder(10 * MB, OnExhaustedPolicy::Fail).build()); + let mut stream1 = tracker.new_stream_tracker(); + let mut stream2 = tracker.new_stream_tracker(); - // First 2 streams are privileged - let permit1 = tracker.register_permit(); - let permit2 = tracker.register_permit(); - assert!(permit1.is_privileged()); - assert!(permit2.is_privileged()); + assert!(stream1.try_track(3 * MB).is_ok()); + assert_eq!(tracker.current(), 3 * MB); + assert!(stream2.try_track(6 * MB).is_ok()); + assert_eq!(tracker.current(), 9 * MB); - // Third stream is standard-tier (not privileged) - let permit3 = tracker.register_permit(); - assert!(!permit3.is_privileged()); - - // Privileged stream uses some memory - assert!(permit1.track(300, 0).is_ok()); - assert_eq!(tracker.current(), 300); - - // Standard-tier can add up to 400 (total becomes 700, its effective limit) - assert!(permit3.track(400, 0).is_ok()); - assert_eq!(tracker.current(), 700); - - // Standard-tier stream cannot push global beyond 700 - let err = permit3.track(100, 400).unwrap_err(); + let err = stream2.try_track(2 * MB).unwrap_err(); let err_msg = err.to_string(); - assert!(err_msg.contains("400B used by this stream")); - assert!(err_msg.contains("effective limit: 700B (70%)")); - assert!(err_msg.contains("700B used globally (70%)")); - assert_eq!(tracker.current(), 700); + assert!(err_msg.contains("6.0MiB used by this stream")); + assert!(err_msg.contains("9.0MiB used globally (90%)")); + assert!(err_msg.contains("hard limit: 10.0MiB")); + assert_eq!(tracker.current(), 9 * MB); - permit1.release(300); - permit3.release(400); + drop(stream1); + assert_eq!(tracker.current(), 6 * MB); + drop(stream2); 
assert_eq!(tracker.current(), 0); } #[test] - fn test_query_memory_tracker_promotion() { - // Privileged slots = 1 for easy testing - let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1)); + fn test_query_memory_tracker_hard_limit() { + let tracker = + Arc::new(QueryMemoryTracker::builder(10 * MB, OnExhaustedPolicy::Fail).build()); + let mut stream = tracker.new_stream_tracker(); - // First stream is privileged - let permit1 = tracker.register_permit(); - assert!(permit1.is_privileged()); + assert!(stream.try_track(9 * MB).is_ok()); + assert_eq!(tracker.current(), 9 * MB); - // Second stream is standard-tier (can only use 500) - let permit2 = tracker.register_permit(); - assert!(!permit2.is_privileged()); + assert!(stream.try_track(2 * MB).is_err()); + assert_eq!(tracker.current(), 9 * MB); - // Standard-tier can only track 500 - assert!(permit2.track(400, 0).is_ok()); - assert_eq!(tracker.current(), 400); + assert!(stream.try_track(MB).is_ok()); + assert_eq!(tracker.current(), 10 * MB); - // Drop first permit to release privileged slot - drop(permit1); + assert!(stream.try_track(MB).is_err()); + assert_eq!(tracker.current(), 10 * MB); - // Second stream can now be promoted and use more memory - assert!(permit2.track(500, 400).is_ok()); - assert!(permit2.is_privileged()); - assert_eq!(tracker.current(), 900); - - permit2.release(900); + drop(stream); assert_eq!(tracker.current(), 0); } #[test] - fn test_query_memory_tracker_privileged_hard_limit() { - // Test that the configured limit is absolute hard limit for all streams - // Privileged: can use full limit (1000) - // Standard-tier: can use 0.7x limit (700 with defaults) - let tracker = Arc::new(QueryMemoryTracker::new(1000, 0)); + fn test_query_memory_tracker_unlimited() { + let tracker = Arc::new(QueryMemoryTracker::builder(0, OnExhaustedPolicy::Fail).build()); + let mut stream = tracker.new_stream_tracker(); - let permit1 = tracker.register_permit(); - assert!(permit1.is_privileged()); - - 
// Privileged can use up to full limit (1000) - assert!(permit1.track(900, 0).is_ok()); - assert_eq!(tracker.current(), 900); - - // Privileged cannot exceed hard limit (1000) - assert!(permit1.track(200, 900).is_err()); - assert_eq!(tracker.current(), 900); - - // Can add within hard limit - assert!(permit1.track(100, 900).is_ok()); - assert_eq!(tracker.current(), 1000); - - // Cannot exceed even by 1 byte - assert!(permit1.track(1, 1000).is_err()); - assert_eq!(tracker.current(), 1000); - - permit1.release(1000); + assert!(stream.try_track(10 * MB).is_ok()); + assert_eq!(tracker.current(), 10 * MB); + drop(stream); assert_eq!(tracker.current(), 0); } #[test] - fn test_query_memory_tracker_standard_tier_fraction() { - // Test standard-tier streams use fraction of limit - // Limit: 1000, default fraction: 0.7, so standard-tier can use 700 - let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1)); + fn test_query_memory_tracker_rounds_to_kilobytes() { + let tracker = + Arc::new(QueryMemoryTracker::builder(10 * MB, OnExhaustedPolicy::Fail).build()); + let mut stream = tracker.new_stream_tracker(); - let permit1 = tracker.register_permit(); - assert!(permit1.is_privileged()); + assert!(stream.try_track(1_537).is_ok()); + assert_eq!(tracker.current(), 2 * 1024); - let permit2 = tracker.register_permit(); - assert!(!permit2.is_privileged()); - - // Standard-tier can use up to 700 (1000 * 0.7 default) - assert!(permit2.track(600, 0).is_ok()); - assert_eq!(tracker.current(), 600); - - // Cannot exceed standard-tier limit (700) - assert!(permit2.track(200, 600).is_err()); - assert_eq!(tracker.current(), 600); - - // Can add within standard-tier limit - assert!(permit2.track(100, 600).is_ok()); - assert_eq!(tracker.current(), 700); - - // Cannot exceed standard-tier limit - assert!(permit2.track(1, 700).is_err()); - assert_eq!(tracker.current(), 700); - - permit2.release(700); + drop(stream); assert_eq!(tracker.current(), 0); } + + #[tokio::test] + async 
fn test_memory_tracked_stream_waits_for_capacity() { + let tracker = QueryMemoryTracker::builder( + MB, + OnExhaustedPolicy::Wait { + timeout: Duration::from_millis(200), + }, + ) + .build(); + let batch = large_string_batch(700 * 1024); + let expected_bytes = aligned_tracked_bytes(batch.buffer_memory_size()); + + let mut stream1 = MemoryTrackedStream::new( + RecordBatches::try_new(batch.schema.clone(), vec![batch.clone()]) + .unwrap() + .as_stream(), + tracker.clone(), + ); + let first = stream1.next().await.unwrap().unwrap(); + assert_eq!(first.num_rows(), 1); + assert_eq!(tracker.current(), expected_bytes); + + let stream2 = MemoryTrackedStream::new( + RecordBatches::try_new(batch.schema.clone(), vec![batch]) + .unwrap() + .as_stream(), + tracker.clone(), + ); + let waiter = tokio::spawn(async move { + let mut stream2 = stream2; + stream2.next().await.unwrap() + }); + + sleep(Duration::from_millis(50)).await; + assert!(!waiter.is_finished()); + + drop(stream1); + let second = waiter.await.unwrap().unwrap(); + assert_eq!(second.num_rows(), 1); + } + + #[tokio::test] + async fn test_memory_tracked_stream_wait_times_out() { + let tracker = QueryMemoryTracker::builder( + MB, + OnExhaustedPolicy::Wait { + timeout: Duration::from_millis(50), + }, + ) + .build(); + let batch = large_string_batch(700 * 1024); + + let mut stream1 = MemoryTrackedStream::new( + RecordBatches::try_new(batch.schema.clone(), vec![batch.clone()]) + .unwrap() + .as_stream(), + tracker.clone(), + ); + let first = stream1.next().await.unwrap().unwrap(); + assert_eq!(first.num_rows(), 1); + + let mut stream2 = MemoryTrackedStream::new( + RecordBatches::try_new(batch.schema.clone(), vec![batch]) + .unwrap() + .as_stream(), + tracker, + ); + let result = timeout(Duration::from_secs(1), stream2.next()) + .await + .unwrap(); + let error = result.unwrap().unwrap_err(); + assert!(error.to_string().contains("timed out waiting")); + } } diff --git a/src/datanode/src/datanode.rs 
b/src/datanode/src/datanode.rs index 3c62015179..859235fa9f 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -538,7 +538,6 @@ impl DatanodeBuilder { file_ref_manager, partition_expr_fetcher.clone(), plugins, - opts.max_concurrent_queries, ); #[cfg(feature = "enterprise")] @@ -581,7 +580,6 @@ impl DatanodeBuilder { file_ref_manager, partition_expr_fetcher, plugins, - opts.max_concurrent_queries, ); #[cfg(feature = "enterprise")] @@ -603,7 +601,6 @@ impl DatanodeBuilder { file_ref_manager, partition_expr_fetcher.clone(), plugins, - opts.max_concurrent_queries, ); #[cfg(feature = "enterprise")] diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 0eee067ab6..da0ec74022 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -151,6 +151,11 @@ pub struct MitoConfig { /// Memory limit for table scans across all queries. Setting it to 0 disables the limit. /// Supports absolute size (e.g., "2GB") or percentage (e.g., "50%"). pub scan_memory_limit: MemoryLimit, + /// Behavior when scan memory tracking cannot acquire memory from the budget. + /// `wait` means `wait(10s)`, not unlimited waiting. + /// Defaults to [`OnExhaustedPolicy::Fail`], which intentionally differs from + /// [`OnExhaustedPolicy::default()`]. + pub scan_memory_on_exhausted: OnExhaustedPolicy, /// Index configs. 
pub index: IndexConfig, @@ -216,6 +221,7 @@ impl Default for MitoConfig { max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, allow_stale_entries: false, scan_memory_limit: MemoryLimit::default(), + scan_memory_on_exhausted: OnExhaustedPolicy::Fail, index: IndexConfig::default(), inverted_index: InvertedIndexConfig::default(), fulltext_index: FulltextIndexConfig::default(), diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index 1af79daff6..fbafe1da67 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -95,7 +95,7 @@ use common_base::Plugins; use common_error::ext::BoxedError; use common_meta::error::UnexpectedSnafu; use common_meta::key::SchemaMetadataManagerRef; -use common_recordbatch::{MemoryPermit, QueryMemoryTracker, SendableRecordBatchStream}; +use common_recordbatch::{QueryMemoryTracker, SendableRecordBatchStream}; use common_stat::get_total_memory_bytes; use common_telemetry::{info, tracing, warn}; use common_wal::options::WalOptions; @@ -167,7 +167,6 @@ pub struct MitoEngineBuilder<'a, S: LogStore> { file_ref_manager: FileReferenceManagerRef, partition_expr_fetcher: PartitionExprFetcherRef, plugins: Plugins, - max_concurrent_queries: usize, #[cfg(feature = "enterprise")] extension_range_provider_factory: Option, } @@ -183,7 +182,6 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { file_ref_manager: FileReferenceManagerRef, partition_expr_fetcher: PartitionExprFetcherRef, plugins: Plugins, - max_concurrent_queries: usize, ) -> Self { Self { data_home, @@ -194,7 +192,6 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { file_ref_manager, plugins, partition_expr_fetcher, - max_concurrent_queries, #[cfg(feature = "enterprise")] extension_range_provider_factory: None, } @@ -230,13 +227,14 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { let total_memory = get_total_memory_bytes().max(0) as u64; let scan_memory_limit = config.scan_memory_limit.resolve(total_memory) as usize; let scan_memory_tracker = - 
QueryMemoryTracker::new(scan_memory_limit, self.max_concurrent_queries) - .with_on_update(|usage| { + QueryMemoryTracker::builder(scan_memory_limit, config.scan_memory_on_exhausted) + .on_update(|usage| { SCAN_MEMORY_USAGE_BYTES.set(usage as i64); }) - .with_on_reject(|| { + .on_reject(|| { SCAN_REQUESTS_REJECTED_TOTAL.inc(); - }); + }) + .build(); let inner = EngineInner { workers, @@ -285,7 +283,6 @@ impl MitoEngine { file_ref_manager, partition_expr_fetcher, plugins, - 0, // Default: no limit on concurrent queries ); builder.try_build().await } @@ -1212,8 +1209,8 @@ impl RegionEngine for MitoEngine { .map_err(BoxedError::new) } - fn register_query_memory_permit(&self) -> Option> { - Some(Arc::new(self.inner.scan_memory_tracker.register_permit())) + fn query_memory_tracker(&self) -> Option { + Some(self.inner.scan_memory_tracker.clone()) } async fn get_committed_sequence( @@ -1378,13 +1375,15 @@ impl MitoEngine { let wal_raw_entry_reader = Arc::new(LogStoreRawEntryReader::new(log_store.clone())); let total_memory = get_total_memory_bytes().max(0) as u64; let scan_memory_limit = config.scan_memory_limit.resolve(total_memory) as usize; - let scan_memory_tracker = QueryMemoryTracker::new(scan_memory_limit, 0) - .with_on_update(|usage| { - SCAN_MEMORY_USAGE_BYTES.set(usage as i64); - }) - .with_on_reject(|| { - SCAN_REQUESTS_REJECTED_TOTAL.inc(); - }); + let scan_memory_tracker = + QueryMemoryTracker::builder(scan_memory_limit, config.scan_memory_on_exhausted) + .on_update(|usage| { + SCAN_MEMORY_USAGE_BYTES.set(usage as i64); + }) + .on_reject(|| { + SCAN_REQUESTS_REJECTED_TOTAL.inc(); + }) + .build(); Ok(MitoEngine { inner: Arc::new(EngineInner { workers: WorkerGroup::start_for_test( diff --git a/src/query/src/dummy_catalog.rs b/src/query/src/dummy_catalog.rs index 239cf7cea8..7ce85afbbb 100644 --- a/src/query/src/dummy_catalog.rs +++ b/src/query/src/dummy_catalog.rs @@ -187,8 +187,8 @@ impl TableProvider for DummyTableProvider { .handle_query(self.region_id, 
request.clone()) .await .map_err(|e| DataFusionError::External(Box::new(e)))?; - let query_memory_permit = self.engine.register_query_memory_permit(); - let mut scan_exec = RegionScanExec::new(scanner, request, query_memory_permit)?; + let query_memory_tracker = self.engine.query_memory_tracker(); + let mut scan_exec = RegionScanExec::new(scanner, request, query_memory_tracker)?; if let Some(query_ctx) = &self.query_ctx { scan_exec.set_explain_verbose(query_ctx.explain_verbose()); } diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index b3f460d01d..115c841f93 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -23,7 +23,7 @@ use api::greptime_proto::v1::meta::{GrantedRegion as PbGrantedRegion, RegionRole use api::region::RegionResponse; use async_trait::async_trait; use common_error::ext::BoxedError; -use common_recordbatch::{EmptyRecordBatchStream, MemoryPermit, SendableRecordBatchStream}; +use common_recordbatch::{EmptyRecordBatchStream, QueryMemoryTracker, SendableRecordBatchStream}; use common_time::Timestamp; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, PhysicalExpr}; @@ -886,8 +886,8 @@ pub trait RegionEngine: Send + Sync { request: ScanRequest, ) -> Result; - /// Registers and returns a query memory permit. - fn register_query_memory_permit(&self) -> Option> { + /// Returns the query memory tracker for scan execution. 
+ fn query_memory_tracker(&self) -> Option { None } diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs index e2d8f794da..83319f2688 100644 --- a/src/table/src/table/scan.rs +++ b/src/table/src/table/scan.rs @@ -20,7 +20,7 @@ use std::time::Instant; use common_error::ext::BoxedError; use common_recordbatch::{ - DfRecordBatch, DfSendableRecordBatchStream, MemoryPermit, MemoryTrackedStream, + DfRecordBatch, DfSendableRecordBatchStream, MemoryTrackedStream, QueryMemoryTracker, SendableRecordBatchStream, }; use common_telemetry::tracing::Span; @@ -67,7 +67,7 @@ pub struct RegionScanExec { // TODO(ruihang): handle TimeWindowed dist via this parameter distribution: Option, explain_verbose: bool, - query_memory_permit: Option>, + query_memory_tracker: Option, } impl std::fmt::Debug for RegionScanExec { @@ -91,7 +91,7 @@ impl RegionScanExec { pub fn new( scanner: RegionScannerRef, request: ScanRequest, - query_memory_permit: Option>, + query_memory_tracker: Option, ) -> DfResult { let arrow_schema = scanner.schema().arrow_schema().clone(); let scanner_props = scanner.properties(); @@ -226,7 +226,7 @@ impl RegionScanExec { is_partition_set: false, distribution: request.distribution, explain_verbose: false, - query_memory_permit, + query_memory_tracker, }) } @@ -299,7 +299,7 @@ impl RegionScanExec { is_partition_set: true, distribution: self.distribution, explain_verbose: self.explain_verbose, - query_memory_permit: self.query_memory_permit.clone(), + query_memory_tracker: self.query_memory_tracker.clone(), }) } @@ -387,8 +387,8 @@ impl ExecutionPlan for RegionScanExec { .scan_partition(&ctx, &self.metric, partition) .map_err(|e| DataFusionError::External(Box::new(e)))?; - let stream = if let Some(permit) = &self.query_memory_permit { - Box::pin(MemoryTrackedStream::new(stream, permit.clone())) + let stream = if let Some(tracker) = &self.query_memory_tracker { + Box::pin(MemoryTrackedStream::new(stream, tracker.clone())) } else { stream }; diff --git 
a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 7ae59ae9fc..05a34eb5b7 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1548,6 +1548,7 @@ sst_write_buffer_size = "8MiB" parallel_scan_channel_size = 32 max_concurrent_scan_files = 384 allow_stale_entries = false +scan_memory_on_exhausted = "fail" min_compaction_interval = "0s" default_experimental_flat_format = false From d7bc5ad16bf86735e987016e0597850a1d1cf26c Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:00:22 +0800 Subject: [PATCH 043/195] feat: add incremental read context and scan boundaries (#7848) * feat: add incremental read context and scan boundaries Signed-off-by: discord9 * chore: per review Signed-off-by: discord9 * docs: explain field Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/common/meta/src/ddl/tests/create_flow.rs | 22 ++ src/common/meta/src/rpc/ddl.rs | 137 ++++++- src/file-engine/src/engine.rs | 2 +- src/operator/src/utils.rs | 40 ++ src/query/src/error.rs | 10 +- src/query/src/options.rs | 362 +++++++++++++++++++ src/session/src/context.rs | 39 ++ src/store-api/src/region_engine.rs | 11 + src/store-api/src/storage/requests.rs | 21 ++ src/table/src/table/scan.rs | 7 +- 10 files changed, 637 insertions(+), 14 deletions(-) diff --git a/src/common/meta/src/ddl/tests/create_flow.rs b/src/common/meta/src/ddl/tests/create_flow.rs index 8803f98e0d..5b22c81857 100644 --- a/src/common/meta/src/ddl/tests/create_flow.rs +++ b/src/common/meta/src/ddl/tests/create_flow.rs @@ -37,6 +37,8 @@ fn test_query_context() -> QueryContext { timezone: "UTC".to_string(), extensions: HashMap::new(), channel: 0, + snapshot_seqs: HashMap::new(), + sst_min_sequences: HashMap::new(), } } @@ -251,6 +253,10 @@ fn test_create_flow_data_new_format_serialization() { catalog: "new_catalog".to_string(), schema: "new_schema".to_string(), timezone: "America/New_York".to_string(), + 
extensions: HashMap::new(), + channel: 0, + snapshot_seqs: HashMap::new(), + sst_min_sequences: HashMap::new(), }; let data = CreateFlowData { @@ -272,6 +278,9 @@ fn test_create_flow_data_new_format_serialization() { assert_eq!(deserialized.flow_context.catalog, "new_catalog"); assert_eq!(deserialized.flow_context.schema, "new_schema"); assert_eq!(deserialized.flow_context.timezone, "America/New_York"); + assert_eq!(deserialized.flow_context.channel, 0); + assert_eq!(deserialized.flow_context.snapshot_seqs, HashMap::new()); + assert_eq!(deserialized.flow_context.sst_min_sequences, HashMap::new()); } #[test] @@ -286,6 +295,8 @@ fn test_flow_query_context_conversion_from_query_context() { ] .into(), channel: 99, + snapshot_seqs: HashMap::from([(1, 10)]), + sst_min_sequences: HashMap::from([(1, 8)]), }; let flow_context: FlowQueryContext = query_context.into(); @@ -293,6 +304,9 @@ fn test_flow_query_context_conversion_from_query_context() { assert_eq!(flow_context.catalog, "prod_catalog"); assert_eq!(flow_context.schema, "public"); assert_eq!(flow_context.timezone, "America/Los_Angeles"); + assert_eq!(flow_context.channel, 99); + assert_eq!(flow_context.snapshot_seqs, HashMap::from([(1, 10)])); + assert_eq!(flow_context.sst_min_sequences, HashMap::from([(1, 8)])); } #[test] @@ -301,6 +315,10 @@ fn test_flow_info_conversion_with_flow_context() { catalog: "info_catalog".to_string(), schema: "info_schema".to_string(), timezone: "Europe/Berlin".to_string(), + extensions: HashMap::new(), + channel: 0, + snapshot_seqs: HashMap::new(), + sst_min_sequences: HashMap::new(), }; let data = CreateFlowData { @@ -349,6 +367,10 @@ fn test_mixed_serialization_format_support() { catalog: "test".to_string(), schema: "test".to_string(), timezone: "UTC".to_string(), + extensions: HashMap::new(), + channel: 0, + snapshot_seqs: HashMap::new(), + sst_min_sequences: HashMap::new(), }; assert_eq!(ctx_from_new, expected_new); } diff --git a/src/common/meta/src/rpc/ddl.rs 
b/src/common/meta/src/rpc/ddl.rs index 1e94a8f092..ed6f78154a 100644 --- a/src/common/meta/src/rpc/ddl.rs +++ b/src/common/meta/src/rpc/ddl.rs @@ -1432,6 +1432,12 @@ pub struct QueryContext { pub timezone: String, pub extensions: HashMap, pub channel: u8, + /// Maps region id -> snapshot upper bound sequence for that region. + #[serde(default)] + pub snapshot_seqs: HashMap, + /// Maps region id -> minimal SST sequence allowed for that region. + #[serde(default)] + pub sst_min_sequences: HashMap, } impl QueryContext { @@ -1459,6 +1465,14 @@ impl QueryContext { pub fn channel(&self) -> u8 { self.channel } + + pub fn snapshot_seqs(&self) -> &HashMap { + &self.snapshot_seqs + } + + pub fn sst_min_sequences(&self) -> &HashMap { + &self.sst_min_sequences + } } /// Lightweight query context for flow operations containing only essential fields. @@ -1466,12 +1480,24 @@ impl QueryContext { /// for flow creation and execution. #[derive(Debug, Clone, Serialize, PartialEq)] pub struct FlowQueryContext { - /// Current catalog name - needed for flow metadata and recovery + /// Current catalog name used for flow metadata and execution. pub catalog: String, - /// Current schema name - needed for table resolution during flow execution + /// Current schema name used for table resolution during flow execution. pub schema: String, - /// Timezone for timestamp operations in the flow + /// Timezone used for timestamp evaluation in the flow. pub timezone: String, + /// Query extensions carried into flow execution. + #[serde(default)] + pub extensions: HashMap, + /// Request channel propagated from the original query context. + #[serde(default)] + pub channel: u8, + /// Per-region snapshot upper bounds bound during query planning/execution. + #[serde(default)] + pub snapshot_seqs: HashMap, + /// Per-region lower SST scan bounds carried with the flow context. 
+ #[serde(default)] + pub sst_min_sequences: HashMap, } impl<'de> Deserialize<'de> for FlowQueryContext { @@ -1492,6 +1518,14 @@ impl<'de> Deserialize<'de> for FlowQueryContext { catalog: String, schema: String, timezone: String, + #[serde(default)] + extensions: HashMap, + #[serde(default)] + channel: u8, + #[serde(default)] + snapshot_seqs: HashMap, + #[serde(default)] + sst_min_sequences: HashMap, } match ContextCompat::deserialize(deserializer)? { @@ -1499,6 +1533,10 @@ impl<'de> Deserialize<'de> for FlowQueryContext { catalog: helper.catalog, schema: helper.schema, timezone: helper.timezone, + extensions: helper.extensions, + channel: helper.channel, + snapshot_seqs: helper.snapshot_seqs, + sst_min_sequences: helper.sst_min_sequences, }), ContextCompat::Full(full_ctx) => Ok(full_ctx.into()), } @@ -1507,12 +1545,19 @@ impl<'de> Deserialize<'de> for FlowQueryContext { impl From for QueryContext { fn from(pb_ctx: PbQueryContext) -> Self { + let (snapshot_seqs, sst_min_sequences) = pb_ctx + .snapshot_seqs + .map(|seqs| (seqs.snapshot_seqs, seqs.sst_min_sequences)) + .unwrap_or_default(); + Self { current_catalog: pb_ctx.current_catalog, current_schema: pb_ctx.current_schema, timezone: pb_ctx.timezone, extensions: pb_ctx.extensions, channel: pb_ctx.channel as u8, + snapshot_seqs, + sst_min_sequences, } } } @@ -1525,6 +1570,8 @@ impl From for PbQueryContext { timezone, extensions, channel, + snapshot_seqs, + sst_min_sequences, }: QueryContext, ) -> Self { PbQueryContext { @@ -1533,7 +1580,12 @@ impl From for PbQueryContext { timezone, extensions, channel: channel as u32, - snapshot_seqs: None, + snapshot_seqs: (!snapshot_seqs.is_empty() || !sst_min_sequences.is_empty()).then_some( + api::v1::SnapshotSequences { + snapshot_seqs, + sst_min_sequences, + }, + ), explain: None, } } @@ -1545,6 +1597,10 @@ impl From for FlowQueryContext { catalog: ctx.current_catalog, schema: ctx.current_schema, timezone: ctx.timezone, + extensions: ctx.extensions, + channel: ctx.channel, 
+ snapshot_seqs: ctx.snapshot_seqs, + sst_min_sequences: ctx.sst_min_sequences, } } } @@ -1555,8 +1611,10 @@ impl From for QueryContext { current_catalog: flow_ctx.catalog, current_schema: flow_ctx.schema, timezone: flow_ctx.timezone, - extensions: HashMap::new(), - channel: 0, // Use default channel for flows + extensions: flow_ctx.extensions, + channel: flow_ctx.channel, + snapshot_seqs: flow_ctx.snapshot_seqs, + sst_min_sequences: flow_ctx.sst_min_sequences, } } } @@ -1720,6 +1778,8 @@ mod tests { timezone: "UTC".to_string(), extensions, channel: 5, + snapshot_seqs: HashMap::from([(10, 100)]), + sst_min_sequences: HashMap::from([(10, 90)]), }; let flow_ctx: FlowQueryContext = query_ctx.into(); @@ -1727,6 +1787,9 @@ mod tests { assert_eq!(flow_ctx.catalog, "test_catalog"); assert_eq!(flow_ctx.schema, "test_schema"); assert_eq!(flow_ctx.timezone, "UTC"); + assert_eq!(flow_ctx.channel, 5); + assert_eq!(flow_ctx.snapshot_seqs, HashMap::from([(10, 100)])); + assert_eq!(flow_ctx.sst_min_sequences, HashMap::from([(10, 90)])); } #[test] @@ -1735,6 +1798,10 @@ mod tests { catalog: "prod_catalog".to_string(), schema: "public".to_string(), timezone: "America/New_York".to_string(), + extensions: HashMap::from([("k".to_string(), "v".to_string())]), + channel: 7, + snapshot_seqs: HashMap::from([(11, 111)]), + sst_min_sequences: HashMap::from([(11, 101)]), }; let query_ctx: QueryContext = flow_ctx.clone().into(); @@ -1742,8 +1809,13 @@ mod tests { assert_eq!(query_ctx.current_catalog, "prod_catalog"); assert_eq!(query_ctx.current_schema, "public"); assert_eq!(query_ctx.timezone, "America/New_York"); - assert!(query_ctx.extensions.is_empty()); - assert_eq!(query_ctx.channel, 0); + assert_eq!( + query_ctx.extensions, + HashMap::from([("k".to_string(), "v".to_string())]) + ); + assert_eq!(query_ctx.channel, 7); + assert_eq!(query_ctx.snapshot_seqs, HashMap::from([(11, 111)])); + assert_eq!(query_ctx.sst_min_sequences, HashMap::from([(11, 101)])); // Test roundtrip conversion let 
flow_ctx_roundtrip: FlowQueryContext = query_ctx.into(); @@ -1756,6 +1828,10 @@ mod tests { catalog: "test_catalog".to_string(), schema: "test_schema".to_string(), timezone: "UTC".to_string(), + extensions: HashMap::new(), + channel: 0, + snapshot_seqs: HashMap::new(), + sst_min_sequences: HashMap::new(), }; let serialized = serde_json::to_string(&flow_ctx).unwrap(); @@ -1776,6 +1852,10 @@ mod tests { catalog: "pb_catalog".to_string(), schema: "pb_schema".to_string(), timezone: "Asia/Tokyo".to_string(), + extensions: HashMap::from([("x".to_string(), "y".to_string())]), + channel: 6, + snapshot_seqs: HashMap::from([(3, 30)]), + sst_min_sequences: HashMap::from([(3, 21)]), }; let pb_ctx: PbQueryContext = flow_ctx.into(); @@ -1783,9 +1863,44 @@ mod tests { assert_eq!(pb_ctx.current_catalog, "pb_catalog"); assert_eq!(pb_ctx.current_schema, "pb_schema"); assert_eq!(pb_ctx.timezone, "Asia/Tokyo"); - assert!(pb_ctx.extensions.is_empty()); - assert_eq!(pb_ctx.channel, 0); - assert!(pb_ctx.snapshot_seqs.is_none()); + assert_eq!( + pb_ctx.extensions, + HashMap::from([("x".to_string(), "y".to_string())]) + ); + assert_eq!(pb_ctx.channel, 6); + assert_eq!( + pb_ctx.snapshot_seqs, + Some(api::v1::SnapshotSequences { + snapshot_seqs: HashMap::from([(3, 30)]), + sst_min_sequences: HashMap::from([(3, 21)]), + }) + ); assert!(pb_ctx.explain.is_none()); } + + #[test] + fn test_pb_query_context_roundtrip_with_snapshot_sequences() { + let pb = PbQueryContext { + current_catalog: "c1".to_string(), + current_schema: "s1".to_string(), + timezone: "UTC".to_string(), + extensions: HashMap::from([("flow.return_region_seq".to_string(), "true".to_string())]), + channel: 3, + snapshot_seqs: Some(api::v1::SnapshotSequences { + snapshot_seqs: HashMap::from([(1, 100)]), + sst_min_sequences: HashMap::from([(1, 90)]), + }), + explain: None, + }; + + let query_ctx: QueryContext = pb.clone().into(); + let pb_roundtrip: PbQueryContext = query_ctx.into(); + + assert_eq!(pb_roundtrip.current_catalog, 
pb.current_catalog); + assert_eq!(pb_roundtrip.current_schema, pb.current_schema); + assert_eq!(pb_roundtrip.timezone, pb.timezone); + assert_eq!(pb_roundtrip.extensions, pb.extensions); + assert_eq!(pb_roundtrip.channel, pb.channel); + assert_eq!(pb_roundtrip.snapshot_seqs, pb.snapshot_seqs); + } } diff --git a/src/file-engine/src/engine.rs b/src/file-engine/src/engine.rs index 693ae325df..175ebef237 100644 --- a/src/file-engine/src/engine.rs +++ b/src/file-engine/src/engine.rs @@ -94,7 +94,7 @@ impl RegionEngine for FileRegionEngine { let stream = self.handle_query(region_id, request).await?; let metadata = self.get_metadata(region_id).await?; // We don't support enabling append mode for file engine. - let scanner = Box::new(SinglePartitionScanner::new(stream, false, metadata)); + let scanner = Box::new(SinglePartitionScanner::new(stream, false, metadata, None)); Ok(scanner) } diff --git a/src/operator/src/utils.rs b/src/operator/src/utils.rs index 93da5f028e..6e9386b3fa 100644 --- a/src/operator/src/utils.rs +++ b/src/operator/src/utils.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::{Arc, RwLock}; + use common_time::Timezone; use session::context::{QueryContextBuilder, QueryContextRef}; use snafu::ResultExt; @@ -27,6 +29,8 @@ pub fn to_meta_query_context( timezone: query_context.timezone().to_string(), extensions: query_context.extensions(), channel: query_context.channel() as u8, + snapshot_seqs: query_context.snapshots(), + sst_min_sequences: query_context.sst_min_sequences(), } } @@ -43,5 +47,41 @@ pub fn try_to_session_query_context( ) .extensions(value.extensions) .channel((value.channel as u32).into()) + .snapshot_seqs(Arc::new(RwLock::new(value.snapshot_seqs))) + .sst_min_sequences(Arc::new(RwLock::new(value.sst_min_sequences))) .build()) } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::{Arc, RwLock}; + + use common_time::Timezone; + use session::context::QueryContextBuilder; + + use super::{to_meta_query_context, try_to_session_query_context}; + + #[test] + fn test_query_context_meta_roundtrip_with_sequences() { + let session_ctx = Arc::new( + QueryContextBuilder::default() + .current_catalog("c1".to_string()) + .current_schema("s1".to_string()) + .timezone(Timezone::from_tz_string("UTC").unwrap()) + .set_extension("flow.return_region_seq".to_string(), "true".to_string()) + .snapshot_seqs(Arc::new(RwLock::new(HashMap::from([(10, 100)])))) + .sst_min_sequences(Arc::new(RwLock::new(HashMap::from([(10, 90)])))) + .build(), + ); + + let meta_ctx = to_meta_query_context(session_ctx); + let roundtrip = try_to_session_query_context(meta_ctx).unwrap(); + + assert_eq!(roundtrip.current_catalog(), "c1"); + assert_eq!(roundtrip.current_schema(), "s1"); + assert_eq!(roundtrip.snapshots(), HashMap::from([(10, 100)])); + assert_eq!(roundtrip.sst_min_sequences(), HashMap::from([(10, 90)])); + assert_eq!(roundtrip.extension("flow.return_region_seq"), Some("true")); + } +} diff --git a/src/query/src/error.rs b/src/query/src/error.rs index f863a26c4a..b3a4ebeba5 100644 --- a/src/query/src/error.rs +++ 
b/src/query/src/error.rs @@ -368,6 +368,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Invalid query context extension: {}", reason))] + InvalidQueryContextExtension { + reason: String, + #[snafu(implicit)] + location: Location, + }, + #[snafu(transparent)] Datatypes { source: datatypes::error::Error, @@ -399,7 +406,8 @@ impl ErrorExt for Error { | ColumnSchemaNoDefault { .. } | CteColumnSchemaMismatch { .. } | ConvertValue { .. } - | TryIntoDuration { .. } => StatusCode::InvalidArguments, + | TryIntoDuration { .. } + | InvalidQueryContextExtension { .. } => StatusCode::InvalidArguments, BuildBackend { .. } | ListObjects { .. } => StatusCode::StorageUnavailable, diff --git a/src/query/src/options.rs b/src/query/src/options.rs index 50ca1177a5..9b60b64759 100644 --- a/src/query/src/options.rs +++ b/src/query/src/options.rs @@ -12,8 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; + use common_base::memory_limit::MemoryLimit; use serde::{Deserialize, Serialize}; +use store_api::storage::RegionId; +use table::metadata::TableId; + +use crate::error::{Error, InvalidQueryContextExtensionSnafu, Result}; + +pub const FLOW_INCREMENTAL_AFTER_SEQS: &str = "flow.incremental_after_seqs"; +pub const FLOW_INCREMENTAL_MODE: &str = "flow.incremental_mode"; +pub const FLOW_RETURN_REGION_SEQ: &str = "flow.return_region_seq"; +pub const FLOW_SINK_TABLE_ID: &str = "flow.sink_table_id"; + +pub const FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY: &str = "memtable_only"; /// Query engine config #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -39,3 +52,352 @@ impl Default for QueryOptions { } } } + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FlowIncrementalMode { + MemtableOnly, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct FlowQueryExtensions { + /// Maps region id -> lower exclusive sequence bound for incremental reads. 
+ pub incremental_after_seqs: Option>, + /// Incremental read mode requested by the caller. + pub incremental_mode: Option, + /// Whether the caller expects per-region watermark metadata in terminal metrics. + pub return_region_seq: bool, + /// Optional sink table id used to distinguish source scans from sink reads. + pub sink_table_id: Option, +} + +impl FlowQueryExtensions { + pub fn from_extensions(extensions: &HashMap) -> Result { + let incremental_mode = extensions + .get(FLOW_INCREMENTAL_MODE) + .map(|value| match value.as_str() { + v if v.eq_ignore_ascii_case(FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY) => { + Ok(FlowIncrementalMode::MemtableOnly) + } + _ => Err(invalid_query_context_extension(format!( + "Invalid value for {}: {}", + FLOW_INCREMENTAL_MODE, value + ))), + }) + .transpose()?; + + let incremental_after_seqs = extensions + .get(FLOW_INCREMENTAL_AFTER_SEQS) + .map(|value| parse_incremental_after_seqs(value.as_str())) + .transpose()?; + + let return_region_seq = extensions + .get(FLOW_RETURN_REGION_SEQ) + .map(|value| parse_bool(value.as_str())) + .transpose()? 
+ .unwrap_or(false); + + let sink_table_id = extensions + .get(FLOW_SINK_TABLE_ID) + .map(|value| { + value.parse::().map_err(|_| { + invalid_query_context_extension(format!( + "Invalid value for {}: {}", + FLOW_SINK_TABLE_ID, value + )) + }) + }) + .transpose()?; + + if matches!(incremental_mode, Some(FlowIncrementalMode::MemtableOnly)) { + let after_seqs = incremental_after_seqs.as_ref().ok_or_else(|| { + invalid_query_context_extension(format!( + "{} is required when {}={}.", + FLOW_INCREMENTAL_AFTER_SEQS, + FLOW_INCREMENTAL_MODE, + FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY + )) + })?; + if after_seqs.is_empty() { + return Err(invalid_query_context_extension(format!( + "{} must not be empty when {}={}.", + FLOW_INCREMENTAL_AFTER_SEQS, + FLOW_INCREMENTAL_MODE, + FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY + ))); + } + } + + Ok(Self { + incremental_after_seqs, + incremental_mode, + return_region_seq, + sink_table_id, + }) + } + + pub fn validate_for_scan(&self, source_region_id: RegionId) -> Result { + if self.sink_table_id.is_some() && self.sink_table_id == Some(source_region_id.table_id()) { + return Ok(false); + } + + if matches!( + self.incremental_mode, + Some(FlowIncrementalMode::MemtableOnly) + ) { + let after_seqs = self.incremental_after_seqs.as_ref().ok_or_else(|| { + invalid_query_context_extension(format!( + "{} is required when {}=memtable_only.", + FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE + )) + })?; + + if !after_seqs.contains_key(&source_region_id.as_u64()) { + return Err(invalid_query_context_extension(format!( + "Missing region {} in {} when {}=memtable_only.", + source_region_id, FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE + ))); + } + } + + Ok(self.incremental_after_seqs.is_some()) + } + + pub fn should_collect_region_watermark(&self) -> bool { + self.return_region_seq || self.incremental_after_seqs.is_some() + } +} + +fn parse_incremental_after_seqs(value: &str) -> Result> { + let raw = serde_json::from_str::>(value).map_err(|e| { + 
invalid_query_context_extension(format!( + "Invalid JSON for {}: {} ({})", + FLOW_INCREMENTAL_AFTER_SEQS, value, e + )) + })?; + + raw.into_iter() + .map(|(region_id, raw_seq)| { + let region_id = region_id.parse::().map_err(|_| { + invalid_query_context_extension(format!( + "Invalid region id in {}: {}", + FLOW_INCREMENTAL_AFTER_SEQS, region_id + )) + })?; + + let seq = match raw_seq { + serde_json::Value::Number(num) => num.as_u64().ok_or_else(|| { + invalid_query_context_extension(format!( + "Invalid sequence value in {} for region {}: {}", + FLOW_INCREMENTAL_AFTER_SEQS, region_id, num + )) + })?, + serde_json::Value::String(s) => s.parse::().map_err(|_| { + invalid_query_context_extension(format!( + "Invalid sequence string in {} for region {}: {}", + FLOW_INCREMENTAL_AFTER_SEQS, region_id, s + )) + })?, + _ => { + return Err(invalid_query_context_extension(format!( + "Invalid sequence value type in {} for region {}", + FLOW_INCREMENTAL_AFTER_SEQS, region_id + ))); + } + }; + + Ok((region_id, seq)) + }) + .collect() +} + +fn parse_bool(value: &str) -> Result { + match value { + v if v.eq_ignore_ascii_case("true") => Ok(true), + v if v.eq_ignore_ascii_case("false") => Ok(false), + _ => Err(invalid_query_context_extension(format!( + "Invalid value for {}: {}", + FLOW_RETURN_REGION_SEQ, value + ))), + } +} + +fn invalid_query_context_extension(reason: String) -> Error { + InvalidQueryContextExtensionSnafu { reason }.build() +} + +#[cfg(test)] +mod flow_extension_tests { + use super::*; + + #[test] + fn test_parse_flow_extensions_default() { + let exts = HashMap::new(); + let parsed = FlowQueryExtensions::from_extensions(&exts).unwrap(); + + assert_eq!(parsed.incremental_mode, None); + assert_eq!(parsed.incremental_after_seqs, None); + assert!(!parsed.return_region_seq); + assert_eq!(parsed.sink_table_id, None); + } + + #[test] + fn test_parse_flow_extensions_memtable_only_success() { + let exts = HashMap::from([ + ( + FLOW_INCREMENTAL_MODE.to_string(), + 
FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(), + ), + ( + FLOW_INCREMENTAL_AFTER_SEQS.to_string(), + r#"{"1":10,"2":20}"#.to_string(), + ), + (FLOW_RETURN_REGION_SEQ.to_string(), "true".to_string()), + (FLOW_SINK_TABLE_ID.to_string(), "1024".to_string()), + ]); + + let parsed = FlowQueryExtensions::from_extensions(&exts).unwrap(); + assert_eq!( + parsed.incremental_mode, + Some(FlowIncrementalMode::MemtableOnly) + ); + assert_eq!( + parsed.incremental_after_seqs.unwrap(), + HashMap::from([(1, 10), (2, 20)]) + ); + assert!(parsed.return_region_seq); + assert_eq!(parsed.sink_table_id, Some(1024)); + } + + #[test] + fn test_parse_flow_extensions_mode_requires_after_seqs() { + let exts = HashMap::from([( + FLOW_INCREMENTAL_MODE.to_string(), + FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(), + )]); + + let err = FlowQueryExtensions::from_extensions(&exts).unwrap_err(); + assert!(format!("{err}").contains(FLOW_INCREMENTAL_AFTER_SEQS)); + } + + #[test] + fn test_parse_flow_extensions_invalid_mode() { + let exts = HashMap::from([(FLOW_INCREMENTAL_MODE.to_string(), "foo".to_string())]); + + let err = FlowQueryExtensions::from_extensions(&exts).unwrap_err(); + assert!(format!("{err}").contains(FLOW_INCREMENTAL_MODE)); + } + + #[test] + fn test_parse_flow_extensions_invalid_after_seqs_json() { + let exts = HashMap::from([ + ( + FLOW_INCREMENTAL_MODE.to_string(), + FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(), + ), + ( + FLOW_INCREMENTAL_AFTER_SEQS.to_string(), + "not-json".to_string(), + ), + ]); + + let err = FlowQueryExtensions::from_extensions(&exts).unwrap_err(); + assert!(format!("{err}").contains(FLOW_INCREMENTAL_AFTER_SEQS)); + } + + #[test] + fn test_parse_flow_extensions_after_seqs_string_values() { + let exts = HashMap::from([( + FLOW_INCREMENTAL_AFTER_SEQS.to_string(), + r#"{"1":"10","2":"20"}"#.to_string(), + )]); + + let parsed = FlowQueryExtensions::from_extensions(&exts).unwrap(); + assert_eq!( + parsed.incremental_after_seqs.unwrap(), + HashMap::from([(1, 
10), (2, 20)]) + ); + } + + #[test] + fn test_parse_flow_extensions_after_seqs_invalid_value_type() { + let exts = HashMap::from([( + FLOW_INCREMENTAL_AFTER_SEQS.to_string(), + r#"{"1":true}"#.to_string(), + )]); + + let err = FlowQueryExtensions::from_extensions(&exts).unwrap_err(); + assert!(format!("{err}").contains(FLOW_INCREMENTAL_AFTER_SEQS)); + } + + #[test] + fn test_parse_flow_extensions_invalid_sink_table_id() { + let exts = HashMap::from([(FLOW_SINK_TABLE_ID.to_string(), "x".to_string())]); + + let err = FlowQueryExtensions::from_extensions(&exts).unwrap_err(); + assert!(format!("{err}").contains(FLOW_SINK_TABLE_ID)); + } + + #[test] + fn test_validate_for_scan_missing_source_region() { + let source_region_id = RegionId::new(100, 2); + let existing_region_id = RegionId::new(100, 1); + let exts = HashMap::from([ + ( + FLOW_INCREMENTAL_MODE.to_string(), + FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(), + ), + ( + FLOW_INCREMENTAL_AFTER_SEQS.to_string(), + format!(r#"{{"{}":10}}"#, existing_region_id.as_u64()), + ), + ]); + + let parsed = FlowQueryExtensions::from_extensions(&exts).unwrap(); + let err = parsed.validate_for_scan(source_region_id).unwrap_err(); + assert!(format!("{err}").contains("Missing region")); + } + + #[test] + fn test_validate_for_scan_sink_table_excluded() { + let source_region_id = RegionId::new(1024, 1); + let exts = HashMap::from([ + ( + FLOW_INCREMENTAL_MODE.to_string(), + FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(), + ), + ( + FLOW_INCREMENTAL_AFTER_SEQS.to_string(), + format!(r#"{{"{}":10}}"#, source_region_id.as_u64()), + ), + (FLOW_SINK_TABLE_ID.to_string(), "1024".to_string()), + ]); + + let parsed = FlowQueryExtensions::from_extensions(&exts).unwrap(); + let apply_incremental = parsed.validate_for_scan(source_region_id).unwrap(); + assert!(!apply_incremental); + } + + #[test] + fn test_should_collect_region_watermark_defaults_false() { + let parsed = FlowQueryExtensions::default(); + 
assert!(!parsed.should_collect_region_watermark()); + } + + #[test] + fn test_should_collect_region_watermark_true_for_return_region_seq() { + let parsed = FlowQueryExtensions { + return_region_seq: true, + ..Default::default() + }; + assert!(parsed.should_collect_region_watermark()); + } + + #[test] + fn test_should_collect_region_watermark_true_for_incremental_query() { + let parsed = FlowQueryExtensions { + incremental_after_seqs: Some(HashMap::from([(1, 10)])), + ..Default::default() + }; + assert!(parsed.should_collect_region_watermark()); + } +} diff --git a/src/session/src/context.rs b/src/session/src/context.rs index 2b9483aca8..5f16ea8b5a 100644 --- a/src/session/src/context.rs +++ b/src/session/src/context.rs @@ -433,10 +433,21 @@ impl QueryContext { self.snapshot_seqs.read().unwrap().clone() } + pub fn sst_min_sequences(&self) -> HashMap { + self.sst_min_sequences.read().unwrap().clone() + } + pub fn get_snapshot(&self, region_id: u64) -> Option { self.snapshot_seqs.read().unwrap().get(®ion_id).cloned() } + pub fn set_snapshot(&self, region_id: u64, sequence: u64) { + self.snapshot_seqs + .write() + .unwrap() + .insert(region_id, sequence); + } + /// Returns `true` if the session can cast strings to numbers in MySQL style. 
pub fn auto_string_to_numeric(&self) -> bool { matches!(self.channel, Channel::Mysql) @@ -669,6 +680,8 @@ impl ConfigurationVariables { #[cfg(test)] mod test { + use std::collections::HashMap; + use common_catalog::consts::DEFAULT_CATALOG_NAME; use super::*; @@ -704,4 +717,30 @@ mod test { let context = QueryContext::with(DEFAULT_CATALOG_NAME, "test"); assert_eq!("test", context.get_db_string()); } + + #[test] + fn test_api_query_context_roundtrip_with_sequences() { + let api_ctx = api::v1::QueryContext { + current_catalog: "c1".to_string(), + current_schema: "s1".to_string(), + timezone: "UTC".to_string(), + extensions: HashMap::from([("flow.return_region_seq".to_string(), "true".to_string())]), + channel: Channel::Grpc as u32, + snapshot_seqs: Some(api::v1::SnapshotSequences { + snapshot_seqs: HashMap::from([(1, 100)]), + sst_min_sequences: HashMap::from([(1, 90)]), + }), + explain: None, + }; + + let session_ctx: QueryContext = api_ctx.clone().into(); + let roundtrip_api: api::v1::QueryContext = session_ctx.into(); + + assert_eq!(roundtrip_api.current_catalog, api_ctx.current_catalog); + assert_eq!(roundtrip_api.current_schema, api_ctx.current_schema); + assert_eq!(roundtrip_api.timezone, api_ctx.timezone); + assert_eq!(roundtrip_api.extensions, api_ctx.extensions); + assert_eq!(roundtrip_api.channel, api_ctx.channel); + assert_eq!(roundtrip_api.snapshot_seqs, api_ctx.snapshot_seqs); + } } diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index 115c841f93..287f64d225 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -462,6 +462,10 @@ pub trait RegionScanner: Debug + DisplayAs + Send { /// Sets whether the scanner is reading a logical region. 
fn set_logical_region(&mut self, logical_region: bool); + + fn snapshot_sequence(&self) -> Option { + None + } } pub type RegionScannerRef = Box; @@ -945,6 +949,7 @@ pub struct SinglePartitionScanner { schema: SchemaRef, properties: ScannerProperties, metadata: RegionMetadataRef, + snapshot_sequence: Option, } impl SinglePartitionScanner { @@ -953,6 +958,7 @@ impl SinglePartitionScanner { stream: SendableRecordBatchStream, append_mode: bool, metadata: RegionMetadataRef, + snapshot_sequence: Option, ) -> Self { let schema = stream.schema(); Self { @@ -960,6 +966,7 @@ impl SinglePartitionScanner { schema, properties: ScannerProperties::default().with_append_mode(append_mode), metadata, + snapshot_sequence, } } } @@ -1019,6 +1026,10 @@ impl RegionScanner for SinglePartitionScanner { fn set_logical_region(&mut self, logical_region: bool) { self.properties.set_logical_region(logical_region); } + + fn snapshot_sequence(&self) -> Option { + self.snapshot_sequence + } } impl DisplayAs for SinglePartitionScanner { diff --git a/src/store-api/src/storage/requests.rs b/src/store-api/src/storage/requests.rs index 6725de92e3..d072ec1b39 100644 --- a/src/store-api/src/storage/requests.rs +++ b/src/store-api/src/storage/requests.rs @@ -112,6 +112,8 @@ pub struct ScanRequest { /// Optional constraint on the sequence number of the rows to read. /// If set, only rows with a sequence number **lesser or equal** to this value /// will be returned. + /// This is the effective memtable upper bound used by the scan, whether provided + /// explicitly or bound on scan open. pub memtable_max_sequence: Option, /// Optional constraint on the minimal sequence number in the memtable. /// If set, only the memtables that contain sequences **greater than** this value will be scanned @@ -119,6 +121,8 @@ pub struct ScanRequest { /// Optional constraint on the minimal sequence number in the SST files. /// If set, only the SST files that contain sequences greater than this value will be scanned. 
pub sst_min_sequence: Option, + /// Whether to bind the effective snapshot upper bound when opening the scan. + pub snapshot_on_scan: bool, /// Optional hint for the distribution of time-series data. pub distribution: Option, /// Optional hint for KNN vector search. When set, the scan should use @@ -195,6 +199,14 @@ impl Display for ScanRequest { sst_min_sequence )?; } + if self.snapshot_on_scan { + write!( + f, + "{}snapshot_on_scan: {}", + delimiter.as_str(), + self.snapshot_on_scan + )?; + } if let Some(distribution) = &self.distribution { write!(f, "{}distribution: {}", delimiter.as_str(), distribution)?; } @@ -278,5 +290,14 @@ mod tests { request.to_string(), "ScanRequest { force_flat_format: true }" ); + + let request = ScanRequest { + snapshot_on_scan: true, + ..Default::default() + }; + assert_eq!( + request.to_string(), + "ScanRequest { snapshot_on_scan: true }" + ); } } diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs index 83319f2688..02511456ae 100644 --- a/src/table/src/table/scan.rs +++ b/src/table/src/table/scan.rs @@ -599,7 +599,12 @@ mod test { .primary_key(vec![1]); let region_metadata = Arc::new(builder.build().unwrap()); - let scanner = Box::new(SinglePartitionScanner::new(stream, false, region_metadata)); + let scanner = Box::new(SinglePartitionScanner::new( + stream, + false, + region_metadata, + None, + )); let plan = RegionScanExec::new(scanner, ScanRequest::default(), None).unwrap(); let actual: SchemaRef = Arc::new( plan.properties From 1118fe243f8010579b3afc73cebf6c54f48e84ea Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 26 Mar 2026 15:01:47 +0800 Subject: [PATCH 044/195] test: filter on region_peers table (#7864) Signed-off-by: Ruihang Xia --- .../information_schema/region_peers.rs | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/src/catalog/src/system_schema/information_schema/region_peers.rs b/src/catalog/src/system_schema/information_schema/region_peers.rs index 
b1438ef53d..9eddb061f8 100644 --- a/src/catalog/src/system_schema/information_schema/region_peers.rs +++ b/src/catalog/src/system_schema/information_schema/region_peers.rs @@ -331,3 +331,87 @@ impl DfPartitionStream for InformationSchemaRegionPeers { )) } } + +#[cfg(test)] +mod tests { + use api::v1::meta::Peer; + use arrow::array::AsArray; + use common_meta::rpc::router::{Region, RegionRoute}; + use datafusion::common::ScalarValue; + use datafusion::logical_expr::{BinaryExpr, Expr, Operator, col}; + use store_api::storage::{RegionId, ScanRequest}; + + use super::*; + + fn new_region_route(table_id: u32, region_number: u32, peer_id: u64) -> RegionRoute { + RegionRoute { + region: Region { + id: RegionId::new(table_id, region_number), + ..Default::default() + }, + leader_peer: Some(Peer { + id: peer_id, + addr: format!("127.0.0.1:{}", 3000 + peer_id), + }), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + write_route_policy: None, + } + } + + #[test] + fn test_add_region_peers_predicate_filters_correctly() { + let schema = InformationSchemaRegionPeers::schema(); + let mut builder = InformationSchemaRegionPeersBuilder::new( + schema, + "greptime".to_string(), + Weak::::new(), + ); + + let table_id = 1; + // 3 regions: region_number 0, 1, 2 + let routes = vec![ + new_region_route(table_id, 0, 1), + new_region_route(table_id, 1, 2), + new_region_route(table_id, 2, 3), + ]; + + // Build a predicate that matches only the last region (region_number=2). + // With the old `return` bug, encountering the first non-matching region + // (region_number=0) would exit add_region_peers entirely, so region_number=2 + // would never be found. 
+ let target_region_id = RegionId::new(table_id, 2).as_u64(); + let filter = Expr::BinaryExpr(BinaryExpr::new( + Box::new(col(REGION_ID)), + Operator::Eq, + Box::new(Expr::Literal( + ScalarValue::UInt64(Some(target_region_id)), + None, + )), + )); + let request = ScanRequest { + filters: vec![filter], + ..Default::default() + }; + let predicates = Predicates::from_scan_request(&Some(request)); + + builder.add_region_peers( + "greptime", + "public", + "test_table", + &predicates, + table_id, + &routes, + ); + + let batch = builder.finish().unwrap(); + // Should have exactly 1 row for the matching region + assert_eq!(batch.num_rows(), 1); + // Verify it's the correct region + let region_id_col = batch + .column(3) + .as_primitive::(); + assert_eq!(region_id_col.value(0), target_region_id); + } +} From 08ded45c7a37f712fb790bb3c386ca251b701e34 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 26 Mar 2026 18:18:06 +0800 Subject: [PATCH 045/195] feat: add common_version customization (#7869) * feat: add product name customization * chore: update tests --- Cargo.lock | 1 + src/cmd/Cargo.toml | 1 + src/cmd/src/bin/greptime.rs | 10 +++++----- src/cmd/src/cli.rs | 2 +- src/cmd/src/datanode.rs | 2 +- src/cmd/src/flownode.rs | 3 ++- src/cmd/src/frontend.rs | 2 +- src/cmd/src/metasrv.rs | 3 ++- src/cmd/src/standalone.rs | 3 ++- src/common/function/src/system/version.rs | 9 +++++++-- src/common/version/build.rs | 8 ++++++++ src/common/version/src/lib.rs | 4 ++++ src/servers/src/mysql/federated.rs | 5 +++-- src/servers/src/postgres.rs | 6 +++++- tests/cases/standalone/common/mysql.result | 2 +- 15 files changed, 44 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3000970b3..d8f6241136 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2055,6 +2055,7 @@ dependencies = [ "common-time", "common-version", "common-wal", + "const_format", "datafusion", "datafusion-common", "datafusion-physical-plan", diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 
d547ec6e81..003f1434f4 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -54,6 +54,7 @@ common-telemetry = { workspace = true, features = [ common-time.workspace = true common-version.workspace = true common-wal.workspace = true +const_format.workspace = true datafusion.workspace = true datafusion-common.workspace = true datafusion-physical-plan.workspace = true diff --git a/src/cmd/src/bin/greptime.rs b/src/cmd/src/bin/greptime.rs index a34d8e0f38..7ddc2cd176 100644 --- a/src/cmd/src/bin/greptime.rs +++ b/src/cmd/src/bin/greptime.rs @@ -20,11 +20,11 @@ use cmd::error::{InitTlsProviderSnafu, Result}; use cmd::options::GlobalOptions; use cmd::{App, cli, datanode, flownode, frontend, metasrv, standalone}; use common_base::Plugins; -use common_version::{verbose_version, version}; +use common_version::{product_name, verbose_version, version}; use servers::install_ring_crypto_provider; #[derive(Parser)] -#[command(name = "greptime", author, version, long_version = verbose_version(), about)] +#[command(name = product_name(), author, version, long_version = verbose_version(), about)] #[command(propagate_version = true)] pub(crate) struct Command { #[clap(subcommand)] @@ -52,11 +52,11 @@ enum SubCommand { #[clap(name = "metasrv")] Metasrv(metasrv::Command), - /// Run greptimedb as a standalone service. + /// Start service in standalone mode. #[clap(name = "standalone")] Standalone(standalone::Command), - /// Execute the cli tools for greptimedb. + /// Execute the cli tools. 
#[clap(name = "cli")] Cli(cli::Command), } @@ -148,7 +148,7 @@ async fn start(cli: Command) -> Result<()> { fn setup_human_panic() { human_panic::setup_panic!( - human_panic::Metadata::new("GreptimeDB", version()) + human_panic::Metadata::new(product_name(), version()) .homepage("https://github.com/GreptimeTeam/greptimedb/discussions") ); diff --git a/src/cmd/src/cli.rs b/src/cmd/src/cli.rs index 84e797c291..501b7b1615 100644 --- a/src/cmd/src/cli.rs +++ b/src/cmd/src/cli.rs @@ -21,7 +21,7 @@ use tracing_appender::non_blocking::WorkerGuard; use crate::options::GlobalOptions; use crate::{App, Result, error}; -pub const APP_NAME: &str = "greptime-cli"; +pub const APP_NAME: &str = const_format::concatcp!(common_version::product_name(), "-cli"); use async_trait::async_trait; pub struct Instance { diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 06e2568b72..2fadb1d210 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -43,7 +43,7 @@ use crate::error::{ }; use crate::options::{GlobalOptions, GreptimeOptions}; -pub const APP_NAME: &str = "greptime-datanode"; +pub const APP_NAME: &str = const_format::concatcp!(common_version::product_name(), "-datanode"); type DatanodeOptions = GreptimeOptions; diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index 3f8458cddf..8e3277cdb3 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -35,6 +35,7 @@ use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; +use const_format::concatcp; use flow::{ FlownodeBuilder, FlownodeInstance, FlownodeServiceBuilder, FrontendClient, FrontendInvoker, get_flow_auth_options, @@ -52,7 +53,7 @@ use crate::error::{ use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_heap_profile}; -pub const APP_NAME: &str = "greptime-flownode"; 
+pub const APP_NAME: &str = concatcp!(common_version::product_name(), "-flownode"); type FlownodeOptions = GreptimeOptions; diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index cb802791c5..07c9f775f2 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -72,7 +72,7 @@ pub struct Instance { _guard: Vec, } -pub const APP_NAME: &str = "greptime-frontend"; +pub const APP_NAME: &str = const_format::concatcp!(common_version::product_name(), "-frontend"); impl Instance { pub fn new(frontend: Frontend, _guard: Vec) -> Self { diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index 2ce5fb3a02..dec9edc193 100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -24,6 +24,7 @@ use common_meta::distributed_time_constants::init_distributed_time_constants; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; +use const_format::concatcp; use meta_srv::bootstrap::{MetasrvInstance, metasrv_builder}; use meta_srv::metasrv::BackendImpl; use snafu::ResultExt; @@ -35,7 +36,7 @@ use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_hea type MetasrvOptions = GreptimeOptions; -pub const APP_NAME: &str = "greptime-metasrv"; +pub const APP_NAME: &str = concatcp!(common_version::product_name(), "-metasrv"); pub struct Instance { instance: MetasrvInstance, diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 215bea0ec5..196ff07c92 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -48,6 +48,7 @@ use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_time::timezone::set_default_timezone; use common_version::{short_version, verbose_version}; +use const_format::concatcp; use datanode::config::DatanodeOptions; use datanode::datanode::{Datanode, DatanodeBuilder}; use datanode::region_server::RegionServer; @@ -75,7 +76,7 @@ use 
crate::error::{OtherSnafu, Result, StartFlownodeSnafu}; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, error, log_versions, maybe_activate_heap_profile}; -pub const APP_NAME: &str = "greptime-standalone"; +pub const APP_NAME: &str = concatcp!(common_version::product_name(), "-standalone"); #[derive(Parser)] pub struct Command { diff --git a/src/common/function/src/system/version.rs b/src/common/function/src/system/version.rs index 1c148bd7ab..9170fb29f1 100644 --- a/src/common/function/src/system/version.rs +++ b/src/common/function/src/system/version.rs @@ -43,14 +43,19 @@ impl Function for VersionFunction { let version = match func_ctx.query_ctx.channel() { Channel::Mysql => { format!( - "{}-greptimedb-{}", + "{}-{}-{}", std::env::var("GREPTIMEDB_MYSQL_SERVER_VERSION") .unwrap_or_else(|_| "8.4.2".to_string()), + common_version::product_name(), common_version::version() ) } Channel::Postgres => { - format!("PostgreSQL 16.3 GreptimeDB {}", common_version::version()) + format!( + "PostgreSQL 16.3 {} {}", + common_version::product_name(), + common_version::version() + ) } _ => common_version::version().to_string(), }; diff --git a/src/common/version/build.rs b/src/common/version/build.rs index f147918c21..81c805484b 100644 --- a/src/common/version/build.rs +++ b/src/common/version/build.rs @@ -29,6 +29,7 @@ fn main() -> Result<(), Box> { let refresh = profile == "release"; println!("cargo:rerun-if-env-changed=RUSTC"); + println!("cargo:rerun-if-env-changed=GREPTIME_PRODUCT_NAME"); // The "CARGO_WORKSPACE_DIR" is set manually (not by Rust itself) in Cargo config file, to // solve the problem where the "CARGO_MANIFEST_DIR" is not what we want when this repo is @@ -44,6 +45,9 @@ fn main() -> Result<(), Box> { let product_version = load_product_version(&workspace_root); println!("cargo:rustc-env=GREPTIME_PRODUCT_VERSION={product_version}"); + let product_name = load_product_name(); + 
println!("cargo:rustc-env=GREPTIME_PRODUCT_NAME={product_name}"); + let repository = open_repository(&workspace_root); if refresh { @@ -100,6 +104,10 @@ fn load_product_version(workspace_root: &Path) -> String { .unwrap_or_else(|| env::var("CARGO_PKG_VERSION").unwrap()) } +fn load_product_name() -> String { + env::var("GREPTIME_PRODUCT_NAME").unwrap_or_else(|_| "GreptimeDB".to_string()) +} + fn emit_workspace_watch_list( workspace_root: &Path, repository: Option<&Repository>, diff --git a/src/common/version/src/lib.rs b/src/common/version/src/lib.rs index 82d64f532f..e651225d68 100644 --- a/src/common/version/src/lib.rs +++ b/src/common/version/src/lib.rs @@ -109,6 +109,10 @@ pub const fn version() -> &'static str { BUILD_INFO.version } +pub const fn product_name() -> &'static str { + env!("GREPTIME_PRODUCT_NAME") +} + pub const fn verbose_version() -> &'static str { const_format::formatcp!( "\nbranch: {}\ncommit: {}\nclean: {}\nversion: {}", diff --git a/src/servers/src/mysql/federated.rs b/src/servers/src/mysql/federated.rs index 92085f1e58..fa3aec144c 100644 --- a/src/servers/src/mysql/federated.rs +++ b/src/servers/src/mysql/federated.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use common_query::Output; use common_recordbatch::RecordBatches; use common_time::timezone::system_timezone_name; +use common_version; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; use datatypes::vectors::StringVector; @@ -119,7 +120,7 @@ static VAR_VALUES: Lazy> = Lazy::new(|| { ("interactive_timeout", "31536000"), ("wait_timeout", "31536000"), ("net_write_timeout", "31536000"), - ("version_comment", "Greptime"), + ("version_comment", common_version::product_name()), ]) }); @@ -380,7 +381,7 @@ mod test { +-------------------+ | @@version_comment | +-------------------+ -| Greptime | +| GreptimeDB | +-------------------+"; test(query, expected); diff --git a/src/servers/src/postgres.rs b/src/servers/src/postgres.rs index 7533fe084c..58ef4fdd7b 
100644 --- a/src/servers/src/postgres.rs +++ b/src/servers/src/postgres.rs @@ -50,7 +50,11 @@ pub(crate) struct GreptimeDBStartupParameters { impl GreptimeDBStartupParameters { fn new() -> GreptimeDBStartupParameters { GreptimeDBStartupParameters { - version: format!("16.3-greptimedb-{}", common_version::version()), + version: format!( + "16.3-{}-{}", + common_version::product_name(), + common_version::version() + ), } } } diff --git a/tests/cases/standalone/common/mysql.result b/tests/cases/standalone/common/mysql.result index 232cd3aed2..7414d2290d 100644 --- a/tests/cases/standalone/common/mysql.result +++ b/tests/cases/standalone/common/mysql.result @@ -13,7 +13,7 @@ SELECT @@version_comment; +-------------------+ | @@version_comment | +-------------------+ -| Greptime | +| GreptimeDB | +-------------------+ -- SQLNESS PROTOCOL MYSQL From d3c8df70f512d16358315b93c9a4d1303ec32b7e Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 27 Mar 2026 01:52:18 +0800 Subject: [PATCH 046/195] fix: fix SeriesScan verbose mode mising metrics (#7872) Signed-off-by: evenyag --- src/mito2/src/read/series_scan.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index 2d6994d0af..5109120d92 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -168,7 +168,7 @@ impl SeriesScan { } ); - self.maybe_start_distributor(metrics_set, &self.metrics_list); + self.maybe_start_distributor(metrics_set, &self.metrics_list, ctx.explain_verbose); let mut receiver = self.take_receiver(partition)?; let stream = try_stream! 
{ @@ -214,6 +214,7 @@ impl SeriesScan { &self, metrics_set: &ExecutionPlanMetricsSet, metrics_list: &Arc, + explain_verbose: bool, ) { let mut rx_list = self.receivers.lock().unwrap(); if !rx_list.is_empty() { @@ -229,6 +230,7 @@ impl SeriesScan { senders, metrics_set: metrics_set.clone(), metrics_list: metrics_list.clone(), + explain_verbose, }; let region_id = distributor.stream_ctx.input.mapper.metadata().region_id; let span = tracing::info_span!("SeriesScan::distributor", region_id = %region_id); @@ -430,6 +432,8 @@ struct SeriesDistributor { /// distributor. metrics_set: ExecutionPlanMetricsSet, metrics_list: Arc, + /// Whether to use verbose logging and collect detailed metrics. + explain_verbose: bool, } impl SeriesDistributor { @@ -470,7 +474,7 @@ impl SeriesDistributor { let part_metrics = new_partition_metrics( &self.stream_ctx, - false, + self.explain_verbose, &self.metrics_set, self.partitions.len(), &self.metrics_list, @@ -576,7 +580,7 @@ impl SeriesDistributor { let part_metrics = new_partition_metrics( &self.stream_ctx, - false, + self.explain_verbose, &self.metrics_set, self.partitions.len(), &self.metrics_list, From b57dfc18dc74a0ea6a0af2436c90f061d6987ead Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Fri, 27 Mar 2026 10:19:00 +0800 Subject: [PATCH 047/195] feat: pending rows batching for metrics (#7831) * feat: metric batch 2s PoC Signed-off-by: jeremyhi * chore: max_concurrent_flushes Signed-off-by: jeremyhi * chore: work channel size Signed-off-by: jeremyhi * feat(servers): add metrics and logs for pending rows batch flush Add the `FLUSH_ELAPSED` histogram metric to track the duration of pending rows batch flushes in the Prometheus store protocol handler. This provides better observability into the performance and latency of the batcher. Also update telemetry by: - Recording elapsed time for both successful and failed flush operations. 
- Adding an informational log upon successful flush including row count and duration. - Including elapsed time in error logs when a flush fails. Signed-off-by: Lei, HUANG * feat(servers): implement columnar batching for pending rows Refactor PendingRowsBatcher to use columnar batching for the metrics store. Incoming RowInsertRequests are now converted to RecordBatches, partitioned, and flushed via BulkInsert requests to datanodes. - Enhance MultiDimPartitionRule to handle scalar boolean predicates. - Add metrics for tracking flush failures and dropped rows. - Update dependencies to support columnar batching in servers. Signed-off-by: Lei, HUANG * feat(servers): add backpressure for pending rows Implement backpressure in PendingRowsBatcher by limiting in-flight requests with a semaphore and making the submission wait for the flush result. This ensures Prometheus write requests are throttled and only return once the data has been successfully flushed to datanodes. - Add max_inflight_requests to PromStoreOptions. - Use oneshot channels to notify submitters of flush completion. - Limit concurrent requests using a new inflight_semaphore. - Update PendingRowsBatcher::submit to wait for the flush outcome. Signed-off-by: Lei, HUANG * feat: add stage-level metrics for bulk ingestion Introduce histograms to track the elapsed time of various stages in the metric engine bulk insert path and the server's pending rows batcher. This provides better observability into the performance bottlenecks of the ingestion pipeline. Signed-off-by: Lei, HUANG * - `src/metric-engine/src/engine/bulk_insert.rs`: Removed the fallback mechanism that converted record batches to rows when bulk inserts were unsupported, along with related helper functions and unused imports. - `src/operator/src/insert.rs`: Removed an unused import (`common_time::TimeToLive::Instant`). 
Signed-off-by: Lei, HUANG * feat(servers): columnar Prom remote write Optimize the Prometheus remote write path by allowing direct conversion from decoded Prometheus samples to Arrow RecordBatches. This bypasses intermediate row-based representations when `PendingRowsBatcher` is active and no pipeline is used, improving ingestion efficiency. - Implement `as_record_batch_groups` in `TablesBuilder` and `PromWriteRequest`. - Add `submit_prom_record_batch_groups` to `PendingRowsBatcher`. - Introduce `DecodedPromWriteRequest` in `prom_store`. - Implement row-to-RecordBatch conversion logic in `prom_row_builder`. Signed-off-by: Lei, HUANG * Revert "feat(servers): columnar Prom remote write" This reverts commit efbb63c12a3e7fcec03858ea0351efd94fec8242. * refactor(servers): improve row to RecordBatch conversion - Use `snafu::ensure` for row validation in `rows_to_record_batch`. - Add explicit type hint for `MutableVector` to improve clarity. - Reorganize and clean up imports in `pending_rows_batcher.rs`. Signed-off-by: Lei, HUANG * perf(servers): use arrow builders for row conversion This commit optimizes the conversion from `api::v1::Rows` to `RecordBatch` by using Arrow builders directly. This avoids the overhead of `MutableVector` and `common_recordbatch`, leading to better performance in the `pending_rows_batcher`. Additionally, the `#[allow(dead_code)]` attribute is removed from `modify_batch_sparse` in the metric engine as it is now utilized. Signed-off-by: Lei, HUANG * perf(metric-engine): optimize batch modification Optimize `modify_batch_sparse` by reusing buffers, using Arrow builders, and employing fast-path encoding methods. This reduces allocations and avoids redundant downcasting and serializer overhead. 
Signed-off-by: Lei, HUANG * feat/metric-engine-support-bulk: **Add Environment Variable for Batch Sync Control** - `pending_rows_batcher.rs`: Introduced an environment variable `PENDING_ROWS_BATCH_SYNC` to control the synchronization behavior of batch processing. If set to true, the function will wait for the flush result; otherwise, it will return immediately with the total rows count. Signed-off-by: Lei, HUANG * wip Signed-off-by: Lei, HUANG * chore: update and fix clippy Signed-off-by: Lei, HUANG * fix: failing test Signed-off-by: Lei, HUANG * picking-pending-rows-batcher: ### Commit Message Remove Unused Code and Simplify Error Handling - **`src/error.rs`**: Removed the `BatcherQueueFull` error variant and its associated logic, simplifying the error handling by removing unused code. - **`src/http/prom_store.rs`**: Eliminated the `try_decompress` function, streamlining the decompression logic by directly using `snappy_decompress` in `decode_remote_read_request`. Signed-off-by: Lei, HUANG * chore: parse PENDING_ROWS_BATCH_SYNC once Signed-off-by: Lei, HUANG * chore: revert unrelated changes Signed-off-by: Lei, HUANG * **Refactor Prometheus Write Handling** - **`prom_store.rs`**: Introduced `pre_write` method in `PromStoreProtocolHandler` to handle pre-write checks for Prometheus remote write requests. Updated `write` method to utilize `pre_write`. - **`server.rs`**: Modified `PendingRowsBatcher` initialization to conditionally create a batcher based on `with_metric_engine` flag. - **`http/prom_store.rs`**: Integrated `pre_write` checks before submitting requests to `PendingRowsBatcher`. - **`query_handler.rs`**: Added `pre_write` method to `PromStoreProtocolHandler` trait for pre-write operations. Signed-off-by: Lei, HUANG * picking-pending-rows-batcher: - **Fix Label Typo**: Corrected a typo in the label value from `"flush_wn ite_region"` to `"flush_write_region"` in `pending_rows_batcher.rs`. 
- **Refactor Array Building Logic**: Introduced a macro `build_array!` to streamline the construction of `ArrayRef` for different data types, reducing code duplication in `pending_rows_batcher.rs`. Signed-off-by: Lei, HUANG * format toml Signed-off-by: Lei, HUANG * picking-pending-rows-batcher: ### Update PromStore and PendingRowsBatcher Configuration - **`prom_store.rs`**: Set `pending_rows_flush_interval` to `Duration::ZERO` to disable automatic flushing. - **`pending_rows_batcher.rs`**: Enhance validation to disable the batcher when `flush_interval` is zero or configuration values like `max_batch_rows`, `max_concurrent_flushes`, `worker_channel_capacity`, or `max_inflight_requests` are zero, preventing potential panics or deadlocks. Signed-off-by: Lei, HUANG * picking-pending-rows-batcher: ### Update `pending_rows_flush_interval` to Zero - **Files Modified**: - `src/frontend/src/service_config/prom_store.rs` - `tests-integration/tests/http.rs` - **Key Changes**: - Updated `pending_rows_flush_interval` from `Duration::from_secs(2)` to `Duration::ZERO` in `prom_store.rs`. - Changed `pending_rows_flush_interval` configuration from `"2s"` to `"0s"` in `http.rs`. These changes set the flush interval to zero, potentially affecting how frequently pending rows are flushed. Signed-off-by: Lei, HUANG * picking-pending-rows-batcher: **Add Worker Management Enhancements** - **`metrics.rs`**: Introduced `PENDING_WORKERS` gauge to track active pending rows batch workers. - **`pending_rows_batcher.rs`**: - Added worker idle timeout logic with `WORKER_IDLE_TIMEOUT_MULTIPLIER`. - Implemented worker management functions: `spawn_worker`, `remove_worker_if_same_channel`, and `should_close_worker_on_idle_timeout`. - Enhanced worker lifecycle management to handle idle workers and ensure proper cleanup. - **Tests**: Added unit tests for worker removal and idle timeout logic. 
Signed-off-by: Lei, HUANG * fix: clippy Signed-off-by: Lei, HUANG --------- Signed-off-by: jeremyhi Signed-off-by: Lei, HUANG Co-authored-by: jeremyhi --- Cargo.lock | 2 + src/frontend/src/instance/prom_store.rs | 19 +- src/frontend/src/server.rs | 16 + src/frontend/src/service_config/prom_store.rs | 55 +- src/metric-engine/src/batch_modifier.rs | 17 +- src/partition/Cargo.toml | 1 + src/servers/Cargo.toml | 1 + src/servers/src/error.rs | 4 + src/servers/src/http.rs | 5 +- src/servers/src/http/prom_store.rs | 24 +- src/servers/src/lib.rs | 1 + src/servers/src/metrics.rs | 51 +- src/servers/src/pending_rows_batcher.rs | 1253 +++++++++++++++++ src/servers/src/query_handler.rs | 5 + src/servers/tests/http/prom_store_test.rs | 2 +- tests-integration/src/test_util.rs | 1 + tests-integration/tests/http.rs | 5 + 17 files changed, 1437 insertions(+), 25 deletions(-) create mode 100644 src/servers/src/pending_rows_batcher.rs diff --git a/Cargo.lock b/Cargo.lock index d8f6241136..676eaf0822 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9384,6 +9384,7 @@ dependencies = [ "common-macro", "common-meta", "common-query", + "common-telemetry", "criterion 0.7.0", "datafusion-common", "datafusion-expr", @@ -12067,6 +12068,7 @@ dependencies = [ "operator", "otel-arrow-rust", "parking_lot 0.12.4", + "partition", "permutation", "pg_interval_2", "pgwire", diff --git a/src/frontend/src/instance/prom_store.rs b/src/frontend/src/instance/prom_store.rs index 9a323eb989..c8f76753af 100644 --- a/src/frontend/src/instance/prom_store.rs +++ b/src/frontend/src/instance/prom_store.rs @@ -161,12 +161,11 @@ impl Instance { #[async_trait] impl PromStoreProtocolHandler for Instance { - async fn write( + async fn pre_write( &self, - request: RowInsertRequests, + request: &RowInsertRequests, ctx: QueryContextRef, - with_metric_engine: bool, - ) -> ServerResult { + ) -> ServerResult<()> { self.plugins .get::() .as_ref() @@ -175,7 +174,17 @@ impl PromStoreProtocolHandler for Instance { let interceptor_ref 
= self .plugins .get::>(); - interceptor_ref.pre_write(&request, ctx.clone())?; + interceptor_ref.pre_write(request, ctx)?; + Ok(()) + } + + async fn write( + &self, + request: RowInsertRequests, + ctx: QueryContextRef, + with_metric_engine: bool, + ) -> ServerResult { + self.pre_write(&request, ctx.clone()).await?; let output = if with_metric_engine { let physical_table = ctx diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs index 4b51efbd33..4d0db700d1 100644 --- a/src/frontend/src/server.rs +++ b/src/frontend/src/server.rs @@ -37,6 +37,7 @@ use servers::interceptor::LogIngestInterceptorRef; use servers::metrics_handler::MetricsHandler; use servers::mysql::server::{MysqlServer, MysqlSpawnConfig, MysqlSpawnRef}; use servers::otel_arrow::OtelArrowServiceHandler; +use servers::pending_rows_batcher::PendingRowsBatcher; use servers::postgres::PostgresServer; use servers::request_memory_limiter::ServerMemoryLimiter; use servers::server::{Server, ServerHandlers}; @@ -124,12 +125,27 @@ where } if opts.prom_store.enable { + let pending_rows_batcher = if opts.prom_store.with_metric_engine { + PendingRowsBatcher::try_new( + self.instance.partition_manager().clone(), + self.instance.node_manager().clone(), + self.instance.catalog_manager().clone(), + opts.prom_store.pending_rows_flush_interval, + opts.prom_store.max_batch_rows, + opts.prom_store.max_concurrent_flushes, + opts.prom_store.worker_channel_capacity, + opts.prom_store.max_inflight_requests, + ) + } else { + None + }; builder = builder .with_prom_handler( self.instance.clone(), Some(self.instance.clone()), opts.prom_store.with_metric_engine, opts.http.prom_validation_mode, + pending_rows_batcher, ) .with_prometheus_handler(self.instance.clone()); } diff --git a/src/frontend/src/service_config/prom_store.rs b/src/frontend/src/service_config/prom_store.rs index b3adf889d2..99f1eada6d 100644 --- a/src/frontend/src/service_config/prom_store.rs +++ b/src/frontend/src/service_config/prom_store.rs @@ 
-12,12 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::time::Duration; + use serde::{Deserialize, Serialize}; #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct PromStoreOptions { pub enable: bool, pub with_metric_engine: bool, + #[serde(default, with = "humantime_serde")] + pub pending_rows_flush_interval: Duration, + #[serde(default = "default_max_batch_rows")] + pub max_batch_rows: usize, + #[serde(default = "default_max_concurrent_flushes")] + pub max_concurrent_flushes: usize, + #[serde(default = "default_worker_channel_capacity")] + pub worker_channel_capacity: usize, + #[serde(default = "default_max_inflight_requests")] + pub max_inflight_requests: usize, +} + +fn default_max_batch_rows() -> usize { + 100_000 +} + +fn default_max_concurrent_flushes() -> usize { + 256 +} + +fn default_worker_channel_capacity() -> usize { + 65526 +} + +fn default_max_inflight_requests() -> usize { + 3000 } impl Default for PromStoreOptions { @@ -25,18 +53,43 @@ impl Default for PromStoreOptions { Self { enable: true, with_metric_engine: true, + pending_rows_flush_interval: Duration::ZERO, + max_batch_rows: default_max_batch_rows(), + max_concurrent_flushes: default_max_concurrent_flushes(), + worker_channel_capacity: default_worker_channel_capacity(), + max_inflight_requests: default_max_inflight_requests(), } } } #[cfg(test)] mod tests { + use std::time::Duration; + use super::PromStoreOptions; + use crate::service_config::prom_store::{ + default_max_batch_rows, default_max_concurrent_flushes, default_max_inflight_requests, + default_worker_channel_capacity, + }; #[test] fn test_prom_store_options() { let default = PromStoreOptions::default(); assert!(default.enable); - assert!(default.with_metric_engine) + assert!(default.with_metric_engine); + assert_eq!(default.pending_rows_flush_interval, Duration::ZERO); + assert_eq!(default.max_batch_rows, default_max_batch_rows()); + 
assert_eq!( + default.max_concurrent_flushes, + default_max_concurrent_flushes() + ); + assert_eq!( + default.worker_channel_capacity, + default_worker_channel_capacity() + ); + assert_eq!( + default.max_inflight_requests, + default_max_inflight_requests() + ); } } diff --git a/src/metric-engine/src/batch_modifier.rs b/src/metric-engine/src/batch_modifier.rs index 8a5774889b..d06eaa976b 100644 --- a/src/metric-engine/src/batch_modifier.rs +++ b/src/metric-engine/src/batch_modifier.rs @@ -18,12 +18,11 @@ use std::sync::Arc; use datatypes::arrow::array::{Array, BinaryBuilder, StringArray, UInt64Array}; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use datatypes::arrow::record_batch::RecordBatch; -use datatypes::value::ValueRef; use fxhash::FxHasher; use mito_codec::row_converter::SparsePrimaryKeyCodec; use snafu::ResultExt; use store_api::storage::ColumnId; -use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId}; +use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME; use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu}; @@ -112,7 +111,6 @@ fn build_tag_arrays<'a>( } /// Modifies a RecordBatch for sparse primary key encoding. 
-#[allow(dead_code)] pub(crate) fn modify_batch_sparse( batch: RecordBatch, table_id: u32, @@ -128,24 +126,17 @@ pub(crate) fn modify_batch_sparse( let mut buffer = Vec::new(); for row in 0..num_rows { buffer.clear(); - let internal = [ - (ReservedColumnId::table_id(), ValueRef::UInt32(table_id)), - ( - ReservedColumnId::tsid(), - ValueRef::UInt64(tsid_array.value(row)), - ), - ]; codec - .encode_to_vec(internal.into_iter(), &mut buffer) + .encode_internal(table_id, tsid_array.value(row), &mut buffer) .context(EncodePrimaryKeySnafu)?; let tags = sorted_tag_columns .iter() .zip(tag_arrays.iter()) .filter(|(_, arr)| !arr.is_null(row)) - .map(|(tc, arr)| (tc.column_id, ValueRef::String(arr.value(row)))); + .map(|(tc, arr)| (tc.column_id, arr.value(row).as_bytes())); codec - .encode_to_vec(tags, &mut buffer) + .encode_raw_tag_value(tags, &mut buffer) .context(EncodePrimaryKeySnafu)?; pk_builder.append_value(&buffer); diff --git a/src/partition/Cargo.toml b/src/partition/Cargo.toml index d498ed8c13..a8e3a8ae11 100644 --- a/src/partition/Cargo.toml +++ b/src/partition/Cargo.toml @@ -15,6 +15,7 @@ common-error.workspace = true common-macro.workspace = true common-meta.workspace = true common-query.workspace = true +common-telemetry.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true datafusion-physical-expr.workspace = true diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 8e84ef77d6..6531390ca3 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -88,6 +88,7 @@ opentelemetry-proto.workspace = true operator.workspace = true otel-arrow-rust.workspace = true parking_lot.workspace = true +partition.workspace = true pg_interval = { version = "0.5.2", package = "pg_interval_2" } pgwire = { version = "0.38.2", default-features = false, features = [ "server-api-ring", diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 18ac964f05..5fae7a82db 100644 --- a/src/servers/src/error.rs +++ 
b/src/servers/src/error.rs @@ -56,6 +56,9 @@ pub enum Error { #[snafu(display("Internal error: {}", err_msg))] Internal { err_msg: String }, + #[snafu(display("Pending rows batcher channel closed"))] + BatcherChannelClosed, + #[snafu(display("Unsupported data type: {}, reason: {}", data_type, reason))] UnsupportedDataType { data_type: ConcreteDataType, @@ -684,6 +687,7 @@ impl ErrorExt for Error { use Error::*; match self { Internal { .. } + | BatcherChannelClosed | InternalIo { .. } | TokioIo { .. } | StartHttp { .. } diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 506a240cac..eb2086726a 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::convert::Infallible; use std::fmt::Display; use std::net::SocketAddr; -use std::sync::Mutex as StdMutex; +use std::sync::{Arc, Mutex as StdMutex}; use std::time::Duration; use async_trait::async_trait; @@ -75,6 +75,7 @@ use crate::http::result::null_result::NullResponse; use crate::interceptor::LogIngestInterceptorRef; use crate::metrics::http_metrics_layer; use crate::metrics_handler::MetricsHandler; +use crate::pending_rows_batcher::PendingRowsBatcher; use crate::prometheus_handler::PrometheusHandlerRef; use crate::query_handler::sql::ServerSqlQueryHandlerRef; use crate::query_handler::{ @@ -585,12 +586,14 @@ impl HttpServerBuilder { pipeline_handler: Option, prom_store_with_metric_engine: bool, prom_validation_mode: PromValidationMode, + pending_rows_batcher: Option>, ) -> Self { let state = PromStoreState { prom_store_handler: handler, pipeline_handler, prom_store_with_metric_engine, prom_validation_mode, + pending_rows_batcher, }; Self { diff --git a/src/servers/src/http/prom_store.rs b/src/servers/src/http/prom_store.rs index 58c6e0eddd..bfc072e84e 100644 --- a/src/servers/src/http/prom_store.rs +++ b/src/servers/src/http/prom_store.rs @@ -35,6 +35,7 @@ use snafu::prelude::*; use crate::error::{self, InternalSnafu, PipelineSnafu, 
Result}; use crate::http::extractor::PipelineInfo; use crate::http::header::{GREPTIME_DB_HEADER_METRICS, write_cost_header_map}; +use crate::pending_rows_batcher::PendingRowsBatcher; use crate::prom_remote_write::decode::PromSeriesProcessor; use crate::prom_remote_write::decode_remote_write_request; use crate::prom_remote_write::validation::PromValidationMode; @@ -52,6 +53,7 @@ pub struct PromStoreState { pub pipeline_handler: Option, pub prom_store_with_metric_engine: bool, pub prom_validation_mode: PromValidationMode, + pub pending_rows_batcher: Option>, } #[derive(Debug, Serialize, Deserialize)] @@ -92,6 +94,7 @@ pub async fn remote_write( pipeline_handler, prom_store_with_metric_engine, prom_validation_mode, + pending_rows_batcher, } = state; if let Some(_vm_handshake) = params.get_vm_proto_version { @@ -100,9 +103,11 @@ pub async fn remote_write( let db = params.db.clone().unwrap_or_default(); query_ctx.set_channel(Channel::Prometheus); - if let Some(physical_table) = params.physical_table { - query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table); - } + let physical_table = params + .physical_table + .clone() + .unwrap_or_else(|| GREPTIME_PHYSICAL_TABLE.to_string()); + query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table.clone()); let query_ctx = Arc::new(query_ctx); let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED .with_label_values(&[db.as_str()]) @@ -135,6 +140,19 @@ pub async fn remote_write( req.as_insert_requests() }; + if prom_store_with_metric_engine && let Some(batcher) = pending_rows_batcher { + for (temp_ctx, reqs) in req.as_req_iter(query_ctx) { + prom_store_handler + .pre_write(&reqs, temp_ctx.clone()) + .await?; + let rows = batcher.submit(reqs, temp_ctx).await?; + crate::metrics::PROM_STORE_REMOTE_WRITE_SAMPLES + .with_label_values(&[db.as_str()]) + .inc_by(rows); + } + return Ok((StatusCode::NO_CONTENT, write_cost_header_map(0)).into_response()); + } + let mut cost = 0; for (temp_ctx, reqs) in 
req.as_req_iter(query_ctx) { let cnt: u64 = reqs diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index 9ee7395691..c44c674b9e 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -41,6 +41,7 @@ pub mod mysql; pub mod opentsdb; pub mod otel_arrow; pub mod otlp; +pub mod pending_rows_batcher; mod pipeline; pub mod postgres; pub mod prom_remote_write; diff --git a/src/servers/src/metrics.rs b/src/servers/src/metrics.rs index 25a900ed3d..37f923b73d 100644 --- a/src/servers/src/metrics.rs +++ b/src/servers/src/metrics.rs @@ -121,13 +121,62 @@ lazy_static! { /// Duration to convert prometheus write request to gRPC request. pub static ref METRIC_HTTP_PROM_STORE_CONVERT_ELAPSED: Histogram = METRIC_HTTP_PROM_STORE_CODEC_ELAPSED .with_label_values(&["convert"]); - /// The samples count of Prometheus remote write. + /// The samples count of Prometheus remote write. pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounterVec = register_int_counter_vec!( "greptime_servers_prometheus_remote_write_samples", "frontend prometheus remote write samples", &[METRIC_DB_LABEL] ) .unwrap(); + pub static ref PENDING_BATCHES: IntGauge = register_int_gauge!( + "greptime_prom_store_pending_batches", + "Number of pending batches waiting to be flushed" + ) + .unwrap(); + pub static ref PENDING_ROWS: IntGauge = register_int_gauge!( + "greptime_prom_store_pending_rows", + "Number of pending rows waiting to be flushed" + ) + .unwrap(); + pub static ref PENDING_WORKERS: IntGauge = register_int_gauge!( + "greptime_prom_store_pending_workers", + "Number of active pending rows batch workers" + ) + .unwrap(); + pub static ref FLUSH_TOTAL: IntCounter = register_int_counter!( + "greptime_prom_store_flush_total", + "Total number of batch flushes" + ) + .unwrap(); + pub static ref FLUSH_ROWS: Histogram = register_histogram!( + "greptime_prom_store_flush_rows", + "Number of rows per flush", + vec![100.0, 1000.0, 10000.0, 50000.0, 100000.0, 500000.0] + ) + .unwrap(); + pub 
static ref FLUSH_ELAPSED: Histogram = register_histogram!( + "greptime_prom_store_flush_elapsed", + "Elapsed time of pending rows batch flush in seconds", + vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0] + ) + .unwrap(); + pub static ref FLUSH_DROPPED_ROWS: IntCounter = register_int_counter!( + "greptime_pending_rows_flush_dropped_rows", + "Total rows dropped due to pending rows flush failures" + ) + .unwrap(); + pub static ref FLUSH_FAILURES: IntCounter = register_int_counter!( + "greptime_pending_rows_flush_failures", + "Total pending rows flush failures" + ) + .unwrap(); + pub static ref PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED: HistogramVec = register_histogram_vec!( + "greptime_prom_store_pending_rows_batch_ingest_stage_elapsed", + "Elapsed time of pending rows batch ingestion stages in seconds", + &["stage"], + vec![0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0] + ) + .unwrap(); /// Http prometheus read duration per database. pub static ref METRIC_HTTP_PROM_STORE_READ_ELAPSED: HistogramVec = register_histogram_vec!( "greptime_servers_http_prometheus_read_elapsed", diff --git a/src/servers/src/pending_rows_batcher.rs b/src/servers/src/pending_rows_batcher.rs new file mode 100644 index 0000000000..f8486e3636 --- /dev/null +++ b/src/servers/src/pending_rows_batcher.rs @@ -0,0 +1,1253 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use api::helper::ColumnDataTypeWrapper; +use api::v1::region::{ + BulkInsertRequest, RegionRequest, RegionRequestHeader, bulk_insert_request, region_request, +}; +use api::v1::value::ValueData; +use api::v1::{ArrowIpc, RowInsertRequests, Rows}; +use arrow::array::{ + ArrayRef, Float64Builder, StringBuilder, TimestampMicrosecondBuilder, + TimestampMillisecondBuilder, TimestampNanosecondBuilder, TimestampSecondBuilder, + new_null_array, +}; +use arrow::compute::{cast, concat_batches, filter_record_batch}; +use arrow::datatypes::{Field, Schema as ArrowSchema}; +use arrow::record_batch::RecordBatch; +use arrow_schema::TimeUnit; +use bytes::Bytes; +use catalog::CatalogManagerRef; +use common_grpc::flight::{FlightEncoder, FlightMessage}; +use common_meta::node_manager::NodeManagerRef; +use common_query::prelude::GREPTIME_PHYSICAL_TABLE; +use common_telemetry::tracing_context::TracingContext; +use common_telemetry::{debug, error, info, warn}; +use dashmap::DashMap; +use dashmap::mapref::entry::Entry; +use datatypes::data_type::DataType; +use datatypes::prelude::ConcreteDataType; +use partition::manager::PartitionRuleManagerRef; +use session::context::QueryContextRef; +use snafu::{ResultExt, ensure}; +use store_api::storage::RegionId; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, broadcast, mpsc, oneshot}; + +use crate::error; +use crate::error::{Error, Result}; +use crate::metrics::{ + FLUSH_DROPPED_ROWS, FLUSH_ELAPSED, FLUSH_FAILURES, FLUSH_ROWS, FLUSH_TOTAL, PENDING_BATCHES, + PENDING_ROWS, PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED, PENDING_WORKERS, +}; + +const PHYSICAL_TABLE_KEY: &str = "physical_table"; +/// Whether wait for ingestion result before reply to client. 
+const PENDING_ROWS_BATCH_SYNC_ENV: &str = "PENDING_ROWS_BATCH_SYNC"; +const WORKER_IDLE_TIMEOUT_MULTIPLIER: u32 = 3; + +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +struct BatchKey { + catalog: String, + schema: String, + physical_table: String, +} + +#[derive(Debug)] +struct TableBatch { + table_name: String, + batches: Vec, + row_count: usize, +} + +struct PendingBatch { + tables: HashMap, + created_at: Option, + total_row_count: usize, + ctx: Option, + waiters: Vec, +} + +struct FlushWaiter { + response_tx: oneshot::Sender>, + _permit: OwnedSemaphorePermit, +} + +struct FlushBatch { + table_batches: Vec, + total_row_count: usize, + ctx: QueryContextRef, + waiters: Vec, +} + +#[derive(Clone)] +struct PendingWorker { + tx: mpsc::Sender, +} + +enum WorkerCommand { + Submit { + table_batches: Vec<(String, RecordBatch)>, + total_rows: usize, + ctx: QueryContextRef, + response_tx: oneshot::Sender>, + _permit: OwnedSemaphorePermit, + }, +} + +// Batch key is derived from QueryContext; it assumes catalog/schema/physical_table fully +// define the write target and must remain consistent across the batch. +fn batch_key_from_ctx(ctx: &QueryContextRef) -> BatchKey { + let physical_table = ctx + .extension(PHYSICAL_TABLE_KEY) + .unwrap_or(GREPTIME_PHYSICAL_TABLE) + .to_string(); + BatchKey { + catalog: ctx.current_catalog().to_string(), + schema: ctx.current_schema(), + physical_table, + } +} + +/// Prometheus remote write pending rows batcher. 
+pub struct PendingRowsBatcher { + workers: Arc>, + flush_interval: Duration, + max_batch_rows: usize, + partition_manager: PartitionRuleManagerRef, + node_manager: NodeManagerRef, + catalog_manager: CatalogManagerRef, + flush_semaphore: Arc, + inflight_semaphore: Arc, + worker_channel_capacity: usize, + pending_rows_batch_sync: bool, + shutdown: broadcast::Sender<()>, +} + +impl PendingRowsBatcher { + #[allow(clippy::too_many_arguments)] + pub fn try_new( + partition_manager: PartitionRuleManagerRef, + node_manager: NodeManagerRef, + catalog_manager: CatalogManagerRef, + flush_interval: Duration, + max_batch_rows: usize, + max_concurrent_flushes: usize, + worker_channel_capacity: usize, + max_inflight_requests: usize, + ) -> Option> { + // Disable the batcher if flush is disabled or configuration is invalid. + // Zero values for these knobs either cause panics (e.g., zero-capacity channels) + // or deadlocks (e.g., semaphores with no permits). + if flush_interval.is_zero() + || max_batch_rows == 0 + || max_concurrent_flushes == 0 + || worker_channel_capacity == 0 + || max_inflight_requests == 0 + { + return None; + } + + let (shutdown, _) = broadcast::channel(1); + let pending_rows_batch_sync = std::env::var(PENDING_ROWS_BATCH_SYNC_ENV) + .ok() + .as_deref() + .and_then(|v| v.parse::().ok()) + .unwrap_or(true); + let workers = Arc::new(DashMap::new()); + PENDING_WORKERS.set(workers.len() as i64); + + Some(Arc::new(Self { + workers, + flush_interval, + max_batch_rows, + partition_manager, + node_manager, + catalog_manager, + flush_semaphore: Arc::new(Semaphore::new(max_concurrent_flushes)), + inflight_semaphore: Arc::new(Semaphore::new(max_inflight_requests)), + worker_channel_capacity, + pending_rows_batch_sync, + shutdown, + })) + } + + pub async fn submit(&self, requests: RowInsertRequests, ctx: QueryContextRef) -> Result { + let (table_batches, total_rows) = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + 
.with_label_values(&["submit_build_table_batches"]) + .start_timer(); + build_table_batches(requests)? + }; + if total_rows == 0 { + return Ok(0); + } + let table_batches = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["submit_align_region_schema"]) + .start_timer(); + self.align_table_batches_to_region_schema(table_batches, &ctx) + .await? + }; + + let permit = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["submit_acquire_inflight_permit"]) + .start_timer(); + self.inflight_semaphore + .clone() + .acquire_owned() + .await + .map_err(|_| Error::BatcherChannelClosed)? + }; + + let (response_tx, response_rx) = oneshot::channel(); + + let batch_key = batch_key_from_ctx(&ctx); + let mut cmd = Some(WorkerCommand::Submit { + table_batches, + total_rows, + ctx, + response_tx, + _permit: permit, + }); + + { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["submit_send_to_worker"]) + .start_timer(); + + for _ in 0..2 { + let worker = self.get_or_spawn_worker(batch_key.clone()); + let Some(worker_cmd) = cmd.take() else { + break; + }; + + match worker.tx.send(worker_cmd).await { + Ok(()) => break, + Err(err) => { + cmd = Some(err.0); + remove_worker_if_same_channel( + self.workers.as_ref(), + &batch_key, + &worker.tx, + ); + } + } + } + + if cmd.is_some() { + return Err(Error::BatcherChannelClosed); + } + } + + if self.pending_rows_batch_sync { + let result = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["submit_wait_flush_result"]) + .start_timer(); + response_rx.await.map_err(|_| Error::BatcherChannelClosed)? 
+ }; + result.map(|()| total_rows as u64) + } else { + Ok(total_rows as u64) + } + } + + async fn align_table_batches_to_region_schema( + &self, + table_batches: Vec<(String, RecordBatch)>, + ctx: &QueryContextRef, + ) -> Result> { + let catalog = ctx.current_catalog().to_string(); + let schema = ctx.current_schema(); + let mut region_schemas: HashMap> = HashMap::new(); + let mut aligned_batches = Vec::with_capacity(table_batches.len()); + + for (table_name, record_batch) in table_batches { + let region_schema = if let Some(region_schema) = region_schemas.get(&table_name) { + region_schema.clone() + } else { + let table = self + .catalog_manager + .table(&catalog, &schema, &table_name, Some(ctx.as_ref())) + .await + .map_err(|err| Error::Internal { + err_msg: format!( + "Failed to resolve table {} for pending batch alignment: {}", + table_name, err + ), + })? + .ok_or_else(|| Error::Internal { + err_msg: format!( + "Table not found during pending batch alignment: {}", + table_name + ), + })?; + let region_schema = table.table_info().meta.schema.arrow_schema().clone(); + region_schemas.insert(table_name.clone(), region_schema.clone()); + region_schema + }; + + let record_batch = align_record_batch_to_schema(record_batch, region_schema.as_ref())?; + aligned_batches.push((table_name, record_batch)); + } + + Ok(aligned_batches) + } + + fn get_or_spawn_worker(&self, key: BatchKey) -> PendingWorker { + if let Some(worker) = self.workers.get(&key) + && !worker.tx.is_closed() + { + return worker.clone(); + } + + let entry = self.workers.entry(key.clone()); + match entry { + Entry::Occupied(mut worker) => { + if worker.get().tx.is_closed() { + let new_worker = self.spawn_worker(key); + worker.insert(new_worker.clone()); + PENDING_WORKERS.set(self.workers.len() as i64); + new_worker + } else { + worker.get().clone() + } + } + Entry::Vacant(vacant) => { + let worker = self.spawn_worker(key); + + vacant.insert(worker.clone()); + PENDING_WORKERS.set(self.workers.len() as i64); 
+ worker + } + } + } + + fn spawn_worker(&self, key: BatchKey) -> PendingWorker { + let (tx, rx) = mpsc::channel(self.worker_channel_capacity); + let worker = PendingWorker { tx: tx.clone() }; + let worker_idle_timeout = self + .flush_interval + .checked_mul(WORKER_IDLE_TIMEOUT_MULTIPLIER) + .unwrap_or(self.flush_interval); + + start_worker( + key, + worker.tx.clone(), + self.workers.clone(), + rx, + self.shutdown.clone(), + self.partition_manager.clone(), + self.node_manager.clone(), + self.catalog_manager.clone(), + self.flush_interval, + worker_idle_timeout, + self.max_batch_rows, + self.flush_semaphore.clone(), + ); + + worker + } +} + +impl Drop for PendingRowsBatcher { + fn drop(&mut self) { + let _ = self.shutdown.send(()); + } +} + +impl PendingBatch { + fn new() -> Self { + Self { + tables: HashMap::new(), + created_at: None, + total_row_count: 0, + ctx: None, + waiters: Vec::new(), + } + } +} + +#[allow(clippy::too_many_arguments)] +fn start_worker( + key: BatchKey, + worker_tx: mpsc::Sender, + workers: Arc>, + mut rx: mpsc::Receiver, + shutdown: broadcast::Sender<()>, + partition_manager: PartitionRuleManagerRef, + node_manager: NodeManagerRef, + catalog_manager: CatalogManagerRef, + flush_interval: Duration, + worker_idle_timeout: Duration, + max_batch_rows: usize, + flush_semaphore: Arc, +) { + tokio::spawn(async move { + let mut batch = PendingBatch::new(); + let mut interval = tokio::time::interval(flush_interval); + let mut shutdown_rx = shutdown.subscribe(); + let idle_deadline = tokio::time::Instant::now() + worker_idle_timeout; + let idle_timer = tokio::time::sleep_until(idle_deadline); + tokio::pin!(idle_timer); + + loop { + tokio::select! 
{ + cmd = rx.recv() => { + match cmd { + Some(WorkerCommand::Submit { table_batches, total_rows, ctx, response_tx, _permit }) => { + idle_timer.as_mut().reset(tokio::time::Instant::now() + worker_idle_timeout); + + if batch.total_row_count == 0 { + batch.created_at = Some(Instant::now()); + batch.ctx = Some(ctx); + PENDING_BATCHES.inc(); + } + + batch.waiters.push(FlushWaiter { response_tx, _permit }); + + for (table_name, record_batch) in table_batches { + let entry = batch.tables.entry(table_name.clone()).or_insert_with(|| TableBatch { + table_name, + batches: Vec::new(), + row_count: 0, + }); + entry.row_count += record_batch.num_rows(); + entry.batches.push(record_batch); + } + + batch.total_row_count += total_rows; + PENDING_ROWS.add(total_rows as i64); + + if batch.total_row_count >= max_batch_rows + && let Some(flush) = drain_batch(&mut batch) { + spawn_flush( + flush, + partition_manager.clone(), + node_manager.clone(), + catalog_manager.clone(), + flush_semaphore.clone(), + ).await; + } + } + None => { + if let Some(flush) = drain_batch(&mut batch) { + flush_batch( + flush, + partition_manager.clone(), + node_manager.clone(), + catalog_manager.clone(), + ).await; + } + break; + } + } + } + _ = &mut idle_timer => { + if !should_close_worker_on_idle_timeout(batch.total_row_count, rx.len()) { + idle_timer + .as_mut() + .reset(tokio::time::Instant::now() + worker_idle_timeout); + continue; + } + + debug!( + "Closing idle pending rows worker due to timeout: catalog={}, schema={}, physical_table={}", + key.catalog, + key.schema, + key.physical_table + ); + break; + } + _ = interval.tick() => { + if let Some(created_at) = batch.created_at + && batch.total_row_count > 0 + && created_at.elapsed() >= flush_interval + && let Some(flush) = drain_batch(&mut batch) { + spawn_flush( + flush, + partition_manager.clone(), + node_manager.clone(), + catalog_manager.clone(), + flush_semaphore.clone(), + ).await; + } + } + _ = shutdown_rx.recv() => { + if let Some(flush) = 
drain_batch(&mut batch) { + flush_batch( + flush, + partition_manager.clone(), + node_manager.clone(), + catalog_manager.clone(), + ).await; + } + break; + } + } + } + + remove_worker_if_same_channel(workers.as_ref(), &key, &worker_tx); + }); +} + +fn remove_worker_if_same_channel( + workers: &DashMap, + key: &BatchKey, + worker_tx: &mpsc::Sender, +) -> bool { + if let Some(worker) = workers.get(key) + && worker.tx.same_channel(worker_tx) + { + drop(worker); + workers.remove(key); + PENDING_WORKERS.set(workers.len() as i64); + return true; + } + + false +} + +fn should_close_worker_on_idle_timeout(total_row_count: usize, queued_requests: usize) -> bool { + total_row_count == 0 && queued_requests == 0 +} + +fn drain_batch(batch: &mut PendingBatch) -> Option { + if batch.total_row_count == 0 { + return None; + } + + let ctx = match batch.ctx.take() { + Some(ctx) => ctx, + None => { + flush_with_error(batch, "Pending batch missing context"); + return None; + } + }; + + let total_row_count = batch.total_row_count; + let table_batches = std::mem::take(&mut batch.tables).into_values().collect(); + let waiters = std::mem::take(&mut batch.waiters); + batch.total_row_count = 0; + batch.created_at = None; + + PENDING_ROWS.sub(total_row_count as i64); + PENDING_BATCHES.dec(); + + Some(FlushBatch { + table_batches, + total_row_count, + ctx, + waiters, + }) +} + +async fn spawn_flush( + flush: FlushBatch, + partition_manager: PartitionRuleManagerRef, + node_manager: NodeManagerRef, + catalog_manager: CatalogManagerRef, + semaphore: Arc, +) { + match semaphore.acquire_owned().await { + Ok(permit) => { + tokio::spawn(async move { + let _permit = permit; + flush_batch(flush, partition_manager, node_manager, catalog_manager).await; + }); + } + Err(err) => { + warn!(err; "Flush semaphore closed, flushing inline"); + flush_batch(flush, partition_manager, node_manager, catalog_manager).await; + } + } +} + +async fn flush_batch( + flush: FlushBatch, + partition_manager: 
PartitionRuleManagerRef, + node_manager: NodeManagerRef, + catalog_manager: CatalogManagerRef, +) { + let FlushBatch { + table_batches, + total_row_count, + ctx, + waiters, + } = flush; + let start = Instant::now(); + let mut first_error: Option = None; + + let catalog = ctx.current_catalog().to_string(); + let schema = ctx.current_schema(); + + macro_rules! record_failure { + ($row_count:expr, $msg:expr) => {{ + let msg = $msg; + if first_error.is_none() { + first_error = Some(msg.clone()); + } + mark_flush_failure($row_count, &msg); + }}; + } + + for table_batch in table_batches { + let Some(first_batch) = table_batch.batches.first() else { + continue; + }; + + let schema_ref = first_batch.schema(); + let record_batch = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_concat_table_batches"]) + .start_timer(); + match concat_batches(&schema_ref, &table_batch.batches) { + Ok(batch) => batch, + Err(err) => { + record_failure!( + table_batch.row_count, + format!( + "Failed to concat table batch {}: {:?}", + table_batch.table_name, err + ) + ); + continue; + } + } + }; + + let table = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_resolve_table"]) + .start_timer(); + match catalog_manager + .table( + &catalog, + &schema, + &table_batch.table_name, + Some(ctx.as_ref()), + ) + .await + { + Ok(Some(table)) => table, + Ok(None) => { + record_failure!( + table_batch.row_count, + format!( + "Table not found during pending flush: {}", + table_batch.table_name + ) + ); + continue; + } + Err(err) => { + record_failure!( + table_batch.row_count, + format!( + "Failed to resolve table {} for pending flush: {:?}", + table_batch.table_name, err + ) + ); + continue; + } + } + }; + let table_info = table.table_info(); + + let partition_rule = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_fetch_partition_rule"]) + .start_timer(); + match partition_manager + 
.find_table_partition_rule(&table_info) + .await + { + Ok(rule) => rule, + Err(err) => { + record_failure!( + table_batch.row_count, + format!( + "Failed to fetch partition rule for table {}: {:?}", + table_batch.table_name, err + ) + ); + continue; + } + } + }; + + let region_masks = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_split_record_batch"]) + .start_timer(); + match partition_rule.0.split_record_batch(&record_batch) { + Ok(masks) => masks, + Err(err) => { + record_failure!( + table_batch.row_count, + format!( + "Failed to split record batch for table {}: {:?}", + table_batch.table_name, err + ) + ); + continue; + } + } + }; + + for (region_number, mask) in region_masks { + if mask.select_none() { + continue; + } + + let region_batch = if mask.select_all() { + record_batch.clone() + } else { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_filter_record_batch"]) + .start_timer(); + match filter_record_batch(&record_batch, mask.array()) { + Ok(batch) => batch, + Err(err) => { + record_failure!( + table_batch.row_count, + format!( + "Failed to filter record batch for table {}: {:?}", + table_batch.table_name, err + ) + ); + continue; + } + } + }; + + let row_count = region_batch.num_rows(); + if row_count == 0 { + continue; + } + + let region_id = RegionId::new(table_info.table_id(), region_number); + let datanode = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_resolve_region_leader"]) + .start_timer(); + match partition_manager.find_region_leader(region_id).await { + Ok(peer) => peer, + Err(err) => { + record_failure!( + row_count, + format!("Failed to resolve region leader {}: {:?}", region_id, err) + ); + continue; + } + } + }; + + let (schema_bytes, data_header, payload) = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_encode_ipc"]) + .start_timer(); + match record_batch_to_ipc(region_batch) { + 
Ok(encoded) => encoded, + Err(err) => { + record_failure!( + row_count, + format!( + "Failed to encode Arrow IPC for region {}: {:?}", + region_id, err + ) + ); + continue; + } + } + }; + + let request = RegionRequest { + header: Some(RegionRequestHeader { + tracing_context: TracingContext::from_current_span().to_w3c(), + ..Default::default() + }), + body: Some(region_request::Body::BulkInsert(BulkInsertRequest { + region_id: region_id.as_u64(), + partition_expr_version: None, + body: Some(bulk_insert_request::Body::ArrowIpc(ArrowIpc { + schema: schema_bytes, + data_header, + payload, + })), + })), + }; + + let datanode = node_manager.datanode(&datanode).await; + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["flush_write_region"]) + .start_timer(); + match datanode.handle(request).await { + Ok(_) => { + FLUSH_TOTAL.inc(); + FLUSH_ROWS.observe(row_count as f64); + } + Err(err) => { + record_failure!( + row_count, + format!( + "Bulk insert flush failed for region {}: {:?}", + region_id, err + ) + ); + } + } + } + } + + let elapsed = start.elapsed().as_secs_f64(); + FLUSH_ELAPSED.observe(elapsed); + info!( + "Pending rows batch flushed, total rows: {}, elapsed time: {}s", + total_row_count, elapsed + ); + + notify_waiters(waiters, &first_error); +} + +fn notify_waiters(waiters: Vec, first_error: &Option) { + for waiter in waiters { + let result = match first_error { + Some(err_msg) => Err(Error::Internal { + err_msg: err_msg.clone(), + }), + None => Ok(()), + }; + let _ = waiter.response_tx.send(result); + // waiter._permit is dropped here, releasing the inflight semaphore slot + } +} + +fn mark_flush_failure(row_count: usize, message: &str) { + error!("Pending rows batch flush failed, message: {}", message); + FLUSH_FAILURES.inc(); + FLUSH_DROPPED_ROWS.inc_by(row_count as u64); +} + +fn flush_with_error(batch: &mut PendingBatch, message: &str) { + if batch.total_row_count == 0 { + return; + } + + let row_count = batch.total_row_count; + 
let waiters = std::mem::take(&mut batch.waiters); + batch.tables.clear(); + batch.total_row_count = 0; + batch.created_at = None; + batch.ctx = None; + + PENDING_ROWS.sub(row_count as i64); + PENDING_BATCHES.dec(); + + let err_msg = Some(message.to_string()); + notify_waiters(waiters, &err_msg); + mark_flush_failure(row_count, message); +} + +fn build_table_batches(requests: RowInsertRequests) -> Result<(Vec<(String, RecordBatch)>, usize)> { + let mut table_batches = Vec::with_capacity(requests.inserts.len()); + let mut total_rows = 0; + + for request in requests.inserts { + let Some(rows) = request.rows else { + continue; + }; + if rows.rows.is_empty() { + continue; + } + + let record_batch = rows_to_record_batch(&rows)?; + total_rows += record_batch.num_rows(); + table_batches.push((request.table_name, record_batch)); + } + + Ok((table_batches, total_rows)) +} + +fn align_record_batch_to_schema( + record_batch: RecordBatch, + target_schema: &ArrowSchema, +) -> Result { + let source_schema = record_batch.schema(); + if source_schema.as_ref() == target_schema { + return Ok(record_batch); + } + + for source_field in source_schema.fields() { + if target_schema + .column_with_name(source_field.name()) + .is_none() + { + return Err(Error::Internal { + err_msg: format!( + "Failed to align record batch schema, column '{}' not found in target schema", + source_field.name() + ), + }); + } + } + + let row_count = record_batch.num_rows(); + let mut columns = Vec::with_capacity(target_schema.fields().len()); + for target_field in target_schema.fields() { + let column = if let Some((index, source_field)) = + source_schema.column_with_name(target_field.name()) + { + let source_column = record_batch.column(index).clone(); + if source_field.data_type() == target_field.data_type() { + source_column + } else { + cast(source_column.as_ref(), target_field.data_type()).map_err(|err| { + Error::Internal { + err_msg: format!( + "Failed to cast column '{}' to target type {:?}: {}", + 
target_field.name(), + target_field.data_type(), + err + ), + } + })? + } + } else { + new_null_array(target_field.data_type(), row_count) + }; + columns.push(column); + } + + RecordBatch::try_new(Arc::new(target_schema.clone()), columns).map_err(|err| Error::Internal { + err_msg: format!("Failed to build aligned record batch: {}", err), + }) +} + +fn rows_to_record_batch(rows: &Rows) -> Result { + let row_count = rows.rows.len(); + let column_count = rows.schema.len(); + + for (idx, row) in rows.rows.iter().enumerate() { + ensure!( + row.values.len() == column_count, + error::InternalSnafu { + err_msg: format!( + "Column count mismatch in row {}, expected {}, got {}", + idx, + column_count, + row.values.len() + ) + } + ); + } + + let mut fields = Vec::with_capacity(column_count); + let mut columns = Vec::with_capacity(column_count); + + for (idx, column_schema) in rows.schema.iter().enumerate() { + let datatype_wrapper = ColumnDataTypeWrapper::try_new( + column_schema.datatype, + column_schema.datatype_extension.clone(), + )?; + let data_type = ConcreteDataType::from(datatype_wrapper); + fields.push(Field::new( + column_schema.column_name.clone(), + data_type.as_arrow_type(), + true, + )); + columns.push(build_arrow_array( + rows, + idx, + &column_schema.column_name, + data_type.as_arrow_type(), + row_count, + )?); + } + + RecordBatch::try_new(Arc::new(ArrowSchema::new(fields)), columns).context(error::ArrowSnafu) +} + +fn build_arrow_array( + rows: &Rows, + col_idx: usize, + column_name: &String, + column_data_type: arrow::datatypes::DataType, + row_count: usize, +) -> Result { + macro_rules! build_array { + ($builder:expr, $( $pattern:pat => $value:expr ),+ $(,)?) 
=> {{ + let mut builder = $builder; + for row in &rows.rows { + match row.values[col_idx].value_data.as_ref() { + $(Some($pattern) => builder.append_value($value),)+ + Some(v) => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!("Unexpected value: {:?}", v), + } + .fail(); + } + None => builder.append_null(), + } + } + Arc::new(builder.finish()) as ArrayRef + }}; + } + + let array: ArrayRef = match column_data_type { + arrow::datatypes::DataType::Float64 => { + build_array!(Float64Builder::with_capacity(row_count), ValueData::F64Value(v) => *v) + } + arrow::datatypes::DataType::Utf8 => build_array!( + StringBuilder::with_capacity(row_count, 0), + ValueData::StringValue(v) => v + ), + arrow::datatypes::DataType::Timestamp(u, _) => match u { + TimeUnit::Second => build_array!( + TimestampSecondBuilder::with_capacity(row_count), + ValueData::TimestampSecondValue(v) => *v + ), + TimeUnit::Millisecond => build_array!( + TimestampMillisecondBuilder::with_capacity(row_count), + ValueData::TimestampMillisecondValue(v) => *v + ), + TimeUnit::Microsecond => build_array!( + TimestampMicrosecondBuilder::with_capacity(row_count), + ValueData::DatetimeValue(v) => *v, + ValueData::TimestampMicrosecondValue(v) => *v + ), + TimeUnit::Nanosecond => build_array!( + TimestampNanosecondBuilder::with_capacity(row_count), + ValueData::TimestampNanosecondValue(v) => *v + ), + }, + ty => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Unexpected column type {:?}, column name: {}", + ty, column_name + ), + } + .fail(); + } + }; + + Ok(array) +} + +fn record_batch_to_ipc(record_batch: RecordBatch) -> Result<(Bytes, Bytes, Bytes)> { + let mut encoder = FlightEncoder::default(); + let schema = encoder.encode_schema(record_batch.schema().as_ref()); + let mut iter = encoder + .encode(FlightMessage::RecordBatch(record_batch)) + .into_iter(); + let Some(flight_data) = iter.next() else { + return Err(Error::Internal { + err_msg: "Failed to encode empty flight 
data".to_string(), + }); + }; + if iter.next().is_some() { + return Err(Error::NotSupported { + feat: "bulk insert RecordBatch with dictionary arrays".to_string(), + }); + } + + Ok(( + schema.data_header, + flight_data.data_header, + flight_data.data_body, + )) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; + use arrow::array::{Array, Float64Array, Int32Array, Int64Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use arrow::record_batch::RecordBatch; + use dashmap::DashMap; + use tokio::sync::mpsc; + + use super::{ + BatchKey, PendingWorker, WorkerCommand, align_record_batch_to_schema, + remove_worker_if_same_channel, rows_to_record_batch, should_close_worker_on_idle_timeout, + }; + + #[test] + fn test_rows_to_record_batch() { + let rows = Rows { + schema: vec![ + ColumnSchema { + column_name: "ts".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "host".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ], + rows: vec![ + Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(1000)), + }, + Value { + value_data: Some(ValueData::F64Value(42.0)), + }, + Value { + value_data: Some(ValueData::StringValue("h1".to_string())), + }, + ], + }, + Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(2000)), + }, + Value { value_data: None }, + Value { + value_data: Some(ValueData::StringValue("h2".to_string())), + }, + ], + }, + ], + }; + + let rb = 
rows_to_record_batch(&rows).unwrap(); + assert_eq!(2, rb.num_rows()); + assert_eq!(3, rb.num_columns()); + } + + #[test] + fn test_align_record_batch_to_schema_reorder_and_fill_missing() { + let source_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("host", DataType::Utf8, true), + Field::new("value", DataType::Float64, true), + ])); + let source = RecordBatch::try_new( + source_schema, + vec![ + Arc::new(StringArray::from(vec!["h1"])), + Arc::new(Float64Array::from(vec![42.0])), + ], + ) + .unwrap(); + + let target = ArrowSchema::new(vec![ + Field::new("ts", DataType::Int64, true), + Field::new("host", DataType::Utf8, true), + Field::new("value", DataType::Float64, true), + ]); + + let aligned = align_record_batch_to_schema(source, &target).unwrap(); + assert_eq!(aligned.schema().as_ref(), &target); + assert_eq!(1, aligned.num_rows()); + assert_eq!(3, aligned.num_columns()); + let ts = aligned + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(ts.is_null(0)); + } + + #[test] + fn test_align_record_batch_to_schema_cast_column_type() { + let source_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "value", + DataType::Int32, + true, + )])); + let source = RecordBatch::try_new( + source_schema, + vec![Arc::new(Int32Array::from(vec![Some(7), None]))], + ) + .unwrap(); + + let target = ArrowSchema::new(vec![Field::new("value", DataType::Int64, true)]); + let aligned = align_record_batch_to_schema(source, &target).unwrap(); + let value = aligned + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(Some(7), value.iter().next().flatten()); + assert!(value.is_null(1)); + } + + #[test] + fn test_remove_worker_if_same_channel_removes_matching_entry() { + let workers = DashMap::new(); + let key = BatchKey { + catalog: "greptime".to_string(), + schema: "public".to_string(), + physical_table: "phy".to_string(), + }; + + let (tx, _rx) = mpsc::channel::(1); + workers.insert(key.clone(), PendingWorker { tx: tx.clone() }); + + 
assert!(remove_worker_if_same_channel(&workers, &key, &tx)); + assert!(!workers.contains_key(&key)); + } + + #[test] + fn test_remove_worker_if_same_channel_keeps_newer_entry() { + let workers = DashMap::new(); + let key = BatchKey { + catalog: "greptime".to_string(), + schema: "public".to_string(), + physical_table: "phy".to_string(), + }; + + let (stale_tx, _stale_rx) = mpsc::channel::(1); + let (fresh_tx, _fresh_rx) = mpsc::channel::(1); + workers.insert( + key.clone(), + PendingWorker { + tx: fresh_tx.clone(), + }, + ); + + assert!(!remove_worker_if_same_channel(&workers, &key, &stale_tx)); + assert!(workers.contains_key(&key)); + assert!(workers.get(&key).unwrap().tx.same_channel(&fresh_tx)); + } + + #[test] + fn test_worker_idle_timeout_close_decision() { + assert!(should_close_worker_on_idle_timeout(0, 0)); + assert!(!should_close_worker_on_idle_timeout(1, 0)); + assert!(!should_close_worker_on_idle_timeout(0, 1)); + } +} diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 21c7646560..b55502e742 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -86,6 +86,11 @@ pub struct PromStoreResponse { #[async_trait] pub trait PromStoreProtocolHandler { + /// Runs pre-write checks/hooks for prometheus remote write requests. 
+ async fn pre_write(&self, _request: &RowInsertRequests, _ctx: QueryContextRef) -> Result<()> { + Ok(()) + } + /// Handling prometheus remote write requests async fn write( &self, diff --git a/src/servers/tests/http/prom_store_test.rs b/src/servers/tests/http/prom_store_test.rs index b1e974d3d3..c5d5207486 100644 --- a/src/servers/tests/http/prom_store_test.rs +++ b/src/servers/tests/http/prom_store_test.rs @@ -120,7 +120,7 @@ fn make_test_app(tx: mpsc::Sender<(String, Vec)>) -> Router { let instance = Arc::new(DummyInstance { tx }); let server = HttpServerBuilder::new(http_opts) .with_sql_handler(instance.clone()) - .with_prom_handler(instance, None, true, PromValidationMode::Unchecked) + .with_prom_handler(instance, None, true, PromValidationMode::Unchecked, None) .build(); server.build(server.make_app()).unwrap() } diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index 2bf6e812c7..8e7c3ce8a6 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -623,6 +623,7 @@ pub async fn setup_test_prom_app_with_frontend( Some(frontend_ref.clone()), true, PromValidationMode::Strict, + None, ) .with_prometheus_handler(frontend_ref) .with_greptime_config_options(instance.opts.datanode_options().to_toml().unwrap()) diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 05a34eb5b7..933fcadf6b 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1483,6 +1483,11 @@ enable = true [prom_store] enable = true with_metric_engine = true +pending_rows_flush_interval = "0s" +max_batch_rows = 100000 +max_concurrent_flushes = 256 +worker_channel_capacity = 65526 +max_inflight_requests = 3000 [wal] provider = "raft_engine" From 6f2ec120598f5ed5eca436d6497968ef7a49453c Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Fri, 27 Mar 2026 15:22:02 +0800 Subject: [PATCH 048/195] feat(partition): add expression split utility (#7822) * feat(partition): add expression 
split utility Implement MVP split logic with checker-safe degrade paths and move module under utils/split with aligned split naming and tests. Signed-off-by: WenyXu * refactor: minor Signed-off-by: WenyXu * chore: header Signed-off-by: WenyXu * chore: styling Signed-off-by: WenyXu * fix(partition): degrade split when branch becomes unsatisfiable Detect empty conjunction branches after split and return EmptyBranch instead of silently succeeding. This keeps split behavior aligned with expected partition semantics and adds regression tests for contradictory cuts. Signed-off-by: WenyXu * fix(partition): tighten empty-branch split detection Handle Eq/NotEq contradictions and discrete-gap unsatisfiable ranges in split empty-branch checks. Add regression tests for equality conflicts and impossible int/date intervals. Signed-off-by: WenyXu * fix(partition): degrade singleton and uint impossible split branches Signed-off-by: WenyXu * fix(partition): enforce finite float bounds in split degradation Signed-off-by: WenyXu * fix(partition): drop date and timestamp support from expr split Signed-off-by: WenyXu * fix(partition): reject nan and infinity in expr split Signed-off-by: WenyXu * refactor(partition): reuse conjunction bound collection in expr split Signed-off-by: WenyXu * chore: fmt Signed-off-by: WenyXu * chore: add comments Signed-off-by: WenyXu * fix(partition): respect null-first semantics in empty branch checks Signed-off-by: WenyXu * refactor(partition): restrict expr split to range-only shapes Signed-off-by: WenyXu * docs(partition): clarify split helper scope and test names Signed-off-by: WenyXu * chore: add comments Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- src/partition/src/lib.rs | 1 + src/partition/src/utils.rs | 15 + src/partition/src/utils/split.rs | 1263 ++++++++++++++++++++++++++++++ 3 files changed, 1279 insertions(+) create mode 100644 src/partition/src/utils.rs create mode 100644 src/partition/src/utils/split.rs diff --git 
a/src/partition/src/lib.rs b/src/partition/src/lib.rs index c9257e8ee5..647210d1d5 100644 --- a/src/partition/src/lib.rs +++ b/src/partition/src/lib.rs @@ -27,5 +27,6 @@ pub mod partition; pub mod simplify; pub mod splitter; pub mod subtask; +pub mod utils; pub use crate::partition::{PartitionRule, PartitionRuleRef}; diff --git a/src/partition/src/utils.rs b/src/partition/src/utils.rs new file mode 100644 index 0000000000..de212b5589 --- /dev/null +++ b/src/partition/src/utils.rs @@ -0,0 +1,15 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod split; diff --git a/src/partition/src/utils/split.rs b/src/partition/src/utils/split.rs new file mode 100644 index 0000000000..4b1980e34e --- /dev/null +++ b/src/partition/src/utils/split.rs @@ -0,0 +1,1263 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Expression split utilities for partition rules. +//! +//! 
This module provides a conservative way to split one partition expression `R` +//! by a split expression `S` into: +//! - `left = R AND S` +//! - `right = R AND NOT(S)` +//! +//! The implementation intentionally reuses existing partition components +//! (`Collider`, `simplify`, `PartitionChecker`) and degrades to no-split when an +//! unsupported shape/type is encountered. + +use std::collections::{BTreeMap, HashSet}; + +use datatypes::value::Value; +use snafu::ensure; + +use crate::collider::Collider; +use crate::error::{self, Result}; +use crate::expr::{Operand, PartitionExpr, RestrictedOp}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ExprSplitDegradeReason { + UnsupportedType, + UnsupportedNotExpansion, + ColliderRejected, + EmptyBranch, +} + +/// Splits one partition expression with a split predicate. +/// +/// Returns `(left, right)` on success, where: +/// - `left = R AND S` +/// - `right = R AND NOT(S)` +/// +/// Supported shape: +/// - `split_expr` must be a single atomic range predicate (`<`, `<=`, `>`, `>=`). +/// - `base_expr` must be a pure `AND` tree of atomic range predicates, possibly +/// across unrelated columns. +/// +/// Returns [`ExprSplitDegradeReason`] when this cannot safely process the shape/type. 
+pub fn split_partition_expr( + base_expr: PartitionExpr, + split_expr: PartitionExpr, +) -> std::result::Result<(PartitionExpr, PartitionExpr), ExprSplitDegradeReason> { + let base = base_expr.canonicalize(); + let split = split_expr.canonicalize(); + + if validate_supported_expr(&base).is_err() || validate_supported_expr(&split).is_err() { + return Err(ExprSplitDegradeReason::UnsupportedType); + } + + if !validate_base_expr_shape(&base) || !validate_split_expr_shape(&split) { + return Err(ExprSplitDegradeReason::UnsupportedType); + } + + let not_split = match negate_split_expr(&split) { + Ok(expr) => expr, + Err(_) => { + return Err(ExprSplitDegradeReason::UnsupportedNotExpansion); + } + }; + + let left_raw = base.clone().and(split); + let right_raw = base.clone().and(not_split); + + if Collider::new(std::slice::from_ref(&left_raw)).is_err() + || Collider::new(std::slice::from_ref(&right_raw)).is_err() + { + return Err(ExprSplitDegradeReason::ColliderRejected); + } + + let left_expr = simplify_and_bounds(left_raw); + let right_expr = simplify_and_bounds(right_raw); + + if is_empty_and_conjunction(&left_expr) || is_empty_and_conjunction(&right_expr) { + return Err(ExprSplitDegradeReason::EmptyBranch); + } + + Ok((left_expr, right_expr)) +} + +/// Detects whether a pure conjunction expression is definitely unsatisfiable. +/// +/// Scope and intent: +/// - This checker is intentionally conservative. +/// - It only analyzes expressions that can be flattened into: +/// `atom1 AND atom2 AND ...` +/// - If any `OR` is present, it returns `false` (unknown / not handled here). +/// +/// Strategy: +/// - For each column, keep only the tightest lower bound (`>` / `>=`) and +/// tightest upper bound (`<` / `<=`). +/// - `=` is treated as both lower and upper bound at the same value. +/// - `!=` is tracked per column to catch direct conflicts with `=`. 
+/// - After bounds are collected, the conjunction is empty iff for any column: +/// - lower value is greater than upper value, or +/// - lower value equals upper value but at least one bound is exclusive. +/// - For discrete domains (`Int*`, `UInt*`), adjacent open bounds with no +/// representable value in between are also treated as empty. +/// +/// Notes: +/// - This is still a conservative fast path focused on conjunction emptiness +/// detection for split degradation. +/// - `split_partition_expr` currently restricts its main path to range-only +/// conjunctions, but this helper remains slightly more general so shared +/// bound collection and direct conflict checks stay reusable. +fn is_empty_and_conjunction(expr: &PartitionExpr) -> bool { + let Some(collected) = collect_conjunction_bounds(expr) else { + return false; + }; + + if collected.has_conflict { + return true; + } + + let CollectedConjunction { + lowers, + uppers, + not_equals, + passthrough: _, + has_conflict: _, + } = collected; + + if lowers + .iter() + .any(|(col, lower)| !uppers.contains_key(col) && is_strictly_greater_than_domain_max(lower)) + { + return true; + } + + // Check for contradiction between collected lower/upper bounds per column. 
+ lowers.into_iter().any(|(col, lower)| { + let Some(upper) = uppers.get(&col) else { + return false; + }; + + match lower.value.partial_cmp(&upper.value) { + Some(std::cmp::Ordering::Greater) => true, + Some(std::cmp::Ordering::Equal) => { + if !lower.inclusive || !upper.inclusive { + true + } else { + not_equals + .get(&col) + .is_some_and(|excluded| excluded.contains(&lower.value)) + } + } + Some(std::cmp::Ordering::Less) => { + match ( + discrete_value_index(&lower.value), + discrete_value_index(&upper.value), + ) { + (Some(lower_idx), Some(upper_idx)) => { + let min_candidate = if lower.inclusive { + Some(lower_idx) + } else { + lower_idx.checked_add(1) + }; + let max_candidate = if upper.inclusive { + Some(upper_idx) + } else { + upper_idx.checked_sub(1) + }; + match (min_candidate, max_candidate) { + (Some(min_val), Some(max_val)) => min_val > max_val, + _ => true, + } + } + _ => false, + } + } + _ => false, + } + }) +} + +fn discrete_value_index(v: &Value) -> Option { + match v { + Value::Int8(x) => Some(*x as i128), + Value::Int16(x) => Some(*x as i128), + Value::Int32(x) => Some(*x as i128), + Value::Int64(x) => Some(*x as i128), + Value::UInt8(x) => Some(*x as i128), + Value::UInt16(x) => Some(*x as i128), + Value::UInt32(x) => Some(*x as i128), + Value::UInt64(x) => Some(*x as i128), + _ => None, + } +} + +fn is_strictly_greater_than_domain_max(bound: &LowerBound) -> bool { + if bound.inclusive { + return false; + } + + is_domain_max_value(&bound.value) +} + +fn is_domain_max_value(v: &Value) -> bool { + match v { + Value::Float32(v) => v.0 == f32::MAX, + Value::Float64(v) => v.0 == f64::MAX, + Value::UInt8(v) => *v == u8::MAX, + Value::UInt16(v) => *v == u16::MAX, + Value::UInt32(v) => *v == u32::MAX, + Value::UInt64(v) => *v == u64::MAX, + Value::Int8(v) => *v == i8::MAX, + Value::Int16(v) => *v == i16::MAX, + Value::Int32(v) => *v == i32::MAX, + Value::Int64(v) => *v == i64::MAX, + _ => false, + } +} + +/// Rewrites `NOT(expr)` into an equivalent 
`PartitionExpr` without introducing a unary NOT node. +/// +/// Why this function exists: +/// - `PartitionExpr` only models binary operators. +/// - Cut logic needs `R AND NOT(S)`. +/// - We therefore rewrite `NOT(S)` into an equivalent binary-expression tree. +/// +/// Rewrite rules: +/// - Atomic comparisons: +/// - `=` <-> `!=` +/// - `<` <-> `>=` +/// - `<=` <-> `>` +/// - `>` <-> `<=` +/// - `>=` <-> `<` +/// - Boolean composition: +/// - `NOT(A AND B)` => `NOT(A) OR NOT(B)` +/// - `NOT(A OR B)` => `NOT(A) AND NOT(B)` +/// +/// Failure behavior: +/// - For `AND/OR`, both sides must be `Operand::Expr`; otherwise returns `NoExprOperand`. +/// - Any unsupported shape bubbles up as an error and the caller degrades to no-split. +pub fn negate_split_expr(expr: &PartitionExpr) -> Result { + match expr.op() { + RestrictedOp::Eq + | RestrictedOp::NotEq + | RestrictedOp::Lt + | RestrictedOp::LtEq + | RestrictedOp::Gt + | RestrictedOp::GtEq => { + // Atomic negate by operator inversion. + let op = match expr.op() { + RestrictedOp::Eq => RestrictedOp::NotEq, + RestrictedOp::NotEq => RestrictedOp::Eq, + RestrictedOp::Lt => RestrictedOp::GtEq, + RestrictedOp::LtEq => RestrictedOp::Gt, + RestrictedOp::Gt => RestrictedOp::LtEq, + RestrictedOp::GtEq => RestrictedOp::Lt, + RestrictedOp::And | RestrictedOp::Or => unreachable!(), + }; + Ok(PartitionExpr::new( + expr.lhs().clone(), + op, + expr.rhs().clone(), + )) + } + RestrictedOp::And | RestrictedOp::Or => { + // De Morgan transform on recursive sub-expressions. 
+ let lhs = match expr.lhs() { + Operand::Expr(lhs) => lhs, + other => { + return error::NoExprOperandSnafu { + operand: other.clone(), + } + .fail(); + } + }; + let rhs = match expr.rhs() { + Operand::Expr(rhs) => rhs, + other => { + return error::NoExprOperandSnafu { + operand: other.clone(), + } + .fail(); + } + }; + let not_lhs = negate_split_expr(lhs)?; + let not_rhs = negate_split_expr(rhs)?; + let op = match expr.op() { + // NOT(A AND B) => NOT(A) OR NOT(B) + RestrictedOp::And => RestrictedOp::Or, + // NOT(A OR B) => NOT(A) AND NOT(B) + RestrictedOp::Or => RestrictedOp::And, + _ => unreachable!(), + }; + Ok(PartitionExpr::new( + Operand::Expr(not_lhs), + op, + Operand::Expr(not_rhs), + )) + } + } +} + +pub fn validate_supported_expr(expr: &PartitionExpr) -> Result<()> { + match expr.op() { + RestrictedOp::And | RestrictedOp::Or => { + let lhs = match expr.lhs() { + Operand::Expr(lhs) => lhs, + other => { + return error::NoExprOperandSnafu { + operand: other.clone(), + } + .fail(); + } + }; + let rhs = match expr.rhs() { + Operand::Expr(rhs) => rhs, + other => { + return error::NoExprOperandSnafu { + operand: other.clone(), + } + .fail(); + } + }; + validate_supported_expr(lhs)?; + validate_supported_expr(rhs)?; + Ok(()) + } + _ => validate_atomic(expr), + } +} + +fn validate_atomic(expr: &PartitionExpr) -> Result<()> { + let (lhs, rhs) = (expr.lhs(), expr.rhs()); + match (lhs, rhs) { + (Operand::Column(_), Operand::Value(v)) | (Operand::Value(v), Operand::Column(_)) => { + ensure!( + is_supported_value(v), + error::InvalidExprSnafu { expr: expr.clone() } + ); + if is_nan_value(v) || is_infinite_value(v) { + return error::InvalidExprSnafu { expr: expr.clone() }.fail(); + } + Ok(()) + } + _ => error::InvalidExprSnafu { expr: expr.clone() }.fail(), + } +} + +/// Validates that `base_expr` stays within the range-only split contract. +/// +/// Scope and intent: +/// - The split utility only handles interval-style partition predicates. 
+/// - `base_expr` may mention multiple columns, but it must remain a pure `AND` +/// tree of atomic range predicates. +fn validate_base_expr_shape(expr: &PartitionExpr) -> bool { + let mut atoms = Vec::new(); + if !collect_and_atoms(expr, &mut atoms) { + return false; + } + + atoms + .into_iter() + .all(|atom| is_atomic_range_expr(&atom.canonicalize())) +} + +/// Validates that `split_expr` is a single atomic range predicate. +/// +/// This restriction keeps `NOT(split_expr)` in the same range-only subset so the +/// resulting left/right branches stay within the supported contract. +fn validate_split_expr_shape(expr: &PartitionExpr) -> bool { + is_atomic_range_expr(expr) +} + +/// Returns whether `expr` is an atomic `column op value` range predicate. +/// +/// Supported operators are limited to `<`, `<=`, `>`, and `>=`. +fn is_atomic_range_expr(expr: &PartitionExpr) -> bool { + atom_col_op_val(expr).is_some_and(|(_, op, _)| { + matches!( + op, + RestrictedOp::Lt | RestrictedOp::LtEq | RestrictedOp::Gt | RestrictedOp::GtEq + ) + }) +} + +fn is_supported_value(v: &Value) -> bool { + matches!( + v, + Value::Int8(_) + | Value::Int16(_) + | Value::Int32(_) + | Value::Int64(_) + | Value::UInt8(_) + | Value::UInt16(_) + | Value::UInt32(_) + | Value::UInt64(_) + | Value::Float32(_) + | Value::Float64(_) + | Value::String(_) + ) +} + +fn is_nan_value(v: &Value) -> bool { + match v { + Value::Float32(x) => x.0.is_nan(), + Value::Float64(x) => x.0.is_nan(), + _ => false, + } +} + +fn is_infinite_value(v: &Value) -> bool { + match v { + Value::Float32(x) => x.0.is_infinite(), + Value::Float64(x) => x.0.is_infinite(), + _ => false, + } +} + +#[derive(Debug, Clone)] +struct LowerBound { + value: Value, + inclusive: bool, +} + +#[derive(Debug, Clone)] +struct UpperBound { + value: Value, + inclusive: bool, +} + +struct CollectedConjunction { + lowers: BTreeMap, + uppers: BTreeMap, + not_equals: BTreeMap>, + passthrough: Vec, + has_conflict: bool, +} + +/// Simplifies 
conjunction-only range predicates by keeping the tightest bounds per column. +/// +/// This pass is intentionally conservative and only runs when the whole expression +/// can be flattened into `atom1 AND atom2 AND ...` without any `OR` node. +/// +/// Behavior: +/// - For each column, collect all lower-bound predicates (`>` / `>=`) and keep the +/// tightest one. +/// - For each column, collect all upper-bound predicates (`<` / `<=`) and keep the +/// tightest one. +/// - Non-range predicates (for example `=` / `!=`) are preserved as-is. +/// - If the expression contains `OR`, this function returns the original expression. +/// +/// Tightness rules: +/// - Upper bound: smaller value is tighter; if equal value, exclusive (`<`) is tighter. +/// - Lower bound: larger value is tighter; if equal value, exclusive (`>`) is tighter. +/// +/// Examples: +/// - `a <= 10 AND a < 10` => `a < 10` +/// - `a >= 10 AND a > 10` => `a > 10` +/// - `a < 10 AND a < 5` => `a < 5` +fn simplify_and_bounds(expr: PartitionExpr) -> PartitionExpr { + let Some(collected) = collect_conjunction_bounds(&expr) else { + return expr; + }; + + let CollectedConjunction { + lowers, + uppers, + not_equals: _, + passthrough, + has_conflict: _, + } = collected; + + let mut out = passthrough; + out.extend(lowers.into_iter().map(|(col, lower)| { + PartitionExpr::new( + Operand::Column(col), + if lower.inclusive { + RestrictedOp::GtEq + } else { + RestrictedOp::Gt + }, + Operand::Value(lower.value), + ) + })); + out.extend(uppers.into_iter().map(|(col, upper)| { + PartitionExpr::new( + Operand::Column(col), + if upper.inclusive { + RestrictedOp::LtEq + } else { + RestrictedOp::Lt + }, + Operand::Value(upper.value), + ) + })); + + fold_and_exprs(out).unwrap_or(expr) +} + +/// Flattens an expression into atomic terms when it is a pure conjunction tree. +/// +/// Returns `false` if any `OR` is encountered, signaling caller to skip this +/// simplification path. 
+fn collect_and_atoms(expr: &PartitionExpr, out: &mut Vec) -> bool { + match expr.op() { + RestrictedOp::And => { + let lhs = match expr.lhs() { + Operand::Expr(lhs) => lhs, + _ => return false, + }; + let rhs = match expr.rhs() { + Operand::Expr(rhs) => rhs, + _ => return false, + }; + collect_and_atoms(lhs, out) && collect_and_atoms(rhs, out) + } + RestrictedOp::Or => false, + _ => { + out.push(expr.clone()); + true + } + } +} + +/// Extracts `(column, op, value)` from a canonicalized atomic expression. +fn atom_col_op_val(expr: &PartitionExpr) -> Option<(String, RestrictedOp, Value)> { + let lhs = expr.lhs(); + let rhs = expr.rhs(); + match (lhs, rhs) { + (Operand::Column(col), Operand::Value(v)) => { + Some((col.clone(), expr.op().clone(), v.clone())) + } + _ => None, + } +} + +/// Collects per-column bounds and passthrough atoms from a pure `AND` tree. +/// +/// Scope and intent: +/// - This helper is shared by [`is_empty_and_conjunction`] and +/// [`simplify_and_bounds`] so both paths interpret conjunction atoms the same +/// way. +/// - It only handles conjunction-only expressions. If any `OR` is present, it +/// returns `None` and lets callers keep their conservative fallback behavior. +/// +/// Behavior: +/// - Tightest lower/upper bounds are recorded per column. +/// - `=` contributes both a lower and an upper bound at the same value. +/// - `!=` and non-range atoms are preserved in `passthrough` for callers that +/// need to rebuild the conjunction. +/// - `has_conflict` is set when atomic constraints already contradict each +/// other (for example `a = 1 AND a <> 1`). +/// +/// Notes: +/// - This helper is intentionally a bit more general than the current +/// `split_partition_expr` contract, which now only feeds range-only +/// conjunctions into the main split path. 
+fn collect_conjunction_bounds(expr: &PartitionExpr) -> Option { + let mut atoms = Vec::new(); + if !collect_and_atoms(expr, &mut atoms) { + return None; + } + + let mut lowers = BTreeMap::new(); + let mut uppers = BTreeMap::new(); + let mut equals = BTreeMap::new(); + let mut not_equals: BTreeMap> = BTreeMap::new(); + let mut passthrough = Vec::new(); + let mut seen = HashSet::new(); + let mut has_conflict = false; + + for atom in atoms { + let atom = atom.canonicalize(); + let Some((col, op, val)) = atom_col_op_val(&atom) else { + push_unique_expr(&mut passthrough, &mut seen, atom); + continue; + }; + + match op { + RestrictedOp::Lt | RestrictedOp::LtEq => update_upper_bound( + &mut uppers, + col, + UpperBound { + value: val, + inclusive: matches!(op, RestrictedOp::LtEq), + }, + ), + RestrictedOp::Gt | RestrictedOp::GtEq => update_lower_bound( + &mut lowers, + col, + LowerBound { + value: val, + inclusive: matches!(op, RestrictedOp::GtEq), + }, + ), + RestrictedOp::Eq => { + if let Some(existing) = equals.get(&col) + && existing != &val + { + has_conflict = true; + } + if not_equals + .get(&col) + .is_some_and(|excluded| excluded.contains(&val)) + { + has_conflict = true; + } + equals.insert(col.clone(), val.clone()); + update_lower_bound( + &mut lowers, + col.clone(), + LowerBound { + value: val.clone(), + inclusive: true, + }, + ); + update_upper_bound( + &mut uppers, + col, + UpperBound { + value: val, + inclusive: true, + }, + ); + push_unique_expr(&mut passthrough, &mut seen, atom); + } + RestrictedOp::NotEq => { + if equals.get(&col).is_some_and(|eq| eq == &val) { + has_conflict = true; + } + not_equals.entry(col).or_default().insert(val); + push_unique_expr(&mut passthrough, &mut seen, atom); + } + RestrictedOp::And | RestrictedOp::Or => { + push_unique_expr(&mut passthrough, &mut seen, atom); + } + } + } + + Some(CollectedConjunction { + lowers, + uppers, + not_equals, + passthrough, + has_conflict, + }) +} + +fn push_unique_expr(out: &mut Vec, seen: &mut 
HashSet, expr: PartitionExpr) { + let key = expr.to_string(); + if seen.insert(key) { + out.push(expr); + } +} + +fn update_upper_bound( + uppers: &mut BTreeMap, + col: String, + candidate: UpperBound, +) { + match uppers.get_mut(&col) { + Some(current) => { + if prefer_upper(&candidate, current) { + *current = candidate; + } + } + None => { + uppers.insert(col, candidate); + } + } +} + +fn update_lower_bound( + lowers: &mut BTreeMap, + col: String, + candidate: LowerBound, +) { + match lowers.get_mut(&col) { + Some(current) => { + if prefer_lower(&candidate, current) { + *current = candidate; + } + } + None => { + lowers.insert(col, candidate); + } + } +} + +fn prefer_upper(candidate: &UpperBound, current: &UpperBound) -> bool { + // "Smaller" upper bound is tighter. For equal value, exclusive is tighter. + match candidate.value.partial_cmp(¤t.value) { + Some(std::cmp::Ordering::Less) => true, + Some(std::cmp::Ordering::Equal) => !candidate.inclusive && current.inclusive, + _ => false, + } +} + +fn prefer_lower(candidate: &LowerBound, current: &LowerBound) -> bool { + // "Larger" lower bound is tighter. For equal value, exclusive is tighter. + match candidate.value.partial_cmp(¤t.value) { + Some(std::cmp::Ordering::Greater) => true, + Some(std::cmp::Ordering::Equal) => !candidate.inclusive && current.inclusive, + _ => false, + } +} + +/// Folds a list of expressions into a left-associated AND tree. +/// Returns `None` if the input list is empty. 
+fn fold_and_exprs(mut exprs: Vec) -> Option { + exprs.drain(..).reduce(|acc, next| acc.and(next)) +} + +#[cfg(test)] +mod tests { + use datatypes::value::{OrderedFloat, Value}; + use store_api::storage::RegionNumber; + + use super::*; + use crate::checker::PartitionChecker; + use crate::expr::col; + use crate::multi_dim::MultiDimPartitionRule; + + fn validate_cut_result_with_checker( + original_rule_exprs: &[PartitionExpr], + replaced_index: usize, + left: &Option, + right: &Option, + partition_columns: Vec, + regions: Vec, + ) -> Result<()> { + ensure!( + replaced_index < original_rule_exprs.len(), + error::UnexpectedSnafu { + err_msg: format!( + "replaced index out of bounds: {replaced_index} >= {}", + original_rule_exprs.len() + ) + } + ); + + let mut exprs = original_rule_exprs.to_vec(); + exprs.remove(replaced_index); + exprs.extend(left.iter().cloned()); + exprs.extend(right.iter().cloned()); + + ensure!( + !exprs.is_empty(), + error::UnexpectedSnafu { + err_msg: "empty rule exprs after split".to_string() + } + ); + + let final_regions = if regions.len() == exprs.len() { + regions + } else { + (0..exprs.len() as RegionNumber).collect() + }; + + let rule = MultiDimPartitionRule::try_new(partition_columns, final_regions, exprs, false)?; + let checker = PartitionChecker::try_new(&rule)?; + checker.check()?; + Ok(()) + } + + #[test] + fn test_split_simple_range() { + // R: a < 10 + let base = col("a").lt(Value::Int64(10)); + // S: a < 5 + let split = col("a").lt(Value::Int64(5)); + let (left, right) = split_partition_expr(base, split).unwrap(); + // left = R AND S = a < 5 + assert_eq!(left.to_string(), "a < 5"); + // right = R AND NOT(S) = a >= 5 AND a < 10 + assert_eq!(right.to_string(), "a >= 5 AND a < 10"); + } + + #[test] + fn test_split_string_interval() { + // R: v > 'm' AND v < 'n' + let base = col("v") + .gt(Value::String("m".into())) + .and(col("v").lt(Value::String("n".into()))); + // S: v < 'm~' + let split = col("v").lt(Value::String("m~".into())); + 
let (left, right) = split_partition_expr(base, split).unwrap(); + // left = (v > m AND v < n) AND (v < m~) -> v > m AND v < m~ + assert_eq!(left.to_string(), "v > m AND v < m~"); + // right = (v > m AND v < n) AND (v >= m~) -> v >= m~ AND v < n + assert_eq!(right.to_string(), "v >= m~ AND v < n"); + } + + #[test] + fn test_split_numeric_interval_mid_split() { + // R: a > 3 AND a < 10 + let base = col("a") + .gt(Value::Int64(3)) + .and(col("a").lt(Value::Int64(10))); + // S: a < 5 + let split = col("a").lt(Value::Int64(5)); + + let (left, right) = split_partition_expr(base, split).unwrap(); + + // left = (a > 3 AND a < 10) AND (a < 5) -> a > 3 AND a < 5 + assert_eq!(left.to_string(), "a > 3 AND a < 5"); + // right = (a > 3 AND a < 10) AND (a >= 5) -> a >= 5 AND a < 10 + assert_eq!(right.to_string(), "a >= 5 AND a < 10"); + } + + #[test] + fn test_split_base_expr_allows_unrelated_range_columns() { + // R: a > 20 AND b < 20 + let base = col("a") + .gt(Value::Int64(20)) + .and(col("b").lt(Value::Int64(20))); + // S: a < 30 + let split = col("a").lt(Value::Int64(30)); + + let (left, right) = split_partition_expr(base, split).unwrap(); + + // left keeps the unrelated `b < 20` bound while splitting column `a`. + assert_eq!(left.to_string(), "a > 20 AND a < 30 AND b < 20"); + // right also preserves the unrelated column bound. + assert_eq!(right.to_string(), "a >= 30 AND b < 20"); + } + + #[test] + fn test_split_degrade_on_unsupported_type() { + // intentionally excludes boolean from split-able value types. 
+ let base = col("a").eq(Value::Boolean(true)); + let split = col("a").eq(Value::Boolean(true)); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_validate_cut_result_with_checker() { + // Original partition set: a < 10, a >= 10 + let original = vec![ + col("a").lt(Value::Int64(10)), + col("a").gt_eq(Value::Int64(10)), + ]; + let left = Some(col("a").lt(Value::Int64(5))); + let right = Some( + col("a") + .gt_eq(Value::Int64(5)) + .and(col("a").lt(Value::Int64(10))), + ); + + validate_cut_result_with_checker( + &original, + 0, + &left, + &right, + vec!["a".to_string()], + vec![1, 2, 3], + ) + .unwrap(); + } + + #[test] + fn test_split_degrade_on_empty_branch() { + // R: a < 10 + let base = col("a").lt(Value::Int64(10)); + // S: a < 20 + let split = col("a").lt(Value::Int64(20)); + + // right = (a < 10) AND (a >= 20) is unsatisfiable, should degrade. + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::EmptyBranch); + } + + #[test] + fn test_split_rejects_eq_in_base_expr() { + // R: a = 5 falls outside the range-only base_expr contract. + let base = col("a").eq(Value::Int64(5)); + // S: a < 6 remains a valid range split. + let split = col("a").lt(Value::Int64(6)); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_split_degrade_on_discrete_gap_int() { + // R: a < 5 + let base = col("a").lt(Value::Int64(5)); + // S: a <= 4 + let split = col("a").lt_eq(Value::Int64(4)); + + // right = (a < 5) AND (a > 4) has no integer solution, should degrade. + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::EmptyBranch); + } + + #[test] + fn test_split_degrade_on_unsupported_date_type() { + // Date is intentionally excluded from split-supported value types. 
+ let base = col("d").lt(Value::Date(5.into())); + let split = col("d").lt_eq(Value::Date(4.into())); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_split_degrade_on_unsupported_timestamp_type() { + // Timestamp is intentionally excluded from split-supported value types. + let base = col("ts").lt(Value::Timestamp(0.into())); + let split = col("ts").lt_eq(Value::Timestamp(1.into())); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_split_rejects_not_eq_in_split_expr() { + // R: a >= 5 AND a <= 5 + let base = col("a") + .gt_eq(Value::Int64(5)) + .and(col("a").lt_eq(Value::Int64(5))); + // S: a <> 5 falls outside the range-only split_expr contract. + let split = col("a").not_eq(Value::Int64(5)); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_split_rejects_eq_in_split_expr() { + // R: a >= 5 AND a <= 5 + let base = col("a") + .gt_eq(Value::Int64(5)) + .and(col("a").lt_eq(Value::Int64(5))); + // S: a = 5 falls outside the range-only split_expr contract. + let split = col("a").eq(Value::Int64(5)); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_split_degrade_on_uint_one_sided_impossible_upper_bound() { + // R: a < 10 (UInt64 domain) + let base = col("a").lt(Value::UInt64(10)); + // S: a < 0 is still satisfiable by NULL under null-first partition semantics. + // The split keeps a nullable left branch instead of degrading it as empty. 
+ let split = col("a").lt(Value::UInt64(0)); + + let (left, right) = split_partition_expr(base, split).unwrap(); + assert_eq!(left.to_string(), "a < 0"); + assert_eq!(right.to_string(), "a >= 0 AND a < 10"); + } + + #[test] + fn test_split_degrade_on_uint_one_sided_impossible_lower_bound() { + // R: a < 10 (UInt64 domain) + let base = col("a").lt(Value::UInt64(10)); + // S: a > u64::MAX (impossible on UInt64) + let split = col("a").gt(Value::UInt64(u64::MAX)); + + // left = (a < 10) AND (a > u64::MAX) is unsatisfiable on UInt64, should degrade. + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::EmptyBranch); + } + + #[test] + fn test_split_degrade_on_int_one_sided_impossible_upper_bound() { + // R: a < 10 (Int64 domain) + let base = col("a").lt(Value::Int64(10)); + // S: a < i64::MIN is still satisfiable by NULL under null-first partition semantics. + // The split keeps a nullable left branch instead of degrading it as empty. + let split = col("a").lt(Value::Int64(i64::MIN)); + + let (left, right) = split_partition_expr(base, split).unwrap(); + assert_eq!(left.to_string(), format!("a < {}", i64::MIN)); + assert_eq!(right.to_string(), format!("a >= {} AND a < 10", i64::MIN)); + } + + #[test] + fn test_split_degrade_on_int_one_sided_impossible_lower_bound() { + // R: a < 10 (Int64 domain) + let base = col("a").lt(Value::Int64(10)); + // S: a > i64::MAX (impossible on Int64) + let split = col("a").gt(Value::Int64(i64::MAX)); + + // left = (a < 10) AND (a > i64::MAX) is unsatisfiable on Int64, should degrade. + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::EmptyBranch); + } + + #[test] + fn test_split_degrade_on_string_one_sided_impossible_upper_bound() { + // R: s < "z" (String domain) + let base = col("s").lt(Value::String("z".into())); + // S: s < "" is still satisfiable by NULL under null-first partition semantics. 
+ // The split keeps a nullable left branch instead of degrading it as empty. + let split = col("s").lt(Value::String("".into())); + + let (left, right) = split_partition_expr(base, split).unwrap(); + assert_eq!(left.to_string(), "s < "); + assert_eq!(right.to_string(), "s >= AND s < z"); + } + + #[test] + fn test_split_degrade_on_float64_one_sided_impossible_upper_bound() { + // R: a < 10.0 (Float64 domain) + let base = col("a").lt(Value::Float64(OrderedFloat(10.0))); + // S: a < f64::MIN is still satisfiable by NULL under null-first partition semantics. + // The split keeps a nullable left branch instead of degrading it as empty. + let split = col("a").lt(Value::Float64(OrderedFloat(f64::MIN))); + + let (left, right) = split_partition_expr(base, split).unwrap(); + assert_eq!(left.to_string(), format!("a < {}", f64::MIN)); + assert_eq!(right.to_string(), format!("a >= {} AND a < 10", f64::MIN)); + } + + #[test] + fn test_split_degrade_on_float64_one_sided_impossible_lower_bound() { + // R: a < 10.0 (Float64 domain) + let base = col("a").lt(Value::Float64(OrderedFloat(10.0))); + // S: a > f64::MAX (impossible with finite-only float policy) + let split = col("a").gt(Value::Float64(OrderedFloat(f64::MAX))); + + // left = (a < 10.0) AND (a > f64::MAX) is unsatisfiable, should degrade. + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::EmptyBranch); + } + + #[test] + fn test_split_degrade_on_float32_one_sided_impossible_upper_bound() { + // R: a < 10.0f32 (Float32 domain) + let base = col("a").lt(Value::Float32(OrderedFloat(10.0))); + // S: a < f32::MIN is still satisfiable by NULL under null-first partition semantics. + // The split keeps a nullable left branch instead of degrading it as empty. 
+ let split = col("a").lt(Value::Float32(OrderedFloat(f32::MIN))); + + let (left, right) = split_partition_expr(base, split).unwrap(); + assert_eq!(left.to_string(), format!("a < {}", f32::MIN)); + assert_eq!(right.to_string(), format!("a >= {} AND a < 10", f32::MIN)); + } + + #[test] + fn test_split_degrade_on_float32_one_sided_impossible_lower_bound() { + // R: a < 10.0f32 (Float32 domain) + let base = col("a").lt(Value::Float32(OrderedFloat(10.0))); + // S: a > f32::MAX (impossible with finite-only float policy) + let split = col("a").gt(Value::Float32(OrderedFloat(f32::MAX))); + + // left = (a < 10.0f32) AND (a > f32::MAX) is unsatisfiable, should degrade. + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::EmptyBranch); + } + + #[test] + fn test_simplify_same_upper_bound_prefers_strict() { + // a <= 10 AND a < 10 => a < 10 + let expr = col("a") + .lt_eq(Value::Int64(10)) + .and(col("a").lt(Value::Int64(10))); + + let simplified = simplify_and_bounds(expr); + assert_eq!(simplified.to_string(), "a < 10"); + } + + #[test] + fn test_simplify_same_lower_bound_prefers_strict() { + // a >= 10 AND a > 10 => a > 10 + let expr = col("a") + .gt_eq(Value::Int64(10)) + .and(col("a").gt(Value::Int64(10))); + + let simplified = simplify_and_bounds(expr); + assert_eq!(simplified.to_string(), "a > 10"); + } + + #[test] + fn test_negate_split_expr_demorgan_and() { + // expr: (a < 10) AND (a >= 3) + let expr = col("a") + .lt(Value::Int64(10)) + .and(col("a").gt_eq(Value::Int64(3))); + let not_expr = negate_split_expr(&expr).unwrap(); + // NOT(expr) => (a >= 10) OR (a < 3) + assert_eq!(not_expr.to_string(), "a >= 10 OR a < 3"); + } + + #[test] + fn test_negate_split_expr_demorgan_or() { + // expr: (a = 1) OR (a <> 2) + let expr = PartitionExpr::new( + Operand::Expr(col("a").eq(Value::Int64(1))), + RestrictedOp::Or, + Operand::Expr(col("a").not_eq(Value::Int64(2))), + ); + let not_expr = negate_split_expr(&expr).unwrap(); + 
// NOT(expr) => (a <> 1) AND (a = 2) + assert_eq!(not_expr.to_string(), "a <> 1 AND a = 2"); + } + + #[test] + fn test_negate_split_expr_invalid_and_operand() { + // malformed AND: rhs is a scalar value, not an Expr subtree. + let malformed = PartitionExpr { + lhs: Box::new(Operand::Expr(col("a").lt(Value::Int64(10)))), + op: RestrictedOp::And, + rhs: Box::new(Operand::Value(Value::Int64(1))), + }; + assert!(negate_split_expr(&malformed).is_err()); + } + + #[test] + fn test_validate_supported_expr_value_column_allowed() { + // Canonicalization can flip to column-value; validator must accept value-column input. + let expr = PartitionExpr::new( + Operand::Value(Value::Int64(10)), + RestrictedOp::Lt, + Operand::Column("a".to_string()), + ); + assert!(validate_supported_expr(&expr).is_ok()); + } + + #[test] + fn test_validate_supported_expr_invalid_atomic_shape() { + // column-column atomic comparison is out of shape. + let expr = PartitionExpr::new( + Operand::Column("a".to_string()), + RestrictedOp::Eq, + Operand::Column("b".to_string()), + ); + assert!(validate_supported_expr(&expr).is_err()); + } + + #[test] + fn test_validate_supported_expr_nan_comparison_rejected() { + // NaN cannot be used in any supported comparison predicate. + let expr = col("a").lt(Value::Float64(OrderedFloat(f64::NAN))); + assert!(validate_supported_expr(&expr).is_err()); + } + + #[test] + fn test_validate_supported_expr_infinite_comparison_rejected() { + // Infinity cannot be used in any supported comparison predicate under + // finite-only float policy. 
+ let pos_inf = col("a").gt(Value::Float64(OrderedFloat(f64::INFINITY))); + let neg_inf = col("a").lt(Value::Float32(OrderedFloat(f32::NEG_INFINITY))); + assert!(validate_supported_expr(&pos_inf).is_err()); + assert!(validate_supported_expr(&neg_inf).is_err()); + } + + #[test] + fn test_validate_supported_expr_nan_eq_rejected() { + let expr = col("a").eq(Value::Float64(OrderedFloat(f64::NAN))); + assert!(validate_supported_expr(&expr).is_err()); + } + + #[test] + fn test_validate_supported_expr_infinite_eq_rejected() { + let pos_inf = col("a").eq(Value::Float64(OrderedFloat(f64::INFINITY))); + let neg_inf = col("a").not_eq(Value::Float32(OrderedFloat(f32::NEG_INFINITY))); + assert!(validate_supported_expr(&pos_inf).is_err()); + assert!(validate_supported_expr(&neg_inf).is_err()); + } + + #[test] + fn test_simplify_and_bounds_or_keeps_original() { + // OR tree is intentionally not flattened by AND-only simplifier. + let expr = PartitionExpr::new( + Operand::Expr(col("a").lt(Value::Int64(10))), + RestrictedOp::Or, + Operand::Expr(col("a").gt_eq(Value::Int64(20))), + ); + let simplified = simplify_and_bounds(expr.clone()); + assert_eq!(simplified.to_string(), expr.to_string()); + } + + #[test] + fn test_simplify_and_bounds_keep_stronger_when_weaker_seen_later() { + // upper: stronger bound first, weaker later -> keep stronger (< 5). + let upper = col("a") + .lt(Value::Int64(5)) + .and(col("a").lt(Value::Int64(10))); + assert_eq!(simplify_and_bounds(upper).to_string(), "a < 5"); + + // lower: stronger bound first, weaker later -> keep stronger (> 10). + let lower = col("a") + .gt(Value::Int64(10)) + .and(col("a").gt(Value::Int64(5))); + assert_eq!(simplify_and_bounds(lower).to_string(), "a > 10"); + } + + #[test] + fn test_internal_helpers_uncovered_branches() { + // Empty AND fold should return None. + assert!(fold_and_exprs(vec![]).is_none()); + + // Any OR in tree disables AND-bound simplification path. 
+ let mut out = Vec::new(); + let or_expr = PartitionExpr::new( + Operand::Expr(col("a").lt(Value::Int64(10))), + RestrictedOp::Or, + Operand::Expr(col("a").gt_eq(Value::Int64(20))), + ); + assert!(!collect_and_atoms(&or_expr, &mut out)); + + // value-value atom has no (column, op, value) projection. + let value_value = PartitionExpr::new( + Operand::Value(Value::Int64(1)), + RestrictedOp::Eq, + Operand::Value(Value::Int64(2)), + ); + assert!(atom_col_op_val(&value_value).is_none()); + } + + #[test] + fn test_split_rejects_or_in_base_expr() { + // R: (a < 10) OR (a >= 20 AND a < 30) falls outside the AND-only base_expr contract. + let base = PartitionExpr::new( + Operand::Expr(col("a").lt(Value::Int64(10))), + RestrictedOp::Or, + Operand::Expr( + col("a") + .gt_eq(Value::Int64(20)) + .and(col("a").lt(Value::Int64(30))), + ), + ); + // S: a < 5 + let split = col("a").lt(Value::Int64(5)); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } + + #[test] + fn test_split_rejects_or_in_split_expr() { + // R: a < 10 + let base = col("a").lt(Value::Int64(10)); + // S: (a < 5) OR (a >= 8 AND a < 9) falls outside the atomic split_expr contract. 
+ let split = PartitionExpr::new( + Operand::Expr(col("a").lt(Value::Int64(5))), + RestrictedOp::Or, + Operand::Expr( + col("a") + .gt_eq(Value::Int64(8)) + .and(col("a").lt(Value::Int64(9))), + ), + ); + + let result = split_partition_expr(base, split); + assert_eq!(result.unwrap_err(), ExprSplitDegradeReason::UnsupportedType); + } +} From fe45ae446c035392fc61845e1e0d95c14bcd52f7 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 28 Mar 2026 07:36:13 +0800 Subject: [PATCH 049/195] perf: optimize promql range functions (#7878) * bench(promql): add range-function benchmark suite * perf(promql): use flat buffers in range function hot loops * perf(promql): reuse quantile scratch buffers --- Cargo.lock | 9 +- src/promql/Cargo.toml | 8 + src/promql/benches/bench_main.rs | 21 ++ src/promql/benches/bench_range_fn.rs | 355 +++++++++++++++++++++ src/promql/src/functions.rs | 94 +++++- src/promql/src/functions/idelta.rs | 65 ++-- src/promql/src/functions/predict_linear.rs | 139 +++++--- src/promql/src/functions/quantile.rs | 80 +++-- 8 files changed, 653 insertions(+), 118 deletions(-) create mode 100644 src/promql/benches/bench_main.rs create mode 100644 src/promql/benches/bench_range_fn.rs diff --git a/Cargo.lock b/Cargo.lock index 676eaf0822..0f3b58b373 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10215,6 +10215,7 @@ dependencies = [ "common-macro", "common-recordbatch", "common-telemetry", + "criterion 0.7.0", "datafusion", "datafusion-common", "datafusion-expr", @@ -11595,9 +11596,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" dependencies = [ "ring", "rustls-pki-types", @@ -13365,9 +13366,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" 
-version = "0.4.45" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" dependencies = [ "filetime", "libc", diff --git a/src/promql/Cargo.toml b/src/promql/Cargo.toml index f93cb8beb9..306563d1ce 100644 --- a/src/promql/Cargo.toml +++ b/src/promql/Cargo.toml @@ -27,4 +27,12 @@ prost.workspace = true snafu.workspace = true [dev-dependencies] +criterion.workspace = true +datafusion-common.workspace = true +datafusion-expr.workspace = true +datatypes.workspace = true tokio.workspace = true + +[[bench]] +name = "bench_main" +harness = false diff --git a/src/promql/benches/bench_main.rs b/src/promql/benches/bench_main.rs new file mode 100644 index 0000000000..2d93887041 --- /dev/null +++ b/src/promql/benches/bench_main.rs @@ -0,0 +1,21 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use criterion::criterion_main; + +mod bench_range_fn; + +criterion_main! 
{ + bench_range_fn::benches +} diff --git a/src/promql/benches/bench_range_fn.rs b/src/promql/benches/bench_range_fn.rs new file mode 100644 index 0000000000..840956b942 --- /dev/null +++ b/src/promql/benches/bench_range_fn.rs @@ -0,0 +1,355 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Benchmarks for PromQL range functions. + +use std::sync::Arc; + +use criterion::{BenchmarkId, Criterion, criterion_group}; +use datafusion::arrow::array::{Float64Array, TimestampMillisecondArray}; +use datafusion::physical_plan::ColumnarValue; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::ScalarFunctionArgs; +use datatypes::arrow::datatypes::{DataType, Field}; +use promql::functions::{Delta, IDelta, Increase, PredictLinear, QuantileOverTime, Rate}; +use promql::range_array::RangeArray; + +fn build_sliding_ranges( + num_points: usize, + window_size: u32, + values: Vec, + eval_offset_ms: i64, +) -> (RangeArray, RangeArray, Arc) { + let step_ms = 1000i64; + let timestamps: Vec = (0..num_points as i64).map(|i| (i + 1) * step_ms).collect(); + + let ts_array = Arc::new(TimestampMillisecondArray::from(timestamps.clone())); + let val_array = Arc::new(Float64Array::from(values)); + + let num_windows = if num_points >= window_size as usize { + num_points - window_size as usize + 1 + } else { + 0 + }; + + let ranges: Vec<(u32, u32)> = (0..num_windows).map(|i| (i as u32, 
window_size)).collect(); + + let eval_ts: Vec = (0..num_windows) + .map(|i| timestamps[i + window_size as usize - 1] + eval_offset_ms) + .collect(); + let eval_ts_array = Arc::new(TimestampMillisecondArray::from(eval_ts)); + + let ts_range = RangeArray::from_ranges(ts_array, ranges.clone()).unwrap(); + let val_range = RangeArray::from_ranges(val_array, ranges).unwrap(); + + (ts_range, val_range, eval_ts_array) +} + +fn build_monotonic_counter_values(num_points: usize) -> Vec { + let mut current = 0.0; + (0..num_points) + .map(|i| { + current += 1.0 + (i % 7) as f64 * 0.25; + current + }) + .collect() +} + +fn build_resetting_counter_values(num_points: usize) -> Vec { + let mut current = 0.0; + (0..num_points) + .map(|i| { + if i > 0 && i % 37 == 0 { + current = 1.0; + } else { + current += 1.0 + (i % 5) as f64 * 0.5; + } + current + }) + .collect() +} + +fn build_gauge_values(num_points: usize) -> Vec { + (0..num_points) + .map(|i| ((i % 29) as f64 - 14.0) * 1.25 + (i % 3) as f64 * 0.1) + .collect() +} + +fn build_default_values(num_points: usize) -> Vec { + (0..num_points).map(|i| i as f64 * 1.5 + 0.1).collect() +} + +fn make_extrapolated_rate_input( + num_points: usize, + window_size: u32, + values: Vec, + eval_offset_ms: i64, +) -> Vec { + let (ts_range, val_range, eval_ts) = + build_sliding_ranges(num_points, window_size, values, eval_offset_ms); + let range_length = window_size as i64 * 1000; + vec![ + ColumnarValue::Array(Arc::new(ts_range.into_dict())), + ColumnarValue::Array(Arc::new(val_range.into_dict())), + ColumnarValue::Array(eval_ts), + ColumnarValue::Scalar(ScalarValue::Int64(Some(range_length))), + ] +} + +fn make_idelta_input(num_points: usize, window_size: u32) -> Vec { + let (ts_range, val_range, _) = + build_sliding_ranges(num_points, window_size, build_default_values(num_points), 0); + vec![ + ColumnarValue::Array(Arc::new(ts_range.into_dict())), + ColumnarValue::Array(Arc::new(val_range.into_dict())), + ] +} + +fn 
make_quantile_input(num_points: usize, window_size: u32) -> Vec { + let (ts_range, val_range, _) = + build_sliding_ranges(num_points, window_size, build_default_values(num_points), 0); + vec![ + ColumnarValue::Array(Arc::new(ts_range.into_dict())), + ColumnarValue::Array(Arc::new(val_range.into_dict())), + ColumnarValue::Scalar(ScalarValue::Float64(Some(0.9))), + ] +} + +fn make_predict_linear_input(num_points: usize, window_size: u32) -> Vec { + let (ts_range, val_range, _) = + build_sliding_ranges(num_points, window_size, build_default_values(num_points), 0); + vec![ + ColumnarValue::Array(Arc::new(ts_range.into_dict())), + ColumnarValue::Array(Arc::new(val_range.into_dict())), + // predict 60s into the future + ColumnarValue::Scalar(ScalarValue::Int64(Some(60))), + ] +} + +struct PreparedUdfCall { + args: Vec, + arg_fields: Vec>, + number_rows: usize, + return_field: Arc, + config_options: Arc, +} + +impl PreparedUdfCall { + fn new(args: Vec) -> Self { + let arg_fields = args + .iter() + .enumerate() + .map(|(i, c)| Arc::new(Field::new(format!("c{i}"), c.data_type(), true))) + .collect(); + let number_rows = args + .iter() + .find_map(|c| match c { + ColumnarValue::Array(a) => Some(a.len()), + _ => None, + }) + .unwrap_or(1); + Self { + args, + arg_fields, + number_rows, + return_field: Arc::new(Field::new("out", DataType::Float64, true)), + config_options: Arc::new(ConfigOptions::default()), + } + } +} + +fn invoke_prepared(udf: &datafusion::logical_expr::ScalarUDF, prepared: &PreparedUdfCall) { + udf.invoke_with_args(ScalarFunctionArgs { + args: prepared.args.clone(), + arg_fields: prepared.arg_fields.clone(), + number_rows: prepared.number_rows, + return_field: prepared.return_field.clone(), + config_options: prepared.config_options.clone(), + }) + .unwrap(); +} + +fn bench_range_functions(c: &mut Criterion) { + let mut group = c.benchmark_group("range_fn"); + + // Benchmark parameters: (total_points, window_size) + let params: &[(usize, u32)] = &[ + (1_000, 
10), // small series, small window + (10_000, 10), // large series, small window + (10_000, 60), // large series, typical 1-min window at 1s step + (10_000, 360), // large series, wide 6-min window + ]; + + // --- rate (monotonic counter) --- + let rate_udf = Rate::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_extrapolated_rate_input( + n, + w, + build_monotonic_counter_values(n), + 500, + )); + group.bench_with_input( + BenchmarkId::new("rate_counter", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&rate_udf, &prepared)), + ); + } + + // --- rate (periodic resets) --- + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_extrapolated_rate_input( + n, + w, + build_resetting_counter_values(n), + 500, + )); + group.bench_with_input( + BenchmarkId::new("rate_counter_reset", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&rate_udf, &prepared)), + ); + } + + // --- increase (monotonic counter) --- + let increase_udf = Increase::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_extrapolated_rate_input( + n, + w, + build_monotonic_counter_values(n), + 500, + )); + group.bench_with_input( + BenchmarkId::new("increase_counter", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&increase_udf, &prepared)), + ); + } + + // --- increase (periodic resets) --- + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_extrapolated_rate_input( + n, + w, + build_resetting_counter_values(n), + 500, + )); + group.bench_with_input( + BenchmarkId::new("increase_counter_reset", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&increase_udf, &prepared)), + ); + } + + // --- delta (gauge) --- + let delta_udf = Delta::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_extrapolated_rate_input( + n, + w, + build_gauge_values(n), + 500, + )); + group.bench_with_input( + 
BenchmarkId::new("delta_gauge", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&delta_udf, &prepared)), + ); + } + + // --- idelta --- + let idelta_udf = IDelta::::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_idelta_input(n, w)); + group.bench_with_input( + BenchmarkId::new("idelta", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&idelta_udf, &prepared)), + ); + } + + // --- irate --- + let irate_udf = IDelta::::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_idelta_input(n, w)); + group.bench_with_input( + BenchmarkId::new("irate", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&irate_udf, &prepared)), + ); + } + + // --- quantile_over_time --- + let quantile_udf = QuantileOverTime::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_quantile_input(n, w)); + group.bench_with_input( + BenchmarkId::new("quantile_over_time", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&quantile_udf, &prepared)), + ); + } + + // --- predict_linear --- + let predict_udf = PredictLinear::scalar_udf(); + for &(n, w) in params { + let prepared = PreparedUdfCall::new(make_predict_linear_input(n, w)); + group.bench_with_input( + BenchmarkId::new("predict_linear", format!("n{n}_w{w}")), + &(n, w), + |b, _| b.iter(|| invoke_prepared(&predict_udf, &prepared)), + ); + } + + // --- RangeArray: get vs get_offset_length micro-benchmark --- + // Isolates the overhead of array slicing vs offset/length lookup + for &(n, w) in params { + let step_ms = 1000i64; + let timestamps: Vec = (0..n as i64).map(|i| (i + 1) * step_ms).collect(); + let ts_array = Arc::new(TimestampMillisecondArray::from(timestamps)); + let num_windows = n - w as usize + 1; + let ranges: Vec<(u32, u32)> = (0..num_windows).map(|i| (i as u32, w)).collect(); + let range_array = RangeArray::from_ranges(ts_array, ranges).unwrap(); + + 
group.bench_with_input( + BenchmarkId::new("range_array_get", format!("n{n}_w{w}")), + &(), + |b, _| { + b.iter(|| { + for i in 0..range_array.len() { + std::hint::black_box(range_array.get(i)); + } + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("range_array_get_offset_length", format!("n{n}_w{w}")), + &(), + |b, _| { + b.iter(|| { + for i in 0..range_array.len() { + std::hint::black_box(range_array.get_offset_length(i)); + } + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_range_functions); diff --git a/src/promql/src/functions.rs b/src/promql/src/functions.rs index e392d7dcf5..7c7452566a 100644 --- a/src/promql/src/functions.rs +++ b/src/promql/src/functions.rs @@ -31,9 +31,13 @@ pub use aggr_over_time::{ PresentOverTime, StddevOverTime, StdvarOverTime, SumOverTime, }; pub use changes::Changes; -use datafusion::arrow::array::{ArrayRef, Float64Array, TimestampMillisecondArray}; +use datafusion::arrow::array::{ + ArrayRef, DictionaryArray, Float64Array, TimestampMillisecondArray, +}; use datafusion::error::DataFusionError; use datafusion::physical_plan::ColumnarValue; +use datatypes::arrow::array::Array; +use datatypes::arrow::datatypes::Int64Type; pub use deriv::Deriv; pub use double_exponential_smoothing::DoubleExponentialSmoothing; pub use extrapolate_rate::{Delta, Increase, Rate}; @@ -44,6 +48,8 @@ pub use quantile_aggr::{QUANTILE_NAME, quantile_udaf}; pub use resets::Resets; pub use round::Round; +use crate::range_array::RangeArray; + /// Extracts an array from a `ColumnarValue`. /// /// If the `ColumnarValue` is a scalar, it converts it to an array of size 1. @@ -54,6 +60,24 @@ pub(crate) fn extract_array(columnar_value: &ColumnarValue) -> Result Result { + let array = extract_array(columnar_value)?; + let dict = array + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + DataFusionError::Execution(format!( + "expected DictionaryArray, found {}", + array.data_type() + )) + })? 
+ .clone(); + RangeArray::try_new(dict).map_err(DataFusionError::from) +} + /// compensation(Kahan) summation algorithm - a technique for reducing the numerical error /// in floating-point arithmetic. The algorithm also includes the modification ("Neumaier improvement") /// that reduces the numerical error further in cases @@ -78,6 +102,29 @@ pub(crate) fn linear_regression( values: &Float64Array, intercept_time: i64, ) -> (Option, Option) { + linear_regression_slice(times.values(), values, 0, values.len(), intercept_time) +} + +pub(crate) fn linear_regression_slice( + times: &[i64], + values: &Float64Array, + offset: usize, + len: usize, + intercept_time: i64, +) -> (Option, Option) { + linear_regression_slices(times, offset, values, offset, len, intercept_time) +} + +pub(crate) fn linear_regression_slices( + times: &[i64], + time_offset: usize, + values: &Float64Array, + value_offset: usize, + len: usize, + intercept_time: i64, +) -> (Option, Option) { + let raw_values = values.values(); + let has_nulls = values.null_count() > 0; let mut count: f64 = 0.0; let mut sum_x: f64 = 0.0; let mut sum_y: f64 = 0.0; @@ -89,15 +136,18 @@ pub(crate) fn linear_regression( let mut comp_x2: f64 = 0.0; let mut const_y = true; - let init_y: f64 = values.value(0); + let mut init_y = None; - for (i, value) in values.iter().enumerate() { - let time = times.value(i) as f64; - if value.is_none() { + for i in 0..len { + let time_idx = time_offset + i; + let value_idx = value_offset + i; + if has_nulls && values.is_null(value_idx) { continue; } - let value = value.unwrap(); - if const_y && i > 0 && value != init_y { + let value = raw_values[value_idx]; + let time = times[time_idx] as f64; + let initial = init_y.get_or_insert(value); + if const_y && count > 0.0 && value != *initial { const_y = false; } count += 1.0; @@ -113,6 +163,7 @@ pub(crate) fn linear_regression( } if const_y { + let init_y = init_y.unwrap(); if !init_y.is_finite() { return (None, None); } @@ -135,7 +186,14 @@ 
pub(crate) fn linear_regression( #[cfg(test)] mod test { + use std::sync::Arc; + + use datafusion::physical_plan::ColumnarValue; + use datatypes::arrow::array::Int64Array; + use datatypes::arrow::datatypes::Int64Type; + use super::*; + use crate::range_array::RangeArray; #[test] fn calculate_linear_regression_none() { @@ -253,4 +311,26 @@ mod test { } assert_eq!(sum + c, 2.0) } + + #[test] + fn extract_range_array_rejects_external_dictionary_with_null_keys() { + let keys = Int64Array::from_iter([Some(0), None]); + let values = Arc::new(Float64Array::from_iter([1.0, 2.0])); + let dict = DictionaryArray::::try_new(keys, values).unwrap(); + + let err = extract_range_array(&ColumnarValue::Array(Arc::new(dict))).unwrap_err(); + assert!(err.to_string().contains("Empty range is not expected")); + } + + #[test] + fn extract_range_array_accepts_internal_packed_ranges() { + let values = Arc::new(Float64Array::from_iter([1.0, 2.0, 3.0])); + let range_array = RangeArray::from_ranges(values, [(0, 2), (1, 2)]).unwrap(); + + let extracted = + extract_range_array(&ColumnarValue::Array(Arc::new(range_array.into_dict()))).unwrap(); + + assert_eq!(extracted.get_offset_length(0), Some((0, 2))); + assert_eq!(extracted.get_offset_length(1), Some((1, 2))); + } } diff --git a/src/promql/src/functions/idelta.rs b/src/promql/src/functions/idelta.rs index eeec9a4be9..0772b0bf1e 100644 --- a/src/promql/src/functions/idelta.rs +++ b/src/promql/src/functions/idelta.rs @@ -15,7 +15,7 @@ use std::fmt::Display; use std::sync::Arc; -use datafusion::arrow::array::{Float64Array, TimestampMillisecondArray}; +use datafusion::arrow::array::{Float64Array, Float64Builder, TimestampMillisecondArray}; use datafusion::arrow::datatypes::TimeUnit; use datafusion::common::DataFusionError; use datafusion::logical_expr::{ScalarUDF, Volatility}; @@ -94,49 +94,54 @@ impl IDelta { )), )?; - // calculation - let mut result_array = Vec::with_capacity(ts_range.len()); + let ts_values = ts_range.values(); + let 
ts_values = ts_values + .as_any() + .downcast_ref::() + .unwrap() + .values(); + + let value_values = value_range.values(); + let value_values = value_values + .as_any() + .downcast_ref::() + .unwrap() + .values(); + + let mut result_builder = Float64Builder::with_capacity(ts_range.len()); for index in 0..ts_range.len() { - let timestamps = ts_range.get(index).unwrap(); - let timestamps = timestamps - .as_any() - .downcast_ref::() - .unwrap() - .values(); - - let values = value_range.get(index).unwrap(); - let values = values - .as_any() - .downcast_ref::() - .unwrap() - .values(); + let (ts_offset, len) = ts_range.get_offset_length(index).unwrap(); + let (value_offset, value_len) = value_range.get_offset_length(index).unwrap(); error::ensure( - timestamps.len() == values.len(), + len == value_len, DataFusionError::Execution(format!( "{}: input arrays should have the same length, found {} and {}", Self::name(), - timestamps.len(), - values.len() + len, + value_len )), )?; - - let len = timestamps.len(); if len < 2 { - result_array.push(None); + result_builder.append_null(); continue; } - // if is delta + let last_offset = ts_offset + len - 1; + let prev_offset = last_offset - 1; + let sampled_interval = + (ts_values[last_offset] - ts_values[prev_offset]) as f64 / 1000.0; + + let last_value_offset = value_offset + len - 1; + let prev_value_offset = last_value_offset - 1; + let last_value = value_values[last_value_offset]; + let prev_value = value_values[prev_value_offset]; + if !IS_RATE { - result_array.push(Some(values[len - 1] - values[len - 2])); + result_builder.append_value(last_value - prev_value); continue; } - // else is rate - let sampled_interval = (timestamps[len - 1] - timestamps[len - 2]) as f64 / 1000.0; - let last_value = values[len - 1]; - let prev_value = values[len - 2]; let result_value = if last_value < prev_value { // counter reset last_value @@ -144,10 +149,10 @@ impl IDelta { last_value - prev_value }; - result_array.push(Some(result_value / 
sampled_interval as f64)); + result_builder.append_value(result_value / sampled_interval); } - let result = ColumnarValue::Array(Arc::new(Float64Array::from_iter(result_array))); + let result = ColumnarValue::Array(Arc::new(result_builder.finish())); Ok(result) } } diff --git a/src/promql/src/functions/predict_linear.rs b/src/promql/src/functions/predict_linear.rs index 09a46ed48f..dc49ec5d9f 100644 --- a/src/promql/src/functions/predict_linear.rs +++ b/src/promql/src/functions/predict_linear.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use datafusion::arrow::array::{Float64Array, TimestampMillisecondArray}; +use datafusion::arrow::array::{Float64Array, Float64Builder, TimestampMillisecondArray}; use datafusion::arrow::datatypes::TimeUnit; use datafusion::common::DataFusionError; use datafusion::logical_expr::{ScalarUDF, Volatility}; @@ -28,7 +28,7 @@ use datatypes::arrow::array::Array; use datatypes::arrow::datatypes::DataType; use crate::error; -use crate::functions::{extract_array, linear_regression}; +use crate::functions::{extract_range_array, linear_regression_slices}; use crate::range_array::RangeArray; pub struct PredictLinear; @@ -62,12 +62,10 @@ impl PredictLinear { DataFusionError::Plan("prom_predict_linear function should have 3 inputs".to_string()), )?; - let ts_array = extract_array(&input[0])?; - let value_array = extract_array(&input[1])?; let t_col = &input[2]; - let ts_range: RangeArray = RangeArray::try_new(ts_array.to_data().into())?; - let value_range: RangeArray = RangeArray::try_new(value_array.to_data().into())?; + let ts_range = extract_range_array(&input[0])?; + let value_range = extract_range_array(&input[1])?; error::ensure( ts_range.len() == value_range.len(), DataFusionError::Execution(format!( @@ -130,74 +128,85 @@ impl PredictLinear { Box::new(t_array.iter()) } }; - let mut result_array = Vec::with_capacity(ts_range.len()); + let all_timestamps = ts_range + .values() + .as_any() + .downcast_ref::() + .unwrap() + .values(); + let 
all_values = value_range + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let mut result_builder = Float64Builder::with_capacity(ts_range.len()); for (index, t) in t_iter.enumerate() { - let (timestamps, values) = get_ts_values(&ts_range, &value_range, index, Self::name())?; - let ret = predict_linear_impl(&timestamps, &values, t.unwrap()); - result_array.push(ret); + match predict_linear_impl( + &ts_range, + &value_range, + all_timestamps, + all_values, + index, + t.unwrap(), + Self::name(), + )? { + Some(value) => result_builder.append_value(value), + None => result_builder.append_null(), + } } - let result = ColumnarValue::Array(Arc::new(Float64Array::from_iter(result_array))); + let result = ColumnarValue::Array(Arc::new(result_builder.finish())); Ok(result) } } -fn get_ts_values( +fn predict_linear_impl( ts_range: &RangeArray, value_range: &RangeArray, + all_timestamps: &[i64], + all_values: &Float64Array, index: usize, + t: i64, func_name: &str, -) -> Result<(TimestampMillisecondArray, Float64Array), DataFusionError> { - let timestamps = ts_range - .get(index) - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .clone(); - let values = value_range - .get(index) - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .clone(); +) -> Result, DataFusionError> { + let (ts_offset, ts_len) = ts_range.get_offset_length(index).unwrap(); + let (value_offset, value_len) = value_range.get_offset_length(index).unwrap(); error::ensure( - timestamps.len() == values.len(), + ts_len == value_len, DataFusionError::Execution(format!( "{}: time and value arrays in a group should have the same length, found {} and {}", - func_name, - timestamps.len(), - values.len() + func_name, ts_len, value_len )), )?; - Ok((timestamps, values)) -} - -fn predict_linear_impl( - timestamps: &TimestampMillisecondArray, - values: &Float64Array, - t: i64, -) -> Option { - if timestamps.len() < 2 { - return None; + if ts_len < 2 { + return Ok(None); } // last timestamp is evaluation 
timestamp - let evaluate_ts = timestamps.value(timestamps.len() - 1); - let (slope, intercept) = linear_regression(timestamps, values, evaluate_ts); + let evaluate_ts = all_timestamps[ts_offset + ts_len - 1]; + let (slope, intercept) = linear_regression_slices( + all_timestamps, + ts_offset, + all_values, + value_offset, + value_len, + evaluate_ts, + ); if slope.is_none() || intercept.is_none() { - return None; + return Ok(None); } - Some(slope.unwrap() * t as f64 + intercept.unwrap()) + Ok(Some(slope.unwrap() * t as f64 + intercept.unwrap())) } #[cfg(test)] mod test { use std::vec; + use datafusion::arrow::array::{DictionaryArray, Int64Array}; + use datatypes::arrow::datatypes::Int64Type; + use super::*; use crate::functions::test_util::simple_range_udf_runner; @@ -304,4 +313,44 @@ mod test { vec![Some(82765.9090909091)], ); } + + #[test] + fn calculate_predict_linear_with_misaligned_offsets() { + let ts_values = Arc::new(TimestampMillisecondArray::from_iter( + [0i64, 1000, 2000, 3000].into_iter().map(Some), + )); + let value_values = Arc::new(Float64Array::from_iter([10.0, 20.0, 30.0])); + let ts_array = RangeArray::from_ranges(ts_values, [(1, 3)]).unwrap(); + let value_array = RangeArray::from_ranges(value_values, [(0, 3)]).unwrap(); + + simple_range_udf_runner( + PredictLinear::scalar_udf(), + ts_array, + value_array, + vec![ScalarValue::Int64(Some(0))], + vec![Some(30.0)], + ); + } + + #[test] + fn predict_linear_rejects_external_dictionary_with_null_keys() { + let ts_values = Arc::new(TimestampMillisecondArray::from_iter( + [0i64, 1000].into_iter().map(Some), + )); + let ts_keys = Int64Array::from_iter([Some(0), None]); + let ts_dict = DictionaryArray::::try_new(ts_keys, ts_values).unwrap(); + + let value_values = Arc::new(Float64Array::from_iter([1.0, 2.0])); + let value_keys = Int64Array::from_iter([Some(0), Some(1)]); + let value_dict = DictionaryArray::::try_new(value_keys, value_values).unwrap(); + + let err = PredictLinear::predict_linear(&[ + 
ColumnarValue::Array(Arc::new(ts_dict)), + ColumnarValue::Array(Arc::new(value_dict)), + ColumnarValue::Scalar(ScalarValue::Int64(Some(0))), + ]) + .unwrap_err(); + + assert!(err.to_string().contains("Empty range is not expected")); + } } diff --git a/src/promql/src/functions/quantile.rs b/src/promql/src/functions/quantile.rs index f368d5908c..93fc632d68 100644 --- a/src/promql/src/functions/quantile.rs +++ b/src/promql/src/functions/quantile.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use datafusion::arrow::array::Float64Array; +use datafusion::arrow::array::{Float64Array, Float64Builder}; use datafusion::arrow::datatypes::TimeUnit; use datafusion::common::DataFusionError; use datafusion::logical_expr::{ScalarUDF, Volatility}; @@ -93,8 +93,14 @@ impl QuantileOverTime { )), )?; - // calculation - let mut result_array = Vec::with_capacity(ts_range.len()); + let all_values = value_range + .values() + .as_any() + .downcast_ref::() + .unwrap() + .values(); + let mut result_builder = Float64Builder::with_capacity(ts_range.len()); + let mut scratch = Vec::new(); match quantile_col { ColumnarValue::Scalar(quantile_scalar) => { @@ -107,25 +113,26 @@ impl QuantileOverTime { }; for index in 0..ts_range.len() { - let timestamps = ts_range.get(index).unwrap(); - let values = value_range.get(index).unwrap(); - let values = values - .as_any() - .downcast_ref::() - .unwrap() - .values(); + let (_, ts_len) = ts_range.get_offset_length(index).unwrap(); + let (value_offset, value_len) = value_range.get_offset_length(index).unwrap(); error::ensure( - timestamps.len() == values.len(), + ts_len == value_len, DataFusionError::Execution(format!( "{}: time and value arrays in a group should have the same length, found {} and {}", Self::name(), - timestamps.len(), - values.len() + ts_len, + value_len )), )?; - let result = quantile_impl(values, quantile); - result_array.push(result); + match quantile_with_scratch( + &all_values[value_offset..value_offset + value_len], + quantile, + &mut 
scratch, + ) { + Some(value) => result_builder.append_value(value), + None => result_builder.append_null(), + } } } ColumnarValue::Array(quantile_array) => { @@ -150,20 +157,15 @@ impl QuantileOverTime { )), )?; for index in 0..ts_range.len() { - let timestamps = ts_range.get(index).unwrap(); - let values = value_range.get(index).unwrap(); - let values = values - .as_any() - .downcast_ref::() - .unwrap() - .values(); + let (_, ts_len) = ts_range.get_offset_length(index).unwrap(); + let (value_offset, value_len) = value_range.get_offset_length(index).unwrap(); error::ensure( - timestamps.len() == values.len(), + ts_len == value_len, DataFusionError::Execution(format!( "{}: time and value arrays in a group should have the same length, found {} and {}", Self::name(), - timestamps.len(), - values.len() + ts_len, + value_len )), )?; let quantile = if quantile_array.is_null(index) { @@ -171,19 +173,32 @@ impl QuantileOverTime { } else { quantile_array.value(index) }; - let result = quantile_impl(values, quantile); - result_array.push(result); + match quantile_with_scratch( + &all_values[value_offset..value_offset + value_len], + quantile, + &mut scratch, + ) { + Some(value) => result_builder.append_value(value), + None => result_builder.append_null(), + } } } } - let result = ColumnarValue::Array(Arc::new(Float64Array::from_iter(result_array))); + let result = ColumnarValue::Array(Arc::new(result_builder.finish())); Ok(result) } } /// Refer to pub(crate) fn quantile_impl(values: &[f64], quantile: f64) -> Option { + let mut scratch = Vec::new(); + quantile_with_scratch(values, quantile, &mut scratch) +} + +/// Same as [quantile_impl] but reuses a caller-provided scratch buffer to avoid +/// per-call allocation. 
+fn quantile_with_scratch(values: &[f64], quantile: f64, scratch: &mut Vec) -> Option { if quantile.is_nan() || values.is_empty() { return Some(f64::NAN); } @@ -194,17 +209,18 @@ pub(crate) fn quantile_impl(values: &[f64], quantile: f64) -> Option { return Some(f64::INFINITY); } - let mut values = values.to_vec(); - values.sort_unstable_by(f64::total_cmp); + scratch.clear(); + scratch.extend_from_slice(values); + scratch.sort_unstable_by(f64::total_cmp); - let length = values.len(); + let length = scratch.len(); let rank = quantile * (length - 1) as f64; let lower_index = 0.max(rank.floor() as usize); let upper_index = (length - 1).min(lower_index + 1); let weight = rank - rank.floor(); - let result = values[lower_index] * (1.0 - weight) + values[upper_index] * weight; + let result = scratch[lower_index] * (1.0 - weight) + scratch[upper_index] * weight; Some(result) } From 8e7e4a91d29c4c8cb17788a84d7c61a4fd9d74c1 Mon Sep 17 00:00:00 2001 From: cui Date: Mon, 30 Mar 2026 11:22:04 +0800 Subject: [PATCH 050/195] fix(datatypes): correct ConstantVector rhs comparison in vector equality (#7866) * fix(datatypes): compare ConstantVector rhs inner in vector equality When either operand is a ConstantVector, the recursive equal() call must compare lhs.inner() against rhs.inner(). The second argument incorrectly used lhs twice, breaking equality when only the rhs was constant. 
Signed-off-by: Weixie Cui * fix: review Signed-off-by: Weixie Cui --------- Signed-off-by: Weixie Cui --- src/datatypes/src/vectors/eq.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/datatypes/src/vectors/eq.rs b/src/datatypes/src/vectors/eq.rs index 372b3ceee4..e8e942d3ff 100644 --- a/src/datatypes/src/vectors/eq.rs +++ b/src/datatypes/src/vectors/eq.rs @@ -67,7 +67,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { .downcast_ref::() .unwrap() .inner(), - &**lhs + &**rhs .as_any() .downcast_ref::() .unwrap() @@ -259,6 +259,22 @@ mod tests { ]))); } + // Regression: second arm must downcast `rhs` (was `lhs`), or same-length ConstantVectors + // with different inners compare equal. + #[test] + fn test_constant_vector_eq_compares_both_inners() { + assert_vector_ref_ne( + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![true])), + 5, + )), + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![false])), + 5, + )), + ); + } + #[test] fn test_vector_ne() { assert_vector_ref_ne( From 3904df5397af595749dbce7334295b121750cef8 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Sun, 29 Mar 2026 20:32:49 -0700 Subject: [PATCH 051/195] chore: refine track memory metrics semantics (#7874) Signed-off-by: jeremyhi --- src/common/memory-manager/src/guard.rs | 2 +- src/common/memory-manager/src/lib.rs | 2 +- src/common/memory-manager/src/manager.rs | 5 +- src/common/recordbatch/src/lib.rs | 128 +++++++++++++++++++-- src/mito2/src/compaction/memory_manager.rs | 2 +- src/mito2/src/engine.rs | 9 +- src/mito2/src/metrics.rs | 9 +- src/servers/src/metrics.rs | 2 +- src/servers/src/request_memory_metrics.rs | 2 +- 9 files changed, 140 insertions(+), 21 deletions(-) diff --git a/src/common/memory-manager/src/guard.rs b/src/common/memory-manager/src/guard.rs index ad3111581b..784b72830d 100644 --- a/src/common/memory-manager/src/guard.rs +++ b/src/common/memory-manager/src/guard.rs @@ -172,7 +172,7 @@ impl 
MemoryGuard { true } Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { - quota.metrics.inc_rejected("try_acquire_additional"); + quota.metrics.inc_exhausted("try_acquire_additional"); false } } diff --git a/src/common/memory-manager/src/lib.rs b/src/common/memory-manager/src/lib.rs index b1d858ef89..983c6ca524 100644 --- a/src/common/memory-manager/src/lib.rs +++ b/src/common/memory-manager/src/lib.rs @@ -45,5 +45,5 @@ impl MemoryMetrics for NoOpMetrics { fn set_in_use(&self, _: i64) {} #[inline(always)] - fn inc_rejected(&self, _: &str) {} + fn inc_exhausted(&self, _: &str) {} } diff --git a/src/common/memory-manager/src/manager.rs b/src/common/memory-manager/src/manager.rs index 8cca5f220c..282f51e315 100644 --- a/src/common/memory-manager/src/manager.rs +++ b/src/common/memory-manager/src/manager.rs @@ -29,7 +29,8 @@ use crate::policy::OnExhaustedPolicy; pub trait MemoryMetrics: Clone + Send + Sync + 'static { fn set_limit(&self, bytes: i64); fn set_in_use(&self, bytes: i64); - fn inc_rejected(&self, reason: &str); + /// Record that immediate memory acquisition failed due to exhausted quota. + fn inc_exhausted(&self, reason: &str); } /// Generic memory manager for quota-controlled operations. 
@@ -171,7 +172,7 @@ impl MemoryManager { Some(MemoryGuard::limited(quota.clone(), permit)) } Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { - quota.metrics.inc_rejected("try_acquire"); + quota.metrics.inc_exhausted("try_acquire"); None } } diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 0a2d697407..d84e9e9d26 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -437,7 +437,8 @@ impl fmt::Debug for QueryMemoryTracker { .field("limit", &self.limit()) .field("on_exhausted_policy", &self.on_exhausted_policy) .field("on_update", &self.metrics.has_on_update()) - .field("on_reject", &self.metrics.has_on_reject()) + .field("on_exhausted", &self.metrics.has_on_exhausted()) + .field("on_rejected", &self.metrics.has_on_rejected()) .finish() } } @@ -452,6 +453,7 @@ impl QueryMemoryTracker { limit, on_exhausted_policy, on_update: None, + on_exhausted: None, on_reject: None, } } @@ -489,6 +491,10 @@ impl QueryMemoryTracker { ); error::ExceedMemoryLimitSnafu { msg }.build() } + + fn inc_rejected(&self) { + self.metrics.inc_rejected(); + } } /// Builder for constructing a [`QueryMemoryTracker`] with optional callbacks. @@ -496,6 +502,7 @@ pub struct QueryMemoryTrackerBuilder { limit: usize, on_exhausted_policy: OnExhaustedPolicy, on_update: Option, + on_exhausted: Option, on_reject: Option, } @@ -514,11 +521,21 @@ impl QueryMemoryTrackerBuilder { self } - /// Set a callback to be called when memory allocation is rejected. + /// Set a callback to be called when memory is unavailable for immediate acquisition. /// /// # Note - /// This is only called when `track()` fails due to exceeding the limit. + /// This is called when the non-blocking allocation fast path fails. + /// Requests using `OnExhaustedPolicy::Wait` may still succeed after waiting. /// It is never called when `limit == 0` (unlimited mode). 
+ pub fn on_exhausted(mut self, on_exhausted: F) -> Self + where + F: Fn() + Send + Sync + 'static, + { + self.on_exhausted = Some(Arc::new(on_exhausted)); + self + } + + /// Set a callback to be called when the request ultimately fails due to memory pressure. pub fn on_reject(mut self, on_reject: F) -> Self where F: Fn() + Send + Sync + 'static, @@ -529,7 +546,7 @@ impl QueryMemoryTrackerBuilder { /// Build a [`QueryMemoryTracker`] from this builder. pub fn build(self) -> QueryMemoryTracker { - let metrics = CallbackMemoryMetrics::new(self.on_update, self.on_reject); + let metrics = CallbackMemoryMetrics::new(self.on_update, self.on_exhausted, self.on_reject); let manager = MemoryManager::with_granularity( self.limit as u64, PermitGranularity::Kilobyte, @@ -553,6 +570,10 @@ struct StreamMemoryTracker { type MemoryAcquireResult = std::result::Result<(), common_memory_manager::Error>; impl StreamMemoryTracker { + fn inc_rejected(&self) { + self.tracker.inc_rejected(); + } + fn try_track(&mut self, additional: usize) -> Result<()> { if self.guard.try_acquire_additional(additional as u64) { self.tracked_bytes = self.tracked_bytes.saturating_add(additional); @@ -613,18 +634,25 @@ struct CallbackMemoryMetrics { } type UpdateCallback = Arc; -type RejectCallback = Arc; +type UnitCallback = Arc; +type RejectCallback = UnitCallback; struct CallbackMemoryMetricsInner { on_update: Option, + on_exhausted: Option, on_reject: Option, } impl CallbackMemoryMetrics { - fn new(on_update: Option, on_reject: Option) -> Self { + fn new( + on_update: Option, + on_exhausted: Option, + on_reject: Option, + ) -> Self { Self { inner: Arc::new(CallbackMemoryMetricsInner { on_update, + on_exhausted, on_reject, }), } @@ -634,9 +662,19 @@ impl CallbackMemoryMetrics { self.inner.on_update.is_some() } - fn has_on_reject(&self) -> bool { + fn has_on_exhausted(&self) -> bool { + self.inner.on_exhausted.is_some() + } + + fn has_on_rejected(&self) -> bool { self.inner.on_reject.is_some() } + + fn 
inc_rejected(&self) { + if let Some(callback) = &self.inner.on_reject { + callback(); + } + } } impl MemoryMetrics for CallbackMemoryMetrics { @@ -648,8 +686,8 @@ impl MemoryMetrics for CallbackMemoryMetrics { } } - fn inc_rejected(&self, _: &str) { - if let Some(callback) = &self.inner.on_reject { + fn inc_exhausted(&self, _: &str) { + if let Some(callback) = &self.inner.on_exhausted { callback(); } } @@ -712,7 +750,10 @@ impl MemoryTrackedStream { Poll::Ready((tracker, batch, additional, result)) => { let output = match result { Ok(()) => Ok(batch), - Err(error) => Err(tracker.wait_error(additional, error)), + Err(error) => { + tracker.inc_rejected(); + Err(tracker.wait_error(additional, error)) + } }; self.waiting = None; self.tracker = Some(tracker); @@ -732,7 +773,10 @@ impl MemoryTrackedStream { if let Err(error) = tracker.try_track(additional) { match tracker.tracker.on_exhausted_policy { - OnExhaustedPolicy::Fail => return Poll::Ready(Some(Err(error))), + OnExhaustedPolicy::Fail => { + tracker.inc_rejected(); + return Poll::Ready(Some(Err(error))); + } // `Wait` is a deliberate tradeoff: the batch has already been materialized, so we // keep it in memory while waiting for quota instead of failing immediately. 
Under // contention, real memory usage can therefore exceed `scan_memory_limit` by up to @@ -786,6 +830,7 @@ impl RecordBatchStream for MemoryTrackedStream { #[cfg(test)] mod tests { use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Duration; use common_memory_manager::{OnExhaustedPolicy, PermitGranularity}; @@ -988,12 +1033,22 @@ mod tests { #[tokio::test] async fn test_memory_tracked_stream_waits_for_capacity() { + let exhausted = Arc::new(AtomicUsize::new(0)); + let rejected = Arc::new(AtomicUsize::new(0)); + let exhausted_counter = exhausted.clone(); + let rejected_counter = rejected.clone(); let tracker = QueryMemoryTracker::builder( MB, OnExhaustedPolicy::Wait { timeout: Duration::from_millis(200), }, ) + .on_exhausted(move || { + exhausted_counter.fetch_add(1, Ordering::Relaxed); + }) + .on_reject(move || { + rejected_counter.fetch_add(1, Ordering::Relaxed); + }) .build(); let batch = large_string_batch(700 * 1024); let expected_bytes = aligned_tracked_bytes(batch.buffer_memory_size()); @@ -1025,16 +1080,28 @@ mod tests { drop(stream1); let second = waiter.await.unwrap().unwrap(); assert_eq!(second.num_rows(), 1); + assert_eq!(exhausted.load(Ordering::Relaxed), 1); + assert_eq!(rejected.load(Ordering::Relaxed), 0); } #[tokio::test] async fn test_memory_tracked_stream_wait_times_out() { + let exhausted = Arc::new(AtomicUsize::new(0)); + let rejected = Arc::new(AtomicUsize::new(0)); + let exhausted_counter = exhausted.clone(); + let rejected_counter = rejected.clone(); let tracker = QueryMemoryTracker::builder( MB, OnExhaustedPolicy::Wait { timeout: Duration::from_millis(50), }, ) + .on_exhausted(move || { + exhausted_counter.fetch_add(1, Ordering::Relaxed); + }) + .on_reject(move || { + rejected_counter.fetch_add(1, Ordering::Relaxed); + }) .build(); let batch = large_string_batch(700 * 1024); @@ -1058,5 +1125,44 @@ mod tests { .unwrap(); let error = result.unwrap().unwrap_err(); assert!(error.to_string().contains("timed out 
waiting")); + assert_eq!(exhausted.load(Ordering::Relaxed), 1); + assert_eq!(rejected.load(Ordering::Relaxed), 1); + } + + #[tokio::test] + async fn test_memory_tracked_stream_fail_policy_rejects_immediately() { + let exhausted = Arc::new(AtomicUsize::new(0)); + let rejected = Arc::new(AtomicUsize::new(0)); + let exhausted_counter = exhausted.clone(); + let rejected_counter = rejected.clone(); + let tracker = QueryMemoryTracker::builder(MB, OnExhaustedPolicy::Fail) + .on_exhausted(move || { + exhausted_counter.fetch_add(1, Ordering::Relaxed); + }) + .on_reject(move || { + rejected_counter.fetch_add(1, Ordering::Relaxed); + }) + .build(); + let batch = large_string_batch(700 * 1024); + + let mut stream1 = MemoryTrackedStream::new( + RecordBatches::try_new(batch.schema.clone(), vec![batch.clone()]) + .unwrap() + .as_stream(), + tracker.clone(), + ); + let first = stream1.next().await.unwrap().unwrap(); + assert_eq!(first.num_rows(), 1); + + let mut stream2 = MemoryTrackedStream::new( + RecordBatches::try_new(batch.schema.clone(), vec![batch]) + .unwrap() + .as_stream(), + tracker, + ); + let result = stream2.next().await.unwrap(); + assert!(result.is_err()); + assert_eq!(exhausted.load(Ordering::Relaxed), 1); + assert_eq!(rejected.load(Ordering::Relaxed), 1); } } diff --git a/src/mito2/src/compaction/memory_manager.rs b/src/mito2/src/compaction/memory_manager.rs index 8cbb5d293a..94b0779254 100644 --- a/src/mito2/src/compaction/memory_manager.rs +++ b/src/mito2/src/compaction/memory_manager.rs @@ -31,7 +31,7 @@ impl MemoryMetrics for CompactionMemoryMetrics { COMPACTION_MEMORY_IN_USE.set(bytes); } - fn inc_rejected(&self, reason: &str) { + fn inc_exhausted(&self, reason: &str) { COMPACTION_MEMORY_REJECTED .with_label_values(&[reason]) .inc(); diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index fbafe1da67..d1c30c3ff6 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -138,7 +138,8 @@ use crate::gc::GcLimiterRef; use 
crate::manifest::action::RegionEdit; use crate::memtable::MemtableStats; use crate::metrics::{ - HANDLE_REQUEST_ELAPSED, SCAN_MEMORY_USAGE_BYTES, SCAN_REQUESTS_REJECTED_TOTAL, + HANDLE_REQUEST_ELAPSED, SCAN_MEMORY_EXHAUSTED_TOTAL, SCAN_MEMORY_USAGE_BYTES, + SCAN_REQUESTS_REJECTED_TOTAL, }; use crate::read::scan_region::{ScanRegion, Scanner}; use crate::read::stream::ScanBatchStream; @@ -231,6 +232,9 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { .on_update(|usage| { SCAN_MEMORY_USAGE_BYTES.set(usage as i64); }) + .on_exhausted(|| { + SCAN_MEMORY_EXHAUSTED_TOTAL.inc(); + }) .on_reject(|| { SCAN_REQUESTS_REJECTED_TOTAL.inc(); }) @@ -1380,6 +1384,9 @@ impl MitoEngine { .on_update(|usage| { SCAN_MEMORY_USAGE_BYTES.set(usage as i64); }) + .on_exhausted(|| { + SCAN_MEMORY_EXHAUSTED_TOTAL.inc(); + }) .on_reject(|| { SCAN_REQUESTS_REJECTED_TOTAL.inc(); }) diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index c0537567f9..30a7ac765c 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -244,10 +244,15 @@ lazy_static! { "greptime_mito_scan_memory_usage_bytes", "current scan memory usage in bytes" ).unwrap(); - /// Counter of rejected scan requests due to memory limit. + /// Counter of scan allocation attempts that could not acquire memory immediately. + pub static ref SCAN_MEMORY_EXHAUSTED_TOTAL: IntCounter = register_int_counter!( + "greptime_mito_scan_memory_exhausted_total", + "total number of times scan memory was unavailable for immediate acquisition" + ).unwrap(); + /// Counter of scan requests that ultimately failed due to memory pressure. pub static ref SCAN_REQUESTS_REJECTED_TOTAL: IntCounter = register_int_counter!( "greptime_mito_scan_requests_rejected_total", - "total number of scan requests rejected due to memory limit" + "total number of scan requests that ultimately failed due to memory limit" ).unwrap(); /// Gauge for active file range builders in the pruner. 
pub static ref PRUNER_ACTIVE_BUILDERS: IntGauge = register_int_gauge!( diff --git a/src/servers/src/metrics.rs b/src/servers/src/metrics.rs index 37f923b73d..e3bff7fdbc 100644 --- a/src/servers/src/metrics.rs +++ b/src/servers/src/metrics.rs @@ -361,7 +361,7 @@ lazy_static! { "maximum bytes allowed for all concurrent request bodies and messages" ).unwrap(); - /// Total number of rejected requests due to memory exhaustion. + /// Total number of requests rejected due to memory exhaustion. pub static ref REQUEST_MEMORY_REJECTED: IntCounterVec = register_int_counter_vec!( "greptime_servers_request_memory_rejected_total", "number of requests rejected due to memory limit", diff --git a/src/servers/src/request_memory_metrics.rs b/src/servers/src/request_memory_metrics.rs index 4298830f18..68f52816f4 100644 --- a/src/servers/src/request_memory_metrics.rs +++ b/src/servers/src/request_memory_metrics.rs @@ -34,7 +34,7 @@ impl MemoryMetrics for RequestMemoryMetrics { REQUEST_MEMORY_IN_USE.set(bytes); } - fn inc_rejected(&self, reason: &str) { + fn inc_exhausted(&self, reason: &str) { REQUEST_MEMORY_REJECTED.with_label_values(&[reason]).inc(); } } From 92e2d71f484771e34f02bb5a957131fb52cccbc1 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Mon, 30 Mar 2026 12:24:20 +0800 Subject: [PATCH 052/195] feat: implement prefilter framework and primary key prefilter (#7862) * feat: prefilter basic framework Signed-off-by: evenyag * refactor: move arguments to RowGroupBuildContext Signed-off-by: evenyag * refactor: skip prefiltered exprs in FlatPruneReader Signed-off-by: evenyag * refactor: remove unused functions Signed-off-by: evenyag * chore: update comment Signed-off-by: evenyag * feat: handle partition columns in prefilter Signed-off-by: evenyag * chore: fix clippy Signed-off-by: evenyag * fix: apply prefiltered selection by and_then Signed-off-by: evenyag * chore: fix clippy Signed-off-by: evenyag * fix: handle last row cache Signed-off-by: evenyag * fix: don't ignore error in 
PrimaryKeyFilter Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/common/recordbatch/src/filter.rs | 2 +- .../benches/bench_primary_key_filter.rs | 8 +- src/mito-codec/src/error.rs | 9 + src/mito-codec/src/primary_key_filter.rs | 168 ++++-- src/mito-codec/src/row_converter.rs | 3 +- src/mito-codec/src/row_converter/dense.rs | 2 + src/mito-codec/src/row_converter/sparse.rs | 2 + src/mito2/src/engine/row_selector_test.rs | 90 +++ src/mito2/src/memtable/bulk/part_reader.rs | 1 + .../src/memtable/partition_tree/partition.rs | 3 +- .../src/memtable/partition_tree/shard.rs | 5 +- .../memtable/partition_tree/shard_builder.rs | 5 +- src/mito2/src/sst/parquet.rs | 161 ++++++ src/mito2/src/sst/parquet/file_range.rs | 130 ++++- src/mito2/src/sst/parquet/flat_format.rs | 7 + src/mito2/src/sst/parquet/prefilter.rs | 541 +++++++----------- src/mito2/src/sst/parquet/reader.rs | 232 +++++++- src/mito2/src/sst/parquet/row_group.rs | 22 +- src/mito2/src/sst/parquet/row_selection.rs | 84 ++- 19 files changed, 1061 insertions(+), 414 deletions(-) diff --git a/src/common/recordbatch/src/filter.rs b/src/common/recordbatch/src/filter.rs index d7c522e656..9f1b596a49 100644 --- a/src/common/recordbatch/src/filter.rs +++ b/src/common/recordbatch/src/filter.rs @@ -48,7 +48,7 @@ use crate::error::{ArrowComputeSnafu, Result, ToArrowScalarSnafu, UnsupportedOpe /// /// This struct contains normalized predicate expr. In the form of /// `col` `op` `literal` where the `col` is provided from input. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct SimpleFilterEvaluator { /// Name of the referenced column. 
column_name: String, diff --git a/src/mito-codec/benches/bench_primary_key_filter.rs b/src/mito-codec/benches/bench_primary_key_filter.rs index 47158f87f8..2cc35abb22 100644 --- a/src/mito-codec/benches/bench_primary_key_filter.rs +++ b/src/mito-codec/benches/bench_primary_key_filter.rs @@ -246,18 +246,18 @@ fn bench_primary_key_filter(c: &mut Criterion) { let dense_pk = encode_dense_pk(&metadata, &row); let dense_codec = DensePrimaryKeyCodec::new(&metadata); - let mut dense_fast = dense_codec.primary_key_filter(&metadata, filters.clone()); + let mut dense_fast = dense_codec.primary_key_filter(&metadata, filters.clone(), false); let mut dense_offsets = Vec::new(); let sparse_pk = encode_sparse_pk(&metadata, &row); let sparse_codec = SparsePrimaryKeyCodec::new(&metadata); - let mut sparse_fast = sparse_codec.primary_key_filter(&metadata, filters.clone()); + let mut sparse_fast = sparse_codec.primary_key_filter(&metadata, filters.clone(), false); let mut sparse_offsets = std::collections::HashMap::new(); let mut group = c.benchmark_group(format!("primary_key_filter/{case_name}")); group.bench_function("dense/fast", |b| { - b.iter(|| black_box(dense_fast.matches(black_box(&dense_pk)))) + b.iter(|| black_box(dense_fast.matches(black_box(&dense_pk)).unwrap())) }); group.bench_function("dense/scalar", |b| { b.iter(|| { @@ -272,7 +272,7 @@ fn bench_primary_key_filter(c: &mut Criterion) { }); group.bench_function("sparse/fast", |b| { - b.iter(|| black_box(sparse_fast.matches(black_box(&sparse_pk)))) + b.iter(|| black_box(sparse_fast.matches(black_box(&sparse_pk)).unwrap())) }); group.bench_function("sparse/scalar", |b| { b.iter(|| { diff --git a/src/mito-codec/src/error.rs b/src/mito-codec/src/error.rs index 1be0074b1e..78a656415a 100644 --- a/src/mito-codec/src/error.rs +++ b/src/mito-codec/src/error.rs @@ -72,6 +72,14 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to evaluate filter"))] + EvaluateFilter { + 
#[snafu(source(from(common_recordbatch::error::Error, Box::new)))] + source: Box, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -86,6 +94,7 @@ impl ErrorExt for Error { StatusCode::InvalidArguments } NotSupportedField { .. } | UnsupportedOperation { .. } => StatusCode::Unsupported, + EvaluateFilter { source, .. } => source.status_code(), } } diff --git a/src/mito-codec/src/primary_key_filter.rs b/src/mito-codec/src/primary_key_filter.rs index 189c7a08cd..70fda7bf54 100644 --- a/src/mito-codec/src/primary_key_filter.rs +++ b/src/mito-codec/src/primary_key_filter.rs @@ -20,11 +20,12 @@ use common_recordbatch::filter::SimpleFilterEvaluator; use datatypes::data_type::ConcreteDataType; use datatypes::value::Value; use memcomparable::Serializer; +use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; use store_api::metric_engine_consts::DATA_SCHEMA_TABLE_ID_COLUMN_NAME; use store_api::storage::ColumnId; -use crate::error::Result; +use crate::error::{EvaluateFilterSnafu, Result}; use crate::row_converter::{ DensePrimaryKeyCodec, PrimaryKeyFilter, SortField, SparsePrimaryKeyCodec, }; @@ -41,8 +42,12 @@ struct PrimaryKeyFilterInner { } impl PrimaryKeyFilterInner { - fn new(metadata: RegionMetadataRef, filters: Arc>) -> Self { - let compiled_filters = Self::compile_filters(&metadata, &filters); + fn new( + metadata: RegionMetadataRef, + filters: Arc>, + skip_partition_column: bool, + ) -> Self { + let compiled_filters = Self::compile_filters(&metadata, &filters, skip_partition_column); Self { filters, compiled_filters, @@ -52,6 +57,7 @@ impl PrimaryKeyFilterInner { fn compile_filters( metadata: &RegionMetadataRef, filters: &[SimpleFilterEvaluator], + skip_partition_column: bool, ) -> Vec { if filters.is_empty() || metadata.primary_key.is_empty() { return Vec::new(); @@ -59,7 +65,7 @@ impl PrimaryKeyFilterInner { let mut compiled_filters = Vec::with_capacity(filters.len()); for (filter_idx, filter) in 
filters.iter().enumerate() { - if is_partition_column(filter.column_name()) { + if skip_partition_column && is_partition_column(filter.column_name()) { continue; } @@ -91,43 +97,36 @@ impl PrimaryKeyFilterInner { compiled_filters } - fn evaluate_filters<'a>(&self, accessor: &mut impl PrimaryKeyValueAccessor<'a>) -> bool { + fn evaluate_filters<'a>( + &self, + accessor: &mut impl PrimaryKeyValueAccessor<'a>, + ) -> Result { if self.compiled_filters.is_empty() { - return true; + return Ok(true); } for compiled in &self.compiled_filters { let filter = &self.filters[compiled.filter_idx]; let passed = if let Some(fast_path) = &compiled.fast_path { - let encoded_value = match accessor.encoded_value(compiled) { - Ok(v) => v, - Err(e) => { - common_telemetry::error!(e; "Failed to decode primary key"); - return true; - } - }; + let encoded_value = accessor.encoded_value(compiled)?; fast_path.matches(encoded_value) } else { - let value = match accessor.decode_value(compiled) { - Ok(v) => v, - Err(e) => { - common_telemetry::error!(e; "Failed to decode primary key"); - return true; - } - }; + let value = accessor.decode_value(compiled)?; // Safety: arrow schema and datatypes are constructed from the same source. let scalar_value = value.try_to_scalar_value(&compiled.data_type).unwrap(); - filter.evaluate_scalar(&scalar_value).unwrap_or(true) + filter + .evaluate_scalar(&scalar_value) + .context(EvaluateFilterSnafu)? 
}; if !passed { - return false; + return Ok(false); } } - true + Ok(true) } } @@ -258,9 +257,10 @@ impl DensePrimaryKeyFilter { metadata: RegionMetadataRef, filters: Arc>, codec: DensePrimaryKeyCodec, + skip_partition_column: bool, ) -> Self { Self { - inner: PrimaryKeyFilterInner::new(metadata, filters), + inner: PrimaryKeyFilterInner::new(metadata, filters, skip_partition_column), codec, offsets_buf: Vec::new(), } @@ -268,7 +268,7 @@ impl DensePrimaryKeyFilter { } impl PrimaryKeyFilter for DensePrimaryKeyFilter { - fn matches(&mut self, pk: &[u8]) -> bool { + fn matches(&mut self, pk: &[u8]) -> Result { self.offsets_buf.clear(); let mut accessor = DensePrimaryKeyValueAccessor { pk, @@ -311,9 +311,10 @@ impl SparsePrimaryKeyFilter { metadata: RegionMetadataRef, filters: Arc>, codec: SparsePrimaryKeyCodec, + skip_partition_column: bool, ) -> Self { Self { - inner: PrimaryKeyFilterInner::new(metadata, filters), + inner: PrimaryKeyFilterInner::new(metadata, filters, skip_partition_column), codec, offsets_map: HashMap::new(), } @@ -321,7 +322,7 @@ impl SparsePrimaryKeyFilter { } impl PrimaryKeyFilter for SparsePrimaryKeyFilter { - fn matches(&mut self, pk: &[u8]) -> bool { + fn matches(&mut self, pk: &[u8]) -> Result { self.offsets_map.clear(); let mut accessor = SparsePrimaryKeyValueAccessor { pk, @@ -369,6 +370,7 @@ mod tests { use datatypes::schema::ColumnSchema; use datatypes::value::{OrderedFloat, ValueRef}; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; + use store_api::metric_engine_consts::DATA_SCHEMA_TABLE_ID_COLUMN_NAME; use store_api::storage::{ColumnId, RegionId}; use super::*; @@ -423,6 +425,36 @@ mod tests { Arc::new(metadata) } + fn setup_partitioned_metadata() -> RegionMetadataRef { + let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, + ConcreteDataType::uint32_datatype(), + false, + ), + 
semantic_type: SemanticType::Tag, + column_id: 10, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("tag", ConcreteDataType::string_datatype(), true), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + greptime_timestamp(), + ConcreteDataType::timestamp_nanosecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }) + .primary_key(vec![10, 1]); + Arc::new(builder.build().unwrap()) + } + fn create_test_row() -> Vec<(ColumnId, ValueRef<'static>)> { vec![ (1, ValueRef::String("greptime-frontend-6989d9899-22222")), @@ -479,8 +511,8 @@ mod tests { )]); let pk = encode_sparse_pk(&metadata, create_test_row()); let codec = SparsePrimaryKeyCodec::new(&metadata); - let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec); - assert!(filter.matches(&pk)); + let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec, false); + assert!(filter.matches(&pk).unwrap()); } #[test] @@ -492,8 +524,8 @@ mod tests { )]); let pk = encode_sparse_pk(&metadata, create_test_row()); let codec = SparsePrimaryKeyCodec::new(&metadata); - let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec); - assert!(!filter.matches(&pk)); + let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec, false); + assert!(!filter.matches(&pk).unwrap()); } #[test] @@ -505,8 +537,8 @@ mod tests { )]); let pk = encode_sparse_pk(&metadata, create_test_row()); let codec = SparsePrimaryKeyCodec::new(&metadata); - let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec); - assert!(filter.matches(&pk)); + let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec, false); + assert!(filter.matches(&pk).unwrap()); } #[test] @@ -518,8 +550,8 @@ mod tests { )]); let pk = encode_dense_pk(&metadata, create_test_row()); let codec = DensePrimaryKeyCodec::new(&metadata); - let mut filter = 
DensePrimaryKeyFilter::new(metadata, filters, codec); - assert!(filter.matches(&pk)); + let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false); + assert!(filter.matches(&pk).unwrap()); } #[test] @@ -531,8 +563,8 @@ mod tests { )]); let pk = encode_dense_pk(&metadata, create_test_row()); let codec = DensePrimaryKeyCodec::new(&metadata); - let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec); - assert!(!filter.matches(&pk)); + let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false); + assert!(!filter.matches(&pk).unwrap()); } #[test] @@ -544,8 +576,8 @@ mod tests { )]); let pk = encode_dense_pk(&metadata, create_test_row()); let codec = DensePrimaryKeyCodec::new(&metadata); - let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec); - assert!(filter.matches(&pk)); + let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false); + assert!(filter.matches(&pk).unwrap()); } #[test] @@ -563,8 +595,9 @@ mod tests { for (op, value, expected) in cases { let filters = Arc::new(vec![create_filter_with_op("pod", op, value)]); - let mut filter = DensePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone()); - assert_eq!(expected, filter.matches(&pk)); + let mut filter = + DensePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone(), false); + assert_eq!(expected, filter.matches(&pk).unwrap()); } } @@ -583,8 +616,9 @@ mod tests { for (op, value, expected) in cases { let filters = Arc::new(vec![create_filter_with_op("pod", op, value)]); - let mut filter = SparsePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone()); - assert_eq!(expected, filter.matches(&pk)); + let mut filter = + SparsePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone(), false); + assert_eq!(expected, filter.matches(&pk).unwrap()); } } @@ -616,8 +650,52 @@ mod tests { .unwrap(); let filters = Arc::new(vec![create_filter_with_op("f", Operator::Eq, 0.0_f64)]); - let mut filter = 
DensePrimaryKeyFilter::new(metadata, filters, codec); + let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false); - assert!(filter.matches(&pk)); + assert!(filter.matches(&pk).unwrap()); + } + + #[test] + fn test_dense_primary_key_filter_matches_partition_column_by_default() { + let metadata = setup_partitioned_metadata(); + let codec = DensePrimaryKeyCodec::new(&metadata); + let mut pk = Vec::new(); + codec + .encode_to_vec( + [ValueRef::UInt32(42), ValueRef::String("host-a")].into_iter(), + &mut pk, + ) + .unwrap(); + + let filters = Arc::new(vec![create_filter_with_op( + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, + Operator::Eq, + 42_u32, + )]); + let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false); + + assert!(filter.matches(&pk).unwrap()); + } + + #[test] + fn test_dense_primary_key_filter_can_skip_partition_column() { + let metadata = setup_partitioned_metadata(); + let codec = DensePrimaryKeyCodec::new(&metadata); + let mut pk = Vec::new(); + codec + .encode_to_vec( + [ValueRef::UInt32(42), ValueRef::String("host-a")].into_iter(), + &mut pk, + ) + .unwrap(); + + let filters = Arc::new(vec![create_filter_with_op( + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, + Operator::Eq, + 7_u32, + )]); + let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, true); + + assert!(filter.matches(&pk).unwrap()); } } diff --git a/src/mito-codec/src/row_converter.rs b/src/mito-codec/src/row_converter.rs index 6fe33a9ee7..fa57e1d96e 100644 --- a/src/mito-codec/src/row_converter.rs +++ b/src/mito-codec/src/row_converter.rs @@ -53,7 +53,7 @@ pub trait PrimaryKeyCodecExt { pub trait PrimaryKeyFilter: Send + Sync { /// Returns true if the primary key matches the filter. - fn matches(&mut self, pk: &[u8]) -> bool; + fn matches(&mut self, pk: &[u8]) -> Result; } /// Composite values decoded from primary key bytes. 
@@ -120,6 +120,7 @@ pub trait PrimaryKeyCodec: Send + Sync + Debug { &self, metadata: &RegionMetadataRef, filters: Arc>, + skip_partition_column: bool, ) -> Box; /// Returns the estimated size of the primary key. diff --git a/src/mito-codec/src/row_converter/dense.rs b/src/mito-codec/src/row_converter/dense.rs index 6cc70feaea..4bc774c941 100644 --- a/src/mito-codec/src/row_converter/dense.rs +++ b/src/mito-codec/src/row_converter/dense.rs @@ -556,11 +556,13 @@ impl PrimaryKeyCodec for DensePrimaryKeyCodec { &self, metadata: &RegionMetadataRef, filters: Arc>, + skip_partition_column: bool, ) -> Box { Box::new(DensePrimaryKeyFilter::new( metadata.clone(), filters, self.clone(), + skip_partition_column, )) } diff --git a/src/mito-codec/src/row_converter/sparse.rs b/src/mito-codec/src/row_converter/sparse.rs index 4638ddcefb..00ec51530c 100644 --- a/src/mito-codec/src/row_converter/sparse.rs +++ b/src/mito-codec/src/row_converter/sparse.rs @@ -357,11 +357,13 @@ impl PrimaryKeyCodec for SparsePrimaryKeyCodec { &self, metadata: &RegionMetadataRef, filters: Arc>, + skip_partition_column: bool, ) -> Box { Box::new(SparsePrimaryKeyFilter::new( metadata.clone(), filters, self.clone(), + skip_partition_column, )) } diff --git a/src/mito2/src/engine/row_selector_test.rs b/src/mito2/src/engine/row_selector_test.rs index d79152e57f..26d7327c2f 100644 --- a/src/mito2/src/engine/row_selector_test.rs +++ b/src/mito2/src/engine/row_selector_test.rs @@ -13,12 +13,15 @@ // limitations under the License. 
use api::v1::Rows; +use common_base::readable_size::ReadableSize; use common_recordbatch::RecordBatches; +use datafusion_expr::{col, lit}; use store_api::region_engine::RegionEngine; use store_api::region_request::RegionRequest; use store_api::storage::{RegionId, ScanRequest, TimeSeriesRowSelector}; use crate::config::MitoConfig; +use crate::engine::MitoEngine; use crate::test_util::batch_util::sort_batches_and_print; use crate::test_util::{ CreateRequestBuilder, TestEnv, build_rows_for_key, flush_region, put_rows, rows_schema, @@ -107,6 +110,27 @@ async fn test_last_row(append_mode: bool, flat_format: bool) { assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); } +async fn scan_last_row( + engine: &MitoEngine, + region_id: RegionId, + filters: Vec, +) -> String { + let scanner = engine + .scanner( + region_id, + ScanRequest { + filters, + series_row_selector: Some(TimeSeriesRowSelector::LastRow), + ..Default::default() + }, + ) + .await + .unwrap(); + let stream = scanner.scan().await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + sort_batches_and_print(&batches, &["tag_0", "ts"]) +} + #[tokio::test] async fn test_last_row_append_mode_disabled() { test_last_row(false, false).await; @@ -126,3 +150,69 @@ async fn test_last_row_flat_format_append_mode_disabled() { async fn test_last_row_flat_format_append_mode_enabled() { test_last_row(true, true).await; } + +#[tokio::test] +async fn test_last_row_flat_format_prefilter_does_not_poison_selector_cache() { + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + selector_result_cache_size: ReadableSize::mb(1), + ..Default::default() + }) + .await; + let region_id = RegionId::new(1, 1); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new() + 
.insert_option("sst_format", "flat") + .build(); + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let rows = Rows { + schema: column_schemas, + rows: [ + build_rows_for_key("a", 0, 3, 0), + build_rows_for_key("b", 0, 3, 10), + ] + .concat(), + }; + put_rows(&engine, region_id, rows).await; + flush_region(&engine, region_id, Some(16)).await; + + let filtered = scan_last_row(&engine, region_id, vec![col("tag_0").eq(lit("a"))]).await; + assert_eq!( + "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| a | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+", + filtered + ); + + let unfiltered = scan_last_row(&engine, region_id, vec![]).await; + assert_eq!( + "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| a | 2.0 | 1970-01-01T00:00:02 | +| b | 12.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+", + unfiltered + ); +} diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs index edb9ff52d9..1375e79542 100644 --- a/src/mito2/src/memtable/bulk/part_reader.rs +++ b/src/mito2/src/memtable/bulk/part_reader.rs @@ -367,6 +367,7 @@ fn apply_combined_filters( let predicate_mask = context.base.compute_filter_mask_flat( &record_batch, skip_fields, + false, &mut tag_decode_state, )?; // If predicate filters out the entire batch, return None early diff --git a/src/mito2/src/memtable/partition_tree/partition.rs b/src/mito2/src/memtable/partition_tree/partition.rs index e6e3b8bf81..0ffbce4867 100644 --- a/src/mito2/src/memtable/partition_tree/partition.rs +++ b/src/mito2/src/memtable/partition_tree/partition.rs @@ -152,7 +152,8 @@ impl Partition { filters: &Arc>, ) -> Option> { if need_prune_key { - let filter = row_codec.primary_key_filter(metadata, filters.clone()); + // 
TODO(yingwen): Remove `skip_partition_column` after dropping PartitionTreeMemtable. + let filter = row_codec.primary_key_filter(metadata, filters.clone(), true); Some(filter) } else { None diff --git a/src/mito2/src/memtable/partition_tree/shard.rs b/src/mito2/src/memtable/partition_tree/shard.rs index 162f937a7c..c5dc25f573 100644 --- a/src/mito2/src/memtable/partition_tree/shard.rs +++ b/src/mito2/src/memtable/partition_tree/shard.rs @@ -19,9 +19,10 @@ use std::time::{Duration, Instant}; use mito_codec::key_values::KeyValue; use mito_codec::row_converter::PrimaryKeyFilter; +use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; -use crate::error::Result; +use crate::error::{DecodeSnafu, Result}; use crate::memtable::partition_tree::data::{ DATA_INIT_CAP, DataBatch, DataParts, DataPartsReader, DataPartsReaderBuilder, }; @@ -243,7 +244,7 @@ impl ShardReader { // Safety: `key_filter` is some so the shard has primary keys. let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index); let now = Instant::now(); - if key_filter.matches(key) { + if key_filter.matches(key).context(DecodeSnafu)? 
{ self.prune_pk_cost += now.elapsed(); self.last_yield_pk_index = Some(pk_index); self.keys_after_pruning += 1; diff --git a/src/mito2/src/memtable/partition_tree/shard_builder.rs b/src/mito2/src/memtable/partition_tree/shard_builder.rs index 26de85767d..78eeb463c6 100644 --- a/src/mito2/src/memtable/partition_tree/shard_builder.rs +++ b/src/mito2/src/memtable/partition_tree/shard_builder.rs @@ -20,9 +20,10 @@ use std::time::{Duration, Instant}; use mito_codec::key_values::KeyValue; use mito_codec::row_converter::PrimaryKeyFilter; +use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; -use crate::error::Result; +use crate::error::{DecodeSnafu, Result}; use crate::memtable::partition_tree::data::{ DATA_INIT_CAP, DataBatch, DataBuffer, DataBufferReader, DataBufferReaderBuilder, DataParts, }; @@ -281,7 +282,7 @@ impl ShardBuilderReader { self.keys_before_pruning += 1; let key = self.dict_reader.key_by_pk_index(pk_index); let now = Instant::now(); - if key_filter.matches(key) { + if key_filter.matches(key).context(DecodeSnafu)? 
{ self.prune_pk_cost += now.elapsed(); self.last_yield_pk_index = Some(pk_index); self.keys_after_pruning += 1; diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 79a08a209d..4a3466a29c 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -142,6 +142,7 @@ mod tests { use crate::sst::index::{IndexBuildType, Indexer, IndexerBuilder, IndexerBuilderImpl}; use crate::sst::parquet::flat_format::FlatWriteFormat; use crate::sst::parquet::reader::{ParquetReader, ParquetReaderBuilder, ReaderMetrics}; + use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::writer::ParquetWriter; use crate::sst::{ DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, location, to_flat_sst_arrow_schema, @@ -1113,6 +1114,39 @@ mod tests { assert!(reader.next_record_batch().await.unwrap().is_none()); } + fn new_record_batch_from_rows(rows: &[(&str, &str, i64)]) -> RecordBatch { + let metadata = Arc::new(sst_region_metadata()); + let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); + + let mut tag_0_builder = StringDictionaryBuilder::::new(); + let mut tag_1_builder = StringDictionaryBuilder::::new(); + let mut pk_builder = BinaryDictionaryBuilder::::new(); + let mut field_values = Vec::with_capacity(rows.len()); + let mut timestamps = Vec::with_capacity(rows.len()); + + for (tag_0, tag_1, ts) in rows { + tag_0_builder.append_value(*tag_0); + tag_1_builder.append_value(*tag_1); + pk_builder.append(new_primary_key(&[tag_0, tag_1])).unwrap(); + field_values.push(*ts as u64); + timestamps.push(*ts); + } + + RecordBatch::try_new( + flat_schema, + vec![ + Arc::new(tag_0_builder.finish()) as ArrayRef, + Arc::new(tag_1_builder.finish()) as ArrayRef, + Arc::new(UInt64Array::from(field_values)) as ArrayRef, + Arc::new(TimestampMillisecondArray::from(timestamps)) as ArrayRef, + Arc::new(pk_builder.finish()) as ArrayRef, + Arc::new(UInt64Array::from_value(1000, rows.len())) as ArrayRef, + 
Arc::new(UInt8Array::from_value(OpType::Put as u8, rows.len())) as ArrayRef, + ], + ) + .unwrap() + } + /// Creates a flat format RecordBatch for testing with sparse primary key encoding. /// Similar to `new_record_batch_by_range` but without individual primary key columns. fn new_record_batch_by_range_sparse( @@ -1642,6 +1676,133 @@ mod tests { assert_eq!(metrics.filter_metrics.rows_bloom_filtered, 100); } + #[tokio::test] + async fn test_reader_prefilter_with_outer_selection_and_trailing_filtered_rows() { + let mut env = TestEnv::new().await; + let object_store = env.init_object_store_manager(); + let file_path = RegionFilePathFactory::new(FILE_DIR.to_string(), PathType::Bare); + let metadata = Arc::new(sst_region_metadata()); + let row_group_size = 10; + + let flat_source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 3), + new_record_batch_by_range(&["b", "d"], 3, 10), + ]); + let write_opts = WriteOptions { + row_group_size, + ..Default::default() + }; + let indexer_builder = create_test_indexer_builder( + &env, + object_store.clone(), + file_path.clone(), + metadata.clone(), + row_group_size, + ); + let info = write_flat_sst( + object_store.clone(), + metadata.clone(), + indexer_builder, + file_path, + flat_source, + &write_opts, + ) + .await; + let handle = create_file_handle_from_sst_info(&info, &metadata); + + let builder = + ParquetReaderBuilder::new(FILE_DIR.to_string(), PathType::Bare, handle, object_store) + .flat_format(true) + .predicate(Some(Predicate::new(vec![col("tag_0").eq(lit("a"))]))); + + let mut metrics = ReaderMetrics::default(); + let (context, _) = builder + .build_reader_input(&mut metrics) + .await + .unwrap() + .unwrap(); + let selection = RowGroupSelection::from_row_ranges( + vec![(0, std::iter::once(0..6).collect())], + row_group_size, + ); + + let mut reader = ParquetReader::new(Arc::new(context), selection) + .await + .unwrap(); + check_record_batch_reader_result( + &mut reader, + 
&[new_record_batch_by_range(&["a", "d"], 0, 3)], + ) + .await; + } + + #[tokio::test] + async fn test_reader_prefilter_with_outer_selection_disjoint_matches_and_trailing_gap() { + let mut env = TestEnv::new().await; + let object_store = env.init_object_store_manager(); + let file_path = RegionFilePathFactory::new(FILE_DIR.to_string(), PathType::Bare); + let metadata = Arc::new(sst_region_metadata()); + let row_group_size = 8; + + let flat_source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 2), + new_record_batch_by_range(&["b", "d"], 2, 4), + new_record_batch_by_range(&["a", "d"], 4, 6), + new_record_batch_by_range(&["c", "d"], 6, 8), + ]); + let write_opts = WriteOptions { + row_group_size, + ..Default::default() + }; + let indexer_builder = create_test_indexer_builder( + &env, + object_store.clone(), + file_path.clone(), + metadata.clone(), + row_group_size, + ); + let info = write_flat_sst( + object_store.clone(), + metadata.clone(), + indexer_builder, + file_path, + flat_source, + &write_opts, + ) + .await; + let handle = create_file_handle_from_sst_info(&info, &metadata); + + let builder = + ParquetReaderBuilder::new(FILE_DIR.to_string(), PathType::Bare, handle, object_store) + .flat_format(true) + .predicate(Some(Predicate::new(vec![col("tag_0").eq(lit("a"))]))); + + let mut metrics = ReaderMetrics::default(); + let (context, _) = builder + .build_reader_input(&mut metrics) + .await + .unwrap() + .unwrap(); + let selection = RowGroupSelection::from_row_ranges( + vec![(0, std::iter::once(0..8).collect())], + row_group_size, + ); + + let mut reader = ParquetReader::new(Arc::new(context), selection) + .await + .unwrap(); + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_from_rows(&[ + ("a", "d", 0), + ("a", "d", 1), + ("a", "d", 4), + ("a", "d", 5), + ])], + ) + .await; + } + #[tokio::test] async fn test_write_flat_read_with_inverted_index_sparse() { common_telemetry::init_default_ut_logging(); 
diff --git a/src/mito2/src/sst/parquet/file_range.rs b/src/mito2/src/sst/parquet/file_range.rs index 3a5251cb1a..8b4a61acb7 100644 --- a/src/mito2/src/sst/parquet/file_range.rs +++ b/src/mito2/src/sst/parquet/file_range.rs @@ -37,6 +37,7 @@ use store_api::metadata::RegionMetadataRef; use store_api::storage::{ColumnId, TimeSeriesRowSelector}; use table::predicate::Predicate; +use crate::cache::CacheStrategy; use crate::error::{ ComputeArrowSnafu, DataTypeMismatchSnafu, DecodeSnafu, DecodeStatsSnafu, EvalPartitionFilterSnafu, NewRecordBatchSnafu, RecordBatchSnafu, Result, StatsNotPresentSnafu, @@ -53,7 +54,8 @@ use crate::sst::parquet::flat_format::{ }; use crate::sst::parquet::format::ReadFormat; use crate::sst::parquet::reader::{ - FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext, + FlatRowGroupReader, MaybeFilter, RowGroupBuildContext, RowGroupReader, RowGroupReaderBuilder, + SimpleFilterContext, }; use crate::sst::parquet::row_group::ParquetFetchMetrics; use crate::sst::parquet::stats::RowGroupPruningStats; @@ -181,14 +183,17 @@ impl FileRange { if !self.in_dynamic_filter_range() { return Ok(None); } + // Compute skip_fields once for this row group + let skip_fields = self.context.should_skip_fields(self.row_group_idx); let parquet_reader = self .context .reader_builder - .build( + .build(self.context.build_context( self.row_group_idx, self.row_selection.clone(), fetch_metrics, - ) + skip_fields, + )) .await?; let use_last_row_reader = if selector @@ -210,9 +215,6 @@ impl FileRange { false }; - // Compute skip_fields once for this row group - let skip_fields = self.context.should_skip_fields(self.row_group_idx); - let prune_reader = if use_last_row_reader { // Row group is PUT only, use LastRowReader to skip unnecessary rows. 
let reader = RowGroupLastRowCachedReader::new( @@ -243,14 +245,17 @@ impl FileRange { if !self.in_dynamic_filter_range() { return Ok(None); } + // Compute skip_fields once for this row group + let skip_fields = self.context.should_skip_fields(self.row_group_idx); let parquet_reader = self .context .reader_builder - .build( + .build(self.context.build_context( self.row_group_idx, self.row_selection.clone(), fetch_metrics, - ) + skip_fields, + )) .await?; let use_last_row_reader = if selector @@ -271,16 +276,20 @@ impl FileRange { false }; - // Compute skip_fields once for this row group - let skip_fields = self.context.should_skip_fields(self.row_group_idx); - let flat_prune_reader = if use_last_row_reader { let flat_row_group_reader = FlatRowGroupReader::new(self.context.clone(), parquet_reader); + // Flat PK prefilter makes the input stream predicate-dependent, so cached + // selector results are not reusable across queries with different filters. + let cache_strategy = if self.context.reader_builder.has_flat_primary_key_prefilter() { + CacheStrategy::Disabled + } else { + self.context.reader_builder.cache_strategy().clone() + }; let reader = FlatRowGroupLastRowCachedReader::new( self.file_handle().file_id().file_id(), self.row_group_idx, - self.context.reader_builder.cache_strategy().clone(), + cache_strategy, self.context.read_format().projection_indices(), flat_row_group_reader, ); @@ -387,7 +396,11 @@ impl FileRangeContext { input: RecordBatch, skip_fields: bool, ) -> Result> { - self.base.precise_filter_flat(input, skip_fields) + self.base.precise_filter_flat( + input, + skip_fields, + self.reader_builder.has_flat_primary_key_prefilter(), + ) } /// Determines whether to skip field filters based on PreFilterMode and row group delete status. @@ -408,6 +421,23 @@ impl FileRangeContext { row_group_contains_delete(metadata, row_group_index, self.reader_builder.file_path()) } + /// Creates a [RowGroupBuildContext] for building row group readers with prefiltering. 
+ pub(crate) fn build_context<'a>( + &'a self, + row_group_idx: usize, + row_selection: Option, + fetch_metrics: Option<&'a ParquetFetchMetrics>, + skip_fields: bool, + ) -> RowGroupBuildContext<'a> { + RowGroupBuildContext { + filters: &self.base.filters, + skip_fields, + row_group_idx, + row_selection, + fetch_metrics, + } + } + /// Returns the estimated memory size of this context. /// Mainly accounts for the parquet metadata size. pub(crate) fn memory_size(&self) -> usize { @@ -600,9 +630,15 @@ impl RangeBase { &self, input: RecordBatch, skip_fields: bool, + skip_prefiltered_pk_filters: bool, ) -> Result> { let mut tag_decode_state = TagDecodeState::new(); - let mask = self.compute_filter_mask_flat(&input, skip_fields, &mut tag_decode_state)?; + let mask = self.compute_filter_mask_flat( + &input, + skip_fields, + skip_prefiltered_pk_filters, + &mut tag_decode_state, + )?; // If mask is None, the entire batch is filtered out let Some(mut mask) = mask else { @@ -647,6 +683,7 @@ impl RangeBase { &self, input: &RecordBatch, skip_fields: bool, + skip_prefiltered_pk_filters: bool, tag_decode_state: &mut TagDecodeState, ) -> Result> { let mut mask = BooleanBuffer::new_set(input.num_rows()); @@ -674,6 +711,12 @@ impl RangeBase { continue; } + // Flat parquet PK prefiltering already applied these tag predicates while refining + // row selection, so skip them here to avoid decoding/evaluating the same condition twice. + if skip_prefiltered_pk_filters && filter_ctx.usable_primary_key_filter() { + continue; + } + // Get the column directly by its projected index. // If the column is missing and it's not a tag/time column, this filter is skipped. // Assumes the projection indices align with the input batch schema. 
@@ -926,3 +969,62 @@ impl RangeBase { RecordBatch::try_new(arrow_schema.clone(), columns).context(NewRecordBatchSnafu) } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion_expr::{col, lit}; + + use super::*; + use crate::sst::parquet::format::ReadFormat; + use crate::test_util::sst_util::{new_record_batch_with_custom_sequence, sst_region_metadata}; + + fn new_test_range_base(filters: Vec) -> RangeBase { + let metadata: RegionMetadataRef = Arc::new(sst_region_metadata()); + let read_format = ReadFormat::new_flat( + metadata.clone(), + metadata.column_metadatas.iter().map(|c| c.column_id), + None, + "test", + true, + ) + .unwrap(); + + RangeBase { + filters, + dyn_filters: vec![], + read_format, + expected_metadata: None, + prune_schema: metadata.schema.clone(), + codec: mito_codec::row_converter::build_primary_key_codec(metadata.as_ref()), + compat_batch: None, + compaction_projection_mapper: None, + pre_filter_mode: PreFilterMode::All, + partition_filter: None, + } + } + + #[test] + fn test_compute_filter_mask_flat_skips_prefiltered_pk_filters() { + let metadata: RegionMetadataRef = Arc::new(sst_region_metadata()); + let filters = vec![ + SimpleFilterContext::new_opt(&metadata, None, &col("tag_0").eq(lit("a"))).unwrap(), + SimpleFilterContext::new_opt(&metadata, None, &col("field_0").gt(lit(1_u64))).unwrap(), + ]; + let base = new_test_range_base(filters); + let batch = new_record_batch_with_custom_sequence(&["b", "x"], 0, 4, 1); + + let mask_without_skip = base + .compute_filter_mask_flat(&batch, false, false, &mut TagDecodeState::new()) + .unwrap() + .unwrap(); + assert_eq!(mask_without_skip.count_set_bits(), 0); + + let mask_with_skip = base + .compute_filter_mask_flat(&batch, false, true, &mut TagDecodeState::new()) + .unwrap() + .unwrap(); + assert_eq!(mask_with_skip.count_set_bits(), 2); + } +} diff --git a/src/mito2/src/sst/parquet/flat_format.rs b/src/mito2/src/sst/parquet/flat_format.rs index 8a59e9a97d..ca39cac7e1 100644 --- 
a/src/mito2/src/sst/parquet/flat_format.rs +++ b/src/mito2/src/sst/parquet/flat_format.rs @@ -282,6 +282,13 @@ impl FlatReadFormat { } } + /// Returns `true` if raw batches from parquet use the flat layout with a + /// dictionary-encoded `__primary_key` column (i.e., [`ParquetAdapter::Flat`]). + /// Returns `false` for the legacy primary-key-to-flat conversion path. + pub(crate) fn raw_batch_has_primary_key_dictionary(&self) -> bool { + matches!(&self.parquet_adapter, ParquetAdapter::Flat(_)) + } + /// Creates a sequence array to override. pub(crate) fn new_override_sequence_array(&self, length: usize) -> Option { self.override_sequence diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs index 5de2e3512f..07efbd052f 100644 --- a/src/mito2/src/sst/parquet/prefilter.rs +++ b/src/mito2/src/sst/parquet/prefilter.rs @@ -12,31 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Helpers for parquet prefiltering. +//! Prefilter framework for parquet reader. +//! +//! Prefilter optimization reduces I/O by reading only a subset of columns first +//! (the prefilter phase), applying filters to compute a refined row selection, +//! then reading the remaining columns with the refined selection. 
use std::ops::Range; +use std::sync::Arc; use api::v1::SemanticType; use common_recordbatch::filter::SimpleFilterEvaluator; -use datatypes::arrow::array::{BinaryArray, BooleanArray}; +use datatypes::arrow::array::BinaryArray; use datatypes::arrow::record_batch::RecordBatch; -use mito_codec::primary_key_filter::is_partition_column; -use mito_codec::row_converter::PrimaryKeyFilter; +use futures::StreamExt; +use mito_codec::row_converter::{PrimaryKeyCodec, PrimaryKeyFilter}; +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::RowSelection; +use parquet::schema::types::SchemaDescriptor; use snafu::{OptionExt, ResultExt}; use store_api::metadata::{RegionMetadata, RegionMetadataRef}; -use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu}; +use crate::error::{DecodeSnafu, ReadParquetSnafu, Result, UnexpectedSnafu}; use crate::sst::parquet::flat_format::primary_key_column_index; -use crate::sst::parquet::format::PrimaryKeyArray; +use crate::sst::parquet::format::{PrimaryKeyArray, ReadFormat}; +use crate::sst::parquet::reader::{RowGroupBuildContext, RowGroupReaderBuilder}; +use crate::sst::parquet::row_selection::row_selection_from_row_ranges_exact; -#[cfg_attr(not(test), allow(dead_code))] pub(crate) fn matching_row_ranges_by_primary_key( input: &RecordBatch, + pk_column_index: usize, pk_filter: &mut dyn PrimaryKeyFilter, ) -> Result>> { - let primary_key_index = primary_key_column_index(input.num_columns()); let pk_dict_array = input - .column(primary_key_index) + .column(pk_column_index) .as_any() .downcast_ref::() .context(UnexpectedSnafu { @@ -65,7 +74,10 @@ pub(crate) fn matching_row_ranges_by_primary_key( end += 1; } - if pk_filter.matches(pk_values.value(key as usize)) { + if pk_filter + .matches(pk_values.value(key as usize)) + .context(DecodeSnafu)? 
+ { if let Some(last) = matched_row_ranges.last_mut() && last.end == start { @@ -81,68 +93,15 @@ pub(crate) fn matching_row_ranges_by_primary_key( Ok(matched_row_ranges) } -#[cfg_attr(not(test), allow(dead_code))] -pub(crate) fn prefilter_flat_batch_by_primary_key( - input: RecordBatch, - pk_filter: &mut dyn PrimaryKeyFilter, -) -> Result> { - if input.num_rows() == 0 { - return Ok(Some(input)); - } - - let matched_row_ranges = matching_row_ranges_by_primary_key(&input, pk_filter)?; - if matched_row_ranges.is_empty() { - return Ok(None); - } - - if matched_row_ranges.len() == 1 - && matched_row_ranges[0].start == 0 - && matched_row_ranges[0].end == input.num_rows() - { - return Ok(Some(input)); - } - - if matched_row_ranges.len() == 1 { - let span = &matched_row_ranges[0]; - return Ok(Some(input.slice(span.start, span.end - span.start))); - } - - let mut mask = vec![false; input.num_rows()]; - for span in matched_row_ranges { - mask[span].fill(true); - } - - let filtered = - datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask)) - .context(ComputeArrowSnafu)?; - if filtered.num_rows() == 0 { - Ok(None) - } else { - Ok(Some(filtered)) - } -} - -#[cfg_attr(not(test), allow(dead_code))] -pub(crate) fn retain_usable_primary_key_filters( - sst_metadata: &RegionMetadataRef, - expected_metadata: Option<&RegionMetadata>, - filters: &mut Vec, -) { - filters.retain(|filter| is_usable_primary_key_filter(sst_metadata, expected_metadata, filter)); -} - -#[cfg_attr(not(test), allow(dead_code))] +/// Returns whether a filter can be applied by parquet primary-key prefiltering. +/// +/// Unlike `PartitionTreeMemtable`, parquet prefilter always supports predicates +/// on the partition column. pub(crate) fn is_usable_primary_key_filter( sst_metadata: &RegionMetadataRef, expected_metadata: Option<&RegionMetadata>, filter: &SimpleFilterEvaluator, ) -> bool { - // TODO(yingwen): The primary key filter always skips the partition column. 
Consider using a flag - // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable. - if is_partition_column(filter.column_name()) { - return false; - } - let sst_column = match expected_metadata { Some(expected_metadata) => { let Some(expected_column) = expected_metadata.column_by_name(filter.column_name()) @@ -176,7 +135,6 @@ pub(crate) fn is_usable_primary_key_filter( .is_some() } -#[cfg_attr(not(test), allow(dead_code))] pub(crate) struct CachedPrimaryKeyFilter { inner: Box, last_primary_key: Vec, @@ -184,7 +142,6 @@ pub(crate) struct CachedPrimaryKeyFilter { } impl CachedPrimaryKeyFilter { - #[cfg_attr(not(test), allow(dead_code))] pub(crate) fn new(inner: Box) -> Self { Self { inner, @@ -195,49 +152,191 @@ impl CachedPrimaryKeyFilter { } impl PrimaryKeyFilter for CachedPrimaryKeyFilter { - fn matches(&mut self, pk: &[u8]) -> bool { + fn matches(&mut self, pk: &[u8]) -> mito_codec::error::Result { if let Some(last_match) = self.last_match && self.last_primary_key == pk { - return last_match; + return Ok(last_match); } - let matched = self.inner.matches(pk); + let matched = self.inner.matches(pk)?; self.last_primary_key.clear(); self.last_primary_key.extend_from_slice(pk); self.last_match = Some(matched); - matched + Ok(matched) } } -#[cfg_attr(not(test), allow(dead_code))] -pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result> { - let primary_key_index = primary_key_column_index(batch.num_columns()); - let pk_dict_array = batch - .column(primary_key_index) - .as_any() - .downcast_ref::() - .context(UnexpectedSnafu { - reason: "Primary key column is not a dictionary array", - })?; - let pk_values = pk_dict_array - .values() - .as_any() - .downcast_ref::() - .context(UnexpectedSnafu { - reason: "Primary key values are not binary array", - })?; - let keys = pk_dict_array.keys(); - if keys.is_empty() { - return Ok(None); +/// Context for prefiltering a row group. 
+/// +/// Currently supports primary key (PK) filtering only. +/// Will be extended with simple column filters and physical filters in the future. +pub(crate) struct PrefilterContext { + /// PK filter instance. + pk_filter: Box, + /// Projection mask for reading only the PK column. + pk_projection: ProjectionMask, + /// Index of the PK column within the prefilter projection batch. + /// This is 0 when we project only the PK column. + pk_column_index: usize, +} + +/// Pre-built state for constructing [PrefilterContext] per row group. +/// +/// Fields invariant across row groups (projection mask, codec, metadata, filters) +/// are computed once. A fresh [PrefilterContext] with its own mutable PK filter +/// is created via [PrefilterContextBuilder::build()] for each row group. +pub(crate) struct PrefilterContextBuilder { + pk_projection: ProjectionMask, + pk_column_index: usize, + codec: Arc, + metadata: RegionMetadataRef, + pk_filters: Arc>, +} + +impl PrefilterContextBuilder { + /// Creates a builder if prefiltering is applicable. + /// + /// Returns `None` if: + /// - No primary key filters are available + /// - The read format doesn't use flat layout with dictionary-encoded PKs + /// - The primary key is empty + pub(crate) fn new( + read_format: &ReadFormat, + codec: &Arc, + primary_key_filters: Option<&Arc>>, + parquet_schema: &SchemaDescriptor, + ) -> Option { + let pk_filters = primary_key_filters?; + if pk_filters.is_empty() { + return None; + } + + let metadata = read_format.metadata(); + if metadata.primary_key.is_empty() { + return None; + } + + // Only flat format with dictionary-encoded PKs supports PK prefiltering. + let flat_format = read_format.as_flat()?; + if !flat_format.raw_batch_has_primary_key_dictionary() { + return None; + } + + // Compute PK-only projection mask. 
+ let num_parquet_columns = parquet_schema.num_columns(); + let pk_index = primary_key_column_index(num_parquet_columns); + let pk_projection = ProjectionMask::roots(parquet_schema, [pk_index]); + + // The PK column is the only column in the projection, so its index is 0. + let pk_column_index = 0; + + Some(Self { + pk_projection, + pk_column_index, + codec: Arc::clone(codec), + metadata: metadata.clone(), + pk_filters: Arc::clone(pk_filters), + }) } - let first_key = keys.value(0); - if first_key != keys.value(keys.len() - 1) { - return Ok(None); + /// Builds a [PrefilterContext] for a specific row group. + pub(crate) fn build(&self) -> PrefilterContext { + // Parquet PK prefilter always supports the partition column. Only + // PartitionTreeMemtable skips it after partition pruning. + let pk_filter = + self.codec + .primary_key_filter(&self.metadata, Arc::clone(&self.pk_filters), false); + let pk_filter = Box::new(CachedPrimaryKeyFilter::new(pk_filter)); + PrefilterContext { + pk_filter, + pk_projection: self.pk_projection.clone(), + pk_column_index: self.pk_column_index, + } + } +} + +/// Result of prefiltering a row group. +pub(crate) struct PrefilterResult { + /// Refined row selection after prefiltering. + pub(crate) refined_selection: RowSelection, + /// Number of rows filtered out by prefiltering. + pub(crate) filtered_rows: usize, +} + +/// Executes prefiltering on a row group. +/// +/// Reads only the prefilter columns (currently the PK dictionary column), +/// applies filters, and returns a refined [RowSelection]. +pub(crate) async fn execute_prefilter( + prefilter_ctx: &mut PrefilterContext, + reader_builder: &RowGroupReaderBuilder, + build_ctx: &RowGroupBuildContext<'_>, +) -> Result { + // Reads PK column only. 
+ let mut pk_stream = reader_builder + .build_with_projection( + build_ctx.row_group_idx, + build_ctx.row_selection.clone(), + prefilter_ctx.pk_projection.clone(), + build_ctx.fetch_metrics, + ) + .await?; + + // Applies PK filter to each batch and collect matching row ranges. + let mut matched_row_ranges: Vec> = Vec::new(); + let mut row_offset = 0; + let mut rows_before_filter = 0usize; + + while let Some(batch_result) = pk_stream.next().await { + let batch = batch_result.context(ReadParquetSnafu { + path: reader_builder.file_path(), + })?; + let batch_num_rows = batch.num_rows(); + if batch_num_rows == 0 { + continue; + } + rows_before_filter += batch_num_rows; + + let ranges = matching_row_ranges_by_primary_key( + &batch, + prefilter_ctx.pk_column_index, + prefilter_ctx.pk_filter.as_mut(), + )?; + matched_row_ranges.extend( + ranges + .into_iter() + .map(|range| (range.start + row_offset)..(range.end + row_offset)), + ); + row_offset += batch_num_rows; } - Ok(Some(pk_values.value(first_key as usize))) + // Converts matched ranges to RowSelection. + let rows_selected: usize = matched_row_ranges.iter().map(|r| r.end - r.start).sum(); + let filtered_rows = rows_before_filter.saturating_sub(rows_selected); + + let refined_selection = if rows_selected == 0 { + RowSelection::from(vec![]) + } else { + // Build the prefilter selection relative to the yielded rows + // (not total_rows), since matched_row_ranges are offsets within + // the rows actually read from the stream. + let prefilter_selection = + row_selection_from_row_ranges_exact(matched_row_ranges.into_iter(), rows_before_filter); + + // Use and_then to apply prefilter selection within the context + // of the original selection, since prefilter offsets are relative + // to the original selection's selected rows. 
+ match &build_ctx.row_selection { + Some(original) => original.and_then(&prefilter_selection), + None => prefilter_selection, + } + }; + + Ok(PrefilterResult { + refined_selection, + filtered_rows, + }) } #[cfg(test)] @@ -245,175 +344,14 @@ mod tests { use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; - use api::v1::SemanticType; use common_recordbatch::filter::SimpleFilterEvaluator; use datafusion_expr::{col, lit}; - use datatypes::arrow::array::{ - ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array, - UInt64Array, - }; - use datatypes::arrow::datatypes::{Schema, UInt32Type}; - use datatypes::arrow::record_batch::RecordBatch; - use datatypes::prelude::ConcreteDataType; - use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec}; + use mito_codec::row_converter::PrimaryKeyFilter; use store_api::codec::PrimaryKeyEncoding; - use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; - use store_api::storage::ColumnSchema; use super::*; - use crate::sst::internal_fields; use crate::sst::parquet::format::ReadFormat; - use crate::test_util::sst_util::{ - new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding, - }; - - fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec { - exprs - .iter() - .filter_map(SimpleFilterEvaluator::try_new) - .collect() - } - - fn expected_metadata_with_reused_tag_name( - old_metadata: &RegionMetadata, - ) -> Arc { - let mut builder = RegionMetadataBuilder::new(old_metadata.region_id); - builder - .push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "tag_0".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - semantic_type: SemanticType::Tag, - column_id: 10, - }) - .push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "tag_1".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - semantic_type: SemanticType::Tag, - column_id: 1, - }) - 
.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "field_0".to_string(), - ConcreteDataType::uint64_datatype(), - true, - ), - semantic_type: SemanticType::Field, - column_id: 2, - }) - .push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "ts".to_string(), - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - semantic_type: SemanticType::Timestamp, - column_id: 3, - }) - .primary_key(vec![10, 1]); - - Arc::new(builder.build().unwrap()) - } - - fn new_raw_batch_with_metadata( - metadata: Arc, - primary_keys: &[&[u8]], - field_values: &[u64], - ) -> RecordBatch { - assert_eq!(primary_keys.len(), field_values.len()); - - let arrow_schema = metadata.schema.arrow_schema(); - let field_column = arrow_schema - .field(arrow_schema.index_of("field_0").unwrap()) - .clone(); - let time_index_column = arrow_schema - .field(arrow_schema.index_of("ts").unwrap()) - .clone(); - let mut fields = vec![field_column, time_index_column]; - fields.extend( - internal_fields() - .into_iter() - .map(|field| field.as_ref().clone()), - ); - let schema = Arc::new(Schema::new(fields)); - - let mut dict_values = Vec::new(); - let mut keys = Vec::with_capacity(primary_keys.len()); - for pk in primary_keys { - let key = dict_values - .iter() - .position(|existing: &&[u8]| existing == pk) - .unwrap_or_else(|| { - dict_values.push(*pk); - dict_values.len() - 1 - }); - keys.push(key as u32); - } - - let pk_array: ArrayRef = Arc::new(DictionaryArray::::new( - UInt32Array::from(keys), - Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())), - )); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(UInt64Array::from(field_values.to_vec())), - Arc::new(TimestampMillisecondArray::from_iter_values( - 0..primary_keys.len() as i64, - )), - pk_array, - Arc::new(UInt64Array::from(vec![1; primary_keys.len()])), - Arc::new(UInt8Array::from(vec![1; primary_keys.len()])), - ], - ) - .unwrap() - } - - fn 
new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch { - new_raw_batch_with_metadata(Arc::new(sst_region_metadata()), primary_keys, field_values) - } - - fn field_values(batch: &RecordBatch) -> Vec { - batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap() - .values() - .to_vec() - } - - #[test] - fn test_retain_usable_primary_key_filters_skips_non_tag_filters() { - let metadata = Arc::new(sst_region_metadata()); - let mut filters = - new_test_filters(&[col("field_0").eq(lit(1_u64)), col("ts").gt(lit(0_i64))]); - - retain_usable_primary_key_filters(&metadata, None, &mut filters); - - assert!(filters.is_empty()); - } - - #[test] - fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() { - let metadata = Arc::new(sst_region_metadata()); - let expected_metadata = expected_metadata_with_reused_tag_name(&metadata); - let mut filters = new_test_filters(&[col("tag_0").eq(lit("b"))]); - - retain_usable_primary_key_filters( - &metadata, - Some(expected_metadata.as_ref()), - &mut filters, - ); - - assert!(filters.is_empty()); - } + use crate::test_util::sst_util::{new_primary_key, sst_region_metadata_with_encoding}; #[test] fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() { @@ -435,52 +373,16 @@ mod tests { } #[test] - fn test_prefilter_primary_key_drops_single_dictionary_batch() { - let metadata = Arc::new(sst_region_metadata()); - let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))])); - let mut primary_key_filter = - build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters); - let pk_a = new_primary_key(&["a", "x"]); - let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); + fn test_is_usable_primary_key_filter_supports_partition_column_by_default() { + let metadata = Arc::new(sst_region_metadata_with_encoding( + PrimaryKeyEncoding::Sparse, + )); + let filter = SimpleFilterEvaluator::try_new( + 
&col(store_api::metric_engine_consts::DATA_SCHEMA_TABLE_ID_COLUMN_NAME).eq(lit(1_u32)), + ) + .unwrap(); - let filtered = - prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap(); - - assert!(filtered.is_none()); - } - - #[test] - fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() { - let metadata = Arc::new(sst_region_metadata()); - let filters = Arc::new(new_test_filters(&[col("tag_0") - .eq(lit("a")) - .or(col("tag_0").eq(lit("c")))])); - let mut primary_key_filter = - build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters); - let pk_a = new_primary_key(&["a", "x"]); - let pk_b = new_primary_key(&["b", "x"]); - let pk_c = new_primary_key(&["c", "x"]); - let pk_d = new_primary_key(&["d", "x"]); - let batch = new_raw_batch( - &[ - pk_a.as_slice(), - pk_a.as_slice(), - pk_b.as_slice(), - pk_b.as_slice(), - pk_c.as_slice(), - pk_c.as_slice(), - pk_d.as_slice(), - pk_d.as_slice(), - ], - &[10, 11, 12, 13, 14, 15, 16, 17], - ); - - let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()) - .unwrap() - .unwrap(); - - assert_eq!(filtered.num_rows(), 4); - assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]); + assert!(is_usable_primary_key_filter(&metadata, None, &filter)); } struct CountingPrimaryKeyFilter { @@ -489,9 +391,9 @@ mod tests { } impl PrimaryKeyFilter for CountingPrimaryKeyFilter { - fn matches(&mut self, pk: &[u8]) -> bool { + fn matches(&mut self, pk: &[u8]) -> mito_codec::error::Result { self.hits.fetch_add(1, Ordering::Relaxed); - pk == self.expected.as_slice() + Ok(pk == self.expected.as_slice()) } } @@ -504,25 +406,14 @@ mod tests { expected: expected.clone(), })); - assert!(filter.matches(expected.as_slice())); - assert!(filter.matches(expected.as_slice())); - assert!(!filter.matches(new_primary_key(&["b", "x"]).as_slice())); + assert!(filter.matches(expected.as_slice()).unwrap()); + assert!(filter.matches(expected.as_slice()).unwrap()); + assert!( 
+ !filter + .matches(new_primary_key(&["b", "x"]).as_slice()) + .unwrap() + ); assert_eq!(hits.load(Ordering::Relaxed), 2); } - - #[test] - fn test_batch_single_primary_key() { - let pk_a = new_primary_key(&["a", "x"]); - let pk_b = new_primary_key(&["b", "x"]); - - let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); - assert_eq!( - batch_single_primary_key(&batch).unwrap(), - Some(pk_a.as_slice()) - ); - - let batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]); - assert_eq!(batch_single_primary_key(&batch).unwrap(), None); - } } diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index f152c97075..8832cd4a16 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -76,6 +76,9 @@ use crate::sst::parquet::file_range::{ }; use crate::sst::parquet::format::{ReadFormat, need_override_sequence}; use crate::sst::parquet::metadata::MetadataLoader; +use crate::sst::parquet::prefilter::{ + PrefilterContextBuilder, execute_prefilter, is_usable_primary_key_filter, +}; use crate::sst::parquet::row_group::ParquetFetchMetrics; use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::stats::RowGroupPruningStats; @@ -459,16 +462,6 @@ impl ParquetReaderBuilder { ArrowReaderMetadata::try_new(parquet_meta.clone(), arrow_reader_options) .context(ReadDataPartSnafu)?; - let reader_builder = RowGroupReaderBuilder { - file_handle: self.file_handle.clone(), - file_path, - parquet_meta, - arrow_metadata, - object_store: self.object_store.clone(), - projection: projection_mask, - cache_strategy: self.cache_strategy.clone(), - }; - let filters = if let Some(predicate) = &self.predicate { predicate .exprs() @@ -493,6 +486,33 @@ impl ParquetReaderBuilder { let codec = build_primary_key_codec(read_format.metadata()); + // Extract primary key filters from precomputed filter contexts for prefiltering. 
+ let primary_key_filters = { + let pk_filters = filters + .iter() + .filter_map(SimpleFilterContext::primary_key_prefilter) + .collect::>(); + (!pk_filters.is_empty()).then_some(Arc::new(pk_filters)) + }; + + let prefilter_builder = PrefilterContextBuilder::new( + &read_format, + &codec, + primary_key_filters.as_ref(), + parquet_meta.file_metadata().schema_descr(), + ); + + let reader_builder = RowGroupReaderBuilder { + file_handle: self.file_handle.clone(), + file_path, + parquet_meta, + arrow_metadata, + object_store: self.object_store.clone(), + projection: projection_mask, + cache_strategy: self.cache_strategy.clone(), + prefilter_builder, + }; + let partition_filter = self.build_partition_filter(&read_format, &prune_schema)?; let context = FileRangeContext::new( @@ -1658,6 +1678,25 @@ pub(crate) struct RowGroupReaderBuilder { projection: ProjectionMask, /// Cache. cache_strategy: CacheStrategy, + /// Pre-built prefilter state. `None` if prefiltering is not applicable. + prefilter_builder: Option, +} + +/// Context passed to [RowGroupReaderBuilder::build()] carrying all information +/// needed for prefiltering decisions. +pub(crate) struct RowGroupBuildContext<'a> { + /// Simple filters pushed down. Used by prefilter on other columns. + #[allow(dead_code)] + pub(crate) filters: &'a [SimpleFilterContext], + /// Whether to skip field filters. Used by prefilter on other columns. + #[allow(dead_code)] + pub(crate) skip_fields: bool, + /// Index of the row group to read. + pub(crate) row_group_idx: usize, + /// Row selection for the row group. `None` means all rows. + pub(crate) row_selection: Option, + /// Metrics for tracking fetch operations. 
+ pub(crate) fetch_metrics: Option<&'a ParquetFetchMetrics>, } impl RowGroupReaderBuilder { @@ -1679,11 +1718,58 @@ impl RowGroupReaderBuilder { &self.cache_strategy } + pub(crate) fn has_flat_primary_key_prefilter(&self) -> bool { + self.prefilter_builder.is_some() + } + /// Builds a [ParquetRecordBatchStream] to read the row group at `row_group_idx`. + /// + /// If prefiltering is applicable (based on `build_ctx`), this performs a two-phase read: + /// 1. Reads only the prefilter columns (e.g. PK column), applies filters to get a refined row selection + /// 2. Reads the full projection with the refined row selection pub(crate) async fn build( + &self, + build_ctx: RowGroupBuildContext<'_>, + ) -> Result> { + let prefilter_ctx = self.prefilter_builder.as_ref().map(|b| b.build()); + + let Some(mut prefilter_ctx) = prefilter_ctx else { + // No prefilter applicable, build stream with full projection. + return self + .build_with_projection( + build_ctx.row_group_idx, + build_ctx.row_selection, + self.projection.clone(), + build_ctx.fetch_metrics, + ) + .await; + }; + + let prefilter_start = Instant::now(); + let prefilter_result = execute_prefilter(&mut prefilter_ctx, self, &build_ctx).await?; + if let Some(metrics) = build_ctx.fetch_metrics { + let mut data = metrics.data.lock().unwrap(); + data.prefilter_cost += prefilter_start.elapsed(); + data.prefilter_filtered_rows += prefilter_result.filtered_rows; + } + + let refined_selection = Some(prefilter_result.refined_selection); + + self.build_with_projection( + build_ctx.row_group_idx, + refined_selection, + self.projection.clone(), + build_ctx.fetch_metrics, + ) + .await + } + + /// Builds a [ParquetRecordBatchStream] with a custom projection mask. + pub(crate) async fn build_with_projection( &self, row_group_idx: usize, row_selection: Option, + projection: ProjectionMask, fetch_metrics: Option<&ParquetFetchMetrics>, ) -> Result> { // Create async file reader with caching support. 
@@ -1704,7 +1790,7 @@ impl RowGroupReaderBuilder { ); builder = builder .with_row_groups(vec![row_group_idx]) - .with_projection(self.projection.clone()) + .with_projection(projection) .with_batch_size(DEFAULT_READ_BATCH_SIZE); if let Some(selection) = row_selection { @@ -1739,6 +1825,8 @@ pub(crate) struct SimpleFilterContext { semantic_type: SemanticType, /// The data type of the column. data_type: ConcreteDataType, + /// Whether this filter can be applied by flat parquet primary-key prefiltering. + usable_primary_key_filter: bool, } impl SimpleFilterContext { @@ -1752,6 +1840,10 @@ impl SimpleFilterContext { expr: &Expr, ) -> Option { let filter = SimpleFilterEvaluator::try_new(expr)?; + // Parquet PK prefilter always supports the partition column. Only + // PartitionTreeMemtable skips it after partition pruning. + let usable_primary_key_filter = + is_usable_primary_key_filter(sst_meta, expected_meta, &filter); let (column_metadata, maybe_filter) = match expected_meta { Some(meta) => { // Gets the column metadata from the expected metadata. @@ -1782,11 +1874,15 @@ impl SimpleFilterContext { } }; + let usable_primary_key_filter = + matches!(maybe_filter, MaybeFilter::Filter(_)) && usable_primary_key_filter; + Some(Self { filter: maybe_filter, column_id: column_metadata.column_id, semantic_type: column_metadata.semantic_type, data_type: column_metadata.column_schema.data_type.clone(), + usable_primary_key_filter, }) } @@ -1809,6 +1905,23 @@ impl SimpleFilterContext { pub(crate) fn data_type(&self) -> &ConcreteDataType { &self.data_type } + + /// Returns whether this filter is eligible for flat parquet PK prefiltering. + pub(crate) fn usable_primary_key_filter(&self) -> bool { + self.usable_primary_key_filter + } + + /// Returns the filter evaluator when it is eligible for PK prefiltering. 
+ pub(crate) fn primary_key_prefilter(&self) -> Option { + if !self.usable_primary_key_filter { + return None; + } + + match &self.filter { + MaybeFilter::Filter(filter) => Some(filter.clone()), + MaybeFilter::Matched | MaybeFilter::Pruned => None, + } + } } /// Prune a column by its default value. @@ -1856,17 +1969,17 @@ impl ParquetReader { return Ok(None); }; + let skip_fields = self.context.should_skip_fields(row_group_idx); let parquet_reader = self .context .reader_builder() - .build( + .build(self.context.build_context( row_group_idx, Some(row_selection), Some(&self.fetch_metrics), - ) + skip_fields, + )) .await?; - - let skip_fields = self.context.should_skip_fields(row_group_idx); self.reader = Some(FlatPruneReader::new_with_row_group_reader( self.context.clone(), FlatRowGroupReader::new(self.context.clone(), parquet_reader), @@ -1889,11 +2002,16 @@ impl ParquetReader { debug_assert!(context.read_format().as_flat().is_some()); let fetch_metrics = ParquetFetchMetrics::default(); let reader = if let Some((row_group_idx, row_selection)) = selection.pop_first() { + let skip_fields = context.should_skip_fields(row_group_idx); let parquet_reader = context .reader_builder() - .build(row_group_idx, Some(row_selection), Some(&fetch_metrics)) + .build(context.build_context( + row_group_idx, + Some(row_selection), + Some(&fetch_metrics), + skip_fields, + )) .await?; - let skip_fields = context.should_skip_fields(row_group_idx); Some(FlatPruneReader::new_with_row_group_reader( context.clone(), FlatRowGroupReader::new(context.clone(), parquet_reader), @@ -2111,11 +2229,15 @@ mod tests { use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{ ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, + col, lit, }; use datatypes::arrow::array::{ArrayRef, Int64Array}; use datatypes::arrow::record_batch::RecordBatch; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; use 
object_store::services::Memory; use parquet::arrow::ArrowWriter; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; use store_api::region_request::PathType; use table::predicate::Predicate; @@ -2207,4 +2329,80 @@ mod tests { assert!(!selection.is_empty()); } + + fn expected_metadata_with_reused_tag_name( + old_metadata: &RegionMetadata, + ) -> Arc { + let mut builder = RegionMetadataBuilder::new(old_metadata.region_id); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "tag_0".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 10, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "tag_1".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field_0".to_string(), + ConcreteDataType::uint64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts".to_string(), + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 3, + }) + .primary_key(vec![10, 1]); + + Arc::new(builder.build().unwrap()) + } + + #[test] + fn test_simple_filter_context_marks_usable_primary_key_filter() { + let metadata: RegionMetadataRef = Arc::new(sst_region_metadata()); + let ctx = + SimpleFilterContext::new_opt(&metadata, None, &col("tag_0").eq(lit("a"))).unwrap(); + + assert!(ctx.usable_primary_key_filter()); + assert!(ctx.primary_key_prefilter().is_some()); + } + + #[test] + fn test_simple_filter_context_skips_non_usable_primary_key_filter() { + let metadata: RegionMetadataRef = Arc::new(sst_region_metadata()); + + let field_ctx = + SimpleFilterContext::new_opt(&metadata, None, 
&col("field_0").eq(lit(1_u64))).unwrap(); + assert!(!field_ctx.usable_primary_key_filter()); + assert!(field_ctx.primary_key_prefilter().is_none()); + + let expected_metadata = expected_metadata_with_reused_tag_name(metadata.as_ref()); + let mismatched_ctx = SimpleFilterContext::new_opt( + &metadata, + Some(expected_metadata.as_ref()), + &col("tag_0").eq(lit("a")), + ) + .unwrap(); + assert!(!mismatched_ctx.usable_primary_key_filter()); + assert!(mismatched_ctx.primary_key_prefilter().is_none()); + } } diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs index 38ef62c6b8..8822882c5d 100644 --- a/src/mito2/src/sst/parquet/row_group.rs +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -48,12 +48,16 @@ pub struct ParquetFetchMetricsData { pub store_fetch_elapsed: std::time::Duration, /// Total elapsed time for fetching row groups. pub total_fetch_elapsed: std::time::Duration, + /// Elapsed time for prefilter execution. + pub prefilter_cost: std::time::Duration, + /// Number of rows filtered out by prefiltering. + pub prefilter_filtered_rows: usize, } impl ParquetFetchMetricsData { /// Returns true if the metrics are empty (contain no meaningful data). 
fn is_empty(&self) -> bool { - self.total_fetch_elapsed.is_zero() + self.total_fetch_elapsed.is_zero() && self.prefilter_cost.is_zero() } } @@ -84,6 +88,8 @@ impl std::fmt::Debug for ParquetFetchMetrics { write_cache_fetch_elapsed, store_fetch_elapsed, total_fetch_elapsed, + prefilter_cost, + prefilter_filtered_rows, } = *data; write!(f, "{{")?; @@ -142,6 +148,16 @@ impl std::fmt::Debug for ParquetFetchMetrics { if !store_fetch_elapsed.is_zero() { write!(f, ", \"store_fetch_elapsed\":\"{:?}\"", store_fetch_elapsed)?; } + if !prefilter_cost.is_zero() { + write!(f, ", \"prefilter_cost\":\"{:?}\"", prefilter_cost)?; + } + if prefilter_filtered_rows > 0 { + write!( + f, + ", \"prefilter_filtered_rows\":{}", + prefilter_filtered_rows + )?; + } write!(f, "}}") } @@ -169,6 +185,8 @@ impl ParquetFetchMetrics { write_cache_fetch_elapsed, store_fetch_elapsed, total_fetch_elapsed, + prefilter_cost, + prefilter_filtered_rows, } = *other.data.lock().unwrap(); let mut data = self.data.lock().unwrap(); @@ -185,6 +203,8 @@ impl ParquetFetchMetrics { data.write_cache_fetch_elapsed += write_cache_fetch_elapsed; data.store_fetch_elapsed += store_fetch_elapsed; data.total_fetch_elapsed += total_fetch_elapsed; + data.prefilter_cost += prefilter_cost; + data.prefilter_filtered_rows += prefilter_filtered_rows; } } diff --git a/src/mito2/src/sst/parquet/row_selection.rs b/src/mito2/src/sst/parquet/row_selection.rs index 595f1d352a..763e244ef2 100644 --- a/src/mito2/src/sst/parquet/row_selection.rs +++ b/src/mito2/src/sst/parquet/row_selection.rs @@ -554,11 +554,43 @@ fn intersect_row_selections(left: &RowSelection, right: &RowSelection) -> RowSel /// or if there's a gap that requires skipping rows. It handles both "select" and "skip" actions, /// optimizing the list of selectors by merging contiguous actions of the same type. 
/// +/// The returned selection intentionally stops at the end of the last matched range and may omit a +/// trailing `skip` that would extend it to `total_row_count`. That is fine when the selection is +/// used directly by the parquet reader, which simply stops once the selectors are exhausted. +/// /// Note: overlapping ranges are not supported and will result in an incorrect selection. pub(crate) fn row_selection_from_row_ranges( row_ranges: impl Iterator>, total_row_count: usize, ) -> RowSelection { + let (selectors, _) = build_selectors_from_row_ranges(row_ranges, total_row_count); + RowSelection::from(selectors) +} + +/// Like [`row_selection_from_row_ranges`] but guarantees the resulting selection +/// covers exactly `total_row_count` rows by appending a trailing skip if needed. +/// +/// Required when the result is used as the inner operand of [`RowSelection::and_then`], because +/// `and_then` expects the inner selection to account for every row selected by the outer one. +pub(crate) fn row_selection_from_row_ranges_exact( + row_ranges: impl Iterator>, + total_row_count: usize, +) -> RowSelection { + let (mut selectors, last_processed_end) = + build_selectors_from_row_ranges(row_ranges, total_row_count); + if last_processed_end < total_row_count { + // Preserve the full logical length of the selection even when the final rows are all + // filtered out. Without this trailing skip, `and_then` sees an undersized inner + // selection and panics. 
+ add_or_merge_selector(&mut selectors, total_row_count - last_processed_end, true); + } + RowSelection::from(selectors) +} + +fn build_selectors_from_row_ranges( + row_ranges: impl Iterator>, + total_row_count: usize, +) -> (Vec, usize) { let mut selectors: Vec = Vec::new(); let mut last_processed_end = 0; @@ -572,7 +604,7 @@ pub(crate) fn row_selection_from_row_ranges( last_processed_end = end; } - RowSelection::from(selectors) + (selectors, last_processed_end) } /// Converts an iterator of sorted row IDs into a `RowSelection`. @@ -707,6 +739,56 @@ mod tests { assert_eq!(selection, expected); } + #[test] + fn test_exact_single_range_with_trailing_skip() { + let selection = row_selection_from_row_ranges_exact(Some(0..3).into_iter(), 6); + let expected = RowSelection::from(vec![RowSelector::select(3), RowSelector::skip(3)]); + assert_eq!(selection, expected); + assert_eq!(selection.row_count(), 3); + } + + #[test] + fn test_exact_non_contiguous_ranges() { + let ranges = [1..3, 5..8]; + let selection = row_selection_from_row_ranges_exact(ranges.iter().cloned(), 10); + let expected = RowSelection::from(vec![ + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(2), + ]); + assert_eq!(selection, expected); + assert_eq!(selection.row_count(), 5); + } + + #[test] + fn test_exact_empty_ranges() { + let selection = row_selection_from_row_ranges_exact([].iter().cloned(), 10); + let expected = RowSelection::from(vec![RowSelector::skip(10)]); + assert_eq!(selection, expected); + assert_eq!(selection.row_count(), 0); + } + + #[test] + fn test_exact_range_covers_all_rows() { + let selection = row_selection_from_row_ranges_exact(Some(0..10).into_iter(), 10); + let expected = RowSelection::from(vec![RowSelector::select(10)]); + assert_eq!(selection, expected); + assert_eq!(selection.row_count(), 10); + } + + #[test] + fn test_exact_compatible_with_and_then() { + // Outer selects rows 0..6 out of 10. 
+ let outer = RowSelection::from(vec![RowSelector::select(6), RowSelector::skip(4)]); + // Inner: within those 6 rows, select only rows 0..3. + let inner = row_selection_from_row_ranges_exact(Some(0..3).into_iter(), 6); + let result = outer.and_then(&inner); + let expected = RowSelection::from(vec![RowSelector::select(3), RowSelector::skip(7)]); + assert_eq!(result, expected); + } + #[test] fn test_row_ids_to_selection() { let row_ids = [1, 3, 5, 7, 9].into_iter(); From 6bd14aaf9f1d051c1967fdc09638f8d3abb82544 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 30 Mar 2026 16:22:37 +0800 Subject: [PATCH 053/195] fix: correct app-name for dashboard (#7884) --- Cargo.lock | 1 - src/cmd/Cargo.toml | 1 - src/cmd/src/cli.rs | 2 +- src/cmd/src/datanode.rs | 2 +- src/cmd/src/flownode.rs | 3 +-- src/cmd/src/frontend.rs | 2 +- src/cmd/src/metasrv.rs | 3 +-- src/cmd/src/standalone.rs | 3 +-- 8 files changed, 6 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f3b58b373..0cab5067bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2055,7 +2055,6 @@ dependencies = [ "common-time", "common-version", "common-wal", - "const_format", "datafusion", "datafusion-common", "datafusion-physical-plan", diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 003f1434f4..d547ec6e81 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -54,7 +54,6 @@ common-telemetry = { workspace = true, features = [ common-time.workspace = true common-version.workspace = true common-wal.workspace = true -const_format.workspace = true datafusion.workspace = true datafusion-common.workspace = true datafusion-physical-plan.workspace = true diff --git a/src/cmd/src/cli.rs b/src/cmd/src/cli.rs index 501b7b1615..84e797c291 100644 --- a/src/cmd/src/cli.rs +++ b/src/cmd/src/cli.rs @@ -21,7 +21,7 @@ use tracing_appender::non_blocking::WorkerGuard; use crate::options::GlobalOptions; use crate::{App, Result, error}; -pub const APP_NAME: &str = 
const_format::concatcp!(common_version::product_name(), "-cli"); +pub const APP_NAME: &str = "greptime-cli"; use async_trait::async_trait; pub struct Instance { diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 2fadb1d210..06e2568b72 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -43,7 +43,7 @@ use crate::error::{ }; use crate::options::{GlobalOptions, GreptimeOptions}; -pub const APP_NAME: &str = const_format::concatcp!(common_version::product_name(), "-datanode"); +pub const APP_NAME: &str = "greptime-datanode"; type DatanodeOptions = GreptimeOptions; diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index 8e3277cdb3..3f8458cddf 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -35,7 +35,6 @@ use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; -use const_format::concatcp; use flow::{ FlownodeBuilder, FlownodeInstance, FlownodeServiceBuilder, FrontendClient, FrontendInvoker, get_flow_auth_options, @@ -53,7 +52,7 @@ use crate::error::{ use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_heap_profile}; -pub const APP_NAME: &str = concatcp!(common_version::product_name(), "-flownode"); +pub const APP_NAME: &str = "greptime-flownode"; type FlownodeOptions = GreptimeOptions; diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 07c9f775f2..cb802791c5 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -72,7 +72,7 @@ pub struct Instance { _guard: Vec, } -pub const APP_NAME: &str = const_format::concatcp!(common_version::product_name(), "-frontend"); +pub const APP_NAME: &str = "greptime-frontend"; impl Instance { pub fn new(frontend: Frontend, _guard: Vec) -> Self { diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index dec9edc193..2ce5fb3a02 
100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -24,7 +24,6 @@ use common_meta::distributed_time_constants::init_distributed_time_constants; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; -use const_format::concatcp; use meta_srv::bootstrap::{MetasrvInstance, metasrv_builder}; use meta_srv::metasrv::BackendImpl; use snafu::ResultExt; @@ -36,7 +35,7 @@ use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_hea type MetasrvOptions = GreptimeOptions; -pub const APP_NAME: &str = concatcp!(common_version::product_name(), "-metasrv"); +pub const APP_NAME: &str = "greptime-metasrv"; pub struct Instance { instance: MetasrvInstance, diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 196ff07c92..215bea0ec5 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -48,7 +48,6 @@ use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_time::timezone::set_default_timezone; use common_version::{short_version, verbose_version}; -use const_format::concatcp; use datanode::config::DatanodeOptions; use datanode::datanode::{Datanode, DatanodeBuilder}; use datanode::region_server::RegionServer; @@ -76,7 +75,7 @@ use crate::error::{OtherSnafu, Result, StartFlownodeSnafu}; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, error, log_versions, maybe_activate_heap_profile}; -pub const APP_NAME: &str = concatcp!(common_version::product_name(), "-standalone"); +pub const APP_NAME: &str = "greptime-standalone"; #[derive(Parser)] pub struct Command { From a8fe6b5e44fc47ccbebd33ac08d8541efbc91490 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Mon, 30 Mar 2026 16:30:32 +0800 Subject: [PATCH 054/195] fix: allow auto type upscale conversion in trace ingestion 
(#7870) * fix: allow auto type upscale conversion in trace ingestion Signed-off-by: shuiyisong * fix: immediate return when parse fails Signed-off-by: shuiyisong * fix: typos Signed-off-by: shuiyisong * test: add integration test and fix Signed-off-by: shuiyisong * feat: add Int/Float/Bool to String conversion Signed-off-by: shuiyisong * refactor: coerce rows together Signed-off-by: shuiyisong * refactor: extract coerce Signed-off-by: shuiyisong * refactor: save clone Signed-off-by: shuiyisong * chore: add comments Signed-off-by: shuiyisong * refactor: unify in- and cross-batch check Signed-off-by: shuiyisong * chore: add comments Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/frontend/src/instance/otlp.rs | 233 ++++++++++++++- src/servers/src/otlp/trace.rs | 1 + src/servers/src/otlp/trace/coerce.rs | 343 ++++++++++++++++++++++ src/servers/src/otlp/trace/v1.rs | 265 +++++++++++++++-- src/servers/src/row_writer.rs | 29 ++ tests-integration/tests/http.rs | 418 +++++++++++++++++++++++++++ 6 files changed, 1258 insertions(+), 31 deletions(-) create mode 100644 src/servers/src/otlp/trace/coerce.rs diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 52df274780..9b21f9924f 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -14,6 +14,8 @@ use std::sync::Arc; +use api::helper::ColumnDataTypeWrapper; +use api::v1::{ColumnDataType, RowInsertRequests}; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::Output; @@ -24,10 +26,14 @@ use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; use pipeline::{GreptimePipelineParams, PipelineWay}; -use servers::error::{self, AuthSnafu, Result as ServerResult}; +use 
servers::error::{self, AuthSnafu, CatalogSnafu, Result as ServerResult}; use servers::http::prom_store::PHYSICAL_TABLE_PARAM; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; +use servers::otlp::trace::coerce::{ + coerce_value_data, is_supported_trace_coercion, resolve_new_trace_column_type, + trace_value_datatype, +}; use servers::query_handler::{OpenTelemetryProtocolHandler, PipelineHandlerRef}; use session::context::QueryContextRef; use snafu::ResultExt; @@ -124,7 +130,7 @@ impl OpenTelemetryProtocolHandler for Instance { let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1); - let (requests, rows) = otlp::trace::to_grpc_insert_requests( + let (mut requests, rows) = otlp::trace::to_grpc_insert_requests( request, pipeline, pipeline_params, @@ -136,6 +142,8 @@ impl OpenTelemetryProtocolHandler for Instance { OTLP_TRACES_ROWS.inc_by(rows as u64); if is_trace_v1_model { + self.reconcile_trace_column_types(&mut requests, &ctx) + .await?; self.handle_trace_inserts(requests, ctx) .await .map_err(BoxedError::new) @@ -200,3 +208,224 @@ impl OpenTelemetryProtocolHandler for Instance { Ok(outputs) } } + +impl Instance { + /// Picks the final datatype for one trace column. + /// + /// Existing table schema is authoritative when present. Otherwise we resolve the + /// request-local observed types using the shared trace coercion rules. 
+ fn choose_trace_target_type( + observed_types: &[ColumnDataType], + existing_type: Option, + ) -> ServerResult> { + let Some(existing_type) = existing_type else { + return resolve_new_trace_column_type(observed_types.iter().copied()).map_err(|_| { + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .build() + }); + }; + + if observed_types.iter().copied().all(|request_type| { + request_type == existing_type + || is_supported_trace_coercion(request_type, existing_type) + }) { + Ok(Some(existing_type)) + } else { + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .fail() + } + } + + /// Coerce request column types and values to match the existing table schema + /// for compatible type pairs. Existing table schema wins when present; + /// otherwise the full request batch decides a stable target type. + async fn reconcile_trace_column_types( + &self, + requests: &mut RowInsertRequests, + ctx: &QueryContextRef, + ) -> ServerResult<()> { + let catalog = ctx.current_catalog(); + let schema = ctx.current_schema(); + + for req in &mut requests.inserts { + let table = self + .catalog_manager + .table(catalog, &schema, &req.table_name, None) + .await + .context(CatalogSnafu)?; + + let Some(rows) = req.rows.as_mut() else { + continue; + }; + + let table_schema = table.map(|table| table.schema()); + let mut pending_coercions = Vec::new(); + + for (col_idx, col_schema) in rows.schema.iter().enumerate() { + let Some(current_type) = ColumnDataType::try_from(col_schema.datatype).ok() else { + continue; + }; + + let mut observed_types = Vec::new(); + push_observed_trace_type(&mut observed_types, current_type); + + // Scan the full request first so the final type decision is not affected + // by row order inside the batch. 
+ for row in &rows.rows { + let Some(value) = row + .values + .get(col_idx) + .and_then(|value| value.value_data.as_ref()) + else { + continue; + }; + + let Some(value_type) = trace_value_datatype(value) else { + continue; + }; + push_observed_trace_type(&mut observed_types, value_type); + } + + let existing_type = table_schema + .as_ref() + .and_then(|schema| schema.column_schema_by_name(&col_schema.column_name)) + .and_then(|table_col| { + ColumnDataTypeWrapper::try_from(table_col.data_type.clone()) + .ok() + .map(|wrapper| wrapper.datatype()) + }); + + if !observed_types + .iter() + .copied() + .any(is_trace_reconcile_candidate_type) + && existing_type + .map(|datatype| !is_trace_reconcile_candidate_type(datatype)) + .unwrap_or(true) + { + continue; + } + + // Decide the final type once per column, then rewrite all affected cells + // together in one row pass below. + let Some(target_type) = + Self::choose_trace_target_type(&observed_types, existing_type).map_err( + |_| { + enrich_trace_reconcile_error( + &req.table_name, + &col_schema.column_name, + &observed_types, + existing_type, + ) + }, + )? + else { + continue; + }; + + if observed_types + .iter() + .all(|observed| *observed == target_type) + && col_schema.datatype == target_type as i32 + { + continue; + } + + pending_coercions.push((col_idx, target_type, col_schema.column_name.clone())); + } + + if pending_coercions.is_empty() { + continue; + } + + // Update schema metadata before mutating row values so both stay in sync. + for (col_idx, target_type, ..) in &pending_coercions { + rows.schema[*col_idx].datatype = *target_type as i32; + } + + // Apply all pending column rewrites in one row pass. 
+ for row in &mut rows.rows { + for (col_idx, target_type, column_name) in &pending_coercions { + let Some(value) = row.values.get_mut(*col_idx) else { + continue; + }; + let Some(request_type) = + value.value_data.as_ref().and_then(trace_value_datatype) + else { + continue; + }; + if request_type == *target_type { + continue; + } + + value.value_data = coerce_value_data( + &value.value_data, + *target_type, + request_type, + ) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: format!( + "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", + column_name, req.table_name, request_type, target_type + ), + } + .build() + })?; + } + } + } + + Ok(()) + } +} + +fn enrich_trace_reconcile_error( + table_name: &str, + column_name: &str, + observed_types: &[ColumnDataType], + existing_type: Option, +) -> servers::error::Error { + let observed_types = observed_types + .iter() + .map(|datatype| format!("{datatype:?}")) + .collect::>() + .join(", "); + + error::InvalidParameterSnafu { + reason: match existing_type { + Some(existing_type) => format!( + "failed to reconcile trace column '{}' in table '{}' with observed types [{}] against existing {:?}", + column_name, table_name, observed_types, existing_type + ), + None => format!( + "failed to reconcile trace column '{}' in table '{}' with observed types [{}]", + column_name, table_name, observed_types + ), + }, + } + .build() +} + +/// Only these trace scalar types participate in reconciliation. Other column kinds +/// such as JSON and binary keep their original write path and schema checks. +fn is_trace_reconcile_candidate_type(datatype: ColumnDataType) -> bool { + matches!( + datatype, + ColumnDataType::String + | ColumnDataType::Boolean + | ColumnDataType::Int64 + | ColumnDataType::Float64 + ) +} + +/// Keeps the observed type list small without depending on enum ordering. 
+fn push_observed_trace_type(observed_types: &mut Vec, datatype: ColumnDataType) { + if !observed_types.contains(&datatype) { + observed_types.push(datatype); + } +} diff --git a/src/servers/src/otlp/trace.rs b/src/servers/src/otlp/trace.rs index b724bb1d22..ca56f9b868 100644 --- a/src/servers/src/otlp/trace.rs +++ b/src/servers/src/otlp/trace.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod attributes; +pub mod coerce; pub mod span; pub mod v0; pub mod v1; diff --git a/src/servers/src/otlp/trace/coerce.rs b/src/servers/src/otlp/trace/coerce.rs new file mode 100644 index 0000000000..febec0fda9 --- /dev/null +++ b/src/servers/src/otlp/trace/coerce.rs @@ -0,0 +1,343 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::ColumnDataType; +use api::v1::value::ValueData; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TraceCoerceError { + Unsupported, +} + +// For now we support the following coercions: +// - Int64 to Float64 +// - Int64 to String +// - Float64 to String +// - Boolean to String +// The following coercions are supported with parse, which could fail: +// If fails, we will return TraceCoerceError::Unsupported. 
+// - String to Int64 +// - String to Float64 +// - String to Boolean +pub fn is_supported_trace_coercion( + request_type: ColumnDataType, + target_type: ColumnDataType, +) -> bool { + matches!( + (request_type, target_type), + (ColumnDataType::Int64, ColumnDataType::Float64) + | (ColumnDataType::Int64, ColumnDataType::String) + | (ColumnDataType::Float64, ColumnDataType::String) + | (ColumnDataType::Boolean, ColumnDataType::String) + | (ColumnDataType::String, ColumnDataType::Int64) + | (ColumnDataType::String, ColumnDataType::Float64) + | (ColumnDataType::String, ColumnDataType::Boolean) + ) +} + +pub fn coerce_value_data( + value: &Option, + target: ColumnDataType, + request_type: ColumnDataType, +) -> Result, TraceCoerceError> { + let Some(v) = value else { + return Ok(None); + }; + + let Some(value) = coerce_non_null_value(target, request_type, v) else { + return Err(TraceCoerceError::Unsupported); + }; + Ok(Some(value)) +} + +pub fn coerce_non_null_value( + target: ColumnDataType, + request_type: ColumnDataType, + value: &ValueData, +) -> Option { + match (request_type, target, value) { + (ColumnDataType::Int64, ColumnDataType::Float64, ValueData::I64Value(n)) => { + Some(ValueData::F64Value(*n as f64)) + } + (ColumnDataType::Int64, ColumnDataType::String, ValueData::I64Value(n)) => { + Some(ValueData::StringValue(n.to_string())) + } + (ColumnDataType::Float64, ColumnDataType::String, ValueData::F64Value(n)) => { + Some(ValueData::StringValue(n.to_string())) + } + (ColumnDataType::Boolean, ColumnDataType::String, ValueData::BoolValue(b)) => { + Some(ValueData::StringValue(b.to_string())) + } + (ColumnDataType::String, ColumnDataType::Int64, ValueData::StringValue(s)) => { + s.parse::().ok().map(ValueData::I64Value) + } + (ColumnDataType::String, ColumnDataType::Float64, ValueData::StringValue(s)) => { + s.parse::().ok().map(ValueData::F64Value) + } + (ColumnDataType::String, ColumnDataType::Boolean, ValueData::StringValue(s)) => { + 
s.parse::().ok().map(ValueData::BoolValue) + } + _ => None, + } +} + +pub fn trace_value_datatype(value: &ValueData) -> Option { + match value { + ValueData::StringValue(_) => Some(ColumnDataType::String), + ValueData::BoolValue(_) => Some(ColumnDataType::Boolean), + ValueData::I64Value(_) => Some(ColumnDataType::Int64), + ValueData::F64Value(_) => Some(ColumnDataType::Float64), + ValueData::BinaryValue(_) => Some(ColumnDataType::Binary), + _ => None, + } +} + +/// Resolves the final datatype for a new trace column when there is no existing +/// table schema to override the request-local observations. +pub fn resolve_new_trace_column_type( + observed_types: impl IntoIterator, +) -> Result, TraceCoerceError> { + let mut observed_types = observed_types.into_iter().collect::>(); + observed_types.dedup(); + + match observed_types.as_slice() { + [] => Ok(None), + [datatype] => Ok(Some(*datatype)), + [_, _] + if observed_types.contains(&ColumnDataType::String) + && observed_types.contains(&ColumnDataType::Boolean) => + { + Ok(Some(ColumnDataType::Boolean)) + } + [_, _] + if observed_types.contains(&ColumnDataType::String) + && observed_types.contains(&ColumnDataType::Int64) => + { + Ok(Some(ColumnDataType::Int64)) + } + [_, _] + if observed_types.contains(&ColumnDataType::String) + && observed_types.contains(&ColumnDataType::Float64) => + { + Ok(Some(ColumnDataType::Float64)) + } + [_, _] + if observed_types.contains(&ColumnDataType::Int64) + && observed_types.contains(&ColumnDataType::Float64) => + { + Ok(Some(ColumnDataType::Float64)) + } + _ => Err(TraceCoerceError::Unsupported), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_coerce_int64_to_float64() { + let result = coerce_value_data( + &Some(ValueData::I64Value(42)), + ColumnDataType::Float64, + ColumnDataType::Int64, + ); + assert_eq!(result, Ok(Some(ValueData::F64Value(42.0)))); + } + + #[test] + fn test_coerce_string_to_int64() { + let result = coerce_value_data( + 
&Some(ValueData::StringValue("123".to_string())), + ColumnDataType::Int64, + ColumnDataType::String, + ); + assert_eq!(result, Ok(Some(ValueData::I64Value(123)))); + } + + #[test] + fn test_coerce_int64_to_string() { + let result = coerce_value_data( + &Some(ValueData::I64Value(123)), + ColumnDataType::String, + ColumnDataType::Int64, + ); + assert_eq!(result, Ok(Some(ValueData::StringValue("123".to_string())))); + } + + #[test] + fn test_coerce_string_to_float64() { + let result = coerce_value_data( + &Some(ValueData::StringValue("1.5".to_string())), + ColumnDataType::Float64, + ColumnDataType::String, + ); + assert_eq!(result, Ok(Some(ValueData::F64Value(1.5)))); + } + + #[test] + fn test_coerce_float64_to_string() { + let result = coerce_value_data( + &Some(ValueData::F64Value(1.5)), + ColumnDataType::String, + ColumnDataType::Float64, + ); + assert_eq!(result, Ok(Some(ValueData::StringValue("1.5".to_string())))); + } + + #[test] + fn test_coerce_string_to_boolean() { + let result = coerce_value_data( + &Some(ValueData::StringValue("true".to_string())), + ColumnDataType::Boolean, + ColumnDataType::String, + ); + assert_eq!(result, Ok(Some(ValueData::BoolValue(true)))); + + let result = coerce_value_data( + &Some(ValueData::StringValue("false".to_string())), + ColumnDataType::Boolean, + ColumnDataType::String, + ); + assert_eq!(result, Ok(Some(ValueData::BoolValue(false)))); + } + + #[test] + fn test_coerce_boolean_to_string() { + let result = coerce_value_data( + &Some(ValueData::BoolValue(true)), + ColumnDataType::String, + ColumnDataType::Boolean, + ); + assert_eq!(result, Ok(Some(ValueData::StringValue("true".to_string())))); + } + + #[test] + fn test_coerce_unparsable_string() { + let result = coerce_value_data( + &Some(ValueData::StringValue("not_a_number".to_string())), + ColumnDataType::Int64, + ColumnDataType::String, + ); + assert_eq!(result, Err(TraceCoerceError::Unsupported)); + } + + #[test] + fn test_coerce_float64_to_int64_not_supported() { + let 
result = coerce_value_data( + &Some(ValueData::F64Value(1.5)), + ColumnDataType::Int64, + ColumnDataType::Float64, + ); + assert_eq!(result, Err(TraceCoerceError::Unsupported)); + } + + #[test] + fn test_coerce_none_value() { + let result = coerce_value_data(&None, ColumnDataType::Float64, ColumnDataType::Int64); + assert_eq!(result, Ok(None)); + } + + #[test] + fn test_is_supported_trace_coercion() { + assert!(is_supported_trace_coercion( + ColumnDataType::Int64, + ColumnDataType::Float64 + )); + assert!(is_supported_trace_coercion( + ColumnDataType::Int64, + ColumnDataType::String + )); + assert!(is_supported_trace_coercion( + ColumnDataType::Float64, + ColumnDataType::String + )); + assert!(is_supported_trace_coercion( + ColumnDataType::Boolean, + ColumnDataType::String + )); + assert!(is_supported_trace_coercion( + ColumnDataType::String, + ColumnDataType::Int64 + )); + assert!(is_supported_trace_coercion( + ColumnDataType::String, + ColumnDataType::Float64 + )); + assert!(is_supported_trace_coercion( + ColumnDataType::String, + ColumnDataType::Boolean + )); + assert!(!is_supported_trace_coercion( + ColumnDataType::Binary, + ColumnDataType::Json + )); + } + + #[test] + fn test_trace_value_datatype() { + assert_eq!( + trace_value_datatype(&ValueData::StringValue("x".to_string())), + Some(ColumnDataType::String) + ); + assert_eq!( + trace_value_datatype(&ValueData::BoolValue(true)), + Some(ColumnDataType::Boolean) + ); + assert_eq!( + trace_value_datatype(&ValueData::I64Value(1)), + Some(ColumnDataType::Int64) + ); + assert_eq!( + trace_value_datatype(&ValueData::F64Value(1.0)), + Some(ColumnDataType::Float64) + ); + assert_eq!( + trace_value_datatype(&ValueData::BinaryValue(vec![1_u8])), + Some(ColumnDataType::Binary) + ); + } + + #[test] + fn test_resolve_new_trace_column_type() { + assert_eq!( + resolve_new_trace_column_type([ColumnDataType::Int64]), + Ok(Some(ColumnDataType::Int64)) + ); + assert_eq!( + resolve_new_trace_column_type([ColumnDataType::String, 
ColumnDataType::Int64]), + Ok(Some(ColumnDataType::Int64)) + ); + assert_eq!( + resolve_new_trace_column_type([ColumnDataType::String, ColumnDataType::Float64]), + Ok(Some(ColumnDataType::Float64)) + ); + assert_eq!( + resolve_new_trace_column_type([ColumnDataType::String, ColumnDataType::Boolean]), + Ok(Some(ColumnDataType::Boolean)) + ); + assert_eq!( + resolve_new_trace_column_type([ColumnDataType::Int64, ColumnDataType::Float64]), + Ok(Some(ColumnDataType::Float64)) + ); + assert_eq!( + resolve_new_trace_column_type([ + ColumnDataType::String, + ColumnDataType::Int64, + ColumnDataType::Float64, + ]), + Err(TraceCoerceError::Unsupported) + ); + } +} diff --git a/src/servers/src/otlp/trace/v1.rs b/src/servers/src/otlp/trace/v1.rs index 86f8229769..11e986de04 100644 --- a/src/servers/src/otlp/trace/v1.rs +++ b/src/servers/src/otlp/trace/v1.rs @@ -230,7 +230,7 @@ fn write_trace_operations_to_row( Ok(()) } -fn write_attributes( +pub(crate) fn write_attributes( writer: &mut TableData, prefix: &str, attributes: Attributes, @@ -247,44 +247,40 @@ fn write_attributes( let key = format!("{}.{}", prefix, key_suffix); match attr.value.and_then(|v| v.value) { Some(OtlpValue::StringValue(v)) => { - row_writer::write_fields( - writer, - std::iter::once(make_string_column_data(&key, Some(v))), + // Keep the raw request value here. Mixed trace types are reconciled later + // in the frontend once we can also see the existing table schema. + writer.write_field_unchecked( + &key, + ColumnDataType::String, + Some(ValueData::StringValue(v)), row, - )?; + ); } Some(OtlpValue::BoolValue(v)) => { - row_writer::write_fields( - writer, - std::iter::once(make_column_data( - &key, - ColumnDataType::Boolean, - Some(ValueData::BoolValue(v)), - )), + // Do not coerce or promote types while building the request-local rows. 
+ writer.write_field_unchecked( + &key, + ColumnDataType::Boolean, + Some(ValueData::BoolValue(v)), row, - )?; + ); } Some(OtlpValue::IntValue(v)) => { - row_writer::write_fields( - writer, - std::iter::once(make_column_data( - &key, - ColumnDataType::Int64, - Some(ValueData::I64Value(v)), - )), + // Preserving the original value avoids order-dependent behavior inside one batch. + writer.write_field_unchecked( + &key, + ColumnDataType::Int64, + Some(ValueData::I64Value(v)), row, - )?; + ); } Some(OtlpValue::DoubleValue(v)) => { - row_writer::write_fields( - writer, - std::iter::once(make_column_data( - &key, - ColumnDataType::Float64, - Some(ValueData::F64Value(v)), - )), + writer.write_field_unchecked( + &key, + ColumnDataType::Float64, + Some(ValueData::F64Value(v)), row, - )?; + ); } Some(OtlpValue::ArrayValue(v)) => row_writer::write_json( writer, @@ -315,3 +311,214 @@ fn write_attributes( Ok(()) } + +#[cfg(test)] +mod tests { + use api::v1::value::ValueData; + use opentelemetry_proto::tonic::common::v1::any_value::Value as OtlpValue; + use opentelemetry_proto::tonic::common::v1::{AnyValue, KeyValue}; + + use super::*; + use crate::otlp::trace::attributes::Attributes; + use crate::row_writer::TableData; + + fn make_kv(key: &str, value: OtlpValue) -> KeyValue { + KeyValue { + key: key.to_string(), + value: Some(AnyValue { value: Some(value) }), + } + } + + #[test] + fn test_keep_mixed_numeric_values_until_frontend_reconciliation() { + let mut writer = TableData::new(4, 2); + + let attrs1 = Attributes::from(vec![make_kv("val", OtlpValue::DoubleValue(1.5))]); + let mut row1 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs1, &mut row1).unwrap(); + writer.add_row(row1); + + let attrs2 = Attributes::from(vec![make_kv("val", OtlpValue::IntValue(42))]); + let mut row2 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs2, &mut row2).unwrap(); + writer.add_row(row2); + + let (schema, rows) = writer.into_schema_and_rows(); + + 
let col_idx = schema + .iter() + .position(|c| c.column_name == "attr.val") + .unwrap(); + assert_eq!(schema[col_idx].datatype, ColumnDataType::Float64 as i32); + + assert_eq!( + rows[0].values[col_idx].value_data, + Some(ValueData::F64Value(1.5)) + ); + assert_eq!( + rows[1].values[col_idx].value_data, + Some(ValueData::I64Value(42)) + ); + } + + #[test] + fn test_keep_mixed_string_and_int_values_until_frontend_reconciliation() { + let mut writer = TableData::new(4, 2); + + let attrs1 = Attributes::from(vec![make_kv("val", OtlpValue::IntValue(10))]); + let mut row1 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs1, &mut row1).unwrap(); + writer.add_row(row1); + + let attrs2 = Attributes::from(vec![make_kv( + "val", + OtlpValue::StringValue("20".to_string()), + )]); + let mut row2 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs2, &mut row2).unwrap(); + writer.add_row(row2); + + let (schema, rows) = writer.into_schema_and_rows(); + let col_idx = schema + .iter() + .position(|c| c.column_name == "attr.val") + .unwrap(); + assert_eq!(schema[col_idx].datatype, ColumnDataType::Int64 as i32); + assert_eq!( + rows[1].values[col_idx].value_data, + Some(ValueData::StringValue("20".to_string())) + ); + } + + #[test] + fn test_keep_first_seen_schema_until_frontend_reconciliation() { + let mut writer = TableData::new(4, 2); + + let attrs1 = Attributes::from(vec![make_kv( + "val", + OtlpValue::StringValue("10".to_string()), + )]); + let mut row1 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs1, &mut row1).unwrap(); + writer.add_row(row1); + + let attrs2 = Attributes::from(vec![make_kv("val", OtlpValue::IntValue(20))]); + let mut row2 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs2, &mut row2).unwrap(); + writer.add_row(row2); + + let (schema, rows) = writer.into_schema_and_rows(); + let col_idx = schema + .iter() + .position(|c| c.column_name == "attr.val") + .unwrap(); + 
assert_eq!(schema[col_idx].datatype, ColumnDataType::String as i32); + assert_eq!( + rows[0].values[col_idx].value_data, + Some(ValueData::StringValue("10".to_string())) + ); + assert_eq!( + rows[1].values[col_idx].value_data, + Some(ValueData::I64Value(20)) + ); + } + + #[test] + fn test_keep_mixed_string_and_float_values_until_frontend_reconciliation() { + let mut writer = TableData::new(4, 2); + + let attrs1 = Attributes::from(vec![make_kv("val", OtlpValue::DoubleValue(1.5))]); + let mut row1 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs1, &mut row1).unwrap(); + writer.add_row(row1); + + let attrs2 = Attributes::from(vec![make_kv( + "val", + OtlpValue::StringValue("1.5".to_string()), + )]); + let mut row2 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs2, &mut row2).unwrap(); + writer.add_row(row2); + + let (schema, rows) = writer.into_schema_and_rows(); + let col_idx = schema + .iter() + .position(|c| c.column_name == "attr.val") + .unwrap(); + assert_eq!(schema[col_idx].datatype, ColumnDataType::Float64 as i32); + assert_eq!( + rows[1].values[col_idx].value_data, + Some(ValueData::StringValue("1.5".to_string())) + ); + } + + #[test] + fn test_keep_mixed_string_and_bool_values_until_frontend_reconciliation() { + let mut writer = TableData::new(4, 2); + + let attrs1 = Attributes::from(vec![make_kv( + "val", + OtlpValue::StringValue("true".to_string()), + )]); + let mut row1 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs1, &mut row1).unwrap(); + writer.add_row(row1); + + let attrs2 = Attributes::from(vec![make_kv("val", OtlpValue::BoolValue(false))]); + let mut row2 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs2, &mut row2).unwrap(); + writer.add_row(row2); + + let (schema, rows) = writer.into_schema_and_rows(); + let col_idx = schema + .iter() + .position(|c| c.column_name == "attr.val") + .unwrap(); + assert_eq!(schema[col_idx].datatype, ColumnDataType::String as 
i32); + assert_eq!( + rows[0].values[col_idx].value_data, + Some(ValueData::StringValue("true".to_string())) + ); + assert_eq!( + rows[1].values[col_idx].value_data, + Some(ValueData::BoolValue(false)) + ); + } + + #[test] + fn test_keep_mixed_binary_and_string_values_until_frontend_reconciliation() { + let mut writer = TableData::new(4, 2); + + let attrs1 = Attributes::from(vec![make_kv( + "val", + OtlpValue::BytesValue(vec![1_u8, 2, 3]), + )]); + let mut row1 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs1, &mut row1).unwrap(); + writer.add_row(row1); + + let attrs2 = Attributes::from(vec![make_kv( + "val", + OtlpValue::StringValue("false".to_string()), + )]); + let mut row2 = writer.alloc_one_row(); + write_attributes(&mut writer, "attr", attrs2, &mut row2).unwrap(); + writer.add_row(row2); + + let (schema, rows) = writer.into_schema_and_rows(); + let col_idx = schema + .iter() + .position(|c| c.column_name == "attr.val") + .unwrap(); + assert_eq!(schema[col_idx].datatype, ColumnDataType::Binary as i32); + assert_eq!( + rows[0].values[col_idx].value_data, + Some(ValueData::BinaryValue(vec![1_u8, 2, 3])) + ); + assert_eq!( + rows[1].values[col_idx].value_data, + Some(ValueData::StringValue("false".to_string())) + ); + } + // Conversion matrix coverage lives in the shared coercion helper tests. +} diff --git a/src/servers/src/row_writer.rs b/src/servers/src/row_writer.rs index ec439a8659..fca2c21b41 100644 --- a/src/servers/src/row_writer.rs +++ b/src/servers/src/row_writer.rs @@ -77,6 +77,35 @@ impl TableData { pub fn into_schema_and_rows(self) -> (Vec, Vec) { (self.schema, self.rows) } + + /// Writes a field value without enforcing that later writes use the same datatype + /// as the first-seen schema entry. + /// + /// The OTLP trace v1 path uses this to preserve raw mixed values inside one request + /// so the frontend can reconcile them later against both the full batch and the + /// existing table schema. 
+ pub fn write_field_unchecked( + &mut self, + name: impl ToString, + datatype: ColumnDataType, + value: Option, + one_row: &mut Vec, + ) { + let name = name.to_string(); + if let Some(index) = self.column_indexes.get(&name).copied() { + one_row[index].value_data = value; + } else { + let index = self.schema.len(); + self.schema.push(ColumnSchema { + column_name: name.clone(), + datatype: datatype as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }); + self.column_indexes.insert(name, index); + one_row.push(Value { value_data: value }); + } + } } pub struct MultiTableData { diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 933fcadf6b..c0d858a592 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -5283,6 +5283,315 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; + let coercion_table_name = "trace_type_coercion"; + let coercion_req = make_trace_v1_request( + "type-coercion", + vec![make_trace_v1_span( + "00000000000000000000000000000001", + "0000000000000001", + "coercion-seed", + 1_736_480_942_444_376_000, + 1_736_480_942_444_499_000, + vec![ + make_double_attr("attr_float", 1.5), + make_int_attr("attr_int", 10), + make_bool_attr("attr_bool", true), + ], + )], + ); + let res = send_trace_v1_req(&client, coercion_table_name, coercion_req, false).await; + assert_eq!(StatusCode::OK, res.status()); + + let coercion_req = make_trace_v1_request( + "type-coercion", + vec![make_trace_v1_span( + "00000000000000000000000000000002", + "0000000000000002", + "coercion-apply", + 1_736_480_942_444_589_000, + 1_736_480_942_444_712_000, + vec![ + make_int_attr("attr_float", 2), + make_string_attr("attr_int", "20"), + make_string_attr("attr_bool", "false"), + ], + )], + ); + let res = send_trace_v1_req(&client, coercion_table_name, coercion_req, false).await; + assert_eq!(StatusCode::OK, res.status()); + + let string_target_table_name = 
"trace_type_coercion_to_string"; + let string_target_seed_req = make_trace_v1_request( + "type-coercion-string", + vec![make_trace_v1_span( + "00000000000000000000000000000021", + "0000000000000021", + "string-target-seed", + 1_736_480_942_444_720_000, + 1_736_480_942_444_820_000, + vec![ + make_string_attr("attr_int", "seed"), + make_string_attr("attr_float", "seed"), + make_string_attr("attr_bool", "seed"), + ], + )], + ); + let res = send_trace_v1_req( + &client, + string_target_table_name, + string_target_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let string_target_req = make_trace_v1_request( + "type-coercion-string", + vec![make_trace_v1_span( + "00000000000000000000000000000022", + "0000000000000022", + "string-target-apply", + 1_736_480_942_444_830_000, + 1_736_480_942_444_930_000, + vec![ + make_int_attr("attr_int", 20), + make_double_attr("attr_float", 2.5), + make_bool_attr("attr_bool", false), + ], + )], + ); + let res = send_trace_v1_req(&client, string_target_table_name, string_target_req, false).await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_type_coercion_to_string_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_bool\", \"span_attributes.attr_float\", \"span_attributes.attr_int\" from {} order by trace_id;", + string_target_table_name + ), + r#"[["00000000000000000000000000000021","seed","seed","seed"],["00000000000000000000000000000022","false","2.5","20"]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_type_coercion_to_string_schema", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_coercion_to_string' and column_name in ('span_attributes.attr_bool', 'span_attributes.attr_float', 'span_attributes.attr_int') order by column_name;", + 
r#"[["span_attributes.attr_bool","string","FIELD"],["span_attributes.attr_float","string","FIELD"],["span_attributes.attr_int","string","FIELD"]]"#, + ) + .await; + + let intra_batch_prefer_non_string_table_name = "trace_type_prefer_non_string"; + let intra_batch_prefer_non_string_req = make_trace_v1_request( + "type-prefer-non-string", + vec![ + make_trace_v1_span( + "00000000000000000000000000000031", + "0000000000000031", + "prefer-non-string-seed", + 1_736_480_942_444_940_000, + 1_736_480_942_445_040_000, + vec![ + make_string_attr("attr_int", "10"), + make_string_attr("attr_float", "1.5"), + make_string_attr("attr_bool", "true"), + ], + ), + make_trace_v1_span( + "00000000000000000000000000000032", + "0000000000000032", + "prefer-non-string-apply", + 1_736_480_942_445_050_000, + 1_736_480_942_445_150_000, + vec![ + make_int_attr("attr_int", 20), + make_double_attr("attr_float", 2.5), + make_bool_attr("attr_bool", false), + ], + ), + ], + ); + let res = send_trace_v1_req( + &client, + intra_batch_prefer_non_string_table_name, + intra_batch_prefer_non_string_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_prefer_non_string_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_bool\", \"span_attributes.attr_float\", \"span_attributes.attr_int\" from {} order by trace_id;", + intra_batch_prefer_non_string_table_name + ), + r#"[["00000000000000000000000000000031",true,1.5,10],["00000000000000000000000000000032",false,2.5,20]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_prefer_non_string_schema", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_prefer_non_string' and column_name in ('span_attributes.attr_bool', 'span_attributes.attr_float', 'span_attributes.attr_int') order by column_name;", + 
r#"[["span_attributes.attr_bool","boolean","FIELD"],["span_attributes.attr_float","double","FIELD"],["span_attributes.attr_int","bigint","FIELD"]]"#, + ) + .await; + + let existing_float_table_name = "trace_type_existing_float_prefers_schema"; + let existing_float_seed_req = make_trace_v1_request( + "type-existing-float", + vec![make_trace_v1_span( + "00000000000000000000000000000041", + "0000000000000041", + "existing-float-seed", + 1_736_480_942_445_160_000, + 1_736_480_942_445_260_000, + vec![make_double_attr("attr_num", 1.25)], + )], + ); + let res = send_trace_v1_req( + &client, + existing_float_table_name, + existing_float_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_float_req = make_trace_v1_request( + "type-existing-float", + vec![ + make_trace_v1_span( + "00000000000000000000000000000042", + "0000000000000042", + "existing-float-int-first", + 1_736_480_942_445_270_000, + 1_736_480_942_445_370_000, + vec![make_int_attr("attr_num", 2)], + ), + make_trace_v1_span( + "00000000000000000000000000000043", + "0000000000000043", + "existing-float-float-later", + 1_736_480_942_445_380_000, + 1_736_480_942_445_480_000, + vec![make_double_attr("attr_num", 3.5)], + ), + ], + ); + let res = send_trace_v1_req( + &client, + existing_float_table_name, + existing_float_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_existing_float_prefers_schema_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\" from {} order by trace_id;", + existing_float_table_name + ), + r#"[["00000000000000000000000000000041",1.25],["00000000000000000000000000000042",2.0],["00000000000000000000000000000043",3.5]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_float_prefers_schema_type", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_float_prefers_schema' and 
column_name = 'span_attributes.attr_num';", + r#"[["span_attributes.attr_num","double","FIELD"]]"#, + ) + .await; + + validate_data( + "otlp_traces_v1_type_coercion_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_int\", \"span_attributes.attr_bool\" from {} order by trace_id;", + coercion_table_name + ), + r#"[["00000000000000000000000000000001",10,true],["00000000000000000000000000000002",20,false]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_type_coercion_float_sum", + &client, + &format!( + "select sum(\"span_attributes.attr_float\") from {};", + coercion_table_name + ), + r#"[[3.5]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_type_coercion_schema", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_coercion' and column_name in ('span_attributes.attr_bool', 'span_attributes.attr_float', 'span_attributes.attr_int') order by column_name;", + r#"[["span_attributes.attr_bool","boolean","FIELD"],["span_attributes.attr_float","double","FIELD"],["span_attributes.attr_int","bigint","FIELD"]]"#, + ) + .await; + + let abort_table_name = "trace_type_abort"; + let abort_seed_req = make_trace_v1_request( + "type-abort", + vec![make_trace_v1_span( + "00000000000000000000000000000011", + "0000000000000011", + "abort-seed", + 1_736_480_942_444_800_000, + 1_736_480_942_444_900_000, + vec![make_int_attr("attr_int", 10)], + )], + ); + let res = send_trace_v1_req(&client, abort_table_name, abort_seed_req, false).await; + assert_eq!(StatusCode::OK, res.status()); + + let abort_req = make_trace_v1_request( + "type-abort", + vec![ + make_trace_v1_span( + "00000000000000000000000000000012", + "0000000000000012", + "abort-parseable", + 1_736_480_942_445_000_000, + 1_736_480_942_445_100_000, + vec![make_string_attr("attr_int", "20")], + ), + make_trace_v1_span( + "00000000000000000000000000000013", + "0000000000000013", + "abort-unparsable", + 1_736_480_942_445_200_000, 
+ 1_736_480_942_445_300_000, + vec![make_string_attr("attr_int", "not_a_number")], + ), + ], + ); + let res = send_trace_v1_req(&client, abort_table_name, abort_req, false).await; + assert_eq!(StatusCode::BAD_REQUEST, res.status()); + let body: Value = res.json().await; + assert!( + body["error"].as_str().unwrap().contains( + "failed to coerce trace column 'span_attributes.attr_int' in table 'trace_type_abort'" + ), + "unexpected error body: {body}" + ); + + validate_data( + "otlp_traces_v1_type_abort_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_int\" from {} order by trace_id;", + abort_table_name + ), + r#"[["00000000000000000000000000000011",10]]"#, + ) + .await; + guard.remove_all().await; } @@ -7515,6 +7824,115 @@ async fn send_req( req.header("content-length", len).send().await } +async fn send_trace_v1_req( + client: &TestClient, + table_name: &str, + req: ExportTraceServiceRequest, + with_gzip: bool, +) -> TestResponse { + send_req( + client, + vec![ + ( + HeaderName::from_static("content-type"), + HeaderValue::from_static("application/x-protobuf"), + ), + ( + HeaderName::from_static("x-greptime-pipeline-name"), + HeaderValue::from_static(GREPTIME_INTERNAL_TRACE_PIPELINE_V1_NAME), + ), + ( + HeaderName::from_static("x-greptime-trace-table-name"), + HeaderValue::from_str(table_name).unwrap(), + ), + ], + "/v1/otlp/v1/traces", + req.encode_to_vec(), + with_gzip, + ) + .await +} + +fn make_trace_v1_request(service_name: &str, spans: Vec) -> ExportTraceServiceRequest { + serde_json::from_value(json!({ + "resourceSpans": [{ + "resource": { + "attributes": [{ + "key": "service.name", + "value": { "stringValue": service_name } + }] + }, + "scopeSpans": [{ + "scope": { + "name": "trace-v1-type-tests" + }, + "spans": spans + }], + "schemaUrl": "https://opentelemetry.io/schemas/1.4.0" + }] + })) + .unwrap() +} + +fn make_trace_v1_span( + trace_id: &str, + span_id: &str, + name: &str, + start_time_unix_nano: i64, + end_time_unix_nano: i64, 
+ attributes: Vec, +) -> Value { + json!({ + "traceId": trace_id, + "spanId": span_id, + "name": name, + "kind": 2, + "startTimeUnixNano": start_time_unix_nano.to_string(), + "endTimeUnixNano": end_time_unix_nano.to_string(), + "attributes": attributes, + "status": { + "message": "", + "code": 0 + } + }) +} + +fn make_string_attr(key: &str, value: &str) -> Value { + json!({ + "key": key, + "value": { + "stringValue": value + } + }) +} + +fn make_int_attr(key: &str, value: i64) -> Value { + json!({ + "key": key, + "value": { + "intValue": value.to_string() + } + }) +} + +fn make_double_attr(key: &str, value: f64) -> Value { + json!({ + "key": key, + "value": { + "doubleValue": value + } + }) +} + +fn make_bool_attr(key: &str, value: bool) -> Value { + json!({ + "key": key, + "value": { + "boolValue": value + } + }) +} + fn get_rows_from_output(output: &str) -> String { let resp: Value = serde_json::from_str(output).unwrap(); resp.get("output") From e14404c677ce386ce128bd107c611019ec4094a4 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 30 Mar 2026 20:13:14 +0800 Subject: [PATCH 055/195] chore: update rust toolchain to 2026-03-21 (#7849) * chore: update rust toolchain to 2026-03-21 * chore: new format * fix: lint * chore: resolve lint issues * chore: remove as_millis_f64 * chore: deps up --- Cargo.lock | 32 +++---- flake.lock | 18 ++-- flake.nix | 2 +- rust-toolchain.toml | 2 +- src/auth/tests/mod.rs | 3 +- src/catalog/src/lib.rs | 3 - src/catalog/src/memory/manager.rs | 20 ++--- .../information_schema/tables.rs | 24 ++--- src/catalog/src/system_schema/pg_catalog.rs | 10 +-- src/catalog/src/table_source.rs | 2 +- src/cli/src/data/export.rs | 6 +- src/client/src/database.rs | 2 +- src/client/src/lib.rs | 2 - src/cmd/src/bin/greptime.rs | 1 + src/cmd/src/datanode.rs | 2 +- src/cmd/src/datanode/scanbench.rs | 2 +- src/cmd/src/lib.rs | 2 +- .../datasource/src/file_format/tests.rs | 3 +- src/common/datasource/src/lib.rs | 3 - src/common/frontend/src/error.rs | 2 +- 
src/common/frontend/src/selector.rs | 1 + src/common/function/src/lib.rs | 1 - src/common/function/src/scalars/matches.rs | 14 ++- src/common/function/src/scalars/vector.rs | 6 +- .../scalars/vector/convert/parse_vector.rs | 2 +- src/common/meta/src/cluster.rs | 2 +- .../meta/src/ddl/drop_database/start.rs | 2 +- .../meta/src/ddl/drop_table/executor.rs | 2 +- src/common/meta/src/ddl/test_util.rs | 2 +- .../src/ddl/tests/alter_logical_tables.rs | 2 +- src/common/meta/src/ddl/tests/alter_table.rs | 6 +- src/common/meta/src/ddl/tests/create_flow.rs | 2 +- .../src/ddl/tests/create_logical_tables.rs | 2 +- src/common/meta/src/ddl/tests/create_table.rs | 2 +- src/common/meta/src/ddl/tests/create_view.rs | 2 +- src/common/meta/src/ddl/tests/drop_flow.rs | 2 +- src/common/meta/src/ddl/tests/drop_table.rs | 2 +- src/common/meta/src/election/rds/mysql.rs | 3 +- src/common/meta/src/election/rds/postgres.rs | 3 +- src/common/meta/src/key/flow.rs | 2 +- src/common/meta/src/key/topic_name.rs | 2 +- src/common/meta/src/kv_backend/rds.rs | 4 +- src/common/meta/src/lib.rs | 3 - src/common/meta/src/range_stream.rs | 2 +- .../update_table_infos.rs | 2 +- src/common/meta/src/reconciliation/utils.rs | 2 +- src/common/meta/src/sequence.rs | 2 +- src/common/meta/src/snapshot.rs | 2 +- src/common/meta/src/state_store.rs | 3 +- src/common/meta/src/wal_provider.rs | 2 +- .../meta/src/wal_provider/topic_pool.rs | 2 +- src/common/procedure/src/lib.rs | 2 - src/common/procedure/src/local.rs | 2 +- src/common/procedure/src/local/runner.rs | 2 +- src/common/procedure/src/store/util.rs | 4 +- src/common/recordbatch/src/lib.rs | 4 +- src/common/recordbatch/src/recordbatch.rs | 2 +- src/common/sql/src/default_constraint.rs | 2 +- src/common/sql/src/lib.rs | 2 - src/common/telemetry/src/lib.rs | 4 +- src/common/wal/src/lib.rs | 4 +- src/datanode/src/datanode.rs | 7 +- src/datanode/src/heartbeat/handler.rs | 2 +- .../src/heartbeat/handler/close_region.rs | 3 +- 
.../src/heartbeat/handler/downgrade_region.rs | 2 +- .../src/heartbeat/handler/open_region.rs | 2 +- .../src/heartbeat/handler/upgrade_region.rs | 7 +- src/datanode/src/lib.rs | 2 - src/datanode/src/region_server.rs | 2 +- src/datatypes/src/json.rs | 4 +- src/datatypes/src/lib.rs | 3 - src/datatypes/src/types/json_type.rs | 2 +- src/datatypes/src/value.rs | 18 ++-- src/datatypes/src/vectors.rs | 9 +- src/datatypes/src/vectors/binary.rs | 2 +- src/datatypes/src/vectors/json/builder.rs | 4 +- src/file-engine/src/lib.rs | 2 - src/file-engine/src/region.rs | 2 +- src/flow/src/adapter/flownode_impl.rs | 2 +- src/flow/src/lib.rs | 1 - src/flow/src/utils.rs | 6 +- src/frontend/src/lib.rs | 2 - .../inverted_index/format/reader/footer.rs | 2 +- src/index/src/lib.rs | 1 - src/log-query/src/log_query.rs | 88 +++++++++---------- src/log-store/src/kafka/index/iterator.rs | 6 +- src/log-store/src/kafka/log_store.rs | 2 +- src/log-store/src/kafka/util/record.rs | 2 +- src/log-store/src/lib.rs | 3 - src/meta-client/src/client/cluster.rs | 4 +- src/meta-client/src/client/procedure.rs | 4 +- src/meta-srv/src/gc/candidate.rs | 2 +- src/meta-srv/src/gc/handler.rs | 2 +- src/meta-srv/src/handler.rs | 18 ++-- src/meta-srv/src/lib.rs | 2 - .../src/procedure/region_migration.rs | 2 +- .../downgrade_leader_region.rs | 2 +- .../region_migration/flush_leader_region.rs | 2 +- .../src/procedure/region_migration/manager.rs | 2 +- .../region_migration/migration_start.rs | 2 +- .../region_migration/open_candidate_region.rs | 2 +- .../procedure/region_migration/test_util.rs | 2 +- .../downgrade_leader_region.rs | 2 +- .../rollback_downgraded_region.rs | 2 +- .../upgrade_candidate_region.rs | 2 +- .../upgrade_candidate_region.rs | 2 +- .../src/procedure/region_migration/utils.rs | 2 +- .../src/procedure/repartition/group.rs | 2 +- .../repartition/group/enter_staging_region.rs | 2 +- .../repartition/group/repartition_start.rs | 2 +- .../repartition/group/sync_region.rs | 2 +- 
src/meta-srv/src/procedure/wal_prune.rs | 2 +- .../src/procedure/wal_prune/manager.rs | 2 +- src/meta-srv/src/region/supervisor.rs | 2 +- src/meta-srv/src/service/mailbox.rs | 2 +- src/meta-srv/src/state.rs | 2 +- src/metric-engine/src/engine.rs | 2 +- src/metric-engine/src/engine/bulk_insert.rs | 2 +- .../src/engine/create/extract_new_columns.rs | 2 +- src/metric-engine/src/lib.rs | 2 +- src/metric-engine/src/repeated_task.rs | 2 +- src/metric-engine/src/test_util.rs | 2 +- src/mito-codec/src/key_values.rs | 4 +- src/mito2/src/cache/index.rs | 4 +- src/mito2/src/compaction.rs | 13 ++- src/mito2/src/compaction/run.rs | 2 +- src/mito2/src/compaction/twcs.rs | 4 +- src/mito2/src/engine/alter_test.rs | 2 +- .../src/engine/apply_staging_manifest_test.rs | 3 +- src/mito2/src/engine/catchup_test.rs | 2 +- src/mito2/src/engine/copy_region_from_test.rs | 3 +- src/mito2/src/engine/remap_manifests_test.rs | 2 +- src/mito2/src/engine/staging_test.rs | 3 +- src/mito2/src/engine/sync_test.rs | 2 +- src/mito2/src/lib.rs | 2 - src/mito2/src/manifest/tests/checkpoint.rs | 2 +- src/mito2/src/region.rs | 2 +- src/mito2/src/region/opener.rs | 4 +- src/mito2/src/sst/index.rs | 4 +- src/mito2/src/sst/parquet/row_group.rs | 2 +- src/operator/src/insert.rs | 6 +- src/operator/src/lib.rs | 3 - src/operator/src/statement.rs | 2 +- src/partition/src/lib.rs | 1 - src/partition/src/multi_dim.rs | 2 +- src/pipeline/src/etl/ctx_req.rs | 2 +- src/pipeline/src/etl/processor/dissect.rs | 2 +- src/pipeline/src/etl/processor/filter.rs | 8 +- .../src/extension_plan/range_manipulate.rs | 2 +- .../src/extension_plan/series_divide.rs | 2 +- src/promql/src/functions/extrapolate_rate.rs | 2 +- src/promql/src/functions/quantile.rs | 2 +- .../fs_puffin_manager/reader.rs | 2 +- src/query/src/dist_plan/analyzer/utils.rs | 2 +- src/query/src/optimizer/parallelize_scan.rs | 2 +- src/query/src/optimizer/scan_hint.rs | 8 +- src/query/src/promql/planner.rs | 22 ++--- src/query/src/sql.rs | 22 ++--- 
src/query/src/window_sort.rs | 2 +- src/servers/src/grpc/flight.rs | 3 +- src/servers/src/http/authorize.rs | 2 +- src/servers/src/http/header.rs | 12 ++- src/servers/src/http/prometheus.rs | 20 ++--- .../src/http/result/prometheus_resp.rs | 16 ++-- src/servers/src/lib.rs | 3 - src/servers/src/mysql/handler.rs | 4 +- src/servers/tests/mysql/mysql_server_test.rs | 4 +- src/sql/src/lib.rs | 2 - src/sql/src/parsers/alter_parser.rs | 2 +- src/sql/src/parsers/comment_parser.rs | 2 +- src/sql/src/parsers/copy_parser.rs | 2 +- src/sql/src/parsers/create_parser.rs | 2 +- src/sql/src/parsers/delete_parser.rs | 2 +- src/sql/src/parsers/insert_parser.rs | 2 +- src/sql/src/parsers/show_parser.rs | 2 +- src/sql/src/statements/alter.rs | 2 +- src/sql/src/statements/copy.rs | 2 +- src/sql/src/statements/create.rs | 2 +- src/sql/src/statements/describe.rs | 2 +- src/sql/src/statements/drop.rs | 2 +- src/sql/src/statements/set_variables.rs | 2 +- src/sql/src/statements/show.rs | 2 +- src/sql/src/statements/truncate.rs | 2 +- src/store-api/src/lib.rs | 2 - src/table/src/lib.rs | 1 - src/table/src/metadata.rs | 2 +- tests-integration/Cargo.toml | 5 ++ tests-integration/src/cluster.rs | 4 +- tests-integration/src/lib.rs | 2 +- .../src/tests/instance_kafka_wal_test.rs | 2 +- tests-integration/tests/main.rs | 2 + 191 files changed, 353 insertions(+), 446 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0cab5067bb..cfec1c5f54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -565,7 +565,7 @@ dependencies = [ "arrow-schema 57.3.0", "arrow-select 57.3.0", "flatbuffers", - "lz4_flex 0.12.0", + "lz4_flex 0.12.1", "zstd", ] @@ -6277,7 +6277,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.61.2", + "windows-core 0.57.0", ] [[package]] @@ -7592,18 +7592,18 @@ dependencies = [ [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" dependencies = [ "twox-hash", ] [[package]] name = "lz4_flex" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" dependencies = [ "twox-hash", ] @@ -9122,7 +9122,7 @@ dependencies = [ "flate2", "futures", "futures-util", - "lz4_flex 0.11.5", + "lz4_flex 0.11.6", "lzokay-native", "num", "prost 0.13.5", @@ -9345,7 +9345,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex 0.12.0", + "lz4_flex 0.12.1", "num-bigint", "num-integer", "num-traits", @@ -10574,7 +10574,7 @@ dependencies = [ "common-test-util", "derive_builder 0.20.2", "futures", - "lz4_flex 0.11.5", + "lz4_flex 0.11.6", "moka", "pin-project", "prometheus 0.14.0", @@ -11306,9 +11306,9 @@ dependencies = [ [[package]] name = "rsasl" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8b534a23662bb559c5c73213be63ecd6524e774d291f3618c2b04b723d184eb" +checksum = "9f1bcb95b531681a622f3d6972eaab523e17e2aad6d6209f0276628eb1cb5038" dependencies = [ "base64 0.22.1", "core2", @@ -11320,7 +11320,7 @@ dependencies = [ "serde_json", "sha2", "stringprep", - "thiserror 1.0.69", + "thiserror 2.0.17", ] [[package]] @@ -12308,9 +12308,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.10" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "slotmap" @@ -13224,7 +13224,7 @@ dependencies = [ "levenshtein_automata", "log", "lru", - "lz4_flex 0.11.5", + "lz4_flex 0.11.6", 
"measure_time", "memmap2", "once_cell", @@ -14714,7 +14714,7 @@ dependencies = [ "itertools 0.14.0", "lalrpop", "lalrpop-util", - "lz4_flex 0.11.5", + "lz4_flex 0.11.6", "md-5", "nom 7.1.3", "ofb", diff --git a/flake.lock b/flake.lock index bec6e18d9a..3c3ee4bd67 100644 --- a/flake.lock +++ b/flake.lock @@ -8,11 +8,11 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1770794449, - "narHash": "sha256-1nFkhcZx9+Sdw5OXwJqp5TxvGncqRqLeK781v0XV3WI=", + "lastModified": 1774250935, + "narHash": "sha256-mWID0WFgTnd9hbEeaPNX+YYWF70JN3r7zBouEqERJOE=", "owner": "nix-community", "repo": "fenix", - "rev": "b19d93fdf9761e6101f8cb5765d638bacebd9a1b", + "rev": "64d7705e8c37d650cfb1aa99c24a8ce46597f29e", "type": "github" }, "original": { @@ -41,11 +41,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1770617025, - "narHash": "sha256-1jZvgZoAagZZB6NwGRv2T2ezPy+X6EFDsJm+YSlsvEs=", + "lastModified": 1774244481, + "narHash": "sha256-4XfMXU0DjN83o6HWZoKG9PegCvKvIhNUnRUI19vzTcQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "2db38e08fdadcc0ce3232f7279bab59a15b94482", + "rev": "4590696c8693fea477850fe379a01544293ca4e2", "type": "github" }, "original": { @@ -65,11 +65,11 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1770702974, - "narHash": "sha256-CbvWu72rpGHK5QynoXwuOnVzxX7njF2LYgk8wRSiAQ0=", + "lastModified": 1774221325, + "narHash": "sha256-aEIdkqB8gtQZtEbogdUb5iyfcZpKIlD3FkG8ANu73/I=", "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "07a594815f7c1d6e7e39f21ddeeedb75b21795f4", + "rev": "b42b63f390a4dab14e6efa34a70e67f5b087cc62", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 6a02f4f05f..8dc84136a0 100644 --- a/flake.nix +++ b/flake.nix @@ -20,7 +20,7 @@ lib = nixpkgs.lib; rustToolchain = fenix.packages.${system}.fromToolchainName { name = (lib.importTOML ./rust-toolchain.toml).toolchain.channel; - sha256 = "sha256-GCGEXGZeJySLND0KU5TdtTrqFV76TF3UdvAHSUegSsk="; + sha256 = 
"sha256-rboGKQLH4eDuiY01SINOqmXUFUNr9F4awoFZGzib17o="; }; in { diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 58b88a3894..d16edecca8 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2025-10-01" +channel = "nightly-2026-03-21" diff --git a/src/auth/tests/mod.rs b/src/auth/tests/mod.rs index 65db96a13f..4abbf89e5c 100644 --- a/src/auth/tests/mod.rs +++ b/src/auth/tests/mod.rs @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] -use std::assert_matches::assert_matches; +use std::assert_matches; use std::sync::Arc; use api::v1::greptime_request::Request; diff --git a/src/catalog/src/lib.rs b/src/catalog/src/lib.rs index 9c31e809fd..a701473551 100644 --- a/src/catalog/src/lib.rs +++ b/src/catalog/src/lib.rs @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] -#![feature(try_blocks)] - use std::any::Any; use std::fmt::{Debug, Formatter}; use std::sync::Arc; diff --git a/src/catalog/src/memory/manager.rs b/src/catalog/src/memory/manager.rs index 571cd06468..6e747f62ed 100644 --- a/src/catalog/src/memory/manager.rs +++ b/src/catalog/src/memory/manager.rs @@ -132,15 +132,13 @@ impl CatalogManager for MemoryCatalogManager { table_name: &str, _query_ctx: Option<&QueryContext>, ) -> Result> { - let result = try { - self.catalogs - .read() - .unwrap() - .get(catalog)? - .get(schema)? - .get(table_name) - .cloned()? 
- }; + let catalogs = self.catalogs.read().unwrap(); + let result = catalogs + .get(catalog) + .and_then(|c| c.get(schema)) + .and_then(|s| s.get(table_name)) + .cloned(); + Ok(result) } @@ -149,8 +147,8 @@ impl CatalogManager for MemoryCatalogManager { .catalogs .read() .unwrap() - .iter() - .flat_map(|(_, schema_entries)| schema_entries.values()) + .values() + .flat_map(|schema_entries| schema_entries.values()) .flat_map(|tables| tables.values()) .find(|t| t.table_info().ident.table_id == table_id) .map(|t| t.table_info())) diff --git a/src/catalog/src/system_schema/information_schema/tables.rs b/src/catalog/src/system_schema/information_schema/tables.rs index 248fb243dd..6175c17d39 100644 --- a/src/catalog/src/system_schema/information_schema/tables.rs +++ b/src/catalog/src/system_schema/information_schema/tables.rs @@ -372,22 +372,16 @@ impl InformationSchemaTablesBuilder { self.table_types.push(Some(table_type_text)); self.table_ids.push(Some(table_id)); - let data_length = region_stats.iter().map(|stat| stat.sst_size).sum(); - let table_rows = region_stats.iter().map(|stat| stat.num_rows).sum(); - let index_length = region_stats.iter().map(|stat| stat.index_size).sum(); + let data_length: u64 = region_stats.iter().map(|stat| stat.sst_size).sum(); + let table_rows: u64 = region_stats.iter().map(|stat| stat.num_rows).sum(); + let index_length: u64 = region_stats.iter().map(|stat| stat.index_size).sum(); - // It's not precise, but it is acceptable for long-term data storage. 
- let avg_row_length = if table_rows > 0 { - let total_data_length = data_length - + region_stats - .iter() - .map(|stat| stat.memtable_size) - .sum::(); - - total_data_length / table_rows - } else { - 0 - }; + let total_data_length: u64 = data_length + + region_stats + .iter() + .map(|stat| stat.memtable_size) + .sum::(); + let avg_row_length = total_data_length.checked_div(table_rows).unwrap_or(0); self.data_length.push(Some(data_length)); self.index_length.push(Some(index_length)); diff --git a/src/catalog/src/system_schema/pg_catalog.rs b/src/catalog/src/system_schema/pg_catalog.rs index 08aad2d6dd..feec46ff90 100644 --- a/src/catalog/src/system_schema/pg_catalog.rs +++ b/src/catalog/src/system_schema/pg_catalog.rs @@ -74,12 +74,10 @@ impl PGCatalogProvider { ) .expect("Failed to initialize PgCatalogSchemaProvider"); - let mut table_ids = HashMap::new(); - let mut table_id = PG_CATALOG_TABLE_ID_START; - for name in PG_CATALOG_TABLES { - table_ids.insert(*name, table_id); - table_id += 1; - } + let table_ids: HashMap<_, _> = (PG_CATALOG_TABLE_ID_START..) + .zip(PG_CATALOG_TABLES.iter()) + .map(|(id, name)| (*name, id)) + .collect(); let mut provider = Self { catalog_name, diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs index 8aabf64e99..f7ba51722f 100644 --- a/src/catalog/src/table_source.rs +++ b/src/catalog/src/table_source.rs @@ -195,7 +195,7 @@ impl DfTableSourceProvider { plan_columns .iter() .map(|c| c.as_str()) - .zip(columns.into_iter()) + .zip(columns) .collect(), ) .context(ProjectViewColumnsSnafu)? 
diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs index b5d547d4f3..051c07da35 100644 --- a/src/cli/src/data/export.rs +++ b/src/cli/src/data/export.rs @@ -458,8 +458,10 @@ impl Export { /// build operator with preference for file system async fn build_prefer_fs_operator(&self) -> Result { - if self.storage_type.is_remote_storage() && self.ddl_local_dir.is_some() { - let root = self.ddl_local_dir.as_ref().unwrap().clone(); + if self.storage_type.is_remote_storage() + && let Some(ddl_local_dir) = &self.ddl_local_dir + { + let root = ddl_local_dir.clone(); let op = new_fs_object_store(&root).map_err(|e| Error::Other { source: e, location: snafu::location!(), diff --git a/src/client/src/database.rs b/src/client/src/database.rs index 6a7ac62fc3..e12c2ec0fc 100644 --- a/src/client/src/database.rs +++ b/src/client/src/database.rs @@ -512,7 +512,7 @@ struct FlightContext { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use api::v1::auth_header::AuthScheme; use api::v1::{AuthHeader, Basic}; diff --git a/src/client/src/lib.rs b/src/client/src/lib.rs index bf383acff9..0c9334b7d4 100644 --- a/src/client/src/lib.rs +++ b/src/client/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] - mod client; pub mod client_manager; pub mod database; diff --git a/src/cmd/src/bin/greptime.rs b/src/cmd/src/bin/greptime.rs index 7ddc2cd176..9c48d1fd6a 100644 --- a/src/cmd/src/bin/greptime.rs +++ b/src/cmd/src/bin/greptime.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#![recursion_limit = "256"] #![doc = include_str!("../../../../README.md")] use clap::{Parser, Subcommand}; diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 06e2568b72..9b06f24ecb 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -356,7 +356,7 @@ impl StartCommand { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::io::Write; use std::time::Duration; diff --git a/src/cmd/src/datanode/scanbench.rs b/src/cmd/src/datanode/scanbench.rs index fdda1d97bb..6bfe177fc1 100644 --- a/src/cmd/src/datanode/scanbench.rs +++ b/src/cmd/src/datanode/scanbench.rs @@ -662,7 +662,7 @@ impl ScanbenchCommand { // Sort ranges within each partition by start time ascending for partition in &mut partitions { - partition.sort_by(|a, b| a.start.cmp(&b.start)); + partition.sort_by_key(|a| a.start); } scanner diff --git a/src/cmd/src/lib.rs b/src/cmd/src/lib.rs index 46ca4c8a76..27564597d7 100644 --- a/src/cmd/src/lib.rs +++ b/src/cmd/src/lib.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] +#![recursion_limit = "256"] use async_trait::async_trait; use common_error::ext::ErrorExt; diff --git a/src/common/datasource/src/file_format/tests.rs b/src/common/datasource/src/file_format/tests.rs index ad54472d33..75d74b53cd 100644 --- a/src/common/datasource/src/file_format/tests.rs +++ b/src/common/datasource/src/file_format/tests.rs @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::assert_matches::assert_matches; use std::collections::HashMap; use std::sync::Arc; -use std::vec; +use std::{assert_matches, vec}; use common_test_util::find_workspace_path; use datafusion::assert_batches_eq; diff --git a/src/common/datasource/src/lib.rs b/src/common/datasource/src/lib.rs index 91663ce22c..f4c7fdf120 100644 --- a/src/common/datasource/src/lib.rs +++ b/src/common/datasource/src/lib.rs @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] -#![feature(type_alias_impl_trait)] - pub mod buffered_writer; pub mod compressed_writer; pub mod compression; diff --git a/src/common/frontend/src/error.rs b/src/common/frontend/src/error.rs index cee8c6df77..429489326c 100644 --- a/src/common/frontend/src/error.rs +++ b/src/common/frontend/src/error.rs @@ -52,7 +52,7 @@ pub enum Error { #[snafu(display("Failed to invoke list process service"))] CreateChannel { - source: common_grpc::error::Error, + source: Box, #[snafu(implicit)] location: Location, }, diff --git a/src/common/frontend/src/selector.rs b/src/common/frontend/src/selector.rs index 804169d1dd..5bbd8bb52c 100644 --- a/src/common/frontend/src/selector.rs +++ b/src/common/frontend/src/selector.rs @@ -91,6 +91,7 @@ impl FrontendSelector for MetaClientSelector { let channel = self .channel_manager .get(node.peer.addr) + .map_err(Box::new) .context(error::CreateChannelSnafu)?; let client = frontend_client::FrontendClient::new(channel); Ok(Box::new(client) as FrontendClientPtr) diff --git a/src/common/function/src/lib.rs b/src/common/function/src/lib.rs index 36fd27381d..7abd595367 100644 --- a/src/common/function/src/lib.rs +++ b/src/common/function/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. 
#![feature(try_blocks)] -#![feature(assert_matches)] mod admin; mod flush_flow; diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 821a1b0581..b5de60dc85 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -794,16 +794,12 @@ impl Tokenizer { is_quote_present = true; break; } - ' ' => { - if !is_quoted { - break; - } + ' ' if !is_quoted => { + break; } - '(' | ')' | '+' | '-' => { - if !is_quoted { - self.rewind_one(); - break; - } + '(' | ')' | '+' | '-' if !is_quoted => { + self.rewind_one(); + break; } '\\' => { let Some(next) = self.consume_next(pattern) else { diff --git a/src/common/function/src/scalars/vector.rs b/src/common/function/src/scalars/vector.rs index 968231aa0a..2f8772c410 100644 --- a/src/common/function/src/scalars/vector.rs +++ b/src/common/function/src/scalars/vector.rs @@ -141,7 +141,7 @@ where results.push((self.func)(v0, v1)?); } - let results = ScalarValue::iter_to_array(results.into_iter())?; + let results = ScalarValue::iter_to_array(results)?; Ok(ColumnarValue::Array(results)) } } @@ -200,7 +200,7 @@ where } } - let results = ScalarValue::iter_to_array(results.into_iter())?; + let results = ScalarValue::iter_to_array(results)?; Ok(ColumnarValue::Array(results)) } } @@ -232,7 +232,7 @@ where results.push((self.func)(&v)?); } - let results = ScalarValue::iter_to_array(results.into_iter())?; + let results = ScalarValue::iter_to_array(results)?; Ok(ColumnarValue::Array(results)) } } diff --git a/src/common/function/src/scalars/vector/convert/parse_vector.rs b/src/common/function/src/scalars/vector/convert/parse_vector.rs index 0d83f098db..7a112a4453 100644 --- a/src/common/function/src/scalars/vector/convert/parse_vector.rs +++ b/src/common/function/src/scalars/vector/convert/parse_vector.rs @@ -167,7 +167,7 @@ mod tests { "External error: Invalid vector string: [7.0,hello,9.0]", ]; - for (input, expected) in 
inputs.into_iter().zip(expected.into_iter()) { + for (input, expected) in inputs.into_iter().zip(expected) { let args = ScalarFunctionArgs { args: vec![ColumnarValue::Array(Arc::new(input))], arg_fields: vec![], diff --git a/src/common/meta/src/cluster.rs b/src/common/meta/src/cluster.rs index 78af133e8f..527ad589f1 100644 --- a/src/common/meta/src/cluster.rs +++ b/src/common/meta/src/cluster.rs @@ -303,7 +303,7 @@ impl TryFrom for Role { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_workload::DatanodeWorkloadType; diff --git a/src/common/meta/src/ddl/drop_database/start.rs b/src/common/meta/src/ddl/drop_database/start.rs index 775071d684..4da83e367f 100644 --- a/src/common/meta/src/ddl/drop_database/start.rs +++ b/src/common/meta/src/ddl/drop_database/start.rs @@ -72,7 +72,7 @@ impl State for DropDatabaseStart { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use crate::ddl::drop_database::cursor::DropDatabaseCursor; diff --git a/src/common/meta/src/ddl/drop_table/executor.rs b/src/common/meta/src/ddl/drop_table/executor.rs index c342487365..271bdbfede 100644 --- a/src/common/meta/src/ddl/drop_table/executor.rs +++ b/src/common/meta/src/ddl/drop_table/executor.rs @@ -322,7 +322,7 @@ impl DropTableExecutor { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/common/meta/src/ddl/test_util.rs b/src/common/meta/src/ddl/test_util.rs index 1dd1f783dc..36d422216f 100644 --- a/src/common/meta/src/ddl/test_util.rs +++ b/src/common/meta/src/ddl/test_util.rs @@ -19,7 +19,7 @@ pub mod datanode_handler; pub mod flownode_handler; pub mod region_metadata; -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use api::v1::meta::Partition; diff --git 
a/src/common/meta/src/ddl/tests/alter_logical_tables.rs b/src/common/meta/src/ddl/tests/alter_logical_tables.rs index e6bb7676f5..bca56bafc2 100644 --- a/src/common/meta/src/ddl/tests/alter_logical_tables.rs +++ b/src/common/meta/src/ddl/tests/alter_logical_tables.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::sync::Arc; use api::region::RegionResponse; diff --git a/src/common/meta/src/ddl/tests/alter_table.rs b/src/common/meta/src/ddl/tests/alter_table.rs index d935aa6a15..14ee71b3e6 100644 --- a/src/common/meta/src/ddl/tests/alter_table.rs +++ b/src/common/meta/src/ddl/tests/alter_table.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; @@ -256,7 +256,7 @@ async fn test_on_submit_alter_request() { results.push(result); } rx.try_recv().unwrap_err(); - results.sort_unstable_by(|(a, _), (b, _)| a.id.cmp(&b.id)); + results.sort_unstable_by_key(|(a, _)| a.id); let (peer, request) = results.remove(0); assert_alter_request(peer, request, 1, RegionId::new(table_id, 1)); @@ -310,7 +310,7 @@ async fn test_on_submit_alter_request_without_sync_request() { results.push(result); } rx.try_recv().unwrap_err(); - results.sort_unstable_by(|(a, _), (b, _)| a.id.cmp(&b.id)); + results.sort_unstable_by_key(|(a, _)| a.id); let (peer, request) = results.remove(0); assert_alter_request(peer, request, 1, RegionId::new(table_id, 1)); diff --git a/src/common/meta/src/ddl/tests/create_flow.rs b/src/common/meta/src/ddl/tests/create_flow.rs index 5b22c81857..344fc05024 100644 --- a/src/common/meta/src/ddl/tests/create_flow.rs +++ b/src/common/meta/src/ddl/tests/create_flow.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/common/meta/src/ddl/tests/create_logical_tables.rs b/src/common/meta/src/ddl/tests/create_logical_tables.rs index f7dd397f9f..e2927e8df6 100644 --- a/src/common/meta/src/ddl/tests/create_logical_tables.rs +++ b/src/common/meta/src/ddl/tests/create_logical_tables.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::sync::Arc; use api::region::RegionResponse; diff --git a/src/common/meta/src/ddl/tests/create_table.rs b/src/common/meta/src/ddl/tests/create_table.rs index 5cc1db71cb..5355ac8c7c 100644 --- a/src/common/meta/src/ddl/tests/create_table.rs +++ b/src/common/meta/src/ddl/tests/create_table.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/common/meta/src/ddl/tests/create_view.rs b/src/common/meta/src/ddl/tests/create_view.rs index e4fefa8944..cc98bb1bae 100644 --- a/src/common/meta/src/ddl/tests/create_view.rs +++ b/src/common/meta/src/ddl/tests/create_view.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashSet; use std::sync::Arc; diff --git a/src/common/meta/src/ddl/tests/drop_flow.rs b/src/common/meta/src/ddl/tests/drop_flow.rs index 8de42f5c96..af34da4809 100644 --- a/src/common/meta/src/ddl/tests/drop_flow.rs +++ b/src/common/meta/src/ddl/tests/drop_flow.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/common/meta/src/ddl/tests/drop_table.rs b/src/common/meta/src/ddl/tests/drop_table.rs index 65c3915adc..fb2c882da0 100644 --- a/src/common/meta/src/ddl/tests/drop_table.rs +++ b/src/common/meta/src/ddl/tests/drop_table.rs @@ -172,7 +172,7 @@ async fn test_on_datanode_drop_regions() { let result = rx.try_recv().unwrap(); results.push(result); } - results.sort_unstable_by(|(a, _), (b, _)| a.id.cmp(&b.id)); + results.sort_unstable_by_key(|(a, _)| a.id); let (peer, request) = results.remove(0); check(peer, request, 1, RegionId::new(table_id, 1), false); diff --git a/src/common/meta/src/election/rds/mysql.rs b/src/common/meta/src/election/rds/mysql.rs index 80f3d8ca7c..bd694e4ae1 100644 --- a/src/common/meta/src/election/rds/mysql.rs +++ b/src/common/meta/src/election/rds/mysql.rs @@ -987,8 +987,7 @@ impl MySqlElection { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; - use std::env; + use std::{assert_matches, env}; use common_telemetry::init_default_ut_logging; use sqlx::MySqlPool; diff --git a/src/common/meta/src/election/rds/postgres.rs b/src/common/meta/src/election/rds/postgres.rs index 01910335a0..220b33bb60 100644 --- a/src/common/meta/src/election/rds/postgres.rs +++ b/src/common/meta/src/election/rds/postgres.rs @@ -829,8 +829,7 @@ impl PgElection { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; - use std::env; + use 
std::{assert_matches, env}; use deadpool_postgres::{Config, Runtime}; use tokio_postgres::NoTls; diff --git a/src/common/meta/src/key/flow.rs b/src/common/meta/src/key/flow.rs index 546071f2a0..1ebd52da1a 100644 --- a/src/common/meta/src/key/flow.rs +++ b/src/common/meta/src/key/flow.rs @@ -390,7 +390,7 @@ impl std::fmt::Debug for FlowMetadataManager { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::BTreeMap; use std::sync::Arc; diff --git a/src/common/meta/src/key/topic_name.rs b/src/common/meta/src/key/topic_name.rs index 5497fbe478..99ae631a72 100644 --- a/src/common/meta/src/key/topic_name.rs +++ b/src/common/meta/src/key/topic_name.rs @@ -237,7 +237,7 @@ impl TopicNameManager { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use super::*; diff --git a/src/common/meta/src/kv_backend/rds.rs b/src/common/meta/src/kv_backend/rds.rs index 8acab1eb65..fd88496bc3 100644 --- a/src/common/meta/src/kv_backend/rds.rs +++ b/src/common/meta/src/kv_backend/rds.rs @@ -575,12 +575,12 @@ macro_rules! record_rds_sql_execute_elapsed { .inspect(|_| { $crate::metrics::RDS_SQL_EXECUTE_ELAPSED .with_label_values(&[$label_store, "success", $label_op, $label_type]) - .observe(timer.elapsed().as_millis_f64()) + .observe(timer.elapsed().as_millis() as f64) }) .inspect_err(|_| { $crate::metrics::RDS_SQL_EXECUTE_ELAPSED .with_label_values(&[$label_store, "error", $label_op, $label_type]) - .observe(timer.elapsed().as_millis_f64()); + .observe(timer.elapsed().as_millis() as f64); }) }}; } diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs index 36aae1026e..ef5109dc03 100644 --- a/src/common/meta/src/lib.rs +++ b/src/common/meta/src/lib.rs @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#![feature(assert_matches)] -#![feature(duration_millis_float)] - pub mod cache; pub mod cache_invalidator; pub mod cluster; diff --git a/src/common/meta/src/range_stream.rs b/src/common/meta/src/range_stream.rs index 2fc333064e..4bcd5de7db 100644 --- a/src/common/meta/src/range_stream.rs +++ b/src/common/meta/src/range_stream.rs @@ -187,7 +187,7 @@ impl PaginationStream { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::BTreeMap; use std::sync::Arc; diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs index d0de7a06fb..e8597ccd36 100644 --- a/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs @@ -84,7 +84,7 @@ impl State for UpdateTableInfos { .persistent_ctx .update_table_infos .iter() - .zip(table_info_values.into_iter()) + .zip(table_info_values) { let new_table_info = Self::build_new_table_info( *table_id, diff --git a/src/common/meta/src/reconciliation/utils.rs b/src/common/meta/src/reconciliation/utils.rs index 4debc6de4d..6ddc084596 100644 --- a/src/common/meta/src/reconciliation/utils.rs +++ b/src/common/meta/src/reconciliation/utils.rs @@ -949,7 +949,7 @@ impl Display for ReconcileTableMetrics { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/common/meta/src/sequence.rs b/src/common/meta/src/sequence.rs index d186446fda..cd0a8ebf88 100644 --- a/src/common/meta/src/sequence.rs +++ b/src/common/meta/src/sequence.rs @@ -337,7 +337,7 @@ impl Inner { #[cfg(test)] mod tests { use std::any::Any; - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashSet; use std::sync::Arc; diff --git 
a/src/common/meta/src/snapshot.rs b/src/common/meta/src/snapshot.rs index 8f4818e33a..e66156b255 100644 --- a/src/common/meta/src/snapshot.rs +++ b/src/common/meta/src/snapshot.rs @@ -355,7 +355,7 @@ impl MetadataSnapshotManager { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use common_test_util::temp_dir::{TempDir, create_temp_dir}; diff --git a/src/common/meta/src/state_store.rs b/src/common/meta/src/state_store.rs index 0ecbd5a8e4..d98a286581 100644 --- a/src/common/meta/src/state_store.rs +++ b/src/common/meta/src/state_store.rs @@ -380,9 +380,8 @@ impl PoisonStore for KvStateStore { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; - use std::env; use std::sync::Arc; + use std::{assert_matches, env}; use common_procedure::store::state_store::KeyValue; use common_telemetry::info; diff --git a/src/common/meta/src/wal_provider.rs b/src/common/meta/src/wal_provider.rs index cd599b58ba..bf2cb9ba32 100644 --- a/src/common/meta/src/wal_provider.rs +++ b/src/common/meta/src/wal_provider.rs @@ -172,7 +172,7 @@ pub fn extract_topic_from_wal_options( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_wal::config::kafka::MetasrvKafkaConfig; use common_wal::config::kafka::common::KafkaTopicConfig; diff --git a/src/common/meta/src/wal_provider/topic_pool.rs b/src/common/meta/src/wal_provider/topic_pool.rs index 919f0b2abe..f9b4863e52 100644 --- a/src/common/meta/src/wal_provider/topic_pool.rs +++ b/src/common/meta/src/wal_provider/topic_pool.rs @@ -136,7 +136,7 @@ impl KafkaTopicPool { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_wal::maybe_skip_kafka_integration_test; use common_wal::test_util::get_kafka_endpoints; diff --git a/src/common/procedure/src/lib.rs b/src/common/procedure/src/lib.rs index e322765558..156a0ab78c 100644 --- a/src/common/procedure/src/lib.rs +++ 
b/src/common/procedure/src/lib.rs @@ -14,8 +14,6 @@ //! Common traits and structures for the procedure framework. -#![feature(assert_matches)] - pub mod error; pub mod event; pub mod local; diff --git a/src/common/procedure/src/local.rs b/src/common/procedure/src/local.rs index fe86cd7993..9e8536308c 100644 --- a/src/common/procedure/src/local.rs +++ b/src/common/procedure/src/local.rs @@ -920,7 +920,7 @@ pub(crate) mod test_util { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_error::mock::MockError; use common_error::status_code::StatusCode; diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs index 46dcef11d4..2a974de889 100644 --- a/src/common/procedure/src/local/runner.rs +++ b/src/common/procedure/src/local/runner.rs @@ -704,7 +704,7 @@ impl Runner { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; diff --git a/src/common/procedure/src/store/util.rs b/src/common/procedure/src/store/util.rs index e6ef5b62ec..c44e5f4712 100644 --- a/src/common/procedure/src/store/util.rs +++ b/src/common/procedure/src/store/util.rs @@ -57,7 +57,7 @@ fn merge_multiple_values( let (key, value) = pairs.into_iter().next().unwrap(); let prefix = KeySet::with_prefix(&key); let mut parsed_segments = parse_segments(segments, &prefix)?; - parsed_segments.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + parsed_segments.sort_unstable_by_key(|a| a.0); // Safety: `parsed_segments` must larger than 0. 
let segment_num = parsed_segments.last().unwrap().0; @@ -133,7 +133,7 @@ pub fn multiple_value_stream( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use futures::TryStreamExt; use futures::stream::{self}; diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index d84e9e9d26..629efd6d84 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -485,7 +485,7 @@ impl QueryMemoryTracker { "{} requested, {} used globally ({}%), {} used by this stream, hard limit: {}", ReadableSize(additional as u64), ReadableSize(current as u64), - if limit > 0 { current * 100 / limit } else { 0 }, + (current * 100).checked_div(limit).unwrap_or(0), ReadableSize(stream_tracked as u64), ReadableSize(limit as u64) ); @@ -613,7 +613,7 @@ impl StreamMemoryTracker { waited, ReadableSize(additional as u64), ReadableSize(current as u64), - if limit > 0 { current * 100 / limit } else { 0 }, + (current * 100).checked_div(limit).unwrap_or(0), ReadableSize(self.tracked_bytes as u64), ReadableSize(limit as u64) ); diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index f6e0aeed93..2e92b9e87a 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -437,7 +437,7 @@ fn maybe_align_json_array_with_schema( } let mut aligned = Vec::with_capacity(arrays.len()); - for (field, array) in schema.fields().iter().zip(arrays.into_iter()) { + for (field, array) in schema.fields().iter().zip(arrays) { if !is_json_extension_type(field) { aligned.push(array); continue; diff --git a/src/common/sql/src/default_constraint.rs b/src/common/sql/src/default_constraint.rs index 0084320835..bad9f374fd 100644 --- a/src/common/sql/src/default_constraint.rs +++ b/src/common/sql/src/default_constraint.rs @@ -122,7 +122,7 @@ pub fn parse_column_default_constraint( #[cfg(test)] mod test { - use 
std::assert_matches::assert_matches; + use std::assert_matches; use datatypes::prelude::{ConcreteDataType, Value}; use datatypes::types::BooleanType; diff --git a/src/common/sql/src/lib.rs b/src/common/sql/src/lib.rs index abca883124..8835cae6b4 100644 --- a/src/common/sql/src/lib.rs +++ b/src/common/sql/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] - pub mod convert; pub mod default_constraint; pub mod error; diff --git a/src/common/telemetry/src/lib.rs b/src/common/telemetry/src/lib.rs index cd60d61645..26bf5d53b3 100644 --- a/src/common/telemetry/src/lib.rs +++ b/src/common/telemetry/src/lib.rs @@ -21,10 +21,12 @@ mod panic_hook; pub mod tracing_context; mod tracing_sampler; +pub use common_error; pub use logging::{ LOG_RELOAD_HANDLE, TRACE_RELOAD_HANDLE, get_or_init_tracer, init_default_ut_logging, init_global_logging, }; pub use metric::dump_metrics; pub use panic_hook::set_panic_hook; -pub use {common_error, tracing, tracing_subscriber}; +pub use tracing; +pub use tracing_subscriber; diff --git a/src/common/wal/src/lib.rs b/src/common/wal/src/lib.rs index 659a045f57..a0b1dc99f9 100644 --- a/src/common/wal/src/lib.rs +++ b/src/common/wal/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#![feature(assert_matches)] - use std::net::SocketAddr; use error::{EndpointIPV4NotFoundSnafu, ResolveEndpointSnafu, Result}; @@ -59,7 +57,7 @@ async fn resolve_to_ipv4_one>(endpoint: T) -> Result { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_telemetry::warn; use rskafka::client::{Credentials, SaslConfig}; diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index 859235fa9f..c848215d39 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -551,14 +551,15 @@ impl DatanodeBuilder { if kafka_config.create_index && opts.node_id.is_none() { warn!("The WAL index creation only available in distributed mode.") } - let global_index_collector = if kafka_config.create_index && opts.node_id.is_some() + let global_index_collector = if kafka_config.create_index + && let Some(node_id) = opts.node_id { let operator = new_object_store_without_cache( &opts.storage.store, &opts.storage.data_home, ) .await?; - let path = default_index_file(opts.node_id.unwrap()); + let path = default_index_file(node_id); Some(Self::build_global_index_collector( kafka_config.dump_index_interval, operator, @@ -782,7 +783,7 @@ async fn open_all_regions( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; diff --git a/src/datanode/src/heartbeat/handler.rs b/src/datanode/src/heartbeat/handler.rs index defc910573..10948a3e7c 100644 --- a/src/datanode/src/heartbeat/handler.rs +++ b/src/datanode/src/heartbeat/handler.rs @@ -295,7 +295,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; diff --git a/src/datanode/src/heartbeat/handler/close_region.rs b/src/datanode/src/heartbeat/handler/close_region.rs index 
819fcc4880..484edf5ff4 100644 --- a/src/datanode/src/heartbeat/handler/close_region.rs +++ b/src/datanode/src/heartbeat/handler/close_region.rs @@ -47,7 +47,7 @@ impl InstructionHandler for CloseRegionsHandler { let results = join_all(futs).await; let mut errors = vec![]; - for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) { + for (region_id, result) in region_ids.into_iter().zip(results) { match result { Ok(_) => (), Err(error::Error::RegionNotFound { .. }) => { @@ -79,7 +79,6 @@ mod tests { use std::assert_matches; use std::sync::Arc; - use assert_matches::assert_matches; use common_meta::RegionIdent; use common_meta::heartbeat::handler::{HandleControl, HeartbeatResponseHandler}; use common_meta::heartbeat::mailbox::MessageMeta; diff --git a/src/datanode/src/heartbeat/handler/downgrade_region.rs b/src/datanode/src/heartbeat/handler/downgrade_region.rs index 40d9765ca2..f0ec37c844 100644 --- a/src/datanode/src/heartbeat/handler/downgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/downgrade_region.rs @@ -225,7 +225,7 @@ impl HandlerContext { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use std::time::Duration; diff --git a/src/datanode/src/heartbeat/handler/open_region.rs b/src/datanode/src/heartbeat/handler/open_region.rs index 91ba618d9a..56c07a3efe 100644 --- a/src/datanode/src/heartbeat/handler/open_region.rs +++ b/src/datanode/src/heartbeat/handler/open_region.rs @@ -72,7 +72,7 @@ impl InstructionHandler for OpenRegionsHandler { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/datanode/src/heartbeat/handler/upgrade_region.rs b/src/datanode/src/heartbeat/handler/upgrade_region.rs index b2036a6ef4..c06e8aa845 100644 --- a/src/datanode/src/heartbeat/handler/upgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/upgrade_region.rs @@ -183,9 +183,10 
@@ impl UpgradeRegionsHandler { .await { Ok(responses) => { - replies.extend( - Self::convert_responses_to_replies(responses, &catchup_regions).into_iter(), - ); + replies.extend(Self::convert_responses_to_replies( + responses, + &catchup_regions, + )); } Err(_) => { replies.extend(catchup_regions.iter().map(|region_id| UpgradeRegionReply { diff --git a/src/datanode/src/lib.rs b/src/datanode/src/lib.rs index 55d2b1796d..7e0db3cabc 100644 --- a/src/datanode/src/lib.rs +++ b/src/datanode/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] - pub mod alive_keeper; pub mod config; pub mod datanode; diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index 9d675aa276..ec10691bea 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -1667,7 +1667,7 @@ impl RegionAttribute { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use api::v1::SemanticType; use common_error::ext::ErrorExt; diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs index 3bebbf89aa..db657abbcb 100644 --- a/src/datatypes/src/json.rs +++ b/src/datatypes/src/json.rs @@ -426,7 +426,7 @@ fn decode_struct_with_context<'a>( let (items, fields) = struct_value.into_parts(); - for (field, field_value) in fields.fields().iter().zip(items.into_iter()) { + for (field, field_value) in fields.fields().iter().zip(items) { let field_context = context.with_key(field.name()); let json_value = decode_value_with_context(field_value, &field_context)?; json_object.insert(field.name().to_string(), json_value); @@ -561,7 +561,7 @@ fn decode_struct_with_settings<'a>( // Process each field in the struct value let (struct_data, fields) = struct_value.into_parts(); - for (field, value) in fields.fields().iter().zip(struct_data.into_iter()) { + for (field, value) in 
fields.fields().iter().zip(struct_data) { let field_context = context.with_key(field.name()); // Check if this field should be treated as unstructured diff --git a/src/datatypes/src/lib.rs b/src/datatypes/src/lib.rs index 2c3d4c23bf..0dcce16857 100644 --- a/src/datatypes/src/lib.rs +++ b/src/datatypes/src/lib.rs @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] -#![feature(box_patterns)] - pub mod arrow_array; pub mod data_type; pub mod duration; diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 912bbfca54..13aeffb26c 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -773,7 +773,7 @@ mod tests { r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"","list":[""],"object":{"a":""}}, that: """#, r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"","list":[""],"object":{"a":""}}, that: [""]"#, ]; - for (json, expect) in jsons.into_iter().zip(expects.into_iter()) { + for (json, expect) in jsons.into_iter().zip(expects) { test(json, json_type, Err(expect))?; } diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index ab64c801e5..8cfb8da7ad 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -922,7 +922,7 @@ impl TryFrom for serde_json::Value { let map = struct_type .fields() .iter() - .zip(items.into_iter()) + .zip(items) .map(|(field, value)| { Ok(( field.name().to_string(), @@ -2723,26 +2723,26 @@ pub(crate) mod tests { .unwrap() ); assert_eq!( - ScalarValue::UInt8(Some(u8::MIN + 1)), - Value::UInt8(u8::MIN + 1) + ScalarValue::UInt8(Some(1)), + Value::UInt8(1) .try_to_scalar_value(&ConcreteDataType::uint8_datatype()) .unwrap() ); assert_eq!( - ScalarValue::UInt16(Some(u16::MIN + 2)), - Value::UInt16(u16::MIN + 2) + ScalarValue::UInt16(Some(2)), + Value::UInt16(2) 
.try_to_scalar_value(&ConcreteDataType::uint16_datatype()) .unwrap() ); assert_eq!( - ScalarValue::UInt32(Some(u32::MIN + 3)), - Value::UInt32(u32::MIN + 3) + ScalarValue::UInt32(Some(3)), + Value::UInt32(3) .try_to_scalar_value(&ConcreteDataType::uint32_datatype()) .unwrap() ); assert_eq!( - ScalarValue::UInt64(Some(u64::MIN + 4)), - Value::UInt64(u64::MIN + 4) + ScalarValue::UInt64(Some(4)), + Value::UInt64(4) .try_to_scalar_value(&ConcreteDataType::uint64_datatype()) .unwrap() ); diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index 5355b35ff4..7c7d2a4ad6 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -307,10 +307,11 @@ macro_rules! impl_extend_for_builder { }}; } -pub(crate) use { - impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector, - impl_try_from_arrow_array_for_vector, impl_validity_for_vector, -}; +pub(crate) use impl_extend_for_builder; +pub(crate) use impl_get_for_vector; +pub(crate) use impl_get_ref_for_vector; +pub(crate) use impl_try_from_arrow_array_for_vector; +pub(crate) use impl_validity_for_vector; #[cfg(test)] pub mod tests { diff --git a/src/datatypes/src/vectors/binary.rs b/src/datatypes/src/vectors/binary.rs index 21d188199c..6d9954fb80 100644 --- a/src/datatypes/src/vectors/binary.rs +++ b/src/datatypes/src/vectors/binary.rs @@ -458,7 +458,7 @@ impl BinaryVector { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::Bytes; diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs index ecc19f4bdd..58b4073666 100644 --- a/src/datatypes/src/vectors/json/builder.rs +++ b/src/datatypes/src/vectors/json/builder.rs @@ -328,7 +328,7 @@ mod tests { ), ]; let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); - for (json, result) in jsons.into_iter().zip(results.into_iter()) { + for (json, result) in 
jsons.into_iter().zip(results) { push(json, &mut builder, result); } let vector = builder.to_vector(); @@ -448,7 +448,7 @@ mod tests { for (builder, (expect_type, expect_vector)) in builder .builders .iter() - .zip(expect_types.into_iter().zip(expect_vectors.into_iter())) + .zip(expect_types.into_iter().zip(expect_vectors)) { assert_eq!(builder.json_type.name(), expect_type); let vector = builder.inner.to_vector_cloned(); diff --git a/src/file-engine/src/lib.rs b/src/file-engine/src/lib.rs index cc9bac8c6f..51d13946cd 100644 --- a/src/file-engine/src/lib.rs +++ b/src/file-engine/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] - pub mod config; pub mod engine; pub mod error; diff --git a/src/file-engine/src/region.rs b/src/file-engine/src/region.rs index c17e797966..3808b33a67 100644 --- a/src/file-engine/src/region.rs +++ b/src/file-engine/src/region.rs @@ -105,7 +105,7 @@ impl FileRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use store_api::region_request::PathType; diff --git a/src/flow/src/adapter/flownode_impl.rs b/src/flow/src/adapter/flownode_impl.rs index d1e4600e23..976df56c9e 100644 --- a/src/flow/src/adapter/flownode_impl.rs +++ b/src/flow/src/adapter/flownode_impl.rs @@ -1060,7 +1060,7 @@ impl StreamingEngine { let fetch_order: Vec = table_col_names .iter() - .zip(default_vals.into_iter()) + .zip(default_vals) .map(|(col_name, col_default_val)| { name_to_col .get(col_name) diff --git a/src/flow/src/lib.rs b/src/flow/src/lib.rs index bd4cbd9f08..fe8a760a07 100644 --- a/src/flow/src/lib.rs +++ b/src/flow/src/lib.rs @@ -17,7 +17,6 @@ //! It also contains definition of expression, adapter and plan, and internal state management. 
#![allow(dead_code)] -#![warn(clippy::missing_docs_in_private_items)] #![warn(clippy::too_many_lines)] // TODO(discord9): enable this lint to handle out of bound access diff --git a/src/flow/src/utils.rs b/src/flow/src/utils.rs index 8f28ed23c7..1a9879b996 100644 --- a/src/flow/src/utils.rs +++ b/src/flow/src/utils.rs @@ -213,7 +213,7 @@ impl KeyExpiryManager { let mut before = self.event_ts_to_key.split_off(&expire_time); std::mem::swap(&mut before, &mut self.event_ts_to_key); - Some(before.into_iter().flat_map(|(_ts, keys)| keys.into_iter())) + Some(before.into_values().flat_map(|keys| keys.into_iter())) } } @@ -409,8 +409,8 @@ impl Arrangement { // iter over batches that only have updates of `timestamp>now` and find the first non empty batch, then get the minimum timestamp in that batch for (_ts, batch) in self.spine.range((Bound::Excluded(now), Bound::Unbounded)) { let min_ts = batch - .iter() - .flat_map(|(_k, v)| v.iter().map(|(_, ts, _)| *ts).min()) + .values() + .flat_map(|v| v.iter().map(|(_, ts, _)| *ts).min()) .min(); if min_ts.is_some() { diff --git a/src/frontend/src/lib.rs b/src/frontend/src/lib.rs index 16321795b7..c170236073 100644 --- a/src/frontend/src/lib.rs +++ b/src/frontend/src/lib.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#![feature(assert_matches)] - pub mod error; pub mod events; pub mod frontend; diff --git a/src/index/src/inverted_index/format/reader/footer.rs b/src/index/src/inverted_index/format/reader/footer.rs index 866021c6e6..5d6ac922c9 100644 --- a/src/index/src/inverted_index/format/reader/footer.rs +++ b/src/index/src/inverted_index/format/reader/footer.rs @@ -173,7 +173,7 @@ impl InvertedIndexFooterReader { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use prost::Message; diff --git a/src/index/src/lib.rs b/src/index/src/lib.rs index c469acb8c3..7969ece891 100644 --- a/src/index/src/lib.rs +++ b/src/index/src/lib.rs @@ -13,7 +13,6 @@ // limitations under the License. #![feature(iter_partition_in_place)] -#![feature(assert_matches)] pub mod bitmap; pub mod bloom_filter; diff --git a/src/log-query/src/log_query.rs b/src/log-query/src/log_query.rs index c5b71c6efb..2955b7bded 100644 --- a/src/log-query/src/log_query.rs +++ b/src/log-query/src/log_query.rs @@ -199,57 +199,53 @@ impl TimeFilter { let mut start_dt = None; let mut end_dt = None; - if self.start.is_some() && self.end.is_none() && self.span.is_none() { - // Only 'start' is provided - let s = self.start.as_ref().unwrap(); - let (start, end_opt) = Self::parse_datetime(s)?; - if end_opt.is_none() { + match (&self.start, &self.end, &self.span) { + (Some(start), None, None) => { + let (start, end_opt) = Self::parse_datetime(start)?; + if end_opt.is_none() { + return Err(InvalidTimeFilterSnafu { + filter: self.clone(), + } + .build()); + } + start_dt = Some(start); + end_dt = end_opt; + } + (Some(start), Some(end), _) => { + // Both 'start' and 'end' are provided + let (start, _) = Self::parse_datetime(start)?; + let (end, _) = Self::parse_datetime(end)?; + start_dt = Some(start); + end_dt = Some(end); + } + (Some(start), None, Some(span)) => { + let (start, _) = Self::parse_datetime(start)?; + let span = Self::parse_span(span)?; + let end = start + span; + start_dt = 
Some(start); + end_dt = Some(end); + } + (None, Some(end), Some(span)) => { + let (end, _) = Self::parse_datetime(end)?; + let span = Self::parse_span(span)?; + let start = end - span; + start_dt = Some(start); + end_dt = Some(end); + } + (None, None, Some(span)) => { + let span = Self::parse_span(span)?; + let end = Utc::now(); + let start = end - span; + start_dt = Some(start); + end_dt = Some(end); + } + _ => { + // Exception return Err(InvalidTimeFilterSnafu { filter: self.clone(), } .build()); } - start_dt = Some(start); - end_dt = end_opt; - } else if self.start.is_some() && self.end.is_some() { - // Both 'start' and 'end' are provided - let (start, _) = Self::parse_datetime(self.start.as_ref().unwrap())?; - let (end, _) = Self::parse_datetime(self.end.as_ref().unwrap())?; - start_dt = Some(start); - end_dt = Some(end); - } else if self.span.is_some() && (self.start.is_some() || self.end.is_some()) { - // 'span' with 'start' or 'end' - let span = Self::parse_span(self.span.as_ref().unwrap())?; - if self.start.is_some() { - let (start, _) = Self::parse_datetime(self.start.as_ref().unwrap())?; - let end = start + span; - start_dt = Some(start); - end_dt = Some(end); - } else { - let (end, _) = Self::parse_datetime(self.end.as_ref().unwrap())?; - let start = end - span; - start_dt = Some(start); - end_dt = Some(end); - } - } else if self.span.is_some() && self.start.is_none() && self.end.is_none() { - // Only 'span' is provided - let span = Self::parse_span(self.span.as_ref().unwrap())?; - let end = Utc::now(); - let start = end - span; - start_dt = Some(start); - end_dt = Some(end); - } else if self.start.is_some() && self.span.is_some() && self.end.is_some() { - // All fields are provided; 'start' and 'end' take priority - let (start, _) = Self::parse_datetime(self.start.as_ref().unwrap())?; - let (end, _) = Self::parse_datetime(self.end.as_ref().unwrap())?; - start_dt = Some(start); - end_dt = Some(end); - } else { - // Exception - return 
Err(InvalidTimeFilterSnafu { - filter: self.clone(), - } - .build()); } // Validate that end is after start diff --git a/src/log-store/src/kafka/index/iterator.rs b/src/log-store/src/kafka/index/iterator.rs index 9ab350036d..ba8e7273d7 100644 --- a/src/log-store/src/kafka/index/iterator.rs +++ b/src/log-store/src/kafka/index/iterator.rs @@ -61,11 +61,7 @@ impl RegionWalRange { fn next_batch_size(&self) -> Option { if self.current_entry_id < self.end_entry_id { - Some( - self.end_entry_id - .checked_sub(self.current_entry_id) - .unwrap_or_default(), - ) + Some(self.end_entry_id.saturating_sub(self.current_entry_id)) } else { None } diff --git a/src/log-store/src/kafka/log_store.rs b/src/log-store/src/kafka/log_store.rs index 702e5bf319..e7fd06816d 100644 --- a/src/log-store/src/kafka/log_store.rs +++ b/src/log-store/src/kafka/log_store.rs @@ -550,7 +550,7 @@ fn check_termination(offset: i64, end_offset: i64) -> bool { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; diff --git a/src/log-store/src/kafka/util/record.rs b/src/log-store/src/kafka/util/record.rs index 1e291d9776..720f989139 100644 --- a/src/log-store/src/kafka/util/record.rs +++ b/src/log-store/src/kafka/util/record.rs @@ -306,7 +306,7 @@ pub(crate) fn maybe_emit_entry( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use super::*; diff --git a/src/log-store/src/lib.rs b/src/log-store/src/lib.rs index ec8207d5eb..c054fbc1d8 100644 --- a/src/log-store/src/lib.rs +++ b/src/log-store/src/lib.rs @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#![feature(io_error_more)] -#![feature(assert_matches)] - pub mod error; pub mod kafka; pub mod metrics; diff --git a/src/meta-client/src/client/cluster.rs b/src/meta-client/src/client/cluster.rs index 8da45aaa7a..976626b3a0 100644 --- a/src/meta-client/src/client/cluster.rs +++ b/src/meta-client/src/client/cluster.rs @@ -212,8 +212,8 @@ impl Inner { } } } - } else if let Err(err) = leader_provider.ask_leader().await { - return Err(err); + } else { + leader_provider.ask_leader().await?; } } diff --git a/src/meta-client/src/client/procedure.rs b/src/meta-client/src/client/procedure.rs index 93e37511d9..63f45c28fb 100644 --- a/src/meta-client/src/client/procedure.rs +++ b/src/meta-client/src/client/procedure.rs @@ -208,8 +208,8 @@ impl Inner { } } } - } else if let Err(err) = leader_provider.ask_leader().await { - return Err(err); + } else { + leader_provider.ask_leader().await?; } } diff --git a/src/meta-srv/src/gc/candidate.rs b/src/meta-srv/src/gc/candidate.rs index 05fc79ac52..2101318cdf 100644 --- a/src/meta-srv/src/gc/candidate.rs +++ b/src/meta-srv/src/gc/candidate.rs @@ -109,7 +109,7 @@ impl GcScheduler { } // Sort candidates by score in descending order and take top N - candidates.sort_by(|a, b| b.score.cmp(&a.score)); + candidates.sort_by_key(|a| std::cmp::Reverse(a.score)); let top_candidates: Vec = candidates .into_iter() .take(self.config.regions_per_table_threshold) diff --git a/src/meta-srv/src/gc/handler.rs b/src/meta-srv/src/gc/handler.rs index 105ddca58c..c62e927f89 100644 --- a/src/meta-srv/src/gc/handler.rs +++ b/src/meta-srv/src/gc/handler.rs @@ -346,7 +346,7 @@ impl GcScheduler { // Add to need_retry_regions since it failed combined_report .need_retry_regions - .extend(fast_list_regions.clone().into_iter()); + .extend(fast_list_regions.clone()); } } } diff --git a/src/meta-srv/src/handler.rs b/src/meta-srv/src/handler.rs index 58dd1a7c98..4b05db4e4c 100644 --- a/src/meta-srv/src/handler.rs +++ b/src/meta-srv/src/handler.rs @@ -870,7 +870,7 @@ 
impl HeartbeatHandlerGroupBuilderCustomizer for DefaultHeartbeatHandlerGroupBuil #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use std::time::Duration; @@ -972,7 +972,7 @@ mod tests { "RemapFlowPeerHandler", ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1009,7 +1009,7 @@ mod tests { "RemapFlowPeerHandler", ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1043,7 +1043,7 @@ mod tests { "RemapFlowPeerHandler", ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1077,7 +1077,7 @@ mod tests { "RemapFlowPeerHandler", ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1111,7 +1111,7 @@ mod tests { "RemapFlowPeerHandler", ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1145,7 +1145,7 @@ mod tests { ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1179,7 +1179,7 @@ mod tests { ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } @@ -1212,7 +1212,7 @@ 
mod tests { "RemapFlowPeerHandler", ]; assert_eq!(names.len(), handlers.len()); - for (handler, name) in handlers.iter().zip(names.into_iter()) { + for (handler, name) in handlers.iter().zip(names) { assert_eq!(handler.name, name); } } diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs index 0e87d4421a..70ce449bba 100644 --- a/src/meta-srv/src/lib.rs +++ b/src/meta-srv/src/lib.rs @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] #![feature(hash_set_entry)] #![feature(duration_constructors)] -#![feature(string_from_utf8_lossy_owned)] pub mod bootstrap; pub mod cache_invalidator; diff --git a/src/meta-srv/src/procedure/region_migration.rs b/src/meta-srv/src/procedure/region_migration.rs index b3797860b3..6563da63bc 100644 --- a/src/meta-srv/src/procedure/region_migration.rs +++ b/src/meta-srv/src/procedure/region_migration.rs @@ -924,7 +924,7 @@ impl Procedure for RegionMigrationProcedure { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use common_meta::distributed_time_constants::default_distributed_time_constants; diff --git a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs index 37cc0dfa96..b7e8315a77 100644 --- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs @@ -372,7 +372,7 @@ impl DowngradeLeaderRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use common_meta::key::table_route::TableRouteValue; diff --git a/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs b/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs index f3dc0ee661..9dee05373b 100644 --- 
a/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs @@ -93,7 +93,7 @@ impl PreFlushRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use store_api::storage::RegionId; diff --git a/src/meta-srv/src/procedure/region_migration/manager.rs b/src/meta-srv/src/procedure/region_migration/manager.rs index 70cba21b5f..d18458b08f 100644 --- a/src/meta-srv/src/procedure/region_migration/manager.rs +++ b/src/meta-srv/src/procedure/region_migration/manager.rs @@ -620,7 +620,7 @@ impl RegionMigrationManager { #[cfg(test)] mod test { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_meta::key::table_route::LogicalTableRouteValue; use common_meta::key::test_utils::new_test_table_info; diff --git a/src/meta-srv/src/procedure/region_migration/migration_start.rs b/src/meta-srv/src/procedure/region_migration/migration_start.rs index 17b577501e..b9d4372c47 100644 --- a/src/meta-srv/src/procedure/region_migration/migration_start.rs +++ b/src/meta-srv/src/procedure/region_migration/migration_start.rs @@ -187,7 +187,7 @@ impl RegionMigrationStart { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_meta::key::test_utils::new_test_table_info; use common_meta::peer::Peer; diff --git a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs index 8e2e015669..189ba89449 100644 --- a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs @@ -208,7 +208,7 @@ impl OpenCandidateRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use common_catalog::consts::MITO2_ENGINE; diff --git 
a/src/meta-srv/src/procedure/region_migration/test_util.rs b/src/meta-srv/src/procedure/region_migration/test_util.rs index 4e5624401e..269357bc0a 100644 --- a/src/meta-srv/src/procedure/region_migration/test_util.rs +++ b/src/meta-srv/src/procedure/region_migration/test_util.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs index 9e8545bb43..bab79a96bf 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs @@ -88,7 +88,7 @@ impl UpdateMetadata { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs index b3ee848fbe..2f0ed0fbe9 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs @@ -72,7 +72,7 @@ impl UpdateMetadata { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs index db70e3e166..d5aa8699ec 
100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs @@ -224,7 +224,7 @@ impl UpdateMetadata { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_meta::key::test_utils::new_test_table_info; use common_meta::peer::Peer; diff --git a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs index 39ff1ed741..4c60215cf7 100644 --- a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs @@ -353,7 +353,7 @@ impl UpgradeCandidateRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use common_meta::key::table_route::TableRouteValue; diff --git a/src/meta-srv/src/procedure/region_migration/utils.rs b/src/meta-srv/src/procedure/region_migration/utils.rs index df2e8014e2..91af7ccf17 100644 --- a/src/meta-srv/src/procedure/region_migration/utils.rs +++ b/src/meta-srv/src/procedure/region_migration/utils.rs @@ -242,7 +242,7 @@ pub async fn analyze_region_migration_task( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use std::time::Duration; diff --git a/src/meta-srv/src/procedure/repartition/group.rs b/src/meta-srv/src/procedure/repartition/group.rs index 2ef764b634..f0cb1c4dd0 100644 --- a/src/meta-srv/src/procedure/repartition/group.rs +++ b/src/meta-srv/src/procedure/repartition/group.rs @@ -597,7 +597,7 @@ pub(crate) trait State: Sync + Send + Debug { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use common_meta::key::TableMetadataManager; diff --git 
a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs index 18ecfd4bb2..59de569c13 100644 --- a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs @@ -433,7 +433,7 @@ impl EnterStagingRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::time::Duration; use common_meta::instruction::StagingPartitionDirective; diff --git a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs index 72de2f2934..8b8b5208b4 100644 --- a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs +++ b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs @@ -207,7 +207,7 @@ impl State for RepartitionStart { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_meta::peer::Peer; use common_meta::rpc::router::{Region, RegionRoute}; diff --git a/src/meta-srv/src/procedure/repartition/group/sync_region.rs b/src/meta-srv/src/procedure/repartition/group/sync_region.rs index 5e842da2fb..dcd58c21e9 100644 --- a/src/meta-srv/src/procedure/repartition/group/sync_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/sync_region.rs @@ -338,7 +338,7 @@ impl SyncRegion { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_meta::peer::Peer; use common_meta::rpc::router::{Region, RegionRoute}; diff --git a/src/meta-srv/src/procedure/wal_prune.rs b/src/meta-srv/src/procedure/wal_prune.rs index 0897441647..f5e74ef543 100644 --- a/src/meta-srv/src/procedure/wal_prune.rs +++ b/src/meta-srv/src/procedure/wal_prune.rs @@ -193,7 +193,7 @@ impl Procedure for WalPruneProcedure { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use 
std::assert_matches; use common_wal::maybe_skip_kafka_integration_test; use common_wal::test_util::get_kafka_endpoints; diff --git a/src/meta-srv/src/procedure/wal_prune/manager.rs b/src/meta-srv/src/procedure/wal_prune/manager.rs index e5f45d229a..1f1ab2b2f0 100644 --- a/src/meta-srv/src/procedure/wal_prune/manager.rs +++ b/src/meta-srv/src/procedure/wal_prune/manager.rs @@ -287,7 +287,7 @@ impl WalPruneManager { #[cfg(test)] mod test { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::time::Duration; use common_meta::key::topic_name::TopicNameKey; diff --git a/src/meta-srv/src/region/supervisor.rs b/src/meta-srv/src/region/supervisor.rs index 90e08992f6..05716e60c8 100644 --- a/src/meta-srv/src/region/supervisor.rs +++ b/src/meta-srv/src/region/supervisor.rs @@ -847,7 +847,7 @@ impl RegionSupervisor { #[cfg(test)] pub(crate) mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::Duration; diff --git a/src/meta-srv/src/service/mailbox.rs b/src/meta-srv/src/service/mailbox.rs index 5e569e6d4d..8b37eeaad5 100644 --- a/src/meta-srv/src/service/mailbox.rs +++ b/src/meta-srv/src/service/mailbox.rs @@ -214,7 +214,7 @@ pub trait Mailbox: Send + Sync { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_time::util::current_time_millis; use tokio::sync::watch; diff --git a/src/meta-srv/src/state.rs b/src/meta-srv/src/state.rs index e5edc5f169..12eb708b5c 100644 --- a/src/meta-srv/src/state.rs +++ b/src/meta-srv/src/state.rs @@ -117,7 +117,7 @@ pub fn become_follower() -> impl FnOnce(&State) -> State { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::state::{FollowerState, LeaderState, State, become_follower, become_leader}; diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index ba90ca960d..c2c39951cc 
100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -575,7 +575,7 @@ struct MetricEngineInner { #[cfg(test)] mod test { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use common_telemetry::info; diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs index 2a3c26c80c..8122cdc958 100644 --- a/src/metric-engine/src/engine/bulk_insert.rs +++ b/src/metric-engine/src/engine/bulk_insert.rs @@ -380,7 +380,7 @@ fn record_batch_to_ipc(record_batch: &RecordBatch) -> Result<(Bytes, Bytes, Byte #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::sync::Arc; use api::v1::ArrowIpc; diff --git a/src/metric-engine/src/engine/create/extract_new_columns.rs b/src/metric-engine/src/engine/create/extract_new_columns.rs index b3eabd5706..9d1de9ebb2 100644 --- a/src/metric-engine/src/engine/create/extract_new_columns.rs +++ b/src/metric-engine/src/engine/create/extract_new_columns.rs @@ -52,7 +52,7 @@ pub fn extract_new_columns<'a>( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::{HashMap, HashSet}; use api::v1::SemanticType; diff --git a/src/metric-engine/src/lib.rs b/src/metric-engine/src/lib.rs index b93029f2f4..557baba25a 100644 --- a/src/metric-engine/src/lib.rs +++ b/src/metric-engine/src/lib.rs @@ -50,7 +50,7 @@ //! └─────────────────────┘ //! 
``` -#![feature(assert_matches)] +#![recursion_limit = "256"] mod batch_modifier; pub mod config; diff --git a/src/metric-engine/src/repeated_task.rs b/src/metric-engine/src/repeated_task.rs index fa382d7844..4354511039 100644 --- a/src/metric-engine/src/repeated_task.rs +++ b/src/metric-engine/src/repeated_task.rs @@ -85,7 +85,7 @@ impl TaskFunction for FlushMetadataRegionTask { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::time::Duration; use store_api::region_engine::{RegionEngine, RegionManifestInfo}; diff --git a/src/metric-engine/src/test_util.rs b/src/metric-engine/src/test_util.rs index d81240d47f..d3e929cf63 100644 --- a/src/metric-engine/src/test_util.rs +++ b/src/metric-engine/src/test_util.rs @@ -167,7 +167,7 @@ impl TestEnv { primary_key: vec![], options: [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())] .into_iter() - .chain(options.into_iter()) + .chain(options) .collect(), table_dir: table_dir.to_string(), path_type: PathType::Bare, // Use Bare path type for engine regions diff --git a/src/mito-codec/src/key_values.rs b/src/mito-codec/src/key_values.rs index 5afacc3718..d66110bacf 100644 --- a/src/mito-codec/src/key_values.rs +++ b/src/mito-codec/src/key_values.rs @@ -431,11 +431,9 @@ mod tests { values: &[Option], ) { assert_eq!(num_rows, kvs.num_rows()); - let mut expect_seq = START_SEQ; let expect_ts = ValueRef::Int64(ts); - for kv in kvs.iter() { + for (expect_seq, kv) in (START_SEQ..).zip(kvs.iter()) { assert_eq!(expect_seq, kv.sequence()); - expect_seq += 1; assert_eq!(OpType::Put, kv.op_type); assert_eq!(keys.len(), kv.num_primary_keys()); assert_eq!(values.len(), kv.num_fields()); diff --git a/src/mito2/src/cache/index.rs b/src/mito2/src/cache/index.rs index 74f16fdb71..a404978714 100644 --- a/src/mito2/src/cache/index.rs +++ b/src/mito2/src/cache/index.rs @@ -229,14 +229,14 @@ where } if !cache_miss_range.is_empty() { let pages = load(cache_miss_range).await?; - for (i, 
page) in cache_miss_idx.into_iter().zip(pages.into_iter()) { + for (i, page) in cache_miss_idx.into_iter().zip(pages) { let page_key = page_keys[i]; metrics.page_bytes += page.len() as u64; data[i] = page.clone(); self.put_page(key, page_key, page.clone()); } } - let buffer = Buffer::from_iter(data.into_iter()); + let buffer = Buffer::from_iter(data); Ok(( buffer .slice(PageKey::calculate_range(offset, size, self.page_size)) diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index ba6957fdae..a43fa8a0a6 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -614,15 +614,12 @@ async fn find_dynamic_options( region_options: &crate::region::options::RegionOptions, schema_metadata_manager: &SchemaMetadataManagerRef, ) -> Result<(crate::region::options::CompactionOptions, TimeToLive)> { - if region_options.compaction_override && region_options.ttl.is_some() { + if let (true, Some(ttl)) = (region_options.compaction_override, region_options.ttl) { debug!( "Use region options directly for table {}: compaction={:?}, ttl={:?}", table_id, region_options.compaction, region_options.ttl ); - return Ok(( - region_options.compaction.clone(), - region_options.ttl.unwrap(), - )); + return Ok((region_options.compaction.clone(), ttl)); } let db_options = tokio::time::timeout( @@ -633,12 +630,12 @@ async fn find_dynamic_options( .context(TimeoutSnafu)? 
.context(GetSchemaMetadataSnafu)?; - let ttl = if region_options.ttl.is_some() { + let ttl = if let Some(ttl) = region_options.ttl { debug!( "Use region TTL directly for table {}: ttl={:?}", table_id, region_options.ttl ); - region_options.ttl.unwrap() + ttl } else { db_options .as_ref() @@ -980,7 +977,7 @@ struct PendingCompaction { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::time::Duration; use api::v1::region::StrictWindow; diff --git a/src/mito2/src/compaction/run.rs b/src/mito2/src/compaction/run.rs index a7e5ca490c..cf1cedd29a 100644 --- a/src/mito2/src/compaction/run.rs +++ b/src/mito2/src/compaction/run.rs @@ -309,7 +309,7 @@ where pub fn reduce_runs(mut runs: Vec>) -> Vec { assert!(runs.len() > 1); // sort runs by size - runs.sort_unstable_by(|a, b| a.size.cmp(&b.size)); + runs.sort_unstable_by_key(|a| a.size); // limit max probe runs to 100 let probe_end = runs.len().min(100); let mut min_penalty = usize::MAX; diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index 9012457f75..952d8771d8 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -588,8 +588,8 @@ mod tests { assert_eq!(*overlapping, actual_window.overlapping); let mut file_ranges = actual_window .files - .iter() - .flat_map(|(_, f)| { + .values() + .flat_map(|f| { f.files().iter().map(|f| { let (s, e) = f.time_range(); (s.value(), e.value()) diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index e710e08688..b8ba06f0b9 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; diff --git a/src/mito2/src/engine/apply_staging_manifest_test.rs b/src/mito2/src/engine/apply_staging_manifest_test.rs index 6904fbd624..401e6572a2 100644 --- a/src/mito2/src/engine/apply_staging_manifest_test.rs +++ b/src/mito2/src/engine/apply_staging_manifest_test.rs @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; -use std::fs; use std::sync::Arc; +use std::{assert_matches, fs}; use api::v1::Rows; use common_function::utils::partition_expr_version; diff --git a/src/mito2/src/engine/catchup_test.rs b/src/mito2/src/engine/catchup_test.rs index 0c7d058e4d..718462e8a8 100644 --- a/src/mito2/src/engine/catchup_test.rs +++ b/src/mito2/src/engine/catchup_test.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::collections::HashMap; use api::v1::Rows; diff --git a/src/mito2/src/engine/copy_region_from_test.rs b/src/mito2/src/engine/copy_region_from_test.rs index 75580d5c0b..e9f8398302 100644 --- a/src/mito2/src/engine/copy_region_from_test.rs +++ b/src/mito2/src/engine/copy_region_from_test.rs @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::assert_matches::assert_matches; -use std::fs; use std::sync::Arc; +use std::{assert_matches, fs}; use api::v1::Rows; use common_error::ext::ErrorExt; diff --git a/src/mito2/src/engine/remap_manifests_test.rs b/src/mito2/src/engine/remap_manifests_test.rs index e3538401aa..339896450c 100644 --- a/src/mito2/src/engine/remap_manifests_test.rs +++ b/src/mito2/src/engine/remap_manifests_test.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use api::v1::Rows; use datatypes::value::Value; diff --git a/src/mito2/src/engine/staging_test.rs b/src/mito2/src/engine/staging_test.rs index 2e9c5045ff..e47a77bea0 100644 --- a/src/mito2/src/engine/staging_test.rs +++ b/src/mito2/src/engine/staging_test.rs @@ -14,10 +14,9 @@ //! Integration tests for staging state functionality. -use std::assert_matches::assert_matches; -use std::fs; use std::sync::Arc; use std::time::Duration; +use std::{assert_matches, fs}; use api::v1::Rows; use common_error::ext::ErrorExt; diff --git a/src/mito2/src/engine/sync_test.rs b/src/mito2/src/engine/sync_test.rs index 65b997e498..6c3b91c130 100644 --- a/src/mito2/src/engine/sync_test.rs +++ b/src/mito2/src/engine/sync_test.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use api::v1::{Rows, SemanticType}; use common_error::ext::ErrorExt; diff --git a/src/mito2/src/lib.rs b/src/mito2/src/lib.rs index a15711b34a..7d43685ded 100644 --- a/src/mito2/src/lib.rs +++ b/src/mito2/src/lib.rs @@ -16,8 +16,6 @@ //! //! Mito is the a region engine to store timeseries data. 
-#![feature(assert_matches)] -#![feature(int_roundings)] #![feature(debug_closure_helpers)] #![feature(duration_constructors)] diff --git a/src/mito2/src/manifest/tests/checkpoint.rs b/src/mito2/src/manifest/tests/checkpoint.rs index 718755cd46..64547f45aa 100644 --- a/src/mito2/src/manifest/tests/checkpoint.rs +++ b/src/mito2/src/manifest/tests/checkpoint.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::sync::Arc; use std::time::Duration; diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index 3020c9ecf4..26ab96c779 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -656,7 +656,7 @@ impl MitoRegion { .unwrap_or_default(); let files = manifest_files .into_iter() - .chain(staging_files.into_iter()) + .chain(staging_files) .collect::>(); files diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index d089493f81..9aa6454f75 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -881,7 +881,7 @@ impl RegionLoadCacheTask { } // Sorts files by max timestamp in descending order to loads latest files first - files_to_download.sort_by(|a, b| b.2.cmp(&a.2)); + files_to_download.sort_by_key(|b| std::cmp::Reverse(b.2)); let total_files = files_to_download.len() as i64; @@ -1011,7 +1011,7 @@ async fn preload_parquet_meta_cache_for_files( let allow_direct_load = matches!(object_store.info().scheme(), object_store::Scheme::Fs); // Sort by time range so we can prefer preloading newer files first. 
- files.sort_by(|a, b| b.meta_ref().time_range.1.cmp(&a.meta_ref().time_range.1)); + files.sort_by_key(|b| std::cmp::Reverse(b.meta_ref().time_range.1)); let mut loaded = 0usize; for file_handle in files { diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 88aebfc001..31a96eecea 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -1083,8 +1083,8 @@ impl IndexBuildScheduler { /// Find the next task which has the highest priority to run. fn find_next_task(&self) -> Option { self.region_status - .iter() - .filter_map(|(_, status)| status.pending_tasks.peek()) + .values() + .filter_map(|status| status.pending_tasks.peek()) .max() .cloned() } diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs index 8822882c5d..d8a44dfff5 100644 --- a/src/mito2/src/sst/parquet/row_group.rs +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -220,7 +220,7 @@ pub(crate) fn compute_total_range_size(ranges: &[Range]) -> (u64, u64) { let gap = MERGE_GAP as u64; let mut sorted_ranges = ranges.to_vec(); - sorted_ranges.sort_unstable_by(|a, b| a.start.cmp(&b.start)); + sorted_ranges.sort_unstable_by_key(|a| a.start); let mut total_size_aligned = 0; let mut total_size_unaligned = 0; diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index 2366db7897..e1f121699e 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -343,8 +343,7 @@ impl Inserter { .convert(request) .await?; - let table_infos = - HashMap::from_iter([(table_info.table_id(), table_info.clone())].into_iter()); + let table_infos = HashMap::from_iter([(table_info.table_id(), table_info.clone())]); self.do_request(inserts, &table_infos, &ctx).await } @@ -360,8 +359,7 @@ impl Inserter { .convert(insert, ctx, statement_executor) .await?; - let table_infos = - HashMap::from_iter([(table_info.table_id(), table_info.clone())].into_iter()); + let table_infos = HashMap::from_iter([(table_info.table_id(), 
table_info.clone())]); self.do_request(inserts, &table_infos, ctx).await } diff --git a/src/operator/src/lib.rs b/src/operator/src/lib.rs index e31ad87ca1..5e723faeb5 100644 --- a/src/operator/src/lib.rs +++ b/src/operator/src/lib.rs @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] -#![feature(if_let_guard)] - mod bulk_insert; pub mod delete; pub mod error; diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index b87be42ff4..f3931255d0 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -933,7 +933,7 @@ impl Inserter for InserterImpl { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use common_time::range::TimestampRange; diff --git a/src/partition/src/lib.rs b/src/partition/src/lib.rs index 647210d1d5..3bfd7297cc 100644 --- a/src/partition/src/lib.rs +++ b/src/partition/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] //! Structs and traits for partitioning rule. 
pub mod cache; diff --git a/src/partition/src/multi_dim.rs b/src/partition/src/multi_dim.rs index 7b1f7aa3dd..8825c6de59 100644 --- a/src/partition/src/multi_dim.rs +++ b/src/partition/src/multi_dim.rs @@ -356,7 +356,7 @@ impl PartitionRule for MultiDimPartitionRule { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use super::*; use crate::error::{self, Error}; diff --git a/src/pipeline/src/etl/ctx_req.rs b/src/pipeline/src/etl/ctx_req.rs index 23873cfdf1..0bfac33dac 100644 --- a/src/pipeline/src/etl/ctx_req.rs +++ b/src/pipeline/src/etl/ctx_req.rs @@ -223,7 +223,7 @@ impl ContextReq { } pub fn all_req(self) -> impl Iterator { - self.req.into_iter().flat_map(|(_, req)| req) + self.req.into_values().flatten() } pub fn ref_all_req(&self) -> impl Iterator { diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 6694049253..69bdbe0f89 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -527,7 +527,7 @@ impl DissectProcessor { }; for (name, mut values) in appends { - values.sort_by(|a, b| a.1.cmp(&b.1)); + values.sort_by_key(|a| a.1); let value = values.into_iter().map(|(a, _)| a).join(sep); map.push((name, VrlValue::Bytes(Bytes::from(value)))); } diff --git a/src/pipeline/src/etl/processor/filter.rs b/src/pipeline/src/etl/processor/filter.rs index a700a8f2d3..d1820ba8ff 100644 --- a/src/pipeline/src/etl/processor/filter.rs +++ b/src/pipeline/src/etl/processor/filter.rs @@ -158,20 +158,18 @@ impl Processor for FilterProcessor { for field in self.fields.iter() { let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; let index = field.input_field(); - match val.get(index) { - Some(VrlValue::Bytes(b)) => { + if let Some(v) = val.get(index) { + if let VrlValue::Bytes(b) = v { if self.match_target(&String::from_utf8_lossy(b)) { return Ok(VrlValue::Null); } - } - Some(v) => { + } else { return ProcessorExpectStringSnafu { 
processor: self.kind(), v: v.clone(), } .fail(); } - None => {} } } diff --git a/src/promql/src/extension_plan/range_manipulate.rs b/src/promql/src/extension_plan/range_manipulate.rs index 6d57de62c6..ac03ff5e79 100644 --- a/src/promql/src/extension_plan/range_manipulate.rs +++ b/src/promql/src/extension_plan/range_manipulate.rs @@ -753,7 +753,7 @@ impl RangeManipulateStream { if ts <= curr_ts { range_end = range_end.max(cursor); } else { - range_start_index = range_start_index.checked_sub(1usize).unwrap_or_default(); + range_start_index = range_start_index.saturating_sub(1usize); break; } cursor += 1; diff --git a/src/promql/src/extension_plan/series_divide.rs b/src/promql/src/extension_plan/series_divide.rs index 4a0b32f4e3..19385b7a97 100644 --- a/src/promql/src/extension_plan/series_divide.rs +++ b/src/promql/src/extension_plan/series_divide.rs @@ -632,7 +632,7 @@ impl SeriesDivideStream { let tags = TagIdentifier::try_new(batch, &self.tag_indices)?; // check if the first row is the same with last batch's last row - if resumed_batch_index > self.inspect_start.checked_sub(1).unwrap_or_default() { + if resumed_batch_index > self.inspect_start.saturating_sub(1) { let last_batch = &self.buffer[resumed_batch_index - 1]; let last_row = last_batch.num_rows() - 1; let last_tags = TagIdentifier::try_new(last_batch, &self.tag_indices)?; diff --git a/src/promql/src/functions/extrapolate_rate.rs b/src/promql/src/functions/extrapolate_rate.rs index 126a506bc9..8c3ab88776 100644 --- a/src/promql/src/functions/extrapolate_rate.rs +++ b/src/promql/src/functions/extrapolate_rate.rs @@ -96,7 +96,7 @@ impl ExtrapolatedRate() .unwrap() - .value(0) as i64; + .value(0); Ok(Self::new(range_length)) } diff --git a/src/promql/src/functions/quantile.rs b/src/promql/src/functions/quantile.rs index 93fc632d68..35f0c1aa81 100644 --- a/src/promql/src/functions/quantile.rs +++ b/src/promql/src/functions/quantile.rs @@ -216,7 +216,7 @@ fn quantile_with_scratch(values: &[f64], quantile: f64, 
scratch: &mut Vec) let length = scratch.len(); let rank = quantile * (length - 1) as f64; - let lower_index = 0.max(rank.floor() as usize); + let lower_index = rank.floor() as usize; let upper_index = (length - 1).min(lower_index + 1); let weight = rank - rank.floor(); diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs index c660d1e19a..b35b03875e 100644 --- a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs +++ b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs @@ -262,7 +262,7 @@ where tasks.push(task); } - let size = futures::future::try_join_all(tasks.into_iter()) + let size = futures::future::try_join_all(tasks) .await .into_iter() .flatten() diff --git a/src/query/src/dist_plan/analyzer/utils.rs b/src/query/src/dist_plan/analyzer/utils.rs index e9205e33f4..dd1ad867e5 100644 --- a/src/query/src/dist_plan/analyzer/utils.rs +++ b/src/query/src/dist_plan/analyzer/utils.rs @@ -89,7 +89,7 @@ pub fn patch_batch_timezone( let patched_columns: Vec = expected_schema .fields() .iter() - .zip(columns.into_iter()) + .zip(columns) .map(|(expected_field, column)| { let expected_type = expected_field.data_type(); let actual_type = column.data_type(); diff --git a/src/query/src/optimizer/parallelize_scan.rs b/src/query/src/optimizer/parallelize_scan.rs index 171fc6e919..dd2ba06290 100644 --- a/src/query/src/optimizer/parallelize_scan.rs +++ b/src/query/src/optimizer/parallelize_scan.rs @@ -139,7 +139,7 @@ impl ParallelizeScan { } // Sort ranges by number of rows in descending order. 
- ranges.sort_by(|a, b| b.num_rows.cmp(&a.num_rows)); + ranges.sort_by_key(|b| std::cmp::Reverse(b.num_rows)); let mut partition_ranges = vec![vec![]; expected_partition_num]; #[derive(Eq, PartialEq)] diff --git a/src/query/src/optimizer/scan_hint.rs b/src/query/src/optimizer/scan_hint.rs index da70813404..89e3afaed0 100644 --- a/src/query/src/optimizer/scan_hint.rs +++ b/src/query/src/optimizer/scan_hint.rs @@ -375,10 +375,10 @@ impl TreeNodeVisitor<'_> for ScanHintVisitor { LogicalPlan::Filter(_) => { self.vector_search.on_filter_exit(); } - LogicalPlan::Subquery(_) | LogicalPlan::SubqueryAlias(_) => { - if is_branching_for_vector(_node) { - self.vector_search.on_branching_exit(); - } + LogicalPlan::Subquery(_) | LogicalPlan::SubqueryAlias(_) + if is_branching_for_vector(_node) => + { + self.vector_search.on_branching_exit(); } _ if _node.inputs().len() > 1 => { self.vector_search.on_branching_exit(); diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index 23d654d2b6..640994dea2 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -523,8 +523,7 @@ impl PromPlanner { .use_tsid .then_some(DfExpr::Column(Column::from_name( DATA_SCHEMA_TSID_COLUMN_NAME, - ))) - .into_iter(), + ))), ) .chain(Some(self.create_time_index_column_expr()?)); @@ -1313,8 +1312,7 @@ impl PromPlanner { .use_tsid .then_some(DfExpr::Column(Column::new_unqualified( DATA_SCHEMA_TSID_COLUMN_NAME, - ))) - .into_iter(), + ))), ) .chain(Some(self.create_time_index_column_expr()?)) .collect::>(); @@ -1828,15 +1826,10 @@ impl PromPlanner { .iter() .map(|tag| DfExpr::Column(Column::from_name(tag))), ) - .chain( - self.ctx - .use_tsid - .then_some(DfExpr::Column(Column::new( - Some(table_ref.clone()), - DATA_SCHEMA_TSID_COLUMN_NAME.to_string(), - ))) - .into_iter(), - ) + .chain(self.ctx.use_tsid.then_some(DfExpr::Column(Column::new( + Some(table_ref.clone()), + DATA_SCHEMA_TSID_COLUMN_NAME.to_string(), + )))) .chain(Some(DfExpr::Alias(Alias { 
expr: Box::new(DfExpr::Cast(Cast { expr: Box::new(self.create_time_index_column_expr()?), @@ -1874,8 +1867,7 @@ impl PromPlanner { .use_tsid .then_some(DfExpr::Column(Column::from_name( DATA_SCHEMA_TSID_COLUMN_NAME, - ))) - .into_iter(), + ))), ) .chain(Some(self.create_time_index_column_expr()?)) .collect::>(); diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs index 49e26c92ca..74f8b13fea 100644 --- a/src/query/src/sql.rs +++ b/src/query/src/sql.rs @@ -215,10 +215,10 @@ pub async fn show_databases( fn replace_column_in_expr(expr: &mut sqlparser::ast::Expr, from_column: &str, to_column: &str) { let _ = visit_expressions_mut(expr, |e| { match e { - sqlparser::ast::Expr::Identifier(ident) => { - if ident.value.eq_ignore_ascii_case(from_column) { - ident.value = to_column.to_string(); - } + sqlparser::ast::Expr::Identifier(ident) + if ident.value.eq_ignore_ascii_case(from_column) => + { + ident.value = to_column.to_string(); } sqlparser::ast::Expr::CompoundIdentifier(idents) => { if let Some(last) = idents.last_mut() @@ -748,23 +748,17 @@ pub fn show_variable(stmt: ShowVariables, query_ctx: QueryContextRef) -> Result< .pg_intervalstyle_format(); style.to_string() } - "MAX_EXECUTION_TIME" => { - if query_ctx.channel() == Channel::Mysql { + "MAX_EXECUTION_TIME" + if query_ctx.channel() == Channel::Mysql => { query_ctx.query_timeout_as_millis().to_string() - } else { - return UnsupportedVariableSnafu { name: variable }.fail(); } - } - "STATEMENT_TIMEOUT" => { + "STATEMENT_TIMEOUT" // Add time units to postgres query timeout display. 
- if query_ctx.channel() == Channel::Postgres { + if query_ctx.channel() == Channel::Postgres => { let mut timeout = query_ctx.query_timeout_as_millis().to_string(); timeout.push_str("ms"); timeout - } else { - return UnsupportedVariableSnafu { name: variable }.fail(); } - } _ => return UnsupportedVariableSnafu { name: variable }.fail(), }; let schema = Arc::new(Schema::new(vec![ColumnSchema::new( diff --git a/src/query/src/window_sort.rs b/src/query/src/window_sort.rs index b667b9c3b0..83feee6f30 100644 --- a/src/query/src/window_sort.rs +++ b/src/query/src/window_sort.rs @@ -1198,7 +1198,7 @@ fn split_overlapping_ranges(ranges: &[PartitionRange]) -> BTreeMap (Option<&str>, Option<&str>) #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_base::secrets::ExposeSecret; diff --git a/src/servers/src/http/header.rs b/src/servers/src/http/header.rs index ce58e15e98..785901ad7b 100644 --- a/src/servers/src/http/header.rs +++ b/src/servers/src/http/header.rs @@ -170,13 +170,11 @@ pub fn collect_plan_metrics(plan: &Arc, maps: &mut [&mut Hash MetricValue::Gauge { name, gauge } => { collect_into_maps(name, gauge.value() as u64, maps); } - MetricValue::Time { name, time } => { - if name.starts_with(GREPTIME_EXEC_PREFIX) { - // override - maps.iter_mut().for_each(|map| { - map.insert(name.to_string(), time.value() as u64); - }); - } + MetricValue::Time { name, time } if name.starts_with(GREPTIME_EXEC_PREFIX) => { + // override + maps.iter_mut().for_each(|map| { + map.insert(name.to_string(), time.value() as u64); + }); } _ => {} }); diff --git a/src/servers/src/http/prometheus.rs b/src/servers/src/http/prometheus.rs index 488a49df99..60ad780beb 100644 --- a/src/servers/src/http/prometheus.rs +++ b/src/servers/src/http/prometheus.rs @@ -1118,17 +1118,17 @@ fn collect_metric_names(expr: &PromqlExpr, metric_names: &mut HashSet) { match expr { PromqlExpr::Aggregate(AggregateExpr { modifier, expr, .. 
}) => { match modifier { - Some(LabelModifier::Include(labels)) => { - if !labels.labels.contains(&METRIC_NAME.to_string()) { - metric_names.clear(); - return; - } + Some(LabelModifier::Include(labels)) + if !labels.labels.contains(&METRIC_NAME.to_string()) => + { + metric_names.clear(); + return; } - Some(LabelModifier::Exclude(labels)) => { - if labels.labels.contains(&METRIC_NAME.to_string()) { - metric_names.clear(); - return; - } + Some(LabelModifier::Exclude(labels)) + if labels.labels.contains(&METRIC_NAME.to_string()) => + { + metric_names.clear(); + return; } _ => {} } diff --git a/src/servers/src/http/result/prometheus_resp.rs b/src/servers/src/http/result/prometheus_resp.rs index 9ecbe671b4..6f2b115686 100644 --- a/src/servers/src/http/result/prometheus_resp.rs +++ b/src/servers/src/http/result/prometheus_resp.rs @@ -204,10 +204,10 @@ impl PrometheusJsonResponse { for (i, column) in batches.schema().column_schemas().iter().enumerate() { match column.data_type { - ConcreteDataType::Timestamp(datatypes::types::TimestampType::Millisecond(_)) => { - if timestamp_column_index.is_none() { - timestamp_column_index = Some(i); - } + ConcreteDataType::Timestamp(datatypes::types::TimestampType::Millisecond(_)) + if timestamp_column_index.is_none() => + { + timestamp_column_index = Some(i); } // Treat all value types as field ConcreteDataType::Float32(_) @@ -219,10 +219,10 @@ impl PrometheusJsonResponse { | ConcreteDataType::UInt8(_) | ConcreteDataType::UInt16(_) | ConcreteDataType::UInt32(_) - | ConcreteDataType::UInt64(_) => { - if first_field_column_index.is_none() { - first_field_column_index = Some(i); - } + | ConcreteDataType::UInt64(_) + if first_field_column_index.is_none() => + { + first_field_column_index = Some(i); } ConcreteDataType::String(_) => { tag_column_indices.push(i); diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index c44c674b9e..41d73b109f 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -12,11 +12,8 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] #![feature(try_blocks)] #![feature(exclusive_wrapper)] -#![feature(if_let_guard)] -#![feature(box_patterns)] use datafusion_expr::LogicalPlan; use datatypes::schema::Schema; diff --git a/src/servers/src/mysql/handler.rs b/src/servers/src/mysql/handler.rs index dd67012a52..2ce229fea4 100644 --- a/src/servers/src/mysql/handler.rs +++ b/src/servers/src/mysql/handler.rs @@ -674,10 +674,8 @@ fn convert_param_value_to_string(param: &ParamValue) -> String { fn replace_params(params: Vec, query: String) -> String { let mut query = query; - let mut index = 1; - for param in params { + for (index, param) in (1..).zip(params) { query = query.replace(&format_placeholder(index), &param); - index += 1; } query } diff --git a/src/servers/tests/mysql/mysql_server_test.rs b/src/servers/tests/mysql/mysql_server_test.rs index 3aa7b98f39..c53ff34d45 100644 --- a/src/servers/tests/mysql/mysql_server_test.rs +++ b/src/servers/tests/mysql/mysql_server_test.rs @@ -514,8 +514,7 @@ async fn test_prepare_all_type( connection: &mut Conn, ) { let mut column_index = 0; - let mut stmt_id = 1; - for schema in column_schemas { + for (stmt_id, schema) in (1..).zip(column_schemas) { let query = format!( "SELECT {} FROM all_datatypes WHERE {} = ?", schema.name, schema.name @@ -523,7 +522,6 @@ async fn test_prepare_all_type( let statement = connection.prep(query).await; let statement = statement.unwrap(); assert_eq!(stmt_id, statement.id()); - stmt_id += 1; let vector_ref = columns.get(column_index).unwrap(); for vector_index in 0..vector_ref.len() { diff --git a/src/sql/src/lib.rs b/src/sql/src/lib.rs index b2b151d439..e8c6bdf8ef 100644 --- a/src/sql/src/lib.rs +++ b/src/sql/src/lib.rs @@ -13,8 +13,6 @@ // limitations under the License.
#![feature(box_patterns)] -#![feature(assert_matches)] -#![feature(if_let_guard)] pub mod ast; pub mod dialect; diff --git a/src/sql/src/parsers/alter_parser.rs b/src/sql/src/parsers/alter_parser.rs index df89f90922..e5e1575a20 100644 --- a/src/sql/src/parsers/alter_parser.rs +++ b/src/sql/src/parsers/alter_parser.rs @@ -677,7 +677,7 @@ fn parse_string_option_names(parser: &mut Parser) -> std::result::Result { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::parser::{ParseOptions, ParserContext}; diff --git a/src/sql/src/parsers/copy_parser.rs b/src/sql/src/parsers/copy_parser.rs index d975d884f6..9a2eddcc78 100644 --- a/src/sql/src/parsers/copy_parser.rs +++ b/src/sql/src/parsers/copy_parser.rs @@ -267,7 +267,7 @@ impl ParserContext<'_> { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use sqlparser::ast::{Ident, ObjectName}; diff --git a/src/sql/src/parsers/create_parser.rs b/src/sql/src/parsers/create_parser.rs index d69a1af61d..b83c2032db 100644 --- a/src/sql/src/parsers/create_parser.rs +++ b/src/sql/src/parsers/create_parser.rs @@ -1242,7 +1242,7 @@ fn ensure_partition_columns_defined<'a>( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use std::collections::HashMap; use common_catalog::consts::FILE_ENGINE; diff --git a/src/sql/src/parsers/delete_parser.rs b/src/sql/src/parsers/delete_parser.rs index 3f13d18f97..be1a93b6c6 100644 --- a/src/sql/src/parsers/delete_parser.rs +++ b/src/sql/src/parsers/delete_parser.rs @@ -43,7 +43,7 @@ impl ParserContext<'_> { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use super::*; use crate::dialect::GreptimeDbDialect; diff --git a/src/sql/src/parsers/insert_parser.rs b/src/sql/src/parsers/insert_parser.rs index a7d5f311f6..d121181774 100644 --- 
a/src/sql/src/parsers/insert_parser.rs +++ b/src/sql/src/parsers/insert_parser.rs @@ -64,7 +64,7 @@ impl ParserContext<'_> { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use super::*; use crate::dialect::GreptimeDbDialect; diff --git a/src/sql/src/parsers/show_parser.rs b/src/sql/src/parsers/show_parser.rs index e2e5fc50ac..d6fc35c675 100644 --- a/src/sql/src/parsers/show_parser.rs +++ b/src/sql/src/parsers/show_parser.rs @@ -599,7 +599,7 @@ impl ParserContext<'_> { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use sqlparser::ast::{Ident, ObjectName}; diff --git a/src/sql/src/statements/alter.rs b/src/sql/src/statements/alter.rs index 5b6a5ab5e6..ab35e5bd34 100644 --- a/src/sql/src/statements/alter.rs +++ b/src/sql/src/statements/alter.rs @@ -390,7 +390,7 @@ impl Display for AlterDatabaseOperation { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::parser::{ParseOptions, ParserContext}; diff --git a/src/sql/src/statements/copy.rs b/src/sql/src/statements/copy.rs index 7aa099c53c..7a59175538 100644 --- a/src/sql/src/statements/copy.rs +++ b/src/sql/src/statements/copy.rs @@ -164,7 +164,7 @@ impl CopyTableArgument { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::parser::{ParseOptions, ParserContext}; diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 817b31518d..80eb52c406 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -709,7 +709,7 @@ impl Display for CreateView { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::error::Error; diff --git a/src/sql/src/statements/describe.rs b/src/sql/src/statements/describe.rs 
index 2a87725dfc..ec48f4be81 100644 --- a/src/sql/src/statements/describe.rs +++ b/src/sql/src/statements/describe.rs @@ -44,7 +44,7 @@ impl Display for DescribeTable { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::parser::{ParseOptions, ParserContext}; diff --git a/src/sql/src/statements/drop.rs b/src/sql/src/statements/drop.rs index 1e97bee25c..26190b48a0 100644 --- a/src/sql/src/statements/drop.rs +++ b/src/sql/src/statements/drop.rs @@ -167,7 +167,7 @@ impl Display for DropView { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::parser::{ParseOptions, ParserContext}; diff --git a/src/sql/src/statements/set_variables.rs b/src/sql/src/statements/set_variables.rs index 748d077d84..4c6eb16692 100644 --- a/src/sql/src/statements/set_variables.rs +++ b/src/sql/src/statements/set_variables.rs @@ -41,7 +41,7 @@ impl Display for SetVariables { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use crate::parser::{ParseOptions, ParserContext}; diff --git a/src/sql/src/statements/show.rs b/src/sql/src/statements/show.rs index 0dfdd1de7d..77880e4a50 100644 --- a/src/sql/src/statements/show.rs +++ b/src/sql/src/statements/show.rs @@ -340,7 +340,7 @@ impl Display for ShowProcessList { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use sqlparser::ast::UnaryOperator; diff --git a/src/sql/src/statements/truncate.rs b/src/sql/src/statements/truncate.rs index b9c299601c..307326d9c7 100644 --- a/src/sql/src/statements/truncate.rs +++ b/src/sql/src/statements/truncate.rs @@ -98,7 +98,7 @@ impl Display for TruncateTable { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use crate::dialect::GreptimeDbDialect; use 
crate::parser::{ParseOptions, ParserContext}; diff --git a/src/store-api/src/lib.rs b/src/store-api/src/lib.rs index 4df594fc67..cb39875d74 100644 --- a/src/store-api/src/lib.rs +++ b/src/store-api/src/lib.rs @@ -14,8 +14,6 @@ //! Storage related APIs -#![feature(iterator_try_collect)] - pub mod codec; pub mod data_source; pub mod logstore; diff --git a/src/table/src/lib.rs b/src/table/src/lib.rs index 64e72029b8..abc7dc8300 100644 --- a/src/table/src/lib.rs +++ b/src/table/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] #![feature(try_blocks)] pub mod dist_table; diff --git a/src/table/src/metadata.rs b/src/table/src/metadata.rs index ca8bc30aa1..42f66044d9 100644 --- a/src/table/src/metadata.rs +++ b/src/table/src/metadata.rs @@ -1388,7 +1388,7 @@ fn unset_column_skipping_index_options( #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::assert_matches; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index ec35205a55..bee03ae7fe 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -3,6 +3,11 @@ name = "tests-integration" version.workspace = true edition.workspace = true license.workspace = true +autotests = false + +[[test]] +name = "main" +path = "tests/main.rs" [features] dashboard = ["servers/dashboard"] diff --git a/tests-integration/src/cluster.rs b/tests-integration/src/cluster.rs index daa2f64919..4a73014dbf 100644 --- a/tests-integration/src/cluster.rs +++ b/tests-integration/src/cluster.rs @@ -127,8 +127,8 @@ impl GreptimeDbCluster { .await .into_iter() .flat_map(|e| { - if e.index_file_path.is_some() { - vec![e.file_path, e.index_file_path.unwrap()] + if let Some(index_file_path) = e.index_file_path { + vec![e.file_path, index_file_path] } else { vec![e.file_path] } diff --git 
a/tests-integration/src/lib.rs b/tests-integration/src/lib.rs index 5def9351d0..5a4dca146f 100644 --- a/tests-integration/src/lib.rs +++ b/tests-integration/src/lib.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#![feature(assert_matches)] +#![recursion_limit = "256"] pub mod cluster; mod grpc; diff --git a/tests-integration/src/tests/instance_kafka_wal_test.rs b/tests-integration/src/tests/instance_kafka_wal_test.rs index ed74525aef..57be7e6cd6 100644 --- a/tests-integration/src/tests/instance_kafka_wal_test.rs +++ b/tests-integration/src/tests/instance_kafka_wal_test.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::assert_matches::assert_matches; +use std::assert_matches; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; diff --git a/tests-integration/tests/main.rs b/tests-integration/tests/main.rs index 14cf734291..01aed6c8e9 100644 --- a/tests-integration/tests/main.rs +++ b/tests-integration/tests/main.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#![recursion_limit = "256"] + #[macro_use] mod grpc; #[macro_use] From fb2dd862d5b29f07b30511bce3e9919d0c789b8a Mon Sep 17 00:00:00 2001 From: Yao Noel Achi <43069141+ynachi@users.noreply.github.com> Date: Tue, 31 Mar 2026 03:51:09 +0200 Subject: [PATCH 056/195] fix: avoid cloning serialized view plans on resolve (#7882) fix: avoid cloning serialized view plan on resolve - Change `ViewInfoValue.view_info` from `Vec` to `common_base::bytes::Bytes` so resolving a view no longer clones the full serialized plan buffer on every decode. - To keep the change narrow, the metadata write boundary still accepts `Vec` and converts once when constructing/updating `ViewInfoValue`. The hot read path now uses a cheap clone of the stored bytes. 
- The benches introduced revealed up to 82% resolution time improvement. Signed-off-by: Yao ACHI --- src/catalog/src/table_source.rs | 7 +--- src/common/meta/src/key.rs | 8 ++-- src/common/meta/src/key/view_info.rs | 61 ++++++++++++++++++++++++---- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs index f7ba51722f..fd78cc2573 100644 --- a/src/catalog/src/table_source.rs +++ b/src/catalog/src/table_source.rs @@ -15,7 +15,6 @@ use std::collections::HashMap; use std::sync::Arc; -use bytes::Bytes; use common_catalog::format_full_table_name; use common_query::logical_plan::{SubstraitPlanDecoderRef, rename_logical_plan_columns}; use datafusion::common::{ResolvedTableReference, TableReference}; @@ -151,11 +150,7 @@ impl DfTableSourceProvider { let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone())); let logical_plan = self .plan_decoder - .decode( - Bytes::from(view_info.view_info.clone()), - catalog_list, - false, - ) + .decode(view_info.view_info.clone().into(), catalog_list, false) .await .context(DecodePlanSnafu { name: &table.table_info().name, diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index 97b68f9b04..332c60f225 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -708,7 +708,7 @@ impl TableMetadataManager { // Creates view info let view_info_value = ViewInfoValue::new( - raw_logical_plan, + raw_logical_plan.into(), table_names, columns, plan_columns, @@ -1184,7 +1184,7 @@ impl TableMetadataManager { definition: String, ) -> Result<()> { let new_view_info_value = current_view_info_value.update( - new_view_info, + new_view_info.into(), table_names, columns, plan_columns, @@ -2752,7 +2752,7 @@ mod tests { let new_definition = "CREATE VIEW test AS SELECT * FROM b_table join c_table"; let current_view_info_value = DeserializedValueWithBytes::from_inner(ViewInfoValue::new( - logical_plan.clone(), +
logical_plan.clone().into(), table_names, columns, plan_columns, @@ -2803,7 +2803,7 @@ mod tests { let wrong_definition = "wrong_definition"; let wrong_view_info_value = DeserializedValueWithBytes::from_inner(current_view_info_value.update( - wrong_view_info, + wrong_view_info.into(), new_table_names.clone(), new_columns.clone(), new_plan_columns.clone(), diff --git a/src/common/meta/src/key/view_info.rs b/src/common/meta/src/key/view_info.rs index 82be00b26d..5efc2c9bb7 100644 --- a/src/common/meta/src/key/view_info.rs +++ b/src/common/meta/src/key/view_info.rs @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use common_base::bytes::Bytes; use serde::{Deserialize, Serialize}; use snafu::OptionExt; use table::metadata::TableId; @@ -31,9 +32,6 @@ use crate::kv_backend::KvBackendRef; use crate::kv_backend::txn::Txn; use crate::rpc::store::BatchGetRequest; -/// The VIEW logical plan encoded bytes -type RawViewLogicalPlan = Vec; - /// The key stores the metadata of the view. /// /// The layout: `__view_info/{view_id}`. 
@@ -86,7 +84,7 @@ impl MetadataKey<'_, ViewInfoKey> for ViewInfoKey { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct ViewInfoValue { // The encoded logical plan - pub view_info: RawViewLogicalPlan, + pub view_info: Bytes, // The resolved fully table names in logical plan pub table_names: HashSet, // The view columns @@ -100,7 +98,7 @@ pub struct ViewInfoValue { impl ViewInfoValue { pub fn new( - view_info: RawViewLogicalPlan, + view_info: Bytes, table_names: HashSet, columns: Vec, plan_columns: Vec, @@ -118,7 +116,7 @@ impl ViewInfoValue { pub(crate) fn update( &self, - new_view_info: RawViewLogicalPlan, + new_view_info: Bytes, table_names: HashSet, columns: Vec, plan_columns: Vec, @@ -305,7 +303,7 @@ mod tests { }; let value = ViewInfoValue { - view_info: vec![1, 2, 3], + view_info: Bytes::from([1, 2, 3].as_ref()), version: 1, table_names, columns: vec!["a".to_string()], @@ -316,4 +314,53 @@ mod tests { let deserialized = ViewInfoValue::try_from_raw_value(&serialized).unwrap(); assert_eq!(value, deserialized); } + + #[test] + fn test_deserialize_view_info_value_with_vec_u8() { + #[derive(Serialize)] + struct OldViewInfoValue { + view_info: Vec, + table_names: HashSet, + columns: Vec, + plan_columns: Vec, + definition: String, + version: u64, + } + + let table_names = { + let mut set = HashSet::new(); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }); + set + }; + + let old_value = OldViewInfoValue { + view_info: vec![1, 2, 3], + table_names: table_names.clone(), + columns: vec!["a".to_string()], + plan_columns: vec!["number".to_string()], + definition: "CREATE VIEW test AS SELECT * FROM numbers".to_string(), + version: 1, + }; + + let serialized = serde_json::to_vec(&old_value).unwrap(); + let deserialized 
= ViewInfoValue::try_from_raw_value(&serialized).unwrap(); + + assert_eq!(deserialized.view_info, vec![1, 2, 3]); + assert_eq!(deserialized.table_names, table_names); + assert_eq!(deserialized.columns, vec!["a".to_string()]); + assert_eq!(deserialized.plan_columns, vec!["number".to_string()]); + assert_eq!( + deserialized.definition, + "CREATE VIEW test AS SELECT * FROM numbers" + ); + } } From dde1edcdb4ca32bbeed0c4e1d1ae1a5cfcbcea0d Mon Sep 17 00:00:00 2001 From: Yingwen Date: Tue, 31 Mar 2026 10:45:07 +0800 Subject: [PATCH 057/195] fix: incorrect prefilter check (#7886) Signed-off-by: evenyag --- src/mito2/src/sst/parquet/flat_format.rs | 6 +++--- src/mito2/src/sst/parquet/prefilter.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mito2/src/sst/parquet/flat_format.rs b/src/mito2/src/sst/parquet/flat_format.rs index ca39cac7e1..d4d6c11a45 100644 --- a/src/mito2/src/sst/parquet/flat_format.rs +++ b/src/mito2/src/sst/parquet/flat_format.rs @@ -282,10 +282,10 @@ impl FlatReadFormat { } } - /// Returns `true` if raw batches from parquet use the flat layout with a - /// dictionary-encoded `__primary_key` column (i.e., [`ParquetAdapter::Flat`]). + /// Returns `true` if raw batches from parquet use the flat layout and + /// stores primary key columns as raw columns. /// Returns `false` for the legacy primary-key-to-flat conversion path. - pub(crate) fn raw_batch_has_primary_key_dictionary(&self) -> bool { + pub(crate) fn batch_has_raw_pk_columns(&self) -> bool { matches!(&self.parquet_adapter, ParquetAdapter::Flat(_)) } diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs index 07efbd052f..88df56e401 100644 --- a/src/mito2/src/sst/parquet/prefilter.rs +++ b/src/mito2/src/sst/parquet/prefilter.rs @@ -217,9 +217,9 @@ impl PrefilterContextBuilder { return None; } - // Only flat format with dictionary-encoded PKs supports PK prefiltering. 
+ // Only perform PK prefiltering for primary-key-to-flat conversion path. let flat_format = read_format.as_flat()?; - if !flat_format.raw_batch_has_primary_key_dictionary() { + if flat_format.batch_has_raw_pk_columns() { return None; } From 358524566a7d300d8be712ebd2b430c0bb0a5585 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Tue, 31 Mar 2026 18:46:30 +0800 Subject: [PATCH 058/195] chore: update ignore list for AI related (#7896) Signed-off-by: shuiyisong --- .gitignore | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 87412d570c..1e6194369f 100644 --- a/.gitignore +++ b/.gitignore @@ -65,11 +65,12 @@ greptimedb_data # github !/.github -# Claude code +# AI related CLAUDE.md - -# AGENTS.md AGENTS.md +.codex +.gemini +.opencode # local design docs docs/specs/ From ab106966574bea30296931b69050400e8b73a3aa Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Tue, 31 Mar 2026 15:43:40 -0700 Subject: [PATCH 059/195] feat: implement export-v2 chunked data export flow (#7841) * feat: implement export-v2 chunked data export flow Signed-off-by: jeremyhi * fix: by codex comment Signed-off-by: jeremyhi * fix: by gemini comment Signed-off-by: jeremyhi * fix: clippy Signed-off-by: jeremyhi * fix: by comment Signed-off-by: jeremyhi * fix: handle empty export ranges consistently Signed-off-by: jeremyhi * fix: validate resume config Signed-off-by: jeremyhi * fix: file-uri paths Signed-off-by: jeremyhi * feat: check args on schema-only mode Signed-off-by: jeremyhi --------- Signed-off-by: jeremyhi --- src/cli/src/data/export_v2.rs | 5 +- src/cli/src/data/export_v2/chunker.rs | 103 +++++ src/cli/src/data/export_v2/command.rs | 467 ++++++++++++++++++++-- src/cli/src/data/export_v2/coordinator.rs | 166 ++++++++ src/cli/src/data/export_v2/data.rs | 440 ++++++++++++++++++++ src/cli/src/data/export_v2/error.rs | 56 ++- src/cli/src/data/export_v2/manifest.rs | 190 ++++++++- 
src/cli/src/data/import_v2/command.rs | 131 +++++- src/cli/src/data/import_v2/error.rs | 18 +- src/cli/src/data/path.rs | 13 + src/cli/src/data/snapshot_storage.rs | 57 ++- 11 files changed, 1577 insertions(+), 69 deletions(-) create mode 100644 src/cli/src/data/export_v2/chunker.rs create mode 100644 src/cli/src/data/export_v2/coordinator.rs create mode 100644 src/cli/src/data/export_v2/data.rs diff --git a/src/cli/src/data/export_v2.rs b/src/cli/src/data/export_v2.rs index 91020d2f2e..1921ffe4b4 100644 --- a/src/cli/src/data/export_v2.rs +++ b/src/cli/src/data/export_v2.rs @@ -30,7 +30,7 @@ //! --to file:///tmp/snapshot \ //! --schema-only //! -//! # Export with time range (M2) +//! # Export with time range //! greptime cli data export-v2 create \ //! --addr 127.0.0.1:4000 \ //! --to s3://bucket/snapshots/prod-20250101 \ @@ -38,7 +38,10 @@ //! --end-time 2025-01-31T23:59:59Z //! ``` +mod chunker; mod command; +mod coordinator; +mod data; pub mod error; pub mod extractor; pub mod manifest; diff --git a/src/cli/src/data/export_v2/chunker.rs b/src/cli/src/data/export_v2/chunker.rs new file mode 100644 index 0000000000..260d95fae9 --- /dev/null +++ b/src/cli/src/data/export_v2/chunker.rs @@ -0,0 +1,103 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::time::Duration; + +use chrono::Duration as ChronoDuration; + +use crate::data::export_v2::manifest::{ChunkMeta, TimeRange}; + +pub fn generate_chunks(time_range: &TimeRange, window: Duration) -> Vec { + let (Some(start), Some(end)) = (time_range.start, time_range.end) else { + return vec![ChunkMeta::new(1, time_range.clone())]; + }; + + if start == end { + return vec![ChunkMeta::skipped(1, time_range.clone())]; + } + + if start > end { + return Vec::new(); + } + + let window = match ChronoDuration::from_std(window) { + Ok(window) if window > ChronoDuration::zero() => window, + _ => return vec![ChunkMeta::new(1, time_range.clone())], + }; + + let mut chunks = Vec::new(); + let mut cursor = start; + let mut id = 1; + + while cursor < end { + let next = cursor + .checked_add_signed(window) + .map_or(end, |timestamp| timestamp.min(end)); + chunks.push(ChunkMeta::new(id, TimeRange::new(Some(cursor), Some(next)))); + id += 1; + cursor = next; + } + + chunks +} + +#[cfg(test)] +mod tests { + use chrono::{TimeZone, Utc}; + + use super::*; + use crate::data::export_v2::manifest::ChunkStatus; + + #[test] + fn test_generate_chunks_unbounded() { + let range = TimeRange::unbounded(); + let chunks = generate_chunks(&range, Duration::from_secs(3600)); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].time_range, range); + } + + #[test] + fn test_generate_chunks_split() { + let start = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let end = Utc.with_ymd_and_hms(2025, 1, 1, 3, 0, 0).unwrap(); + let range = TimeRange::new(Some(start), Some(end)); + + let chunks = generate_chunks(&range, Duration::from_secs(3600)); + assert_eq!(chunks.len(), 3); + assert_eq!(chunks[0].time_range.start, Some(start)); + assert_eq!( + chunks[2].time_range.end, + Some(Utc.with_ymd_and_hms(2025, 1, 1, 3, 0, 0).unwrap()) + ); + } + + #[test] + fn test_generate_chunks_empty_range() { + let start = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let range = 
TimeRange::new(Some(start), Some(start)); + let chunks = generate_chunks(&range, Duration::from_secs(3600)); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].status, ChunkStatus::Skipped); + assert_eq!(chunks[0].time_range, range); + } + + #[test] + fn test_generate_chunks_invalid_range_is_empty() { + let start = Utc.with_ymd_and_hms(2025, 1, 1, 1, 0, 0).unwrap(); + let end = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let range = TimeRange::new(Some(start), Some(end)); + let chunks = generate_chunks(&range, Duration::from_secs(3600)); + assert!(chunks.is_empty()); + } +} diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs index 341436fe0f..ddcb323fef 100644 --- a/src/cli/src/data/export_v2/command.rs +++ b/src/cli/src/data/export_v2/command.rs @@ -26,12 +26,16 @@ use snafu::{OptionExt, ResultExt}; use crate::Tool; use crate::common::ObjectStoreConfig; +use crate::data::export_v2::coordinator::export_data; use crate::data::export_v2::error::{ - CannotResumeSchemaOnlySnafu, DataExportNotImplementedSnafu, DatabaseSnafu, EmptyResultSnafu, - ManifestVersionMismatchSnafu, Result, UnexpectedValueTypeSnafu, + ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu, + ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu, + SchemaOnlyModeMismatchSnafu, UnexpectedValueTypeSnafu, }; use crate::data::export_v2::extractor::SchemaExtractor; -use crate::data::export_v2::manifest::{DataFormat, MANIFEST_VERSION, Manifest}; +use crate::data::export_v2::manifest::{ + ChunkMeta, DataFormat, MANIFEST_VERSION, Manifest, TimeRange, +}; use crate::data::path::ddl_path_for_schema; use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri}; use crate::data::sql::{escape_sql_identifier, escape_sql_literal}; @@ -84,6 +88,11 @@ pub struct ExportCreateCommand { #[clap(long)] end_time: Option, + /// Chunk time window (e.g., 1h, 6h, 1d, 7d). 
+ /// Requires both --start-time and --end-time when specified. + #[clap(long, value_parser = humantime::parse_duration)] + chunk_time_window: Option, + /// Data format: parquet, csv, json. #[clap(long, value_enum, default_value = "parquet")] format: DataFormat, @@ -92,7 +101,7 @@ pub struct ExportCreateCommand { #[clap(long)] force: bool, - /// Concurrency level (for future use). + /// Parallelism for COPY DATABASE execution (server-side, per schema per chunk). #[clap(long, default_value = "1")] parallelism: usize, @@ -127,11 +136,38 @@ impl ExportCreateCommand { // Validate URI format validate_uri(&self.to).map_err(BoxedError::new)?; - if !self.schema_only { - return DataExportNotImplementedSnafu + let time_range = TimeRange::parse(self.start_time.as_deref(), self.end_time.as_deref()) + .map_err(BoxedError::new)?; + if self.chunk_time_window.is_some() && !time_range.is_bounded() { + return ChunkTimeWindowRequiresBoundsSnafu .fail() .map_err(BoxedError::new); } + if self.schema_only { + let mut invalid_args = Vec::new(); + if self.start_time.is_some() { + invalid_args.push("--start-time"); + } + if self.end_time.is_some() { + invalid_args.push("--end-time"); + } + if self.chunk_time_window.is_some() { + invalid_args.push("--chunk-time-window"); + } + if self.format != DataFormat::Parquet { + invalid_args.push("--format"); + } + if self.parallelism != 1 { + invalid_args.push("--parallelism"); + } + if !invalid_args.is_empty() { + return SchemaOnlyArgsNotAllowedSnafu { + args: invalid_args.join(", "), + } + .fail() + .map_err(BoxedError::new); + } + } // Parse schemas (empty vec means all schemas) let schemas = if self.schemas.is_empty() { @@ -155,12 +191,18 @@ impl ExportCreateCommand { ); Ok(Box::new(ExportCreate { - catalog: self.catalog.clone(), - schemas, - schema_only: self.schema_only, - _format: self.format, - force: self.force, - _parallelism: self.parallelism, + config: ExportConfig { + catalog: self.catalog.clone(), + schemas, + schema_only: 
self.schema_only, + format: self.format, + force: self.force, + time_range, + chunk_time_window: self.chunk_time_window, + parallelism: self.parallelism, + snapshot_uri: self.to.clone(), + storage_config: self.storage.clone(), + }, storage: Box::new(storage), database_client, })) @@ -169,14 +211,22 @@ impl ExportCreateCommand { /// Export tool implementation. pub struct ExportCreate { + config: ExportConfig, + storage: Box, + database_client: DatabaseClient, +} + +struct ExportConfig { catalog: String, schemas: Option>, schema_only: bool, - _format: DataFormat, + format: DataFormat, force: bool, - _parallelism: usize, - storage: Box, - database_client: DatabaseClient, + time_range: TimeRange, + chunk_time_window: Option, + parallelism: usize, + snapshot_uri: String, + storage_config: ObjectStoreConfig, } #[async_trait] @@ -192,12 +242,12 @@ impl ExportCreate { let exists = self.storage.exists().await?; if exists { - if self.force { + if self.config.force { info!("Deleting existing snapshot (--force)"); self.storage.delete_snapshot().await?; } else { // Resume mode - read existing manifest - let manifest = self.storage.read_manifest().await?; + let mut manifest = self.storage.read_manifest().await?; // Check version compatibility if manifest.version != MANIFEST_VERSION { @@ -208,10 +258,7 @@ impl ExportCreate { .fail(); } - // Cannot resume schema-only with data export - if manifest.schema_only && !self.schema_only { - return CannotResumeSchemaOnlySnafu.fail(); - } + validate_resume_config(&manifest, &self.config)?; info!( "Resuming existing snapshot: {} (completed: {}/{} chunks)", @@ -220,22 +267,31 @@ impl ExportCreate { manifest.chunks.len() ); - // For M1, we only handle schema-only exports - // M2 will add chunk resume logic if manifest.is_complete() { info!("Snapshot is already complete"); return Ok(()); } - // TODO: Resume data export in M2 - info!("Data export resume not yet implemented (M2)"); + if manifest.schema_only { + return Ok(()); + } + + 
export_data( + self.storage.as_ref(), + &self.database_client, + &self.config.snapshot_uri, + &self.config.storage_config, + &mut manifest, + self.config.parallelism, + ) + .await?; return Ok(()); } } // 2. Get schema list - let extractor = SchemaExtractor::new(&self.database_client, &self.catalog); - let schema_snapshot = extractor.extract(self.schemas.as_deref()).await?; + let extractor = SchemaExtractor::new(&self.database_client, &self.config.catalog); + let schema_snapshot = extractor.extract(self.config.schemas.as_deref()).await?; let schema_names: Vec = schema_snapshot .schemas @@ -245,7 +301,14 @@ impl ExportCreate { info!("Exporting schemas: {:?}", schema_names); // 3. Create manifest - let manifest = Manifest::new_schema_only(self.catalog.clone(), schema_names.clone()); + let mut manifest = Manifest::new_for_export( + self.config.catalog.clone(), + schema_names.clone(), + self.config.schema_only, + self.config.time_range.clone(), + self.config.format, + self.config.chunk_time_window, + )?; // 4. Write schema files self.storage.write_schema(&schema_snapshot).await?; @@ -259,14 +322,28 @@ impl ExportCreate { info!("Exported DDL for schema {} to {}", schema, ddl_path); } - // 6. Write manifest last. + // 6. Write manifest after schema artifacts and before any data export. // // The manifest is the snapshot commit point: only write it after the schema // index and all DDL files are durable, so a crash cannot leave a "valid" - // snapshot that is missing required schema artifacts. + // snapshot that is missing required schema artifacts. For full exports we + // still need the manifest before data copy starts, because chunk resume is + // tracked by updating this manifest in place. 
self.storage.write_manifest(&manifest).await?; info!("Snapshot created: {}", manifest.snapshot_id); + if !self.config.schema_only { + export_data( + self.storage.as_ref(), + &self.database_client, + &self.config.snapshot_uri, + &self.config.storage_config, + &mut manifest, + self.config.parallelism, + ) + .await?; + } + Ok(()) } @@ -321,7 +398,7 @@ impl ExportCreate { "SELECT table_name, table_type FROM information_schema.tables \ WHERE table_catalog = '{}' AND table_schema = '{}' \ AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')", - escape_sql_literal(&self.catalog), + escape_sql_literal(&self.config.catalog), escape_sql_literal(schema) ); let records: Option>> = self @@ -359,7 +436,7 @@ impl ExportCreate { let sql = format!( "SELECT DISTINCT table_name FROM information_schema.columns \ WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'", - escape_sql_literal(&self.catalog), + escape_sql_literal(&self.config.catalog), escape_sql_literal(schema) ); let records: Option>> = self @@ -392,14 +469,14 @@ impl ExportCreate { Some(table) => format!( r#"SHOW CREATE {} "{}"."{}"."{}""#, show_type, - escape_sql_identifier(&self.catalog), + escape_sql_identifier(&self.config.catalog), escape_sql_identifier(schema), escape_sql_identifier(table) ), None => format!( r#"SHOW CREATE {} "{}"."{}""#, show_type, - escape_sql_identifier(&self.catalog), + escape_sql_identifier(&self.config.catalog), escape_sql_identifier(schema) ), }; @@ -442,8 +519,118 @@ fn build_schema_ddl( ddl } +fn validate_resume_config(manifest: &Manifest, config: &ExportConfig) -> Result<()> { + if manifest.schema_only != config.schema_only { + return SchemaOnlyModeMismatchSnafu { + existing_schema_only: manifest.schema_only, + requested_schema_only: config.schema_only, + } + .fail(); + } + + if manifest.catalog != config.catalog { + return ResumeConfigMismatchSnafu { + field: "catalog", + existing: manifest.catalog.clone(), + requested: config.catalog.clone(), + } + .fail(); + 
} + + // If no schema filter is provided on resume, inherit the existing snapshot + // selection instead of reinterpreting the request as "all schemas". + if let Some(requested_schemas) = &config.schemas + && !schema_selection_matches(&manifest.schemas, requested_schemas) + { + return ResumeConfigMismatchSnafu { + field: "schemas", + existing: format_schema_selection(&manifest.schemas), + requested: format_schema_selection(requested_schemas), + } + .fail(); + } + + if manifest.time_range != config.time_range { + return ResumeConfigMismatchSnafu { + field: "time_range", + existing: format!("{:?}", manifest.time_range), + requested: format!("{:?}", config.time_range), + } + .fail(); + } + + if manifest.format != config.format { + return ResumeConfigMismatchSnafu { + field: "format", + existing: manifest.format.to_string(), + requested: config.format.to_string(), + } + .fail(); + } + + let expected_plan = Manifest::new_for_export( + manifest.catalog.clone(), + manifest.schemas.clone(), + config.schema_only, + config.time_range.clone(), + config.format, + config.chunk_time_window, + )?; + if !chunk_plan_matches(manifest, &expected_plan) { + return ResumeConfigMismatchSnafu { + field: "chunk plan", + existing: format_chunk_plan(&manifest.chunks), + requested: format_chunk_plan(&expected_plan.chunks), + } + .fail(); + } + + Ok(()) +} + +fn schema_selection_matches(existing: &[String], requested: &[String]) -> bool { + canonical_schema_selection(existing) == canonical_schema_selection(requested) +} + +fn canonical_schema_selection(schemas: &[String]) -> Vec { + let mut canonicalized = Vec::new(); + let mut seen = HashSet::new(); + + for schema in schemas { + let normalized = schema.to_ascii_lowercase(); + if seen.insert(normalized.clone()) { + canonicalized.push(normalized); + } + } + + canonicalized.sort(); + canonicalized +} + +fn format_schema_selection(schemas: &[String]) -> String { + format!("[{}]", schemas.join(", ")) +} + +fn chunk_plan_matches(existing: 
&Manifest, expected: &Manifest) -> bool { + existing.chunks.len() == expected.chunks.len() + && existing + .chunks + .iter() + .zip(&expected.chunks) + .all(|(left, right)| left.id == right.id && left.time_range == right.time_range) +} + +fn format_chunk_plan(chunks: &[ChunkMeta]) -> String { + let items = chunks + .iter() + .map(|chunk| format!("#{}:{:?}", chunk.id, chunk.time_range)) + .collect::>(); + format!("[{}]", items.join(", ")) +} + #[cfg(test)] mod tests { + use chrono::TimeZone; use clap::Parser; use super::*; @@ -478,19 +665,225 @@ mod tests { } #[tokio::test] - async fn test_build_rejects_non_schema_only_export() { + async fn test_build_rejects_chunk_window_without_bounds() { let cmd = ExportCreateCommand::parse_from([ "export-v2-create", "--addr", "127.0.0.1:4000", "--to", "file:///tmp/export-v2-test", + "--chunk-time-window", + "1h", ]); let result = cmd.build().await; assert!(result.is_err()); let error = result.err().unwrap().to_string(); - assert!(error.contains("Data export is not implemented yet")); + assert!(error.contains("chunk_time_window requires both --start-time and --end-time")); + } + + #[tokio::test] + async fn test_build_rejects_data_export_args_in_schema_only_mode() { + let cmd = ExportCreateCommand::parse_from([ + "export-v2-create", + "--addr", + "127.0.0.1:4000", + "--to", + "file:///tmp/export-v2-test", + "--schema-only", + "--start-time", + "2024-01-01T00:00:00Z", + "--end-time", + "2024-01-02T00:00:00Z", + "--chunk-time-window", + "1h", + "--format", + "csv", + "--parallelism", + "2", + ]); + + let error = cmd.build().await.err().unwrap().to_string(); + + assert!(error.contains("--schema-only cannot be used with data export arguments")); + assert!(error.contains("--start-time")); + assert!(error.contains("--end-time")); + assert!(error.contains("--chunk-time-window")); + assert!(error.contains("--format")); + assert!(error.contains("--parallelism")); + } + + #[test] + fn test_schema_only_mode_mismatch_error_message() { + let 
error = crate::data::export_v2::error::SchemaOnlyModeMismatchSnafu { + existing_schema_only: false, + requested_schema_only: true, + } + .build() + .to_string(); + + assert!(error.contains("existing: false")); + assert!(error.contains("requested: true")); + } + + #[test] + fn test_validate_resume_config_rejects_catalog_mismatch() { + let manifest = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string()], + false, + TimeRange::unbounded(), + DataFormat::Parquet, + None, + ) + .unwrap(); + let config = ExportConfig { + catalog: "other".to_string(), + schemas: None, + schema_only: false, + format: DataFormat::Parquet, + force: false, + time_range: TimeRange::unbounded(), + chunk_time_window: None, + parallelism: 1, + snapshot_uri: "file:///tmp/snapshot".to_string(), + storage_config: ObjectStoreConfig::default(), + }; + + let error = validate_resume_config(&manifest, &config) + .err() + .unwrap() + .to_string(); + assert!(error.contains("catalog")); + } + + #[test] + fn test_validate_resume_config_accepts_schema_selection_with_different_case_and_order() { + let manifest = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string(), "analytics".to_string()], + false, + TimeRange::unbounded(), + DataFormat::Parquet, + None, + ) + .unwrap(); + let config = ExportConfig { + catalog: "greptime".to_string(), + schemas: Some(vec![ + "ANALYTICS".to_string(), + "PUBLIC".to_string(), + "public".to_string(), + ]), + schema_only: false, + format: DataFormat::Parquet, + force: false, + time_range: TimeRange::unbounded(), + chunk_time_window: None, + parallelism: 1, + snapshot_uri: "file:///tmp/snapshot".to_string(), + storage_config: ObjectStoreConfig::default(), + }; + + assert!(validate_resume_config(&manifest, &config).is_ok()); + } + + #[test] + fn test_validate_resume_config_rejects_chunk_plan_mismatch() { + let start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 
2, 0, 0).unwrap(); + let time_range = TimeRange::new(Some(start), Some(end)); + let manifest = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string()], + false, + time_range.clone(), + DataFormat::Parquet, + None, + ) + .unwrap(); + let config = ExportConfig { + catalog: "greptime".to_string(), + schemas: None, + schema_only: false, + format: DataFormat::Parquet, + force: false, + time_range, + chunk_time_window: Some(Duration::from_secs(3600)), + parallelism: 1, + snapshot_uri: "file:///tmp/snapshot".to_string(), + storage_config: ObjectStoreConfig::default(), + }; + + let error = validate_resume_config(&manifest, &config) + .err() + .unwrap() + .to_string(); + assert!(error.contains("chunk plan")); + } + + #[test] + fn test_validate_resume_config_rejects_format_mismatch() { + let manifest = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string()], + false, + TimeRange::unbounded(), + DataFormat::Parquet, + None, + ) + .unwrap(); + let config = ExportConfig { + catalog: "greptime".to_string(), + schemas: None, + schema_only: false, + format: DataFormat::Csv, + force: false, + time_range: TimeRange::unbounded(), + chunk_time_window: None, + parallelism: 1, + snapshot_uri: "file:///tmp/snapshot".to_string(), + storage_config: ObjectStoreConfig::default(), + }; + + let error = validate_resume_config(&manifest, &config) + .err() + .unwrap() + .to_string(); + assert!(error.contains("format")); + } + + #[test] + fn test_validate_resume_config_rejects_time_range_mismatch() { + let start = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let end = chrono::Utc.with_ymd_and_hms(2025, 1, 1, 1, 0, 0).unwrap(); + let manifest = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string()], + false, + TimeRange::new(Some(start), Some(end)), + DataFormat::Parquet, + None, + ) + .unwrap(); + let config = ExportConfig { + catalog: "greptime".to_string(), + schemas: None, + schema_only: false, + 
format: DataFormat::Parquet, + force: false, + time_range: TimeRange::new(Some(start), Some(start)), + chunk_time_window: None, + parallelism: 1, + snapshot_uri: "file:///tmp/snapshot".to_string(), + storage_config: ObjectStoreConfig::default(), + }; + + let error = validate_resume_config(&manifest, &config) + .err() + .unwrap() + .to_string(); + assert!(error.contains("time_range")); } } diff --git a/src/cli/src/data/export_v2/coordinator.rs b/src/cli/src/data/export_v2/coordinator.rs new file mode 100644 index 0000000000..d96c01d693 --- /dev/null +++ b/src/cli/src/data/export_v2/coordinator.rs @@ -0,0 +1,166 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_telemetry::info; + +use crate::common::ObjectStoreConfig; +use crate::data::export_v2::data::{CopyOptions, build_copy_target, execute_copy_database}; +use crate::data::export_v2::error::Result; +use crate::data::export_v2::manifest::{ChunkStatus, DataFormat, Manifest, TimeRange}; +use crate::data::path::data_dir_for_schema_chunk; +use crate::data::snapshot_storage::{SnapshotStorage, StorageScheme}; +use crate::database::DatabaseClient; + +struct ExportContext<'a> { + storage: &'a dyn SnapshotStorage, + database_client: &'a DatabaseClient, + snapshot_uri: &'a str, + storage_config: &'a ObjectStoreConfig, + catalog: &'a str, + schemas: &'a [String], + format: DataFormat, + parallelism: usize, +} + +pub async fn export_data( + storage: &dyn SnapshotStorage, + database_client: &DatabaseClient, + snapshot_uri: &str, + storage_config: &ObjectStoreConfig, + manifest: &mut Manifest, + parallelism: usize, +) -> Result<()> { + if manifest.chunks.is_empty() { + return Ok(()); + } + + for idx in 0..manifest.chunks.len() { + if matches!( + manifest.chunks[idx].status, + ChunkStatus::Completed | ChunkStatus::Skipped + ) { + continue; + } + + let (chunk_id, time_range) = mark_chunk_in_progress(manifest, idx); + manifest.touch(); + storage.write_manifest(manifest).await?; + + let context = ExportContext { + storage, + database_client, + snapshot_uri, + storage_config, + catalog: &manifest.catalog, + schemas: &manifest.schemas, + format: manifest.format, + parallelism, + }; + let export_result = export_chunk(&context, chunk_id, time_range).await; + + let result = match export_result { + Ok(files) => { + mark_chunk_completed(manifest, idx, files); + Ok(()) + } + Err(err) => { + mark_chunk_failed(manifest, idx, err.to_string()); + Err(err) + } + }; + + manifest.touch(); + storage.write_manifest(manifest).await?; + + result?; + } + + Ok(()) +} + +fn mark_chunk_in_progress(manifest: &mut Manifest, idx: usize) -> (u32, TimeRange) { + let chunk = &mut manifest.chunks[idx]; 
+ chunk.mark_in_progress(); + (chunk.id, chunk.time_range.clone()) +} + +fn mark_chunk_completed(manifest: &mut Manifest, idx: usize, files: Vec) { + let chunk = &mut manifest.chunks[idx]; + if files.is_empty() { + chunk.mark_skipped(); + } else { + chunk.mark_completed(files, None); + } +} + +fn mark_chunk_failed(manifest: &mut Manifest, idx: usize, error: String) { + let chunk = &mut manifest.chunks[idx]; + chunk.mark_failed(error); +} + +async fn export_chunk( + context: &ExportContext<'_>, + chunk_id: u32, + time_range: TimeRange, +) -> Result> { + let scheme = StorageScheme::from_uri(context.snapshot_uri)?; + let needs_dir = matches!(scheme, StorageScheme::File); + let copy_options = CopyOptions { + format: context.format, + time_range, + parallelism: context.parallelism, + }; + + for schema in context.schemas { + let prefix = data_dir_for_schema_chunk(schema, chunk_id); + if needs_dir { + context.storage.create_dir_all(&prefix).await?; + } + + let target = build_copy_target( + context.snapshot_uri, + context.storage_config, + schema, + chunk_id, + )?; + execute_copy_database( + context.database_client, + context.catalog, + schema, + &target, + ©_options, + ) + .await?; + } + + let files = list_chunk_files(context.storage, context.schemas, chunk_id).await?; + info!("Collected {} files for chunk {}", files.len(), chunk_id); + Ok(files) +} + +async fn list_chunk_files( + storage: &dyn SnapshotStorage, + schemas: &[String], + chunk_id: u32, +) -> Result> { + let mut files = Vec::new(); + + for schema in schemas { + let prefix = data_dir_for_schema_chunk(schema, chunk_id); + files.extend(storage.list_files_recursive(&prefix).await?); + } + + files.sort(); + Ok(files) +} diff --git a/src/cli/src/data/export_v2/data.rs b/src/cli/src/data/export_v2/data.rs new file mode 100644 index 0000000000..fe2ec7c051 --- /dev/null +++ b/src/cli/src/data/export_v2/data.rs @@ -0,0 +1,440 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_base::secrets::{ExposeSecret, SecretString}; +use common_telemetry::info; +use object_store::util::{join_path, normalize_path}; +use snafu::ResultExt; +use url::Url; + +use crate::common::ObjectStoreConfig; +use crate::data::export_v2::error::{DatabaseSnafu, InvalidUriSnafu, Result, UrlParseSnafu}; +use crate::data::export_v2::manifest::{DataFormat, TimeRange}; +use crate::data::path::data_dir_for_schema_chunk; +use crate::data::snapshot_storage::StorageScheme; +use crate::data::sql::{escape_sql_identifier, escape_sql_literal}; +use crate::database::DatabaseClient; + +pub(super) struct CopyOptions { + pub(super) format: DataFormat, + pub(super) time_range: TimeRange, + pub(super) parallelism: usize, +} + +pub(super) struct CopyTarget { + pub(super) location: String, + pub(super) connection: String, + secrets: Vec>, +} + +impl CopyTarget { + fn mask_sql(&self, sql: &str) -> String { + mask_secrets(sql, &self.secrets) + } +} + +pub(super) fn build_copy_target( + snapshot_uri: &str, + storage: &ObjectStoreConfig, + schema: &str, + chunk_id: u32, +) -> Result { + let url = Url::parse(snapshot_uri).context(UrlParseSnafu)?; + let scheme = StorageScheme::from_uri(snapshot_uri)?; + let suffix = data_dir_for_schema_chunk(schema, chunk_id); + + match scheme { + StorageScheme::File => { + let root = url.to_file_path().map_err(|_| { + InvalidUriSnafu { + uri: snapshot_uri, + reason: "file:// URI must use an absolute path like file:///tmp/backup", + } + .build() 
+ })?; + let location = normalize_path(&format!("{}/{}", root.to_string_lossy(), suffix)); + Ok(CopyTarget { + location, + connection: String::new(), + secrets: Vec::new(), + }) + } + StorageScheme::S3 => { + let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?; + let location = format!("s3://{}/{}", bucket, join_root(&root, &suffix)); + let (connection, secrets) = build_s3_connection(storage); + Ok(CopyTarget { + location, + connection, + secrets, + }) + } + StorageScheme::Oss => { + let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?; + let location = format!("oss://{}/{}", bucket, join_root(&root, &suffix)); + let (connection, secrets) = build_oss_connection(storage); + Ok(CopyTarget { + location, + connection, + secrets, + }) + } + StorageScheme::Gcs => { + let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?; + let location = format!("gcs://{}/{}", bucket, join_root(&root, &suffix)); + let (connection, secrets) = build_gcs_connection(storage, snapshot_uri)?; + Ok(CopyTarget { + location, + connection, + secrets, + }) + } + StorageScheme::Azblob => { + let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?; + let location = format!("azblob://{}/{}", bucket, join_root(&root, &suffix)); + let (connection, secrets) = build_azblob_connection(storage); + Ok(CopyTarget { + location, + connection, + secrets, + }) + } + } +} + +pub(super) async fn execute_copy_database( + database_client: &DatabaseClient, + catalog: &str, + schema: &str, + target: &CopyTarget, + options: &CopyOptions, +) -> Result<()> { + let with_options = build_with_options(options); + let sql = format!( + r#"COPY DATABASE "{}"."{}" TO '{}' WITH ({}){};"#, + escape_sql_identifier(catalog), + escape_sql_identifier(schema), + escape_sql_literal(&target.location), + with_options, + target.connection + ); + let safe_sql = target.mask_sql(&sql); + info!("Executing sql: {}", safe_sql); + database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + Ok(()) 
+} + +fn build_with_options(options: &CopyOptions) -> String { + let mut parts = vec![format!("FORMAT='{}'", options.format)]; + if let Some(start) = options.time_range.start { + parts.push(format!( + "START_TIME='{}'", + escape_sql_literal(&start.to_rfc3339()) + )); + } + if let Some(end) = options.time_range.end { + parts.push(format!( + "END_TIME='{}'", + escape_sql_literal(&end.to_rfc3339()) + )); + } + parts.push(format!("PARALLELISM={}", options.parallelism)); + parts.join(", ") +} + +fn extract_bucket_root(url: &Url, snapshot_uri: &str) -> Result<(String, String)> { + let bucket = url.host_str().unwrap_or("").to_string(); + if bucket.is_empty() { + return InvalidUriSnafu { + uri: snapshot_uri, + reason: "URI must include bucket/container in host", + } + .fail(); + } + let root = url + .path() + .trim_start_matches('/') + .trim_end_matches('/') + .to_string(); + Ok((bucket, root)) +} + +fn join_root(root: &str, suffix: &str) -> String { + join_path(root, suffix).trim_start_matches('/').to_string() +} + +fn build_s3_connection(storage: &ObjectStoreConfig) -> (String, Vec>) { + let access_key_id = expose_optional_secret(&storage.s3.s3_access_key_id); + let secret_access_key = expose_optional_secret(&storage.s3.s3_secret_access_key); + + let mut options = Vec::new(); + if let Some(access_key_id) = &access_key_id { + options.push(format!( + "ACCESS_KEY_ID='{}'", + escape_sql_literal(access_key_id) + )); + } + if let Some(secret_access_key) = &secret_access_key { + options.push(format!( + "SECRET_ACCESS_KEY='{}'", + escape_sql_literal(secret_access_key) + )); + } + if let Some(region) = &storage.s3.s3_region { + options.push(format!("REGION='{}'", escape_sql_literal(region))); + } + if let Some(endpoint) = &storage.s3.s3_endpoint { + options.push(format!("ENDPOINT='{}'", escape_sql_literal(endpoint))); + } + + let secrets = vec![access_key_id, secret_access_key]; + let connection = if options.is_empty() { + String::new() + } else { + format!(" CONNECTION ({})", 
options.join(", ")) + }; + (connection, secrets) +} + +fn build_oss_connection(storage: &ObjectStoreConfig) -> (String, Vec>) { + let access_key_id = expose_optional_secret(&storage.oss.oss_access_key_id); + let access_key_secret = expose_optional_secret(&storage.oss.oss_access_key_secret); + + let mut options = Vec::new(); + if let Some(access_key_id) = &access_key_id { + options.push(format!( + "ACCESS_KEY_ID='{}'", + escape_sql_literal(access_key_id) + )); + } + if let Some(access_key_secret) = &access_key_secret { + options.push(format!( + "ACCESS_KEY_SECRET='{}'", + escape_sql_literal(access_key_secret) + )); + } + if !storage.oss.oss_endpoint.is_empty() { + options.push(format!( + "ENDPOINT='{}'", + escape_sql_literal(&storage.oss.oss_endpoint) + )); + } + + let secrets = vec![access_key_id, access_key_secret]; + let connection = if options.is_empty() { + String::new() + } else { + format!(" CONNECTION ({})", options.join(", ")) + }; + (connection, secrets) +} + +fn build_gcs_connection( + storage: &ObjectStoreConfig, + snapshot_uri: &str, +) -> Result<(String, Vec>)> { + let credential_path = expose_optional_secret(&storage.gcs.gcs_credential_path); + let credential = expose_optional_secret(&storage.gcs.gcs_credential); + + if credential.is_none() && credential_path.is_some() { + return InvalidUriSnafu { + uri: snapshot_uri, + reason: "gcs_credential_path is not supported for server-side COPY; provide gcs_credential or rely on server-side ADC", + } + .fail(); + } + + let mut options = Vec::new(); + if let Some(credential) = &credential { + options.push(format!("CREDENTIAL='{}'", escape_sql_literal(credential))); + } + if !storage.gcs.gcs_scope.is_empty() { + options.push(format!( + "SCOPE='{}'", + escape_sql_literal(&storage.gcs.gcs_scope) + )); + } + if !storage.gcs.gcs_endpoint.is_empty() { + options.push(format!( + "ENDPOINT='{}'", + escape_sql_literal(&storage.gcs.gcs_endpoint) + )); + } + + let connection = if options.is_empty() { + String::new() + } 
else { + format!(" CONNECTION ({})", options.join(", ")) + }; + let secrets = vec![credential_path, credential]; + Ok((connection, secrets)) +} + +fn build_azblob_connection(storage: &ObjectStoreConfig) -> (String, Vec>) { + let account_name = expose_optional_secret(&storage.azblob.azblob_account_name); + let account_key = expose_optional_secret(&storage.azblob.azblob_account_key); + let sas_token = storage.azblob.azblob_sas_token.clone(); + + let mut options = Vec::new(); + if let Some(account_name) = &account_name { + options.push(format!( + "ACCOUNT_NAME='{}'", + escape_sql_literal(account_name) + )); + } + if let Some(account_key) = &account_key { + options.push(format!("ACCOUNT_KEY='{}'", escape_sql_literal(account_key))); + } + if let Some(sas_token) = &sas_token { + options.push(format!("SAS_TOKEN='{}'", escape_sql_literal(sas_token))); + } + if !storage.azblob.azblob_endpoint.is_empty() { + options.push(format!( + "ENDPOINT='{}'", + escape_sql_literal(&storage.azblob.azblob_endpoint) + )); + } + + let secrets = vec![account_name, account_key, sas_token]; + let connection = if options.is_empty() { + String::new() + } else { + format!(" CONNECTION ({})", options.join(", ")) + }; + (connection, secrets) +} + +fn expose_optional_secret(secret: &Option) -> Option { + secret.as_ref().map(|s| s.expose_secret().to_owned()) +} + +fn mask_secrets(sql: &str, secrets: &[Option]) -> String { + let mut masked = sql.to_string(); + for secret in secrets { + if let Some(secret) = secret + && !secret.is_empty() + { + masked = masked.replace(secret, "[REDACTED]"); + } + } + masked +} + +#[cfg(test)] +mod tests { + use common_base::secrets::SecretString; + + use super::*; + use crate::common::{PrefixedAzblobConnection, PrefixedGcsConnection, PrefixedOssConnection}; + + #[test] + fn test_build_oss_connection_includes_endpoint() { + let storage = ObjectStoreConfig { + oss: PrefixedOssConnection { + oss_endpoint: "https://oss.example.com".to_string(), + oss_access_key_id: 
Some(SecretString::from("key_id".to_string())), + oss_access_key_secret: Some(SecretString::from("key_secret".to_string())), + ..Default::default() + }, + ..Default::default() + }; + + let (connection, _) = build_oss_connection(&storage); + assert!(connection.contains("ENDPOINT='https://oss.example.com'")); + } + + #[test] + fn test_build_gcs_connection_uses_scope_and_inline_credential() { + let storage = ObjectStoreConfig { + gcs: PrefixedGcsConnection { + gcs_scope: "scope-a".to_string(), + gcs_endpoint: "https://storage.googleapis.com".to_string(), + gcs_credential: Some(SecretString::from("credential-json".to_string())), + ..Default::default() + }, + ..Default::default() + }; + + let (connection, _) = build_gcs_connection(&storage, "gcs://bucket/root").unwrap(); + assert!(connection.contains("CREDENTIAL='credential-json'")); + assert!(connection.contains("SCOPE='scope-a'")); + assert!(connection.contains("ENDPOINT='https://storage.googleapis.com'")); + assert!(!connection.contains("CREDENTIAL_PATH")); + } + + #[test] + fn test_build_gcs_connection_rejects_credential_path_only() { + let storage = ObjectStoreConfig { + gcs: PrefixedGcsConnection { + gcs_scope: "scope-a".to_string(), + gcs_credential_path: Some(SecretString::from("/tmp/creds.json".to_string())), + ..Default::default() + }, + ..Default::default() + }; + + let error = build_gcs_connection(&storage, "gcs://bucket/root") + .expect_err("credential_path-only should be rejected") + .to_string(); + assert!(error.contains("gcs_credential_path is not supported")); + } + + #[test] + fn test_build_azblob_connection_includes_endpoint() { + let storage = ObjectStoreConfig { + azblob: PrefixedAzblobConnection { + azblob_account_name: Some(SecretString::from("account".to_string())), + azblob_account_key: Some(SecretString::from("key".to_string())), + azblob_endpoint: "https://blob.example.com".to_string(), + ..Default::default() + }, + ..Default::default() + }; + + let (connection, _) = 
build_azblob_connection(&storage); + assert!(connection.contains("ENDPOINT='https://blob.example.com'")); + } + + #[test] + fn test_build_azblob_connection_redacts_sas_token() { + let storage = ObjectStoreConfig { + azblob: PrefixedAzblobConnection { + azblob_account_name: Some(SecretString::from("account".to_string())), + azblob_account_key: Some(SecretString::from("key".to_string())), + azblob_sas_token: Some("sig=secret-token".to_string()), + ..Default::default() + }, + ..Default::default() + }; + + let (connection, secrets) = build_azblob_connection(&storage); + let masked = mask_secrets(&connection, &secrets); + + assert!(connection.contains("SAS_TOKEN='sig=secret-token'")); + assert!(masked.contains("SAS_TOKEN='[REDACTED]'")); + assert!(!masked.contains("sig=secret-token")); + } + + #[test] + fn test_build_copy_target_decodes_file_uri_path() { + let storage = ObjectStoreConfig::default(); + let target = build_copy_target("file:///tmp/my%20backup", &storage, "public", 7) + .expect("file:// copy target should be built"); + + assert_eq!(target.location, "/tmp/my backup/data/public/7/"); + } +} diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs index 2db71d5326..ec860fecfa 100644 --- a/src/cli/src/data/export_v2/error.rs +++ b/src/cli/src/data/export_v2/error.rs @@ -72,17 +72,55 @@ pub enum Error { }, #[snafu(display( - "Cannot resume schema-only snapshot with data export. Use --force to recreate." + "Cannot resume snapshot with a different schema_only mode (existing: {}, requested: {}). Use --force to recreate.", + existing_schema_only, + requested_schema_only ))] - CannotResumeSchemaOnly { + SchemaOnlyModeMismatch { + existing_schema_only: bool, + requested_schema_only: bool, #[snafu(implicit)] location: Location, }, #[snafu(display( - "Data export is not implemented yet. Use --schema-only to create a schema snapshot." + "Cannot resume snapshot with different {} (existing: {}, requested: {}). 
Use --force to recreate.", + field, + existing, + requested ))] - DataExportNotImplemented { + ResumeConfigMismatch { + field: String, + existing: String, + requested: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse time: invalid format: {}", input))] + TimeParseInvalidFormat { + input: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse time: end_time is before start_time"))] + TimeParseEndBeforeStart { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "chunk_time_window requires both --start-time and --end-time to be specified" + ))] + ChunkTimeWindowRequiresBounds { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("--schema-only cannot be used with data export arguments: {}", args))] + SchemaOnlyArgsNotAllowed { + args: String, #[snafu(implicit)] location: Location, }, @@ -154,9 +192,13 @@ impl ErrorExt for Error { match self { Error::InvalidUri { .. } | Error::UnsupportedScheme { .. } - | Error::CannotResumeSchemaOnly { .. } - | Error::DataExportNotImplemented { .. } - | Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments, + | Error::SchemaOnlyModeMismatch { .. } + | Error::ResumeConfigMismatch { .. } + | Error::ManifestVersionMismatch { .. } + | Error::SchemaOnlyArgsNotAllowed { .. } => StatusCode::InvalidArguments, + Error::TimeParseInvalidFormat { .. } + | Error::TimeParseEndBeforeStart { .. } + | Error::ChunkTimeWindowRequiresBounds { .. } => StatusCode::InvalidArguments, Error::StorageOperation { .. } | Error::ManifestParse { .. } diff --git a/src/cli/src/data/export_v2/manifest.rs b/src/cli/src/data/export_v2/manifest.rs index 0ebf753fa4..918288bb51 100644 --- a/src/cli/src/data/export_v2/manifest.rs +++ b/src/cli/src/data/export_v2/manifest.rs @@ -14,12 +14,19 @@ //! Manifest data structures for Export/Import V2. 
+use std::time::Duration; use std::{fmt, str}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; +use crate::data::export_v2::chunker::generate_chunks; +use crate::data::export_v2::error::{ + ChunkTimeWindowRequiresBoundsSnafu, Result as ExportResult, TimeParseEndBeforeStartSnafu, + TimeParseInvalidFormatSnafu, +}; + /// Current manifest format version. pub const MANIFEST_VERSION: u32 = 1; @@ -55,6 +62,31 @@ impl TimeRange { pub fn is_unbounded(&self) -> bool { self.start.is_none() && self.end.is_none() } + + /// Returns true if both bounds are specified. + pub fn is_bounded(&self) -> bool { + self.start.is_some() && self.end.is_some() + } + + /// Parses a time range from optional RFC3339 strings. + pub fn parse(start: Option<&str>, end: Option<&str>) -> ExportResult { + let start = start.map(parse_time).transpose()?; + let end = end.map(parse_time).transpose()?; + + if let (Some(start), Some(end)) = (start, end) + && end < start + { + return TimeParseEndBeforeStartSnafu.fail(); + } + + Ok(Self::new(start, end)) + } +} + +fn parse_time(input: &str) -> ExportResult> { + DateTime::parse_from_rfc3339(input) + .map(|dt| dt.with_timezone(&Utc)) + .map_err(|_| TimeParseInvalidFormatSnafu { input }.build()) } impl Default for TimeRange { @@ -74,6 +106,8 @@ pub enum ChunkStatus { InProgress, /// Chunk export completed successfully. Completed, + /// Chunk had no data to export. + Skipped, /// Chunk export failed. Failed, } @@ -111,6 +145,13 @@ impl ChunkMeta { } } + /// Creates a skipped chunk with the given id and time range. + pub fn skipped(id: u32, time_range: TimeRange) -> Self { + let mut chunk = Self::new(id, time_range); + chunk.mark_skipped(); + chunk + } + /// Marks this chunk as in progress. pub fn mark_in_progress(&mut self) { self.status = ChunkStatus::InProgress; @@ -125,6 +166,14 @@ impl ChunkMeta { self.error = None; } + /// Marks this chunk as skipped because no data files were produced. 
+ pub fn mark_skipped(&mut self) { + self.status = ChunkStatus::Skipped; + self.files.clear(); + self.checksum = None; + self.error = None; + } + /// Marks this chunk as failed with the given error message. pub fn mark_failed(&mut self, error: String) { self.status = ChunkStatus::Failed; @@ -210,6 +259,35 @@ pub struct Manifest { } impl Manifest { + pub fn new_for_export( + catalog: String, + schemas: Vec, + schema_only: bool, + time_range: TimeRange, + format: DataFormat, + chunk_time_window: Option, + ) -> ExportResult { + if chunk_time_window.is_some() && !time_range.is_bounded() { + return ChunkTimeWindowRequiresBoundsSnafu.fail(); + } + + let mut manifest = if schema_only { + Self::new_schema_only(catalog, schemas) + } else { + Self::new_full(catalog, schemas, time_range, format) + }; + + if !schema_only { + manifest.chunks = match chunk_time_window { + Some(window) => generate_chunks(&manifest.time_range, window), + None => generate_single_chunk(&manifest.time_range), + }; + manifest.touch(); + } + + Ok(manifest) + } + /// Creates a new manifest for schema-only export. pub fn new_schema_only(catalog: String, schemas: Vec) -> Self { let now = Utc::now(); @@ -258,7 +336,7 @@ impl Manifest { && self .chunks .iter() - .all(|c| c.status == ChunkStatus::Completed)) + .all(|c| matches!(c.status, ChunkStatus::Completed | ChunkStatus::Skipped))) } /// Returns the number of pending chunks. @@ -285,6 +363,14 @@ impl Manifest { .count() } + /// Returns the number of skipped chunks. + pub fn skipped_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Skipped) + .count() + } + /// Returns the number of failed chunks. 
pub fn failed_count(&self) -> usize { self.chunks @@ -313,8 +399,24 @@ impl Manifest { } } +fn generate_single_chunk(time_range: &TimeRange) -> Vec { + if let (Some(start), Some(end)) = (time_range.start, time_range.end) { + if start == end { + return vec![ChunkMeta::skipped(1, time_range.clone())]; + } + if start > end { + return Vec::new(); + } + } + vec![ChunkMeta::new(1, time_range.clone())] +} + #[cfg(test)] mod tests { + use std::time::Duration; + + use chrono::{TimeZone, Utc}; + use super::*; #[test] @@ -338,6 +440,26 @@ mod tests { assert!(manifest.is_complete()); } + #[test] + fn test_generate_single_chunk_zero_width_range_is_skipped() { + let ts = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let chunks = generate_single_chunk(&TimeRange::new(Some(ts), Some(ts))); + + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].status, ChunkStatus::Skipped); + assert_eq!(chunks[0].time_range.start, Some(ts)); + assert_eq!(chunks[0].time_range.end, Some(ts)); + } + + #[test] + fn test_generate_single_chunk_invalid_range_is_empty() { + let start = Utc.with_ymd_and_hms(2025, 1, 1, 1, 0, 0).unwrap(); + let end = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let chunks = generate_single_chunk(&TimeRange::new(Some(start), Some(end))); + + assert!(chunks.is_empty()); + } + #[test] fn test_manifest_full() { let manifest = Manifest::new_full( @@ -377,5 +499,71 @@ mod tests { ); assert_eq!(chunk.status, ChunkStatus::Completed); assert_eq!(chunk.files.len(), 1); + + chunk.mark_skipped(); + assert_eq!(chunk.status, ChunkStatus::Skipped); + assert!(chunk.files.is_empty()); + } + + #[test] + fn test_manifest_is_complete_when_chunks_are_completed_or_skipped() { + let mut manifest = Manifest::new_full( + "greptime".to_string(), + vec!["public".to_string()], + TimeRange::unbounded(), + DataFormat::Parquet, + ); + manifest.add_chunk(ChunkMeta::new(1, TimeRange::unbounded())); + manifest.add_chunk(ChunkMeta::new(2, TimeRange::unbounded())); + + 
manifest.update_chunk(1, |chunk| { + chunk.mark_completed(vec!["a.parquet".to_string()], None) + }); + manifest.update_chunk(2, |chunk| chunk.mark_skipped()); + + assert!(manifest.is_complete()); + assert_eq!(manifest.completed_count(), 1); + assert_eq!(manifest.skipped_count(), 1); + } + + #[test] + fn test_manifest_chunk_time_window_none_single_chunk() { + let start = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap(); + let end = Utc.with_ymd_and_hms(2025, 1, 2, 0, 0, 0).unwrap(); + let range = TimeRange::new(Some(start), Some(end)); + let manifest = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string()], + false, + range.clone(), + DataFormat::Parquet, + None, + ) + .unwrap(); + + assert_eq!(manifest.chunks.len(), 1); + assert_eq!(manifest.chunks[0].time_range, range); + } + + #[test] + fn test_time_range_parse_requires_order() { + let result = TimeRange::parse(Some("2025-01-02T00:00:00Z"), Some("2025-01-01T00:00:00Z")); + assert!(result.is_err()); + } + + #[test] + fn test_new_for_export_with_chunk_window_requires_bounded_range() { + let result = Manifest::new_for_export( + "greptime".to_string(), + vec!["public".to_string()], + false, + TimeRange::new( + None, + Some(Utc.with_ymd_and_hms(2025, 1, 2, 0, 0, 0).unwrap()), + ), + DataFormat::Parquet, + Some(Duration::from_secs(3600)), + ); + assert!(result.is_err()); } } diff --git a/src/cli/src/data/import_v2/command.rs b/src/cli/src/data/import_v2/command.rs index 544763d92b..6a9d440071 100644 --- a/src/cli/src/data/import_v2/command.rs +++ b/src/cli/src/data/import_v2/command.rs @@ -27,7 +27,8 @@ use crate::Tool; use crate::common::ObjectStoreConfig; use crate::data::export_v2::manifest::MANIFEST_VERSION; use crate::data::import_v2::error::{ - ManifestVersionMismatchSnafu, Result, SchemaNotInSnapshotSnafu, SnapshotStorageSnafu, + FullSnapshotImportNotSupportedSnafu, ManifestVersionMismatchSnafu, Result, + SchemaNotInSnapshotSnafu, SnapshotStorageSnafu, }; use 
crate::data::import_v2::executor::{DdlExecutor, DdlStatement}; use crate::data::path::ddl_path_for_schema; @@ -58,10 +59,6 @@ pub struct ImportV2Command { #[clap(long)] dry_run: bool, - /// Concurrency level (for future use). - #[clap(long, default_value = "1")] - parallelism: usize, - /// Basic authentication (user:password). #[clap(long)] auth_basic: Option, @@ -121,7 +118,6 @@ impl ImportV2Command { Ok(Box::new(Import { schemas, dry_run: self.dry_run, - _parallelism: self.parallelism, storage: Box::new(storage), database_client, })) @@ -132,7 +128,6 @@ impl ImportV2Command { pub struct Import { schemas: Option>, dry_run: bool, - _parallelism: usize, storage: Box, database_client: DatabaseClient, } @@ -169,6 +164,13 @@ impl Import { info!("Snapshot contains {} schema(s)", manifest.schemas.len()); + if !manifest.schema_only && !manifest.chunks.is_empty() { + return FullSnapshotImportNotSupportedSnafu { + chunk_count: manifest.chunks.len(), + } + .fail(); + } + // 2. Determine schemas to import let schemas_to_import = match &self.schemas { Some(filter) => canonicalize_schema_filter(filter, &manifest.schemas)?, @@ -203,14 +205,6 @@ impl Import { ddl_statements.len() ); - // 6. Data import would happen here for non-schema-only snapshots (M2/M3) - if !manifest.schema_only && !manifest.chunks.is_empty() { - info!( - "Data import not yet implemented (M3). 
{} chunks pending.", - manifest.chunks.len() - ); - } - Ok(()) } @@ -403,7 +397,114 @@ fn canonicalize_schema_filter( #[cfg(test)] mod tests { + use std::time::Duration; + + use async_trait::async_trait; + use super::*; + use crate::Tool; + use crate::data::export_v2::manifest::{ChunkMeta, DataFormat, Manifest, TimeRange}; + use crate::data::export_v2::schema::SchemaSnapshot; + use crate::data::snapshot_storage::SnapshotStorage; + use crate::database::DatabaseClient; + + struct StubStorage { + manifest: Manifest, + } + + #[async_trait] + impl SnapshotStorage for StubStorage { + async fn exists(&self) -> crate::data::export_v2::error::Result { + Ok(true) + } + + async fn read_manifest(&self) -> crate::data::export_v2::error::Result { + Ok(self.manifest.clone()) + } + + async fn write_manifest( + &self, + _manifest: &Manifest, + ) -> crate::data::export_v2::error::Result<()> { + unimplemented!("not needed in import_v2::command tests") + } + + async fn read_text(&self, _path: &str) -> crate::data::export_v2::error::Result { + unimplemented!("not needed in import_v2::command tests") + } + + async fn write_text( + &self, + _path: &str, + _content: &str, + ) -> crate::data::export_v2::error::Result<()> { + unimplemented!("not needed in import_v2::command tests") + } + + async fn write_schema( + &self, + _snapshot: &SchemaSnapshot, + ) -> crate::data::export_v2::error::Result<()> { + unimplemented!("not needed in import_v2::command tests") + } + + async fn create_dir_all(&self, _path: &str) -> crate::data::export_v2::error::Result<()> { + unimplemented!("not needed in import_v2::command tests") + } + + async fn list_files_recursive( + &self, + _prefix: &str, + ) -> crate::data::export_v2::error::Result> { + unimplemented!("not needed in import_v2::command tests") + } + + async fn delete_snapshot(&self) -> crate::data::export_v2::error::Result<()> { + unimplemented!("not needed in import_v2::command tests") + } + } + + fn test_database_client() -> DatabaseClient { + 
DatabaseClient::new( + "127.0.0.1:4000".to_string(), + "greptime".to_string(), + None, + Duration::from_secs(1), + None, + false, + ) + } + + #[tokio::test] + async fn test_import_rejects_full_snapshot_before_schema_execution() { + let mut manifest = Manifest::new_full( + "greptime".to_string(), + vec!["public".to_string()], + TimeRange::unbounded(), + DataFormat::Parquet, + ); + manifest + .chunks + .push(ChunkMeta::new(1, TimeRange::unbounded())); + + let import = Import { + schemas: None, + dry_run: false, + storage: Box::new(StubStorage { manifest }), + database_client: test_database_client(), + }; + + let error = import + .do_work() + .await + .expect_err("full snapshot import should fail"); + + assert!( + error + .to_string() + .contains("Importing data from full snapshots is not implemented yet") + ); + } #[test] fn test_parse_ddl_statements() { diff --git a/src/cli/src/data/import_v2/error.rs b/src/cli/src/data/import_v2/error.rs index 5ae3db1583..169f11c0fa 100644 --- a/src/cli/src/data/import_v2/error.rs +++ b/src/cli/src/data/import_v2/error.rs @@ -45,6 +45,16 @@ pub enum Error { location: Location, }, + #[snafu(display( + "Importing data from full snapshots is not implemented yet (snapshot has {} chunk(s))", + chunk_count + ))] + FullSnapshotImportNotSupported { + chunk_count: usize, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Snapshot storage error"))] SnapshotStorage { #[snafu(source)] @@ -67,10 +77,10 @@ pub type Result = std::result::Result; impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - Error::SnapshotNotFound { .. } | Error::SchemaNotInSnapshot { .. } => { - StatusCode::InvalidArguments - } - Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments, + Error::SnapshotNotFound { .. } + | Error::SchemaNotInSnapshot { .. } + | Error::ManifestVersionMismatch { .. } + | Error::FullSnapshotImportNotSupported { .. } => StatusCode::InvalidArguments, Error::Database { error, .. 
} => error.status_code(), Error::SnapshotStorage { error, .. } => error.status_code(), } diff --git a/src/cli/src/data/path.rs b/src/cli/src/data/path.rs index 2e0f5d3f1a..2df81f62c8 100644 --- a/src/cli/src/data/path.rs +++ b/src/cli/src/data/path.rs @@ -25,6 +25,10 @@ pub(crate) fn ddl_path_for_schema(schema: &str) -> String { ) } +pub(crate) fn data_dir_for_schema_chunk(schema: &str, chunk_id: u32) -> String { + format!("data/{}/{}/", encode_path_segment(schema), chunk_id) +} + pub(crate) fn encode_path_segment(value: &str) -> String { let mut encoded = String::with_capacity(value.len()); for byte in value.bytes() { @@ -73,4 +77,13 @@ mod tests { "schema/ddl/%2E%2E%2Fevil.sql" ); } + + #[test] + fn test_data_dir_for_schema_chunk_encodes_schema_segment() { + assert_eq!(data_dir_for_schema_chunk("public", 1), "data/public/1/"); + assert_eq!( + data_dir_for_schema_chunk("../evil", 7), + "data/%2E%2E%2Fevil/7/" + ); + } } diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs index 50c8734a67..6bc71153df 100644 --- a/src/cli/src/data/snapshot_storage.rs +++ b/src/cli/src/data/snapshot_storage.rs @@ -18,9 +18,12 @@ //! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem). 
use async_trait::async_trait; +use futures::TryStreamExt; use object_store::services::{Azblob, Fs, Gcs, Oss, S3}; use object_store::util::{with_instrument_layers, with_retry_layers}; -use object_store::{AzblobConnection, GcsConnection, ObjectStore, OssConnection, S3Connection}; +use object_store::{ + AzblobConnection, ErrorKind, GcsConnection, ObjectStore, OssConnection, S3Connection, +}; use snafu::ResultExt; use url::Url; @@ -139,14 +142,14 @@ fn extract_file_path_from_uri(uri: &str) -> Result { .fail(), _ => url .to_file_path() - .map(|path| path.to_string_lossy().into_owned()) .map_err(|_| { InvalidUriSnafu { uri, - reason: "file:// URI must use a valid absolute filesystem path", + reason: "file:// URI must use an absolute path like file:///tmp/backup", } .build() - }), + }) + .map(|path| path.to_string_lossy().into_owned()), } } @@ -184,6 +187,12 @@ pub trait SnapshotStorage: Send + Sync { /// Reads a text file from a relative path under the snapshot root. async fn read_text(&self, path: &str) -> Result; + /// Creates a directory-like prefix under the snapshot root when needed by the backend. + async fn create_dir_all(&self, path: &str) -> Result<()>; + + /// Lists files recursively under a relative prefix. + async fn list_files_recursive(&self, prefix: &str) -> Result>; + /// Deletes the entire snapshot (for --force). 
async fn delete_snapshot(&self) -> Result<()>; } @@ -443,6 +452,38 @@ impl SnapshotStorage for OpenDalStorage { String::from_utf8(data).context(TextDecodeSnafu) } + async fn create_dir_all(&self, path: &str) -> Result<()> { + self.object_store + .create_dir(path) + .await + .context(StorageOperationSnafu { + operation: format!("create dir {}", path), + }) + } + + async fn list_files_recursive(&self, prefix: &str) -> Result> { + let mut lister = match self.object_store.lister_with(prefix).recursive(true).await { + Ok(lister) => lister, + Err(error) if error.kind() == ErrorKind::NotFound => return Ok(Vec::new()), + Err(error) => { + return Err(error).context(StorageOperationSnafu { + operation: format!("list {}", prefix), + }); + } + }; + + let mut files = Vec::new(); + while let Some(entry) = lister.try_next().await.context(StorageOperationSnafu { + operation: format!("list {}", prefix), + })? { + if entry.metadata().is_dir() { + continue; + } + files.push(entry.path().to_string()); + } + Ok(files) + } + async fn delete_snapshot(&self) -> Result<()> { self.object_store .remove_all("/") @@ -533,6 +574,14 @@ mod tests { extract_file_path_from_uri("file://localhost/tmp/backup").unwrap(), "/tmp/backup" ); + assert_eq!( + extract_file_path_from_uri("file:///tmp/my%20backup").unwrap(), + "/tmp/my backup" + ); + assert_eq!( + extract_file_path_from_uri("file://localhost/tmp/my%20backup").unwrap(), + "/tmp/my backup" + ); } #[test] From 58ac889818a17f2d7350088582c973ba1561fd2b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 1 Apr 2026 09:44:13 +0800 Subject: [PATCH 060/195] chore(deps): bump rustls-webpki from 0.103.3 to 0.103.10 (#7891) Bumps [rustls-webpki](https://github.com/rustls/webpki) from 0.103.3 to 0.103.10. 
- [Release notes](https://github.com/rustls/webpki/releases) - [Commits](https://github.com/rustls/webpki/compare/v/0.103.3...v/0.103.10) --- updated-dependencies: - dependency-name: rustls-webpki dependency-version: 0.103.10 dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cfec1c5f54..525ac56f92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7295,7 +7295,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -11595,9 +11595,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ "ring", "rustls-pki-types", From 03b2f94821e54b6f03f4b78cf249ed601dc12fea Mon Sep 17 00:00:00 2001 From: liyang Date: Wed, 1 Apr 2026 09:59:37 +0800 Subject: [PATCH 061/195] chore: Update Dockerfile (#7893) * chore: Update Dockerfile * Update update-dev-builder-version.sh --- .github/scripts/update-dev-builder-version.sh | 7 +++++-- docker/dev-builder/centos/Dockerfile | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/scripts/update-dev-builder-version.sh b/.github/scripts/update-dev-builder-version.sh index 38466760a4..a593f385fd 100755 --- a/.github/scripts/update-dev-builder-version.sh +++ b/.github/scripts/update-dev-builder-version.sh @@ -30,8 +30,11 @@ update_dev_builder_version() { --body "This PR updates the dev-builder image tag" \ --base main \ --head $BRANCH_NAME \ - --reviewer 
zyy17 \ - --reviewer daviderli614 + --reviewer sunng87 \ + --reviewer daviderli614 \ + --reviewer killme2008 \ + --reviewer evenyag \ + --reviewer fengjiachun } update_dev_builder_version diff --git a/docker/dev-builder/centos/Dockerfile b/docker/dev-builder/centos/Dockerfile index 8b29bb3065..344bc1ec2a 100644 --- a/docker/dev-builder/centos/Dockerfile +++ b/docker/dev-builder/centos/Dockerfile @@ -7,7 +7,7 @@ RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo # Install dependencies -RUN ulimit -n 1024000 && yum groupinstall -y 'Development Tools' +RUN yum groupinstall -y 'Development Tools' RUN yum install -y epel-release \ openssl \ openssl-devel \ From 0bd0df0e88d6f6da78bd25d195ccf4640f2b6d1e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 1 Apr 2026 02:20:05 +0000 Subject: [PATCH 062/195] chore(deps): bump tar from 0.4.44 to 0.4.45 (#7890) Bumps [tar](https://github.com/alexcrichton/tar-rs) from 0.4.44 to 0.4.45. - [Commits](https://github.com/alexcrichton/tar-rs/compare/0.4.44...0.4.45) --- updated-dependencies: - dependency-name: tar dependency-version: 0.4.45 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 525ac56f92..401ac3b1ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6277,7 +6277,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.57.0", + "windows-core 0.61.2", ] [[package]] @@ -13365,9 +13365,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" dependencies = [ "filetime", "libc", From b4492ee39d8fdaa824a2f396fbc1508afd484042 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:43:25 +0800 Subject: [PATCH 063/195] ci: update dev-builder image tag (#7894) * chore: Update Dockerfile * ci: update dev-builder image tag Signed-off-by: greptimedb-ci --------- Signed-off-by: greptimedb-ci Co-authored-by: liyang Co-authored-by: greptimedb-ci Co-authored-by: Ning Sun --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 60ea01a3ce..3fd09ad4ea 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ CARGO_BUILD_OPTS := --locked IMAGE_REGISTRY ?= docker.io IMAGE_NAMESPACE ?= greptime IMAGE_TAG ?= latest -DEV_BUILDER_IMAGE_TAG ?= 2025-10-01-8fe17d43-20251011080129 +DEV_BUILDER_IMAGE_TAG ?= 2026-03-21-9c9d9e9e-20260331090344 BUILDX_MULTI_PLATFORM_BUILD ?= false BUILDX_BUILDER_NAME ?= gtbuilder BASE_IMAGE ?= ubuntu From 2b4e12c358818ef0829cc524aa56bbe38cc37980 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:45:26 +0800 
Subject: [PATCH 064/195] feat: auto-align Prometheus schemas in pending rows batching (#7877) * feat/auto-schema-align: - **Error Handling Improvements**: - Removed `CatalogSnafu` context from various `.await` calls in `dashboard.rs`, `influxdb.rs`, `jaeger.rs`, `prometheus.rs`, `event.rs`, and `pipeline.rs` to streamline error handling. - **Prometheus Store Enhancements**: - Added support for auto-creating tables and adding missing Prometheus tag columns in `prom_store.rs` and `pending_rows_batcher.rs`. - Introduced `PendingRowsSchemaAlterer` trait for schema alterations in `pending_rows_batcher.rs`. - **Test Additions**: - Added tests for new Prometheus store functionalities in `prom_store.rs` and `pending_rows_batcher.rs`. - **Error Message Improvements**: - Enhanced error messages for catalog access in `error.rs`. - **Server Configuration Updates**: - Updated server configuration to include Prometheus store options in `server.rs`. Signed-off-by: Lei, HUANG * reformat Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Add DataTypes Error Handling and Column Renaming Logic - **`error.rs`**: Introduced a new `DataTypes` error variant to handle errors from `datatypes::error::Error`. Updated `ErrorExt` implementation to include `DataTypes`. - **`pending_rows_batcher.rs`**: Added functions `find_prom_special_column_names` and `rename_prom_special_columns_for_existing_schema` to handle renaming of special Prometheus columns. Updated `build_prom_create_table_schema` to simplify error handling with `ConcreteDataType`. - **Tests**: Added a test case `test_rename_prom_special_columns_for_existing_schema` to verify the renaming logic for Prometheus special columns. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - Refactored `PendingRowsBatcher` to accommodate Prometheus record batches: - Introduced `accommodate_record_batch_for_target_schema` to normalize incoming record batches against existing table schemas. 
- Removed `collect_missing_prom_tag_columns` and `rename_prom_special_columns_for_existing_schema` in favor of the new function. - Added `unzip_logical_region_schema` to extract schema components. - Updated tests in `pending_rows_batcher.rs`: - Added tests for `accommodate_record_batch_for_target_schema` to verify handling of missing tag columns and renaming of special columns. - Ensured error handling for missing timestamp and field columns in target schema. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Commit Summary - **Enhancement in Table Creation Logic**: Updated `prom_store.rs` to modify the handling of `table_options` during table creation. Specifically, `table_options` are now extended differently based on the `AutoCreateTableType`. For `Physical` tables, enforced `sst_format=flat` to optimize pending-rows writes by leveraging bulk memtables. Signed-off-by: Lei, HUANG * feat/auto-schema-align: Enhance Performance Monitoring in `pending_rows_batcher.rs` - Added performance monitoring timers to various stages of the `PendingRowsBatcher` process, including schema cache checks, table resolution, schema creation, and record batch alignment. - Improved schema handling by adding timers around schema alteration and missing column addition processes. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - **Enhance Concurrent Write Handling**: Introduced `FlushRegionWrite` and `FlushWriteResult` structs to manage region writes and their results. Added `flush_region_writes_concurrently` function to handle concurrent flushing of region writes based on `should_dispatch_concurrently` logic in `pending_rows_batcher.rs`. - **Testing Enhancements**: Added tests for concurrent dispatching of region writes and the logic for determining concurrent dispatch in `pending_rows_batcher.rs`. 
Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Add Histogram for Flush Stage Elapsed Time - **`metrics.rs`**: Introduced a new `HistogramVec` named `PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED` to track the elapsed time of pending rows batch flush stages. - **`pending_rows_batcher.rs`**: Replaced instances of `PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED` with `PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED` to measure the elapsed time for various flush stages, including `flush_write_region`, `flush_concat_table_batches`, `flush_resolve_table`, `flush_fetch_partition_rule`, `flush_split_record_batch`, `flush_filter_record_batch`, `flush_resolve_region_leader`, and `flush_encode_ipc`. Signed-off-by: Lei, HUANG * Add design doc for physical table batching in PendingRowsBatcher Signed-off-by: Lei, HUANG * Add implementation plan for physical table batching in PendingRowsBatcher * feat/auto-schema-align: ### Commit Message **Enhance Metric Engine with Physical Batch Processing** - **Add `metric-engine` Dependency**: Updated `Cargo.lock` and `Cargo.toml` to include `metric-engine` as a workspace dependency. - **Expose Batch Modifier Functions**: Changed visibility of `TagColumnInfo`, `compute_tsid_array`, and `modify_batch_sparse` in `batch_modifier.rs` to public, and made `batch_modifier` a public module in `lib.rs`. - **Implement Physical Batch Processing**: - Added functions `bulk_insert_physical_region` and `bulk_insert_logical_region` in `bulk_insert.rs` to handle physical and logical batch insertions. - Updated `pending_rows_batcher.rs` to attempt physical batch processing before falling back to logical processing, including new functions `flush_batch_physical` and `flush_batch_per_logical_table`. - **Enhance Testing**: - Added tests for physical region passthrough and empty batch handling in `bulk_insert.rs`. - Introduced `with_mito_config` in `test_util.rs` for customized test environments. 
Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Enhance Batch Processing for Table Creation and Alteration - **`prom_store.rs`**: - Added `create_tables_if_missing_batch` and `add_missing_prom_tag_columns_batch` methods to handle batch creation of tables and batch alteration to add missing tag columns. - Implemented logic to determine missing tables and columns, and perform batch operations accordingly. - **`pending_rows_batcher.rs`**: - Updated `PendingRowsBatcher` to utilize batch methods for creating tables and adding missing columns. - Enhanced logic to resolve table schemas and accommodate record batches after batch operations. Signed-off-by: Lei, HUANG * perf: concurrent catalog lookups and eliminate redundant concat_batches on ingest path Replace sequential catalog_manager.table() calls with concurrent futures::future::join_all in align_table_batches_to_region_schema. This affects all three lookup loops: initial table resolution, post-create resolution, and post-alter schema refresh. Reduces O(N) sequential RPC latency to O(1) wall-clock time for requests with many distinct logical tables (e.g. Prometheus remote_write). Remove the per-logical-table concat_batches in flush_batch_physical. Instead of merging all chunks of a table into one RecordBatch before calling modify_batch_sparse, apply modify_batch_sparse directly to each chunk and collect all modified chunks for a single final concat. This eliminates one full data copy per logical table on the flush path. * refactor: extract Prometheus schema alignment helpers into prom_row_builder module Move six functions and their eight unit tests from pending_rows_batcher.rs (~2386 lines) into a new prom_row_builder.rs module (~776 lines), leaving the batcher at ~1665 lines focused on flush/worker machinery.
Extracted functions: - accommodate_record_batch_for_target_schema (normalize incoming batch against existing table schema) - unzip_logical_region_schema (extract ts/field/tag columns) - build_prom_create_table_schema (build ColumnSchema vec for table creation) - align_record_batch_to_schema (reorder/fill/cast columns to target schema) - rows_to_record_batch (convert proto Rows to Arrow RecordBatch) - build_arrow_array (build Arrow arrays from proto values) Cleaned up 12 now-unused imports from pending_rows_batcher.rs. * feat/auto-schema-align: ### Enhance `PendingRowsBatcher` and `prom_row_builder` for Efficient Schema Handling - **`pending_rows_batcher.rs`:** - Refactored `submit` method to integrate table batch building and alignment into a single method `build_and_align_table_batches`. - Removed intermediate `RecordBatch` creation, optimizing the process by directly converting proto `RowInsertRequests` into aligned `RecordBatch`es. - Enhanced schema handling by identifying missing columns directly from proto schemas. - **`prom_row_builder.rs`:** - Introduced `rows_to_aligned_record_batch` for direct conversion of proto `Rows` into aligned `RecordBatch`es. - Added `identify_missing_columns_from_proto` to detect absent tag columns without intermediate `RecordBatch`. - Implemented `build_prom_create_table_schema_from_proto` to construct table schemas directly from proto schemas. Signed-off-by: Lei, HUANG * feat/auto-schema-align: Add elapsed time metrics for bulk insert operations - Updated `bulk_insert` method in `bulk_insert.rs` to record elapsed time metrics using `MITO_OPERATION_ELAPSED` for both physical and logical regions. - Added a new test `test_bulk_insert_records_elapsed_metric` to verify that the elapsed time metric is recorded correctly during bulk insert operations. 
Signed-off-by: Lei, HUANG * remove flush per logical region Signed-off-by: Lei, HUANG * feat/auto-schema-align: **Refactor `flush_batch` and `flush_batch_physical` functions** - Removed unused `catalog` and `schema` variables from `flush_batch` in `pending_rows_batcher.rs`. - Updated `flush_batch_physical` to directly use `ctx.current_catalog()` and `ctx.current_schema()` for resolving table names. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Remove Unused Function and Associated Test - **File:** `src/servers/src/prom_row_builder.rs` - Removed the unused function `build_prom_create_table_schema` which was responsible for building a `Vec` from an Arrow schema. - Deleted the associated test `test_build_prom_create_table_schema_from_request_schema` that validated the removed function. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - **Remove Test**: Deleted the `test_bulk_insert_records_elapsed_metric` test from `bulk_insert.rs`. - **Refactor Table Resolution**: Introduced `TableResolutionPlan` struct and refactored table resolution logic in `pending_rows_batcher.rs`. - **Enhance Table Handling**: Added functions for collecting non-empty table rows, unique table schemas, and handling table creation and alteration in `pending_rows_batcher.rs`. - **Add Tests**: Implemented tests for `collect_non_empty_table_rows` and `collect_unique_table_schemas` in `pending_rows_batcher.rs`. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - **Refactor Error Handling**: Updated error handling in `pending_rows_batcher.rs` and `prom_row_builder.rs` to use `Snafu` error context for more descriptive error messages. - **Remove Unused Functionality**: Eliminated the `rows_to_record_batch` function and related test in `prom_row_builder.rs` as it was redundant. - **Simplify Function Return Types**: Modified `rows_to_aligned_record_batch` in `prom_row_builder.rs` to return only `RecordBatch` without missing columns, simplifying the function's interface and related tests. 
Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Add Helper Function for Table Options in `prom_store.rs` - Introduced `fill_metric_physical_table_options` function to encapsulate logic for setting table options, ensuring the use of flat SST format and physical table metadata. - Updated `Instance` implementation to utilize the new helper function for setting table options. - Added a unit test `test_metric_physical_table_options_forces_flat_sst_format` to verify the correct application of table options. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - **Refactor `PendingRowsBatcher`**: Simplified worker retrieval logic in `get_or_spawn_worker` method by using a more concise conditional check. - **Metrics Update**: Added `PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED` metric in `pending_rows_batcher.rs`. - **Remove Unused Code**: Deleted multiple test functions related to record batch alignment and schema preparation in `pending_rows_batcher.rs` and `prom_row_builder.rs`. - **Function Visibility Change**: Made `build_prom_create_table_schema_from_proto` public in `prom_row_builder.rs`. Signed-off-by: Lei, HUANG * chore: remove plan Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Refactor and Simplify Schema Alteration Logic - **Removed Unused Methods**: Deleted `create_table_if_missing` and `add_missing_prom_tag_columns` methods from `PendingRowsSchemaAlterer` trait in `prom_store.rs` and `pending_rows_batcher.rs`. - **Error Handling Improvement**: Enhanced error handling in `create_tables_if_missing_batch` method to return a specific error message for unsupported `AutoCreateTableType` in `prom_store.rs`. - **Visibility Change**: Made `as_str` method public in `AutoCreateTableType` enum in `insert.rs` to support external access. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Commit Message Improve safety in `prom_row_builder.rs` - Updated `unzip_logical_region_schema` to use `saturating_sub` for safer capacity calculation of `tag_columns`. 
Signed-off-by: Lei, HUANG * feat/auto-schema-align: Add TODO comments for future improvements in `pending_rows_batcher.rs` - Added a TODO comment to consider bounding the `flush_region_writes_concurrently` function. - Added a TODO comment to potentially limit the maximum rows to concatenate in the `flush_batch_physical` function. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Commit Message Enhance error handling in `pending_rows_batcher.rs` - Updated `collect_unique_table_schemas` to return a `Result` type, enabling error handling for duplicate table names. - Modified the function to return an error when duplicate table names are found in `table_rows`. - Adjusted test cases to handle the new `Result` return type in `collect_unique_table_schemas`. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - **Refactor `partition_columns` Method**: Updated the `partition_columns` method in `multi_dim.rs`, `partition.rs`, and `splitter.rs` to return a slice reference instead of a cloned vector, improving performance by avoiding unnecessary cloning. - **Enhance Partition Handling**: Added functions `collect_tag_columns_and_non_tag_indices` and `strip_partition_columns_from_batch` in `pending_rows_batcher.rs` to manage partition columns more efficiently, including stripping partition columns from record batches. - **Update Tests**: Modified existing tests and added new ones in `pending_rows_batcher.rs` to verify the functionality of partition column handling, ensuring correct behavior of the new methods. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Enhance Schema Handling and Validation in `pending_rows_batcher.rs` - **Schema Validation Enhancements**: - Added checks for essential columns (`timestamp`, `value`) in `collect_tag_columns_and_non_tag_indices`. - Introduced `PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT` to ensure minimum column count in `strip_partition_columns_from_batch`. - Improved error handling for unexpected data types and duplicated columns. 
- **Function Modifications**: - Updated `strip_partition_columns_from_batch` to project essential columns without lookup. - Modified `flush_batch_physical` to use `essential_col_indices` instead of `non_tag_indices`. - **Test Enhancements**: - Added tests for schema validation, including checks for unexpected data types and duplicated columns. - Verified correct projection of essential columns in `strip_partition_columns_from_batch`. Files affected: `pending_rows_batcher.rs`, `tests`. Signed-off-by: Lei, HUANG * feat/auto-schema-align: - **Add `smallvec` Dependency**: Updated `Cargo.lock` and `Cargo.toml` to include `smallvec` as a workspace dependency. - **Refactor Function**: Renamed `collect_tag_columns_and_non_tag_indices` to `columns_taxonomy` in `pending_rows_batcher.rs` and updated its return type to use `SmallVec`. - **Update Tests**: Modified test cases in `pending_rows_batcher.rs` to reflect changes in function name and return type. Signed-off-by: Lei, HUANG * feat/auto-schema-align: **Refactor `pending_rows_batcher.rs` to Simplify Table ID Handling** - Updated `TableBatch` struct to use `TableId` directly instead of `Option` for `table_id`. - Simplified logic in `flush_batch_physical` by removing the check for `None` in `table_id`. - Adjusted related logic in `start_worker` to accommodate the change in `table_id` handling. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Enhance Batch Processing Logic - **`pending_rows_batcher.rs`**: - Moved column taxonomy resolution inside the loop to handle schema variations across batches. - Added checks to skip processing if both tag columns and essential column indices are empty. - **Tests**: - Added `test_modify_batch_sparse_with_taxonomy_per_batch` to verify batch modification logic with varying schemas. 
Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Remove Primary Key Column Check in `pending_rows_batcher.rs` - Removed the check for the primary key column and other essential column names in the function `strip_partition_columns_from_batch` within `pending_rows_batcher.rs`. - Simplified the logic by eliminating the validation of column order against expected essential names. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Refactor error handling and iteration in `otlp.rs` and `pending_rows_batcher.rs` - **`otlp.rs`**: Simplified error handling by removing `CatalogSnafu` context when awaiting table retrieval. - **`pending_rows_batcher.rs`**: Streamlined iteration over tables by removing unnecessary `into_iter()` calls, improving code readability and efficiency. Signed-off-by: Lei, HUANG * chore/metrics-for-bulk: Add timing metrics for batch processing in `pending_rows_batcher.rs` - Introduced `modify_elapsed` and `columns_taxonomy_elapsed` to measure time spent in `modify_batch_sparse` and `columns_taxonomy` functions. - Updated `flush_batch_physical` to record these metrics using `PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED`. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Commit Summary - **Remove Unused Code**: Eliminated the `#[allow(dead_code)]` attribute from the `compute_tsid_array` function in `batch_modifier.rs`. - **Error Handling Improvement**: Enhanced error handling in `flush_batch_physical` function by adjusting the `match` block in `pending_rows_batcher.rs`. - **Simplify Logic**: Streamlined the logic in `rows_to_aligned_record_batch` by removing unnecessary type casting in `prom_row_builder.rs`. Signed-off-by: Lei, HUANG * feat/auto-schema-align: **Refactor `flush_batch_physical` in `pending_rows_batcher.rs`:** - Moved partition column stripping logic to a single location before processing region batches. - Updated the use of `combined_batch` to `stripped_batch` for consistency in batch processing. 
- Removed redundant partition column stripping logic within the region batch loop. Signed-off-by: Lei, HUANG * feat/auto-schema-align: ### Update `batch_modifier.rs` Documentation and Parameter Naming - Enhanced documentation for `compute_tsid_array` and `modify_batch_sparse` functions to clarify their logic and parameters. - Renamed parameter `non_tag_column_indices` to `extra_column_indices` in `modify_batch_sparse` for better clarity. Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- Cargo.lock | 2 + src/frontend/src/instance/dashboard.rs | 11 +- src/frontend/src/instance/influxdb.rs | 7 +- src/frontend/src/instance/jaeger.rs | 9 +- src/frontend/src/instance/otlp.rs | 5 +- src/frontend/src/instance/prom_store.rs | 317 ++- src/frontend/src/server.rs | 2 + src/metric-engine/src/batch_modifier.rs | 41 +- src/metric-engine/src/engine/bulk_insert.rs | 139 +- src/metric-engine/src/lib.rs | 2 +- src/metric-engine/src/test_util.rs | 16 + src/operator/src/insert.rs | 2 +- src/partition/src/multi_dim.rs | 4 +- src/partition/src/partition.rs | 2 +- src/partition/src/splitter.rs | 42 +- src/servers/Cargo.toml | 2 + src/servers/src/error.rs | 10 +- src/servers/src/http/event.rs | 9 +- src/servers/src/http/prometheus.rs | 21 +- src/servers/src/lib.rs | 1 + src/servers/src/metrics.rs | 7 + src/servers/src/pending_rows_batcher.rs | 2006 +++++++++++++------ src/servers/src/pipeline.rs | 12 +- src/servers/src/prom_row_builder.rs | 557 +++++ 24 files changed, 2551 insertions(+), 675 deletions(-) create mode 100644 src/servers/src/prom_row_builder.rs diff --git a/Cargo.lock b/Cargo.lock index 401ac3b1ca..54be9bbdcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12057,6 +12057,7 @@ dependencies = [ "local-ip-address", "log-query", "loki-proto", + "metric-engine", "mime_guess", "mysql_async", "notify", @@ -12093,6 +12094,7 @@ dependencies = [ "session", "simd-json", "simdutf8", + "smallvec", "snafu 0.8.6", "snap", "socket2 0.5.10", diff --git 
a/src/frontend/src/instance/dashboard.rs b/src/frontend/src/instance/dashboard.rs index 373961dbfa..5b83a31f20 100644 --- a/src/frontend/src/instance/dashboard.rs +++ b/src/frontend/src/instance/dashboard.rs @@ -33,7 +33,7 @@ use datafusion::sql::TableReference; use datafusion_expr::{DmlStatement, LogicalPlan, lit}; use datatypes::arrow::array::{Array, AsArray}; use servers::error::{ - CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, ExecuteQuerySnafu, NotSupportedSnafu, + CollectRecordbatchSnafu, DataFusionSnafu, ExecuteQuerySnafu, NotSupportedSnafu, TableNotFoundSnafu, }; use servers::query_handler::DashboardDefinition; @@ -139,8 +139,7 @@ impl Instance { DASHBOARD_TABLE_NAME, Some(&ctx), ) - .await - .context(CatalogSnafu)? + .await? { return Ok(table); } @@ -178,8 +177,7 @@ impl Instance { DASHBOARD_TABLE_NAME, Some(&ctx), ) - .await - .context(CatalogSnafu)? + .await? .context(TableNotFoundSnafu { catalog: catalog.to_string(), schema: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(), @@ -255,8 +253,7 @@ impl Instance { DASHBOARD_TABLE_NAME, Some(&query_ctx), ) - .await - .context(CatalogSnafu)? + .await? 
{ table } else { diff --git a/src/frontend/src/instance/influxdb.rs b/src/frontend/src/instance/influxdb.rs index 0c63688262..fe5fdeac77 100644 --- a/src/frontend/src/instance/influxdb.rs +++ b/src/frontend/src/instance/influxdb.rs @@ -21,9 +21,7 @@ use client::Output; use common_error::ext::BoxedError; use common_time::Timestamp; use common_time::timestamp::TimeUnit; -use servers::error::{ - AuthSnafu, CatalogSnafu, Error, TimestampOverflowSnafu, UnexpectedResultSnafu, -}; +use servers::error::{AuthSnafu, Error, TimestampOverflowSnafu, UnexpectedResultSnafu}; use servers::influxdb::InfluxdbRequest; use servers::interceptor::{LineProtocolInterceptor, LineProtocolInterceptorRef}; use servers::query_handler::InfluxdbLineProtocolHandler; @@ -92,8 +90,7 @@ impl InfluxdbLineTimestampAligner<'_> { &insert.table_name, Some(query_context), ) - .await - .context(CatalogSnafu)? + .await? .map(|x| x.schema()) .and_then(|schema| { schema.timestamp_column().map(|col| { diff --git a/src/frontend/src/instance/jaeger.rs b/src/frontend/src/instance/jaeger.rs index 607ed80098..e500de7a85 100644 --- a/src/frontend/src/instance/jaeger.rs +++ b/src/frontend/src/instance/jaeger.rs @@ -38,8 +38,7 @@ use datafusion_expr::{Expr, ExprFunctionExt, SortExpr, col, lit, lit_timestamp_n use query::QueryEngineRef; use serde_json::Value as JsonValue; use servers::error::{ - CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, Result as ServerResult, - TableNotFoundSnafu, + CollectRecordbatchSnafu, DataFusionSnafu, Result as ServerResult, TableNotFoundSnafu, }; use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams, TraceUserAgent}; use servers::otlp::trace::{ @@ -336,8 +335,7 @@ async fn query_trace_table( table_name, Some(&ctx), ) - .await - .context(CatalogSnafu)? + .await? .with_context(|| TableNotFoundSnafu { table: table_name, catalog: ctx.current_catalog(), @@ -425,8 +423,7 @@ async fn get_table( table_name, Some(&ctx), ) - .await - .context(CatalogSnafu)? + .await? 
.with_context(|| TableNotFoundSnafu { table: table_name, catalog: ctx.current_catalog(), diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 9b21f9924f..59174aa89a 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -26,7 +26,7 @@ use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; use pipeline::{GreptimePipelineParams, PipelineWay}; -use servers::error::{self, AuthSnafu, CatalogSnafu, Result as ServerResult}; +use servers::error::{self, AuthSnafu, Result as ServerResult}; use servers::http::prom_store::PHYSICAL_TABLE_PARAM; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; @@ -255,8 +255,7 @@ impl Instance { let table = self .catalog_manager .table(catalog, &schema, &req.table_name, None) - .await - .context(CatalogSnafu)?; + .await?; let Some(rows) = req.rows.as_mut() else { continue; diff --git a/src/frontend/src/instance/prom_store.rs b/src/frontend/src/instance/prom_store.rs index c8f76753af..10fc2f1790 100644 --- a/src/frontend/src/instance/prom_store.rs +++ b/src/frontend/src/instance/prom_store.rs @@ -17,7 +17,11 @@ use std::sync::Arc; use api::prom_store::remote::read_request::ResponseType; use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse}; -use api::v1::RowInsertRequests; +use api::v1::alter_table_expr::Kind; +use api::v1::{ + AddColumn, AddColumns, AlterTableExpr, ColumnDataType, ColumnDef, CreateTableExpr, + RowInsertRequests, SemanticType, +}; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::OutputData; @@ -27,19 +31,25 @@ use common_query::Output; use common_query::prelude::GREPTIME_PHYSICAL_TABLE; use 
common_recordbatch::RecordBatches; use common_telemetry::{debug, tracing}; -use operator::insert::InserterRef; +use operator::insert::{ + AutoCreateTableType, InserterRef, build_create_table_expr, fill_table_options_for_create, +}; use operator::statement::StatementExecutor; use prost::Message; use servers::error::{self, AuthSnafu, Result as ServerResult}; use servers::http::header::{CONTENT_ENCODING_SNAPPY, CONTENT_TYPE_PROTOBUF, collect_plan_metrics}; use servers::http::prom_store::PHYSICAL_TABLE_PARAM; use servers::interceptor::{PromStoreProtocolInterceptor, PromStoreProtocolInterceptorRef}; +use servers::pending_rows_batcher::PendingRowsSchemaAlterer; use servers::prom_store::{self, Metrics}; use servers::query_handler::{ PromStoreProtocolHandler, PromStoreProtocolHandlerRef, PromStoreResponse, }; use session::context::QueryContextRef; use snafu::{OptionExt, ResultExt}; +use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY}; +use store_api::mito_engine_options::SST_FORMAT_KEY; +use table::table_reference::TableReference; use tracing::instrument; use crate::error::{ @@ -50,6 +60,34 @@ use crate::instance::Instance; const SAMPLES_RESPONSE_TYPE: i32 = ResponseType::Samples as i32; +fn auto_create_table_type_for_prom_remote_write( + ctx: &QueryContextRef, + with_metric_engine: bool, +) -> AutoCreateTableType { + if with_metric_engine { + let physical_table = ctx + .extension(PHYSICAL_TABLE_PARAM) + .unwrap_or(GREPTIME_PHYSICAL_TABLE) + .to_string(); + AutoCreateTableType::Logical(physical_table) + } else { + AutoCreateTableType::Physical + } +} + +fn required_physical_table_for_create_type(create_type: &AutoCreateTableType) -> Option<&str> { + match create_type { + AutoCreateTableType::Logical(physical_table) => Some(physical_table.as_str()), + _ => None, + } +} + +fn fill_metric_physical_table_options(table_options: &mut HashMap) { + // We always enforce flat format in this ingestion path. 
+ table_options.insert(SST_FORMAT_KEY.to_string(), "flat".to_string()); + table_options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), "true".to_string()); +} + #[inline] fn is_supported(response_type: i32) -> bool { // Only supports samples response right now @@ -159,6 +197,157 @@ impl Instance { } } +#[async_trait] +impl PendingRowsSchemaAlterer for Instance { + async fn create_tables_if_missing_batch( + &self, + catalog: &str, + schema: &str, + tables: &[(&str, &[api::v1::ColumnSchema])], + with_metric_engine: bool, + ctx: QueryContextRef, + ) -> ServerResult<()> { + if tables.is_empty() { + return Ok(()); + } + + let create_type = auto_create_table_type_for_prom_remote_write(&ctx, with_metric_engine); + if let Some(physical_table) = required_physical_table_for_create_type(&create_type) { + self.create_metric_physical_table_if_missing( + catalog, + schema, + physical_table, + ctx.clone(), + ) + .await?; + } + + let engine = if matches!(create_type, AutoCreateTableType::Logical(_)) { + METRIC_ENGINE_NAME + } else { + common_catalog::consts::default_engine() + }; + + // Check which tables actually still need to be created (may have been + // concurrently created by another request). 
+ let mut create_exprs: Vec = Vec::with_capacity(tables.len()); + for &(table_name, request_schema) in tables { + let existing = self + .catalog_manager() + .table(catalog, schema, table_name, Some(ctx.as_ref())) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + if existing.is_some() { + continue; + } + + let table_ref = TableReference::full(catalog, schema, table_name); + let mut create_table_expr = build_create_table_expr(&table_ref, request_schema, engine) + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + + let mut table_options = std::collections::HashMap::with_capacity(4); + fill_table_options_for_create(&mut table_options, &create_type, &ctx); + create_table_expr.table_options.extend(table_options); + create_exprs.push(create_table_expr); + } + + if create_exprs.is_empty() { + return Ok(()); + } + + match create_type { + AutoCreateTableType::Logical(_) => { + // Use the batch API for logical tables. + self.statement_executor + .create_logical_tables(&create_exprs, ctx) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + } + AutoCreateTableType::Physical => { + // Physical tables don't have a batch DDL path; create one at a time. 
+ for mut expr in create_exprs { + expr.table_options + .insert(SST_FORMAT_KEY.to_string(), "flat".to_string()); + self.statement_executor + .create_table_inner(&mut expr, None, ctx.clone()) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + } + } + create_type => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!( + "prom remote write only supports logical or physical auto-create: {}", + create_type.as_str() + ), + } + .fail(); + } + } + + Ok(()) + } + + async fn add_missing_prom_tag_columns_batch( + &self, + catalog: &str, + schema: &str, + tables: &[(&str, &[String])], + ctx: QueryContextRef, + ) -> ServerResult<()> { + if tables.is_empty() { + return Ok(()); + } + + let alter_exprs: Vec = tables + .iter() + .filter(|(_, columns)| !columns.is_empty()) + .map(|&(table_name, columns)| { + let add_columns = AddColumns { + add_columns: columns + .iter() + .map(|column_name| AddColumn { + column_def: Some(ColumnDef { + name: column_name.clone(), + data_type: ColumnDataType::String as i32, + is_nullable: true, + semantic_type: SemanticType::Tag as i32, + comment: String::new(), + ..Default::default() + }), + location: None, + add_if_not_exists: true, + }) + .collect(), + }; + + AlterTableExpr { + catalog_name: catalog.to_string(), + schema_name: schema.to_string(), + table_name: table_name.to_string(), + kind: Some(Kind::AddColumns(add_columns)), + } + }) + .collect(); + + if alter_exprs.is_empty() { + return Ok(()); + } + + self.statement_executor + .alter_logical_tables(alter_exprs, ctx) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + + Ok(()) + } +} + #[async_trait] impl PromStoreProtocolHandler for Instance { async fn pre_write( @@ -267,6 +456,61 @@ impl PromStoreProtocolHandler for Instance { } } +impl Instance { + async fn create_metric_physical_table_if_missing( + &self, + catalog: &str, + schema: &str, + physical_table: &str, + ctx: QueryContextRef, + ) -> ServerResult<()> { + let 
table = self + .catalog_manager() + .table(catalog, schema, physical_table, Some(ctx.as_ref())) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + if table.is_some() { + return Ok(()); + } + + let table_ref = TableReference::full(catalog, schema, physical_table); + let default_schema = vec![ + api::v1::ColumnSchema { + column_name: common_query::prelude::greptime_timestamp().to_string(), + datatype: api::v1::ColumnDataType::TimestampMillisecond as i32, + semantic_type: api::v1::SemanticType::Timestamp as i32, + datatype_extension: None, + options: None, + }, + api::v1::ColumnSchema { + column_name: common_query::prelude::greptime_value().to_string(), + datatype: api::v1::ColumnDataType::Float64 as i32, + semantic_type: api::v1::SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + ]; + let mut create_table_expr = build_create_table_expr( + &table_ref, + &default_schema, + common_catalog::consts::default_engine(), + ) + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + create_table_expr.engine = METRIC_ENGINE_NAME.to_string(); + fill_metric_physical_table_options(&mut create_table_expr.table_options); + + self.statement_executor + .create_table_inner(&mut create_table_expr, None, ctx) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu)?; + + Ok(()) + } +} + /// This handler is mainly used for `frontend` or `standalone` to directly import /// the metrics collected by itself, thereby avoiding importing metrics through the network, /// thus reducing compression and network transmission overhead, @@ -320,3 +564,72 @@ impl PromStoreProtocolHandler for ExportMetricHandler { unreachable!(); } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use session::context::QueryContext; + + use super::*; + + #[test] + fn test_auto_create_table_type_for_prom_remote_write_metric_engine() { + let mut query_ctx = QueryContext::with( + 
common_catalog::consts::DEFAULT_CATALOG_NAME, + common_catalog::consts::DEFAULT_SCHEMA_NAME, + ); + query_ctx.set_extension(PHYSICAL_TABLE_PARAM, "metric_physical".to_string()); + let ctx = Arc::new(query_ctx); + + let create_type = auto_create_table_type_for_prom_remote_write(&ctx, true); + match create_type { + AutoCreateTableType::Logical(physical) => assert_eq!(physical, "metric_physical"), + _ => panic!("expected logical table create type"), + } + } + + #[test] + fn test_auto_create_table_type_for_prom_remote_write_without_metric_engine() { + let ctx = Arc::new(QueryContext::with( + common_catalog::consts::DEFAULT_CATALOG_NAME, + common_catalog::consts::DEFAULT_SCHEMA_NAME, + )); + + let create_type = auto_create_table_type_for_prom_remote_write(&ctx, false); + match create_type { + AutoCreateTableType::Physical => {} + _ => panic!("expected physical table create type"), + } + } + + #[test] + fn test_required_physical_table_for_create_type() { + let logical = AutoCreateTableType::Logical("phy_table".to_string()); + assert_eq!( + Some("phy_table"), + required_physical_table_for_create_type(&logical) + ); + + let physical = AutoCreateTableType::Physical; + assert_eq!(None, required_physical_table_for_create_type(&physical)); + } + + #[test] + fn test_metric_physical_table_options_forces_flat_sst_format() { + let mut table_options = HashMap::new(); + + fill_metric_physical_table_options(&mut table_options); + + assert_eq!( + Some("flat"), + table_options.get(SST_FORMAT_KEY).map(String::as_str) + ); + assert_eq!( + Some("true"), + table_options + .get(PHYSICAL_TABLE_METADATA_KEY) + .map(String::as_str) + ); + } +} diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs index 4d0db700d1..e66ae718ba 100644 --- a/src/frontend/src/server.rs +++ b/src/frontend/src/server.rs @@ -130,6 +130,8 @@ where self.instance.partition_manager().clone(), self.instance.node_manager().clone(), self.instance.catalog_manager().clone(), + opts.prom_store.with_metric_engine, 
+ self.instance.clone(), opts.prom_store.pending_rows_flush_interval, opts.prom_store.max_batch_rows, opts.prom_store.max_concurrent_flushes, diff --git a/src/metric-engine/src/batch_modifier.rs b/src/metric-engine/src/batch_modifier.rs index d06eaa976b..76d9bb418a 100644 --- a/src/metric-engine/src/batch_modifier.rs +++ b/src/metric-engine/src/batch_modifier.rs @@ -28,7 +28,7 @@ use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu}; /// Info about a tag column for TSID computation and sparse primary key encoding. #[allow(dead_code)] -pub(crate) struct TagColumnInfo { +pub struct TagColumnInfo { /// Column name (used for label-name hash). pub name: String, /// Column index in the RecordBatch. @@ -37,9 +37,16 @@ pub(crate) struct TagColumnInfo { pub column_id: ColumnId, } -/// Computes `__tsid` values for each row. -#[allow(dead_code)] -pub(crate) fn compute_tsid_array( +/// Computes the TSID for each row in a [RecordBatch]. +/// +/// The TSID is a stable hash of the set of labels (tags) present in each row. +/// It accounts for both the names and values of all non-null tag columns. +/// +/// # Logic +/// - If a row has no nulls across all `sorted_tag_columns`, it uses a precomputed hash of all label names. +/// - If a row has nulls, it dynamically computes a hash of the names of labels that are present (non-null). +/// - In both cases, it then hashes the values of those present labels in the order specified by `sorted_tag_columns`. +pub fn compute_tsid_array( batch: &RecordBatch, sorted_tag_columns: &[TagColumnInfo], tag_arrays: &[&StringArray], @@ -110,12 +117,30 @@ fn build_tag_arrays<'a>( .collect() } -/// Modifies a RecordBatch for sparse primary key encoding. -pub(crate) fn modify_batch_sparse( +/// Modifies a [RecordBatch] to include a sparse primary key column. 
+/// +/// This function transforms the input `batch` into a new `RecordBatch` where the first column +/// is the generated primary key (named [PRIMARY_KEY_COLUMN_NAME]), followed by columns +/// indicated by `extra_column_indices`. +/// +/// The primary key uses a "sparse" encoding, which compactly represents the row's identity +/// by only including non-null tag values. The encoding, handled by [SparsePrimaryKeyCodec], +/// consists of: +/// 1. The `table_id`. +/// 2. A `tsid` (Time Series ID), which is a hash of the present tags. +/// 3. The actual non-null tag values paired with their `column_id`. +/// +/// # Parameters +/// - `batch`: The source [RecordBatch]. +/// - `table_id`: The ID of the table. +/// - `sorted_tag_columns`: Metadata for tag columns, used for both TSID computation and PK encoding. +/// - `extra_column_indices`: Indices of columns from the original batch to keep in the output +/// (typically the timestamp and value fields). +pub fn modify_batch_sparse( batch: RecordBatch, table_id: u32, sorted_tag_columns: &[TagColumnInfo], - non_tag_column_indices: &[usize], + extra_column_indices: &[usize], ) -> Result { let num_rows = batch.num_rows(); let codec = SparsePrimaryKeyCodec::schemaless(); @@ -151,7 +176,7 @@ pub(crate) fn modify_batch_sparse( ))]; let mut columns: Vec> = vec![Arc::new(pk_array)]; - for &idx in non_tag_column_indices { + for &idx in extra_column_indices { fields.push(batch.schema().fields()[idx].clone()); columns.push(batch.column(idx).clone()); } diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs index 8122cdc958..300bd34647 100644 --- a/src/metric-engine/src/engine/bulk_insert.rs +++ b/src/metric-engine/src/engine/bulk_insert.rs @@ -34,18 +34,20 @@ use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse}; use crate::engine::MetricEngineInner; use crate::error; use crate::error::Result; +use crate::metrics::MITO_OPERATION_ELAPSED; impl MetricEngineInner { - /// 
Bulk-inserts logical rows into a metric region. + /// Bulk-inserts rows into a metric region. /// - /// This method accepts a `RegionBulkInsertsRequest` whose payload is a logical - /// `RecordBatch` (timestamp, value and tag columns) for the given logical `region_id`. + /// **Logical region path:** The request payload is a logical `RecordBatch` + /// (timestamp, value and tag columns). It is transformed to physical format + /// via `modify_batch_sparse`, encoded to Arrow IPC, and forwarded as a + /// `BulkInserts` request to the data region. If mito reports + /// `StatusCode::Unsupported`, the request is transparently retried as a `Put`. /// - /// The transformed batch is encoded to Arrow IPC and forwarded as a `BulkInserts` - /// request to the data region, along with the original `partition_expr_version`. - /// If the data region reports `StatusCode::Unsupported` for bulk inserts, the request - /// is transparently retried as a `Put` by converting the original logical batch into - /// `api::v1::Rows`, so callers observe the same semantics as `put_region`. + /// **Physical region path:** The request payload is already in physical format + /// (produced by the batcher's `flush_batch_physical`). It is forwarded directly + /// to the data region with no transformation. /// /// Returns the number of affected rows, or `0` if the input batch is empty. 
pub async fn bulk_insert_region( @@ -53,13 +55,42 @@ impl MetricEngineInner { region_id: RegionId, request: RegionBulkInsertsRequest, ) -> Result { - ensure!( - !self.is_physical_region(region_id), - error::UnsupportedRegionRequestSnafu { - request: RegionRequest::BulkInserts(request), - } - ); + if request.payload.num_rows() == 0 { + return Ok(0); + } + if self.is_physical_region(region_id) { + let _timer = MITO_OPERATION_ELAPSED + .with_label_values(&["bulk_insert_physical"]) + .start_timer(); + return self.bulk_insert_physical_region(region_id, request).await; + } + let _timer = MITO_OPERATION_ELAPSED + .with_label_values(&["bulk_insert_logical"]) + .start_timer(); + self.bulk_insert_logical_region(region_id, request).await + } + + /// Passthrough for bulk inserts targeting a physical data region. + /// + /// The batch is already in physical format (with `__primary_key`, timestamp, + /// value columns), so no logical-to-physical transformation is needed. + async fn bulk_insert_physical_region( + &self, + region_id: RegionId, + request: RegionBulkInsertsRequest, + ) -> Result { + self.data_region + .write_data(region_id, RegionRequest::BulkInserts(request)) + .await + } + + /// Bulk-inserts logical rows, transforming them to physical format first. 
+ async fn bulk_insert_logical_region( + &self, + region_id: RegionId, + request: RegionBulkInsertsRequest, + ) -> Result { let (physical_region_id, data_region_id, primary_key_encoding) = self.find_data_region_meta(region_id)?; @@ -390,6 +421,7 @@ mod tests { use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use datatypes::arrow::record_batch::RecordBatch; + use mito2::config::MitoConfig; use store_api::metric_engine_consts::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING; use store_api::path_utils::table_dir; use store_api::region_engine::RegionEngine; @@ -397,6 +429,7 @@ mod tests { use store_api::storage::{RegionId, ScanRequest}; use super::record_batch_to_ipc; + use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse}; use crate::error::Error; use crate::test_util::{self, TestEnv}; @@ -492,23 +525,81 @@ mod tests { } #[tokio::test] - async fn test_bulk_insert_physical_region_rejected() { - let env = TestEnv::new().await; + async fn test_bulk_insert_physical_region_passthrough() { + // Use flat format so that BulkMemtable is used (supports write_bulk). + let mito_config = MitoConfig { + default_experimental_flat_format: true, + ..Default::default() + }; + let env = TestEnv::with_mito_config("", mito_config, Default::default()).await; env.init_metric_region().await; - let physical_region_id = env.default_physical_region_id(); - let batch = build_logical_batch(0, 2); - let request = build_bulk_request(physical_region_id, batch); + let logical_region_id = env.default_logical_region_id(); - let err = env + // First, do a normal logical bulk insert so we can compare results. 
+ let logical_batch = build_logical_batch(0, 3); + let logical_request = build_bulk_request(logical_region_id, logical_batch.clone()); + let response = env + .metric() + .handle_request(logical_region_id, logical_request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 3); + + // Now build a physical-format batch using modify_batch_sparse (simulating + // what the batcher's flush_batch_physical does) and send it directly to + // the physical region. + let tag_columns = vec![TagColumnInfo { + name: "job".to_string(), + index: 2, + column_id: 2, // column_id for "job" in the physical table + }]; + let non_tag_indices = vec![0, 1]; // timestamp, value + let second_batch = build_logical_batch(3, 3); + let physical_batch = modify_batch_sparse( + second_batch, + logical_region_id.table_id(), + &tag_columns, + &non_tag_indices, + ) + .unwrap(); + let request = build_bulk_request(physical_region_id, physical_batch); + let response = env .metric() .handle_request(physical_region_id, request) .await - .unwrap_err(); - let Some(err) = err.as_any().downcast_ref::() else { - panic!("unexpected error type"); + .unwrap(); + assert_eq!(response.affected_rows, 3); + + // Verify all 6 rows are readable from the logical region. + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 6); + } + + #[tokio::test] + async fn test_bulk_insert_physical_region_empty_batch() { + // Use flat format so that BulkMemtable is used (supports write_bulk). + let mito_config = MitoConfig { + default_experimental_flat_format: true, + ..Default::default() }; - assert_matches!(err, Error::UnsupportedRegionRequest { .. 
}); + let env = TestEnv::with_mito_config("", mito_config, Default::default()).await; + env.init_metric_region().await; + let physical_region_id = env.default_physical_region_id(); + + let batch = build_logical_batch(0, 0); + let request = build_bulk_request(physical_region_id, batch); + let response = env + .metric() + .handle_request(physical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 0); } #[tokio::test] diff --git a/src/metric-engine/src/lib.rs b/src/metric-engine/src/lib.rs index 557baba25a..d209eb7588 100644 --- a/src/metric-engine/src/lib.rs +++ b/src/metric-engine/src/lib.rs @@ -52,7 +52,7 @@ #![recursion_limit = "256"] -mod batch_modifier; +pub mod batch_modifier; pub mod config; mod data_region; pub mod engine; diff --git a/src/metric-engine/src/test_util.rs b/src/metric-engine/src/test_util.rs index d3e929cf63..ec55a01903 100644 --- a/src/metric-engine/src/test_util.rs +++ b/src/metric-engine/src/test_util.rs @@ -76,6 +76,22 @@ impl TestEnv { } } + /// Returns a new env with specific `prefix`, `mito_config`, and `config` for test. + pub async fn with_mito_config( + prefix: &str, + mito_config: MitoConfig, + config: EngineConfig, + ) -> Self { + let mut mito_env = MitoTestEnv::with_prefix(prefix).await; + let mito = mito_env.create_engine(mito_config).await; + let metric = MetricEngine::try_new(mito.clone(), config).unwrap(); + Self { + mito_env, + mito, + metric, + } + } + /// Returns a new env with specific `prefix` and `mito_env` for test. 
pub async fn with_mito_env(mut mito_env: MitoTestEnv) -> Self { let mito = mito_env.create_engine(MitoConfig::default()).await; diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index e1f121699e..aecca50e09 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -103,7 +103,7 @@ pub enum AutoCreateTableType { } impl AutoCreateTableType { - fn as_str(&self) -> &'static str { + pub fn as_str(&self) -> &'static str { match self { AutoCreateTableType::Logical(_) => "logical", AutoCreateTableType::Physical => "physical", diff --git a/src/partition/src/multi_dim.rs b/src/partition/src/multi_dim.rs index 8825c6de59..a68479888b 100644 --- a/src/partition/src/multi_dim.rs +++ b/src/partition/src/multi_dim.rs @@ -338,8 +338,8 @@ impl PartitionRule for MultiDimPartitionRule { self } - fn partition_columns(&self) -> Vec { - self.partition_columns.clone() + fn partition_columns(&self) -> &[String] { + &self.partition_columns } fn find_region(&self, values: &[Value]) -> Result { diff --git a/src/partition/src/partition.rs b/src/partition/src/partition.rs index 110f61a39e..f4c585e404 100644 --- a/src/partition/src/partition.rs +++ b/src/partition/src/partition.rs @@ -29,7 +29,7 @@ pub type PartitionRuleRef = Arc; pub trait PartitionRule: Sync + Send { fn as_any(&self) -> &dyn Any; - fn partition_columns(&self) -> Vec; + fn partition_columns(&self) -> &[String]; /// Finds the target region by the partition values. 
/// diff --git a/src/partition/src/splitter.rs b/src/partition/src/splitter.rs index fa19b74ad3..176422a173 100644 --- a/src/partition/src/splitter.rs +++ b/src/partition/src/splitter.rs @@ -66,7 +66,7 @@ impl<'a> SplitReadRowHelper<'a> { .collect::>(); let partition_cols = partition_rule.partition_columns(); let partition_cols_indexes = partition_cols - .into_iter() + .iter() .map(|col_name| col_name_to_idx.get(&col_name).cloned()) .collect::>(); @@ -176,15 +176,25 @@ mod tests { } #[derive(Debug, Serialize, Deserialize)] - struct MockPartitionRule; + struct MockPartitionRule { + partition_columns: Vec, + } + + impl Default for MockPartitionRule { + fn default() -> Self { + Self { + partition_columns: vec!["id".to_string()], + } + } + } impl PartitionRule for MockPartitionRule { fn as_any(&self) -> &dyn Any { self } - fn partition_columns(&self) -> Vec { - vec!["id".to_string()] + fn partition_columns(&self) -> &[String] { + &self.partition_columns } fn find_region(&self, values: &[Value]) -> Result { @@ -206,15 +216,25 @@ mod tests { } #[derive(Debug, Serialize, Deserialize)] - struct MockMissedColPartitionRule; + struct MockMissedColPartitionRule { + partition_columns: Vec, + } + + impl Default for MockMissedColPartitionRule { + fn default() -> Self { + Self { + partition_columns: vec!["missed_col".to_string()], + } + } + } impl PartitionRule for MockMissedColPartitionRule { fn as_any(&self) -> &dyn Any { self } - fn partition_columns(&self) -> Vec { - vec!["missed_col".to_string()] + fn partition_columns(&self) -> &[String] { + &self.partition_columns } fn find_region(&self, values: &[Value]) -> Result { @@ -243,8 +263,8 @@ mod tests { self } - fn partition_columns(&self) -> Vec { - vec![] + fn partition_columns(&self) -> &[String] { + &[] } fn find_region(&self, _values: &[Value]) -> Result { @@ -261,7 +281,7 @@ mod tests { #[test] fn test_writer_splitter() { let rows = mock_rows(); - let rule = Arc::new(MockPartitionRule) as PartitionRuleRef; + let rule = 
Arc::new(MockPartitionRule::default()) as PartitionRuleRef; let splitter = RowSplitter::new(rule); let mut splits = splitter.split(rows).unwrap(); @@ -276,7 +296,7 @@ mod tests { #[test] fn test_missed_col_writer_splitter() { let rows = mock_rows(); - let rule = Arc::new(MockMissedColPartitionRule) as PartitionRuleRef; + let rule = Arc::new(MockMissedColPartitionRule::default()) as PartitionRuleRef; let splitter = RowSplitter::new(rule); let mut splits = splitter.split(rows).unwrap(); diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 6531390ca3..55bb41ee51 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -78,6 +78,7 @@ jsonb.workspace = true lazy_static.workspace = true log-query.workspace = true loki-proto.workspace = true +metric-engine.workspace = true mime_guess = "2.0" notify.workspace = true object-pool = "0.5" @@ -114,6 +115,7 @@ serde_json.workspace = true session.workspace = true simd-json.workspace = true simdutf8 = "0.1" +smallvec.workspace = true snafu.workspace = true snap = "1" socket2 = "0.5" diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 5fae7a82db..682288b271 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -392,7 +392,7 @@ pub enum Error { location: Location, }, - #[snafu(display("Error accessing catalog"))] + #[snafu(transparent)] Catalog { source: catalog::error::Error, #[snafu(implicit)] @@ -678,6 +678,13 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(transparent)] + DataTypes { + source: datatypes::error::Error, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -756,6 +763,7 @@ impl ErrorExt for Error { Catalog { source, .. } => source.status_code(), RowWriter { source, .. } => source.status_code(), + DataTypes { source, .. } => source.status_code(), TlsRequired { .. } => StatusCode::Unknown, Auth { source, .. 
} => source.status_code(), diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 24bb844dc7..dc468a9b75 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -49,7 +49,7 @@ use table::table_reference::TableReference; use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - CatalogSnafu, Error, InvalidParameterSnafu, OtherSnafu, ParseJsonSnafu, PipelineSnafu, Result, + Error, InvalidParameterSnafu, OtherSnafu, ParseJsonSnafu, PipelineSnafu, Result, status_code_to_http_status, }; use crate::http::HttpResponse; @@ -265,12 +265,7 @@ pub async fn query_pipeline_ddl( .map_err(BoxedError::new) .context(OtherSnafu)?; - let message = if handler - .get_table(&table_name, &query_ctx) - .await - .context(CatalogSnafu)? - .is_some() - { + let message = if handler.get_table(&table_name, &query_ctx).await?.is_some() { Some(CREATE_TABLE_SQL_TABLE_EXISTS.to_string()) } else if pipeline.is_variant_table_name() { Some(CREATE_TABLE_SQL_SUFFIX_EXISTS.to_string()) diff --git a/src/servers/src/http/prometheus.rs b/src/servers/src/http/prometheus.rs index 60ad780beb..63149948a5 100644 --- a/src/servers/src/http/prometheus.rs +++ b/src/servers/src/http/prometheus.rs @@ -65,9 +65,8 @@ use store_api::metric_engine_consts::{ pub use super::result::prometheus_resp::PrometheusJsonResponse; use crate::error::{ - CatalogSnafu, CollectRecordbatchSnafu, ConvertScalarValueSnafu, DataFusionSnafu, Error, - InvalidQuerySnafu, NotSupportedSnafu, ParseTimestampSnafu, Result, TableNotFoundSnafu, - UnexpectedResultSnafu, + CollectRecordbatchSnafu, ConvertScalarValueSnafu, DataFusionSnafu, Error, InvalidQuerySnafu, + NotSupportedSnafu, ParseTimestampSnafu, Result, TableNotFoundSnafu, UnexpectedResultSnafu, }; use crate::http::header::collect_plan_metrics; use crate::prom_store::{FIELD_NAME_LABEL, METRIC_NAME_LABEL, is_database_selection_label}; @@ -662,8 +661,7 @@ async fn retrieve_series_from_query_result( table_name, Some(query_ctx), ) - 
.await - .context(CatalogSnafu)? + .await? .with_context(|| TableNotFoundSnafu { catalog: query_ctx.current_catalog(), schema: query_ctx.current_schema(), @@ -1440,7 +1438,7 @@ async fn retrieve_table_names( }); while let Some(table) = tables_stream.next().await { - let table = table.context(CatalogSnafu)?; + let table = table?; if !table .table_info() .meta @@ -1497,7 +1495,7 @@ async fn retrieve_field_names( .next() .await { - let table = table.context(CatalogSnafu)?; + let table = table?; for column in table.field_columns() { field_columns.insert(column.name); } @@ -1508,8 +1506,7 @@ async fn retrieve_field_names( for table_name in matches { let table = manager .table(catalog, &schema, &table_name, Some(query_ctx)) - .await - .context(CatalogSnafu)? + .await? .with_context(|| TableNotFoundSnafu { catalog: catalog.to_string(), schema: schema.clone(), @@ -1533,8 +1530,7 @@ async fn retrieve_schema_names( let candidate_schemas = catalog_manager .schema_names(catalog, Some(query_ctx)) - .await - .context(CatalogSnafu)?; + .await?; for schema in candidate_schemas { let mut found = true; @@ -1542,8 +1538,7 @@ async fn retrieve_schema_names( if let Some(table_name) = retrieve_metric_name_from_promql(match_item) { let exists = catalog_manager .table_exists(catalog, &schema, &table_name, Some(query_ctx)) - .await - .context(CatalogSnafu)?; + .await?; if !exists { found = false; break; diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index 41d73b109f..44587783be 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -42,6 +42,7 @@ pub mod pending_rows_batcher; mod pipeline; pub mod postgres; pub mod prom_remote_write; +pub(crate) mod prom_row_builder; pub mod prom_store; pub mod prometheus; pub mod prometheus_handler; diff --git a/src/servers/src/metrics.rs b/src/servers/src/metrics.rs index e3bff7fdbc..6f151db539 100644 --- a/src/servers/src/metrics.rs +++ b/src/servers/src/metrics.rs @@ -177,6 +177,13 @@ lazy_static! 
{ vec![0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0] ) .unwrap(); + pub static ref PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED: HistogramVec = register_histogram_vec!( + "greptime_prom_store_pending_rows_batch_flush_stage_elapsed", + "Elapsed time of pending rows batch flush stages in seconds", + &["stage"], + vec![0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0] + ) + .unwrap(); /// Http prometheus read duration per database. pub static ref METRIC_HTTP_PROM_STORE_READ_ELAPSED: HistogramVec = register_histogram_vec!( "greptime_servers_http_prometheus_read_elapsed", diff --git a/src/servers/src/pending_rows_batcher.rs b/src/servers/src/pending_rows_batcher.rs index f8486e3636..b6e07d2a81 100644 --- a/src/servers/src/pending_rows_batcher.rs +++ b/src/servers/src/pending_rows_batcher.rs @@ -12,53 +12,79 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::time::{Duration, Instant}; -use api::helper::ColumnDataTypeWrapper; +use api::v1::meta::Peer; use api::v1::region::{ BulkInsertRequest, RegionRequest, RegionRequestHeader, bulk_insert_request, region_request, }; -use api::v1::value::ValueData; -use api::v1::{ArrowIpc, RowInsertRequests, Rows}; -use arrow::array::{ - ArrayRef, Float64Builder, StringBuilder, TimestampMicrosecondBuilder, - TimestampMillisecondBuilder, TimestampNanosecondBuilder, TimestampSecondBuilder, - new_null_array, -}; -use arrow::compute::{cast, concat_batches, filter_record_batch}; -use arrow::datatypes::{Field, Schema as ArrowSchema}; +use api::v1::{ArrowIpc, ColumnSchema, RowInsertRequests, Rows}; +use arrow::compute::{concat_batches, filter_record_batch}; +use arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema, TimeUnit}; use arrow::record_batch::RecordBatch; -use arrow_schema::TimeUnit; +use async_trait::async_trait; use bytes::Bytes; use 
catalog::CatalogManagerRef; use common_grpc::flight::{FlightEncoder, FlightMessage}; use common_meta::node_manager::NodeManagerRef; -use common_query::prelude::GREPTIME_PHYSICAL_TABLE; +use common_query::prelude::{GREPTIME_PHYSICAL_TABLE, greptime_timestamp, greptime_value}; use common_telemetry::tracing_context::TracingContext; -use common_telemetry::{debug, error, info, warn}; +use common_telemetry::{debug, error, warn}; use dashmap::DashMap; use dashmap::mapref::entry::Entry; -use datatypes::data_type::DataType; -use datatypes::prelude::ConcreteDataType; +use metric_engine::batch_modifier::{TagColumnInfo, modify_batch_sparse}; use partition::manager::PartitionRuleManagerRef; use session::context::QueryContextRef; -use snafu::{ResultExt, ensure}; -use store_api::storage::RegionId; +use smallvec::SmallVec; +use snafu::{OptionExt, ensure}; +use store_api::storage::{RegionId, TableId}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, broadcast, mpsc, oneshot}; use crate::error; use crate::error::{Error, Result}; use crate::metrics::{ FLUSH_DROPPED_ROWS, FLUSH_ELAPSED, FLUSH_FAILURES, FLUSH_ROWS, FLUSH_TOTAL, PENDING_BATCHES, - PENDING_ROWS, PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED, PENDING_WORKERS, + PENDING_ROWS, PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED, PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED, + PENDING_WORKERS, +}; +use crate::prom_row_builder::{ + build_prom_create_table_schema_from_proto, identify_missing_columns_from_proto, + rows_to_aligned_record_batch, }; const PHYSICAL_TABLE_KEY: &str = "physical_table"; /// Whether wait for ingestion result before reply to client. const PENDING_ROWS_BATCH_SYNC_ENV: &str = "PENDING_ROWS_BATCH_SYNC"; const WORKER_IDLE_TIMEOUT_MULTIPLIER: u32 = 3; +const PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT: usize = 3; + +#[async_trait] +pub trait PendingRowsSchemaAlterer: Send + Sync { + /// Batch-create multiple logical tables that are missing. + /// Each entry is `(table_name, request_schema)`. 
+ async fn create_tables_if_missing_batch( + &self, + catalog: &str, + schema: &str, + tables: &[(&str, &[ColumnSchema])], + with_metric_engine: bool, + ctx: QueryContextRef, + ) -> Result<()>; + + /// Batch-alter multiple logical tables to add missing tag columns. + /// Each entry is `(table_name, missing_column_names)`. + async fn add_missing_prom_tag_columns_batch( + &self, + catalog: &str, + schema: &str, + tables: &[(&str, &[String])], + ctx: QueryContextRef, + ) -> Result<()>; +} + +pub type PendingRowsSchemaAltererRef = Arc; #[derive(Debug, Clone, Hash, Eq, PartialEq)] struct BatchKey { @@ -70,10 +96,22 @@ struct BatchKey { #[derive(Debug)] struct TableBatch { table_name: String, + table_id: TableId, batches: Vec, row_count: usize, } +/// Intermediate planning state for resolving and preparing logical tables +/// before row-to-batch alignment. +struct TableResolutionPlan { + /// Resolved table schema and table id by logical table name. + region_schemas: HashMap, u32)>, + /// Missing tables that need to be created before alignment. + tables_to_create: Vec<(String, Vec)>, + /// Existing tables that need tag-column schema evolution. 
+ tables_to_alter: Vec<(String, Vec)>, +} + struct PendingBatch { tables: HashMap, created_at: Option, @@ -101,7 +139,7 @@ struct PendingWorker { enum WorkerCommand { Submit { - table_batches: Vec<(String, RecordBatch)>, + table_batches: Vec<(String, u32, RecordBatch)>, total_rows: usize, ctx: QueryContextRef, response_tx: oneshot::Sender>, @@ -134,6 +172,8 @@ pub struct PendingRowsBatcher { flush_semaphore: Arc, inflight_semaphore: Arc, worker_channel_capacity: usize, + prom_store_with_metric_engine: bool, + schema_alterer: PendingRowsSchemaAltererRef, pending_rows_batch_sync: bool, shutdown: broadcast::Sender<()>, } @@ -144,6 +184,8 @@ impl PendingRowsBatcher { partition_manager: PartitionRuleManagerRef, node_manager: NodeManagerRef, catalog_manager: CatalogManagerRef, + prom_store_with_metric_engine: bool, + schema_alterer: PendingRowsSchemaAltererRef, flush_interval: Duration, max_batch_rows: usize, max_concurrent_flushes: usize, @@ -178,6 +220,8 @@ impl PendingRowsBatcher { partition_manager, node_manager, catalog_manager, + prom_store_with_metric_engine, + schema_alterer, flush_semaphore: Arc::new(Semaphore::new(max_concurrent_flushes)), inflight_semaphore: Arc::new(Semaphore::new(max_inflight_requests)), worker_channel_capacity, @@ -189,20 +233,13 @@ impl PendingRowsBatcher { pub async fn submit(&self, requests: RowInsertRequests, ctx: QueryContextRef) -> Result { let (table_batches, total_rows) = { let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["submit_build_table_batches"]) + .with_label_values(&["submit_build_and_align"]) .start_timer(); - build_table_batches(requests)? + self.build_and_align_table_batches(requests, &ctx).await? }; if total_rows == 0 { return Ok(0); } - let table_batches = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["submit_align_region_schema"]) - .start_timer(); - self.align_table_batches_to_region_schema(table_batches, &ctx) - .await? 
- }; let permit = { let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED @@ -212,7 +249,7 @@ impl PendingRowsBatcher { .clone() .acquire_owned() .await - .map_err(|_| Error::BatcherChannelClosed)? + .map_err(|_| error::BatcherChannelClosedSnafu.build())? }; let (response_tx, response_rx) = oneshot::channel(); @@ -260,7 +297,9 @@ impl PendingRowsBatcher { let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED .with_label_values(&["submit_wait_flush_result"]) .start_timer(); - response_rx.await.map_err(|_| Error::BatcherChannelClosed)? + response_rx + .await + .map_err(|_| error::BatcherChannelClosedSnafu.build())? }; result.map(|()| total_rows as u64) } else { @@ -268,43 +307,337 @@ impl PendingRowsBatcher { } } - async fn align_table_batches_to_region_schema( + /// Converts proto `RowInsertRequests` directly into aligned `RecordBatch`es + /// in a single pass, handling table creation, schema alteration, column + /// renaming, reordering, and null-filling without building intermediate + /// RecordBatches. 
+ async fn build_and_align_table_batches( &self, - table_batches: Vec<(String, RecordBatch)>, + requests: RowInsertRequests, ctx: &QueryContextRef, - ) -> Result> { + ) -> Result<(Vec<(String, u32, RecordBatch)>, usize)> { let catalog = ctx.current_catalog().to_string(); let schema = ctx.current_schema(); - let mut region_schemas: HashMap> = HashMap::new(); - let mut aligned_batches = Vec::with_capacity(table_batches.len()); - for (table_name, record_batch) in table_batches { - let region_schema = if let Some(region_schema) = region_schemas.get(&table_name) { - region_schema.clone() + let (table_rows, total_rows) = Self::collect_non_empty_table_rows(requests); + if total_rows == 0 { + return Ok((Vec::new(), 0)); + } + + let unique_tables = Self::collect_unique_table_schemas(&table_rows)?; + let mut plan = self + .plan_table_resolution(&catalog, &schema, ctx, &unique_tables) + .await?; + + self.create_missing_tables_and_refresh_schemas( + &catalog, + &schema, + ctx, + &table_rows, + &mut plan, + ) + .await?; + + self.alter_tables_and_refresh_schemas(&catalog, &schema, ctx, &mut plan) + .await?; + + let aligned_batches = Self::build_aligned_batches(&table_rows, &plan.region_schemas)?; + + Ok((aligned_batches, total_rows)) + } + + /// Extracts non-empty `(table_name, rows)` pairs and computes total row + /// count across the retained entries. + fn collect_non_empty_table_rows(requests: RowInsertRequests) -> (Vec<(String, Rows)>, usize) { + let mut table_rows: Vec<(String, Rows)> = Vec::with_capacity(requests.inserts.len()); + let mut total_rows = 0; + + for request in requests.inserts { + let Some(rows) = request.rows else { + continue; + }; + if rows.rows.is_empty() { + continue; + } + + total_rows += rows.rows.len(); + table_rows.push((request.table_name, rows)); + } + + (table_rows, total_rows) + } + + /// Returns unique `(table_name, proto_schema)` pairs while keeping the + /// first-seen schema for duplicate table names. 
+ fn collect_unique_table_schemas( + table_rows: &[(String, Rows)], + ) -> Result> { + let mut unique_tables: Vec<(&str, &[ColumnSchema])> = Vec::with_capacity(table_rows.len()); + let mut seen = HashSet::new(); + + for (table_name, rows) in table_rows { + if seen.insert(table_name.as_str()) { + unique_tables.push((table_name.as_str(), &rows.schema)); } else { - let table = self - .catalog_manager - .table(&catalog, &schema, &table_name, Some(ctx.as_ref())) - .await - .map_err(|err| Error::Internal { - err_msg: format!( - "Failed to resolve table {} for pending batch alignment: {}", - table_name, err - ), - })? - .ok_or_else(|| Error::Internal { - err_msg: format!( - "Table not found during pending batch alignment: {}", - table_name - ), - })?; - let region_schema = table.table_info().meta.schema.arrow_schema().clone(); - region_schemas.insert(table_name.clone(), region_schema.clone()); - region_schema + // table_rows should group rows by table name. + return error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Found duplicated table name in RowInsertRequest: {}", + table_name + ), + } + .fail(); + } + } + + Ok(unique_tables) + } + + /// Resolves table metadata and classifies each table into existing, + /// to-create, and to-alter groups used by subsequent DDL steps. 
+ async fn plan_table_resolution( + &self, + catalog: &str, + schema: &str, + ctx: &QueryContextRef, + unique_tables: &[(&str, &[ColumnSchema])], + ) -> Result { + let mut plan = TableResolutionPlan { + region_schemas: HashMap::with_capacity(unique_tables.len()), + tables_to_create: Vec::new(), + tables_to_alter: Vec::new(), + }; + + let resolved_tables = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_resolve_table"]) + .start_timer(); + futures::future::join_all(unique_tables.iter().map(|(table_name, _)| { + self.catalog_manager + .table(catalog, schema, table_name, Some(ctx.as_ref())) + })) + .await + }; + + for ((table_name, rows_schema), table_result) in unique_tables.iter().zip(resolved_tables) { + let table = table_result?; + + if let Some(table) = table { + let table_info = table.table_info(); + let table_id = table_info.ident.table_id; + let region_schema = table_info.meta.schema.arrow_schema().clone(); + + let missing_columns = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_identify_missing_columns"]) + .start_timer(); + identify_missing_columns_from_proto(rows_schema, region_schema.as_ref())? + }; + if !missing_columns.is_empty() { + plan.tables_to_alter + .push(((*table_name).to_string(), missing_columns)); + } + plan.region_schemas + .insert((*table_name).to_string(), (region_schema, table_id)); + } else { + let request_schema = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_build_create_table_schema"]) + .start_timer(); + build_prom_create_table_schema_from_proto(rows_schema)? + }; + plan.tables_to_create + .push(((*table_name).to_string(), request_schema)); + } + } + + Ok(plan) + } + + /// Batch-creates missing tables, refreshes their schema metadata, and + /// enqueues follow-up alters for extra tag columns discovered in later rows. 
+ async fn create_missing_tables_and_refresh_schemas( + &self, + catalog: &str, + schema: &str, + ctx: &QueryContextRef, + table_rows: &[(String, Rows)], + plan: &mut TableResolutionPlan, + ) -> Result<()> { + if plan.tables_to_create.is_empty() { + return Ok(()); + } + + let create_refs: Vec<(&str, &[ColumnSchema])> = plan + .tables_to_create + .iter() + .map(|(name, schema)| (name.as_str(), schema.as_slice())) + .collect(); + + { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_batch_create_tables"]) + .start_timer(); + self.schema_alterer + .create_tables_if_missing_batch( + catalog, + schema, + &create_refs, + self.prom_store_with_metric_engine, + ctx.clone(), + ) + .await?; + } + + let created_table_results = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_resolve_table_after_create"]) + .start_timer(); + futures::future::join_all(plan.tables_to_create.iter().map(|(table_name, _)| { + self.catalog_manager + .table(catalog, schema, table_name, Some(ctx.as_ref())) + })) + .await + }; + + for ((table_name, _), table_result) in + plan.tables_to_create.iter().zip(created_table_results) + { + let table = table_result?.with_context(|| error::UnexpectedResultSnafu { + reason: format!( + "Table not found after pending batch create attempt: {}", + table_name + ), + })?; + let table_info = table.table_info(); + let table_id = table_info.ident.table_id; + let region_schema = table_info.meta.schema.arrow_schema().clone(); + plan.region_schemas + .insert(table_name.clone(), (region_schema, table_id)); + } + + Self::enqueue_alter_for_new_tables(table_rows, plan)?; + + Ok(()) + } + + /// For newly created tables, re-checks all row schemas and appends alter + /// operations when additional tag columns are still missing. 
+ fn enqueue_alter_for_new_tables( + table_rows: &[(String, Rows)], + plan: &mut TableResolutionPlan, + ) -> Result<()> { + let created_tables: HashSet<&str> = plan + .tables_to_create + .iter() + .map(|(table_name, _)| table_name.as_str()) + .collect(); + + for (table_name, rows) in table_rows { + if !created_tables.contains(table_name.as_str()) { + continue; + } + + let Some((region_schema, _)) = plan.region_schemas.get(table_name) else { + continue; }; - let record_batch = align_record_batch_to_schema(record_batch, region_schema.as_ref())?; - aligned_batches.push((table_name, record_batch)); + let missing_columns = identify_missing_columns_from_proto(&rows.schema, region_schema)?; + if missing_columns.is_empty() + || plan + .tables_to_alter + .iter() + .any(|(existing_name, _)| existing_name == table_name) + { + continue; + } + + plan.tables_to_alter + .push((table_name.clone(), missing_columns)); + } + + Ok(()) + } + + /// Batch-alters tables that have missing tag columns and refreshes the + /// in-memory schema map used for row alignment. 
+ async fn alter_tables_and_refresh_schemas( + &self, + catalog: &str, + schema: &str, + ctx: &QueryContextRef, + plan: &mut TableResolutionPlan, + ) -> Result<()> { + if plan.tables_to_alter.is_empty() { + return Ok(()); + } + + let alter_refs: Vec<(&str, &[String])> = plan + .tables_to_alter + .iter() + .map(|(name, cols)| (name.as_str(), cols.as_slice())) + .collect(); + { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_batch_add_missing_columns"]) + .start_timer(); + self.schema_alterer + .add_missing_prom_tag_columns_batch(catalog, schema, &alter_refs, ctx.clone()) + .await?; + } + + let altered_table_results = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_resolve_table_after_schema_alter"]) + .start_timer(); + futures::future::join_all(plan.tables_to_alter.iter().map(|(table_name, _)| { + self.catalog_manager + .table(catalog, schema, table_name, Some(ctx.as_ref())) + })) + .await + }; + + for ((table_name, _), table_result) in + plan.tables_to_alter.iter().zip(altered_table_results) + { + let table = table_result?.with_context(|| error::UnexpectedResultSnafu { + reason: format!( + "Table not found after pending batch schema alter: {}", + table_name + ), + })?; + let table_info = table.table_info(); + let table_id = table_info.ident.table_id; + let refreshed_region_schema = table_info.meta.schema.arrow_schema().clone(); + plan.region_schemas + .insert(table_name.clone(), (refreshed_region_schema, table_id)); + } + + Ok(()) + } + + /// Converts proto rows to `RecordBatch` values aligned to resolved region + /// schemas and returns `(table_name, table_id, batch)` tuples. 
+ fn build_aligned_batches( + table_rows: &[(String, Rows)], + region_schemas: &HashMap, u32)>, + ) -> Result> { + let mut aligned_batches = Vec::with_capacity(table_rows.len()); + for (table_name, rows) in table_rows { + let (region_schema, table_id) = + region_schemas.get(table_name).cloned().with_context(|| { + error::UnexpectedResultSnafu { + reason: format!("Region schema not resolved for table: {}", table_name), + } + })?; + + let record_batch = { + let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED + .with_label_values(&["align_rows_to_record_batch"]) + .start_timer(); + rows_to_aligned_record_batch(rows, region_schema.as_ref())? + }; + aligned_batches.push((table_name.clone(), table_id, record_batch)); } Ok(aligned_batches) @@ -422,9 +755,10 @@ fn start_worker( batch.waiters.push(FlushWaiter { response_tx, _permit }); - for (table_name, record_batch) in table_batches { + for (table_name, table_id, record_batch) in table_batches { let entry = batch.tables.entry(table_name.clone()).or_insert_with(|| TableBatch { table_name, + table_id, batches: Vec::new(), row_count: 0, }); @@ -579,6 +913,199 @@ async fn spawn_flush( } } +struct FlushRegionWrite { + region_id: RegionId, + row_count: usize, + datanode: Peer, + request: RegionRequest, +} + +enum FlushWriteResult { + Success { row_count: usize }, + Failed { row_count: usize, message: String }, +} + +fn should_dispatch_concurrently(region_write_count: usize) -> bool { + region_write_count > 1 +} + +/// Classifies columns in a logical-table batch for sparse primary-key conversion. +/// +/// Returns: +/// - `Vec`: all Utf8 tag columns sorted by tag name, used for +/// TSID and sparse primary-key encoding. +/// - `SmallVec<[usize; 3]>`: indices of columns copied into the physical batch +/// after `__primary_key`, ordered as `[greptime_timestamp, greptime_value, +/// partition_tag_columns...]`. 
+fn columns_taxonomy( + batch_schema: &Arc, + table_name: &str, + name_to_ids: &HashMap, + partition_columns: &HashSet<&str>, +) -> Result<(Vec, SmallVec<[usize; 3]>)> { + let mut tag_columns = Vec::new(); + let mut essential_column_indices = + SmallVec::<[usize; 3]>::with_capacity(2 + partition_columns.len()); + // Placeholder for greptime_timestamp and greptime_value + essential_column_indices.push(0); + essential_column_indices.push(0); + + let mut timestamp_index = None; + let mut value_index = None; + + for (index, field) in batch_schema.fields().iter().enumerate() { + match field.data_type() { + ArrowDataType::Utf8 => { + let column_id = name_to_ids.get(field.name()).copied().with_context(|| { + error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Column '{}' from logical table '{}' not found in physical table column IDs", + field.name(), + table_name + ), + } + })?; + tag_columns.push(TagColumnInfo { + name: field.name().clone(), + index, + column_id, + }); + + if partition_columns.contains(field.name().as_str()) { + essential_column_indices.push(index); + } + } + ArrowDataType::Timestamp(TimeUnit::Millisecond, _) => { + ensure!( + timestamp_index.replace(index).is_none(), + error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Duplicated timestamp column in logical table '{}' batch schema", + table_name + ), + } + ); + } + ArrowDataType::Float64 => { + ensure!( + value_index.replace(index).is_none(), + error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Duplicated value column in logical table '{}' batch schema", + table_name + ), + } + ); + } + datatype => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Unexpected data type '{datatype:?}' in logical table '{}' batch schema", + table_name + ), + } + .fail(); + } + } + } + + let timestamp_index = + timestamp_index.with_context(|| error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Missing essential column '{}' in logical table '{}' batch schema", + 
greptime_timestamp(), + table_name + ), + })?; + let value_index = value_index.with_context(|| error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Missing essential column '{}' in logical table '{}' batch schema", + greptime_value(), + table_name + ), + })?; + + tag_columns.sort_by(|a, b| a.name.cmp(&b.name)); + + essential_column_indices[0] = timestamp_index; + essential_column_indices[1] = value_index; + + Ok((tag_columns, essential_column_indices)) +} + +fn strip_partition_columns_from_batch(batch: RecordBatch) -> Result { + ensure!( + batch.num_columns() >= PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT, + error::InternalSnafu { + err_msg: format!( + "Expected at least {} columns in physical batch, got {}", + PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT, + batch.num_columns() + ), + } + ); + let essential_indices: Vec = (0..PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT).collect(); + batch + .project(&essential_indices) + .map_err(|err| Error::Internal { + err_msg: format!("Failed to project essential columns from RecordBatch: {err}"), + }) +} + +async fn flush_region_writes_concurrently( + node_manager: NodeManagerRef, + writes: Vec, +) -> Vec { + if !should_dispatch_concurrently(writes.len()) { + let mut results = Vec::with_capacity(writes.len()); + for write in writes { + let datanode = node_manager.datanode(&write.datanode).await; + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_write_region"]) + .start_timer(); + match datanode.handle(write.request).await { + Ok(_) => results.push(FlushWriteResult::Success { + row_count: write.row_count, + }), + Err(err) => results.push(FlushWriteResult::Failed { + row_count: write.row_count, + message: format!( + "Bulk insert flush failed for region {}: {:?}", + write.region_id, err + ), + }), + } + } + return results; + } + + let write_futures = writes.into_iter().map(|write| { + let node_manager = node_manager.clone(); + async move { + let datanode = node_manager.datanode(&write.datanode).await; + let 
_timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_write_region"]) + .start_timer(); + + match datanode.handle(write.request).await { + Ok(_) => FlushWriteResult::Success { + row_count: write.row_count, + }, + Err(err) => FlushWriteResult::Failed { + row_count: write.row_count, + message: format!( + "Bulk insert flush failed for region {}: {:?}", + write.region_id, err + ), + }, + } + } + }); + + // todo(hl): should be bounded. + futures::future::join_all(write_futures).await +} + async fn flush_batch( flush: FlushBatch, partition_manager: PartitionRuleManagerRef, @@ -594,231 +1121,27 @@ async fn flush_batch( let start = Instant::now(); let mut first_error: Option = None; - let catalog = ctx.current_catalog().to_string(); - let schema = ctx.current_schema(); - - macro_rules! record_failure { - ($row_count:expr, $msg:expr) => {{ - let msg = $msg; - if first_error.is_none() { - first_error = Some(msg.clone()); - } - mark_flush_failure($row_count, &msg); - }}; - } - - for table_batch in table_batches { - let Some(first_batch) = table_batch.batches.first() else { - continue; - }; - - let schema_ref = first_batch.schema(); - let record_batch = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_concat_table_batches"]) - .start_timer(); - match concat_batches(&schema_ref, &table_batch.batches) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to concat table batch {}: {:?}", - table_batch.table_name, err - ) - ); - continue; - } - } - }; - - let table = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_resolve_table"]) - .start_timer(); - match catalog_manager - .table( - &catalog, - &schema, - &table_batch.table_name, - Some(ctx.as_ref()), - ) - .await - { - Ok(Some(table)) => table, - Ok(None) => { - record_failure!( - table_batch.row_count, - format!( - "Table not found during pending flush: {}", - 
table_batch.table_name - ) - ); - continue; - } - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to resolve table {} for pending flush: {:?}", - table_batch.table_name, err - ) - ); - continue; - } - } - }; - let table_info = table.table_info(); - - let partition_rule = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_fetch_partition_rule"]) - .start_timer(); - match partition_manager - .find_table_partition_rule(&table_info) - .await - { - Ok(rule) => rule, - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to fetch partition rule for table {}: {:?}", - table_batch.table_name, err - ) - ); - continue; - } - } - }; - - let region_masks = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_split_record_batch"]) - .start_timer(); - match partition_rule.0.split_record_batch(&record_batch) { - Ok(masks) => masks, - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to split record batch for table {}: {:?}", - table_batch.table_name, err - ) - ); - continue; - } - } - }; - - for (region_number, mask) in region_masks { - if mask.select_none() { - continue; - } - - let region_batch = if mask.select_all() { - record_batch.clone() - } else { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_filter_record_batch"]) - .start_timer(); - match filter_record_batch(&record_batch, mask.array()) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to filter record batch for table {}: {:?}", - table_batch.table_name, err - ) - ); - continue; - } - } - }; - - let row_count = region_batch.num_rows(); - if row_count == 0 { - continue; - } - - let region_id = RegionId::new(table_info.table_id(), region_number); - let datanode = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_resolve_region_leader"]) - 
.start_timer(); - match partition_manager.find_region_leader(region_id).await { - Ok(peer) => peer, - Err(err) => { - record_failure!( - row_count, - format!("Failed to resolve region leader {}: {:?}", region_id, err) - ); - continue; - } - } - }; - - let (schema_bytes, data_header, payload) = { - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_encode_ipc"]) - .start_timer(); - match record_batch_to_ipc(region_batch) { - Ok(encoded) => encoded, - Err(err) => { - record_failure!( - row_count, - format!( - "Failed to encode Arrow IPC for region {}: {:?}", - region_id, err - ) - ); - continue; - } - } - }; - - let request = RegionRequest { - header: Some(RegionRequestHeader { - tracing_context: TracingContext::from_current_span().to_w3c(), - ..Default::default() - }), - body: Some(region_request::Body::BulkInsert(BulkInsertRequest { - region_id: region_id.as_u64(), - partition_expr_version: None, - body: Some(bulk_insert_request::Body::ArrowIpc(ArrowIpc { - schema: schema_bytes, - data_header, - payload, - })), - })), - }; - - let datanode = node_manager.datanode(&datanode).await; - let _timer = PENDING_ROWS_BATCH_INGEST_STAGE_ELAPSED - .with_label_values(&["flush_write_region"]) - .start_timer(); - match datanode.handle(request).await { - Ok(_) => { - FLUSH_TOTAL.inc(); - FLUSH_ROWS.observe(row_count as f64); - } - Err(err) => { - record_failure!( - row_count, - format!( - "Bulk insert flush failed for region {}: {:?}", - region_id, err - ) - ); - } - } - } - } + // Physical-table-level flush: transform all logical table batches + // into physical format and write them together. 
+ let physical_table_name = ctx + .extension(PHYSICAL_TABLE_KEY) + .unwrap_or(GREPTIME_PHYSICAL_TABLE) + .to_string(); + flush_batch_physical( + &table_batches, + total_row_count, + &physical_table_name, + &ctx, + &partition_manager, + &node_manager, + &catalog_manager, + &mut first_error, + ) + .await; let elapsed = start.elapsed().as_secs_f64(); FLUSH_ELAPSED.observe(elapsed); - info!( + debug!( "Pending rows batch flushed, total rows: {}, elapsed time: {}s", total_row_count, elapsed ); @@ -826,6 +1149,370 @@ async fn flush_batch( notify_waiters(waiters, &first_error); } +/// Attempts to flush all table batches by transforming them into the physical +/// table format (sparse primary key encoding) and writing directly to the +/// physical data regions. +/// +/// This is the only flush path. Any failure in resolving or transforming the +/// physical flush inputs is recorded as flush failure and reported to waiters. +#[allow(clippy::too_many_arguments)] +async fn flush_batch_physical( + table_batches: &[TableBatch], + total_row_count: usize, + physical_table_name: &str, + ctx: &QueryContextRef, + partition_manager: &PartitionRuleManagerRef, + node_manager: &NodeManagerRef, + catalog_manager: &CatalogManagerRef, + first_error: &mut Option, +) { + macro_rules! record_failure { + ($row_count:expr, $msg:expr) => {{ + let msg = $msg; + if first_error.is_none() { + *first_error = Some(msg.clone()); + } + mark_flush_failure($row_count, &msg); + }}; + } + + // 1. 
Resolve the physical table and get column ID mapping + let physical_table = { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_resolve_table"]) + .start_timer(); + match catalog_manager + .table( + ctx.current_catalog(), + &ctx.current_schema(), + physical_table_name, + Some(ctx.as_ref()), + ) + .await + { + Ok(Some(table)) => table, + Ok(None) => { + record_failure!( + total_row_count, + format!( + "Physical table '{}' not found during pending flush", + physical_table_name + ) + ); + return; + } + Err(err) => { + record_failure!( + total_row_count, + format!( + "Failed to resolve physical table '{}' for pending flush: {:?}", + physical_table_name, err + ) + ); + return; + } + } + }; + + let physical_table_info = physical_table.table_info(); + let name_to_ids = match physical_table_info.name_to_ids() { + Some(ids) => ids, + None => { + record_failure!( + total_row_count, + format!( + "Physical table '{}' has no column IDs for pending flush", + physical_table_name + ) + ); + return; + } + }; + + // 2. Get the physical table's partition rule (one lookup instead of N) + let partition_rule = { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_fetch_partition_rule"]) + .start_timer(); + match partition_manager + .find_table_partition_rule(&physical_table_info) + .await + { + Ok(rule) => rule, + Err(err) => { + record_failure!( + total_row_count, + format!( + "Failed to fetch partition rule for physical table '{}': {:?}", + physical_table_name, err + ) + ); + return; + } + } + }; + let partition_columns = partition_rule.0.partition_columns(); + let partition_columns_set: HashSet<&str> = + partition_columns.iter().map(String::as_str).collect(); + + // 3. 
Transform each logical table batch into physical format + let mut modified_batches: Vec = Vec::with_capacity(table_batches.len()); + let mut modified_row_count: usize = 0; + + let mut modify_elapsed = Duration::ZERO; + let mut columns_taxonomy_elapsed = Duration::ZERO; + + 'next_table: for table_batch in table_batches { + let table_id = table_batch.table_id; + + // Transform each chunk to physical format directly, avoiding an + // intermediate concat_batches per logical table. + for batch in &table_batch.batches { + // Identify tag columns and non-tag columns from the logical batch schema. + // Chunks within a table_batch may have different schemas if new tag columns + // are added between submits. + // In prom batches, Float64 = value, Timestamp = timestamp, Utf8 = tags. + let batch_schema = batch.schema(); + let start = Instant::now(); + let (tag_columns, essential_col_indices) = match columns_taxonomy( + &batch_schema, + &table_batch.table_name, + &name_to_ids, + &partition_columns_set, + ) { + Ok(columns) => columns, + Err(err) => { + warn!( + "Failed to resolve columns for logical table '{}': {:?}", + table_batch.table_name, err + ); + record_failure!(table_batch.row_count, err.to_string()); + continue 'next_table; + } + }; + + columns_taxonomy_elapsed += start.elapsed(); + if tag_columns.is_empty() && essential_col_indices.is_empty() { + continue; + } + + let modified = { + let start = Instant::now(); + match modify_batch_sparse( + batch.clone(), + table_id, + &tag_columns, + &essential_col_indices, + ) { + Ok(batch) => { + modify_elapsed += start.elapsed(); + batch + } + Err(err) => { + record_failure!( + table_batch.row_count, + format!( + "Failed to modify batch for logical table '{}': {:?}", + table_batch.table_name, err + ) + ); + continue 'next_table; + } + } + }; + + modified_row_count += modified.num_rows(); + modified_batches.push(modified); + } + } + + PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_modify_batch"]) + 
.observe(modify_elapsed.as_secs_f64()); + PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_columns_taxonomy"]) + .observe(columns_taxonomy_elapsed.as_secs_f64()); + + if modified_batches.is_empty() { + if first_error.is_none() { + record_failure!( + total_row_count, + format!( + "No batches can be transformed for physical table '{}' during pending flush", + physical_table_name + ) + ); + } + return; + } + + // 4. Concatenate all modified batches (all share the same physical schema) + let combined_batch = { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_concat_all"]) + .start_timer(); + let combined_schema = modified_batches[0].schema(); + // todo(hl): maybe limit max rows to concat. + match concat_batches(&combined_schema, &modified_batches) { + Ok(batch) => batch, + Err(err) => { + record_failure!( + modified_row_count, + format!("Failed to concat modified batches: {:?}", err) + ); + return; + } + } + }; + + // 5. Split by physical partition rule and send to regions + let physical_table_id = physical_table_info.table_id(); + let region_masks = { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_split_record_batch"]) + .start_timer(); + match partition_rule.0.split_record_batch(&combined_batch) { + Ok(masks) => masks, + Err(err) => { + record_failure!( + total_row_count, + format!( + "Failed to split combined batch for physical table '{}': {:?}", + physical_table_name, err + ) + ); + return; + } + } + }; + + let stripped_batch = if partition_columns.is_empty() { + combined_batch + } else { + // Strip partition columns before encoding and sending requests. 
+ match strip_partition_columns_from_batch(combined_batch) { + Ok(batch) => batch, + Err(err) => { + record_failure!( + total_row_count, + format!( + "Failed to strip partition columns for physical table '{}': {:?}", + physical_table_name, err + ) + ); + return; + } + } + }; + + let mut region_writes = Vec::new(); + for (region_number, mask) in region_masks { + if mask.select_none() { + continue; + } + + let region_batch = if mask.select_all() { + stripped_batch.clone() + } else { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_filter_record_batch"]) + .start_timer(); + match filter_record_batch(&stripped_batch, mask.array()) { + Ok(batch) => batch, + Err(err) => { + record_failure!( + total_row_count, + format!( + "Failed to filter combined batch for physical table '{}': {:?}", + physical_table_name, err + ) + ); + continue; + } + } + }; + + let row_count = region_batch.num_rows(); + if row_count == 0 { + continue; + } + + let region_id = RegionId::new(physical_table_id, region_number); + let datanode = { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_resolve_region_leader"]) + .start_timer(); + match partition_manager.find_region_leader(region_id).await { + Ok(peer) => peer, + Err(err) => { + record_failure!( + row_count, + format!( + "Failed to resolve region leader for physical region {}: {:?}", + region_id, err + ) + ); + continue; + } + } + }; + + let (schema_bytes, data_header, payload) = { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_encode_ipc"]) + .start_timer(); + match record_batch_to_ipc(region_batch) { + Ok(encoded) => encoded, + Err(err) => { + record_failure!( + row_count, + format!( + "Failed to encode Arrow IPC for physical region {}: {:?}", + region_id, err + ) + ); + continue; + } + } + }; + + let request = RegionRequest { + header: Some(RegionRequestHeader { + tracing_context: 
TracingContext::from_current_span().to_w3c(), + ..Default::default() + }), + body: Some(region_request::Body::BulkInsert(BulkInsertRequest { + region_id: region_id.as_u64(), + partition_expr_version: None, + body: Some(bulk_insert_request::Body::ArrowIpc(ArrowIpc { + schema: schema_bytes, + data_header, + payload, + })), + })), + }; + + region_writes.push(FlushRegionWrite { + region_id, + row_count, + datanode, + request, + }); + } + + for result in flush_region_writes_concurrently(node_manager.clone(), region_writes).await { + match result { + FlushWriteResult::Success { row_count } => { + FLUSH_TOTAL.inc(); + FLUSH_ROWS.observe(row_count as f64); + } + FlushWriteResult::Failed { row_count, message } => { + record_failure!(row_count, message); + } + } + } +} + fn notify_waiters(waiters: Vec, first_error: &Option) { for waiter in waiters { let result = match first_error { @@ -865,192 +1552,6 @@ fn flush_with_error(batch: &mut PendingBatch, message: &str) { mark_flush_failure(row_count, message); } -fn build_table_batches(requests: RowInsertRequests) -> Result<(Vec<(String, RecordBatch)>, usize)> { - let mut table_batches = Vec::with_capacity(requests.inserts.len()); - let mut total_rows = 0; - - for request in requests.inserts { - let Some(rows) = request.rows else { - continue; - }; - if rows.rows.is_empty() { - continue; - } - - let record_batch = rows_to_record_batch(&rows)?; - total_rows += record_batch.num_rows(); - table_batches.push((request.table_name, record_batch)); - } - - Ok((table_batches, total_rows)) -} - -fn align_record_batch_to_schema( - record_batch: RecordBatch, - target_schema: &ArrowSchema, -) -> Result { - let source_schema = record_batch.schema(); - if source_schema.as_ref() == target_schema { - return Ok(record_batch); - } - - for source_field in source_schema.fields() { - if target_schema - .column_with_name(source_field.name()) - .is_none() - { - return Err(Error::Internal { - err_msg: format!( - "Failed to align record batch schema, 
column '{}' not found in target schema", - source_field.name() - ), - }); - } - } - - let row_count = record_batch.num_rows(); - let mut columns = Vec::with_capacity(target_schema.fields().len()); - for target_field in target_schema.fields() { - let column = if let Some((index, source_field)) = - source_schema.column_with_name(target_field.name()) - { - let source_column = record_batch.column(index).clone(); - if source_field.data_type() == target_field.data_type() { - source_column - } else { - cast(source_column.as_ref(), target_field.data_type()).map_err(|err| { - Error::Internal { - err_msg: format!( - "Failed to cast column '{}' to target type {:?}: {}", - target_field.name(), - target_field.data_type(), - err - ), - } - })? - } - } else { - new_null_array(target_field.data_type(), row_count) - }; - columns.push(column); - } - - RecordBatch::try_new(Arc::new(target_schema.clone()), columns).map_err(|err| Error::Internal { - err_msg: format!("Failed to build aligned record batch: {}", err), - }) -} - -fn rows_to_record_batch(rows: &Rows) -> Result { - let row_count = rows.rows.len(); - let column_count = rows.schema.len(); - - for (idx, row) in rows.rows.iter().enumerate() { - ensure!( - row.values.len() == column_count, - error::InternalSnafu { - err_msg: format!( - "Column count mismatch in row {}, expected {}, got {}", - idx, - column_count, - row.values.len() - ) - } - ); - } - - let mut fields = Vec::with_capacity(column_count); - let mut columns = Vec::with_capacity(column_count); - - for (idx, column_schema) in rows.schema.iter().enumerate() { - let datatype_wrapper = ColumnDataTypeWrapper::try_new( - column_schema.datatype, - column_schema.datatype_extension.clone(), - )?; - let data_type = ConcreteDataType::from(datatype_wrapper); - fields.push(Field::new( - column_schema.column_name.clone(), - data_type.as_arrow_type(), - true, - )); - columns.push(build_arrow_array( - rows, - idx, - &column_schema.column_name, - data_type.as_arrow_type(), - 
row_count, - )?); - } - - RecordBatch::try_new(Arc::new(ArrowSchema::new(fields)), columns).context(error::ArrowSnafu) -} - -fn build_arrow_array( - rows: &Rows, - col_idx: usize, - column_name: &String, - column_data_type: arrow::datatypes::DataType, - row_count: usize, -) -> Result { - macro_rules! build_array { - ($builder:expr, $( $pattern:pat => $value:expr ),+ $(,)?) => {{ - let mut builder = $builder; - for row in &rows.rows { - match row.values[col_idx].value_data.as_ref() { - $(Some($pattern) => builder.append_value($value),)+ - Some(v) => { - return error::InvalidPromRemoteRequestSnafu { - msg: format!("Unexpected value: {:?}", v), - } - .fail(); - } - None => builder.append_null(), - } - } - Arc::new(builder.finish()) as ArrayRef - }}; - } - - let array: ArrayRef = match column_data_type { - arrow::datatypes::DataType::Float64 => { - build_array!(Float64Builder::with_capacity(row_count), ValueData::F64Value(v) => *v) - } - arrow::datatypes::DataType::Utf8 => build_array!( - StringBuilder::with_capacity(row_count, 0), - ValueData::StringValue(v) => v - ), - arrow::datatypes::DataType::Timestamp(u, _) => match u { - TimeUnit::Second => build_array!( - TimestampSecondBuilder::with_capacity(row_count), - ValueData::TimestampSecondValue(v) => *v - ), - TimeUnit::Millisecond => build_array!( - TimestampMillisecondBuilder::with_capacity(row_count), - ValueData::TimestampMillisecondValue(v) => *v - ), - TimeUnit::Microsecond => build_array!( - TimestampMicrosecondBuilder::with_capacity(row_count), - ValueData::DatetimeValue(v) => *v, - ValueData::TimestampMicrosecondValue(v) => *v - ), - TimeUnit::Nanosecond => build_array!( - TimestampNanosecondBuilder::with_capacity(row_count), - ValueData::TimestampNanosecondValue(v) => *v - ), - }, - ty => { - return error::InvalidPromRemoteRequestSnafu { - msg: format!( - "Unexpected column type {:?}, column name: {}", - ty, column_name - ), - } - .fail(); - } - }; - - Ok(array) -} - fn record_batch_to_ipc(record_batch: 
RecordBatch) -> Result<(Bytes, Bytes, Bytes)> { let mut encoder = FlightEncoder::default(); let schema = encoder.encode_schema(record_batch.schema().as_ref()); @@ -1077,132 +1578,154 @@ fn record_batch_to_ipc(record_batch: RecordBatch) -> Result<(Bytes, Bytes, Bytes #[cfg(test)] mod tests { + use std::collections::{HashMap, HashSet}; use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; - use api::v1::value::ValueData; - use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; - use arrow::array::{Array, Float64Array, Int32Array, Int64Array, StringArray}; - use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use api::region::RegionResponse; + use api::v1::flow::{DirtyWindowRequests, FlowRequest, FlowResponse}; + use api::v1::meta::Peer; + use api::v1::region::{InsertRequests, RegionRequest}; + use api::v1::{ColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows}; + use arrow::array::{BinaryArray, StringArray, TimestampMillisecondArray}; + use arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use arrow::record_batch::RecordBatch; + use async_trait::async_trait; + use common_meta::error::Result as MetaResult; + use common_meta::node_manager::{ + Datanode, DatanodeManager, DatanodeRef, Flownode, FlownodeManager, FlownodeRef, + }; + use common_query::request::QueryRequest; + use common_recordbatch::SendableRecordBatchStream; use dashmap::DashMap; + use smallvec::SmallVec; + use store_api::storage::RegionId; use tokio::sync::mpsc; + use tokio::time::sleep; use super::{ - BatchKey, PendingWorker, WorkerCommand, align_record_batch_to_schema, - remove_worker_if_same_channel, rows_to_record_batch, should_close_worker_on_idle_timeout, + BatchKey, Error, FlushRegionWrite, FlushWriteResult, PendingRowsBatcher, PendingWorker, + WorkerCommand, columns_taxonomy, flush_region_writes_concurrently, + remove_worker_if_same_channel, should_close_worker_on_idle_timeout, + 
should_dispatch_concurrently, strip_partition_columns_from_batch, }; + fn mock_rows(row_count: usize, schema_name: &str) -> Rows { + Rows { + schema: vec![ColumnSchema { + column_name: schema_name.to_string(), + ..Default::default() + }], + rows: (0..row_count).map(|_| Row { values: vec![] }).collect(), + } + } + #[test] - fn test_rows_to_record_batch() { - let rows = Rows { - schema: vec![ - ColumnSchema { - column_name: "ts".to_string(), - datatype: ColumnDataType::TimestampMillisecond as i32, - semantic_type: SemanticType::Timestamp as i32, - ..Default::default() + fn test_collect_non_empty_table_rows_filters_empty_payloads() { + let requests = RowInsertRequests { + inserts: vec![ + RowInsertRequest { + table_name: "cpu".to_string(), + rows: Some(mock_rows(2, "host")), }, - ColumnSchema { - column_name: "value".to_string(), - datatype: ColumnDataType::Float64 as i32, - semantic_type: SemanticType::Field as i32, - ..Default::default() + RowInsertRequest { + table_name: "mem".to_string(), + rows: Some(mock_rows(0, "host")), }, - ColumnSchema { - column_name: "host".to_string(), - datatype: ColumnDataType::String as i32, - semantic_type: SemanticType::Tag as i32, - ..Default::default() - }, - ], - rows: vec![ - Row { - values: vec![ - Value { - value_data: Some(ValueData::TimestampMillisecondValue(1000)), - }, - Value { - value_data: Some(ValueData::F64Value(42.0)), - }, - Value { - value_data: Some(ValueData::StringValue("h1".to_string())), - }, - ], - }, - Row { - values: vec![ - Value { - value_data: Some(ValueData::TimestampMillisecondValue(2000)), - }, - Value { value_data: None }, - Value { - value_data: Some(ValueData::StringValue("h2".to_string())), - }, - ], + RowInsertRequest { + table_name: "disk".to_string(), + rows: None, }, ], }; - let rb = rows_to_record_batch(&rows).unwrap(); - assert_eq!(2, rb.num_rows()); - assert_eq!(3, rb.num_columns()); + let (table_rows, total_rows) = PendingRowsBatcher::collect_non_empty_table_rows(requests); + + 
assert_eq!(2, total_rows); + assert_eq!(1, table_rows.len()); + assert_eq!("cpu", table_rows[0].0); + assert_eq!(2, table_rows[0].1.rows.len()); } - #[test] - fn test_align_record_batch_to_schema_reorder_and_fill_missing() { - let source_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("host", DataType::Utf8, true), - Field::new("value", DataType::Float64, true), - ])); - let source = RecordBatch::try_new( - source_schema, - vec![ - Arc::new(StringArray::from(vec!["h1"])), - Arc::new(Float64Array::from(vec![42.0])), - ], - ) - .unwrap(); - - let target = ArrowSchema::new(vec![ - Field::new("ts", DataType::Int64, true), - Field::new("host", DataType::Utf8, true), - Field::new("value", DataType::Float64, true), - ]); - - let aligned = align_record_batch_to_schema(source, &target).unwrap(); - assert_eq!(aligned.schema().as_ref(), &target); - assert_eq!(1, aligned.num_rows()); - assert_eq!(3, aligned.num_columns()); - let ts = aligned - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(ts.is_null(0)); + #[derive(Clone)] + struct ConcurrentMockDatanode { + delay: Duration, + inflight: Arc, + max_inflight: Arc, } - #[test] - fn test_align_record_batch_to_schema_cast_column_type() { - let source_schema = Arc::new(ArrowSchema::new(vec![Field::new( - "value", - DataType::Int32, - true, - )])); - let source = RecordBatch::try_new( - source_schema, - vec![Arc::new(Int32Array::from(vec![Some(7), None]))], - ) - .unwrap(); + #[async_trait] + impl Datanode for ConcurrentMockDatanode { + async fn handle(&self, _request: RegionRequest) -> MetaResult { + let now = self.inflight.fetch_add(1, Ordering::SeqCst) + 1; + loop { + let max = self.max_inflight.load(Ordering::SeqCst); + if now <= max { + break; + } + if self + .max_inflight + .compare_exchange(max, now, Ordering::SeqCst, Ordering::SeqCst) + .is_ok() + { + break; + } + } - let target = ArrowSchema::new(vec![Field::new("value", DataType::Int64, true)]); - let aligned = align_record_batch_to_schema(source, 
&target).unwrap(); - let value = aligned - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(Some(7), value.iter().next().flatten()); - assert!(value.is_null(1)); + sleep(self.delay).await; + self.inflight.fetch_sub(1, Ordering::SeqCst); + Ok(RegionResponse::new(0)) + } + + async fn handle_query( + &self, + _request: QueryRequest, + ) -> MetaResult { + unimplemented!() + } + } + + #[derive(Clone)] + struct ConcurrentMockNodeManager { + datanodes: Arc>, + } + + #[async_trait] + impl DatanodeManager for ConcurrentMockNodeManager { + async fn datanode(&self, node: &Peer) -> DatanodeRef { + self.datanodes + .get(&node.id) + .expect("datanode not found") + .clone() + } + } + + struct NoopFlownode; + + #[async_trait] + impl Flownode for NoopFlownode { + async fn handle(&self, _request: FlowRequest) -> MetaResult { + unimplemented!() + } + + async fn handle_inserts(&self, _request: InsertRequests) -> MetaResult { + unimplemented!() + } + + async fn handle_mark_window_dirty( + &self, + _req: DirtyWindowRequests, + ) -> MetaResult { + unimplemented!() + } + } + + #[async_trait] + impl FlownodeManager for ConcurrentMockNodeManager { + async fn flownode(&self, _node: &Peer) -> FlownodeRef { + Arc::new(NoopFlownode) + } } #[test] @@ -1250,4 +1773,339 @@ mod tests { assert!(!should_close_worker_on_idle_timeout(1, 0)); assert!(!should_close_worker_on_idle_timeout(0, 1)); } + + #[tokio::test] + async fn test_flush_region_writes_concurrently_dispatches_multiple_datanodes() { + let inflight = Arc::new(AtomicUsize::new(0)); + let max_inflight = Arc::new(AtomicUsize::new(0)); + let datanode1: DatanodeRef = Arc::new(ConcurrentMockDatanode { + delay: Duration::from_millis(100), + inflight: inflight.clone(), + max_inflight: max_inflight.clone(), + }); + let datanode2: DatanodeRef = Arc::new(ConcurrentMockDatanode { + delay: Duration::from_millis(100), + inflight, + max_inflight: max_inflight.clone(), + }); + + let mut datanodes = HashMap::new(); + datanodes.insert(1, 
datanode1); + datanodes.insert(2, datanode2); + let node_manager = Arc::new(ConcurrentMockNodeManager { + datanodes: Arc::new(datanodes), + }); + + let writes = vec![ + FlushRegionWrite { + region_id: RegionId::new(1024, 1), + row_count: 10, + datanode: Peer { + id: 1, + addr: "node1".to_string(), + }, + request: RegionRequest::default(), + }, + FlushRegionWrite { + region_id: RegionId::new(1024, 2), + row_count: 12, + datanode: Peer { + id: 2, + addr: "node2".to_string(), + }, + request: RegionRequest::default(), + }, + ]; + + let results = flush_region_writes_concurrently(node_manager, writes).await; + assert_eq!(2, results.len()); + assert!( + results + .iter() + .all(|result| matches!(result, FlushWriteResult::Success { .. })) + ); + assert!(max_inflight.load(Ordering::SeqCst) >= 2); + } + + #[test] + fn test_should_dispatch_concurrently_by_region_count() { + assert!(!should_dispatch_concurrently(0)); + assert!(!should_dispatch_concurrently(1)); + assert!(should_dispatch_concurrently(2)); + } + + #[test] + fn test_strip_partition_columns_from_batch_removes_partition_tags() { + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(BinaryArray::from(vec![b"k1".as_slice()])), + Arc::new(TimestampMillisecondArray::from(vec![1000_i64])), + Arc::new(arrow::array::Float64Array::from(vec![42.0_f64])), + Arc::new(StringArray::from(vec!["node-1"])), + ], + ) + .unwrap(); + + let stripped = strip_partition_columns_from_batch(batch).unwrap(); + + assert_eq!(3, stripped.num_columns()); + assert_eq!("__primary_key", stripped.schema().field(0).name()); + assert_eq!("greptime_timestamp", stripped.schema().field(1).name()); + 
assert_eq!("greptime_value", stripped.schema().field(2).name()); + } + + #[test] + fn test_strip_partition_columns_from_batch_projects_essential_columns_without_lookup() { + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(BinaryArray::from(vec![b"k1".as_slice()])), + Arc::new(TimestampMillisecondArray::from(vec![1000_i64])), + Arc::new(arrow::array::Float64Array::from(vec![42.0_f64])), + Arc::new(StringArray::from(vec!["node-1"])), + ], + ) + .unwrap(); + + let stripped = strip_partition_columns_from_batch(batch).unwrap(); + + assert_eq!(3, stripped.num_columns()); + assert_eq!("__primary_key", stripped.schema().field(0).name()); + assert_eq!("greptime_timestamp", stripped.schema().field(1).name()); + assert_eq!("greptime_value", stripped.schema().field(2).name()); + } + + #[test] + fn test_collect_tag_columns_and_non_tag_indices_keeps_partition_tag_column() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + Field::new("region", ArrowDataType::Utf8, true), + ])); + let name_to_ids = + HashMap::from([("host".to_string(), 1_u32), ("region".to_string(), 2_u32)]); + let partition_columns = HashSet::from(["host"]); + + let (tag_columns, non_tag_indices) = + columns_taxonomy(&schema, "cpu", &name_to_ids, &partition_columns).unwrap(); + + assert_eq!(2, tag_columns.len()); + assert_eq!(&[0, 1, 2], non_tag_indices.as_slice()); + } + + #[test] + fn 
test_collect_tag_columns_and_non_tag_indices_prioritizes_essential_columns() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("host", ArrowDataType::Utf8, true), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("region", ArrowDataType::Utf8, true), + ])); + let name_to_ids = + HashMap::from([("host".to_string(), 1_u32), ("region".to_string(), 2_u32)]); + let partition_columns = HashSet::from(["host", "region"]); + + let (_tag_columns, non_tag_indices): (_, SmallVec<[usize; 3]>) = + columns_taxonomy(&schema, "cpu", &name_to_ids, &partition_columns).unwrap(); + + assert_eq!(&[2, 1, 0, 3], non_tag_indices.as_slice()); + } + + #[test] + fn test_collect_tag_columns_and_non_tag_indices_rejects_unexpected_data_type() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + Field::new("invalid", ArrowDataType::Boolean, true), + ])); + let name_to_ids = HashMap::from([("host".to_string(), 1_u32)]); + let partition_columns = HashSet::from(["host"]); + + let result = columns_taxonomy(&schema, "cpu", &name_to_ids, &partition_columns); + + assert!(matches!( + result, + Err(Error::InvalidPromRemoteRequest { .. 
}) + )); + } + + #[test] + fn test_collect_tag_columns_and_non_tag_indices_rejects_int64_timestamp_column() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("greptime_timestamp", ArrowDataType::Int64, false), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])); + let name_to_ids = HashMap::from([("host".to_string(), 1_u32)]); + let partition_columns = HashSet::from(["host"]); + + let result = columns_taxonomy(&schema, "cpu", &name_to_ids, &partition_columns); + + assert!(matches!( + result, + Err(Error::InvalidPromRemoteRequest { .. }) + )); + } + + #[test] + fn test_collect_tag_columns_and_non_tag_indices_rejects_duplicated_timestamp_column() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts1", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new( + "ts2", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])); + let name_to_ids = HashMap::from([("host".to_string(), 1_u32)]); + let partition_columns = HashSet::from(["host"]); + + let result = columns_taxonomy(&schema, "cpu", &name_to_ids, &partition_columns); + + assert!(matches!( + result, + Err(Error::InvalidPromRemoteRequest { .. 
}) + )); + } + + #[test] + fn test_collect_tag_columns_and_non_tag_indices_rejects_duplicated_value_column() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("value1", ArrowDataType::Float64, true), + Field::new("value2", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])); + let name_to_ids = HashMap::from([("host".to_string(), 1_u32)]); + let partition_columns = HashSet::from(["host"]); + + let result = columns_taxonomy(&schema, "cpu", &name_to_ids, &partition_columns); + + assert!(matches!( + result, + Err(Error::InvalidPromRemoteRequest { .. }) + )); + } + + #[test] + fn test_modify_batch_sparse_with_taxonomy_per_batch() { + use arrow::array::BinaryArray; + use metric_engine::batch_modifier::modify_batch_sparse; + + let schema1 = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("tag1", ArrowDataType::Utf8, true), + ])); + + let schema2 = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("tag1", ArrowDataType::Utf8, true), + Field::new("tag2", ArrowDataType::Utf8, true), + ])); + let batch2 = RecordBatch::try_new( + schema2.clone(), + vec![ + Arc::new(TimestampMillisecondArray::from(vec![2000])), + Arc::new(arrow::array::Float64Array::from(vec![2.0])), + Arc::new(StringArray::from(vec!["v1"])), + Arc::new(StringArray::from(vec!["v2"])), + ], + ) + .unwrap(); + + let name_to_ids = HashMap::from([("tag1".to_string(), 1), ("tag2".to_string(), 2)]); + let partition_columns = HashSet::new(); + + // A batch that only has tag1, same 
values as batch2 for ts and val. + let batch3 = RecordBatch::try_new( + schema1.clone(), + vec![ + Arc::new(TimestampMillisecondArray::from(vec![2000])), + Arc::new(arrow::array::Float64Array::from(vec![2.0])), + Arc::new(StringArray::from(vec!["v1"])), + ], + ) + .unwrap(); + + // Simulate the new loop logic in flush_batch_physical: + // Resolve taxonomy FOR EACH BATCH. + let (tag_columns2, indices2) = + columns_taxonomy(&batch2.schema(), "table", &name_to_ids, &partition_columns).unwrap(); + let modified2 = modify_batch_sparse(batch2, 123, &tag_columns2, &indices2).unwrap(); + + let (tag_columns3, indices3) = + columns_taxonomy(&batch3.schema(), "table", &name_to_ids, &partition_columns).unwrap(); + let modified3 = modify_batch_sparse(batch3, 123, &tag_columns3, &indices3).unwrap(); + + let pk2 = modified2 + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let pk3 = modified3 + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // Now they SHOULD be different because tag2 is included in pk2 but not in pk3. + assert_ne!( + pk2.value(0), + pk3.value(0), + "PK should be different because batch2 has tag2!" + ); + } } diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index fe7a2f48f3..9de2d63b97 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -28,7 +28,7 @@ use session::context::{Channel, QueryContextRef}; use snafu::ResultExt; use vrl::value::Value as VrlValue; -use crate::error::{CatalogSnafu, PipelineSnafu, Result}; +use crate::error::{PipelineSnafu, Result}; use crate::http::event::PipelineIngestRequest; use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, @@ -89,10 +89,7 @@ async fn run_identity_pipeline( let table = if pipeline_ctx.channel == Channel::Prometheus { None } else { - handler - .get_table(&table_name, query_ctx) - .await - .context(CatalogSnafu)? + handler.get_table(&table_name, query_ctx).await? 
}; identity_pipeline(data_array, table, pipeline_ctx) .map(|opt_map| ContextReq::from_opt_map(opt_map, table_name)) @@ -141,10 +138,7 @@ async fn run_custom_pipeline( } }; - let table = handler - .get_table(&table_name, query_ctx) - .await - .context(CatalogSnafu)?; + let table = handler.get_table(&table_name, query_ctx).await?; schema_info.set_table(table); for pipeline_map in pipeline_maps { diff --git a/src/servers/src/prom_row_builder.rs b/src/servers/src/prom_row_builder.rs new file mode 100644 index 0000000000..0fddc0938a --- /dev/null +++ b/src/servers/src/prom_row_builder.rs @@ -0,0 +1,557 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Prometheus row-level helpers for converting proto `Rows` into Arrow +//! `RecordBatch`es and aligning / normalizing their schemas against +//! existing table schemas in the catalog. 
+ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use api::helper::ColumnDataTypeWrapper; +use api::v1::value::ValueData; +use api::v1::{ColumnSchema, Rows, SemanticType}; +use arrow::array::{ + ArrayRef, Float64Builder, StringBuilder, TimestampMicrosecondBuilder, + TimestampMillisecondBuilder, TimestampNanosecondBuilder, TimestampSecondBuilder, + new_null_array, +}; +use arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema}; +use arrow::record_batch::RecordBatch; +use arrow_schema::TimeUnit; +use common_query::prelude::{greptime_timestamp, greptime_value}; +use datatypes::data_type::DataType; +use datatypes::prelude::ConcreteDataType; +use snafu::{OptionExt, ResultExt, ensure}; + +use crate::error; +use crate::error::Result; + +/// Extract timestamp, field, and tag column names from a logical region schema. +fn unzip_logical_region_schema( + target_schema: &ArrowSchema, +) -> Result<(String, String, HashSet)> { + let mut timestamp_column = None; + let mut field_column = None; + let mut tag_columns = HashSet::with_capacity(target_schema.fields.len().saturating_sub(2)); + for field in target_schema.fields() { + if field.name() == greptime_timestamp() { + timestamp_column = Some(field.name().clone()); + continue; + } + + if field.name() == greptime_value() { + field_column = Some(field.name().clone()); + continue; + } + + if timestamp_column.is_none() && matches!(field.data_type(), ArrowDataType::Timestamp(_, _)) + { + timestamp_column = Some(field.name().clone()); + continue; + } + + if field_column.is_none() && matches!(field.data_type(), ArrowDataType::Float64) { + field_column = Some(field.name().clone()); + continue; + } + tag_columns.insert(field.name().clone()); + } + + let timestamp_column = timestamp_column.with_context(|| error::UnexpectedResultSnafu { + reason: "Failed to locate timestamp column in target schema".to_string(), + })?; + let field_column = field_column.with_context(|| error::UnexpectedResultSnafu { + reason: 
"Failed to locate field column in target schema".to_string(), + })?; + + Ok((timestamp_column, field_column, tag_columns)) +} + +/// Directly converts proto `Rows` into a `RecordBatch` aligned to the given +/// `target_schema`, handling Prometheus column renaming (timestamp/value), +/// reordering, type casting, and null-filling in a single pass. +pub(crate) fn rows_to_aligned_record_batch( + rows: &Rows, + target_schema: &ArrowSchema, +) -> Result { + let row_count = rows.rows.len(); + let column_count = rows.schema.len(); + + for (idx, row) in rows.rows.iter().enumerate() { + ensure!( + row.values.len() == column_count, + error::InternalSnafu { + err_msg: format!( + "Column count mismatch in row {}, expected {}, got {}", + idx, + column_count, + row.values.len() + ) + } + ); + } + + let (target_ts_name, target_field_name, _target_tags) = + unzip_logical_region_schema(target_schema)?; + + // Map effective target column name → (source column index, source arrow type). + // Handles prom renames: Timestamp → target ts name, Float64 → target field name. 
+ let mut source_map: HashMap<&str, (usize, ArrowDataType)> = + HashMap::with_capacity(rows.schema.len()); + + for (src_idx, col) in rows.schema.iter().enumerate() { + let wrapper = ColumnDataTypeWrapper::try_new(col.datatype, col.datatype_extension.clone())?; + let src_arrow_type = ConcreteDataType::from(wrapper).as_arrow_type(); + + match &src_arrow_type { + ArrowDataType::Float64 => { + source_map.insert(&target_field_name, (src_idx, src_arrow_type)); + } + ArrowDataType::Timestamp(unit, _) => { + ensure!( + unit == &TimeUnit::Millisecond, + error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Unexpected remote write batch timestamp unit, expect millisecond, got: {}", + unit + ) + } + ); + source_map.insert(&target_ts_name, (src_idx, src_arrow_type)); + } + ArrowDataType::Utf8 => { + source_map.insert(&col.column_name, (src_idx, src_arrow_type)); + } + other => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Unexpected remote write batch field type {}, field name: {}", + other, col.column_name + ), + } + .fail(); + } + } + } + + // Build columns in target schema order + let mut columns = Vec::with_capacity(target_schema.fields().len()); + for target_field in target_schema.fields() { + if let Some((src_idx, src_arrow_type)) = source_map.get(target_field.name().as_str()) { + let array = build_arrow_array( + rows, + *src_idx, + &rows.schema[*src_idx].column_name, + src_arrow_type.clone(), + row_count, + )?; + columns.push(array); + } else { + columns.push(new_null_array(target_field.data_type(), row_count)); + } + } + + let batch = RecordBatch::try_new(Arc::new(target_schema.clone()), columns) + .context(error::ArrowSnafu)?; + Ok(batch) +} + +/// Identify tag columns in the proto `rows_schema` that are absent from the +/// target region schema, without building an intermediate `RecordBatch`. 
+pub(crate) fn identify_missing_columns_from_proto( + rows_schema: &[ColumnSchema], + target_schema: &ArrowSchema, +) -> Result> { + let (_, _, target_tags) = unzip_logical_region_schema(target_schema)?; + let mut missing = Vec::new(); + for col in rows_schema { + let wrapper = ColumnDataTypeWrapper::try_new(col.datatype, col.datatype_extension.clone())?; + let arrow_type = ConcreteDataType::from(wrapper).as_arrow_type(); + if matches!(arrow_type, ArrowDataType::Utf8) + && !target_tags.contains(&col.column_name) + && target_schema.column_with_name(&col.column_name).is_none() + { + missing.push(col.column_name.clone()); + } + } + Ok(missing) +} + +/// Build a `Vec` suitable for creating a new Prometheus logical table +/// directly from the proto `rows.schema`, avoiding the round-trip through Arrow schema. +pub fn build_prom_create_table_schema_from_proto( + rows_schema: &[ColumnSchema], +) -> Result> { + rows_schema + .iter() + .map(|col| { + let semantic_type = if col.datatype == api::v1::ColumnDataType::TimestampMillisecond as i32 { + SemanticType::Timestamp + } else if col.datatype == api::v1::ColumnDataType::Float64 as i32 { + SemanticType::Field + } else { + // tag columns must be String type + ensure!(col.datatype == api::v1::ColumnDataType::String as i32, error::InvalidPromRemoteRequestSnafu{ + msg: format!( + "Failed to build create table schema, tag column '{}' must be String but got datatype {}", + col.column_name, col.datatype + ) + }); + SemanticType::Tag + }; + + Ok(ColumnSchema { + column_name: col.column_name.clone(), + datatype: col.datatype, + semantic_type: semantic_type as i32, + datatype_extension: col.datatype_extension.clone(), + options: None, + }) + }) + .collect() +} + +/// Build a single Arrow array for the given column index from proto `Rows`. +fn build_arrow_array( + rows: &Rows, + col_idx: usize, + column_name: &String, + column_data_type: arrow::datatypes::DataType, + row_count: usize, +) -> Result { + macro_rules! 
build_array { + ($builder:expr, $( $pattern:pat => $value:expr ),+ $(,)?) => {{ + let mut builder = $builder; + for row in &rows.rows { + match row.values[col_idx].value_data.as_ref() { + $(Some($pattern) => builder.append_value($value),)+ + Some(v) => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!("Unexpected value: {:?}", v), + } + .fail(); + } + None => builder.append_null(), + } + } + Arc::new(builder.finish()) as ArrayRef + }}; + } + + let array: ArrayRef = match column_data_type { + arrow::datatypes::DataType::Float64 => { + build_array!(Float64Builder::with_capacity(row_count), ValueData::F64Value(v) => *v) + } + arrow::datatypes::DataType::Utf8 => build_array!( + StringBuilder::with_capacity(row_count, 0), + ValueData::StringValue(v) => v + ), + arrow::datatypes::DataType::Timestamp(u, _) => match u { + TimeUnit::Second => build_array!( + TimestampSecondBuilder::with_capacity(row_count), + ValueData::TimestampSecondValue(v) => *v + ), + TimeUnit::Millisecond => build_array!( + TimestampMillisecondBuilder::with_capacity(row_count), + ValueData::TimestampMillisecondValue(v) => *v + ), + TimeUnit::Microsecond => build_array!( + TimestampMicrosecondBuilder::with_capacity(row_count), + ValueData::DatetimeValue(v) => *v, + ValueData::TimestampMicrosecondValue(v) => *v + ), + TimeUnit::Nanosecond => build_array!( + TimestampNanosecondBuilder::with_capacity(row_count), + ValueData::TimestampNanosecondValue(v) => *v + ), + }, + ty => { + return error::InvalidPromRemoteRequestSnafu { + msg: format!( + "Unexpected column type {:?}, column name: {}", + ty, column_name + ), + } + .fail(); + } + }; + + Ok(array) +} + +#[cfg(test)] +mod tests { + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; + use arrow::array::{Array, Float64Array, StringArray, TimestampMillisecondArray}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + + use super::{ + 
build_prom_create_table_schema_from_proto, identify_missing_columns_from_proto, + rows_to_aligned_record_batch, + }; + + #[test] + fn test_rows_to_aligned_record_batch_renames_and_reorders() { + let rows = Rows { + schema: vec![ + ColumnSchema { + column_name: "greptime_timestamp".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "host".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "greptime_value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ], + rows: vec![ + Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(1000)), + }, + Value { + value_data: Some(ValueData::StringValue("h1".to_string())), + }, + Value { + value_data: Some(ValueData::F64Value(42.0)), + }, + ], + }, + Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(2000)), + }, + Value { + value_data: Some(ValueData::StringValue("h2".to_string())), + }, + Value { + value_data: Some(ValueData::F64Value(99.0)), + }, + ], + }, + ], + }; + + // Target schema has renamed columns and different ordering. 
+ let target = ArrowSchema::new(vec![ + Field::new( + "my_ts", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("host", DataType::Utf8, true), + Field::new("my_value", DataType::Float64, true), + ]); + + let batch = rows_to_aligned_record_batch(&rows, &target).unwrap(); + assert_eq!(batch.schema().as_ref(), &target); + assert_eq!(2, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + let ts = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ts.value(0), 1000); + assert_eq!(ts.value(1), 2000); + + let hosts = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(hosts.value(0), "h1"); + assert_eq!(hosts.value(1), "h2"); + + let values = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(values.value(0), 42.0); + assert_eq!(values.value(1), 99.0); + } + + #[test] + fn test_rows_to_aligned_record_batch_fills_nulls() { + let rows = Rows { + schema: vec![ + ColumnSchema { + column_name: "greptime_timestamp".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "host".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "instance".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "greptime_value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ], + rows: vec![Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(1000)), + }, + Value { + value_data: Some(ValueData::StringValue("h1".to_string())), + }, + Value { + value_data: Some(ValueData::StringValue("i1".to_string())), + }, + Value { + value_data: 
Some(ValueData::F64Value(1.0)), + }, + ], + }], + }; + + // Target schema has "host" but not "instance"; also has "region" which is missing from source. + let target = ArrowSchema::new(vec![ + Field::new( + "my_ts", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("host", DataType::Utf8, true), + Field::new("region", DataType::Utf8, true), + Field::new("my_value", DataType::Float64, true), + ]); + + let batch = rows_to_aligned_record_batch(&rows, &target).unwrap(); + assert_eq!(batch.schema().as_ref(), &target); + assert_eq!(1, batch.num_rows()); + assert_eq!(4, batch.num_columns()); + + // "region" column should be null-filled. + let region = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(region.is_null(0)); + } + + #[test] + fn test_identify_missing_columns_from_proto() { + let rows_schema = vec![ + ColumnSchema { + column_name: "greptime_timestamp".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "host".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "instance".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "greptime_value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ]; + + let target = ArrowSchema::new(vec![ + Field::new( + "my_ts", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("host", DataType::Utf8, true), + Field::new("my_value", DataType::Float64, true), + ]); + + let missing = identify_missing_columns_from_proto(&rows_schema, &target).unwrap(); + assert_eq!(missing, vec!["instance".to_string()]); + } + + #[test] + fn 
test_build_prom_create_table_schema_from_proto() { + let rows_schema = vec![ + ColumnSchema { + column_name: "greptime_timestamp".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "job".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "greptime_value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ]; + + let schema = build_prom_create_table_schema_from_proto(&rows_schema).unwrap(); + assert_eq!(3, schema.len()); + + assert_eq!("greptime_timestamp", schema[0].column_name); + assert_eq!(SemanticType::Timestamp as i32, schema[0].semantic_type); + assert_eq!( + ColumnDataType::TimestampMillisecond as i32, + schema[0].datatype + ); + + assert_eq!("job", schema[1].column_name); + assert_eq!(SemanticType::Tag as i32, schema[1].semantic_type); + assert_eq!(ColumnDataType::String as i32, schema[1].datatype); + + assert_eq!("greptime_value", schema[2].column_name); + assert_eq!(SemanticType::Field as i32, schema[2].semantic_type); + assert_eq!(ColumnDataType::Float64 as i32, schema[2].datatype); + } +} From b75a1125610a146f5b9e38176e82d0cdc7c36db3 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Wed, 1 Apr 2026 17:02:54 +0800 Subject: [PATCH 065/195] feat: implement prefilter for bulk memtable (#7895) * feat: prefilter in memtable Signed-off-by: evenyag * chore: fmt code Signed-off-by: evenyag * feat: bulk part reader also do prefilter Signed-off-by: evenyag * chore: extract pk filters check Signed-off-by: evenyag * fix: scanbench support explain verbose Signed-off-by: evenyag * feat: add metrics for mem prefilter Signed-off-by: evenyag * chore: address review comment Signed-off-by: evenyag * chore: remove dead code Signed-off-by: evenyag --------- 
Signed-off-by: evenyag --- src/cmd/src/datanode/scanbench.rs | 4 +- src/mito2/src/memtable.rs | 6 + src/mito2/src/memtable/bulk.rs | 2 - src/mito2/src/memtable/bulk/context.rs | 52 ++++- src/mito2/src/memtable/bulk/part.rs | 80 +++----- src/mito2/src/memtable/bulk/part_reader.rs | 105 ++++++++-- src/mito2/src/memtable/partition_tree/tree.rs | 1 + .../src/memtable/simple_bulk_memtable.rs | 1 + src/mito2/src/memtable/time_series.rs | 1 + src/mito2/src/read/scan_util.rs | 17 ++ src/mito2/src/sst/parquet/prefilter.rs | 185 +++++++++++++++++- 11 files changed, 377 insertions(+), 77 deletions(-) diff --git a/src/cmd/src/datanode/scanbench.rs b/src/cmd/src/datanode/scanbench.rs index 6bfe177fc1..51064126fe 100644 --- a/src/cmd/src/datanode/scanbench.rs +++ b/src/cmd/src/datanode/scanbench.rs @@ -677,7 +677,9 @@ impl ScanbenchCommand { // Scan all partitions let num_partitions = scanner.properties().partitions.len(); - let ctx = QueryScanContext::default(); + let ctx = QueryScanContext { + explain_verbose: self.verbose, + }; let metrics_set = ExecutionPlanMetricsSet::new(); let mut scan_futures = FuturesUnordered::new(); diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index 3ebfdd3628..154d062e07 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -497,6 +497,8 @@ impl MemScanMetrics { metrics.num_rows += inner.num_rows; metrics.num_batches += inner.num_batches; metrics.scan_cost += inner.scan_cost; + metrics.prefilter_cost += inner.prefilter_cost; + metrics.prefilter_rows_filtered += inner.prefilter_rows_filtered; } /// Gets the metrics data. @@ -515,6 +517,10 @@ pub(crate) struct MemScanMetricsData { pub(crate) num_batches: usize, /// Duration to scan the memtable. pub(crate) scan_cost: Duration, + /// Duration of prefilter in memtable scan. + pub(crate) prefilter_cost: Duration, + /// Number of rows filtered by prefilter in memtable scan. + pub(crate) prefilter_rows_filtered: usize, } /// Encoded range in the memtable. 
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index 502b61759d..9d25d0c39f 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -15,9 +15,7 @@ //! Memtable implementation for bulk load pub(crate) mod chunk_reader; -#[allow(unused)] pub mod context; -#[allow(unused)] pub mod part; pub mod part_reader; mod row_group_reader; diff --git a/src/mito2/src/memtable/bulk/context.rs b/src/mito2/src/memtable/bulk/context.rs index c3274d30e9..7551eb33af 100644 --- a/src/mito2/src/memtable/bulk/context.rs +++ b/src/mito2/src/memtable/bulk/context.rs @@ -17,7 +17,8 @@ use std::collections::VecDeque; use std::sync::Arc; -use mito_codec::row_converter::{DensePrimaryKeyCodec, build_primary_key_codec}; +use common_recordbatch::filter::SimpleFilterEvaluator; +use mito_codec::row_converter::build_primary_key_codec; use parquet::file::metadata::ParquetMetaData; use store_api::metadata::RegionMetadataRef; use store_api::storage::ColumnId; @@ -25,8 +26,8 @@ use table::predicate::Predicate; use crate::error::Result; use crate::sst::parquet::file_range::{PreFilterMode, RangeBase}; -use crate::sst::parquet::flat_format::FlatReadFormat; use crate::sst::parquet::format::ReadFormat; +use crate::sst::parquet::prefilter::CachedPrimaryKeyFilter; use crate::sst::parquet::reader::SimpleFilterContext; use crate::sst::parquet::stats::RowGroupPruningStats; @@ -35,6 +36,9 @@ pub(crate) type BulkIterContextRef = Arc; pub struct BulkIterContext { pub(crate) base: RangeBase, pub(crate) predicate: Option, + /// Pre-extracted primary key filters for PK prefiltering. + /// `None` if PK prefiltering is not applicable. 
+ pk_filters: Option>>, } impl BulkIterContext { @@ -62,7 +66,7 @@ impl BulkIterContext { ) -> Result { let codec = build_primary_key_codec(®ion_metadata); - let simple_filters = predicate + let simple_filters: Vec = predicate .as_ref() .iter() .flat_map(|predicate| { @@ -87,6 +91,9 @@ impl BulkIterContext { .map(|pred| pred.dyn_filters().as_ref().clone()) .unwrap_or_default(); + // Pre-extract PK filters if applicable. + let pk_filters = Self::extract_pk_filters(&read_format, &simple_filters); + Ok(Self { base: RangeBase { filters: simple_filters, @@ -102,6 +109,7 @@ impl BulkIterContext { partition_filter: None, }, predicate, + pk_filters, }) } @@ -133,6 +141,44 @@ impl BulkIterContext { } } + /// Extracts PK filters if flat format with dictionary-encoded PKs is used. + fn extract_pk_filters( + read_format: &ReadFormat, + filters: &[SimpleFilterContext], + ) -> Option>> { + let flat_format = read_format.as_flat()?; + if flat_format.batch_has_raw_pk_columns() { + return None; + } + let metadata = read_format.metadata(); + if metadata.primary_key.is_empty() { + return None; + } + + let pk_filters: Vec<_> = filters + .iter() + .filter_map(|f| f.primary_key_prefilter()) + .collect(); + if pk_filters.is_empty() { + return None; + } + + Some(Arc::new(pk_filters)) + } + + /// Builds a fresh PK filter for a new iterator. Returns `None` if PK + /// prefiltering is not applicable. + pub(crate) fn build_pk_filter(&self) -> Option { + let pk_filters = self.pk_filters.as_ref()?; + let metadata = self.base.read_format.metadata(); + // Parquet PK prefilter always supports the partition column. 
+ let inner = self + .base + .codec + .primary_key_filter(metadata, Arc::clone(pk_filters), false); + Some(CachedPrimaryKeyFilter::new(inner)) + } + pub(crate) fn read_format(&self) -> &ReadFormat { &self.base.read_format } diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs index bf345c038e..986e9409ee 100644 --- a/src/mito2/src/memtable/bulk/part.rs +++ b/src/mito2/src/memtable/bulk/part.rs @@ -14,66 +14,55 @@ //! Bulk part encoder/decoder. -use std::collections::{HashMap, HashSet, VecDeque}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::time::{Duration, Instant}; use api::helper::{ColumnDataTypeWrapper, to_grpc_value}; use api::v1::bulk_wal_entry::Body; -use api::v1::{ArrowIpc, BulkWalEntry, Mutation, OpType, bulk_wal_entry}; +use api::v1::{ArrowIpc, BulkWalEntry, Mutation, OpType}; use bytes::Bytes; use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage}; use common_recordbatch::DfRecordBatch as RecordBatch; use common_time::Timestamp; -use common_time::timestamp::TimeUnit; use datatypes::arrow; -use datatypes::arrow::array::{ - Array, ArrayRef, BinaryBuilder, BinaryDictionaryBuilder, DictionaryArray, StringBuilder, - StringDictionaryBuilder, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt8Array, UInt8Builder, UInt32Array, - UInt64Array, UInt64Builder, -}; -use datatypes::arrow::compute::{SortColumn, SortOptions, TakeOptions}; +use datatypes::arrow::array::{Array, ArrayRef, StringDictionaryBuilder, UInt8Array, UInt64Array}; +use datatypes::arrow::compute::{SortColumn, SortOptions}; use datatypes::arrow::datatypes::{ DataType as ArrowDataType, Field, Schema, SchemaRef, UInt32Type, }; -use datatypes::arrow_array::BinaryArray; use datatypes::data_type::DataType; -use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector}; -use datatypes::value::{Value, ValueRef}; +use datatypes::prelude::{MutableVector, Vector}; +use 
datatypes::value::ValueRef; use datatypes::vectors::Helper; -use mito_codec::key_values::{KeyValue, KeyValues, KeyValuesRef}; -use mito_codec::row_converter::{ - DensePrimaryKeyCodec, PrimaryKeyCodec, PrimaryKeyCodecExt, build_primary_key_codec, -}; +use mito_codec::key_values::{KeyValue, KeyValues}; +use mito_codec::row_converter::PrimaryKeyCodec; use parquet::arrow::ArrowWriter; use parquet::basic::{Compression, ZstdLevel}; -use parquet::data_type::AsBytes; use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::WriterProperties; use smallvec::SmallVec; -use snafu::{OptionExt, ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt}; use store_api::codec::PrimaryKeyEncoding; -use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; +use store_api::metadata::{RegionMetadata, RegionMetadataRef}; use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME; -use store_api::storage::{FileId, RegionId, SequenceNumber, SequenceRange}; -use table::predicate::Predicate; +use store_api::storage::{FileId, SequenceNumber, SequenceRange}; use crate::error::{ - self, ColumnNotFoundSnafu, ComputeArrowSnafu, ConvertColumnDataTypeSnafu, CreateDefaultSnafu, - DataTypeMismatchSnafu, EncodeMemtableSnafu, EncodeSnafu, InvalidMetadataSnafu, - InvalidRequestSnafu, NewRecordBatchSnafu, Result, UnexpectedSnafu, + self, ColumnNotFoundSnafu, ComputeArrowSnafu, CreateDefaultSnafu, DataTypeMismatchSnafu, + EncodeMemtableSnafu, EncodeSnafu, InvalidMetadataSnafu, InvalidRequestSnafu, + NewRecordBatchSnafu, Result, }; use crate::memtable::bulk::context::BulkIterContextRef; use crate::memtable::bulk::part_reader::EncodedBulkPartIter; use crate::memtable::time_series::{ValueBuilder, Values}; use crate::memtable::{BoxedRecordBatchIterator, MemScanMetrics, MemtableStats}; +use crate::sst::SeriesEstimator; use crate::sst::index::IndexOutput; use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete}; use 
crate::sst::parquet::flat_format::primary_key_column_index; -use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder, ReadFormat}; +use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder}; use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo}; -use crate::sst::{SeriesEstimator, to_sst_arrow_schema}; const INIT_DICT_VALUE_CAPACITY: usize = 8; @@ -527,8 +516,6 @@ impl PrimaryKeyColumnBuilder { /// Converter that converts structs into [BulkPart]. pub struct BulkPartConverter { - /// Region metadata. - region_metadata: RegionMetadataRef, /// Schema of the converted batch. schema: SchemaRef, /// Primary key codec for encoding keys @@ -577,7 +564,6 @@ impl BulkPartConverter { }; Self { - region_metadata: region_metadata.clone(), schema, primary_key_codec, key_buf: Vec::new(), @@ -1116,7 +1102,6 @@ pub struct BulkPartEncodeMetrics { pub struct BulkPartEncoder { metadata: RegionMetadataRef, - row_group_size: usize, writer_props: Option, } @@ -1141,7 +1126,6 @@ impl BulkPartEncoder { Ok(Self { metadata, - row_group_size, writer_props, }) } @@ -1182,7 +1166,6 @@ impl BulkPartEncoder { iter_start = Instant::now(); } metrics.iter_cost += iter_start.elapsed(); - iter_start = Instant::now(); if total_rows == 0 { return Ok(None); @@ -1348,11 +1331,6 @@ impl MultiBulkPart { self.batches.len() } - /// Returns an iterator over the record batches. - pub(crate) fn batches(&self) -> impl Iterator { - self.batches.iter() - } - /// Returns the estimated memory size of all batches. 
pub(crate) fn estimated_size(&self) -> usize { self.batches.iter().map(record_batch_estimated_size).sum() @@ -1400,19 +1378,22 @@ impl MultiBulkPart { mod tests { use api::v1::{Row, SemanticType, WriteHint}; use datafusion_common::ScalarValue; - use datatypes::arrow::array::Float64Array; + use datatypes::arrow::array::{ + BinaryArray, DictionaryArray, Float64Array, TimestampMillisecondArray, + }; + use datatypes::arrow::datatypes::UInt32Type; use datatypes::prelude::{ConcreteDataType, Value}; use datatypes::schema::ColumnSchema; + use mito_codec::row_converter::build_primary_key_codec; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; use store_api::storage::RegionId; use store_api::storage::consts::ReservedColumnId; + use table::predicate::Predicate; use super::*; use crate::memtable::bulk::context::BulkIterContext; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; - use crate::test_util::memtable_util::{ - build_key_values_with_ts_seq_values, metadata_for_test, region_metadata_to_row_schema, - }; + use crate::test_util::memtable_util::{build_key_values_with_ts_seq_values, metadata_for_test}; struct MutationInput<'a> { k0: &'a str, @@ -1422,13 +1403,6 @@ mod tests { sequence: u64, } - #[derive(Debug, PartialOrd, PartialEq)] - struct BatchOutput<'a> { - pk_values: &'a [Value], - timestamps: &'a [i64], - v1: &'a [Option], - } - fn encode(input: &[MutationInput]) -> EncodedBulkPart { let metadata = metadata_for_test(); let kvs = input @@ -1482,7 +1456,7 @@ mod tests { ]); let projection = &[4u32]; - let mut reader = part + let reader = part .read( Arc::new( BulkIterContext::new( @@ -1523,7 +1497,7 @@ mod tests { let kvs = key_values .into_iter() .map(|(k0, k1, (start, end), sequence)| { - let ts = (start..end); + let ts = start..end; let v1 = (start..end).map(|_| None); build_key_values_with_ts_seq_values(&metadata, k0.to_string(), k1, ts, v1, sequence) }) @@ -1553,7 +1527,7 @@ mod tests { ) .unwrap(), ); - let mut reader = part + let 
reader = part .read(context, None, None) .unwrap() .expect("expect at least one row group"); @@ -1626,7 +1600,7 @@ mod tests { 100, ); - /// Predicates over field column can do precise filtering. + // Predicates over field column can do precise filtering. check_prune_row_group( &part, Some(Predicate::new(vec![ diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs index 1375e79542..a9caeef08c 100644 --- a/src/mito2/src/memtable/bulk/part_reader.rs +++ b/src/mito2/src/memtable/bulk/part_reader.rs @@ -17,6 +17,7 @@ use std::time::Instant; use datatypes::arrow::array::BooleanArray; use datatypes::arrow::record_batch::RecordBatch; +use mito_codec::row_converter::PrimaryKeyFilter; use parquet::arrow::ProjectionMask; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; use snafu::ResultExt; @@ -29,7 +30,8 @@ use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder; use crate::memtable::{MemScanMetrics, MemScanMetricsData}; use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED}; use crate::sst::parquet::file_range::{PreFilterMode, TagDecodeState}; -use crate::sst::parquet::flat_format::sequence_column_index; +use crate::sst::parquet::flat_format::{primary_key_column_index, sequence_column_index}; +use crate::sst::parquet::prefilter::{CachedPrimaryKeyFilter, prefilter_flat_batch_by_primary_key}; /// Iterator for reading data inside a bulk part. pub struct EncodedBulkPartIter { @@ -41,6 +43,8 @@ pub struct EncodedBulkPartIter { sequence: Option, /// Cached skip_fields for current row group. current_skip_fields: bool, + /// Primary key filter for prefiltering before convert_batch. + pk_filter: Option, /// Metrics for this iterator. metrics: MemScanMetricsData, /// Optional memory scan metrics to report to. 
@@ -69,6 +73,9 @@ impl EncodedBulkPartIter { let builder = MemtableRowGroupReaderBuilder::try_new(&context, projection_mask, parquet_meta, data)?; + // Build PK filter if applicable (flat format with dictionary-encoded PKs). + let pk_filter = context.build_pk_filter(); + let (init_reader, current_skip_fields) = match row_groups_to_read.pop_front() { Some(first_row_group) => { let skip_fields = builder.compute_skip_fields(&context, first_row_group); @@ -85,6 +92,7 @@ impl EncodedBulkPartIter { builder, sequence, current_skip_fields, + pk_filter, metrics: MemScanMetricsData { total_series: series_count, ..Default::default() @@ -116,6 +124,10 @@ impl EncodedBulkPartIter { &self.sequence, batch, self.current_skip_fields, + self.pk_filter + .as_mut() + .map(|f| f as &mut dyn PrimaryKeyFilter), + &mut self.metrics, )? { // Update metrics self.metrics.num_batches += 1; @@ -142,6 +154,10 @@ impl EncodedBulkPartIter { &self.sequence, batch, self.current_skip_fields, + self.pk_filter + .as_mut() + .map(|f| f as &mut dyn PrimaryKeyFilter), + &mut self.metrics, )? { // Update metrics self.metrics.num_batches += 1; @@ -175,12 +191,14 @@ impl Iterator for EncodedBulkPartIter { impl Drop for EncodedBulkPartIter { fn drop(&mut self) { common_telemetry::debug!( - "EncodedBulkPartIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}", + "EncodedBulkPartIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}", self.context.region_id(), self.metrics.total_series, self.metrics.num_rows, self.metrics.num_batches, - self.metrics.scan_cost + self.metrics.scan_cost, + self.metrics.prefilter_cost, + self.metrics.prefilter_rows_filtered ); // Report MemScanMetrics if not already reported @@ -205,6 +223,8 @@ pub struct BulkPartBatchIter { context: BulkIterContextRef, /// Sequence number filter. sequence: Option, + /// Primary key filter for prefiltering before convert_batch. 
+ pk_filter: Option, /// Metrics for this iterator. metrics: MemScanMetricsData, /// Optional memory scan metrics to report to. @@ -222,10 +242,13 @@ impl BulkPartBatchIter { ) -> Self { assert!(context.read_format().as_flat().is_some()); + let pk_filter = context.build_pk_filter(); + Self { batches: VecDeque::from(batches), context, sequence, + pk_filter, metrics: MemScanMetricsData { total_series: series_count, ..Default::default() @@ -282,8 +305,16 @@ impl BulkPartBatchIter { PreFilterMode::SkipFieldsOnDelete => true, }; - let Some(filtered_batch) = - apply_combined_filters(&self.context, &self.sequence, projected_batch, skip_fields)? + let Some(filtered_batch) = apply_combined_filters( + &self.context, + &self.sequence, + projected_batch, + skip_fields, + self.pk_filter + .as_mut() + .map(|f| f as &mut dyn PrimaryKeyFilter), + &mut self.metrics, + )? else { self.metrics.scan_cost += start.elapsed(); return Ok(None); @@ -323,12 +354,14 @@ impl Iterator for BulkPartBatchIter { impl Drop for BulkPartBatchIter { fn drop(&mut self) { common_telemetry::debug!( - "BulkPartBatchIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}", + "BulkPartBatchIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}", self.context.region_id(), self.metrics.total_series, self.metrics.num_rows, self.metrics.num_batches, - self.metrics.scan_cost + self.metrics.scan_cost, + self.metrics.prefilter_cost, + self.metrics.prefilter_rows_filtered ); // Report MemScanMetrics if not already reported @@ -353,8 +386,32 @@ fn apply_combined_filters( sequence: &Option, record_batch: RecordBatch, skip_fields: bool, + pk_filter: Option<&mut dyn PrimaryKeyFilter>, + metrics: &mut MemScanMetricsData, ) -> error::Result> { - // Converts the format to the flat format first. + // Apply PK prefilter on raw batch before convert_batch to reduce conversion overhead. 
+ let has_pk_prefilter = pk_filter.is_some(); + let record_batch = if let Some(pk_filter) = pk_filter { + let rows_before = record_batch.num_rows(); + let prefilter_start = Instant::now(); + let pk_col_idx = primary_key_column_index(record_batch.num_columns()); + match prefilter_flat_batch_by_primary_key(record_batch, pk_col_idx, pk_filter)? { + Some(batch) => { + metrics.prefilter_cost += prefilter_start.elapsed(); + metrics.prefilter_rows_filtered += rows_before - batch.num_rows(); + batch + } + None => { + metrics.prefilter_cost += prefilter_start.elapsed(); + metrics.prefilter_rows_filtered += rows_before; + return Ok(None); + } + } + } else { + record_batch + }; + + // Converts the format to the flat format. let format = context.read_format().as_flat().unwrap(); let record_batch = format.convert_batch(record_batch, None)?; @@ -362,12 +419,12 @@ fn apply_combined_filters( let mut combined_filter = None; let mut tag_decode_state = TagDecodeState::new(); - // First, apply predicate filters using the shared method. + // Apply predicate filters using the shared method. 
if !context.base.filters.is_empty() { let predicate_mask = context.base.compute_filter_mask_flat( &record_batch, skip_fields, - false, + has_pk_prefilter, &mut tag_decode_state, )?; // If predicate filters out the entire batch, return None early @@ -433,6 +490,7 @@ mod tests { use super::*; use crate::memtable::bulk::context::BulkIterContext; + use crate::test_util::sst_util::new_primary_key; #[test] fn test_bulk_part_batch_iter() { @@ -461,9 +519,16 @@ mod tests { vec![1000, 2000, 3000], )); - // Create primary key dictionary array + // Create primary key dictionary array with properly encoded PKs use datatypes::arrow::array::{BinaryArray, DictionaryArray, UInt32Array}; - let values = Arc::new(BinaryArray::from_iter_values([b"key1", b"key2", b"key3"])); + let pk1 = new_primary_key(&["key1"]); + let pk2 = new_primary_key(&["key2"]); + let pk3 = new_primary_key(&["key3"]); + let values = Arc::new(BinaryArray::from_iter_values([ + pk1.as_slice(), + pk2.as_slice(), + pk3.as_slice(), + ])); let keys = UInt32Array::from(vec![0, 1, 2]); let primary_key = Arc::new(DictionaryArray::new(keys, values)); @@ -596,12 +661,17 @@ mod tests { ])); // Create first batch with 2 rows + let pk1 = new_primary_key(&["key1"]); + let pk2 = new_primary_key(&["key2"]); let key1_1 = Arc::new(StringArray::from_iter_values(["key1", "key2"])); let field1_1 = Arc::new(Int64Array::from(vec![11, 12])); let timestamp_1 = Arc::new(datatypes::arrow::array::TimestampMillisecondArray::from( vec![1000, 2000], )); - let values_1 = Arc::new(BinaryArray::from_iter_values([b"key1", b"key2"])); + let values_1 = Arc::new(BinaryArray::from_iter_values([ + pk1.as_slice(), + pk2.as_slice(), + ])); let keys_1 = UInt32Array::from(vec![0, 1]); let primary_key_1 = Arc::new(DictionaryArray::new(keys_1, values_1)); let sequence_1 = Arc::new(UInt64Array::from(vec![1, 2])); @@ -621,12 +691,19 @@ mod tests { .unwrap(); // Create second batch with 3 rows + let pk3 = new_primary_key(&["key3"]); + let pk4 = 
new_primary_key(&["key4"]); + let pk5 = new_primary_key(&["key5"]); let key1_2 = Arc::new(StringArray::from_iter_values(["key3", "key4", "key5"])); let field1_2 = Arc::new(Int64Array::from(vec![13, 14, 15])); let timestamp_2 = Arc::new(datatypes::arrow::array::TimestampMillisecondArray::from( vec![3000, 4000, 5000], )); - let values_2 = Arc::new(BinaryArray::from_iter_values([b"key3", b"key4", b"key5"])); + let values_2 = Arc::new(BinaryArray::from_iter_values([ + pk3.as_slice(), + pk4.as_slice(), + pk5.as_slice(), + ])); let keys_2 = UInt32Array::from(vec![0, 1, 2]); let primary_key_2 = Arc::new(DictionaryArray::new(keys_2, values_2)); let sequence_2 = Arc::new(UInt64Array::from(vec![3, 4, 5])); diff --git a/src/mito2/src/memtable/partition_tree/tree.rs b/src/mito2/src/memtable/partition_tree/tree.rs index 17977db56a..f5863ae0c8 100644 --- a/src/mito2/src/memtable/partition_tree/tree.rs +++ b/src/mito2/src/memtable/partition_tree/tree.rs @@ -490,6 +490,7 @@ impl TreeIter { num_rows: self.metrics.rows_fetched, num_batches: self.metrics.batches_fetched, scan_cost: self.metrics.iter_elapsed, + ..Default::default() }; mem_scan_metrics.merge_inner(&inner); } diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs index 6d91f00361..1284741347 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable.rs @@ -372,6 +372,7 @@ impl IterBuilder for BatchRangeBuilder { num_rows: batch.num_rows(), num_batches: 1, scan_cost: self.scan_cost, + ..Default::default() }; metrics.merge_inner(&inner); } diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index d3d00d0703..9666bee51c 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs @@ -567,6 +567,7 @@ impl Iter { num_rows: self.metrics.num_rows, num_batches: self.metrics.num_batches, scan_cost: self.metrics.scan_cost, + ..Default::default() }; 
mem_scan_metrics.merge_inner(&inner); } diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 9bf1c17276..d065657242 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -138,6 +138,10 @@ pub(crate) struct ScanMetricsSet { mem_batches: usize, /// Number of series read from memtables. mem_series: usize, + /// Duration of prefilter in memtable scan. + mem_prefilter_cost: Duration, + /// Number of rows filtered by prefilter in memtable scan. + mem_prefilter_rows_filtered: usize, // SST related metrics: /// Duration to build file ranges. @@ -341,6 +345,8 @@ impl fmt::Debug for ScanMetricsSet { mem_rows, mem_batches, mem_series, + mem_prefilter_cost, + mem_prefilter_rows_filtered, inverted_index_apply_metrics, bloom_filter_apply_metrics, fulltext_index_apply_metrics, @@ -509,6 +515,15 @@ impl fmt::Debug for ScanMetricsSet { if !mem_scan_cost.is_zero() { write!(f, ", \"mem_scan_cost\":\"{mem_scan_cost:?}\"")?; } + if !mem_prefilter_cost.is_zero() { + write!(f, ", \"mem_prefilter_cost\":\"{mem_prefilter_cost:?}\"")?; + } + if *mem_prefilter_rows_filtered > 0 { + write!( + f, + ", \"mem_prefilter_rows_filtered\":{mem_prefilter_rows_filtered}" + )?; + } // Write optional verbose metrics if they are not empty if let Some(metrics) = inverted_index_apply_metrics @@ -1061,6 +1076,8 @@ impl PartitionMetrics { metrics.mem_rows += data.num_rows; metrics.mem_batches += data.num_batches; metrics.mem_series += data.total_series; + metrics.mem_prefilter_cost += data.prefilter_cost; + metrics.mem_prefilter_rows_filtered += data.prefilter_rows_filtered; } /// Merges [ScannerMetrics], `build_reader_cost`, `scan_cost` and `yield_cost`. 
diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs index 88df56e401..967ddd491b 100644 --- a/src/mito2/src/sst/parquet/prefilter.rs +++ b/src/mito2/src/sst/parquet/prefilter.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use api::v1::SemanticType; use common_recordbatch::filter::SimpleFilterEvaluator; -use datatypes::arrow::array::BinaryArray; +use datatypes::arrow::array::{BinaryArray, BooleanArray, BooleanBufferBuilder}; use datatypes::arrow::record_batch::RecordBatch; use futures::StreamExt; use mito_codec::row_converter::{PrimaryKeyCodec, PrimaryKeyFilter}; @@ -33,7 +33,7 @@ use parquet::schema::types::SchemaDescriptor; use snafu::{OptionExt, ResultExt}; use store_api::metadata::{RegionMetadata, RegionMetadataRef}; -use crate::error::{DecodeSnafu, ReadParquetSnafu, Result, UnexpectedSnafu}; +use crate::error::{ComputeArrowSnafu, DecodeSnafu, ReadParquetSnafu, Result, UnexpectedSnafu}; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::{PrimaryKeyArray, ReadFormat}; use crate::sst::parquet::reader::{RowGroupBuildContext, RowGroupReaderBuilder}; @@ -93,6 +93,55 @@ pub(crate) fn matching_row_ranges_by_primary_key( Ok(matched_row_ranges) } +/// Filters a flat-format record batch by primary key, returning only rows whose +/// primary key matches the filter. Returns `None` if all rows are filtered out. 
+pub(crate) fn prefilter_flat_batch_by_primary_key( + input: RecordBatch, + pk_column_index: usize, + pk_filter: &mut dyn PrimaryKeyFilter, +) -> Result> { + if input.num_rows() == 0 { + return Ok(Some(input)); + } + + let matched_row_ranges = + matching_row_ranges_by_primary_key(&input, pk_column_index, pk_filter)?; + if matched_row_ranges.is_empty() { + return Ok(None); + } + + if matched_row_ranges.len() == 1 + && matched_row_ranges[0].start == 0 + && matched_row_ranges[0].end == input.num_rows() + { + return Ok(Some(input)); + } + + if matched_row_ranges.len() == 1 { + let span = &matched_row_ranges[0]; + return Ok(Some(input.slice(span.start, span.end - span.start))); + } + + let mut builder = BooleanBufferBuilder::new(input.num_rows()); + builder.append_n(input.num_rows(), false); + for span in matched_row_ranges { + for i in span { + builder.set_bit(i, true); + } + } + + let filtered = datatypes::arrow::compute::filter_record_batch( + &input, + &BooleanArray::new(builder.finish(), None), + ) + .context(ComputeArrowSnafu)?; + if filtered.num_rows() == 0 { + Ok(None) + } else { + Ok(Some(filtered)) + } +} + /// Returns whether a filter can be applied by parquet primary-key prefiltering. 
/// /// Unlike `PartitionTreeMemtable`, parquet prefilter always supports predicates @@ -346,12 +395,19 @@ mod tests { use common_recordbatch::filter::SimpleFilterEvaluator; use datafusion_expr::{col, lit}; - use mito_codec::row_converter::PrimaryKeyFilter; + use datatypes::arrow::array::{ + ArrayRef, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array, + }; + use datatypes::arrow::datatypes::{Schema, UInt32Type}; + use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec}; use store_api::codec::PrimaryKeyEncoding; use super::*; + use crate::sst::internal_fields; use crate::sst::parquet::format::ReadFormat; - use crate::test_util::sst_util::{new_primary_key, sst_region_metadata_with_encoding}; + use crate::test_util::sst_util::{ + new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding, + }; #[test] fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() { @@ -416,4 +472,125 @@ mod tests { assert_eq!(hits.load(Ordering::Relaxed), 2); } + + fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec { + exprs + .iter() + .filter_map(SimpleFilterEvaluator::try_new) + .collect() + } + + fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch { + assert_eq!(primary_keys.len(), field_values.len()); + + let metadata = Arc::new(sst_region_metadata()); + let arrow_schema = metadata.schema.arrow_schema(); + let field_column = arrow_schema + .field(arrow_schema.index_of("field_0").unwrap()) + .clone(); + let time_index_column = arrow_schema + .field(arrow_schema.index_of("ts").unwrap()) + .clone(); + let mut fields = vec![field_column, time_index_column]; + fields.extend( + internal_fields() + .into_iter() + .map(|field| field.as_ref().clone()), + ); + let schema = Arc::new(Schema::new(fields)); + + let mut dict_values = Vec::new(); + let mut keys = Vec::with_capacity(primary_keys.len()); + for pk in primary_keys { + let key = dict_values + .iter() + .position(|existing: 
&&[u8]| existing == pk) + .unwrap_or_else(|| { + dict_values.push(*pk); + dict_values.len() - 1 + }); + keys.push(key as u32); + } + let pk_array: ArrayRef = Arc::new(DictionaryArray::::new( + UInt32Array::from(keys), + Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())), + )); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(field_values.to_vec())), + Arc::new(TimestampMillisecondArray::from_iter_values( + 0..primary_keys.len() as i64, + )), + pk_array, + Arc::new(UInt64Array::from(vec![1; primary_keys.len()])), + Arc::new(UInt8Array::from(vec![1; primary_keys.len()])), + ], + ) + .unwrap() + } + + fn field_values(batch: &RecordBatch) -> Vec { + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + } + + #[test] + fn test_prefilter_primary_key_drops_single_dictionary_batch() { + let metadata = Arc::new(sst_region_metadata()); + let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))])); + let mut primary_key_filter = build_primary_key_codec(metadata.as_ref()) + .primary_key_filter(&metadata, filters, false); + let pk_a = new_primary_key(&["a", "x"]); + let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); + let pk_col_idx = primary_key_column_index(batch.num_columns()); + + let filtered = + prefilter_flat_batch_by_primary_key(batch, pk_col_idx, primary_key_filter.as_mut()) + .unwrap(); + + assert!(filtered.is_none()); + } + + #[test] + fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() { + let metadata = Arc::new(sst_region_metadata()); + let filters = Arc::new(new_test_filters(&[col("tag_0") + .eq(lit("a")) + .or(col("tag_0").eq(lit("c")))])); + let mut primary_key_filter = build_primary_key_codec(metadata.as_ref()) + .primary_key_filter(&metadata, filters, false); + let pk_a = new_primary_key(&["a", "x"]); + let pk_b = new_primary_key(&["b", "x"]); + let pk_c = new_primary_key(&["c", "x"]); + let pk_d = new_primary_key(&["d", "x"]); + let 
batch = new_raw_batch( + &[ + pk_a.as_slice(), + pk_a.as_slice(), + pk_b.as_slice(), + pk_b.as_slice(), + pk_c.as_slice(), + pk_c.as_slice(), + pk_d.as_slice(), + pk_d.as_slice(), + ], + &[10, 11, 12, 13, 14, 15, 16, 17], + ); + let pk_col_idx = primary_key_column_index(batch.num_columns()); + + let filtered = + prefilter_flat_batch_by_primary_key(batch, pk_col_idx, primary_key_filter.as_mut()) + .unwrap() + .unwrap(); + + assert_eq!(filtered.num_rows(), 4); + assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]); + } } From 3f3407fa2433858d041ed21aad000ea0fe9c76a5 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:14:53 +0800 Subject: [PATCH 066/195] feat: partial success in trace ingestion (#7892) * feat: impl partial success Signed-off-by: shuiyisong * refactor: grouping by resource and scope Signed-off-by: shuiyisong * chore: remove unused code Signed-off-by: shuiyisong * chore: rebase main & fix clippy Signed-off-by: shuiyisong * chore: add trace ingestion failure counter Signed-off-by: shuiyisong * fix: address comments Signed-off-by: shuiyisong * fix: update status list and remove TODO Signed-off-by: shuiyisong * fix: address comments Signed-off-by: shuiyisong * fix: fmt Signed-off-by: shuiyisong * chore: add more tests Signed-off-by: shuiyisong * fix: fmt Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- Cargo.lock | 1 + src/frontend/Cargo.toml | 1 + src/frontend/src/instance/otlp.rs | 554 +++++++++++++++++++++++++++-- src/frontend/src/metrics.rs | 8 + src/servers/src/http/otlp.rs | 13 +- src/servers/src/otlp/trace.rs | 77 +++- src/servers/src/otlp/trace/span.rs | 115 +++++- src/servers/src/otlp/trace/v0.rs | 136 ++++--- src/servers/src/otlp/trace/v1.rs | 125 ++++--- src/servers/src/query_handler.rs | 10 +- tests-integration/tests/http.rs | 89 ++++- 11 files changed, 980 insertions(+), 149 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 54be9bbdcb..695f19b072 100644 
--- a/Cargo.lock +++ b/Cargo.lock @@ -5255,6 +5255,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper-util", + "itertools 0.14.0", "lazy_static", "log-query", "meta-client", diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 03b0d35130..1b0ffe6e29 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -52,6 +52,7 @@ futures.workspace = true hostname.workspace = true humantime.workspace = true humantime-serde.workspace = true +itertools.workspace = true lazy_static.workspace = true log-query.workspace = true meta-client.workspace = true diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 59174aa89a..8cda639686 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -19,9 +19,11 @@ use api::v1::{ColumnDataType, RowInsertRequests}; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::Output; -use common_error::ext::BoxedError; +use common_error::ext::{BoxedError, ErrorExt}; +use common_error::status_code::StatusCode; use common_query::prelude::GREPTIME_PHYSICAL_TABLE; use common_telemetry::tracing; +use itertools::Itertools; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; @@ -30,17 +32,57 @@ use servers::error::{self, AuthSnafu, Result as ServerResult}; use servers::http::prom_store::PHYSICAL_TABLE_PARAM; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; +use servers::otlp::trace::TraceAuxData; use servers::otlp::trace::coerce::{ coerce_value_data, is_supported_trace_coercion, resolve_new_trace_column_type, trace_value_datatype, }; -use servers::query_handler::{OpenTelemetryProtocolHandler, PipelineHandlerRef}; +use 
servers::otlp::trace::span::{TraceSpan, TraceSpanGroup}; +use servers::query_handler::{ + OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome, +}; use session::context::QueryContextRef; use snafu::ResultExt; use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM}; use crate::instance::Instance; -use crate::metrics::{OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_ROWS}; +use crate::metrics::{ + OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS, +}; + +const TRACE_INGEST_CHUNK_SIZE: usize = 64; +const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ChunkFailureReaction { + RetryPerSpan, + DiscardChunk, + Propagate, +} + +impl ChunkFailureReaction { + fn as_metric_label(self) -> &'static str { + match self { + Self::RetryPerSpan => "retry_per_span", + Self::DiscardChunk => "discard_chunk", + Self::Propagate => "propagate_failure", + } + } +} + +struct TraceChunkIngestContext<'a> { + pipeline_handler: PipelineHandlerRef, + pipeline: &'a PipelineWay, + pipeline_params: &'a GreptimePipelineParams, + table_name: &'a str, + is_trace_v1_model: bool, +} + +struct TraceIngestState { + aux_data: TraceAuxData, + outcome: TraceIngestOutcome, + failure_messages: Vec, +} #[async_trait] impl OpenTelemetryProtocolHandler for Instance { @@ -116,7 +158,7 @@ impl OpenTelemetryProtocolHandler for Instance { pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, - ) -> ServerResult { + ) -> ServerResult { self.plugins .get::() .as_ref() @@ -128,32 +170,16 @@ impl OpenTelemetryProtocolHandler for Instance { .get::>(); interceptor_ref.pre_execute(ctx.clone())?; - let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1); - - let (mut requests, rows) = otlp::trace::to_grpc_insert_requests( - request, - pipeline, - pipeline_params, - table_name, - &ctx, + let spans = otlp::trace::span::parse(request); + self.ingest_trace_spans( 
pipeline_handler, - )?; - - OTLP_TRACES_ROWS.inc_by(rows as u64); - - if is_trace_v1_model { - self.reconcile_trace_column_types(&mut requests, &ctx) - .await?; - self.handle_trace_inserts(requests, ctx) - .await - .map_err(BoxedError::new) - .context(error::ExecuteGrpcQuerySnafu) - } else { - self.handle_log_inserts(requests, ctx) - .await - .map_err(BoxedError::new) - .context(error::ExecuteGrpcQuerySnafu) - } + &pipeline, + &pipeline_params, + table_name, + spans, + ctx, + ) + .await } #[tracing::instrument(skip_all)] @@ -210,6 +236,316 @@ impl OpenTelemetryProtocolHandler for Instance { } impl Instance { + /// Ingest OTLP trace spans with chunk-level writes and span-level fallback on + /// deterministic chunk failures. + async fn ingest_trace_spans( + &self, + pipeline_handler: PipelineHandlerRef, + pipeline: &PipelineWay, + pipeline_params: &GreptimePipelineParams, + table_name: String, + groups: Vec, + ctx: QueryContextRef, + ) -> ServerResult { + let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1); + let ingest_ctx = TraceChunkIngestContext { + pipeline_handler, + pipeline, + pipeline_params, + table_name: &table_name, + is_trace_v1_model, + }; + let mut ingest_state = TraceIngestState { + aux_data: TraceAuxData::default(), + outcome: TraceIngestOutcome::default(), + failure_messages: Vec::new(), + }; + + for group in groups { + let chunks = group + .spans + .into_iter() + .chunks(TRACE_INGEST_CHUNK_SIZE) + .into_iter() + .map(|chunk| chunk.collect::>()) + .collect::>(); + for chunk in chunks { + self.ingest_trace_chunk(&ingest_ctx, chunk, ctx.clone(), &mut ingest_state) + .await?; + } + } + + OTLP_TRACES_ROWS.inc_by(ingest_state.outcome.accepted_spans as u64); + + if !ingest_state.aux_data.is_empty() { + // Auxiliary trace tables are derived from spans whose main-table + // writes are already confirmed, so they never create new accepted + // spans and they do not affect rejected span counts. 
+ let (aux_requests, _) = otlp::trace::to_grpc_insert_requests_for_aux_tables( + std::mem::take(&mut ingest_state.aux_data), + ingest_ctx.pipeline, + ingest_ctx.table_name, + )?; + + if !aux_requests.inserts.is_empty() { + match self + .insert_trace_requests(aux_requests, ingest_ctx.is_trace_v1_model, ctx) + .await + { + Ok(output) => { + Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost); + } + Err(err) => { + Self::push_trace_failure_message( + &mut ingest_state.failure_messages, + "aux_table_update_failed", + format!( + "Auxiliary trace tables were not fully updated ({})", + err.status_code().as_ref() + ), + ); + } + } + } + } + + ingest_state.outcome.error_message = Self::finish_trace_failure_message( + ingest_state.outcome.accepted_spans, + ingest_state.outcome.rejected_spans, + ingest_state.failure_messages, + ); + + Ok(ingest_state.outcome) + } + + /// Ingest one owned trace chunk so successful spans can be moved into the + /// accepted set without extra cloning. + async fn ingest_trace_chunk( + &self, + ingest_ctx: &TraceChunkIngestContext<'_>, + chunk: Vec, + ctx: QueryContextRef, + ingest_state: &mut TraceIngestState, + ) -> ServerResult<()> { + // Try the fast path first so healthy batches keep their original + // throughput and write amplification stays low. 
+ let (requests, chunk_rows) = otlp::trace::to_grpc_insert_requests_from_spans( + &chunk, + ingest_ctx.pipeline, + ingest_ctx.pipeline_params, + ingest_ctx.table_name, + &ctx, + ingest_ctx.pipeline_handler.clone(), + )?; + + match self + .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone()) + .await + { + Ok(output) => { + Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost); + ingest_state.outcome.accepted_spans += chunk_rows; + for span in &chunk { + ingest_state.aux_data.observe_span(span); + } + } + Err(err) => match Self::classify_trace_chunk_failure(err.status_code()) { + ChunkFailureReaction::RetryPerSpan => { + Self::push_trace_failure_message( + &mut ingest_state.failure_messages, + ChunkFailureReaction::RetryPerSpan.as_metric_label(), + format!("Chunk fallback triggered by {}", err.status_code().as_ref()), + ); + // Only deterministic failures are retried span by span. + // This includes schemaless table or column creation paths for + // trace ingestion. Ambiguous failures are handled below + // without retrying because the chunk may already have been + // ingested. + self.ingest_trace_chunk_span_by_span( + ingest_ctx, + chunk, + ctx.clone(), + ingest_state, + ) + .await?; + } + ChunkFailureReaction::DiscardChunk => { + ingest_state.outcome.rejected_spans += chunk.len(); + Self::push_trace_failure_message( + &mut ingest_state.failure_messages, + ChunkFailureReaction::DiscardChunk.as_metric_label(), + format!( + "Discarded {} spans after ambiguous chunk failure ({})", + chunk.len(), + err.status_code().as_ref() + ), + ); + // TODO(shuiyisong): Add an idempotent retry-safe recovery path for + // ambiguous chunk failures such as timeout-like errors. + } + // Retryable or ambiguous failures must fail the request instead of + // becoming partial success. This path is not retry-safe because the + // chunk may already have been committed before the error surfaced. 
+ ChunkFailureReaction::Propagate => { + Self::push_trace_failure_message( + &mut ingest_state.failure_messages, + ChunkFailureReaction::Propagate.as_metric_label(), + format!( + "Propagating retryable chunk failure ({})", + err.status_code().as_ref() + ), + ); + return Err(err); + } + }, + } + + Ok(()) + } + + /// Retry spans one by one only after a deterministic chunk failure. + async fn ingest_trace_chunk_span_by_span( + &self, + ingest_ctx: &TraceChunkIngestContext<'_>, + chunk: Vec, + ctx: QueryContextRef, + ingest_state: &mut TraceIngestState, + ) -> ServerResult<()> { + for span in chunk { + let (requests, rows) = otlp::trace::to_grpc_insert_requests_from_spans( + std::slice::from_ref(&span), + ingest_ctx.pipeline, + ingest_ctx.pipeline_params, + ingest_ctx.table_name, + &ctx, + ingest_ctx.pipeline_handler.clone(), + )?; + + match self + .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone()) + .await + { + Ok(output) => { + Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost); + ingest_state.outcome.accepted_spans += rows; + ingest_state.aux_data.observe_span(&span); + } + Err(err) => { + if Self::should_propagate_trace_span_failure(err.status_code()) { + Self::push_trace_failure_message( + &mut ingest_state.failure_messages, + ChunkFailureReaction::Propagate.as_metric_label(), + format!( + "Propagating retryable span failure for {}:{} ({})", + span.trace_id, + span.span_id, + err.status_code().as_ref() + ), + ); + return Err(err); + } + + ingest_state.outcome.rejected_spans += 1; + Self::push_trace_failure_message( + &mut ingest_state.failure_messages, + "span_rejected", + format!( + "Rejected span {}:{} ({})", + span.trace_id, + span.span_id, + err.status_code().as_ref() + ), + ); + } + } + } + + Ok(()) + } + + /// Reconcile and insert one trace request batch. 
+ async fn insert_trace_requests( + &self, + mut requests: RowInsertRequests, + is_trace_v1_model: bool, + ctx: QueryContextRef, + ) -> ServerResult { + if is_trace_v1_model { + self.reconcile_trace_column_types(&mut requests, &ctx) + .await?; + self.handle_trace_inserts(requests, ctx) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu) + } else { + self.handle_log_inserts(requests, ctx) + .await + .map_err(BoxedError::new) + .context(error::ExecuteGrpcQuerySnafu) + } + } + + fn classify_trace_chunk_failure(status: StatusCode) -> ChunkFailureReaction { + match status { + StatusCode::InvalidArguments + | StatusCode::InvalidSyntax + | StatusCode::Unsupported + | StatusCode::TableNotFound + | StatusCode::TableColumnNotFound => ChunkFailureReaction::RetryPerSpan, + StatusCode::DatabaseNotFound => ChunkFailureReaction::DiscardChunk, + StatusCode::Cancelled | StatusCode::DeadlineExceeded => ChunkFailureReaction::Propagate, + _ if status.is_retryable() => ChunkFailureReaction::Propagate, + _ => ChunkFailureReaction::DiscardChunk, + } + } + + fn should_propagate_trace_span_failure(status: StatusCode) -> bool { + matches!( + Self::classify_trace_chunk_failure(status), + ChunkFailureReaction::Propagate + ) + } + + fn add_trace_write_cost(outcome: &mut TraceIngestOutcome, cost: usize) { + outcome.write_cost += cost; + } + + fn push_trace_failure_message(messages: &mut Vec, label: &str, message: String) { + OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).inc(); + + if messages.len() < TRACE_FAILURE_MESSAGE_LIMIT { + messages.push(message); + } else if messages.len() == TRACE_FAILURE_MESSAGE_LIMIT { + tracing::debug!( + label, + limit = TRACE_FAILURE_MESSAGE_LIMIT, + "Trace ingest failure message limit reached; suppressing additional failure details" + ); + } + } + + fn finish_trace_failure_message( + accepted_spans: usize, + rejected_spans: usize, + messages: Vec, + ) -> Option { + if rejected_spans == 0 && messages.is_empty() { + return None; + 
} + + let mut summary = format!( + "Accepted {} spans, rejected {} spans", + accepted_spans, rejected_spans + ); + + if !messages.is_empty() { + summary.push_str(": "); + summary.push_str(&messages.join("; ")); + } + + Some(summary) + } + /// Picks the final datatype for one trace column. /// /// Existing table schema is authoritative when present. Otherwise we resolve the @@ -428,3 +764,163 @@ fn push_observed_trace_type(observed_types: &mut Vec, datatype: observed_types.push(datatype); } } + +#[cfg(test)] +mod tests { + use common_error::status_code::StatusCode; + use servers::query_handler::TraceIngestOutcome; + + use super::{ChunkFailureReaction, Instance}; + use crate::metrics::OTLP_TRACES_FAILURE_COUNT; + + #[test] + fn test_classify_trace_chunk_failure() { + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::InvalidArguments), + ChunkFailureReaction::RetryPerSpan + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::InvalidSyntax), + ChunkFailureReaction::RetryPerSpan + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::Unsupported), + ChunkFailureReaction::RetryPerSpan + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::TableColumnNotFound), + ChunkFailureReaction::RetryPerSpan + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::TableNotFound), + ChunkFailureReaction::RetryPerSpan + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::DatabaseNotFound), + ChunkFailureReaction::DiscardChunk + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::DeadlineExceeded), + ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::Cancelled), + ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::StorageUnavailable), + ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::Internal), + 
ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::RegionNotReady), + ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::TableUnavailable), + ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::RegionBusy), + ChunkFailureReaction::Propagate + ); + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::RuntimeResourcesExhausted), + ChunkFailureReaction::Propagate + ); + } + + #[test] + fn test_classify_trace_span_failure() { + assert!(Instance::should_propagate_trace_span_failure( + StatusCode::DeadlineExceeded + )); + assert!(Instance::should_propagate_trace_span_failure( + StatusCode::StorageUnavailable + )); + assert!(!Instance::should_propagate_trace_span_failure( + StatusCode::InvalidArguments + )); + } + + #[test] + fn test_add_trace_write_cost() { + let mut outcome = TraceIngestOutcome::default(); + Instance::add_trace_write_cost(&mut outcome, 3); + Instance::add_trace_write_cost(&mut outcome, 5); + assert_eq!(outcome.write_cost, 8); + } + + #[test] + fn test_finish_trace_failure_message() { + let message = Instance::finish_trace_failure_message( + 3, + 2, + vec!["Rejected span trace:span (InvalidArguments)".to_string()], + ) + .unwrap(); + assert!(message.contains("Accepted 3 spans, rejected 2 spans")); + assert!(message.contains("Rejected span trace:span")); + + assert_eq!(Instance::finish_trace_failure_message(2, 0, vec![]), None); + } + + #[test] + fn test_finish_trace_failure_message_without_detail_messages() { + assert_eq!( + Instance::finish_trace_failure_message(0, 2, vec![]), + Some("Accepted 0 spans, rejected 2 spans".to_string()) + ); + } + + #[test] + fn test_push_trace_failure_message_increments_labeled_counter() { + let label = "retry_per_span_counter_test"; + let initial = OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).get(); + let mut messages = Vec::new(); + + 
Instance::push_trace_failure_message( + &mut messages, + label, + "Chunk fallback triggered by InvalidArguments".to_string(), + ); + + assert_eq!(messages.len(), 1); + assert_eq!( + OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).get(), + initial + 1 + ); + } + + #[test] + fn test_push_trace_failure_message_caps_recorded_messages() { + let label = "retry_per_span_limit_test"; + let mut messages = Vec::new(); + + for idx in 0..=4 { + Instance::push_trace_failure_message(&mut messages, label, format!("failure-{idx}")); + } + + assert_eq!(messages.len(), 4); + assert_eq!( + messages, + vec![ + "failure-0".to_string(), + "failure-1".to_string(), + "failure-2".to_string(), + "failure-3".to_string() + ] + ); + } + + #[test] + fn test_classify_trace_chunk_failure_defaults_to_discard() { + assert_eq!( + Instance::classify_trace_chunk_failure(StatusCode::Unknown), + ChunkFailureReaction::DiscardChunk + ); + } +} diff --git a/src/frontend/src/metrics.rs b/src/frontend/src/metrics.rs index 58ba21476a..aba33637cf 100644 --- a/src/frontend/src/metrics.rs +++ b/src/frontend/src/metrics.rs @@ -52,6 +52,14 @@ lazy_static! { ) .unwrap(); + /// The number of OpenTelemetry trace ingest failures on the frontend node. + pub static ref OTLP_TRACES_FAILURE_COUNT: IntCounterVec = register_int_counter_vec!( + "greptime_frontend_otlp_traces_failure_count", + "frontend otlp trace ingest failure count", + &["label"] + ) + .unwrap(); + /// The number of OpenTelemetry logs send by frontend node. 
pub static ref OTLP_LOGS_ROWS: IntCounter = register_int_counter!( "greptime_frontend_otlp_logs_rows", diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index 4fd2d42122..3d6057f046 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -29,7 +29,7 @@ use opentelemetry_proto::tonic::collector::logs::v1::{ }; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceResponse; use opentelemetry_proto::tonic::collector::trace::v1::{ - ExportTraceServiceRequest, ExportTraceServiceResponse, + ExportTracePartialSuccess, ExportTraceServiceRequest, ExportTraceServiceResponse, }; use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; use pipeline::PipelineWay; @@ -175,11 +175,16 @@ pub async fn traces( query_ctx, ) .await - .map(|o| OtlpResponse { + .map(|outcome| OtlpResponse { resp_body: ExportTraceServiceResponse { - partial_success: None, + partial_success: outcome.error_message.map(|error_message| { + ExportTracePartialSuccess { + rejected_spans: outcome.rejected_spans as i64, + error_message, + } + }), }, - write_cost: o.meta.cost, + write_cost: outcome.write_cost, }) } diff --git a/src/servers/src/otlp/trace.rs b/src/servers/src/otlp/trace.rs index ca56f9b868..98f4441923 100644 --- a/src/servers/src/otlp/trace.rs +++ b/src/servers/src/otlp/trace.rs @@ -18,15 +18,17 @@ pub mod span; pub mod v0; pub mod v1; +use std::collections::HashSet; + use api::v1::RowInsertRequests; pub use common_catalog::consts::{ PARENT_SPAN_ID_COLUMN, SPAN_ID_COLUMN, SPAN_NAME_COLUMN, TRACE_ID_COLUMN, }; -use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use pipeline::{GreptimePipelineParams, PipelineWay}; use session::context::QueryContextRef; use crate::error::{NotSupportedSnafu, Result}; +use crate::otlp::trace::span::TraceSpan; use crate::query_handler::PipelineHandlerRef; // column names @@ -65,27 +67,58 @@ pub const SPAN_STATUS_PREFIX: &str = 
"STATUS_CODE_"; pub const SPAN_STATUS_UNSET: &str = "STATUS_CODE_UNSET"; pub const SPAN_STATUS_ERROR: &str = "STATUS_CODE_ERROR"; -/// Convert SpanTraces to GreptimeDB row insert requests. -/// Returns `InsertRequests` and total number of rows to ingest -pub fn to_grpc_insert_requests( - request: ExportTraceServiceRequest, - pipeline: PipelineWay, - pipeline_params: GreptimePipelineParams, - table_name: String, +/// Deduplicated auxiliary trace entities derived from successfully ingested +/// spans. +/// +/// The main trace table is written first. Once a span is confirmed accepted, we +/// record the service and operation tuples here so the auxiliary tables can be +/// updated separately without affecting span acceptance accounting. +#[derive(Debug, Default)] +pub struct TraceAuxData { + pub services: HashSet, + pub operations: HashSet<(String, String, String)>, +} + +impl TraceAuxData { + /// Records the auxiliary service and operation rows implied by one accepted + /// span. + pub fn observe_span(&mut self, span: &TraceSpan) { + if let Some(service_name) = &span.service_name { + self.services.insert(service_name.clone()); + self.operations.insert(( + service_name.clone(), + span.span_name.clone(), + span.span_kind.clone(), + )); + } + } + + /// Returns true when no auxiliary table updates are needed. + pub fn is_empty(&self) -> bool { + self.services.is_empty() && self.operations.is_empty() + } +} + +/// Convert a subset of trace spans to GreptimeDB row insert requests. 
+pub fn to_grpc_insert_requests_from_spans( + spans: &[TraceSpan], + pipeline: &PipelineWay, + pipeline_params: &GreptimePipelineParams, + table_name: &str, query_ctx: &QueryContextRef, pipeline_handler: PipelineHandlerRef, ) -> Result<(RowInsertRequests, usize)> { match pipeline { - PipelineWay::OtlpTraceDirectV0 => v0::v0_to_grpc_insert_requests( - request, + PipelineWay::OtlpTraceDirectV0 => v0::v0_to_grpc_main_insert_requests( + spans, pipeline, pipeline_params, table_name, query_ctx, pipeline_handler, ), - PipelineWay::OtlpTraceDirectV1 => v1::v1_to_grpc_insert_requests( - request, + PipelineWay::OtlpTraceDirectV1 => v1::v1_to_grpc_main_insert_requests( + spans, pipeline, pipeline_params, table_name, @@ -98,3 +131,23 @@ pub fn to_grpc_insert_requests( .fail(), } } + +/// Build insert requests for the auxiliary trace tables derived from accepted +/// spans. +/// +/// "Aux" here refers to the trace service and trace operation tables, not the +/// main trace span table itself. +pub fn to_grpc_insert_requests_for_aux_tables( + aux_data: TraceAuxData, + pipeline: &PipelineWay, + table_name: &str, +) -> Result<(RowInsertRequests, usize)> { + match pipeline { + PipelineWay::OtlpTraceDirectV0 => v0::build_aux_table_requests(aux_data, table_name), + PipelineWay::OtlpTraceDirectV1 => v1::build_aux_table_requests(aux_data, table_name), + _ => NotSupportedSnafu { + feat: "Unsupported pipeline for trace", + } + .fail(), + } +} diff --git a/src/servers/src/otlp/trace/span.rs b/src/servers/src/otlp/trace/span.rs index d96bc17277..19103240f6 100644 --- a/src/servers/src/otlp/trace/span.rs +++ b/src/servers/src/otlp/trace/span.rs @@ -53,6 +53,18 @@ pub struct TraceSpan { pub type TraceSpans = Vec; +#[derive(Debug, Clone)] +pub struct TraceSpanGroup { + pub service_name: Option, + pub resource_attributes: Attributes, + pub scope_name: String, + pub scope_version: String, + pub scope_attributes: Attributes, + pub spans: TraceSpans, +} + +pub type TraceSpanGroups = Vec; + 
#[derive(Debug, Clone, Serialize)] pub struct SpanLink { pub trace_id: String, @@ -241,14 +253,13 @@ pub fn status_to_string(status: &Option) -> (String, String) { /// See /// /// for data structure of OTLP traces. -pub fn parse(request: ExportTraceServiceRequest) -> TraceSpans { - let span_size = request +pub fn parse(request: ExportTraceServiceRequest) -> TraceSpanGroups { + let group_size = request .resource_spans .iter() .flat_map(|res| res.scope_spans.iter()) - .flat_map(|scope| scope.spans.iter()) .count(); - let mut spans = Vec::with_capacity(span_size); + let mut groups = Vec::with_capacity(group_size); for resource_spans in request.resource_spans { let resource_attrs = resource_spans .resource @@ -268,6 +279,7 @@ pub fn parse(request: ExportTraceServiceRequest) -> TraceSpans { for scope_spans in resource_spans.scope_spans { let scope = scope_spans.scope.unwrap_or_default(); + let mut spans = Vec::with_capacity(scope_spans.spans.len()); for span in scope_spans.spans { spans.push(parse_span( service_name.clone(), @@ -276,16 +288,47 @@ pub fn parse(request: ExportTraceServiceRequest) -> TraceSpans { span, )); } + groups.push(TraceSpanGroup { + service_name: service_name.clone(), + resource_attributes: Attributes::from(&resource_attrs[..]), + scope_name: scope.name, + scope_version: scope.version, + scope_attributes: Attributes::from(scope.attributes), + spans, + }); } } - spans + groups } #[cfg(test)] mod tests { - use opentelemetry_proto::tonic::trace::v1::Status; + use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; + use opentelemetry_proto::tonic::common::v1::{ + AnyValue, InstrumentationScope, KeyValue, any_value, + }; + use opentelemetry_proto::tonic::resource::v1::Resource; + use opentelemetry_proto::tonic::trace::v1::{ResourceSpans, ScopeSpans, Span, Status}; - use crate::otlp::trace::span::{bytes_to_hex_string, status_to_string}; + use crate::otlp::trace::KEY_SERVICE_NAME; + use 
crate::otlp::trace::span::{bytes_to_hex_string, parse, status_to_string}; + + fn make_kv(key: &str, value: &str) -> KeyValue { + KeyValue { + key: key.to_string(), + value: Some(AnyValue { + value: Some(any_value::Value::StringValue(value.to_string())), + }), + } + } + + fn make_span(trace_id: u8, span_id: u8) -> Span { + Span { + trace_id: vec![trace_id; 16], + span_id: vec![span_id; 8], + ..Default::default() + } + } #[test] fn test_bytes_to_hex_string() { @@ -315,4 +358,62 @@ mod tests { status_to_string(&Some(status)), ); } + + #[test] + fn test_parse_preserves_resource_scope_groups() { + let request = ExportTraceServiceRequest { + resource_spans: vec![ + ResourceSpans { + resource: Some(Resource { + attributes: vec![make_kv(KEY_SERVICE_NAME, "svc-a")], + ..Default::default() + }), + scope_spans: vec![ + ScopeSpans { + scope: Some(InstrumentationScope { + name: "scope-1".to_string(), + ..Default::default() + }), + spans: vec![make_span(0x11, 0x21), make_span(0x12, 0x22)], + ..Default::default() + }, + ScopeSpans { + scope: Some(InstrumentationScope { + name: "scope-2".to_string(), + ..Default::default() + }), + spans: vec![make_span(0x13, 0x23)], + ..Default::default() + }, + ], + ..Default::default() + }, + ResourceSpans { + resource: Some(Resource { + attributes: vec![make_kv(KEY_SERVICE_NAME, "svc-b")], + ..Default::default() + }), + scope_spans: vec![ScopeSpans { + scope: Some(InstrumentationScope { + name: "scope-3".to_string(), + ..Default::default() + }), + spans: vec![make_span(0x14, 0x24)], + ..Default::default() + }], + ..Default::default() + }, + ], + }; + + let groups = parse(request); + assert_eq!(groups.len(), 3); + assert_eq!(groups[0].service_name.as_deref(), Some("svc-a")); + assert_eq!(groups[0].scope_name, "scope-1"); + assert_eq!(groups[0].spans.len(), 2); + assert_eq!(groups[1].scope_name, "scope-2"); + assert_eq!(groups[1].spans.len(), 1); + assert_eq!(groups[2].service_name.as_deref(), Some("svc-b")); + assert_eq!(groups[2].scope_name, 
"scope-3"); + } } diff --git a/src/servers/src/otlp/trace/v0.rs b/src/servers/src/otlp/trace/v0.rs index b52b406fb2..fa10dcc00f 100644 --- a/src/servers/src/otlp/trace/v0.rs +++ b/src/servers/src/otlp/trace/v0.rs @@ -18,16 +18,16 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, RowInsertRequests}; use common_catalog::consts::{trace_operations_table_name, trace_services_table_name}; use common_grpc::precision::Precision; -use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use pipeline::{GreptimePipelineParams, PipelineWay}; use session::context::QueryContextRef; use crate::error::Result; -use crate::otlp::trace::span::{TraceSpan, parse}; +use crate::otlp::trace::span::TraceSpan; use crate::otlp::trace::{ DURATION_NANO_COLUMN, PARENT_SPAN_ID_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_EVENTS_COLUMN, SPAN_ID_COLUMN, SPAN_KIND_COLUMN, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_MESSAGE_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN, TRACE_STATE_COLUMN, + TraceAuxData, }; use crate::otlp::utils::{make_column_data, make_string_column_data}; use crate::query_handler::PipelineHandlerRef; @@ -38,56 +38,52 @@ const APPROXIMATE_COLUMN_COUNT: usize = 24; // Use a timestamp(2100-01-01 00:00:00) as large as possible. const MAX_TIMESTAMP: i64 = 4102444800000000000; -/// Convert SpanTraces to GreptimeDB row insert requests. -/// Returns `InsertRequests` and total number of rows to ingest -pub fn v0_to_grpc_insert_requests( - request: ExportTraceServiceRequest, - _pipeline: PipelineWay, - _pipeline_params: GreptimePipelineParams, - table_name: String, +/// Converts trace spans into row insert requests for the main v0 trace table. +/// +/// Auxiliary service and operation table writes are built separately so the +/// caller can update them only after the main span write succeeds. 
+pub fn v0_to_grpc_main_insert_requests( + spans: &[TraceSpan], + _pipeline: &PipelineWay, + _pipeline_params: &GreptimePipelineParams, + table_name: &str, _query_ctx: &QueryContextRef, _pipeline_handler: PipelineHandlerRef, ) -> Result<(RowInsertRequests, usize)> { - let spans = parse(request); let mut multi_table_writer = MultiTableData::default(); + let trace_writer = build_trace_table_data(spans)?; + multi_table_writer.add_table_data(table_name, trace_writer); + + Ok(multi_table_writer.into_row_insert_requests()) +} + +/// Builds the row-oriented payload for the main v0 trace table. +pub fn build_trace_table_data(spans: &[TraceSpan]) -> Result { let mut trace_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, spans.len()); + for span in spans.iter().cloned() { + write_span_to_row(&mut trace_writer, span)?; + } + + Ok(trace_writer) +} + +/// Builds row insert requests for the v0 trace auxiliary tables. +pub fn build_aux_table_requests( + aux_data: TraceAuxData, + table_name: &str, +) -> Result<(RowInsertRequests, usize)> { + let mut multi_table_writer = MultiTableData::default(); let mut trace_services_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); let mut trace_operations_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); - let mut services = HashSet::new(); - let mut operations = HashSet::new(); - for span in spans { - if let Some(service_name) = &span.service_name { - // Only insert the service name if it's not already in the set. - if !services.contains(service_name) { - services.insert(service_name.clone()); - } - - // Collect operations (service_name + span_name + span_kind). 
- let operation = ( - service_name.clone(), - span.span_name.clone(), - span.span_kind.clone(), - ); - if !operations.contains(&operation) { - operations.insert(operation); - } - } - write_span_to_row(&mut trace_writer, span)?; - } - write_trace_services_to_row(&mut trace_services_writer, services)?; - write_trace_operations_to_row(&mut trace_operations_writer, operations)?; + write_trace_services_to_row(&mut trace_services_writer, aux_data.services)?; + write_trace_operations_to_row(&mut trace_operations_writer, aux_data.operations)?; + multi_table_writer.add_table_data(trace_services_table_name(table_name), trace_services_writer); multi_table_writer.add_table_data( - trace_services_table_name(&table_name), - trace_services_writer, - ); - multi_table_writer.add_table_data( - trace_operations_table_name(&table_name), + trace_operations_table_name(table_name), trace_operations_writer, ); - multi_table_writer.add_table_data(table_name, trace_writer); - Ok(multi_table_writer.into_row_insert_requests()) } @@ -232,3 +228,63 @@ fn write_trace_operations_to_row( Ok(()) } + +#[cfg(test)] +mod tests { + use super::{build_aux_table_requests, build_trace_table_data}; + use crate::otlp::trace::TraceAuxData; + use crate::otlp::trace::attributes::Attributes; + use crate::otlp::trace::span::{SpanEvents, SpanLinks, TraceSpan}; + + fn make_span(service_name: &str, trace_id: &str, span_id: &str) -> TraceSpan { + TraceSpan { + service_name: Some(service_name.to_string()), + trace_id: trace_id.to_string(), + span_id: span_id.to_string(), + parent_span_id: None, + resource_attributes: Attributes::from(vec![]), + scope_name: "scope".to_string(), + scope_version: "v1".to_string(), + scope_attributes: Attributes::from(vec![]), + trace_state: String::new(), + span_name: "op".to_string(), + span_kind: "SPAN_KIND_SERVER".to_string(), + span_status_code: "STATUS_CODE_UNSET".to_string(), + span_status_message: String::new(), + span_attributes: Attributes::from(vec![]), + span_events: 
SpanEvents::from(vec![]), + span_links: SpanLinks::from(vec![]), + start_in_nanosecond: 1, + end_in_nanosecond: 2, + } + } + + #[test] + fn test_build_trace_table_data_from_span_subset() { + let spans = [ + make_span("svc-a", "trace-a", "span-a"), + make_span("svc-b", "trace-b", "span-b"), + ]; + + let writer = build_trace_table_data(&spans[..1]).unwrap(); + let (_, rows) = writer.into_schema_and_rows(); + assert_eq!(rows.len(), 1); + } + + #[test] + fn test_build_aux_table_requests_deduplicates_services_and_operations() { + let spans = vec![ + make_span("svc-a", "trace-a", "span-a"), + make_span("svc-a", "trace-b", "span-b"), + ]; + let mut aux_data = TraceAuxData::default(); + for span in &spans { + aux_data.observe_span(span); + } + + let (requests, total_rows) = + build_aux_table_requests(aux_data, "opentelemetry_traces").unwrap(); + assert_eq!(requests.inserts.len(), 2); + assert_eq!(total_rows, 2); + } +} diff --git a/src/servers/src/otlp/trace/v1.rs b/src/servers/src/otlp/trace/v1.rs index 11e986de04..cce6891b0d 100644 --- a/src/servers/src/otlp/trace/v1.rs +++ b/src/servers/src/otlp/trace/v1.rs @@ -18,19 +18,18 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, RowInsertRequests, Value}; use common_catalog::consts::{trace_operations_table_name, trace_services_table_name}; use common_grpc::precision::Precision; -use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; use opentelemetry_proto::tonic::common::v1::any_value::Value as OtlpValue; use pipeline::{GreptimePipelineParams, PipelineWay}; use session::context::QueryContextRef; use crate::error::Result; use crate::otlp::trace::attributes::Attributes; -use crate::otlp::trace::span::{TraceSpan, parse}; +use crate::otlp::trace::span::TraceSpan; use crate::otlp::trace::{ DURATION_NANO_COLUMN, KEY_SERVICE_NAME, PARENT_SPAN_ID_COLUMN, SCOPE_NAME_COLUMN, SCOPE_VERSION_COLUMN, SERVICE_NAME_COLUMN, SPAN_EVENTS_COLUMN, SPAN_ID_COLUMN, SPAN_KIND_COLUMN, SPAN_NAME_COLUMN, 
SPAN_STATUS_CODE, SPAN_STATUS_MESSAGE_COLUMN, - TIMESTAMP_COLUMN, TRACE_ID_COLUMN, TRACE_STATE_COLUMN, + TIMESTAMP_COLUMN, TRACE_ID_COLUMN, TRACE_STATE_COLUMN, TraceAuxData, }; use crate::otlp::utils::{any_value_to_jsonb, make_column_data, make_string_column_data}; use crate::query_handler::PipelineHandlerRef; @@ -41,64 +40,52 @@ const APPROXIMATE_COLUMN_COUNT: usize = 30; // Use a timestamp(2100-01-01 00:00:00) as large as possible. const MAX_TIMESTAMP: i64 = 4102444800000000000; -/// Convert SpanTraces to GreptimeDB row insert requests. -/// Returns `InsertRequests` and total number of rows to ingest +/// Converts trace spans into row insert requests for the main v1 trace table. /// -/// Compared with v0, this v1 implementation: -/// 1. flattens all attribute data into columns. -/// 2. treat `span_id` and `parent_trace_id` as fields. -/// 3. removed `service_name` column because it's already in -/// `resource_attributes.service_name` -/// -/// For other compound data structures like span_links and span_events here we -/// are still using `json` data structure. -pub fn v1_to_grpc_insert_requests( - request: ExportTraceServiceRequest, - _pipeline: PipelineWay, - _pipeline_params: GreptimePipelineParams, - table_name: String, +/// Auxiliary service and operation table writes are built separately so the +/// caller can update them only after the main span write succeeds. +pub fn v1_to_grpc_main_insert_requests( + spans: &[TraceSpan], + _pipeline: &PipelineWay, + _pipeline_params: &GreptimePipelineParams, + table_name: &str, _query_ctx: &QueryContextRef, _pipeline_handler: PipelineHandlerRef, ) -> Result<(RowInsertRequests, usize)> { - let spans = parse(request); let mut multi_table_writer = MultiTableData::default(); + let trace_writer = build_trace_table_data(spans)?; + multi_table_writer.add_table_data(table_name, trace_writer); + + Ok(multi_table_writer.into_row_insert_requests()) +} + +/// Builds the row-oriented payload for the main v1 trace table. 
+pub fn build_trace_table_data(spans: &[TraceSpan]) -> Result { let mut trace_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, spans.len()); + for span in spans.iter().cloned() { + write_span_to_row(&mut trace_writer, span)?; + } + + Ok(trace_writer) +} + +/// Builds row insert requests for the v1 trace auxiliary tables. +pub fn build_aux_table_requests( + aux_data: TraceAuxData, + table_name: &str, +) -> Result<(RowInsertRequests, usize)> { + let mut multi_table_writer = MultiTableData::default(); let mut trace_services_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); let mut trace_operations_writer = TableData::new(APPROXIMATE_COLUMN_COUNT, 1); - let mut services = HashSet::new(); - let mut operations = HashSet::new(); - for span in spans { - if let Some(service_name) = &span.service_name { - // Only insert the service name if it's not already in the set. - if !services.contains(service_name) { - services.insert(service_name.clone()); - } - - // Only insert the operation if it's not already in the set. 
- let operation = ( - service_name.clone(), - span.span_name.clone(), - span.span_kind.clone(), - ); - if !operations.contains(&operation) { - operations.insert(operation); - } - } - write_span_to_row(&mut trace_writer, span)?; - } - write_trace_services_to_row(&mut trace_services_writer, services)?; - write_trace_operations_to_row(&mut trace_operations_writer, operations)?; + write_trace_services_to_row(&mut trace_services_writer, aux_data.services)?; + write_trace_operations_to_row(&mut trace_operations_writer, aux_data.operations)?; + multi_table_writer.add_table_data(trace_services_table_name(table_name), trace_services_writer); multi_table_writer.add_table_data( - trace_services_table_name(&table_name), - trace_services_writer, - ); - multi_table_writer.add_table_data( - trace_operations_table_name(&table_name), + trace_operations_table_name(table_name), trace_operations_writer, ); - multi_table_writer.add_table_data(table_name, trace_writer); Ok(multi_table_writer.into_row_insert_requests()) } @@ -319,7 +306,9 @@ mod tests { use opentelemetry_proto::tonic::common::v1::{AnyValue, KeyValue}; use super::*; + use crate::otlp::trace::TraceAuxData; use crate::otlp::trace::attributes::Attributes; + use crate::otlp::trace::span::{SpanEvents, SpanLinks}; use crate::row_writer::TableData; fn make_kv(key: &str, value: OtlpValue) -> KeyValue { @@ -329,6 +318,29 @@ mod tests { } } + fn make_span(service_name: &str, trace_id: &str, span_id: &str) -> TraceSpan { + TraceSpan { + service_name: Some(service_name.to_string()), + trace_id: trace_id.to_string(), + span_id: span_id.to_string(), + parent_span_id: None, + resource_attributes: Attributes::from(vec![]), + scope_name: "scope".to_string(), + scope_version: "v1".to_string(), + scope_attributes: Attributes::from(vec![]), + trace_state: String::new(), + span_name: "op".to_string(), + span_kind: "SPAN_KIND_SERVER".to_string(), + span_status_code: "STATUS_CODE_UNSET".to_string(), + span_status_message: String::new(), + 
span_attributes: Attributes::from(vec![]), + span_events: SpanEvents::from(vec![]), + span_links: SpanLinks::from(vec![]), + start_in_nanosecond: 1, + end_in_nanosecond: 2, + } + } + #[test] fn test_keep_mixed_numeric_values_until_frontend_reconciliation() { let mut writer = TableData::new(4, 2); @@ -520,5 +532,22 @@ mod tests { Some(ValueData::StringValue("false".to_string())) ); } + + #[test] + fn test_build_aux_table_requests_deduplicates_services_and_operations() { + let spans = vec![ + make_span("svc-a", "trace-a", "span-a"), + make_span("svc-a", "trace-b", "span-b"), + ]; + let mut aux_data = TraceAuxData::default(); + for span in &spans { + aux_data.observe_span(span); + } + + let (requests, total_rows) = + build_aux_table_requests(aux_data, "opentelemetry_traces").unwrap(); + assert_eq!(requests.inserts.len(), 2); + assert_eq!(total_rows, 2); + } // Conversion matrix coverage lives in the shared coercion helper tests. } diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index b55502e742..d4b272de12 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -63,6 +63,14 @@ pub type PipelineHandlerRef = Arc; pub type LogQueryHandlerRef = Arc; pub type JaegerQueryHandlerRef = Arc; +#[derive(Debug, Default, Clone)] +pub struct TraceIngestOutcome { + pub write_cost: usize, + pub accepted_spans: usize, + pub rejected_spans: usize, + pub error_message: Option, +} + #[async_trait] pub trait InfluxdbLineProtocolHandler { /// A successful request will not return a response. 
@@ -123,7 +131,7 @@ pub trait OpenTelemetryProtocolHandler: PipelineHandler { pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, - ) -> Result; + ) -> Result; async fn logs( &self, diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index c0d858a592..36ddb1bb38 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -40,7 +40,9 @@ use loki_proto::logproto::{EntryAdapter, LabelPairAdapter, PushRequest, StreamAd use loki_proto::prost_types::Timestamp; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; -use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; +use opentelemetry_proto::tonic::collector::trace::v1::{ + ExportTraceServiceRequest, ExportTraceServiceResponse, +}; use pipeline::GREPTIME_INTERNAL_TRACE_PIPELINE_V1_NAME; use prost::Message; use serde_json::{Value, json}; @@ -5572,26 +5574,83 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ], ); let res = send_trace_v1_req(&client, abort_table_name, abort_req, false).await; - assert_eq!(StatusCode::BAD_REQUEST, res.status()); - let body: Value = res.json().await; + assert_eq!(StatusCode::OK, res.status()); + let body = ExportTraceServiceResponse::decode(res.bytes().await).unwrap(); + let partial_success = body.partial_success.as_ref().unwrap(); + assert_eq!(partial_success.rejected_spans, 1); assert!( - body["error"].as_str().unwrap().contains( - "failed to coerce trace column 'span_attributes.attr_int' in table 'trace_type_abort'" + partial_success + .error_message + .contains("Accepted 1 spans, rejected 1 spans"), + "unexpected partial success body: {body:?}" + ); + assert!( + partial_success.error_message.contains( + "Rejected span 00000000000000000000000000000013:0000000000000013 (InvalidArguments)" ), - "unexpected error body: {body}" + "unexpected partial success 
body: {body:?}" ); validate_data( "otlp_traces_v1_type_abort_rows", &client, &format!( - "select trace_id, \"span_attributes.attr_int\" from {} order by trace_id;", + "select trace_id, \"span_attributes.attr_int\" from {} order by trace_id", abort_table_name ), - r#"[["00000000000000000000000000000011",10]]"#, + r#"[["00000000000000000000000000000011",10],["00000000000000000000000000000012",20]]"#, ) .await; + let chunk_failure_req = make_trace_v1_request( + "type-discard", + vec![ + make_trace_v1_span( + "00000000000000000000000000000021", + "0000000000000021", + "discard-one", + 1_736_480_942_445_400_000, + 1_736_480_942_445_500_000, + vec![make_string_attr("attr_text", "alpha")], + ), + make_trace_v1_span( + "00000000000000000000000000000022", + "0000000000000022", + "discard-two", + 1_736_480_942_445_600_000, + 1_736_480_942_445_700_000, + vec![make_string_attr("attr_text", "beta")], + ), + ], + ); + let res = send_trace_v1_req_with_db( + &client, + "nonexistent", + "trace_chunk_discard", + chunk_failure_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + let body = ExportTraceServiceResponse::decode(res.bytes().await).unwrap(); + let partial_success = body.partial_success.as_ref().unwrap(); + assert_eq!(partial_success.rejected_spans, 2); + assert!( + partial_success + .error_message + .contains("Accepted 0 spans, rejected 2 spans"), + "unexpected partial success body: {body:?}" + ); + assert!( + partial_success + .error_message + .contains("Chunk fallback triggered by") + || partial_success + .error_message + .contains("Discarded 2 spans after ambiguous chunk failure"), + "unexpected partial success body: {body:?}" + ); + guard.remove_all().await; } @@ -7829,6 +7888,16 @@ async fn send_trace_v1_req( table_name: &str, req: ExportTraceServiceRequest, with_gzip: bool, +) -> TestResponse { + send_trace_v1_req_with_db(client, "public", table_name, req, with_gzip).await +} + +async fn send_trace_v1_req_with_db( + client: &TestClient, + 
db_name: &str, + table_name: &str, + req: ExportTraceServiceRequest, + with_gzip: bool, ) -> TestResponse { send_req( client, @@ -7845,6 +7914,10 @@ async fn send_trace_v1_req( HeaderName::from_static("x-greptime-trace-table-name"), HeaderValue::from_str(table_name).unwrap(), ), + ( + GREPTIME_DB_HEADER_NAME.clone(), + HeaderValue::from_str(db_name).unwrap(), + ), ], "/v1/otlp/v1/traces", req.encode_to_vec(), From d9736407f2fa383df96466f23be9644d75fed94e Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:20:45 +0800 Subject: [PATCH 067/195] fix: return empty when promql gets non-exist label name (#7899) * fix: return empty when promql gets non-exist label name Signed-off-by: shuiyisong * fix: fmt Signed-off-by: shuiyisong * chore: minor refactor Signed-off-by: shuiyisong * fix: typo Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/catalog/src/process_manager.rs | 2 +- src/frontend/src/instance/promql.rs | 18 +++++++++++++++--- tests-integration/tests/http.rs | 14 ++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/catalog/src/process_manager.rs b/src/catalog/src/process_manager.rs index 1eebfec627..8796d948e1 100644 --- a/src/catalog/src/process_manager.rs +++ b/src/catalog/src/process_manager.rs @@ -395,7 +395,7 @@ impl SlowQueryTimer { impl Drop for SlowQueryTimer { fn drop(&mut self) { - // Calculate the elaspsed duration since the timer is created. + // Calculate the elapsed duration since the timer is created. let elapsed = self.start.elapsed(); if elapsed > self.threshold { // Only capture a portion of slow queries based on sample_ratio. 
diff --git a/src/frontend/src/instance/promql.rs b/src/frontend/src/instance/promql.rs index 419be8d96e..3a3aba2307 100644 --- a/src/frontend/src/instance/promql.rs +++ b/src/frontend/src/instance/promql.rs @@ -31,7 +31,7 @@ use snafu::{OptionExt, ResultExt}; use crate::error::{ CatalogSnafu, CollectRecordbatchSnafu, ExecLogicalPlanSnafu, PrometheusLabelValuesQueryPlanSnafu, PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu, - Result, TableNotFoundSnafu, + Result, TableNotFoundSnafu, TableSnafu, }; use crate::instance::Instance; @@ -120,20 +120,32 @@ impl Instance { }) .unwrap_or_else(|| ctx.current_schema()); + let full_table_name = format_full_table_name(ctx.current_catalog(), &table_schema, &metric); let table = self .catalog_manager .table(ctx.current_catalog(), &table_schema, &metric, Some(ctx)) .await .context(CatalogSnafu)? .with_context(|| TableNotFoundSnafu { - table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric), + table_name: full_table_name.clone(), })?; + // Check label column existence before building the query plan so a missing label can be + // reported as `TableColumnNotFound` and handled like Prometheus expects. 
+ if table.schema().column_schema_by_name(&label_name).is_none() { + return table::error::ColumnNotExistsSnafu { + column_name: label_name, + table_name: full_table_name, + } + .fail() + .context(TableSnafu); + } + let dataframe = self .query_engine .read_table(table.clone()) .with_context(|_| ReadTableSnafu { - table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric), + table_name: full_table_name, })?; let scan_plan = dataframe.into_unoptimized_plan(); diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 36ddb1bb38..caf5b2d11c 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1176,6 +1176,20 @@ pub async fn test_prom_http_api(store_type: StorageType) { .await; assert_eq!(res.status(), StatusCode::OK); + // query non-exist label in metric table + let res = client + .get("/v1/prometheus/api/v1/label/not_exist_label/values?match[]=demo&start=0&end=600") + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let prom_resp = res.json::().await; + assert_eq!(prom_resp.status, "success"); + assert!(prom_resp.error.is_none() && prom_resp.error_type.is_none()); + assert_eq!( + prom_resp.data, + serde_json::from_value::(json!([])).unwrap() + ); + // query `__name__` without match[] // create a physical table and a logical table let res = client From ba32c5fe9eb29374945846612cd9a58f5adfeb16 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:49:27 +0800 Subject: [PATCH 068/195] chore: remove unused deps using udeps (#7906) * chore: remove unused deps using udeps Signed-off-by: shuiyisong * chore: fmt toml Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- Cargo.lock | 7 ------- src/cmd/Cargo.toml | 2 +- src/common/base/Cargo.toml | 5 ----- src/common/config/Cargo.toml | 1 - src/common/datasource/Cargo.toml | 2 +- src/common/meta/Cargo.toml | 2 +- src/mito-codec/Cargo.toml | 2 +- 
src/object-store/Cargo.toml | 1 - src/promql/Cargo.toml | 2 +- tests-integration/Cargo.toml | 16 ++++++++-------- 10 files changed, 13 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 695f19b072..68c01a3c63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2162,17 +2162,12 @@ dependencies = [ "async-trait", "bitvec", "bytes", - "common-error", - "common-macro", "common-test-util", "futures", "lazy_static", - "paste", "pin-project", - "rand 0.9.1", "regex", "serde", - "snafu 0.8.6", "tokio", "toml 0.8.23", "zeroize", @@ -2203,7 +2198,6 @@ dependencies = [ "object-store", "serde", "serde_json", - "serde_with", "snafu 0.8.6", "temp-env", "tempfile", @@ -8724,7 +8718,6 @@ dependencies = [ "futures", "humantime-serde", "lazy_static", - "moka", "opendal", "prometheus 0.14.0", "reqwest", diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index d547ec6e81..34619f4f1b 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -86,7 +86,6 @@ serde.workspace = true serde_json.workspace = true servers.workspace = true session.workspace = true -similar-asserts.workspace = true snafu.workspace = true sqlparser.workspace = true standalone.workspace = true @@ -113,5 +112,6 @@ common-version.workspace = true file-engine.workspace = true mito2.workspace = true serde.workspace = true +similar-asserts.workspace = true temp-env = "0.3" tempfile.workspace = true diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index 3ec9e1fa35..44c30cd548 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -16,16 +16,11 @@ anymap2 = "0.13" async-trait.workspace = true bitvec = "1.0" bytes.workspace = true -common-error.workspace = true -common-macro.workspace = true futures.workspace = true lazy_static.workspace = true -paste.workspace = true pin-project.workspace = true -rand.workspace = true regex.workspace = true serde = { version = "1.0", features = ["derive"] } -snafu.workspace = true tokio.workspace = true zeroize = { version = 
"1.6", default-features = false, features = ["alloc"] } diff --git a/src/common/config/Cargo.toml b/src/common/config/Cargo.toml index 2737f82a58..27b238add7 100644 --- a/src/common/config/Cargo.toml +++ b/src/common/config/Cargo.toml @@ -18,7 +18,6 @@ notify.workspace = true object-store.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true snafu.workspace = true toml.workspace = true diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index 6ec9a14733..ae81c6ba98 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -28,7 +28,6 @@ common-runtime.workspace = true common-telemetry.workspace = true datafusion.workspace = true datafusion-datasource.workspace = true -datafusion-orc.workspace = true datatypes.workspace = true futures.workspace = true lazy_static.workspace = true @@ -47,3 +46,4 @@ url.workspace = true [dev-dependencies] common-test-util.workspace = true +datafusion-orc.workspace = true diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index f5ca9d2c09..8132a1fc4e 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -37,7 +37,6 @@ common-error.workspace = true common-grpc-expr.workspace = true common-macro.workspace = true common-procedure.workspace = true -common-procedure-test.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true @@ -92,6 +91,7 @@ typetag.workspace = true [dev-dependencies] chrono.workspace = true common-procedure = { workspace = true, features = ["testing"] } +common-procedure-test.workspace = true common-test-util.workspace = true common-wal = { workspace = true, features = ["testing"] } datatypes.workspace = true diff --git a/src/mito-codec/Cargo.toml b/src/mito-codec/Cargo.toml index 07d64482e0..c5a6625cc2 100644 --- a/src/mito-codec/Cargo.toml +++ b/src/mito-codec/Cargo.toml @@ -15,7 +15,6 @@ common-base.workspace = true 
common-decimal.workspace = true common-error.workspace = true common-macro.workspace = true -common-query.workspace = true common-recordbatch.workspace = true common-telemetry.workspace = true common-time.workspace = true @@ -27,6 +26,7 @@ snafu.workspace = true store-api.workspace = true [dev-dependencies] +common-query.workspace = true criterion = "0.7" datafusion-common.workspace = true datafusion-expr.workspace = true diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 2ef251d04d..7247c5892c 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -21,7 +21,6 @@ derive_builder = { workspace = true, optional = true } futures.workspace = true humantime-serde.workspace = true lazy_static.workspace = true -moka = { workspace = true, features = ["future"] } opendal = { version = "0.54", features = [ "layers-tracing", "layers-prometheus", diff --git a/src/promql/Cargo.toml b/src/promql/Cargo.toml index 306563d1ce..460be8ddd9 100644 --- a/src/promql/Cargo.toml +++ b/src/promql/Cargo.toml @@ -13,7 +13,6 @@ async-trait.workspace = true bytemuck.workspace = true common-error.workspace = true common-macro.workspace = true -common-recordbatch.workspace = true common-telemetry.workspace = true datafusion.workspace = true datafusion-common.workspace = true @@ -27,6 +26,7 @@ prost.workspace = true snafu.workspace = true [dev-dependencies] +common-recordbatch.workspace = true criterion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index bee03ae7fe..10b7097f4f 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -63,22 +63,14 @@ meta-client.workspace = true meta-srv = { workspace = true, features = ["mock"] } mito2.workspace = true moka.workspace = true -mysql_async = { version = "0.35", default-features = false, features = [ - "time", - "default-rustls-ring", -] } object-store.workspace = true 
operator = { workspace = true, features = ["testing"] } prost.workspace = true query.workspace = true rand.workspace = true -rstest.workspace = true -rstest_reuse.workspace = true -sea-query.workspace = true serde_json.workspace = true servers = { workspace = true, features = ["testing"] } session.workspace = true -similar-asserts.workspace = true snafu.workspace = true sql.workspace = true sqlx = { workspace = true, features = [ @@ -108,6 +100,10 @@ hex.workspace = true http.workspace = true itertools.workspace = true jsonb.workspace = true +mysql_async = { version = "0.35", default-features = false, features = [ + "time", + "default-rustls-ring", +] } opentelemetry-proto.workspace = true otel-arrow-rust.workspace = true partition.workspace = true @@ -115,7 +111,11 @@ paste.workspace = true pipeline.workspace = true prost.workspace = true rand.workspace = true +rstest.workspace = true +rstest_reuse.workspace = true +sea-query.workspace = true session = { workspace = true, features = ["testing"] } +similar-asserts.workspace = true store-api.workspace = true tokio-postgres = { workspace = true } url = "2.3" From 2af59ed3866f49481b509ced097b66da851285ae Mon Sep 17 00:00:00 2001 From: Yingwen Date: Thu, 2 Apr 2026 15:53:33 +0800 Subject: [PATCH 069/195] feat: always use flat scan path for both format (#7901) * feat: remove primary_key format scan path Signed-off-by: evenyag * feat: remove flat format flag Signed-off-by: evenyag * test: remove CompatReader tests Signed-off-by: evenyag * chore: show whether the format is flat in explain Signed-off-by: evenyag * test: stable series scan result Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/cmd/src/datanode/objbench.rs | 1 - src/cmd/src/datanode/scanbench.rs | 8 +- src/mito2/src/compaction.rs | 5 +- src/mito2/src/engine/scan_test.rs | 255 +++++------------ src/mito2/src/read.rs | 5 + src/mito2/src/read/compat.rs | 346 +----------------------- src/mito2/src/read/last_row.rs | 2 + 
src/mito2/src/read/projection.rs | 266 ++---------------- src/mito2/src/read/prune.rs | 4 + src/mito2/src/read/range_cache.rs | 5 +- src/mito2/src/read/scan_region.rs | 129 ++------- src/mito2/src/read/scan_util.rs | 199 +------------- src/mito2/src/read/seq_scan.rs | 299 +------------------- src/mito2/src/read/series_scan.rs | 156 +---------- src/mito2/src/read/stream.rs | 37 +-- src/mito2/src/read/unordered_scan.rs | 162 +---------- src/mito2/src/sst/parquet.rs | 11 - src/mito2/src/sst/parquet/file_range.rs | 1 + src/mito2/src/sst/parquet/reader.rs | 24 +- src/store-api/src/storage/requests.rs | 19 -- 20 files changed, 159 insertions(+), 1775 deletions(-) diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs index f6d8674d4c..1f3591635f 100644 --- a/src/cmd/src/datanode/objbench.rs +++ b/src/cmd/src/datanode/objbench.rs @@ -211,7 +211,6 @@ impl ObjbenchCommand { object_store.clone(), ) .expected_metadata(Some(region_meta.clone())) - .flat_format(true) .build() .await .map_err(|e| { diff --git a/src/cmd/src/datanode/scanbench.rs b/src/cmd/src/datanode/scanbench.rs index 51064126fe..a93aca430a 100644 --- a/src/cmd/src/datanode/scanbench.rs +++ b/src/cmd/src/datanode/scanbench.rs @@ -102,10 +102,6 @@ pub struct ScanbenchCommand { #[clap(long, value_name = "FILE")] pprof_file: Option, - /// Force reading the region in flat format. - #[clap(long, default_value_t = false)] - force_flat_format: bool, - /// Enable WAL replay when opening the region. 
#[clap(long, default_value_t = false)] enable_wal: bool, @@ -580,12 +576,11 @@ impl ScanbenchCommand { }; println!( - "{} Scanner: {}, Parallelism: {}, Iterations: {}, Force flat format: {}", + "{} Scanner: {}, Parallelism: {}, Iterations: {}", "ℹ".blue(), self.scanner, self.parallelism, self.iterations, - self.force_flat_format, ); // Start profiling if pprof_file is specified (unless pprof_after_warmup is set) @@ -626,7 +621,6 @@ impl ScanbenchCommand { filters: filters.clone(), series_row_selector, distribution, - force_flat_format: self.force_flat_format, ..Default::default() }; diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index a43fa8a0a6..944c51ebd6 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -848,7 +848,7 @@ impl CompactionSstReaderBuilder<'_> { } fn build_scan_input(self) -> Result { - let mapper = ProjectionMapper::all(&self.metadata, true)?; + let mapper = ProjectionMapper::all(&self.metadata)?; let mut scan_input = ScanInput::new(self.sst_layer, mapper) .with_files(self.inputs.to_vec()) .with_append_mode(self.append_mode) @@ -857,8 +857,7 @@ impl CompactionSstReaderBuilder<'_> { .with_filter_deleted(self.filter_deleted) // We ignore file not found error during compaction. .with_ignore_file_not_found(true) - .with_merge_mode(self.merge_mode) - .with_flat_format(true); + .with_merge_mode(self.merge_mode); // This serves as a workaround of https://github.com/GreptimeTeam/greptimedb/issues/3944 // by converting time ranges into predicate. diff --git a/src/mito2/src/engine/scan_test.rs b/src/mito2/src/engine/scan_test.rs index 46f4cc6cf2..6357f01775 100644 --- a/src/mito2/src/engine/scan_test.rs +++ b/src/mito2/src/engine/scan_test.rs @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::BTreeMap; + use api::v1::Rows; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_recordbatch::RecordBatches; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datatypes::arrow::array::AsArray; +use datatypes::arrow::datatypes::{Float64Type, TimestampMillisecondType}; use futures::TryStreamExt; use store_api::region_engine::{PrepareRequest, RegionEngine, RegionScanner}; use store_api::region_request::RegionRequest; @@ -222,11 +226,16 @@ async fn test_max_concurrent_scan_files_with_format(flat_format: bool) { } #[tokio::test] -async fn test_series_scan_primarykey() { +async fn test_series_scan() { + test_series_scan_with_format(false).await; + test_series_scan_with_format(true).await; +} + +async fn test_series_scan_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_series_scan").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: false, + default_experimental_flat_format: flat_format, ..Default::default() }) .await; @@ -295,10 +304,27 @@ async fn test_series_scan_primarykey() { }) .unwrap(); + let actual_rows = collect_partition_rows_round_robin(&scanner, 3).await; + + let mut expected_rows = Vec::new(); + for value in [0_i64, 1, 2, 3, 4, 5, 3600, 3601, 3602, 7200, 7201, 7202] { + expected_rows.push((value.to_string(), value as f64, value * 1000)); + } + expected_rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.cmp(&b.2))); + + assert_eq!(expected_rows, actual_rows); +} + +/// Scans all partitions in round-robin fashion and returns rows sorted by (tag, ts). +/// Also asserts that each series appears in only one partition. 
+async fn collect_partition_rows_round_robin( + scanner: &dyn RegionScanner, + num_partitions: usize, +) -> Vec<(String, f64, i64)> { let metrics_set = ExecutionPlanMetricsSet::default(); - let mut partition_batches = vec![vec![]; 3]; - let mut streams: Vec<_> = (0..3) + let mut partition_batches = vec![vec![]; num_partitions]; + let mut streams: Vec<_> = (0..num_partitions) .map(|partition| { let stream = scanner .scan_partition(&Default::default(), &metrics_set, partition) @@ -309,11 +335,11 @@ async fn test_series_scan_primarykey() { let mut num_done = 0; let mut schema = None; // Pull streams in round-robin fashion to get the consistent output from the sender. - while num_done < 3 { + while num_done < num_partitions { if schema.is_none() { schema = Some(streams[0].as_ref().unwrap().schema().clone()); } - for i in 0..3 { + for i in 0..num_partitions { let Some(mut stream) = streams[i].take() else { continue; }; @@ -326,189 +352,54 @@ async fn test_series_scan_primarykey() { } } - let mut check_result = |expected| { - let batches = - RecordBatches::try_new(schema.clone().unwrap(), partition_batches.remove(0)).unwrap(); - assert_eq!(expected, batches.pretty_print().unwrap()); - }; - - // Output series order is 0, 1, 2, 3, 3600, 3601, 3602, 4, 5, 7200, 7201, 7202 - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 0 | 0.0 | 1970-01-01T00:00:00 | -| 3 | 3.0 | 1970-01-01T00:00:03 | -| 3602 | 3602.0 | 1970-01-01T01:00:02 | -| 7200 | 7200.0 | 1970-01-01T02:00:00 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 1 | 1.0 | 1970-01-01T00:00:01 | -| 3600 | 3600.0 | 1970-01-01T01:00:00 | -| 4 | 4.0 | 1970-01-01T00:00:04 | -| 7201 | 7201.0 | 1970-01-01T02:00:01 | -+-------+---------+---------------------+"; - check_result(expected); - - 
let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 2 | 2.0 | 1970-01-01T00:00:02 | -| 3601 | 3601.0 | 1970-01-01T01:00:01 | -| 5 | 5.0 | 1970-01-01T00:00:05 | -| 7202 | 7202.0 | 1970-01-01T02:00:02 | -+-------+---------+---------------------+"; - check_result(expected); + let schema = schema.unwrap(); + collect_and_assert_partition_rows(schema, partition_batches) } -#[tokio::test] -async fn test_series_scan_flat() { - let mut env = TestEnv::with_prefix("test_series_scan").await; - let engine = env - .create_engine(MitoConfig { - default_experimental_flat_format: true, - ..Default::default() - }) - .await; +/// Collects rows sorted by (tag, ts) from partition batches. +/// Also asserts that each series appears in only one partition. +fn collect_and_assert_partition_rows( + schema: datatypes::schema::SchemaRef, + partition_batches: Vec>, +) -> Vec<(String, f64, i64)> { + let mut series_to_partition = BTreeMap::new(); + let mut actual_rows = Vec::new(); - let region_id = RegionId::new(1, 1); - let request = CreateRequestBuilder::new() - .insert_option("compaction.type", "twcs") - .insert_option("compaction.twcs.time_window", "1h") - .build(); - let column_schemas = test_util::rows_schema(&request); + for (partition, batches) in partition_batches.into_iter().enumerate() { + let batches = RecordBatches::try_new(schema.clone(), batches).unwrap(); + let mut partition_series = Vec::new(); - engine - .handle_request(region_id, RegionRequest::Create(request)) - .await - .unwrap(); + for batch in batches.iter() { + let tags = batch.column_by_name("tag_0").unwrap().as_string::(); + let fields = batch + .column_by_name("field_0") + .unwrap() + .as_primitive::(); + let ts = batch + .column_by_name("ts") + .unwrap() + .as_primitive::(); - let put_flush_rows = async |start, end| { - let rows = Rows { - schema: column_schemas.clone(), - rows: test_util::build_rows(start, end), - }; - 
test_util::put_rows(&engine, region_id, rows).await; - test_util::flush_region(&engine, region_id, None).await; - }; - // generates 3 SST files - put_flush_rows(0, 3).await; - put_flush_rows(2, 6).await; - put_flush_rows(3600, 3603).await; - // Put to memtable. - let rows = Rows { - schema: column_schemas.clone(), - rows: test_util::build_rows(7200, 7203), - }; - test_util::put_rows(&engine, region_id, rows).await; - - let request = ScanRequest { - distribution: Some(TimeSeriesDistribution::PerSeries), - ..Default::default() - }; - let scanner = engine.scanner(region_id, request).await.unwrap(); - let Scanner::Series(mut scanner) = scanner else { - panic!("Scanner should be series scan"); - }; - // 3 partition ranges for 3 time window. - assert_eq!( - 3, - scanner.properties().partitions[0].len(), - "unexpected ranges: {:?}", - scanner.properties().partitions - ); - let raw_ranges: Vec<_> = scanner - .properties() - .partitions - .iter() - .flatten() - .cloned() - .collect(); - let mut new_ranges = Vec::with_capacity(3); - for range in raw_ranges { - new_ranges.push(vec![range]); - } - scanner - .prepare(PrepareRequest { - ranges: Some(new_ranges), - ..Default::default() - }) - .unwrap(); - - let metrics_set = ExecutionPlanMetricsSet::default(); - - let mut partition_batches = vec![vec![]; 3]; - let mut streams: Vec<_> = (0..3) - .map(|partition| { - let stream = scanner - .scan_partition(&Default::default(), &metrics_set, partition) - .unwrap(); - Some(stream) - }) - .collect(); - let mut num_done = 0; - let mut schema = None; - // Pull streams in round-robin fashion to get the consistent output from the sender. 
- while num_done < 3 { - if schema.is_none() { - schema = Some(streams[0].as_ref().unwrap().schema().clone()); + for row in 0..batch.num_rows() { + let tag = tags.value(row).to_string(); + let field = fields.value(row); + let ts = ts.value(row); + partition_series.push(tag.clone()); + actual_rows.push((tag, field, ts)); + } } - for i in 0..3 { - let Some(mut stream) = streams[i].take() else { - continue; - }; - let Some(rb) = stream.try_next().await.unwrap() else { - num_done += 1; - continue; - }; - partition_batches[i].push(rb); - streams[i] = Some(stream); + + partition_series.sort(); + partition_series.dedup(); + for tag in partition_series { + let prev = series_to_partition.insert(tag.clone(), partition); + assert_eq!( + None, prev, + "series {tag} appears in multiple partitions: {prev:?} and {partition}" + ); } } - let mut check_result = |expected| { - let batches = - RecordBatches::try_new(schema.clone().unwrap(), partition_batches.remove(0)).unwrap(); - assert_eq!(expected, batches.pretty_print().unwrap()); - }; - - // Output series order is 0, 1, 2, 3, 3600, 3601, 3602, 4, 5, 7200, 7201, 7202 - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 0 | 0.0 | 1970-01-01T00:00:00 | -| 1 | 1.0 | 1970-01-01T00:00:01 | -| 2 | 2.0 | 1970-01-01T00:00:02 | -| 3 | 3.0 | 1970-01-01T00:00:03 | -| 7200 | 7200.0 | 1970-01-01T02:00:00 | -| 7201 | 7201.0 | 1970-01-01T02:00:01 | -| 7202 | 7202.0 | 1970-01-01T02:00:02 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 3600 | 3600.0 | 1970-01-01T01:00:00 | -| 3601 | 3601.0 | 1970-01-01T01:00:01 | -| 3602 | 3602.0 | 1970-01-01T01:00:02 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | 
ts | -+-------+---------+---------------------+ -| 4 | 4.0 | 1970-01-01T00:00:04 | -| 5 | 5.0 | 1970-01-01T00:00:05 | -+-------+---------+---------------------+"; - check_result(expected); + actual_rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.cmp(&b.2))); + actual_rows } diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs index 84931b9f37..db7dfd1958 100644 --- a/src/mito2/src/read.rs +++ b/src/mito2/src/read.rs @@ -175,6 +175,7 @@ impl Batch { } /// Create an empty [`Batch`]. + #[allow(dead_code)] pub(crate) fn empty() -> Self { Self { primary_key: vec![], @@ -677,6 +678,7 @@ impl Batch { /// Checks the batch is monotonic by timestamps. #[cfg(debug_assertions)] + #[allow(dead_code)] pub(crate) fn check_monotonic(&self) -> Result<(), String> { use std::cmp::Ordering; if self.timestamps_native().is_none() { @@ -719,6 +721,7 @@ impl Batch { /// Returns Ok if the given batch is behind the current batch. #[cfg(debug_assertions)] + #[allow(dead_code)] pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> { // Checks the primary key if self.primary_key() < other.primary_key() { @@ -798,6 +801,7 @@ impl Batch { /// A struct to check the batch is monotonic. #[cfg(debug_assertions)] #[derive(Default)] +#[allow(dead_code)] pub(crate) struct BatchChecker { last_batch: Option, start: Option, @@ -805,6 +809,7 @@ pub(crate) struct BatchChecker { } #[cfg(debug_assertions)] +#[allow(dead_code)] impl BatchChecker { /// Attaches the given start timestamp to the checker. pub(crate) fn with_start(mut self, start: Option) -> Self { diff --git a/src/mito2/src/read/compat.rs b/src/mito2/src/read/compat.rs index fd88749827..90d664a4bd 100644 --- a/src/mito2/src/read/compat.rs +++ b/src/mito2/src/read/compat.rs @@ -98,6 +98,7 @@ pub(crate) enum CompatBatch { impl CompatBatch { /// Returns the inner primary key batch adapter if this is a PrimaryKey format. 
+ #[allow(dead_code)] pub(crate) fn as_primary_key(&self) -> Option<&PrimaryKeyCompatBatch> { match self { CompatBatch::PrimaryKey(batch) => Some(batch), @@ -980,7 +981,6 @@ mod tests { use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; use datatypes::value::ValueRef; - use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, UInt8Vector, UInt64Vector}; use mito_codec::row_converter::{ DensePrimaryKeyCodec, PrimaryKeyCodecExt, SparsePrimaryKeyCodec, }; @@ -992,7 +992,6 @@ mod tests { use crate::read::flat_projection::FlatProjectionMapper; use crate::sst::parquet::flat_format::FlatReadFormat; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; - use crate::test_util::{VecBatchReader, check_reader_result}; /// Creates a new [RegionMetadata]. fn new_metadata( @@ -1053,44 +1052,6 @@ mod tests { buffer } - /// Creates a batch for specific primary `key`. - /// - /// `fields`: [(column_id of the field, is null)] - fn new_batch( - primary_key: &[u8], - fields: &[(ColumnId, bool)], - start_ts: i64, - num_rows: usize, - ) -> Batch { - let timestamps = Arc::new(TimestampMillisecondVector::from_values( - start_ts..start_ts + num_rows as i64, - )); - let sequences = Arc::new(UInt64Vector::from_values(0..num_rows as u64)); - let op_types = Arc::new(UInt8Vector::from_vec(vec![OpType::Put as u8; num_rows])); - let field_columns = fields - .iter() - .map(|(id, is_null)| { - let data = if *is_null { - Arc::new(Int64Vector::from(vec![None; num_rows])) - } else { - Arc::new(Int64Vector::from_vec(vec![*id as i64; num_rows])) - }; - BatchColumn { - column_id: *id, - data, - } - }) - .collect(); - Batch::new( - primary_key.to_vec(), - timestamps, - sequences, - op_types, - field_columns, - ) - .unwrap() - } - #[test] fn test_invalid_pk_len() { let reader_meta = new_metadata( @@ -1213,311 +1174,6 @@ mod tests { assert!(may_compat_fields(&mapper, &reader_meta).unwrap().is_none()) } - #[tokio::test] - async fn test_compat_reader() { - let 
reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (3, SemanticType::Tag, ConcreteDataType::string_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1, 3], - )); - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - let k1 = encode_key(&[Some("a"), None]); - let k2 = encode_key(&[Some("b"), None]); - check_reader_result( - &mut compat_reader, - &[ - new_batch(&k1, &[(2, false), (4, true)], 1000, 3), - new_batch(&k2, &[(2, false), (4, true)], 1000, 3), - ], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_different_order() { - let reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (3, SemanticType::Field, ConcreteDataType::int64_datatype()), - (2, SemanticType::Field, 
ConcreteDataType::int64_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[ - new_batch(&k1, &[(3, true), (2, false), (4, true)], 1000, 3), - new_batch(&k2, &[(3, true), (2, false), (4, true)], 1000, 3), - ], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_different_types() { - let actual_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::string_datatype()), - ], - &[1], - )); - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let fn_batch_cast = |batch: Batch| { - let mut new_fields = batch.fields.clone(); - new_fields[0].data = new_fields[0] - .data - .cast(&ConcreteDataType::string_datatype()) - .unwrap(); - - batch.with_fields(new_fields).unwrap() - }; - let mut compat_reader = CompatReader::new(&mapper, actual_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[ - 
fn_batch_cast(new_batch(&k1, &[(2, false)], 1000, 3)), - fn_batch_cast(new_batch(&k2, &[(2, false)], 1000, 3)), - ], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_projection() { - let reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (3, SemanticType::Field, ConcreteDataType::int64_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - // tag_1, field_2, field_3 - let mapper = ProjectionMapper::new(&expect_meta, [1, 3, 2].into_iter(), false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let source_reader = VecBatchReader::new(&[new_batch(&k1, &[(2, false)], 1000, 3)]); - - let mut compat_reader = - CompatReader::new(&mapper, reader_meta.clone(), source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[new_batch(&k1, &[(3, true), (2, false)], 1000, 3)], - ) - .await; - - // tag_1, field_4, field_3 - let mapper = ProjectionMapper::new(&expect_meta, [1, 4, 2].into_iter(), false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let source_reader = VecBatchReader::new(&[new_batch(&k1, &[], 1000, 3)]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[new_batch(&k1, &[(3, true), (4, true)], 1000, 3)], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_projection_read_superset() { - let reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - 
ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (3, SemanticType::Field, ConcreteDataType::int64_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - // Output: tag_1, field_3, field_2. Read also includes field_4. - let mapper = ProjectionMapper::new_with_read_columns( - &expect_meta, - [1, 3, 2].into_iter(), - false, - vec![1, 3, 2, 4], - ) - .unwrap(); - let k1 = encode_key(&[Some("a")]); - let source_reader = VecBatchReader::new(&[new_batch(&k1, &[(2, false)], 1000, 3)]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[new_batch(&k1, &[(3, true), (2, false), (4, true)], 1000, 3)], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_different_pk_encoding() { - let mut reader_meta = new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - ); - reader_meta.primary_key_encoding = PrimaryKeyEncoding::Dense; - let reader_meta = Arc::new(reader_meta); - let mut expect_meta = new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (3, SemanticType::Tag, ConcreteDataType::string_datatype()), - (4, SemanticType::Field, 
ConcreteDataType::int64_datatype()), - ], - &[1, 3], - ); - expect_meta.primary_key_encoding = PrimaryKeyEncoding::Sparse; - let expect_meta = Arc::new(expect_meta); - - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - let k1 = encode_sparse_key(&[(1, Some("a")), (3, None)]); - let k2 = encode_sparse_key(&[(1, Some("b")), (3, None)]); - check_reader_result( - &mut compat_reader, - &[ - new_batch(&k1, &[(2, false), (4, true)], 1000, 3), - new_batch(&k2, &[(2, false), (4, true)], 1000, 3), - ], - ) - .await; - } - /// Creates a primary key array for flat format testing. fn build_flat_test_pk_array(primary_keys: &[&[u8]]) -> ArrayRef { let mut builder = BinaryDictionaryBuilder::::new(); diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index 1dc4102311..e087e12094 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -45,6 +45,7 @@ use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupRea /// /// This reader is different from the [MergeMode](crate::region::options::MergeMode) as /// it focus on time series (the same key). +#[allow(dead_code)] pub(crate) struct LastRowReader { /// Inner reader. reader: BoxedBatchReader, @@ -52,6 +53,7 @@ pub(crate) struct LastRowReader { selector: LastRowSelector, } +#[allow(dead_code)] impl LastRowReader { /// Creates a new `LastRowReader`. 
pub(crate) fn new(reader: BoxedBatchReader) -> Self { diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs index b5b6904521..d22c87bcc2 100644 --- a/src/mito2/src/read/projection.rs +++ b/src/mito2/src/read/projection.rs @@ -52,51 +52,27 @@ impl ProjectionMapper { pub fn new( metadata: &RegionMetadataRef, projection: impl Iterator + Clone, - flat_format: bool, ) -> Result { - if flat_format { - Ok(ProjectionMapper::Flat(FlatProjectionMapper::new( - metadata, projection, - )?)) - } else { - Ok(ProjectionMapper::PrimaryKey( - PrimaryKeyProjectionMapper::new(metadata, projection)?, - )) - } + Ok(ProjectionMapper::Flat(FlatProjectionMapper::new( + metadata, projection, + )?)) } /// Returns a new mapper with output projection and explicit read columns. pub fn new_with_read_columns( metadata: &RegionMetadataRef, projection: impl Iterator, - flat_format: bool, read_column_ids: Vec, ) -> Result { let projection: Vec<_> = projection.collect(); - if flat_format { - Ok(ProjectionMapper::Flat( - FlatProjectionMapper::new_with_read_columns(metadata, projection, read_column_ids)?, - )) - } else { - Ok(ProjectionMapper::PrimaryKey( - PrimaryKeyProjectionMapper::new_with_read_columns( - metadata, - projection, - read_column_ids, - )?, - )) - } + Ok(ProjectionMapper::Flat( + FlatProjectionMapper::new_with_read_columns(metadata, projection, read_column_ids)?, + )) } /// Returns a new mapper without projection. - pub fn all(metadata: &RegionMetadataRef, flat_format: bool) -> Result { - if flat_format { - Ok(ProjectionMapper::Flat(FlatProjectionMapper::all(metadata)?)) - } else { - Ok(ProjectionMapper::PrimaryKey( - PrimaryKeyProjectionMapper::all(metadata)?, - )) - } + pub fn all(metadata: &RegionMetadataRef) -> Result { + Ok(ProjectionMapper::Flat(FlatProjectionMapper::all(metadata)?)) } /// Returns the metadata that created the mapper. 
@@ -159,6 +135,7 @@ impl ProjectionMapper { } /// Handles projection and converts a projected [Batch] to a projected [RecordBatch]. +#[allow(dead_code)] pub struct PrimaryKeyProjectionMapper { /// Metadata of the region. metadata: RegionMetadataRef, @@ -178,6 +155,7 @@ pub struct PrimaryKeyProjectionMapper { is_empty_projection: bool, } +#[allow(dead_code)] impl PrimaryKeyProjectionMapper { /// Returns a new mapper with projection. /// If `projection` is empty, it outputs [RecordBatch] without any column but only a row count. @@ -413,6 +391,7 @@ pub(crate) fn read_column_ids_from_projection( /// Index of a vector in a [Batch]. #[derive(Debug, Clone, Copy)] +#[allow(dead_code)] enum BatchIndex { /// Index in primary keys. Tag((usize, ColumnId)), @@ -480,53 +459,6 @@ mod tests { }; use super::*; - use crate::cache::CacheManager; - use crate::read::BatchBuilder; - - fn new_batch( - ts_start: i64, - tags: &[i64], - fields: &[(ColumnId, i64)], - num_rows: usize, - ) -> Batch { - let converter = DensePrimaryKeyCodec::with_fields( - (0..tags.len()) - .map(|idx| { - ( - idx as u32, - SortField::new(ConcreteDataType::int64_datatype()), - ) - }) - .collect(), - ); - let primary_key = converter - .encode(tags.iter().map(|v| ValueRef::Int64(*v))) - .unwrap(); - - let mut builder = BatchBuilder::new(primary_key); - builder - .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values( - (0..num_rows).map(|i| ts_start + i as i64 * 1000), - ))) - .unwrap() - .sequences_array(Arc::new(UInt64Array::from_iter_values(0..num_rows as u64))) - .unwrap() - .op_types_array(Arc::new(UInt8Array::from_iter_values( - (0..num_rows).map(|_| OpType::Put as u8), - ))) - .unwrap(); - for (column_id, field) in fields { - builder - .push_field_array( - *column_id, - Arc::new(Int64Array::from_iter_values(std::iter::repeat_n( - *field, num_rows, - ))), - ) - .unwrap(); - } - builder.build().unwrap() - } fn print_record_batch(record_batch: RecordBatch) -> String { 
pretty::pretty_format_batches(&[record_batch.into_df_record_batch()]) @@ -534,166 +466,6 @@ mod tests { .to_string() } - #[test] - fn test_projection_mapper_all() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Create the enum wrapper with default format (primary key) - let mapper = ProjectionMapper::all(&metadata, false).unwrap(); - assert_eq!([0, 1, 2, 3, 4], mapper.column_ids()); - assert_eq!( - [ - (3, ConcreteDataType::int64_datatype()), - (4, ConcreteDataType::int64_datatype()) - ], - mapper.as_primary_key().unwrap().batch_fields() - ); - - // With vector cache. - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let batch = new_batch(0, &[1, 2], &[(3, 3), (4, 4)], 3); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - let expect = "\ -+---------------------+----+----+----+----+ -| ts | k0 | k1 | v0 | v1 | -+---------------------+----+----+----+----+ -| 1970-01-01T00:00:00 | 1 | 2 | 3 | 4 | -| 1970-01-01T00:00:01 | 1 | 2 | 3 | 4 | -| 1970-01-01T00:00:02 | 1 | 2 | 3 | 4 | -+---------------------+----+----+----+----+"; - assert_eq!(expect, print_record_batch(record_batch)); - - assert!( - cache - .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(1)) - .is_some() - ); - assert!( - cache - .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(2)) - .is_some() - ); - assert!( - cache - .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(3)) - .is_none() - ); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - assert_eq!(expect, print_record_batch(record_batch)); - } - - #[test] - fn test_projection_mapper_with_projection() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Columns v1, k0 
- let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), false).unwrap(); - assert_eq!([4, 1], mapper.column_ids()); - assert_eq!( - [(4, ConcreteDataType::int64_datatype())], - mapper.as_primary_key().unwrap().batch_fields() - ); - - let batch = new_batch(0, &[1, 2], &[(4, 4)], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - let expect = "\ -+----+----+ -| v1 | k0 | -+----+----+ -| 4 | 1 | -| 4 | 1 | -| 4 | 1 | -+----+----+"; - assert_eq!(expect, print_record_batch(record_batch)); - } - - #[test] - fn test_projection_mapper_read_superset() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Output columns v1, k0. Read also includes v0. - let mapper = ProjectionMapper::new_with_read_columns( - &metadata, - [4, 1].into_iter(), - false, - vec![4, 1, 3], - ) - .unwrap(); - assert_eq!([4, 1, 3], mapper.column_ids()); - - let batch = new_batch(0, &[1, 2], &[(3, 3), (4, 4)], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - let expect = "\ -+----+----+ -| v1 | k0 | -+----+----+ -| 4 | 1 | -| 4 | 1 | -| 4 | 1 | -+----+----+"; - assert_eq!(expect, print_record_batch(record_batch)); - } - - #[test] - fn test_projection_mapper_empty_projection() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Empty projection - let mapper = ProjectionMapper::new(&metadata, [].into_iter(), false).unwrap(); - assert_eq!([0], mapper.column_ids()); // Should still read the time index column - assert!(mapper.output_schema().is_empty()); - let pk_mapper = 
mapper.as_primary_key().unwrap(); - assert!(pk_mapper.batch_fields().is_empty()); - assert!(!pk_mapper.has_tags); - assert!(pk_mapper.batch_indices.is_empty()); - assert!(pk_mapper.is_empty_projection); - - let batch = new_batch(0, &[1, 2], &[], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let record_batch = pk_mapper.convert(&batch, &cache).unwrap(); - assert_eq!(3, record_batch.num_rows()); - assert_eq!(0, record_batch.num_columns()); - assert!(record_batch.schema.is_empty()); - } - fn new_flat_batch( ts_start: Option, idx_tags: &[(usize, i64)], @@ -809,7 +581,7 @@ mod tests { .build(), ); let cache = CacheStrategy::Disabled; - let mapper = ProjectionMapper::all(&metadata, true).unwrap(); + let mapper = ProjectionMapper::all(&metadata).unwrap(); assert_eq!([0, 1, 2, 3, 4], mapper.column_ids()); assert_eq!( [ @@ -845,7 +617,7 @@ mod tests { ); let cache = CacheStrategy::Disabled; // Columns v1, k0 - let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter()).unwrap(); assert_eq!([4, 1], mapper.column_ids()); assert_eq!( [ @@ -879,13 +651,9 @@ mod tests { ); let cache = CacheStrategy::Disabled; // Output columns v1, k0. Read also includes v0. 
- let mapper = ProjectionMapper::new_with_read_columns( - &metadata, - [4, 1].into_iter(), - true, - vec![4, 1, 3], - ) - .unwrap(); + let mapper = + ProjectionMapper::new_with_read_columns(&metadata, [4, 1].into_iter(), vec![4, 1, 3]) + .unwrap(); assert_eq!([4, 1, 3], mapper.column_ids()); let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3); @@ -911,7 +679,7 @@ mod tests { ); let cache = CacheStrategy::Disabled; // Empty projection - let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [].into_iter()).unwrap(); assert_eq!([0], mapper.column_ids()); // Should still read the time index column assert!(mapper.output_schema().is_empty()); let flat_mapper = mapper.as_flat().unwrap(); diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index 6766bf3f38..55ad504e6f 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -30,11 +30,13 @@ use crate::sst::file::FileTimeRange; use crate::sst::parquet::file_range::FileRangeContextRef; use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupReader}; +#[allow(dead_code)] pub enum Source { RowGroup(RowGroupReader), LastRow(RowGroupLastRowCachedReader), } +#[allow(dead_code)] impl Source { async fn next_batch(&mut self) -> Result> { match self { @@ -44,6 +46,7 @@ impl Source { } } +#[allow(dead_code)] pub struct PruneReader { /// Context for file ranges. 
context: FileRangeContextRef, @@ -53,6 +56,7 @@ pub struct PruneReader { skip_fields: bool, } +#[allow(dead_code)] impl PruneReader { pub(crate) fn new_with_row_group_reader( ctx: FileRangeContextRef, diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs index 5fc8931691..2431a21f6a 100644 --- a/src/mito2/src/read/range_cache.rs +++ b/src/mito2/src/read/range_cache.rs @@ -515,7 +515,7 @@ mod tests { ) -> (StreamContext, PartitionRange) { let env = SchedulerEnv::new().await; let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); - let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); let file_id = FileId::random(); let file = sst_file_handle_with_file_id( @@ -527,8 +527,7 @@ mod tests { .with_predicate(predicate) .with_time_range(query_time_range) .with_files(vec![file]) - .with_cache(test_cache_strategy()) - .with_flat_format(true); + .with_cache(test_cache_strategy()); let range_meta = RangeMeta { time_range: partition_time_range, indices: smallvec![SourceIndex { diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index e7cae7e7b8..f56c807af3 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -63,7 +63,6 @@ use crate::read::unordered_scan::UnorderedScan; use crate::read::{Batch, BoxedRecordBatchStream, RecordBatch, Source}; use crate::region::options::MergeMode; use crate::region::version::VersionRef; -use crate::sst::FormatType; use crate::sst::file::FileHandle; use crate::sst::index::bloom_filter::applier::{ BloomFilterIndexApplierBuilder, BloomFilterIndexApplierRef, @@ -77,8 +76,6 @@ use crate::sst::index::vector_index::applier::{VectorIndexApplier, VectorIndexAp use crate::sst::parquet::file_range::PreFilterMode; use crate::sst::parquet::reader::ReaderMetrics; 
-/// Parallel scan channel size for flat format. -const FLAT_SCAN_CHANNEL_SIZE: usize = 2; #[cfg(feature = "vector_index")] const VECTOR_INDEX_OVERFETCH_MULTIPLIER: usize = 2; @@ -399,19 +396,12 @@ impl ScanRegion { self.request.distribution == Some(TimeSeriesDistribution::PerSeries) } - /// Returns true if the region use flat format. - fn use_flat_format(&self) -> bool { - self.request.force_flat_format - || self.version.options.sst_format.unwrap_or_default() == FormatType::Flat - } - /// Creates a scan input. #[tracing::instrument(skip_all, fields(region_id = %self.region_id()))] - async fn scan_input(mut self) -> Result { + async fn scan_input(self) -> Result { let sst_min_sequence = self.request.sst_min_sequence.and_then(NonZeroU64::new); let time_range = self.build_time_range_predicate(); let predicate = PredicateGroup::new(&self.version.metadata, &self.request.filters)?; - let flat_format = self.use_flat_format(); let read_column_ids = match &self.request.projection { Some(p) => self.build_read_column_ids(p, &predicate)?, @@ -429,10 +419,9 @@ impl ScanRegion { Some(p) => ProjectionMapper::new_with_read_columns( &self.version.metadata, p.iter().copied(), - flat_format, read_column_ids.clone(), )?, - None => ProjectionMapper::all(&self.version.metadata, flat_format)?, + None => ProjectionMapper::all(&self.version.metadata)?, }; let ssts = &self.version.ssts; @@ -496,14 +485,13 @@ impl ScanRegion { let region_id = self.region_id(); debug!( - "Scan region {}, request: {:?}, time range: {:?}, memtables: {}, ssts_to_read: {}, append_mode: {}, flat_format: {}", + "Scan region {}, request: {:?}, time range: {:?}, memtables: {}, ssts_to_read: {}, append_mode: {}", region_id, self.request, time_range, mem_range_builders.len(), files.len(), self.version.options.append_mode, - flat_format, ); let (non_field_filters, field_filters) = self.partition_by_field_filters(); @@ -530,11 +518,6 @@ impl ScanRegion { } }); - if flat_format { - // The batch is already large enough so 
we use a small channel size here. - self.parallel_scan_channel_size = FLAT_SCAN_CHANNEL_SIZE; - } - let input = ScanInput::new(self.access_layer, mapper) .with_time_range(Some(time_range)) .with_predicate(predicate) @@ -552,7 +535,9 @@ impl ScanRegion { .with_merge_mode(self.version.options.merge_mode()) .with_series_row_selector(self.request.series_row_selector) .with_distribution(self.request.distribution) - .with_flat_format(flat_format); + .with_explain_flat_format( + self.version.options.sst_format == Some(crate::sst::FormatType::Flat), + ); #[cfg(feature = "vector_index")] let input = input .with_vector_index_applier(vector_index_applier) @@ -855,8 +840,8 @@ pub struct ScanInput { pub(crate) series_row_selector: Option, /// Hint for the required distribution of the scanner. pub(crate) distribution: Option, - /// Whether to use flat format. - pub(crate) flat_format: bool, + /// Whether the region's configured SST format is flat. + explain_flat_format: bool, /// Whether this scan is for compaction. pub(crate) compaction: bool, #[cfg(feature = "enterprise")] @@ -893,7 +878,7 @@ impl ScanInput { merge_mode: MergeMode::default(), series_row_selector: None, distribution: None, - flat_format: false, + explain_flat_format: false, compaction: false, #[cfg(feature = "enterprise")] extension_ranges: Vec::new(), @@ -1049,6 +1034,13 @@ impl ScanInput { self } + /// Sets whether the region's configured SST format is flat for explain output. + #[must_use] + pub(crate) fn with_explain_flat_format(mut self, explain_flat_format: bool) -> Self { + self.explain_flat_format = explain_flat_format; + self + } + /// Sets the time series row selector. #[must_use] pub(crate) fn with_series_row_selector( @@ -1059,13 +1051,6 @@ impl ScanInput { self } - /// Sets whether to use flat format. - #[must_use] - pub(crate) fn with_flat_format(mut self, flat_format: bool) -> Self { - self.flat_format = flat_format; - self - } - /// Sets whether this scan is for compaction. 
#[must_use] pub(crate) fn with_compaction(mut self, compaction: bool) -> Self { @@ -1165,7 +1150,6 @@ impl ScanInput { }; let res = reader .expected_metadata(Some(self.mapper.metadata().clone())) - .flat_format(self.flat_format) .compaction(self.compaction) .pre_filter_mode(filter_mode) .decode_primary_key_values(decode_pk_values) @@ -1421,8 +1405,7 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { /// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible /// for partition range caching. pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option { - let eligible = input.flat_format - && !input.compaction + let eligible = !input.compaction && !input.files.is_empty() && matches!(input.cache_strategy, CacheStrategy::EnableAll(_)); @@ -1709,8 +1692,7 @@ impl StreamContext { .entries(self.input.files.iter().map(|file| FileWrapper { file })) .finish()?; } - write!(f, ", \"flat_format\": {}", self.input.flat_format)?; - + write!(f, ", \"flat_format\": {}", self.input.explain_flat_format)?; #[cfg(feature = "enterprise")] self.format_extension_ranges(f)?; @@ -1881,9 +1863,7 @@ mod tests { use crate::cache::CacheManager; use crate::memtable::time_partition::TimePartitions; use crate::read::range_cache::ScanRequestFingerprintBuilder; - use crate::region::options::RegionOptions; use crate::region::version::VersionBuilder; - use crate::sst::FormatType; use crate::test_util::memtable_util::{EmptyMemtableBuilder, metadata_with_primary_key}; use crate::test_util::scheduler_util::SchedulerEnv; @@ -1897,30 +1877,9 @@ mod tests { Arc::new(VersionBuilder::new(metadata, mutable).build()) } - fn new_version_with_sst_format( - metadata: RegionMetadataRef, - sst_format: Option, - ) -> VersionRef { - let mutable = Arc::new(TimePartitions::new( - metadata.clone(), - Arc::new(EmptyMemtableBuilder::default()), - 0, - None, - )); - let options = RegionOptions { - sst_format, - ..Default::default() - }; - Arc::new( - 
VersionBuilder::new(metadata, mutable) - .options(options) - .build(), - ) - } - async fn new_scan_input(metadata: RegionMetadataRef, filters: Vec) -> ScanInput { let env = SchedulerEnv::new().await; - let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); let file = FileHandle::new( crate::sst::file::FileMeta::default(), @@ -1934,7 +1893,6 @@ mod tests { .range_result_cache_size(1024) .build(), ))) - .with_flat_format(true) .with_files(vec![file]) } @@ -2018,45 +1976,6 @@ mod tests { assert_eq!(vec![4, 1, 3], read_ids); } - #[tokio::test] - async fn test_use_flat_format_honors_request_override() { - let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); - let env = SchedulerEnv::new().await; - - let primary_key_version = - new_version_with_sst_format(metadata.clone(), Some(FormatType::PrimaryKey)); - let request = ScanRequest::default(); - let scan_region = ScanRegion::new( - primary_key_version.clone(), - env.access_layer.clone(), - request, - CacheStrategy::Disabled, - ); - assert!(!scan_region.use_flat_format()); - - let request = ScanRequest { - force_flat_format: true, - ..Default::default() - }; - let scan_region = ScanRegion::new( - primary_key_version, - env.access_layer.clone(), - request, - CacheStrategy::Disabled, - ); - assert!(scan_region.use_flat_format()); - - let flat_version = new_version_with_sst_format(metadata, Some(FormatType::Flat)); - let request = ScanRequest::default(); - let scan_region = ScanRegion::new( - flat_version, - env.access_layer.clone(), - request, - CacheStrategy::Disabled, - ); - assert!(scan_region.use_flat_format()); - } - /// Helper to create a timestamp millisecond literal. 
fn ts_lit(val: i64) -> datafusion_expr::Expr { lit(ScalarValue::TimestampMillisecond(Some(val), None)) @@ -2128,17 +2047,11 @@ mod tests { let disabled = ScanInput::new( SchedulerEnv::new().await.access_layer.clone(), - ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(), + ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(), ) - .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap()) - .with_flat_format(true); + .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap()); assert!(build_scan_fingerprint(&disabled).is_none()); - let non_flat = new_scan_input(metadata.clone(), filters.clone()) - .await - .with_flat_format(false); - assert!(build_scan_fingerprint(&non_flat).is_none()); - let compaction = new_scan_input(metadata.clone(), filters.clone()) .await .with_compaction(true); diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index d065657242..eee32e7835 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -43,7 +43,7 @@ use crate::read::merge::{MergeMetrics, MergeMetricsReport}; use crate::read::pruner::PartitionPruner; use crate::read::range::{RangeMeta, RowGroupIndex}; use crate::read::scan_region::StreamContext; -use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source}; +use crate::read::{BoxedRecordBatchStream, ScannerMetrics}; use crate::sst::file::{FileTimeRange, RegionFileId}; use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics; use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics; @@ -1186,45 +1186,6 @@ pub(crate) struct SeriesDistributorMetrics { pub(crate) divider_cost: Duration, } -/// Scans memtable ranges at `index`. 
-#[tracing::instrument( - skip_all, - fields( - region_id = %stream_ctx.input.region_metadata().region_id, - file_or_mem_index = %index.index, - row_group_index = %index.row_group_index, - source = "mem" - ) -)] -pub(crate) fn scan_mem_ranges( - stream_ctx: Arc, - part_metrics: PartitionMetrics, - index: RowGroupIndex, - time_range: FileTimeRange, -) -> impl Stream> { - try_stream! { - let ranges = stream_ctx.input.build_mem_ranges(index); - part_metrics.inc_num_mem_ranges(ranges.len()); - for range in ranges { - let build_reader_start = Instant::now(); - let mem_scan_metrics = Some(MemScanMetrics::default()); - let iter = range.build_prune_iter(time_range, mem_scan_metrics.clone())?; - part_metrics.inc_build_reader_cost(build_reader_start.elapsed()); - - let mut source = Source::Iter(iter); - while let Some(batch) = source.next_batch().await? { - yield batch; - } - - // Report the memtable scan metrics to partition metrics - if let Some(ref metrics) = mem_scan_metrics { - let data = metrics.data(); - part_metrics.report_mem_scan_metrics(&data); - } - } - } -} - /// Scans memtable ranges at `index` using flat format that returns RecordBatch. #[tracing::instrument( skip_all, @@ -1342,59 +1303,6 @@ fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics { } } -/// Scans file ranges at `index`. 
-#[tracing::instrument( - skip_all, - fields( - region_id = %stream_ctx.input.region_metadata().region_id, - row_group_index = %index.index, - source = read_type - ) -)] -pub(crate) async fn scan_file_ranges( - stream_ctx: Arc, - part_metrics: PartitionMetrics, - index: RowGroupIndex, - read_type: &'static str, - partition_pruner: Arc, -) -> Result>> { - let mut reader_metrics = ReaderMetrics { - filter_metrics: new_filter_metrics(part_metrics.explain_verbose()), - ..Default::default() - }; - let ranges = partition_pruner - .build_file_ranges(index, &part_metrics, &mut reader_metrics) - .await?; - part_metrics.inc_num_file_ranges(ranges.len()); - part_metrics.merge_reader_metrics(&reader_metrics, None); - - // Creates initial per-file metrics with build_part_cost. - let init_per_file_metrics = if part_metrics.explain_verbose() { - let file = stream_ctx.input.file_from_index(index); - let file_id = file.file_id(); - - let mut map = HashMap::new(); - map.insert( - file_id, - FileScanMetrics { - build_part_cost: reader_metrics.build_cost, - ..Default::default() - }, - ); - Some(map) - } else { - None - }; - - Ok(build_file_range_scan_stream( - stream_ctx, - part_metrics, - read_type, - ranges, - init_per_file_metrics, - )) -} - /// Scans file ranges at `index` using flat reader that returns RecordBatch. #[tracing::instrument( skip_all, @@ -1448,70 +1356,6 @@ pub(crate) async fn scan_flat_file_ranges( )) } -/// Build the stream of scanning the input [`FileRange`]s. -#[tracing::instrument( - skip_all, - fields(read_type = read_type, range_count = ranges.len()) -)] -pub fn build_file_range_scan_stream( - stream_ctx: Arc, - part_metrics: PartitionMetrics, - read_type: &'static str, - ranges: SmallVec<[FileRange; 2]>, - mut per_file_metrics: Option>, -) -> impl Stream> { - try_stream! 
{ - let fetch_metrics = if part_metrics.explain_verbose() { - Some(Arc::new(ParquetFetchMetrics::default())) - } else { - None - }; - let reader_metrics = &mut ReaderMetrics { - fetch_metrics: fetch_metrics.clone(), - ..Default::default() - }; - for range in ranges { - let build_reader_start = Instant::now(); - let Some(reader) = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else { - continue; - }; - let build_cost = build_reader_start.elapsed(); - part_metrics.inc_build_reader_cost(build_cost); - let compat_batch = range.compat_batch(); - let mut source = Source::PruneReader(reader); - while let Some(mut batch) = source.next_batch().await? { - if let Some(compact_batch) = compat_batch { - batch = compact_batch.as_primary_key().unwrap().compat_batch(batch)?; - } - yield batch; - } - if let Source::PruneReader(reader) = source { - let prune_metrics = reader.metrics(); - - // Update per-file metrics if tracking is enabled - if let Some(file_metrics_map) = per_file_metrics.as_mut() { - let file_id = range.file_handle().file_id(); - let file_metrics = file_metrics_map - .entry(file_id) - .or_insert_with(FileScanMetrics::default); - - file_metrics.num_ranges += 1; - file_metrics.num_rows += prune_metrics.num_rows; - file_metrics.build_reader_cost += build_cost; - file_metrics.scan_cost += prune_metrics.scan_cost; - } - - reader_metrics.merge_from(&prune_metrics); - } - } - - // Reports metrics. - reader_metrics.observe_rows(read_type); - reader_metrics.filter_metrics.observe(); - part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref()); - } -} - /// Build the stream of scanning the input [`FileRange`]s using flat reader that returns RecordBatch. #[tracing::instrument( skip_all, @@ -1591,47 +1435,6 @@ pub fn build_flat_file_range_scan_stream( } } -/// Build the stream of scanning the extension range denoted by the [`RowGroupIndex`]. 
-#[cfg(feature = "enterprise")] -pub(crate) async fn scan_extension_range( - context: Arc, - index: RowGroupIndex, - partition_metrics: PartitionMetrics, -) -> Result { - use snafu::ResultExt; - - let range = context.input.extension_range(index.index); - let reader = range.reader(context.as_ref()); - let stream = reader - .read(context, partition_metrics, index) - .await - .context(crate::error::ScanExternalRangeSnafu)?; - Ok(stream) -} - -pub(crate) async fn maybe_scan_other_ranges( - context: &Arc, - index: RowGroupIndex, - metrics: &PartitionMetrics, -) -> Result { - #[cfg(feature = "enterprise")] - { - scan_extension_range(context.clone(), index, metrics.clone()).await - } - - #[cfg(not(feature = "enterprise"))] - { - let _ = context; - let _ = index; - let _ = metrics; - - crate::error::UnexpectedSnafu { - reason: "no other ranges scannable", - } - .fail() - } -} - /// Build the stream of scanning the extension range in flat format denoted by the [`RowGroupIndex`]. #[cfg(feature = "enterprise")] pub(crate) async fn scan_flat_extension_range( diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index a1b3b8f350..49f173e7c9 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -27,7 +27,7 @@ use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::physical_plan::{DisplayAs, DisplayFormatType}; use datatypes::schema::SchemaRef; use futures::{StreamExt, TryStreamExt}; -use snafu::{OptionExt, ensure}; +use snafu::ensure; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ PartitionRange, PrepareRequest, QueryScanContext, RegionScanner, ScannerProperties, @@ -35,24 +35,19 @@ use store_api::region_engine::{ use store_api::storage::TimeSeriesRowSelector; use tokio::sync::Semaphore; -use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, UnexpectedSnafu}; -use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; +use 
crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu}; use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeReader; -use crate::read::last_row::{FlatLastRowReader, LastRowReader}; -use crate::read::merge::MergeReaderBuilder; +use crate::read::last_row::FlatLastRowReader; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::range::RangeMeta; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_file_ranges, - scan_flat_file_ranges, scan_flat_mem_ranges, scan_mem_ranges, - should_split_flat_batches_for_merge, + PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_flat_file_ranges, + scan_flat_mem_ranges, should_split_flat_batches_for_merge, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use crate::read::{ - Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream, ScannerMetrics, Source, scan_util, -}; +use crate::read::{BoxedRecordBatchStream, ScannerMetrics, scan_util}; use crate::region::options::MergeMode; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; @@ -121,7 +116,7 @@ impl SeqScan { let streams = (0..self.properties.partitions.len()) .map(|partition| { let metrics = self.new_partition_metrics(false, &metrics_set, partition); - self.scan_batch_in_partition(partition, metrics) + self.scan_flat_batch_in_partition(partition, metrics) }) .collect::>>()?; @@ -184,57 +179,6 @@ impl SeqScan { Self::build_flat_reader_from_sources(stream_ctx, sources, None, None).await } - /// Builds a reader to read sources. If `semaphore` is provided, reads sources in parallel - /// if possible. 
- #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] - pub(crate) async fn build_reader_from_sources( - stream_ctx: &StreamContext, - mut sources: Vec, - semaphore: Option>, - part_metrics: Option<&PartitionMetrics>, - ) -> Result { - if let Some(semaphore) = semaphore.as_ref() { - // Read sources in parallel. - if sources.len() > 1 { - sources = stream_ctx - .input - .create_parallel_sources(sources, semaphore.clone())?; - } - } - - let mut builder = MergeReaderBuilder::from_sources(sources); - if let Some(metrics) = part_metrics { - builder.with_metrics_reporter(Some(metrics.merge_metrics_reporter())); - } - let reader = builder.build().await?; - - let dedup = !stream_ctx.input.append_mode; - let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter()); - let reader = if dedup { - match stream_ctx.input.merge_mode { - MergeMode::LastRow => Box::new(DedupReader::new( - reader, - LastRow::new(stream_ctx.input.filter_deleted), - dedup_metrics_reporter, - )) as _, - MergeMode::LastNonNull => Box::new(DedupReader::new( - reader, - LastNonNull::new(stream_ctx.input.filter_deleted), - dedup_metrics_reporter, - )) as _, - } - } else { - Box::new(reader) as _ - }; - - let reader = match &stream_ctx.input.series_row_selector { - Some(TimeSeriesRowSelector::LastRow) => Box::new(LastRowReader::new(reader)) as _, - None => reader, - }; - - Ok(reader) - } - /// Builds a flat reader to read sources that returns RecordBatch. If `semaphore` is provided, reads sources in parallel /// if possible. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] @@ -318,13 +262,7 @@ impl SeqScan { let metrics = self.new_partition_metrics(ctx.explain_verbose, metrics_set, partition); let input = &self.stream_ctx.input; - let batch_stream = if input.flat_format { - // Use flat scan for bulk memtables - self.scan_flat_batch_in_partition(partition, metrics.clone())? 
- } else { - // Use regular batch scan for normal memtables - self.scan_batch_in_partition(partition, metrics.clone())? - }; + let batch_stream = self.scan_flat_batch_in_partition(partition, metrics.clone())?; let record_batch_stream = ConvertBatchStream::new( batch_stream, input.mapper.clone(), @@ -338,125 +276,6 @@ impl SeqScan { ))) } - #[tracing::instrument( - skip_all, - fields( - region_id = %self.stream_ctx.input.mapper.metadata().region_id, - partition = partition - ) - )] - fn scan_batch_in_partition( - &self, - partition: usize, - part_metrics: PartitionMetrics, - ) -> Result { - ensure!( - partition < self.properties.partitions.len(), - PartitionOutOfRangeSnafu { - given: partition, - all: self.properties.partitions.len(), - } - ); - - if self.properties.partitions[partition].is_empty() { - return Ok(Box::pin(futures::stream::empty())); - } - - let stream_ctx = self.stream_ctx.clone(); - let semaphore = self.new_semaphore(); - let partition_ranges = self.properties.partitions[partition].clone(); - let compaction = self.stream_ctx.input.compaction; - let distinguish_range = self.properties.distinguish_partition_range; - let file_scan_semaphore = if compaction { None } else { semaphore.clone() }; - let pruner = self.pruner.clone(); - // Initializes ref counts for the pruner. - // If we call scan_batch_in_partition() multiple times but don't read all batches from the stream, - // then the ref count won't be decremented. - // This is a rare case and keeping all remaining entries still uses less memory than a per partition cache. - pruner.add_partition_ranges(&partition_ranges); - let partition_pruner = Arc::new(PartitionPruner::new(pruner, &partition_ranges)); - - let stream = try_stream! { - part_metrics.on_first_poll(); - // Start fetch time before building sources so scan cost contains - // build part cost. 
- let mut fetch_start = Instant::now(); - - let _mapper = stream_ctx.input.mapper.as_primary_key().context(UnexpectedSnafu { - reason: "Unexpected format", - })?; - // Scans each part. - for part_range in partition_ranges { - let mut sources = Vec::new(); - build_sources( - &stream_ctx, - &part_range, - compaction, - &part_metrics, - partition_pruner.clone(), - &mut sources, - file_scan_semaphore.clone(), - ).await?; - - let mut reader = - Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics)) - .await?; - #[cfg(debug_assertions)] - let mut checker = crate::read::BatchChecker::default() - .with_start(Some(part_range.start)) - .with_end(Some(part_range.end)); - - let mut metrics = ScannerMetrics { - scan_cost: fetch_start.elapsed(), - ..Default::default() - }; - fetch_start = Instant::now(); - - while let Some(batch) = reader.next_batch().await? { - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_batches += 1; - metrics.num_rows += batch.num_rows(); - - debug_assert!(!batch.is_empty()); - if batch.is_empty() { - fetch_start = Instant::now(); - continue; - } - - #[cfg(debug_assertions)] - checker.ensure_part_range_batch( - "SeqScan", - _mapper.metadata().region_id, - partition, - part_range, - &batch, - ); - - let yield_start = Instant::now(); - yield ScanBatch::Normal(batch); - metrics.yield_cost += yield_start.elapsed(); - - fetch_start = Instant::now(); - } - - // Yields an empty part to indicate this range is terminated. - // The query engine can use this to optimize some queries. 
- if distinguish_range { - let yield_start = Instant::now(); - yield ScanBatch::Normal(Batch::empty()); - metrics.yield_cost += yield_start.elapsed(); - } - - metrics.scan_cost += fetch_start.elapsed(); - fetch_start = Instant::now(); - part_metrics.merge_metrics(&metrics); - } - - part_metrics.on_finish(); - }; - Ok(Box::pin(stream)) - } - #[tracing::instrument( skip_all, fields( @@ -709,108 +528,6 @@ impl fmt::Debug for SeqScan { } } -/// Builds sources for the partition range and push them to the `sources` vector. -pub(crate) async fn build_sources( - stream_ctx: &Arc, - part_range: &PartitionRange, - compaction: bool, - part_metrics: &PartitionMetrics, - partition_pruner: Arc, - sources: &mut Vec, - semaphore: Option>, -) -> Result<()> { - // Gets range meta. - let range_meta = &stream_ctx.ranges[part_range.identifier]; - #[cfg(debug_assertions)] - if compaction { - // Compaction expects input sources are not been split. - debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len()); - for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() { - // It should scan all row groups. 
- debug_assert_eq!( - -1, row_group_idx.row_group_index, - "Expect {} range scan all row groups, given: {}", - i, row_group_idx.row_group_index, - ); - } - } - - let read_type = if compaction { - "compaction" - } else { - "seq_scan_files" - }; - let num_indices = range_meta.row_group_indices.len(); - if num_indices == 0 { - return Ok(()); - } - - sources.reserve(num_indices); - let mut ordered_sources = Vec::with_capacity(num_indices); - ordered_sources.resize_with(num_indices, || None); - let mut file_scan_tasks = Vec::new(); - - for (position, index) in range_meta.row_group_indices.iter().enumerate() { - if stream_ctx.is_mem_range_index(*index) { - let stream = scan_mem_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - range_meta.time_range, - ); - ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); - } else if stream_ctx.is_file_range_index(*index) { - if let Some(semaphore_ref) = semaphore.as_ref() { - // run in parallel, controlled by semaphore - let stream_ctx = stream_ctx.clone(); - let part_metrics = part_metrics.clone(); - let partition_pruner = partition_pruner.clone(); - let semaphore = Arc::clone(semaphore_ref); - let row_group_index = *index; - file_scan_tasks.push(async move { - let _permit = semaphore.acquire().await.unwrap(); - let stream = scan_file_ranges( - stream_ctx, - part_metrics, - row_group_index, - read_type, - partition_pruner, - ) - .await?; - Ok((position, Source::Stream(Box::pin(stream) as _))) - }); - } else { - // no semaphore, run sequentially - let stream = scan_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - read_type, - partition_pruner.clone(), - ) - .await?; - ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); - } - } else { - let stream = - scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await?; - ordered_sources[position] = Some(Source::Stream(stream)); - } - } - - if !file_scan_tasks.is_empty() { - let results = 
futures::future::try_join_all(file_scan_tasks).await?; - for (position, source) in results { - ordered_sources[position] = Some(source); - } - } - - for source in ordered_sources.into_iter().flatten() { - sources.push(source); - } - Ok(()) -} - /// Builds flat sources for the partition range and push them to the `sources` vector. pub(crate) async fn build_flat_sources( stream_ctx: &Arc, diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index 5109120d92..d2e37af66a 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -30,7 +30,7 @@ use datatypes::arrow::array::BinaryArray; use datatypes::arrow::record_batch::RecordBatch; use datatypes::schema::SchemaRef; use futures::{StreamExt, TryStreamExt}; -use smallvec::{SmallVec, smallvec}; +use smallvec::SmallVec; use snafu::{OptionExt, ResultExt, ensure}; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ @@ -44,12 +44,12 @@ use crate::error::{ Error, InvalidSenderSnafu, PartitionOutOfRangeSnafu, Result, ScanMultiTimesSnafu, ScanSeriesSnafu, TooManyFilesToReadSnafu, }; +use crate::read::ScannerMetrics; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics}; -use crate::read::seq_scan::{SeqScan, build_flat_sources, build_sources}; +use crate::read::seq_scan::{SeqScan, build_flat_sources}; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use crate::read::{Batch, ScannerMetrics}; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::PrimaryKeyArray; @@ -443,11 +443,7 @@ impl SeriesDistributor { fields(region_id = %self.stream_ctx.input.mapper.metadata().region_id) )] async fn execute(&mut self) { - let result = if self.stream_ctx.input.flat_format { - self.scan_partitions_flat().await - } else { - 
self.scan_partitions().await - }; + let result = self.scan_partitions_flat().await; if let Err(e) = result { self.senders.send_error(e).await; @@ -559,151 +555,11 @@ impl SeriesDistributor { Ok(()) } - - /// Scans all parts. - #[tracing::instrument( - skip_all, - fields(region_id = %self.stream_ctx.input.mapper.metadata().region_id) - )] - async fn scan_partitions(&mut self) -> Result<()> { - // Initialize reference counts for all partition ranges. - for partition_ranges in &self.partitions { - self.pruner.add_partition_ranges(partition_ranges); - } - - // Create PartitionPruner covering all partitions - let all_partition_ranges: Vec<_> = self.partitions.iter().flatten().cloned().collect(); - let partition_pruner = Arc::new(PartitionPruner::new( - self.pruner.clone(), - &all_partition_ranges, - )); - - let part_metrics = new_partition_metrics( - &self.stream_ctx, - self.explain_verbose, - &self.metrics_set, - self.partitions.len(), - &self.metrics_list, - ); - part_metrics.on_first_poll(); - // Start fetch time before building sources so scan cost contains - // build part cost. - let mut fetch_start = Instant::now(); - - // Scans all parts. - let mut sources = Vec::with_capacity(self.partitions.len()); - for partition in &self.partitions { - sources.reserve(partition.len()); - for part_range in partition { - build_sources( - &self.stream_ctx, - part_range, - false, - &part_metrics, - partition_pruner.clone(), - &mut sources, - self.semaphore.clone(), - ) - .await?; - } - } - - // Builds a reader that merge sources from all parts. - let mut reader = SeqScan::build_reader_from_sources( - &self.stream_ctx, - sources, - self.semaphore.clone(), - Some(&part_metrics), - ) - .await?; - let mut metrics = SeriesDistributorMetrics::default(); - - let mut current_series = PrimaryKeySeriesBatch::default(); - while let Some(batch) = reader.next_batch().await? 
{ - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_batches += 1; - metrics.num_rows += batch.num_rows(); - - debug_assert!(!batch.is_empty()); - if batch.is_empty() { - fetch_start = Instant::now(); - continue; - } - - let Some(last_key) = current_series.current_key() else { - current_series.push(batch); - fetch_start = Instant::now(); - continue; - }; - - if last_key == batch.primary_key() { - current_series.push(batch); - fetch_start = Instant::now(); - continue; - } - - // We find a new series, send the current one. - let to_send = - std::mem::replace(&mut current_series, PrimaryKeySeriesBatch::single(batch)); - let yield_start = Instant::now(); - self.senders - .send_batch(SeriesBatch::PrimaryKey(to_send)) - .await?; - metrics.yield_cost += yield_start.elapsed(); - fetch_start = Instant::now(); - } - - if !current_series.is_empty() { - let yield_start = Instant::now(); - self.senders - .send_batch(SeriesBatch::PrimaryKey(current_series)) - .await?; - metrics.yield_cost += yield_start.elapsed(); - } - - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_series_send_timeout = self.senders.num_timeout; - metrics.num_series_send_full = self.senders.num_full; - part_metrics.set_distributor_metrics(&metrics); - - part_metrics.on_finish(); - - Ok(()) - } -} - -/// Batches of the same series in primary key format. -#[derive(Default, Debug)] -pub struct PrimaryKeySeriesBatch { - pub batches: SmallVec<[Batch; 4]>, -} - -impl PrimaryKeySeriesBatch { - /// Creates a new [PrimaryKeySeriesBatch] from a single [Batch]. - fn single(batch: Batch) -> Self { - Self { - batches: smallvec![batch], - } - } - - fn current_key(&self) -> Option<&[u8]> { - self.batches.first().map(|batch| batch.primary_key()) - } - - fn push(&mut self, batch: Batch) { - self.batches.push(batch); - } - - /// Returns true if there is no batch. - fn is_empty(&self) -> bool { - self.batches.is_empty() - } } /// Batches of the same series. 
#[derive(Debug)] pub enum SeriesBatch { - PrimaryKey(PrimaryKeySeriesBatch), Flat(FlatSeriesBatch), } @@ -711,7 +567,6 @@ impl SeriesBatch { /// Returns the number of batches. pub fn num_batches(&self) -> usize { match self { - SeriesBatch::PrimaryKey(primary_key_batch) => primary_key_batch.batches.len(), SeriesBatch::Flat(flat_batch) => flat_batch.batches.len(), } } @@ -719,9 +574,6 @@ impl SeriesBatch { /// Returns the total number of rows across all batches. pub fn num_rows(&self) -> usize { match self { - SeriesBatch::PrimaryKey(primary_key_batch) => { - primary_key_batch.batches.iter().map(|x| x.num_rows()).sum() - } SeriesBatch::Flat(flat_batch) => flat_batch.batches.iter().map(|x| x.num_rows()).sum(), } } diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs index 80002147ea..c8547fdf0c 100644 --- a/src/mito2/src/read/stream.rs +++ b/src/mito2/src/read/stream.rs @@ -27,14 +27,12 @@ use snafu::ResultExt; use crate::cache::CacheStrategy; use crate::error::Result; -use crate::read::Batch; use crate::read::projection::ProjectionMapper; use crate::read::scan_util::PartitionMetrics; use crate::read::series_scan::SeriesBatch; /// All kinds of [`Batch`]es to produce in scanner. pub enum ScanBatch { - Normal(Batch), Series(SeriesBatch), RecordBatch(DfRecordBatch), } @@ -45,6 +43,7 @@ pub type ScanBatchStream = BoxStream<'static, Result>; pub(crate) struct ConvertBatchStream { inner: ScanBatchStream, projection_mapper: Arc, + #[allow(dead_code)] cache_strategy: CacheStrategy, partition_metrics: PartitionMetrics, pending: VecDeque, @@ -68,41 +67,19 @@ impl ConvertBatchStream { fn convert(&mut self, batch: ScanBatch) -> common_recordbatch::error::Result { match batch { - ScanBatch::Normal(batch) => { - // Safety: Only primary key format returns this batch. 
- let mapper = self.projection_mapper.as_primary_key().unwrap(); - - if batch.is_empty() { - Ok(mapper.empty_record_batch()) - } else { - mapper.convert(&batch, &self.cache_strategy) - } - } ScanBatch::Series(series) => { debug_assert!( self.pending.is_empty(), "ConvertBatchStream should not convert a new SeriesBatch when pending batches exist" ); - match series { - SeriesBatch::PrimaryKey(primary_key_batch) => { - // Safety: Only primary key format returns this batch. - let mapper = self.projection_mapper.as_primary_key().unwrap(); + let SeriesBatch::Flat(flat_batch) = series; + // Safety: Only flat format returns this batch. + let mapper = self.projection_mapper.as_flat().unwrap(); - for batch in primary_key_batch.batches { - self.pending - .push_back(mapper.convert(&batch, &self.cache_strategy)?); - } - } - SeriesBatch::Flat(flat_batch) => { - // Safety: Only flat format returns this batch. - let mapper = self.projection_mapper.as_flat().unwrap(); - - for batch in flat_batch.batches { - self.pending - .push_back(mapper.convert(&batch, &self.cache_strategy)?); - } - } + for batch in flat_batch.batches { + self.pending + .push_back(mapper.convert(&batch, &self.cache_strategy)?); } let output_schema = self.projection_mapper.output_schema(); diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index 2d557e8871..9763d14cd2 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -37,11 +37,10 @@ use crate::error::{PartitionOutOfRangeSnafu, Result}; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, scan_file_ranges, scan_flat_file_ranges, - scan_flat_mem_ranges, scan_mem_ranges, + PartitionMetrics, PartitionMetricsList, scan_flat_file_ranges, scan_flat_mem_ranges, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use 
crate::read::{Batch, ScannerMetrics, scan_util}; +use crate::read::{ScannerMetrics, scan_util}; /// Scans a region without providing any output ordering guarantee. /// @@ -103,59 +102,6 @@ impl UnorderedScan { Ok(stream) } - /// Scans a [PartitionRange] by its `identifier` and returns a stream. - #[tracing::instrument( - skip_all, - fields( - region_id = %stream_ctx.input.region_metadata().region_id, - part_range_id = part_range_id - ) - )] - fn scan_partition_range( - stream_ctx: Arc, - part_range_id: usize, - part_metrics: PartitionMetrics, - partition_pruner: Arc, - ) -> impl Stream> { - try_stream! { - // Gets range meta. - let range_meta = &stream_ctx.ranges[part_range_id]; - for index in &range_meta.row_group_indices { - if stream_ctx.is_mem_range_index(*index) { - let stream = scan_mem_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - range_meta.time_range, - ); - for await batch in stream { - yield batch?; - } - } else if stream_ctx.is_file_range_index(*index) { - let stream = scan_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - "unordered_scan_files", - partition_pruner.clone(), - ).await?; - for await batch in stream { - yield batch?; - } - } else { - let stream = scan_util::maybe_scan_other_ranges( - &stream_ctx, - *index, - &part_metrics, - ).await?; - for await batch in stream { - yield batch?; - } - } - } - } - } - /// Scans a [PartitionRange] by its `identifier` and returns a flat stream of RecordBatch. 
#[tracing::instrument( skip_all, @@ -216,7 +162,7 @@ impl UnorderedScan { let streams = (0..self.properties.partitions.len()) .map(|partition| { let metrics = self.partition_metrics(false, partition, &metrics_set); - self.scan_batch_in_partition(partition, metrics) + self.scan_flat_batch_in_partition(partition, metrics) }) .collect::>>()?; @@ -265,13 +211,7 @@ impl UnorderedScan { let metrics = self.partition_metrics(ctx.explain_verbose, partition, metrics_set); let input = &self.stream_ctx.input; - let batch_stream = if input.flat_format { - // Use flat scan for bulk memtables - self.scan_flat_batch_in_partition(partition, metrics.clone())? - } else { - // Use regular batch scan for normal memtables - self.scan_batch_in_partition(partition, metrics.clone())? - }; + let batch_stream = self.scan_flat_batch_in_partition(partition, metrics.clone())?; let record_batch_stream = ConvertBatchStream::new( batch_stream, @@ -286,100 +226,6 @@ impl UnorderedScan { ))) } - #[tracing::instrument( - skip_all, - fields( - region_id = %self.stream_ctx.input.mapper.metadata().region_id, - partition = partition - ) - )] - fn scan_batch_in_partition( - &self, - partition: usize, - part_metrics: PartitionMetrics, - ) -> Result { - ensure!( - partition < self.properties.partitions.len(), - PartitionOutOfRangeSnafu { - given: partition, - all: self.properties.partitions.len(), - } - ); - - let stream_ctx = self.stream_ctx.clone(); - let part_ranges = self.properties.partitions[partition].clone(); - let distinguish_range = self.properties.distinguish_partition_range; - let pruner = self.pruner.clone(); - // Initializes ref counts for the pruner. - // If we call scan_batch_in_partition() multiple times but don't read all batches from the stream, - // then the ref count won't be decremented. - // This is a rare case and keeping all remaining entries still uses less memory than a per partition cache. 
- pruner.add_partition_ranges(&part_ranges); - let partition_pruner = Arc::new(PartitionPruner::new(pruner, &part_ranges)); - - let stream = try_stream! { - part_metrics.on_first_poll(); - - // Scans each part. - for part_range in part_ranges { - let mut metrics = ScannerMetrics::default(); - let mut fetch_start = Instant::now(); - let _mapper = &stream_ctx.input.mapper; - #[cfg(debug_assertions)] - let mut checker = crate::read::BatchChecker::default() - .with_start(Some(part_range.start)) - .with_end(Some(part_range.end)); - - let stream = Self::scan_partition_range( - stream_ctx.clone(), - part_range.identifier, - part_metrics.clone(), - partition_pruner.clone(), - ); - for await batch in stream { - let batch = batch?; - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_batches += 1; - metrics.num_rows += batch.num_rows(); - - debug_assert!(!batch.is_empty()); - if batch.is_empty() { - continue; - } - - #[cfg(debug_assertions)] - checker.ensure_part_range_batch( - "UnorderedScan", - _mapper.metadata().region_id, - partition, - part_range, - &batch, - ); - - let yield_start = Instant::now(); - yield ScanBatch::Normal(batch); - metrics.yield_cost += yield_start.elapsed(); - - fetch_start = Instant::now(); - } - - // Yields an empty part to indicate this range is terminated. - // The query engine can use this to optimize some queries. 
- if distinguish_range { - let yield_start = Instant::now(); - yield ScanBatch::Normal(Batch::empty()); - metrics.yield_cost += yield_start.elapsed(); - } - - metrics.scan_cost += fetch_start.elapsed(); - part_metrics.merge_metrics(&metrics); - } - - part_metrics.on_finish(); - }; - Ok(Box::pin(stream)) - } - #[tracing::instrument( skip_all, fields( diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 4a3466a29c..2ca83ca8cf 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -895,7 +895,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -960,7 +959,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -1015,7 +1013,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -1549,7 +1546,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -1652,7 +1648,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .bloom_filter_index_appliers([None, bloom_filter_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -1712,7 +1707,6 @@ mod tests { let builder = ParquetReaderBuilder::new(FILE_DIR.to_string(), PathType::Bare, handle, object_store) - .flat_format(true) 
.predicate(Some(Predicate::new(vec![col("tag_0").eq(lit("a"))]))); let mut metrics = ReaderMetrics::default(); @@ -1774,7 +1768,6 @@ mod tests { let builder = ParquetReaderBuilder::new(FILE_DIR.to_string(), PathType::Bare, handle, object_store) - .flat_format(true) .predicate(Some(Predicate::new(vec![col("tag_0").eq(lit("a"))]))); let mut metrics = ReaderMetrics::default(); @@ -1884,7 +1877,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -1991,7 +1983,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .bloom_filter_index_appliers([None, bloom_filter_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -2255,7 +2246,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .fulltext_index_appliers([None, fulltext_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -2304,7 +2294,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .fulltext_index_appliers([None, fulltext_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); diff --git a/src/mito2/src/sst/parquet/file_range.rs b/src/mito2/src/sst/parquet/file_range.rs index 8b4a61acb7..bf86e4a764 100644 --- a/src/mito2/src/sst/parquet/file_range.rs +++ b/src/mito2/src/sst/parquet/file_range.rs @@ -175,6 +175,7 @@ impl FileRange { } /// Returns a reader to read the [FileRange]. 
+ #[allow(dead_code)] pub(crate) async fn reader( &self, selector: Option, diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 8832cd4a16..73ca7748e9 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -141,8 +141,6 @@ pub struct ParquetReaderBuilder { /// This is usually the latest metadata of the region. The reader use /// it get the correct column id of a column by name. expected_metadata: Option, - /// Whether to use flat format for reading. - flat_format: bool, /// Whether this reader is for compaction. compaction: bool, /// Mode to pre-filter columns. @@ -176,7 +174,6 @@ impl ParquetReaderBuilder { #[cfg(feature = "vector_index")] vector_index_k: None, expected_metadata: None, - flat_format: false, compaction: false, pre_filter_mode: PreFilterMode::All, decode_primary_key_values: false, @@ -257,13 +254,6 @@ impl ParquetReaderBuilder { self } - /// Sets the flat format flag. - #[must_use] - pub fn flat_format(mut self, flat_format: bool) -> Self { - self.flat_format = flat_format; - self - } - /// Sets the compaction flag. #[must_use] pub fn compaction(mut self, compaction: bool) -> Self { @@ -304,8 +294,7 @@ impl ParquetReaderBuilder { pub async fn build(&self) -> Result> { let mut metrics = ReaderMetrics::default(); - let Some((context, selection)) = self.build_reader_input_inner(&mut metrics, true).await? - else { + let Some((context, selection)) = self.build_reader_input_inner(&mut metrics).await? 
else { return Ok(None); }; ParquetReader::new(Arc::new(context), selection) @@ -327,14 +316,12 @@ impl ParquetReaderBuilder { &self, metrics: &mut ReaderMetrics, ) -> Result> { - self.build_reader_input_inner(metrics, self.flat_format) - .await + self.build_reader_input_inner(metrics).await } async fn build_reader_input_inner( &self, metrics: &mut ReaderMetrics, - flat_format: bool, ) -> Result> { let start = Instant::now(); @@ -376,7 +363,6 @@ impl ParquetReaderBuilder { // before compat handling. let compaction_projection_mapper = if self.compaction && !is_same_region_partition - && flat_format && region_meta.primary_key_encoding == PrimaryKeyEncoding::Sparse { Some(CompactionProjectionMapper::try_new(&region_meta)?) @@ -388,7 +374,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(column_ids), - flat_format, + true, // Always reads as flat format. Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -404,7 +390,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(&column_ids), - flat_format, + true, // Always reads as flat format. Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -2060,6 +2046,7 @@ impl RowGroupReaderContext for FileRangeContextRef { /// [RowGroupReader] that reads from [FileRange]. pub(crate) type RowGroupReader = RowGroupReaderBase; +#[allow(dead_code)] impl RowGroupReader { /// Creates a new reader from file range. pub(crate) fn new( @@ -2084,6 +2071,7 @@ pub(crate) struct RowGroupReaderBase { override_sequence: Option, } +#[allow(dead_code)] impl RowGroupReaderBase where T: RowGroupReaderContext, diff --git a/src/store-api/src/storage/requests.rs b/src/store-api/src/storage/requests.rs index d072ec1b39..db3fb0388a 100644 --- a/src/store-api/src/storage/requests.rs +++ b/src/store-api/src/storage/requests.rs @@ -128,8 +128,6 @@ pub struct ScanRequest { /// Optional hint for KNN vector search.
When set, the scan should use /// vector index to find the k nearest neighbors. pub vector_search: Option, - /// Whether to force reading region data in flat format. - pub force_flat_format: bool, } impl Display for ScanRequest { @@ -220,14 +218,6 @@ impl Display for ScanRequest { vector_search.metric )?; } - if self.force_flat_format { - write!( - f, - "{}force_flat_format: {}", - delimiter.as_str(), - self.force_flat_format - )?; - } write!(f, " }}") } } @@ -282,15 +272,6 @@ mod tests { "ScanRequest { projection: [1, 2], limit: 10 }" ); - let request = ScanRequest { - force_flat_format: true, - ..Default::default() - }; - assert_eq!( - request.to_string(), - "ScanRequest { force_flat_format: true }" - ); - let request = ScanRequest { snapshot_on_scan: true, ..Default::default() From 8d495909d3cfd91aa212acefcbe0133b39ddbf51 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Thu, 2 Apr 2026 18:01:18 +0800 Subject: [PATCH 070/195] feat: auto alter table during trace ingestion from int to float (#7871) * feat: impl alter table Signed-off-by: shuiyisong * chore: minor refactor Signed-off-by: shuiyisong * chore: address issues Signed-off-by: shuiyisong * chore: address issues Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/frontend/src/instance/otlp.rs | 361 +++++++++++++++++++++++++++--- tests-integration/tests/http.rs | 196 ++++++++++++++++ 2 files changed, 520 insertions(+), 37 deletions(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 8cda639686..8b3f8b3eec 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -15,14 +15,17 @@ use std::sync::Arc; use api::helper::ColumnDataTypeWrapper; -use api::v1::{ColumnDataType, RowInsertRequests}; +use api::v1::alter_table_expr::Kind; +use api::v1::{ + AlterTableExpr, ColumnDataType, ModifyColumnType, ModifyColumnTypes, RowInsertRequests, +}; use async_trait::async_trait; use 
auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::Output; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_query::prelude::GREPTIME_PHYSICAL_TABLE; -use common_telemetry::tracing; +use common_telemetry::{tracing, warn}; use itertools::Itertools; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; @@ -42,7 +45,7 @@ use servers::query_handler::{ OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome, }; use session::context::QueryContextRef; -use snafu::ResultExt; +use snafu::{IntoError, ResultExt}; use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM}; use crate::instance::Instance; @@ -60,6 +63,33 @@ enum ChunkFailureReaction { Propagate, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum TraceReconcileDecision { + UseExisting(ColumnDataType), + UseRequestLocal(ColumnDataType), + AlterExistingTo(ColumnDataType), +} + +impl TraceReconcileDecision { + fn target_type(self) -> ColumnDataType { + match self { + Self::UseExisting(target_type) + | Self::UseRequestLocal(target_type) + | Self::AlterExistingTo(target_type) => target_type, + } + } + + fn requires_alter(self) -> bool { + matches!(self, Self::AlterExistingTo(_)) + } +} + +struct PendingTraceColumnRewrite { + col_idx: usize, + target_type: ColumnDataType, + column_name: String, +} + impl ChunkFailureReaction { fn as_metric_label(self) -> &'static str { match self { @@ -546,34 +576,118 @@ impl Instance { Some(summary) } - /// Picks the final datatype for one trace column. + /// Picks the reconciliation action for one trace column. /// - /// Existing table schema is authoritative when present. Otherwise we resolve the - /// request-local observed types using the shared trace coercion rules. 
- fn choose_trace_target_type( + /// Existing table schema is authoritative unless the only incompatible case is + /// widening an existing Int64 column to Float64 for incoming Int64/Float64 data. + fn choose_trace_reconcile_decision( observed_types: &[ColumnDataType], existing_type: Option, - ) -> ServerResult> { + ) -> ServerResult> { let Some(existing_type) = existing_type else { - return resolve_new_trace_column_type(observed_types.iter().copied()).map_err(|_| { - error::InvalidParameterSnafu { - reason: "unsupported trace type mix".to_string(), - } - .build() - }); + return resolve_new_trace_column_type(observed_types.iter().copied()) + .map(|target_type| target_type.map(TraceReconcileDecision::UseRequestLocal)) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .build() + }); }; - if observed_types.iter().copied().all(|request_type| { + if observed_types.iter().all(|&request_type| { request_type == existing_type || is_supported_trace_coercion(request_type, existing_type) }) { - Ok(Some(existing_type)) - } else { - error::InvalidParameterSnafu { - reason: "unsupported trace type mix".to_string(), - } - .fail() + return Ok(Some(TraceReconcileDecision::UseExisting(existing_type))); } + + if existing_type == ColumnDataType::Int64 + && observed_types.contains(&ColumnDataType::Float64) + && observed_types.iter().all(|observed_type| { + matches!( + observed_type, + ColumnDataType::Int64 | ColumnDataType::Float64 + ) + }) + { + return Ok(Some(TraceReconcileDecision::AlterExistingTo( + ColumnDataType::Float64, + ))); + } + + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .fail() + } + + /// Widen existing trace table columns to Float64 before request rewrite. 
+ async fn alter_trace_table_columns_to_float64( + &self, + ctx: &QueryContextRef, + table_name: &str, + column_names: &[String], + ) -> ServerResult<()> { + let catalog_name = ctx.current_catalog().to_string(); + let schema_name = ctx.current_schema(); + let alter_expr = AlterTableExpr { + catalog_name: catalog_name.clone(), + schema_name: schema_name.clone(), + table_name: table_name.to_string(), + kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes { + modify_column_types: column_names + .iter() + .map(|column_name| ModifyColumnType { + column_name: column_name.clone(), + target_type: ColumnDataType::Float64 as i32, + target_type_extension: None, + }) + .collect(), + })), + }; + + if let Err(err) = self + .statement_executor + .alter_table_inner(alter_expr, ctx.clone()) + .await + { + let table = self + .catalog_manager + .table(&catalog_name, &schema_name, table_name, None) + .await + .map_err(servers::error::Error::from)?; + let alter_already_applied = table + .map(|table| { + let table_schema = table.schema(); + column_names.iter().all(|column_name| { + table_schema + .column_schema_by_name(column_name) + .and_then(|table_col| { + ColumnDataTypeWrapper::try_from(table_col.data_type.clone()) + .ok() + .map(|wrapper| wrapper.datatype()) + }) + == Some(ColumnDataType::Float64) + }) + }) + .unwrap_or(false); + + if alter_already_applied { + return Ok(()); + } + + warn!( + table_name, + columns = ?column_names, + error = %err, + "failed to widen trace columns before insert" + ); + + return Err(wrap_trace_alter_failure(err)); + } + + Ok(()) } /// Coerce request column types and values to match the existing table schema @@ -598,7 +712,8 @@ impl Instance { }; let table_schema = table.map(|table| table.schema()); - let mut pending_coercions = Vec::new(); + let mut pending_rewrites = Vec::new(); + let mut pending_alter_columns = Vec::new(); for (col_idx, col_schema) in rows.schema.iter().enumerate() { let Some(current_type) = 
ColumnDataType::try_from(col_schema.datatype).ok() else { @@ -647,8 +762,8 @@ impl Instance { // Decide the final type once per column, then rewrite all affected cells // together in one row pass below. - let Some(target_type) = - Self::choose_trace_target_type(&observed_types, existing_type).map_err( + let Some(decision) = + Self::choose_trace_reconcile_decision(&observed_types, existing_type).map_err( |_| { enrich_trace_reconcile_error( &req.table_name, @@ -661,31 +776,54 @@ impl Instance { else { continue; }; + let target_type = decision.target_type(); - if observed_types - .iter() - .all(|observed| *observed == target_type) + if !decision.requires_alter() + && observed_types + .iter() + .all(|observed| *observed == target_type) && col_schema.datatype == target_type as i32 { continue; } - pending_coercions.push((col_idx, target_type, col_schema.column_name.clone())); + if decision.requires_alter() + && !pending_alter_columns.contains(&col_schema.column_name) + { + pending_alter_columns.push(col_schema.column_name.clone()); + } + + pending_rewrites.push(PendingTraceColumnRewrite { + col_idx, + target_type, + column_name: col_schema.column_name.clone(), + }); } - if pending_coercions.is_empty() { + if pending_rewrites.is_empty() { continue; } + validate_trace_column_rewrites(&rows.rows, &pending_rewrites, &req.table_name)?; + + if !pending_alter_columns.is_empty() { + self.alter_trace_table_columns_to_float64( + ctx, + &req.table_name, + &pending_alter_columns, + ) + .await?; + } + // Update schema metadata before mutating row values so both stay in sync. - for (col_idx, target_type, ..) in &pending_coercions { - rows.schema[*col_idx].datatype = *target_type as i32; + for pending_rewrite in &pending_rewrites { + rows.schema[pending_rewrite.col_idx].datatype = pending_rewrite.target_type as i32; } // Apply all pending column rewrites in one row pass. 
for row in &mut rows.rows { - for (col_idx, target_type, column_name) in &pending_coercions { - let Some(value) = row.values.get_mut(*col_idx) else { + for pending_rewrite in &pending_rewrites { + let Some(value) = row.values.get_mut(pending_rewrite.col_idx) else { continue; }; let Some(request_type) = @@ -693,20 +831,23 @@ impl Instance { else { continue; }; - if request_type == *target_type { + if request_type == pending_rewrite.target_type { continue; } value.value_data = coerce_value_data( &value.value_data, - *target_type, + pending_rewrite.target_type, request_type, ) .map_err(|_| { error::InvalidParameterSnafu { reason: format!( "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", - column_name, req.table_name, request_type, target_type + pending_rewrite.column_name, + req.table_name, + request_type, + pending_rewrite.target_type ), } .build() @@ -719,6 +860,52 @@ impl Instance { } } +/// Validate all pending trace column rewrites before any schema mutation happens. +fn validate_trace_column_rewrites( + rows: &[api::v1::Row], + pending_rewrites: &[PendingTraceColumnRewrite], + table_name: &str, +) -> ServerResult<()> { + for row in rows { + for pending_rewrite in pending_rewrites { + let Some(value) = row.values.get(pending_rewrite.col_idx) else { + continue; + }; + let Some(request_type) = value.value_data.as_ref().and_then(trace_value_datatype) + else { + continue; + }; + if request_type == pending_rewrite.target_type { + continue; + } + + coerce_value_data(&value.value_data, pending_rewrite.target_type, request_type) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: format!( + "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", + pending_rewrite.column_name, + table_name, + request_type, + pending_rewrite.target_type + ), + } + .build() + })?; + } + } + + Ok(()) +} + +/// Preserve the original alter failure status so chunk retry behavior stays correct. 
+fn wrap_trace_alter_failure(err: E) -> servers::error::Error +where + E: ErrorExt + Send + Sync + 'static, +{ + error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err)) +} + fn enrich_trace_reconcile_error( table_name: &str, column_name: &str, @@ -767,10 +954,16 @@ fn push_observed_trace_type(observed_types: &mut Vec, datatype: #[cfg(test)] mod tests { + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, Row, Value}; + use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use servers::query_handler::TraceIngestOutcome; - use super::{ChunkFailureReaction, Instance}; + use super::{ + ChunkFailureReaction, Instance, PendingTraceColumnRewrite, TraceReconcileDecision, + validate_trace_column_rewrites, wrap_trace_alter_failure, + }; use crate::metrics::OTLP_TRACES_FAILURE_COUNT; #[test] @@ -923,4 +1116,98 @@ mod tests { ChunkFailureReaction::DiscardChunk ); } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_keeps_int64() { + assert_eq!( + Instance::choose_trace_reconcile_decision( + &[ColumnDataType::Int64], + Some(ColumnDataType::Int64) + ) + .unwrap(), + Some(TraceReconcileDecision::UseExisting(ColumnDataType::Int64)) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_widens_to_float64() { + assert_eq!( + Instance::choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + Some(ColumnDataType::Int64) + ) + .unwrap(), + Some(TraceReconcileDecision::AlterExistingTo( + ColumnDataType::Float64 + )) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_float64_stays_authoritative() { + assert_eq!( + Instance::choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + Some(ColumnDataType::Float64) + ) + .unwrap(), + Some(TraceReconcileDecision::UseExisting(ColumnDataType::Float64)) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_with_boolean_is_error() { + let err = 
Instance::choose_trace_reconcile_decision( + &[ColumnDataType::Boolean, ColumnDataType::Int64], + Some(ColumnDataType::Int64), + ) + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + } + + #[test] + fn test_choose_trace_reconcile_decision_request_local_prefers_float64() { + assert_eq!( + Instance::choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + None + ) + .unwrap(), + Some(TraceReconcileDecision::UseRequestLocal( + ColumnDataType::Float64 + )) + ); + } + + #[test] + fn test_validate_trace_column_rewrites_rejects_invalid_string_parse() { + let rows = vec![Row { + values: vec![Value { + value_data: Some(ValueData::StringValue("not_a_number".to_string())), + }], + }]; + let pending_rewrites = vec![PendingTraceColumnRewrite { + col_idx: 0, + target_type: ColumnDataType::Int64, + column_name: "span_attributes.attr_int".to_string(), + }]; + + let err = validate_trace_column_rewrites(&rows, &pending_rewrites, "trace_type_atomicity") + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + } + + #[test] + fn test_wrap_trace_alter_failure_preserves_status_code() { + let err = wrap_trace_alter_failure( + servers::error::TableNotFoundSnafu { + catalog: "greptime".to_string(), + schema: "public".to_string(), + table: "trace_type_missing".to_string(), + } + .build(), + ); + + assert_eq!(err.status_code(), StatusCode::TableNotFound); + } } diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index caf5b2d11c..21e707e4d0 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -5523,6 +5523,202 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; + let existing_int_table_name = "trace_type_existing_int_widens_to_float"; + let existing_int_seed_req = make_trace_v1_request( + "type-existing-int", + vec![make_trace_v1_span( + "00000000000000000000000000000051", + "0000000000000051", + "existing-int-seed", + 
1_736_480_942_445_490_000, + 1_736_480_942_445_590_000, + vec![make_int_attr("attr_num", 1)], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_table_name, + existing_int_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_int_req = make_trace_v1_request( + "type-existing-int", + vec![ + make_trace_v1_span( + "00000000000000000000000000000052", + "0000000000000052", + "existing-int-upcast-int", + 1_736_480_942_445_600_000, + 1_736_480_942_445_700_000, + vec![make_int_attr("attr_num", 2)], + ), + make_trace_v1_span( + "00000000000000000000000000000053", + "0000000000000053", + "existing-int-upcast-float", + 1_736_480_942_445_710_000, + 1_736_480_942_445_810_000, + vec![make_double_attr("attr_num", 3.5)], + ), + ], + ); + let res = send_trace_v1_req(&client, existing_int_table_name, existing_int_req, false).await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_existing_int_widens_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\" from {} order by trace_id;", + existing_int_table_name + ), + r#"[["00000000000000000000000000000051",1.0],["00000000000000000000000000000052",2.0],["00000000000000000000000000000053",3.5]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_int_widens_type", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_int_widens_to_float' and column_name = 'span_attributes.attr_num';", + r#"[["span_attributes.attr_num","double","FIELD"]]"#, + ) + .await; + + let existing_int_atomic_table_name = "trace_type_existing_int_widen_atomic"; + let existing_int_atomic_seed_req = make_trace_v1_request( + "type-existing-int-atomic", + vec![make_trace_v1_span( + "00000000000000000000000000000054", + "0000000000000054", + "existing-int-atomic-seed", + 1_736_480_942_445_720_000, + 1_736_480_942_445_820_000, + vec![ + make_int_attr("attr_num", 1), + 
make_int_attr("attr_parse", 10), + ], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_atomic_table_name, + existing_int_atomic_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_int_atomic_req = make_trace_v1_request( + "type-existing-int-atomic", + vec![make_trace_v1_span( + "00000000000000000000000000000055", + "0000000000000055", + "existing-int-atomic-invalid", + 1_736_480_942_445_830_000, + 1_736_480_942_445_930_000, + vec![ + make_double_attr("attr_num", 3.5), + make_string_attr("attr_parse", "not_a_number"), + ], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_atomic_table_name, + existing_int_atomic_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + let body = ExportTraceServiceResponse::decode(res.bytes().await).unwrap(); + let partial_success = body.partial_success.as_ref().unwrap(); + assert_eq!(partial_success.rejected_spans, 1); + assert!( + partial_success + .error_message + .contains("Accepted 0 spans, rejected 1 spans"), + "unexpected partial success body: {body:?}" + ); + + validate_data( + "otlp_traces_v1_existing_int_widen_atomic_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\", \"span_attributes.attr_parse\" from {} order by trace_id;", + existing_int_atomic_table_name + ), + r#"[["00000000000000000000000000000054",1,10]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_int_widen_atomic_types", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_int_widen_atomic' and column_name in ('span_attributes.attr_num', 'span_attributes.attr_parse') order by column_name;", + r#"[["span_attributes.attr_num","bigint","FIELD"],["span_attributes.attr_parse","bigint","FIELD"]]"#, + ) + .await; + + let existing_int_float_only_table_name = "trace_type_existing_int_float_only"; + let existing_int_float_only_seed_req = 
make_trace_v1_request( + "type-existing-int-float-only", + vec![make_trace_v1_span( + "00000000000000000000000000000061", + "0000000000000061", + "existing-int-float-only-seed", + 1_736_480_942_445_820_000, + 1_736_480_942_445_920_000, + vec![make_int_attr("attr_num", 1)], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_float_only_table_name, + existing_int_float_only_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_int_float_only_req = make_trace_v1_request( + "type-existing-int-float-only", + vec![make_trace_v1_span( + "00000000000000000000000000000062", + "0000000000000062", + "existing-int-float-only-apply", + 1_736_480_942_445_930_000, + 1_736_480_942_446_030_000, + vec![make_double_attr("attr_num", 2.5)], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_float_only_table_name, + existing_int_float_only_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_existing_int_float_only_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\" from {} order by trace_id;", + existing_int_float_only_table_name + ), + r#"[["00000000000000000000000000000061",1.0],["00000000000000000000000000000062",2.5]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_int_float_only_type", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_int_float_only' and column_name = 'span_attributes.attr_num';", + r#"[["span_attributes.attr_num","double","FIELD"]]"#, + ) + .await; + validate_data( "otlp_traces_v1_type_coercion_rows", &client, From f0ea87f52fe2bc98c0e513523ed09f5efdd858d1 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Thu, 2 Apr 2026 19:17:42 -0700 Subject: [PATCH 071/195] fix: windows ci (#7905) * fix: windows ci Signed-off-by: jeremyhi * fix: typo Signed-off-by: jeremyhi * chore: use common create_temp_dir Signed-off-by: 
jeremyhi --------- Signed-off-by: jeremyhi --- src/cli/src/data/export_v2/data.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/cli/src/data/export_v2/data.rs b/src/cli/src/data/export_v2/data.rs index fe2ec7c051..25d70ee118 100644 --- a/src/cli/src/data/export_v2/data.rs +++ b/src/cli/src/data/export_v2/data.rs @@ -337,6 +337,7 @@ fn mask_secrets(sql: &str, secrets: &[Option]) -> String { #[cfg(test)] mod tests { use common_base::secrets::SecretString; + use common_test_util::temp_dir::create_temp_dir; use super::*; use crate::common::{PrefixedAzblobConnection, PrefixedGcsConnection, PrefixedOssConnection}; @@ -432,9 +433,21 @@ mod tests { #[test] fn test_build_copy_target_decodes_file_uri_path() { let storage = ObjectStoreConfig::default(); - let target = build_copy_target("file:///tmp/my%20backup", &storage, "public", 7) + let snapshot_root = create_temp_dir("my backup"); + let snapshot_uri = Url::from_file_path(snapshot_root.path()) + .expect("absolute platform path should convert to file:// URI") + .to_string(); + let expected = normalize_path(&format!( + "{}/{}", + snapshot_root.path().to_string_lossy(), + data_dir_for_schema_chunk("public", 7) + )); + let target = build_copy_target(&snapshot_uri, &storage, "public", 7) .expect("file:// copy target should be built"); - assert_eq!(target.location, "/tmp/my backup/data/public/7/"); + assert!(snapshot_uri.contains("%20")); + assert!(!target.location.contains("%20")); + assert!(target.location.contains("my backup")); + assert_eq!(target.location, expected); } } From a424ee1c0a381cbd25985056bc211d51c6e2cdea Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Fri, 3 Apr 2026 10:56:18 +0800 Subject: [PATCH 072/195] refactor(metric-engine): Refactor PendingRowsBatcher for better testability and benchmarking (#7902) * perf/schema-align: **Refactor and Enhance Error Handling in `pending_rows_batcher.rs`** - **Refactored `record_failure` 
Macro**: Moved the `record_failure` macro outside of the `flush_batch_physical` function to improve code reuse and maintainability. - **Enhanced Batch Transformation**: Introduced `transform_logical_batches_to_physical` function to handle the transformation of logical table batches into physical format. - **Batch Concatenation**: Added `concat_modified_batches` function to concatenate modified batches into a single batch. - **Region Write Splitting**: Implemented `split_and_encode_region_writes` function to split combined batches into region-specific writes based on partition rules. Signed-off-by: Lei, HUANG * perf/schema-align: Add tests for `transform_logical_batches_to_physical` in `pending_rows_batcher.rs` - Implemented `mock_tag_batch` function to create mock `RecordBatch` instances for testing. - Added multiple test cases for `transform_logical_batches_to_physical`: - `test_transform_logical_batches_to_physical_success`: Verifies successful transformation of logical to physical batches. - `test_transform_logical_batches_to_physical_taxonomy_failure`: Tests failure scenario when column IDs are missing. - `test_transform_logical_batches_to_physical_multiple_batches`: Checks handling of multiple batches. - `test_transform_logical_batches_to_physical_mixed_success_failure`: Tests mixed success and failure scenarios. Signed-off-by: Lei, HUANG * perf/schema-align: refactor `flush_batch_physical` for better testability Introduced several traits to abstract dependencies on CatalogManager, PartitionRuleManager, and NodeManager, enabling easier unit testing with mock implementations. - Added `PhysicalFlushCatalogProvider`, `PhysicalFlushPartitionProvider`, and `PhysicalFlushNodeRequester` traits. - Implemented adapters for existing managers to satisfy the new traits. - Refactored `flush_batch_physical` to use these traits instead of concrete manager references. - Modularized region write planning, resolution, and encoding into standalone functions. 
- Added comprehensive unit tests for the refactored logic, including edge cases for table lookup and region routing. Signed-off-by: Lei, HUANG * perf/schema-align: ### Enhance Error Handling and Simplify Code in `error.rs` and `pending_rows_batcher.rs` - **Error Handling Improvements**: - Added new error variants `Partition` and `MetricEngine` in `error.rs` to handle specific error cases. - Updated error propagation using `ResultExt` and `context` for better error messages and handling in `pending_rows_batcher.rs`. - **Code Simplification**: - Removed `FlushWriteResult` enum and refactored `flush_region_writes_concurrently` to return `Result<()>`. - Simplified error handling in `flush_batch_physical` and related functions by removing `first_error` and using `Result` for error propagation. - **Test Adjustments**: - Updated tests to align with the new error handling approach, ensuring they check for specific error messages and conditions. Signed-off-by: Lei, HUANG * perf/schema-align: refactor `PendingBatch` to use `Option` for cleaner state management Refactored `PendingBatch` in `pending_rows_batcher.rs` to use `Option` within the worker loop. This change simplifies initialization and cleanup logic by leveraging `Option::get_or_insert_with` and `Option::take`. - Updated `PendingBatch` fields `created_at` and `ctx` to be non-optional. - Modified `drain_batch` to take `&mut Option<PendingBatch>` and return the drained batch, removing the need for `flush_with_error`. - Simplified the worker loop logic for batch creation and flushing. - Added a unit test `test_drain_batch_takes_initialized_pending_batch_from_option` to verify the new draining logic. Signed-off-by: Lei, HUANG * perf/schema-align: share errors across waiters using `Arc` Enhanced error reporting in `PendingRowsBatcher` by using `Arc` in `FlushWaiter` and `WorkerCommand`. 
This allows the same error instance to be shared among all waiters of a batch, avoiding redundant error string conversions and providing more structured error information. - Added `SubmitBatch` variant to `Error` in `error.rs`. - Updated `FlushWaiter` and `WorkerCommand` to use `std::result::Result<(), Arc<Error>>`. - Refactored `notify_waiters` to distribute the shared `Arc`. - Added `SubmitBatchSnafu` context when receiving results from the worker. Signed-off-by: Lei, HUANG * perf/schema-align: export types for benchmarking Exported several internal types and traits from `pending_rows_batcher.rs` to enable external benchmarking of the physical batch flushing logic. - Made `PhysicalTableMetadata`, `PhysicalFlushCatalogProvider`, `PhysicalFlushPartitionProvider`, `PhysicalFlushNodeRequester`, `TableBatch`, and `flush_batch_physical` public. - Added a new criterion benchmark `flush_batch_physical.rs` to measure the performance of physical batch flushing with varying numbers of logical tables and rows per table. - Registered the new benchmark in `src/servers/Cargo.toml`. Signed-off-by: Lei, HUANG * fix: typo Signed-off-by: Lei, HUANG * refactor(servers): improve error handling and documentation in batcher Refactored error handling in `pending_rows_batcher.rs` by using `ArrowSnafu` for RecordBatch projection errors and simplified partition rule fetching. Added comprehensive documentation for `flush_batch_physical` and updated error display for `SubmitBatch`. - Added `Location` to `Arrow` error variant for better traceability. - Updated `SubmitBatch` display to include source error. - Replaced manual error mapping with `context(error::ArrowSnafu)` in `strip_partition_columns_from_batch`. - Added doc comments to `flush_batch_physical` outlining the pipeline steps. - Optimized capacity allocation in `transform_logical_batches_to_physical`. 
Signed-off-by: Lei, HUANG * refactor(servers): clarify physical table metadata and simplify planned batch Renamed `name_to_ids` to `col_name_to_ids` in `PhysicalTableMetadata` to better reflect its purpose. Refactored `PlannedRegionBatch` to use a `num_rows()` method instead of storing a redundant `row_count` field. - Updated `PhysicalTableMetadata` and its usages in `pending_rows_batcher.rs` and benchmarks. - Removed `row_count` field from `PlannedRegionBatch` and added a `num_rows()` helper. - Cleaned up manual `with_context` closures for table lookups. - Fixed a minor formatting issue in worker command processing. Signed-off-by: Lei, HUANG * refactor(servers): simplify flush write structs and centralize metrics Removed redundant `row_count` fields from `FlushRegionWrite` and `PlannedRegionBatch` (made the helper method test-only). Centralized the incrementing of `FLUSH_TOTAL` and `FLUSH_ROWS` metrics into `flush_batch` to avoid duplication and ensure consistency. - Removed `row_count` from `FlushRegionWrite` and `PlannedRegionBatch`. - Marked `PlannedRegionBatch::num_rows()` as `#[cfg(test)]`. - Updated `flush_batch` to handle `FLUSH_TOTAL` and `FLUSH_ROWS` metrics. - Simplified concurrent and sequential flush logic by removing local metric updates. - Cleaned up related tests to match the structural changes. 
Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- src/servers/Cargo.toml | 4 + src/servers/benches/flush_batch_physical.rs | 289 ++++ src/servers/src/error.rs | 23 + src/servers/src/pending_rows_batcher.rs | 1348 +++++++++++++------ 4 files changed, 1255 insertions(+), 409 deletions(-) create mode 100644 src/servers/benches/flush_batch_physical.rs diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 55bb41ee51..115636821b 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -178,3 +178,7 @@ harness = false [[bench]] name = "loki_labels" harness = false + +[[bench]] +name = "flush_batch_physical" +harness = false diff --git a/src/servers/benches/flush_batch_physical.rs b/src/servers/benches/flush_batch_physical.rs new file mode 100644 index 0000000000..a3d190adf2 --- /dev/null +++ b/src/servers/benches/flush_batch_physical.rs @@ -0,0 +1,289 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use api::region::RegionResponse; +use api::v1::meta::Peer; +use api::v1::region::RegionRequest; +use arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; +use arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema, TimeUnit}; +use arrow::record_batch::RecordBatch; +use async_trait::async_trait; +use catalog::error::Result as CatalogResult; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datatypes::prelude::ConcreteDataType; +use datatypes::schema::{ColumnSchema as DtColumnSchema, Schema as DtSchema}; +use partition::error::Result as PartitionResult; +use partition::partition::{PartitionRule, PartitionRuleRef, RegionMask}; +use servers::error::{self, Result}; +use servers::pending_rows_batcher::{ + PhysicalFlushCatalogProvider, PhysicalFlushNodeRequester, PhysicalFlushPartitionProvider, + PhysicalTableMetadata, TableBatch, flush_batch_physical, +}; +use store_api::storage::RegionId; +use table::test_util::table_info::test_table_info; +use tokio::runtime::Runtime; + +// --------------------------------------------------------------------------- +// Mock implementations (memory-backed, no I/O) +// --------------------------------------------------------------------------- + +struct BenchCatalogProvider { + table: PhysicalTableMetadata, +} + +#[async_trait] +impl PhysicalFlushCatalogProvider for BenchCatalogProvider { + async fn physical_table( + &self, + _catalog: &str, + _schema: &str, + _table_name: &str, + _query_ctx: &session::context::QueryContext, + ) -> CatalogResult> { + Ok(Some(self.table.clone())) + } +} + +struct BenchPartitionProvider; + +struct SingleRegionPartitionRule; + +impl PartitionRule for SingleRegionPartitionRule { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn partition_columns(&self) -> &[String] { + &[] + } + + fn find_region( + &self, + _values: &[datatypes::prelude::Value], + ) -> PartitionResult { + 
Ok(1) + } + + fn split_record_batch( + &self, + record_batch: &RecordBatch, + ) -> PartitionResult> { + let n = record_batch.num_rows(); + Ok(HashMap::from([( + 1, + RegionMask::new(arrow::array::BooleanArray::from(vec![true; n]), n), + )])) + } +} + +#[async_trait] +impl PhysicalFlushPartitionProvider for BenchPartitionProvider { + async fn find_table_partition_rule( + &self, + _table_info: &table::metadata::TableInfo, + ) -> PartitionResult { + Ok(Arc::new(SingleRegionPartitionRule)) + } + + async fn find_region_leader(&self, _region_id: RegionId) -> Result { + Ok(Peer { + id: 1, + addr: "bench-node".to_string(), + }) + } +} + +struct BenchNodeRequester; + +#[async_trait] +impl PhysicalFlushNodeRequester for BenchNodeRequester { + async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> error::Result { + Ok(RegionResponse::new(0)) + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn make_physical_table_metadata(num_tags: usize) -> PhysicalTableMetadata { + let mut columns = vec![ + DtColumnSchema::new("__primary_key", ConcreteDataType::binary_datatype(), false), + DtColumnSchema::new( + "greptime_timestamp", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + DtColumnSchema::new("greptime_value", ConcreteDataType::float64_datatype(), true), + ]; + + let mut name_to_ids = HashMap::new(); + let mut column_ids = vec![0u32, 1, 2]; + + for i in 0..num_tags { + let tag_name = format!("tag{}", i); + let col_id = (i + 3) as u32; + columns.push(DtColumnSchema::new( + &tag_name, + ConcreteDataType::string_datatype(), + true, + )); + name_to_ids.insert(tag_name, col_id); + column_ids.push(col_id); + } + + let schema = Arc::new(DtSchema::try_new(columns).unwrap()); + let mut table_info = test_table_info(1, "phy", "public", "greptime", schema); + table_info.meta.column_ids = column_ids; + + PhysicalTableMetadata { + 
table_info: Arc::new(table_info), + col_name_to_ids: Some(name_to_ids), + } +} + +fn make_tag_batch(tag_names: &[&str], num_rows: usize) -> RecordBatch { + let mut fields = vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + ]; + for tag in tag_names { + fields.push(Field::new(*tag, ArrowDataType::Utf8, true)); + } + + let schema = Arc::new(ArrowSchema::new(fields)); + + let ts: Vec = (0..num_rows as i64).collect(); + let vals: Vec = (0..num_rows).map(|i| i as f64).collect(); + + let mut arrays: Vec> = vec![ + Arc::new(TimestampMillisecondArray::from(ts)), + Arc::new(Float64Array::from(vals)), + ]; + + for (tag_idx, _tag) in tag_names.iter().enumerate() { + let values: Vec = (0..num_rows) + .map(|i| format!("val-{}-{}", tag_idx, i)) + .collect(); + arrays.push(Arc::new(StringArray::from(values))); + } + + RecordBatch::try_new(schema, arrays).unwrap() +} + +fn make_table_batches( + num_logical_tables: usize, + rows_per_table: usize, + tag_names: &[&str], +) -> Vec { + (0..num_logical_tables) + .map(|i| { + let batch = make_tag_batch(tag_names, rows_per_table); + let row_count = batch.num_rows(); + TableBatch { + table_name: format!("logical_{}", i), + table_id: (100 + i) as u32, + batches: vec![batch], + row_count, + } + }) + .collect() +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +fn bench_flush_batch_physical(c: &mut Criterion) { + let rt = Runtime::new().unwrap(); + let ctx = session::context::QueryContext::arc(); + + let num_tags = 5; + let tag_names: Vec = (0..num_tags).map(|i| format!("tag{}", i)).collect(); + let tag_refs: Vec<&str> = tag_names.iter().map(|s| s.as_str()).collect(); + + let catalog_provider = BenchCatalogProvider { + table: make_physical_table_metadata(num_tags), + }; + let 
partition_provider = BenchPartitionProvider; + let node_requester = BenchNodeRequester; + + let mut group = c.benchmark_group("flush_batch_physical"); + + // Vary the number of logical tables + for num_tables in [1, 10, 50, 100] { + let rows_per_table = 100; + let table_batches = make_table_batches(num_tables, rows_per_table, &tag_refs); + + group.bench_with_input( + BenchmarkId::new("tables", num_tables), + &table_batches, + |b, batches| { + b.iter(|| { + rt.block_on(async { + flush_batch_physical( + batches, + "phy", + &ctx, + &partition_provider, + &node_requester, + &catalog_provider, + ) + .await + .unwrap(); + }); + }); + }, + ); + } + + // Vary the number of rows per table + for rows_per_table in [10, 100, 1000, 5000] { + let num_tables = 10; + let table_batches = make_table_batches(num_tables, rows_per_table, &tag_refs); + + group.bench_with_input( + BenchmarkId::new("rows_per_table", rows_per_table), + &table_batches, + |b, batches| { + b.iter(|| { + rt.block_on(async { + flush_batch_physical( + batches, + "phy", + &ctx, + &partition_provider, + &node_requester, + &catalog_provider, + ) + .await + .unwrap(); + }); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_flush_batch_physical); +criterion_main!(benches); diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 682288b271..8a3c554058 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -15,6 +15,7 @@ use std::any::Any; use std::net::SocketAddr; use std::string::FromUtf8Error; +use std::sync::Arc; use axum::http::StatusCode as HttpStatusCode; use axum::response::{IntoResponse, Response}; @@ -51,6 +52,8 @@ pub enum Error { Arrow { #[snafu(source)] error: arrow_schema::ArrowError, + #[snafu(implicit)] + location: Location, }, #[snafu(display("Internal error: {}", err_msg))] @@ -685,6 +688,23 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(transparent)] + Partition { + source: partition::error::Error, + 
#[snafu(implicit)] + location: Location, + }, + + #[snafu(transparent)] + MetricEngine { + source: metric_engine::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to submit batch: {}", source))] + SubmitBatch { source: Arc }, } pub type Result = std::result::Result; @@ -818,6 +838,9 @@ impl ErrorExt for Error { MemoryLimitExceeded { .. } => StatusCode::RateLimited, GreptimeProto { source, .. } => source.status_code(), + Partition { source, .. } => source.status_code(), + MetricEngine { source, .. } => source.status_code(), + SubmitBatch { source, .. } => source.status_code(), } } diff --git a/src/servers/src/pending_rows_batcher.rs b/src/servers/src/pending_rows_batcher.rs index b6e07d2a81..4cd8331636 100644 --- a/src/servers/src/pending_rows_batcher.rs +++ b/src/servers/src/pending_rows_batcher.rs @@ -31,15 +31,17 @@ use common_grpc::flight::{FlightEncoder, FlightMessage}; use common_meta::node_manager::NodeManagerRef; use common_query::prelude::{GREPTIME_PHYSICAL_TABLE, greptime_timestamp, greptime_value}; use common_telemetry::tracing_context::TracingContext; -use common_telemetry::{debug, error, warn}; +use common_telemetry::{debug, warn}; use dashmap::DashMap; use dashmap::mapref::entry::Entry; use metric_engine::batch_modifier::{TagColumnInfo, modify_batch_sparse}; use partition::manager::PartitionRuleManagerRef; +use partition::partition::PartitionRuleRef; use session::context::QueryContextRef; use smallvec::SmallVec; -use snafu::{OptionExt, ensure}; +use snafu::{OptionExt, ResultExt, ensure}; use store_api::storage::{RegionId, TableId}; +use table::metadata::{TableInfo, TableInfoRef}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, broadcast, mpsc, oneshot}; use crate::error; @@ -86,6 +88,116 @@ pub trait PendingRowsSchemaAlterer: Send + Sync { pub type PendingRowsSchemaAltererRef = Arc; +#[derive(Clone)] +pub struct PhysicalTableMetadata { + pub table_info: TableInfoRef, + /// Mapping from column name to column 
id + pub col_name_to_ids: Option>, +} + +#[async_trait] +pub trait PhysicalFlushCatalogProvider: Send + Sync { + async fn physical_table( + &self, + catalog: &str, + schema: &str, + table_name: &str, + query_ctx: &session::context::QueryContext, + ) -> catalog::error::Result>; +} + +#[async_trait] +pub trait PhysicalFlushPartitionProvider: Send + Sync { + async fn find_table_partition_rule( + &self, + table_info: &TableInfo, + ) -> partition::error::Result; + + async fn find_region_leader(&self, region_id: RegionId) -> Result; +} + +#[async_trait] +pub trait PhysicalFlushNodeRequester: Send + Sync { + async fn handle( + &self, + peer: &Peer, + request: RegionRequest, + ) -> Result; +} + +#[derive(Clone)] +struct CatalogManagerPhysicalFlushAdapter { + catalog_manager: CatalogManagerRef, +} + +#[async_trait] +impl PhysicalFlushCatalogProvider for CatalogManagerPhysicalFlushAdapter { + async fn physical_table( + &self, + catalog: &str, + schema: &str, + table_name: &str, + query_ctx: &session::context::QueryContext, + ) -> catalog::error::Result> { + self.catalog_manager + .table(catalog, schema, table_name, Some(query_ctx)) + .await + .map(|table| { + table.map(|table| { + let table_info = table.table_info(); + let name_to_ids = table_info.name_to_ids(); + PhysicalTableMetadata { + table_info, + col_name_to_ids: name_to_ids, + } + }) + }) + } +} + +#[derive(Clone)] +struct PartitionManagerPhysicalFlushAdapter { + partition_manager: PartitionRuleManagerRef, +} + +#[async_trait] +impl PhysicalFlushPartitionProvider for PartitionManagerPhysicalFlushAdapter { + async fn find_table_partition_rule( + &self, + table_info: &TableInfo, + ) -> partition::error::Result { + self.partition_manager + .find_table_partition_rule(table_info) + .await + .map(|(rule, _)| rule) + } + + async fn find_region_leader(&self, region_id: RegionId) -> Result { + let peer = self.partition_manager.find_region_leader(region_id).await?; + Ok(peer) + } +} + +#[derive(Clone)] +struct 
NodeManagerPhysicalFlushAdapter { + node_manager: NodeManagerRef, +} + +#[async_trait] +impl PhysicalFlushNodeRequester for NodeManagerPhysicalFlushAdapter { + async fn handle( + &self, + peer: &Peer, + request: RegionRequest, + ) -> error::Result { + let datanode = self.node_manager.datanode(peer).await; + datanode + .handle(request) + .await + .context(error::CommonMetaSnafu) + } +} + #[derive(Debug, Clone, Hash, Eq, PartialEq)] struct BatchKey { catalog: String, @@ -94,11 +206,11 @@ struct BatchKey { } #[derive(Debug)] -struct TableBatch { - table_name: String, - table_id: TableId, - batches: Vec, - row_count: usize, +pub struct TableBatch { + pub table_name: String, + pub table_id: TableId, + pub batches: Vec, + pub row_count: usize, } /// Intermediate planning state for resolving and preparing logical tables @@ -114,14 +226,14 @@ struct TableResolutionPlan { struct PendingBatch { tables: HashMap, - created_at: Option, + created_at: Instant, total_row_count: usize, - ctx: Option, + ctx: QueryContextRef, waiters: Vec, } struct FlushWaiter { - response_tx: oneshot::Sender>, + response_tx: oneshot::Sender>>, _permit: OwnedSemaphorePermit, } @@ -142,7 +254,7 @@ enum WorkerCommand { table_batches: Vec<(String, u32, RecordBatch)>, total_rows: usize, ctx: QueryContextRef, - response_tx: oneshot::Sender>, + response_tx: oneshot::Sender>>, _permit: OwnedSemaphorePermit, }, } @@ -301,7 +413,9 @@ impl PendingRowsBatcher { .await .map_err(|_| error::BatcherChannelClosedSnafu.build())? 
}; - result.map(|()| total_rows as u64) + result + .context(error::SubmitBatchSnafu) + .map(|()| total_rows as u64) } else { Ok(total_rows as u64) } @@ -706,12 +820,12 @@ impl Drop for PendingRowsBatcher { } impl PendingBatch { - fn new() -> Self { + fn new(ctx: QueryContextRef) -> Self { Self { tables: HashMap::new(), - created_at: None, + created_at: Instant::now(), total_row_count: 0, - ctx: None, + ctx, waiters: Vec::new(), } } @@ -733,7 +847,7 @@ fn start_worker( flush_semaphore: Arc, ) { tokio::spawn(async move { - let mut batch = PendingBatch::new(); + let mut batch = None; let mut interval = tokio::time::interval(flush_interval); let mut shutdown_rx = shutdown.subscribe(); let idle_deadline = tokio::time::Instant::now() + worker_idle_timeout; @@ -747,16 +861,15 @@ fn start_worker( Some(WorkerCommand::Submit { table_batches, total_rows, ctx, response_tx, _permit }) => { idle_timer.as_mut().reset(tokio::time::Instant::now() + worker_idle_timeout); - if batch.total_row_count == 0 { - batch.created_at = Some(Instant::now()); - batch.ctx = Some(ctx); + let pending_batch = batch.get_or_insert_with(||{ PENDING_BATCHES.inc(); - } + PendingBatch::new(ctx) + }); - batch.waiters.push(FlushWaiter { response_tx, _permit }); + pending_batch.waiters.push(FlushWaiter { response_tx, _permit }); for (table_name, table_id, record_batch) in table_batches { - let entry = batch.tables.entry(table_name.clone()).or_insert_with(|| TableBatch { + let entry = pending_batch.tables.entry(table_name.clone()).or_insert_with(|| TableBatch { table_name, table_id, batches: Vec::new(), @@ -766,10 +879,10 @@ fn start_worker( entry.batches.push(record_batch); } - batch.total_row_count += total_rows; + pending_batch.total_row_count += total_rows; PENDING_ROWS.add(total_rows as i64); - if batch.total_row_count >= max_batch_rows + if pending_batch.total_row_count >= max_batch_rows && let Some(flush) = drain_batch(&mut batch) { spawn_flush( flush, @@ -794,7 +907,10 @@ fn start_worker( } } _ = &mut 
idle_timer => { - if !should_close_worker_on_idle_timeout(batch.total_row_count, rx.len()) { + if !should_close_worker_on_idle_timeout( + batch.as_ref().map_or(0, |batch| batch.total_row_count), + rx.len(), + ) { idle_timer .as_mut() .reset(tokio::time::Instant::now() + worker_idle_timeout); @@ -810,9 +926,9 @@ fn start_worker( break; } _ = interval.tick() => { - if let Some(created_at) = batch.created_at - && batch.total_row_count > 0 - && created_at.elapsed() >= flush_interval + if batch + .as_ref() + .is_some_and(|batch| batch.created_at.elapsed() >= flush_interval) && let Some(flush) = drain_batch(&mut batch) { spawn_flush( flush, @@ -862,24 +978,16 @@ fn should_close_worker_on_idle_timeout(total_row_count: usize, queued_requests: total_row_count == 0 && queued_requests == 0 } -fn drain_batch(batch: &mut PendingBatch) -> Option { - if batch.total_row_count == 0 { +fn drain_batch(batch: &mut Option) -> Option { + let batch = batch.take()?; + let total_row_count = batch.total_row_count; + + if total_row_count == 0 { return None; } - let ctx = match batch.ctx.take() { - Some(ctx) => ctx, - None => { - flush_with_error(batch, "Pending batch missing context"); - return None; - } - }; - - let total_row_count = batch.total_row_count; - let table_batches = std::mem::take(&mut batch.tables).into_values().collect(); - let waiters = std::mem::take(&mut batch.waiters); - batch.total_row_count = 0; - batch.created_at = None; + let table_batches = batch.tables.into_values().collect(); + let waiters = batch.waiters; PENDING_ROWS.sub(total_row_count as i64); PENDING_BATCHES.dec(); @@ -887,7 +995,7 @@ fn drain_batch(batch: &mut PendingBatch) -> Option { Some(FlushBatch { table_batches, total_row_count, - ctx, + ctx: batch.ctx, waiters, }) } @@ -914,15 +1022,25 @@ async fn spawn_flush( } struct FlushRegionWrite { - region_id: RegionId, - row_count: usize, datanode: Peer, request: RegionRequest, } -enum FlushWriteResult { - Success { row_count: usize }, - Failed { row_count: 
usize, message: String }, +struct PlannedRegionBatch { + region_id: RegionId, + batch: RecordBatch, +} + +#[cfg(test)] +impl PlannedRegionBatch { + fn num_rows(&self) -> usize { + self.batch.num_rows() + } +} + +struct ResolvedRegionBatch { + planned: PlannedRegionBatch, + datanode: Peer, } fn should_dispatch_concurrently(region_write_count: usize) -> bool { @@ -1045,65 +1163,35 @@ fn strip_partition_columns_from_batch(batch: RecordBatch) -> Result } ); let essential_indices: Vec = (0..PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT).collect(); - batch - .project(&essential_indices) - .map_err(|err| Error::Internal { - err_msg: format!("Failed to project essential columns from RecordBatch: {err}"), - }) + batch.project(&essential_indices).context(error::ArrowSnafu) } async fn flush_region_writes_concurrently( - node_manager: NodeManagerRef, + node_manager: &(impl PhysicalFlushNodeRequester + ?Sized), writes: Vec, -) -> Vec { +) -> Result<()> { if !should_dispatch_concurrently(writes.len()) { - let mut results = Vec::with_capacity(writes.len()); for write in writes { - let datanode = node_manager.datanode(&write.datanode).await; let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_write_region"]) .start_timer(); - match datanode.handle(write.request).await { - Ok(_) => results.push(FlushWriteResult::Success { - row_count: write.row_count, - }), - Err(err) => results.push(FlushWriteResult::Failed { - row_count: write.row_count, - message: format!( - "Bulk insert flush failed for region {}: {:?}", - write.region_id, err - ), - }), - } + node_manager.handle(&write.datanode, write.request).await?; } - return results; + return Ok(()); } - let write_futures = writes.into_iter().map(|write| { - let node_manager = node_manager.clone(); - async move { - let datanode = node_manager.datanode(&write.datanode).await; - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_write_region"]) - .start_timer(); + let write_futures = 
writes.into_iter().map(|write| async move { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_write_region"]) + .start_timer(); - match datanode.handle(write.request).await { - Ok(_) => FlushWriteResult::Success { - row_count: write.row_count, - }, - Err(err) => FlushWriteResult::Failed { - row_count: write.row_count, - message: format!( - "Bulk insert flush failed for region {}: {:?}", - write.region_id, err - ), - }, - } - } + node_manager.handle(&write.datanode, write.request).await?; + Ok::<_, Error>(()) }); // todo(hl): should be bounded. - futures::future::join_all(write_futures).await + futures::future::try_join_all(write_futures).await?; + Ok(()) } async fn flush_batch( @@ -1119,7 +1207,6 @@ async fn flush_batch( waiters, } = flush; let start = Instant::now(); - let mut first_error: Option = None; // Physical-table-level flush: transform all logical table batches // into physical format and write them together. @@ -1127,169 +1214,148 @@ async fn flush_batch( .extension(PHYSICAL_TABLE_KEY) .unwrap_or(GREPTIME_PHYSICAL_TABLE) .to_string(); - flush_batch_physical( + let partition_provider = PartitionManagerPhysicalFlushAdapter { partition_manager }; + let node_requester = NodeManagerPhysicalFlushAdapter { node_manager }; + let catalog_provider = CatalogManagerPhysicalFlushAdapter { catalog_manager }; + let result = flush_batch_physical( &table_batches, - total_row_count, &physical_table_name, &ctx, - &partition_manager, - &node_manager, - &catalog_manager, - &mut first_error, + &partition_provider, + &node_requester, + &catalog_provider, ) .await; let elapsed = start.elapsed().as_secs_f64(); FLUSH_ELAPSED.observe(elapsed); + + if result.is_err() { + FLUSH_FAILURES.inc(); + FLUSH_DROPPED_ROWS.inc_by(total_row_count as u64); + } else { + FLUSH_TOTAL.inc(); + FLUSH_ROWS.observe(total_row_count as f64); + } + debug!( "Pending rows batch flushed, total rows: {}, elapsed time: {}s", total_row_count, elapsed ); - 
notify_waiters(waiters, &first_error); + notify_waiters(waiters, result); } -/// Attempts to flush all table batches by transforming them into the physical -/// table format (sparse primary key encoding) and writing directly to the -/// physical data regions. +/// Flushes a batch of logical table rows by transforming them into the physical table format +/// and writing them to the appropriate datanode regions. /// -/// This is the only flush path. Any failure in resolving or transforming the -/// physical flush inputs is recorded as flush failure and reported to waiters. -#[allow(clippy::too_many_arguments)] -async fn flush_batch_physical( +/// This function performs the end-to-end physical flush pipeline: +/// 1. Resolves the physical table metadata and column ID mapping. +/// 2. Fetches the physical table's partition rule. +/// 3. Transforms each logical table batch into the physical (sparse primary key) format. +/// 4. Concatenates all transformed batches into a single combined batch. +/// 5. Splits the combined batch by partition rule and sends region write requests +/// concurrently to the target datanodes. +pub async fn flush_batch_physical( table_batches: &[TableBatch], - total_row_count: usize, physical_table_name: &str, ctx: &QueryContextRef, - partition_manager: &PartitionRuleManagerRef, - node_manager: &NodeManagerRef, - catalog_manager: &CatalogManagerRef, - first_error: &mut Option, -) { - macro_rules! record_failure { - ($row_count:expr, $msg:expr) => {{ - let msg = $msg; - if first_error.is_none() { - *first_error = Some(msg.clone()); - } - mark_flush_failure($row_count, &msg); - }}; - } - + partition_manager: &(impl PhysicalFlushPartitionProvider + ?Sized), + node_manager: &(impl PhysicalFlushNodeRequester + ?Sized), + catalog_manager: &(impl PhysicalFlushCatalogProvider + ?Sized), +) -> Result<()> { // 1. 
Resolve the physical table and get column ID mapping let physical_table = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_resolve_table"]) .start_timer(); - match catalog_manager - .table( + catalog_manager + .physical_table( ctx.current_catalog(), &ctx.current_schema(), physical_table_name, - Some(ctx.as_ref()), + ctx.as_ref(), ) - .await - { - Ok(Some(table)) => table, - Ok(None) => { - record_failure!( - total_row_count, - format!( - "Physical table '{}' not found during pending flush", - physical_table_name - ) - ); - return; - } - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to resolve physical table '{}' for pending flush: {:?}", - physical_table_name, err - ) - ); - return; - } - } + .await? + .with_context(|| error::InternalSnafu { + err_msg: format!( + "Physical table '{}' not found during pending flush", + physical_table_name + ), + })? }; - let physical_table_info = physical_table.table_info(); - let name_to_ids = match physical_table_info.name_to_ids() { - Some(ids) => ids, - None => { - record_failure!( - total_row_count, - format!( - "Physical table '{}' has no column IDs for pending flush", - physical_table_name - ) - ); - return; - } - }; + let physical_table_info = physical_table.table_info; + let name_to_ids = physical_table + .col_name_to_ids + .with_context(|| error::InternalSnafu { + err_msg: format!( + "Physical table '{}' has no column IDs for pending flush", + physical_table_name + ), + })?; // 2. 
Get the physical table's partition rule (one lookup instead of N) let partition_rule = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_fetch_partition_rule"]) .start_timer(); - match partition_manager - .find_table_partition_rule(&physical_table_info) - .await - { - Ok(rule) => rule, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to fetch partition rule for physical table '{}': {:?}", - physical_table_name, err - ) - ); - return; - } - } + partition_manager + .find_table_partition_rule(physical_table_info.as_ref()) + .await? }; - let partition_columns = partition_rule.0.partition_columns(); + let partition_columns = partition_rule.partition_columns(); let partition_columns_set: HashSet<&str> = partition_columns.iter().map(String::as_str).collect(); // 3. Transform each logical table batch into physical format - let mut modified_batches: Vec = Vec::with_capacity(table_batches.len()); - let mut modified_row_count: usize = 0; + let modified_batches = + transform_logical_batches_to_physical(table_batches, &name_to_ids, &partition_columns_set)?; + + // 4. Concatenate all modified batches (all share the same physical schema) + let combined_batch = concat_modified_batches(&modified_batches)?; + + // 5. Split by physical partition rule and send to regions + let physical_table_id = physical_table_info.table_id(); + let planned_batches = plan_region_batches( + combined_batch, + physical_table_id, + partition_rule.as_ref(), + partition_columns, + )?; + + let resolved_batches = resolve_region_targets(planned_batches, partition_manager).await?; + let region_writes = encode_region_write_requests(resolved_batches)?; + flush_region_writes_concurrently(node_manager, region_writes).await +} + +/// Transforms logical table batches into physical format (sparse primary key encoding). 
+/// +/// It identifies tag columns and essential columns (timestamp, value) for each logical batch +/// and applies sparse primary key modification. +fn transform_logical_batches_to_physical( + table_batches: &[TableBatch], + name_to_ids: &HashMap, + partition_columns_set: &HashSet<&str>, +) -> Result> { + let mut modified_batches: Vec = + Vec::with_capacity(table_batches.iter().map(|b| b.batches.len()).sum()); let mut modify_elapsed = Duration::ZERO; let mut columns_taxonomy_elapsed = Duration::ZERO; - 'next_table: for table_batch in table_batches { + for table_batch in table_batches { let table_id = table_batch.table_id; - // Transform each chunk to physical format directly, avoiding an - // intermediate concat_batches per logical table. for batch in &table_batch.batches { - // Identify tag columns and non-tag columns from the logical batch schema. - // Chunks within a table_batch may have different schemas if new tag columns - // are added between submits. - // In prom batches, Float64 = value, Timestamp = timestamp, Utf8 = tags. 
let batch_schema = batch.schema(); let start = Instant::now(); - let (tag_columns, essential_col_indices) = match columns_taxonomy( + let (tag_columns, essential_col_indices) = columns_taxonomy( &batch_schema, &table_batch.table_name, - &name_to_ids, - &partition_columns_set, - ) { - Ok(columns) => columns, - Err(err) => { - warn!( - "Failed to resolve columns for logical table '{}': {:?}", - table_batch.table_name, err - ); - record_failure!(table_batch.row_count, err.to_string()); - continue 'next_table; - } - }; + name_to_ids, + partition_columns_set, + )?; columns_taxonomy_elapsed += start.elapsed(); if tag_columns.is_empty() && essential_col_indices.is_empty() { @@ -1298,30 +1364,16 @@ async fn flush_batch_physical( let modified = { let start = Instant::now(); - match modify_batch_sparse( + let batch = modify_batch_sparse( batch.clone(), table_id, &tag_columns, &essential_col_indices, - ) { - Ok(batch) => { - modify_elapsed += start.elapsed(); - batch - } - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to modify batch for logical table '{}': {:?}", - table_batch.table_name, err - ) - ); - continue 'next_table; - } - } + )?; + modify_elapsed += start.elapsed(); + batch }; - modified_row_count += modified.num_rows(); modified_batches.push(modified); } } @@ -1333,147 +1385,130 @@ async fn flush_batch_physical( .with_label_values(&["flush_physical_columns_taxonomy"]) .observe(columns_taxonomy_elapsed.as_secs_f64()); - if modified_batches.is_empty() { - if first_error.is_none() { - record_failure!( - total_row_count, - format!( - "No batches can be transformed for physical table '{}' during pending flush", - physical_table_name - ) - ); + ensure!( + !modified_batches.is_empty(), + error::InternalSnafu { + err_msg: "No batches can be transformed during pending flush", } - return; + ); + Ok(modified_batches) +} + +/// Concatenates all modified batches into a single large batch. 
+/// +/// All modified batches share the same physical schema. +fn concat_modified_batches(modified_batches: &[RecordBatch]) -> Result { + let combined_schema = modified_batches[0].schema(); + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_concat_all"]) + .start_timer(); + concat_batches(&combined_schema, modified_batches).context(error::ArrowSnafu) +} + +fn split_combined_batch_by_region( + combined_batch: &RecordBatch, + partition_rule: &dyn partition::partition::PartitionRule, +) -> Result> { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_split_record_batch"]) + .start_timer(); + let map = partition_rule.split_record_batch(combined_batch)?; + Ok(map) +} + +fn prepare_physical_region_routing_batch( + combined_batch: RecordBatch, + partition_columns: &[String], +) -> Result { + if partition_columns.is_empty() { + return Ok(combined_batch); + } + strip_partition_columns_from_batch(combined_batch) +} + +fn plan_region_batch( + stripped_batch: &RecordBatch, + physical_table_id: TableId, + region_number: u32, + mask: &partition::partition::RegionMask, +) -> Result> { + if mask.select_none() { + return Ok(None); } - // 4. Concatenate all modified batches (all share the same physical schema) - let combined_batch = { - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_physical_concat_all"]) - .start_timer(); - let combined_schema = modified_batches[0].schema(); - // todo(hl): maybe limit max rows to concat. - match concat_batches(&combined_schema, &modified_batches) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - modified_row_count, - format!("Failed to concat modified batches: {:?}", err) - ); - return; - } - } - }; - - // 5. 
Split by physical partition rule and send to regions - let physical_table_id = physical_table_info.table_id(); - let region_masks = { - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_physical_split_record_batch"]) - .start_timer(); - match partition_rule.0.split_record_batch(&combined_batch) { - Ok(masks) => masks, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to split combined batch for physical table '{}': {:?}", - physical_table_name, err - ) - ); - return; - } - } - }; - - let stripped_batch = if partition_columns.is_empty() { - combined_batch + let region_batch = if mask.select_all() { + stripped_batch.clone() } else { - // Strip partition columns before encoding and sending requests. - match strip_partition_columns_from_batch(combined_batch) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to strip partition columns for physical table '{}': {:?}", - physical_table_name, err - ) - ); - return; - } - } + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_filter_record_batch"]) + .start_timer(); + filter_record_batch(stripped_batch, mask.array()).context(error::ArrowSnafu)? 
}; - let mut region_writes = Vec::new(); + let row_count = region_batch.num_rows(); + if row_count == 0 { + return Ok(None); + } + + Ok(Some(PlannedRegionBatch { + region_id: RegionId::new(physical_table_id, region_number), + batch: region_batch, + })) +} + +fn plan_region_batches( + combined_batch: RecordBatch, + physical_table_id: TableId, + partition_rule: &dyn partition::partition::PartitionRule, + partition_columns: &[String], +) -> Result> { + let region_masks = split_combined_batch_by_region(&combined_batch, partition_rule)?; + let stripped_batch = prepare_physical_region_routing_batch(combined_batch, partition_columns)?; + + let mut planned_batches = Vec::new(); for (region_number, mask) in region_masks { - if mask.select_none() { - continue; + if let Some(planned_batch) = + plan_region_batch(&stripped_batch, physical_table_id, region_number, &mask)? + { + planned_batches.push(planned_batch); } + } - let region_batch = if mask.select_all() { - stripped_batch.clone() - } else { - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_physical_filter_record_batch"]) - .start_timer(); - match filter_record_batch(&stripped_batch, mask.array()) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to filter combined batch for physical table '{}': {:?}", - physical_table_name, err - ) - ); - continue; - } - } - }; + Ok(planned_batches) +} - let row_count = region_batch.num_rows(); - if row_count == 0 { - continue; - } - - let region_id = RegionId::new(physical_table_id, region_number); +async fn resolve_region_targets( + planned_batches: Vec, + partition_manager: &(impl PhysicalFlushPartitionProvider + ?Sized), +) -> Result> { + let mut resolved_batches = Vec::with_capacity(planned_batches.len()); + for planned in planned_batches { let datanode = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_resolve_region_leader"]) .start_timer(); - match 
partition_manager.find_region_leader(region_id).await { - Ok(peer) => peer, - Err(err) => { - record_failure!( - row_count, - format!( - "Failed to resolve region leader for physical region {}: {:?}", - region_id, err - ) - ); - continue; - } - } + partition_manager + .find_region_leader(planned.region_id) + .await? }; + resolved_batches.push(ResolvedRegionBatch { planned, datanode }); + } + + Ok(resolved_batches) +} + +fn encode_region_write_requests( + resolved_batches: Vec, +) -> Result> { + let mut region_writes = Vec::with_capacity(resolved_batches.len()); + for resolved in resolved_batches { + let region_id = resolved.planned.region_id; let (schema_bytes, data_header, payload) = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_encode_ipc"]) .start_timer(); - match record_batch_to_ipc(region_batch) { - Ok(encoded) => encoded, - Err(err) => { - record_failure!( - row_count, - format!( - "Failed to encode Arrow IPC for physical region {}: {:?}", - region_id, err - ) - ); - continue; - } - } + record_batch_to_ipc(resolved.planned.batch)? 
}; let request = RegionRequest { @@ -1493,65 +1528,25 @@ async fn flush_batch_physical( }; region_writes.push(FlushRegionWrite { - region_id, - row_count, - datanode, + datanode: resolved.datanode, request, }); } - for result in flush_region_writes_concurrently(node_manager.clone(), region_writes).await { - match result { - FlushWriteResult::Success { row_count } => { - FLUSH_TOTAL.inc(); - FLUSH_ROWS.observe(row_count as f64); - } - FlushWriteResult::Failed { row_count, message } => { - record_failure!(row_count, message); - } - } - } + Ok(region_writes) } -fn notify_waiters(waiters: Vec, first_error: &Option) { +fn notify_waiters(waiters: Vec, result: Result<()>) { + let shared_result = result.map_err(Arc::new); for waiter in waiters { - let result = match first_error { - Some(err_msg) => Err(Error::Internal { - err_msg: err_msg.clone(), - }), - None => Ok(()), - }; - let _ = waiter.response_tx.send(result); + let _ = waiter.response_tx.send(match &shared_result { + Ok(()) => Ok(()), + Err(error) => Err(Arc::clone(error)), + }); // waiter._permit is dropped here, releasing the inflight semaphore slot } } -fn mark_flush_failure(row_count: usize, message: &str) { - error!("Pending rows batch flush failed, message: {}", message); - FLUSH_FAILURES.inc(); - FLUSH_DROPPED_ROWS.inc_by(row_count as u64); -} - -fn flush_with_error(batch: &mut PendingBatch, message: &str) { - if batch.total_row_count == 0 { - return; - } - - let row_count = batch.total_row_count; - let waiters = std::mem::take(&mut batch.waiters); - batch.tables.clear(); - batch.total_row_count = 0; - batch.created_at = None; - batch.ctx = None; - - PENDING_ROWS.sub(row_count as i64); - PENDING_BATCHES.dec(); - - let err_msg = Some(message.to_string()); - notify_waiters(waiters, &err_msg); - mark_flush_failure(row_count, message); -} - fn record_batch_to_ipc(record_batch: RecordBatch) -> Result<(Bytes, Bytes, Bytes)> { let mut encoder = FlightEncoder::default(); let schema = 
encoder.encode_schema(record_batch.schema().as_ref()); @@ -1581,17 +1576,18 @@ mod tests { use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; - use std::time::Duration; + use std::time::{Duration, Instant}; use api::region::RegionResponse; use api::v1::flow::{DirtyWindowRequests, FlowRequest, FlowResponse}; use api::v1::meta::Peer; - use api::v1::region::{InsertRequests, RegionRequest}; + use api::v1::region::{InsertRequests, RegionRequest, region_request}; use api::v1::{ColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows}; - use arrow::array::{BinaryArray, StringArray, TimestampMillisecondArray}; + use arrow::array::{BinaryArray, BooleanArray, StringArray, TimestampMillisecondArray}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use arrow::record_batch::RecordBatch; use async_trait::async_trait; + use catalog::error::Result as CatalogResult; use common_meta::error::Result as MetaResult; use common_meta::node_manager::{ Datanode, DatanodeManager, DatanodeRef, Flownode, FlownodeManager, FlownodeRef, @@ -1599,17 +1595,28 @@ mod tests { use common_query::request::QueryRequest; use common_recordbatch::SendableRecordBatchStream; use dashmap::DashMap; + use datatypes::schema::{ColumnSchema as DtColumnSchema, Schema as DtSchema}; + use partition::error::Result as PartitionResult; + use partition::partition::{PartitionRule, PartitionRuleRef, RegionMask}; use smallvec::SmallVec; + use snafu::ResultExt; use store_api::storage::RegionId; - use tokio::sync::mpsc; + use table::metadata::TableId; + use table::test_util::table_info::test_table_info; + use tokio::sync::{Semaphore, mpsc, oneshot}; use tokio::time::sleep; use super::{ - BatchKey, Error, FlushRegionWrite, FlushWriteResult, PendingRowsBatcher, PendingWorker, - WorkerCommand, columns_taxonomy, flush_region_writes_concurrently, - remove_worker_if_same_channel, should_close_worker_on_idle_timeout, + BatchKey, Error, 
FlushRegionWrite, FlushWaiter, PendingBatch, PendingRowsBatcher, + PendingWorker, PhysicalFlushCatalogProvider, PhysicalFlushNodeRequester, + PhysicalFlushPartitionProvider, PhysicalTableMetadata, PlannedRegionBatch, + ResolvedRegionBatch, TableBatch, WorkerCommand, columns_taxonomy, drain_batch, + encode_region_write_requests, flush_batch_physical, flush_region_writes_concurrently, + plan_region_batches, remove_worker_if_same_channel, should_close_worker_on_idle_timeout, should_dispatch_concurrently, strip_partition_columns_from_batch, + transform_logical_batches_to_physical, }; + use crate::error; fn mock_rows(row_count: usize, schema_name: &str) -> Rows { Rows { @@ -1621,6 +1628,190 @@ mod tests { } } + fn mock_tag_batch(tag_name: &str, tag_value: &str, ts: i64, val: f64) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new(tag_name, ArrowDataType::Utf8, true), + ])); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(vec![ts])), + Arc::new(arrow::array::Float64Array::from(vec![val])), + Arc::new(StringArray::from(vec![tag_value])), + ], + ) + .unwrap() + } + + fn mock_physical_table_metadata(table_id: TableId) -> PhysicalTableMetadata { + let schema = Arc::new( + DtSchema::try_new(vec![ + DtColumnSchema::new( + "__primary_key", + datatypes::prelude::ConcreteDataType::binary_datatype(), + false, + ), + DtColumnSchema::new( + "greptime_timestamp", + datatypes::prelude::ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + DtColumnSchema::new( + "greptime_value", + datatypes::prelude::ConcreteDataType::float64_datatype(), + true, + ), + DtColumnSchema::new( + "tag1", + datatypes::prelude::ConcreteDataType::string_datatype(), + true, + ), + ]) + .unwrap(), + ); + let mut table_info = 
test_table_info(table_id, "phy", "public", "greptime", schema); + table_info.meta.column_ids = vec![0, 1, 2, 3]; + + PhysicalTableMetadata { + table_info: Arc::new(table_info), + col_name_to_ids: Some(HashMap::from([("tag1".to_string(), 3)])), + } + } + + struct MockFlushCatalogProvider { + table: Option, + } + + #[async_trait] + impl PhysicalFlushCatalogProvider for MockFlushCatalogProvider { + async fn physical_table( + &self, + _catalog: &str, + _schema: &str, + _table_name: &str, + _query_ctx: &session::context::QueryContext, + ) -> CatalogResult> { + Ok(self.table.clone()) + } + } + + struct SingleRegionPartitionRule; + + impl PartitionRule for SingleRegionPartitionRule { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn partition_columns(&self) -> &[String] { + &[] + } + + fn find_region( + &self, + _values: &[datatypes::prelude::Value], + ) -> partition::error::Result { + unimplemented!() + } + + fn split_record_batch( + &self, + record_batch: &RecordBatch, + ) -> partition::error::Result> + { + Ok(HashMap::from([( + 1, + RegionMask::new( + arrow::array::BooleanArray::from(vec![true; record_batch.num_rows()]), + record_batch.num_rows(), + ), + )])) + } + } + + struct TwoRegionPartitionRule { + partition_columns: Vec, + } + + impl PartitionRule for TwoRegionPartitionRule { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn partition_columns(&self) -> &[String] { + &self.partition_columns + } + + fn find_region( + &self, + _values: &[datatypes::prelude::Value], + ) -> partition::error::Result { + unimplemented!() + } + + fn split_record_batch( + &self, + _record_batch: &RecordBatch, + ) -> partition::error::Result> + { + Ok(HashMap::from([ + (1, RegionMask::new(BooleanArray::from(vec![true, false]), 1)), + (2, RegionMask::new(BooleanArray::from(vec![false, true]), 1)), + ( + 3, + RegionMask::new(BooleanArray::from(vec![false, false]), 0), + ), + ])) + } + } + + struct MockFlushPartitionProvider { + partition_rule_calls: Arc, + 
region_leader_calls: Arc, + } + + #[async_trait] + impl PhysicalFlushPartitionProvider for MockFlushPartitionProvider { + async fn find_table_partition_rule( + &self, + _table_info: &table::metadata::TableInfo, + ) -> PartitionResult { + self.partition_rule_calls.fetch_add(1, Ordering::SeqCst); + Ok(Arc::new(SingleRegionPartitionRule)) + } + + async fn find_region_leader(&self, _region_id: RegionId) -> error::Result { + self.region_leader_calls.fetch_add(1, Ordering::SeqCst); + Ok(Peer { + id: 1, + addr: "node-1".to_string(), + }) + } + } + + #[derive(Default)] + struct MockFlushNodeRequester { + writes: Arc, + } + + #[async_trait] + impl PhysicalFlushNodeRequester for MockFlushNodeRequester { + async fn handle( + &self, + _peer: &Peer, + _request: RegionRequest, + ) -> error::Result { + self.writes.fetch_add(1, Ordering::SeqCst); + Ok(RegionResponse::new(0)) + } + } + #[test] fn test_collect_non_empty_table_rows_filters_empty_payloads() { let requests = RowInsertRequests { @@ -1648,6 +1839,38 @@ mod tests { assert_eq!(2, table_rows[0].1.rows.len()); } + #[test] + fn test_drain_batch_takes_initialized_pending_batch_from_option() { + let ctx = session::context::QueryContext::arc(); + let (response_tx, _response_rx) = oneshot::channel(); + let permit = Arc::new(Semaphore::new(1)).try_acquire_owned().unwrap(); + let mut batch = Some(PendingBatch { + tables: HashMap::from([( + "cpu".to_string(), + TableBatch { + table_name: "cpu".to_string(), + table_id: 42, + batches: vec![mock_tag_batch("tag1", "host-1", 1000, 1.0)], + row_count: 1, + }, + )]), + created_at: Instant::now(), + total_row_count: 1, + ctx: ctx.clone(), + waiters: vec![FlushWaiter { + response_tx, + _permit: permit, + }], + }); + + let flush = drain_batch(&mut batch).unwrap(); + + assert!(batch.is_none()); + assert_eq!(1, flush.total_row_count); + assert_eq!(1, flush.table_batches.len()); + assert_eq!(ctx.current_catalog(), flush.ctx.current_catalog()); + } + #[derive(Clone)] struct ConcurrentMockDatanode 
{ delay: Duration, @@ -1728,6 +1951,21 @@ mod tests { } } + #[async_trait] + impl PhysicalFlushNodeRequester for ConcurrentMockNodeManager { + async fn handle( + &self, + peer: &Peer, + request: RegionRequest, + ) -> error::Result { + let datanode = self.datanode(peer).await; + datanode + .handle(request) + .await + .context(error::CommonMetaSnafu) + } + } + #[test] fn test_remove_worker_if_same_channel_removes_matching_entry() { let workers = DashMap::new(); @@ -1798,8 +2036,6 @@ mod tests { let writes = vec![ FlushRegionWrite { - region_id: RegionId::new(1024, 1), - row_count: 10, datanode: Peer { id: 1, addr: "node1".to_string(), @@ -1807,8 +2043,6 @@ mod tests { request: RegionRequest::default(), }, FlushRegionWrite { - region_id: RegionId::new(1024, 2), - row_count: 12, datanode: Peer { id: 2, addr: "node2".to_string(), @@ -1817,13 +2051,9 @@ mod tests { }, ]; - let results = flush_region_writes_concurrently(node_manager, writes).await; - assert_eq!(2, results.len()); - assert!( - results - .iter() - .all(|result| matches!(result, FlushWriteResult::Success { .. })) - ); + flush_region_writes_concurrently(node_manager.as_ref(), writes) + .await + .unwrap(); assert!(max_inflight.load(Ordering::SeqCst) >= 2); } @@ -2108,4 +2338,304 @@ mod tests { "PK should be different because batch2 has tag2!" 
); } + + #[test] + fn test_transform_logical_batches_to_physical_success() { + let batch = mock_tag_batch("tag1", "v1", 1000, 1.0); + + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch], + row_count: 1, + }]; + + let name_to_ids = HashMap::from([("tag1".to_string(), 1)]); + let partition_columns = HashSet::new(); + let modified = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap(); + + assert_eq!(1, modified.len()); + assert_eq!(3, modified[0].num_columns()); + assert_eq!("__primary_key", modified[0].schema().field(0).name()); + assert_eq!("greptime_timestamp", modified[0].schema().field(1).name()); + assert_eq!("greptime_value", modified[0].schema().field(2).name()); + } + + #[test] + fn test_transform_logical_batches_to_physical_taxonomy_failure() { + let batch = mock_tag_batch("tag1", "v1", 1000, 1.0); + + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch], + row_count: 1, + }]; + + // tag1 is missing from name_to_ids, causing columns_taxonomy to fail. 
+ let name_to_ids = HashMap::new(); + let partition_columns = HashSet::new(); + let err = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap_err(); + + assert!( + err.to_string() + .contains("not found in physical table column IDs") + ); + } + + #[test] + fn test_transform_logical_batches_to_physical_multiple_batches() { + let batch1 = mock_tag_batch("tag1", "v1", 1000, 1.0); + let batch2 = mock_tag_batch("tag2", "v2", 2000, 2.0); + + let table_batches = vec![ + TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch1], + row_count: 1, + }, + TableBatch { + table_name: "t2".to_string(), + table_id: 2, + batches: vec![batch2], + row_count: 1, + }, + ]; + + let name_to_ids = HashMap::from([("tag1".to_string(), 1), ("tag2".to_string(), 2)]); + let partition_columns = HashSet::new(); + let modified = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap(); + + assert_eq!(2, modified.len()); + } + + #[test] + fn test_transform_logical_batches_to_physical_mixed_success_failure() { + let batch1 = mock_tag_batch("tag1", "v1", 1000, 1.0); + let batch2 = mock_tag_batch("tag2", "v2", 2000, 2.0); + + let table_batches = vec![ + TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch1], + row_count: 1, + }, + TableBatch { + table_name: "t2".to_string(), + table_id: 2, + batches: vec![batch2], + row_count: 1, + }, + ]; + + // tag1 is missing from name_to_ids, causing batch1 to fail. 
+ let name_to_ids = HashMap::from([("tag2".to_string(), 2)]); + let partition_columns = HashSet::new(); + let err = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap_err(); + + assert!(err.to_string().contains("tag1")); + } + + #[tokio::test] + async fn test_flush_batch_physical_uses_mockable_trait_dependencies() { + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 11, + batches: vec![mock_tag_batch("tag1", "host-1", 1000, 1.0)], + row_count: 1, + }]; + let partition_calls = Arc::new(AtomicUsize::new(0)); + let leader_calls = Arc::new(AtomicUsize::new(0)); + let node = MockFlushNodeRequester::default(); + let ctx = session::context::QueryContext::arc(); + + flush_batch_physical( + &table_batches, + "phy", + &ctx, + &MockFlushPartitionProvider { + partition_rule_calls: partition_calls.clone(), + region_leader_calls: leader_calls.clone(), + }, + &node, + &MockFlushCatalogProvider { + table: Some(mock_physical_table_metadata(1024)), + }, + ) + .await + .unwrap(); + + assert_eq!(1, partition_calls.load(Ordering::SeqCst)); + assert_eq!(1, leader_calls.load(Ordering::SeqCst)); + assert_eq!(1, node.writes.load(Ordering::SeqCst)); + } + + #[tokio::test] + async fn test_flush_batch_physical_stops_before_partition_and_node_when_table_missing() { + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 11, + batches: vec![mock_tag_batch("tag1", "host-1", 1000, 1.0)], + row_count: 1, + }]; + let partition_calls = Arc::new(AtomicUsize::new(0)); + let leader_calls = Arc::new(AtomicUsize::new(0)); + let node = MockFlushNodeRequester::default(); + let ctx = session::context::QueryContext::arc(); + + let err = flush_batch_physical( + &table_batches, + "missing_phy", + &ctx, + &MockFlushPartitionProvider { + partition_rule_calls: partition_calls.clone(), + region_leader_calls: leader_calls.clone(), + }, + &node, + &MockFlushCatalogProvider { table: None }, + ) + .await + 
.unwrap_err(); + + assert!( + err.to_string() + .contains("Physical table 'missing_phy' not found") + ); + assert_eq!(0, partition_calls.load(Ordering::SeqCst)); + assert_eq!(0, leader_calls.load(Ordering::SeqCst)); + assert_eq!(0, node.writes.load(Ordering::SeqCst)); + } + + #[tokio::test] + async fn test_flush_batch_physical_aborts_immediately_on_transform_error() { + let table_batches = vec![ + TableBatch { + table_name: "broken".to_string(), + table_id: 11, + batches: vec![mock_tag_batch("unknown_tag", "host-1", 1000, 1.0)], + row_count: 1, + }, + TableBatch { + table_name: "healthy".to_string(), + table_id: 12, + batches: vec![mock_tag_batch("tag1", "host-2", 2000, 2.0)], + row_count: 1, + }, + ]; + let partition_calls = Arc::new(AtomicUsize::new(0)); + let leader_calls = Arc::new(AtomicUsize::new(0)); + let node = MockFlushNodeRequester::default(); + let ctx = session::context::QueryContext::arc(); + + let err = flush_batch_physical( + &table_batches, + "phy", + &ctx, + &MockFlushPartitionProvider { + partition_rule_calls: partition_calls.clone(), + region_leader_calls: leader_calls.clone(), + }, + &node, + &MockFlushCatalogProvider { + table: Some(mock_physical_table_metadata(1024)), + }, + ) + .await + .unwrap_err(); + + assert!(err.to_string().contains("unknown_tag")); + assert_eq!(1, partition_calls.load(Ordering::SeqCst)); + assert_eq!(0, leader_calls.load(Ordering::SeqCst)); + assert_eq!(0, node.writes.load(Ordering::SeqCst)); + } + + #[test] + fn test_plan_region_batches_splits_and_strips_partition_columns() { + let combined_batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(BinaryArray::from(vec![b"k1".as_slice(), 
b"k2".as_slice()])), + Arc::new(TimestampMillisecondArray::from(vec![1000_i64, 2000_i64])), + Arc::new(arrow::array::Float64Array::from(vec![1.0_f64, 2.0_f64])), + Arc::new(StringArray::from(vec!["node-1", "node-2"])), + ], + ) + .unwrap(); + let mut planned_batches = plan_region_batches( + combined_batch, + 1024, + &TwoRegionPartitionRule { + partition_columns: vec!["host".to_string()], + }, + &["host".to_string()], + ) + .unwrap(); + planned_batches.sort_by_key(|planned| planned.region_id.region_number()); + + assert_eq!(2, planned_batches.len()); + assert_eq!(RegionId::new(1024, 1), planned_batches[0].region_id); + assert_eq!(1, planned_batches[0].num_rows()); + assert_eq!(3, planned_batches[0].batch.num_columns()); + assert_eq!(RegionId::new(1024, 2), planned_batches[1].region_id); + assert_eq!(1, planned_batches[1].num_rows()); + assert_eq!(3, planned_batches[1].batch.num_columns()); + } + + #[test] + fn test_encode_region_write_requests_builds_bulk_insert_requests() { + let planned_batch = PlannedRegionBatch { + region_id: RegionId::new(1024, 1), + batch: RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + ])), + vec![ + Arc::new(BinaryArray::from(vec![b"k1".as_slice()])), + Arc::new(TimestampMillisecondArray::from(vec![1000_i64])), + Arc::new(arrow::array::Float64Array::from(vec![1.0_f64])), + ], + ) + .unwrap(), + }; + let resolved_batch = ResolvedRegionBatch { + planned: planned_batch, + datanode: Peer { + id: 1, + addr: "node-1".to_string(), + }, + }; + let writes = encode_region_write_requests(vec![resolved_batch]).unwrap(); + + assert_eq!(1, writes.len()); + assert_eq!(1, writes[0].datanode.id); + let Some(region_request::Body::BulkInsert(request)) = &writes[0].request.body else { + panic!("expected 
bulk insert request"); + }; + assert_eq!(RegionId::new(1024, 1).as_u64(), request.region_id); + } } From a9256f031072e48a05816ec6b86aadb091e64fb8 Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Fri, 3 Apr 2026 12:13:44 +0800 Subject: [PATCH 073/195] refactor: extract otel helper (#7910) * refactor: extract otel helper Signed-off-by: shuiyisong * chore: move to submodule Signed-off-by: shuiyisong --------- Signed-off-by: shuiyisong --- src/frontend/src/instance/otlp.rs | 257 +-------------- src/frontend/src/instance/otlp/trace_types.rs | 308 ++++++++++++++++++ 2 files changed, 317 insertions(+), 248 deletions(-) create mode 100644 src/frontend/src/instance/otlp/trace_types.rs diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 8b3f8b3eec..75168b3b9a 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -36,10 +36,7 @@ use servers::http::prom_store::PHYSICAL_TABLE_PARAM; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; use servers::otlp::trace::TraceAuxData; -use servers::otlp::trace::coerce::{ - coerce_value_data, is_supported_trace_coercion, resolve_new_trace_column_type, - trace_value_datatype, -}; +use servers::otlp::trace::coerce::{coerce_value_data, trace_value_datatype}; use servers::otlp::trace::span::{TraceSpan, TraceSpanGroup}; use servers::query_handler::{ OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome, @@ -49,10 +46,16 @@ use snafu::{IntoError, ResultExt}; use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM}; use crate::instance::Instance; +use crate::instance::otlp::trace_types::{ + PendingTraceColumnRewrite, choose_trace_reconcile_decision, enrich_trace_reconcile_error, + is_trace_reconcile_candidate_type, push_observed_trace_type, validate_trace_column_rewrites, +}; use crate::metrics::{ OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, 
OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS, }; +pub mod trace_types; + const TRACE_INGEST_CHUNK_SIZE: usize = 64; const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4; @@ -63,33 +66,6 @@ enum ChunkFailureReaction { Propagate, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum TraceReconcileDecision { - UseExisting(ColumnDataType), - UseRequestLocal(ColumnDataType), - AlterExistingTo(ColumnDataType), -} - -impl TraceReconcileDecision { - fn target_type(self) -> ColumnDataType { - match self { - Self::UseExisting(target_type) - | Self::UseRequestLocal(target_type) - | Self::AlterExistingTo(target_type) => target_type, - } - } - - fn requires_alter(self) -> bool { - matches!(self, Self::AlterExistingTo(_)) - } -} - -struct PendingTraceColumnRewrite { - col_idx: usize, - target_type: ColumnDataType, - column_name: String, -} - impl ChunkFailureReaction { fn as_metric_label(self) -> &'static str { match self { @@ -576,52 +552,6 @@ impl Instance { Some(summary) } - /// Picks the reconciliation action for one trace column. - /// - /// Existing table schema is authoritative unless the only incompatible case is - /// widening an existing Int64 column to Float64 for incoming Int64/Float64 data. 
- fn choose_trace_reconcile_decision( - observed_types: &[ColumnDataType], - existing_type: Option, - ) -> ServerResult> { - let Some(existing_type) = existing_type else { - return resolve_new_trace_column_type(observed_types.iter().copied()) - .map(|target_type| target_type.map(TraceReconcileDecision::UseRequestLocal)) - .map_err(|_| { - error::InvalidParameterSnafu { - reason: "unsupported trace type mix".to_string(), - } - .build() - }); - }; - - if observed_types.iter().all(|&request_type| { - request_type == existing_type - || is_supported_trace_coercion(request_type, existing_type) - }) { - return Ok(Some(TraceReconcileDecision::UseExisting(existing_type))); - } - - if existing_type == ColumnDataType::Int64 - && observed_types.contains(&ColumnDataType::Float64) - && observed_types.iter().all(|observed_type| { - matches!( - observed_type, - ColumnDataType::Int64 | ColumnDataType::Float64 - ) - }) - { - return Ok(Some(TraceReconcileDecision::AlterExistingTo( - ColumnDataType::Float64, - ))); - } - - error::InvalidParameterSnafu { - reason: "unsupported trace type mix".to_string(), - } - .fail() - } - /// Widen existing trace table columns to Float64 before request rewrite. async fn alter_trace_table_columns_to_float64( &self, @@ -763,7 +693,7 @@ impl Instance { // Decide the final type once per column, then rewrite all affected cells // together in one row pass below. let Some(decision) = - Self::choose_trace_reconcile_decision(&observed_types, existing_type).map_err( + choose_trace_reconcile_decision(&observed_types, existing_type).map_err( |_| { enrich_trace_reconcile_error( &req.table_name, @@ -860,44 +790,6 @@ impl Instance { } } -/// Validate all pending trace column rewrites before any schema mutation happens. 
-fn validate_trace_column_rewrites( - rows: &[api::v1::Row], - pending_rewrites: &[PendingTraceColumnRewrite], - table_name: &str, -) -> ServerResult<()> { - for row in rows { - for pending_rewrite in pending_rewrites { - let Some(value) = row.values.get(pending_rewrite.col_idx) else { - continue; - }; - let Some(request_type) = value.value_data.as_ref().and_then(trace_value_datatype) - else { - continue; - }; - if request_type == pending_rewrite.target_type { - continue; - } - - coerce_value_data(&value.value_data, pending_rewrite.target_type, request_type) - .map_err(|_| { - error::InvalidParameterSnafu { - reason: format!( - "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", - pending_rewrite.column_name, - table_name, - request_type, - pending_rewrite.target_type - ), - } - .build() - })?; - } - } - - Ok(()) -} - /// Preserve the original alter failure status so chunk retry behavior stays correct. fn wrap_trace_alter_failure(err: E) -> servers::error::Error where @@ -906,64 +798,13 @@ where error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err)) } -fn enrich_trace_reconcile_error( - table_name: &str, - column_name: &str, - observed_types: &[ColumnDataType], - existing_type: Option, -) -> servers::error::Error { - let observed_types = observed_types - .iter() - .map(|datatype| format!("{datatype:?}")) - .collect::>() - .join(", "); - - error::InvalidParameterSnafu { - reason: match existing_type { - Some(existing_type) => format!( - "failed to reconcile trace column '{}' in table '{}' with observed types [{}] against existing {:?}", - column_name, table_name, observed_types, existing_type - ), - None => format!( - "failed to reconcile trace column '{}' in table '{}' with observed types [{}]", - column_name, table_name, observed_types - ), - }, - } - .build() -} - -/// Only these trace scalar types participate in reconciliation. Other column kinds -/// such as JSON and binary keep their original write path and schema checks. 
-fn is_trace_reconcile_candidate_type(datatype: ColumnDataType) -> bool { - matches!( - datatype, - ColumnDataType::String - | ColumnDataType::Boolean - | ColumnDataType::Int64 - | ColumnDataType::Float64 - ) -} - -/// Keeps the observed type list small without depending on enum ordering. -fn push_observed_trace_type(observed_types: &mut Vec, datatype: ColumnDataType) { - if !observed_types.contains(&datatype) { - observed_types.push(datatype); - } -} - #[cfg(test)] mod tests { - use api::v1::value::ValueData; - use api::v1::{ColumnDataType, Row, Value}; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use servers::query_handler::TraceIngestOutcome; - use super::{ - ChunkFailureReaction, Instance, PendingTraceColumnRewrite, TraceReconcileDecision, - validate_trace_column_rewrites, wrap_trace_alter_failure, - }; + use super::{ChunkFailureReaction, Instance, wrap_trace_alter_failure}; use crate::metrics::OTLP_TRACES_FAILURE_COUNT; #[test] @@ -1117,86 +958,6 @@ mod tests { ); } - #[test] - fn test_choose_trace_reconcile_decision_existing_int64_keeps_int64() { - assert_eq!( - Instance::choose_trace_reconcile_decision( - &[ColumnDataType::Int64], - Some(ColumnDataType::Int64) - ) - .unwrap(), - Some(TraceReconcileDecision::UseExisting(ColumnDataType::Int64)) - ); - } - - #[test] - fn test_choose_trace_reconcile_decision_existing_int64_widens_to_float64() { - assert_eq!( - Instance::choose_trace_reconcile_decision( - &[ColumnDataType::Int64, ColumnDataType::Float64], - Some(ColumnDataType::Int64) - ) - .unwrap(), - Some(TraceReconcileDecision::AlterExistingTo( - ColumnDataType::Float64 - )) - ); - } - - #[test] - fn test_choose_trace_reconcile_decision_existing_float64_stays_authoritative() { - assert_eq!( - Instance::choose_trace_reconcile_decision( - &[ColumnDataType::Int64, ColumnDataType::Float64], - Some(ColumnDataType::Float64) - ) - .unwrap(), - Some(TraceReconcileDecision::UseExisting(ColumnDataType::Float64)) - ); - } - - #[test] - fn 
test_choose_trace_reconcile_decision_existing_int64_with_boolean_is_error() { - let err = Instance::choose_trace_reconcile_decision( - &[ColumnDataType::Boolean, ColumnDataType::Int64], - Some(ColumnDataType::Int64), - ) - .unwrap_err(); - assert_eq!(err.status_code(), StatusCode::InvalidArguments); - } - - #[test] - fn test_choose_trace_reconcile_decision_request_local_prefers_float64() { - assert_eq!( - Instance::choose_trace_reconcile_decision( - &[ColumnDataType::Int64, ColumnDataType::Float64], - None - ) - .unwrap(), - Some(TraceReconcileDecision::UseRequestLocal( - ColumnDataType::Float64 - )) - ); - } - - #[test] - fn test_validate_trace_column_rewrites_rejects_invalid_string_parse() { - let rows = vec![Row { - values: vec![Value { - value_data: Some(ValueData::StringValue("not_a_number".to_string())), - }], - }]; - let pending_rewrites = vec![PendingTraceColumnRewrite { - col_idx: 0, - target_type: ColumnDataType::Int64, - column_name: "span_attributes.attr_int".to_string(), - }]; - - let err = validate_trace_column_rewrites(&rows, &pending_rewrites, "trace_type_atomicity") - .unwrap_err(); - assert_eq!(err.status_code(), StatusCode::InvalidArguments); - } - #[test] fn test_wrap_trace_alter_failure_preserves_status_code() { let err = wrap_trace_alter_failure( diff --git a/src/frontend/src/instance/otlp/trace_types.rs b/src/frontend/src/instance/otlp/trace_types.rs new file mode 100644 index 0000000000..0be3df550e --- /dev/null +++ b/src/frontend/src/instance/otlp/trace_types.rs @@ -0,0 +1,308 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::{ColumnDataType, Row}; +use servers::error::{self, Result as ServerResult}; +use servers::otlp::trace::coerce::{ + coerce_value_data, is_supported_trace_coercion, resolve_new_trace_column_type, + trace_value_datatype, +}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) enum TraceReconcileDecision { + UseExisting(ColumnDataType), + UseRequestLocal(ColumnDataType), + AlterExistingTo(ColumnDataType), +} + +impl TraceReconcileDecision { + pub(super) fn target_type(self) -> ColumnDataType { + match self { + Self::UseExisting(target_type) + | Self::UseRequestLocal(target_type) + | Self::AlterExistingTo(target_type) => target_type, + } + } + + pub(super) fn requires_alter(self) -> bool { + matches!(self, Self::AlterExistingTo(_)) + } +} + +pub(super) struct PendingTraceColumnRewrite { + pub(super) col_idx: usize, + pub(super) target_type: ColumnDataType, + pub(super) column_name: String, +} + +/// Picks the reconciliation action for one trace column. +/// +/// Existing table schema is authoritative unless the only incompatible case is +/// widening an existing Int64 column to Float64 for incoming Int64/Float64 data. 
+pub(super) fn choose_trace_reconcile_decision( + observed_types: &[ColumnDataType], + existing_type: Option, +) -> ServerResult> { + let Some(existing_type) = existing_type else { + return resolve_new_trace_column_type(observed_types.iter().copied()) + .map(|target_type| target_type.map(TraceReconcileDecision::UseRequestLocal)) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .build() + }); + }; + + if observed_types.iter().all(|&request_type| { + request_type == existing_type || is_supported_trace_coercion(request_type, existing_type) + }) { + return Ok(Some(TraceReconcileDecision::UseExisting(existing_type))); + } + + if existing_type == ColumnDataType::Int64 + && observed_types.contains(&ColumnDataType::Float64) + && observed_types.iter().all(|observed_type| { + matches!( + observed_type, + ColumnDataType::Int64 | ColumnDataType::Float64 + ) + }) + { + return Ok(Some(TraceReconcileDecision::AlterExistingTo( + ColumnDataType::Float64, + ))); + } + + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .fail() +} + +/// Validate all pending trace column rewrites before any schema mutation happens. 
+pub(super) fn validate_trace_column_rewrites( + rows: &[Row], + pending_rewrites: &[PendingTraceColumnRewrite], + table_name: &str, +) -> ServerResult<()> { + for row in rows { + for pending_rewrite in pending_rewrites { + let Some(value) = row.values.get(pending_rewrite.col_idx) else { + continue; + }; + let Some(request_type) = value.value_data.as_ref().and_then(trace_value_datatype) + else { + continue; + }; + if request_type == pending_rewrite.target_type { + continue; + } + + coerce_value_data(&value.value_data, pending_rewrite.target_type, request_type) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: format!( + "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", + pending_rewrite.column_name, + table_name, + request_type, + pending_rewrite.target_type + ), + } + .build() + })?; + } + } + + Ok(()) +} + +pub(super) fn enrich_trace_reconcile_error( + table_name: &str, + column_name: &str, + observed_types: &[ColumnDataType], + existing_type: Option, +) -> servers::error::Error { + let observed_types = observed_types + .iter() + .map(|datatype| format!("{datatype:?}")) + .collect::>() + .join(", "); + + error::InvalidParameterSnafu { + reason: match existing_type { + Some(existing_type) => format!( + "failed to reconcile trace column '{}' in table '{}' with observed types [{}] against existing {:?}", + column_name, table_name, observed_types, existing_type + ), + None => format!( + "failed to reconcile trace column '{}' in table '{}' with observed types [{}]", + column_name, table_name, observed_types + ), + }, + } + .build() +} + +/// Only these trace scalar types participate in reconciliation. Other column kinds +/// such as JSON and binary keep their original write path and schema checks. 
+pub(super) fn is_trace_reconcile_candidate_type(datatype: ColumnDataType) -> bool { + matches!( + datatype, + ColumnDataType::String + | ColumnDataType::Boolean + | ColumnDataType::Int64 + | ColumnDataType::Float64 + ) +} + +/// Keeps the observed type list small without depending on enum ordering. +pub(super) fn push_observed_trace_type( + observed_types: &mut Vec, + datatype: ColumnDataType, +) { + if !observed_types.contains(&datatype) { + observed_types.push(datatype); + } +} + +#[cfg(test)] +mod tests { + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, Row, Value}; + use common_error::ext::ErrorExt; + use common_error::status_code::StatusCode; + + use super::{ + PendingTraceColumnRewrite, TraceReconcileDecision, choose_trace_reconcile_decision, + enrich_trace_reconcile_error, is_trace_reconcile_candidate_type, push_observed_trace_type, + validate_trace_column_rewrites, + }; + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_keeps_int64() { + assert_eq!( + choose_trace_reconcile_decision(&[ColumnDataType::Int64], Some(ColumnDataType::Int64)) + .unwrap(), + Some(TraceReconcileDecision::UseExisting(ColumnDataType::Int64)) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_widens_to_float64() { + assert_eq!( + choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + Some(ColumnDataType::Int64) + ) + .unwrap(), + Some(TraceReconcileDecision::AlterExistingTo( + ColumnDataType::Float64 + )) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_float64_stays_authoritative() { + assert_eq!( + choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + Some(ColumnDataType::Float64) + ) + .unwrap(), + Some(TraceReconcileDecision::UseExisting(ColumnDataType::Float64)) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_with_boolean_is_error() { + let err = choose_trace_reconcile_decision( + 
&[ColumnDataType::Boolean, ColumnDataType::Int64], + Some(ColumnDataType::Int64), + ) + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + } + + #[test] + fn test_choose_trace_reconcile_decision_request_local_prefers_float64() { + assert_eq!( + choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + None + ) + .unwrap(), + Some(TraceReconcileDecision::UseRequestLocal( + ColumnDataType::Float64 + )) + ); + } + + #[test] + fn test_validate_trace_column_rewrites_rejects_invalid_string_parse() { + let rows = vec![Row { + values: vec![Value { + value_data: Some(ValueData::StringValue("not_a_number".to_string())), + }], + }]; + let pending_rewrites = vec![PendingTraceColumnRewrite { + col_idx: 0, + target_type: ColumnDataType::Int64, + column_name: "span_attributes.attr_int".to_string(), + }]; + + let err = validate_trace_column_rewrites(&rows, &pending_rewrites, "trace_type_atomicity") + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + } + + #[test] + fn test_enrich_trace_reconcile_error_includes_existing_type() { + let err = enrich_trace_reconcile_error( + "trace_type_atomicity", + "span_attributes.attr_int", + &[ColumnDataType::String, ColumnDataType::Int64], + Some(ColumnDataType::Boolean), + ); + + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + assert!(err.to_string().contains("span_attributes.attr_int")); + assert!(err.to_string().contains("Boolean")); + } + + #[test] + fn test_is_trace_reconcile_candidate_type_filters_non_scalar_types() { + assert!(is_trace_reconcile_candidate_type(ColumnDataType::String)); + assert!(is_trace_reconcile_candidate_type(ColumnDataType::Boolean)); + assert!(!is_trace_reconcile_candidate_type(ColumnDataType::Binary)); + assert!(!is_trace_reconcile_candidate_type( + ColumnDataType::TimestampMillisecond + )); + } + + #[test] + fn test_push_observed_trace_type_deduplicates_types() { + let mut observed_types = Vec::new(); + + 
push_observed_trace_type(&mut observed_types, ColumnDataType::Int64); + push_observed_trace_type(&mut observed_types, ColumnDataType::Int64); + push_observed_trace_type(&mut observed_types, ColumnDataType::Float64); + + assert_eq!( + observed_types, + vec![ColumnDataType::Int64, ColumnDataType::Float64] + ); + } +} From 233e35c0c91ba9859fa514511872834f6502cbd8 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 3 Apr 2026 12:14:02 +0800 Subject: [PATCH 074/195] feat!: switch default sst format to flat (#7909) * feat: support alter from primary_key to flat Signed-off-by: evenyag * chore: alter flat to primary_key Signed-off-by: evenyag * feat: change default_experimental_flat_format to true Signed-off-by: evenyag * feat: compute channel size from splitted batch size Signed-off-by: evenyag * test: add tests for split and channel size Signed-off-by: evenyag * fix: always set sst_format from manifest on region open sanitize_region_options did not set options.sst_format when the default (PrimaryKey) matched the manifest value, leaving it as None after reopen. This caused the alter format change to appear lost. Signed-off-by: evenyag * test: fix tests Signed-off-by: evenyag * test: show create table after alteration Signed-off-by: evenyag * refactor!: rename default_experimental_flat_format to default_flat_format The flat format is no longer experimental. Remove "experimental" from the config field name, doc comments, and all references. 
Signed-off-by: evenyag * chore: fix clippy Signed-off-by: evenyag --------- Signed-off-by: evenyag --- config/config.md | 6 +- config/datanode.example.toml | 7 +- config/standalone.example.toml | 7 +- src/metric-engine/src/engine/bulk_insert.rs | 4 +- src/metric-engine/src/engine/flush.rs | 16 +- src/mito2/src/compaction/compactor.rs | 6 +- src/mito2/src/config.rs | 19 +- src/mito2/src/engine.rs | 1 - src/mito2/src/engine/alter_test.rs | 276 ++++++++++++++++- src/mito2/src/engine/append_mode_test.rs | 14 +- .../src/engine/apply_staging_manifest_test.rs | 12 +- src/mito2/src/engine/basic_test.rs | 32 +- src/mito2/src/engine/batch_catchup_test.rs | 6 +- src/mito2/src/engine/batch_open_test.rs | 6 +- .../engine/bump_committed_sequence_test.rs | 6 +- src/mito2/src/engine/catchup_test.rs | 2 +- src/mito2/src/engine/close_test.rs | 2 +- src/mito2/src/engine/compaction_test.rs | 22 +- src/mito2/src/engine/copy_region_from_test.rs | 8 +- src/mito2/src/engine/create_test.rs | 22 +- src/mito2/src/engine/drop_test.rs | 4 +- src/mito2/src/engine/edit_region_test.rs | 6 +- src/mito2/src/engine/filter_deleted_test.rs | 2 +- src/mito2/src/engine/flush_test.rs | 12 +- src/mito2/src/engine/merge_mode_test.rs | 6 +- src/mito2/src/engine/open_test.rs | 24 +- src/mito2/src/engine/parallel_test.rs | 18 +- src/mito2/src/engine/partition_filter_test.rs | 2 +- src/mito2/src/engine/projection_test.rs | 4 +- src/mito2/src/engine/prune_test.rs | 10 +- src/mito2/src/engine/remap_manifests_test.rs | 8 +- src/mito2/src/engine/scan_test.rs | 6 +- src/mito2/src/engine/set_role_state_test.rs | 10 +- src/mito2/src/engine/staging_test.rs | 20 +- src/mito2/src/engine/sync_test.rs | 8 +- src/mito2/src/engine/truncate_test.rs | 14 +- src/mito2/src/flush.rs | 2 +- src/mito2/src/memtable.rs | 2 +- src/mito2/src/read/scan_region.rs | 35 +-- src/mito2/src/read/scan_util.rs | 278 +++++++++++++++++- src/mito2/src/read/seq_scan.rs | 39 ++- src/mito2/src/read/series_scan.rs | 14 +- 
src/mito2/src/region/opener.rs | 8 +- src/mito2/src/worker/handle_alter.rs | 11 - tests-integration/tests/http.rs | 3 +- .../common/alter/alter_format.result | 146 ++++++++- .../standalone/common/alter/alter_format.sql | 29 +- tests/conf/datanode-test.toml.template | 2 +- tests/conf/standalone-test.toml.template | 2 +- tests/runner/src/cmd/bare.rs | 2 +- 50 files changed, 913 insertions(+), 288 deletions(-) diff --git a/config/config.md b/config/config.md index 4861675217..f28d09e28d 100644 --- a/config/config.md +++ b/config/config.md @@ -157,13 +157,12 @@ | `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).
When disabled, cache refilling on read won't happen. | | `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | -| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | | `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit. | | `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.
"fail" (default) fails fast and is the recommended option for most users.
"wait" / "wait(<duration>)" waits for memory to become available. This is mainly
for advanced tuning in bursty workloads where temporary contention is common and
higher latency is acceptable.
"wait" means "wait(10s)", not unlimited waiting. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | -| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | +| `region_engine.mito.default_flat_format` | Bool | `true` | Whether to enable flat format as the default SST format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | | `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for
creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.
The default name for this directory is `index_intermediate` for backward compatibility.

This path contains two subdirectories:
- `__intm`: for storing intermediate files used during creating index.
- `staging`: for storing staging files used during searching index. | | `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. | @@ -550,13 +549,12 @@ | `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).
When disabled, cache refilling on read won't happen. | | `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | -| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | | `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit. | | `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.
"fail" (default) fails fast and is the recommended option for most users.
"wait" / "wait(<duration>)" waits for memory to become available. This is mainly
for advanced tuning in bursty workloads where temporary contention is common and
higher latency is acceptable.
"wait" means "wait(10s)", not unlimited waiting. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | -| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | +| `region_engine.mito.default_flat_format` | Bool | `true` | Whether to enable flat format as the default SST format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | | `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for
creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.
The default name for this directory is `index_intermediate` for backward compatibility.

This path contains two subdirectories:
- `__intm`: for storing intermediate files used during creating index.
- `staging`: for storing staging files used during searching index. | | `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 833a567d74..10e6965b84 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -520,9 +520,6 @@ manifest_cache_size = "256MB" ## Buffer size for SST writing. sst_write_buffer_size = "8MB" -## Capacity of the channel to send data from parallel scan tasks to the main task. -parallel_scan_channel_size = 32 - ## Maximum number of SST files to scan concurrently. max_concurrent_scan_files = 384 @@ -545,8 +542,8 @@ scan_memory_on_exhausted = "fail" ## To align with the old behavior, the default value is 0 (no restrictions). min_compaction_interval = "0m" -## Whether to enable experimental flat format as the default format. -default_experimental_flat_format = false +## Whether to enable flat format as the default SST format. +default_flat_format = true ## The options for index in Mito engine. [region_engine.mito.index] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 94c5feebf1..486bc74af2 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -612,9 +612,6 @@ manifest_cache_size = "256MB" ## Buffer size for SST writing. sst_write_buffer_size = "8MB" -## Capacity of the channel to send data from parallel scan tasks to the main task. -parallel_scan_channel_size = 32 - ## Maximum number of SST files to scan concurrently. max_concurrent_scan_files = 384 @@ -637,8 +634,8 @@ scan_memory_on_exhausted = "fail" ## To align with the old behavior, the default value is 0 (no restrictions). min_compaction_interval = "0m" -## Whether to enable experimental flat format as the default format. -default_experimental_flat_format = false +## Whether to enable flat format as the default SST format. 
+default_flat_format = true ## The options for index in Mito engine. [region_engine.mito.index] diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs index 300bd34647..942dae1136 100644 --- a/src/metric-engine/src/engine/bulk_insert.rs +++ b/src/metric-engine/src/engine/bulk_insert.rs @@ -528,7 +528,7 @@ mod tests { async fn test_bulk_insert_physical_region_passthrough() { // Use flat format so that BulkMemtable is used (supports write_bulk). let mito_config = MitoConfig { - default_experimental_flat_format: true, + default_flat_format: true, ..Default::default() }; let env = TestEnv::with_mito_config("", mito_config, Default::default()).await; @@ -585,7 +585,7 @@ mod tests { async fn test_bulk_insert_physical_region_empty_batch() { // Use flat format so that BulkMemtable is used (supports write_bulk). let mito_config = MitoConfig { - default_experimental_flat_format: true, + default_flat_format: true, ..Default::default() }; let env = TestEnv::with_mito_config("", mito_config, Default::default()).await; diff --git a/src/metric-engine/src/engine/flush.rs b/src/metric-engine/src/engine/flush.rs index 5d7479c5d0..8c0f33aaf3 100644 --- a/src/metric-engine/src/engine/flush.rs +++ b/src/metric-engine/src/engine/flush.rs @@ -121,6 +121,10 @@ mod tests { .map(|path| path.replace(&e.file_id, "")); e.file_id = "".to_string(); e.index_version = 0; + // Round down sizes to nearest 1000 to avoid exact size + // comparisons that break when the SST format changes. 
+ e.file_size = e.file_size / 1000 * 1000; + e.index_file_size = e.index_file_size.map(|s| s / 1000 * 1000); format!("\n{:?}", e) }) .sorted() @@ -129,12 +133,12 @@ mod tests { assert_eq!( debug_format, r#" -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, 
region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#, +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3000, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(0), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, 
max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3000, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(0), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 4000, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3000, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: 
"test_metric_region/22_0000000042/data/.parquet", file_size: 3000, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(0), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3000, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#, ); // list from storage let storage_entries = mito diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index b03e6415e8..ff4317331f 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -322,11 +322,7 @@ impl DefaultCompactor { .region_options .sst_format .map(|format| format == FormatType::Flat) - .unwrap_or( - compaction_region - .engine_config - .default_experimental_flat_format, - ); + .unwrap_or(compaction_region.engine_config.default_flat_format); let index_config = compaction_region.engine_config.index.clone(); let inverted_index_config = compaction_region.engine_config.inverted_index.clone(); diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index da0ec74022..b3ddb023cb 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -33,8 +33,6 @@ use crate::memtable::MemtableConfig; use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; const MULTIPART_UPLOAD_MINIMUM_SIZE: ReadableSize = ReadableSize::mb(5); -/// Default channel size for parallel scan task. 
-pub(crate) const DEFAULT_SCAN_CHANNEL_SIZE: usize = 32; /// Default maximum number of SST files to scan concurrently. pub(crate) const DEFAULT_MAX_CONCURRENT_SCAN_FILES: usize = 384; @@ -142,8 +140,6 @@ pub struct MitoConfig { // Other configs: /// Buffer size for SST writing. pub sst_write_buffer_size: ReadableSize, - /// Capacity of the channel to send data from parallel scan tasks to the main task (default 32). - pub parallel_scan_channel_size: usize, /// Maximum number of SST files to scan concurrently (default 384). pub max_concurrent_scan_files: usize, /// Whether to allow stale entries read during replay. @@ -177,9 +173,9 @@ pub struct MitoConfig { #[serde(with = "humantime_serde")] pub min_compaction_interval: Duration, - /// Whether to enable experimental flat format as the default format. + /// Whether to enable flat format as the default SST format. /// When enabled, forces using BulkMemtable and BulkMemtableBuilder. - pub default_experimental_flat_format: bool, + pub default_flat_format: bool, pub gc: GcConfig, } @@ -217,7 +213,6 @@ impl Default for MitoConfig { enable_refill_cache_on_read: true, manifest_cache_size: ReadableSize::mb(256), sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, - parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, allow_stale_entries: false, scan_memory_limit: MemoryLimit::default(), @@ -230,7 +225,7 @@ impl Default for MitoConfig { vector_index: VectorIndexConfig::default(), memtable: MemtableConfig::default(), min_compaction_interval: Duration::from_secs(0), - default_experimental_flat_format: false, + default_flat_format: true, gc: GcConfig::default(), }; @@ -295,14 +290,6 @@ impl MitoConfig { ); } - if self.parallel_scan_channel_size < 1 { - self.parallel_scan_channel_size = DEFAULT_SCAN_CHANNEL_SIZE; - warn!( - "Sanitize scan channel size to {}", - self.parallel_scan_channel_size - ); - } - // Sets write cache path if it is empty. 
if self.write_cache_path.trim().is_empty() { self.write_cache_path = data_home.to_string(); diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index d1c30c3ff6..d006067f0d 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -1027,7 +1027,6 @@ impl EngineInner { request, CacheStrategy::EnableAll(cache_manager), ) - .with_parallel_scan_channel_size(self.config.parallel_scan_channel_size) .with_max_concurrent_scan_files(self.config.max_concurrent_scan_files) .with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled()) .with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled()) diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index b8ba06f0b9..05ba5dae25 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -141,7 +141,7 @@ async fn test_alter_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -213,7 +213,7 @@ async fn test_alter_region_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -267,7 +267,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -318,7 +318,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -387,7 +387,7 @@ async fn test_alter_region_retry_with_format(flat_format: bool) { let mut env = TestEnv::new().await; 
let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -457,7 +457,7 @@ async fn test_alter_on_flushing_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -574,7 +574,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -681,7 +681,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -718,7 +718,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -816,7 +816,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -853,7 +853,7 @@ async fn test_alter_region_ttl_options_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -916,7 +916,7 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -994,7 +994,7 @@ async fn 
test_alter_region_sst_format_with_flush() { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }) .await; @@ -1085,7 +1085,7 @@ async fn test_alter_region_sst_format_with_flush() { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }, ) @@ -1118,7 +1118,7 @@ async fn test_alter_region_sst_format_without_flush() { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }) .await; @@ -1203,7 +1203,7 @@ async fn test_alter_region_sst_format_without_flush() { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }, ) @@ -1231,6 +1231,250 @@ async fn test_alter_region_sst_format_without_flush() { assert_eq!(expected_all_data, batches.pretty_print().unwrap()); } +#[tokio::test] +async fn test_alter_region_sst_format_flat_to_pk_with_flush() { + common_telemetry::init_default_ut_logging(); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_flat_format: true, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + let table_dir = request.table_dir.clone(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // Inserts some data with flat format + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + // 
Flushes to create SST files with flat format + flush_region(&engine, region_id, None).await; + + let expected_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_data, batches.pretty_print().unwrap()); + + // Alters sst_format from flat to primary_key + let alter_format_request = RegionAlterRequest { + kind: AlterKind::SetRegionOptions { + options: vec![SetRegionOption::Format("primary_key".to_string())], + }, + }; + engine + .handle_request(region_id, RegionRequest::Alter(alter_format_request)) + .await + .unwrap(); + + // Inserts more data after alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(3, 6), + }; + put_rows(&engine, region_id, rows).await; + + // Flushes to create SST files with primary_key format + flush_region(&engine, region_id, None).await; + + let expected_all_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | +| 5 | 5.0 | 1970-01-01T00:00:05 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); + + // Reopens region to verify format persists + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_flat_format: 
false, + ..Default::default() + }, + ) + .await; + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir, + path_type: PathType::Bare, + options: HashMap::default(), + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); +} + +#[tokio::test] +async fn test_alter_region_sst_format_flat_to_pk_without_flush() { + common_telemetry::init_default_ut_logging(); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_flat_format: true, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + let table_dir = request.table_dir.clone(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let check_format = |engine: &MitoEngine, expected: Option| { + let current_format = engine + .get_region(region_id) + .unwrap() + .version() + .options + .sst_format; + assert_eq!(current_format, expected); + }; + check_format(&engine, Some(FormatType::Flat)); + + // Inserts some data with flat format + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + // Alters sst_format from flat to primary_key + let alter_format_request = RegionAlterRequest { + kind: AlterKind::SetRegionOptions { + options: vec![SetRegionOption::Format("primary_key".to_string())], + }, + }; + engine + 
.handle_request(region_id, RegionRequest::Alter(alter_format_request)) + .await + .unwrap(); + + check_format(&engine, Some(FormatType::PrimaryKey)); + + // Inserts more data after alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(3, 6), + }; + put_rows(&engine, region_id, rows).await; + + let expected_all_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | +| 5 | 5.0 | 1970-01-01T00:00:05 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); + + // Reopens region to verify format persists + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_flat_format: false, + ..Default::default() + }, + ) + .await; + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir, + path_type: PathType::Bare, + options: HashMap::default(), + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + + check_format(&engine, Some(FormatType::PrimaryKey)); + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); +} + #[tokio::test] async fn test_alter_region_append_mode_with_flush() { common_telemetry::init_default_ut_logging(); diff --git a/src/mito2/src/engine/append_mode_test.rs b/src/mito2/src/engine/append_mode_test.rs index 61488b6592..fa7db1f573 100644 --- 
a/src/mito2/src/engine/append_mode_test.rs +++ b/src/mito2/src/engine/append_mode_test.rs @@ -44,7 +44,7 @@ async fn test_append_mode_write_query_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -112,7 +112,7 @@ async fn test_append_mode_compaction_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -211,7 +211,7 @@ async fn test_append_mode_compaction_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -238,7 +238,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -329,7 +329,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool) .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -376,7 +376,7 @@ async fn test_put_single_range_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -474,7 +474,7 @@ async fn test_put_single_range_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git 
a/src/mito2/src/engine/apply_staging_manifest_test.rs b/src/mito2/src/engine/apply_staging_manifest_test.rs index 401e6572a2..a82fcfe049 100644 --- a/src/mito2/src/engine/apply_staging_manifest_test.rs +++ b/src/mito2/src/engine/apply_staging_manifest_test.rs @@ -62,7 +62,7 @@ async fn test_apply_staging_manifest_invalid_region_state_with_format(flat_forma let mut env = TestEnv::with_prefix("invalid-region-state").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -125,7 +125,7 @@ async fn test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_ let mut env = TestEnv::with_prefix("mismatched-partition-expr").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -205,7 +205,7 @@ async fn test_apply_staging_manifest_success_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("success").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -406,7 +406,7 @@ async fn test_apply_staging_manifest_invalid_files_to_add_with_format(flat_forma let mut env = TestEnv::with_prefix("invalid-files-to-add").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -483,7 +483,7 @@ async fn test_apply_staging_manifest_change_edit_different_columns_fails_with_fo let mut env = TestEnv::with_prefix("apply-change-edit-different-columns").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -599,7 +599,7 @@ async fn 
test_apply_staging_manifest_preserves_unflushed_memtable_with_format(fl let mut env = TestEnv::with_prefix("apply-preserve-memtable").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index ed92c2b4ac..5c2bd4fd4e 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -56,7 +56,7 @@ async fn test_engine_new_stop_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("engine-stop").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -93,7 +93,7 @@ async fn test_write_to_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("write-to-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -134,7 +134,7 @@ async fn test_region_replay_with_format(factory: Option, flat_f .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -169,7 +169,7 @@ async fn test_region_replay_with_format(factory: Option, flat_f .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -234,7 +234,7 @@ async fn test_write_query_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -278,7 +278,7 @@ async fn test_different_order_with_format(flat_format: bool) { 
let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -339,7 +339,7 @@ async fn test_different_order_and_type_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -403,7 +403,7 @@ async fn test_put_delete_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -465,7 +465,7 @@ async fn test_delete_not_null_fields_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -524,7 +524,7 @@ async fn test_put_overwrite_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -594,7 +594,7 @@ async fn test_absent_and_invalid_columns_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -650,7 +650,7 @@ async fn test_region_usage_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("region_usage").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -716,7 +716,7 @@ async fn 
test_engine_with_write_cache_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let path = env.data_home().to_str().unwrap().to_string(); let mito_config = MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() } .enable_write_cache(path, ReadableSize::mb(512), None); @@ -765,7 +765,7 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, vector_cache_size: ReadableSize::mb(32), ..Default::default() }) @@ -896,7 +896,7 @@ async fn test_list_ssts_with_format( let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -1002,7 +1002,7 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/batch_catchup_test.rs b/src/mito2/src/engine/batch_catchup_test.rs index d8c744a733..dc0b552adc 100644 --- a/src/mito2/src/engine/batch_catchup_test.rs +++ b/src/mito2/src/engine/batch_catchup_test.rs @@ -49,7 +49,7 @@ async fn test_batch_catchup_with_format(factory: Option, flat_f .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -135,7 +135,7 @@ async fn test_batch_catchup_with_format(factory: Option, flat_f .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -216,7 +216,7 
@@ async fn test_batch_catchup_err_with_format(factory: Option, fl .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/batch_open_test.rs b/src/mito2/src/engine/batch_open_test.rs index c718ef248c..6b16b3c120 100644 --- a/src/mito2/src/engine/batch_open_test.rs +++ b/src/mito2/src/engine/batch_open_test.rs @@ -49,7 +49,7 @@ async fn test_batch_open_with_format(factory: Option, flat_form .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -157,7 +157,7 @@ async fn test_batch_open_with_format(factory: Option, flat_form .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -193,7 +193,7 @@ async fn test_batch_open_err_with_format(factory: Option, flat_ .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/bump_committed_sequence_test.rs b/src/mito2/src/engine/bump_committed_sequence_test.rs index 00d2c0f51c..12db0044c5 100644 --- a/src/mito2/src/engine/bump_committed_sequence_test.rs +++ b/src/mito2/src/engine/bump_committed_sequence_test.rs @@ -35,7 +35,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -97,7 +97,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -136,7 +136,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/engine/catchup_test.rs b/src/mito2/src/engine/catchup_test.rs index 718462e8a8..e10e91b51b 100644 --- a/src/mito2/src/engine/catchup_test.rs +++ b/src/mito2/src/engine/catchup_test.rs @@ -701,7 +701,7 @@ async fn test_catchup_not_exist_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/close_test.rs b/src/mito2/src/engine/close_test.rs index 965a4f6fff..4c06583b0b 100644 --- a/src/mito2/src/engine/close_test.rs +++ b/src/mito2/src/engine/close_test.rs @@ -29,7 +29,7 @@ async fn test_engine_close_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("close").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/compaction_test.rs b/src/mito2/src/engine/compaction_test.rs index df8521535f..cbcad3a58a 100644 --- a/src/mito2/src/engine/compaction_test.rs +++ b/src/mito2/src/engine/compaction_test.rs @@ -147,7 +147,7 @@ async fn test_compaction_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -223,7 +223,7 @@ async fn test_infer_compaction_time_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env 
.create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -374,7 +374,7 @@ async fn test_compaction_overlapping_files_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -445,7 +445,7 @@ async fn test_compaction_region_with_overlapping_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -503,7 +503,7 @@ async fn test_compaction_region_with_overlapping_delete_all_with_format(flat_for let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -571,7 +571,7 @@ async fn test_readonly_during_compaction_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, // Ensure there is only one background worker for purge task. 
max_background_purges: 1, ..Default::default() @@ -730,7 +730,7 @@ async fn test_compaction_update_time_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -836,7 +836,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -938,7 +938,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -981,7 +981,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -1040,7 +1040,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/engine/copy_region_from_test.rs b/src/mito2/src/engine/copy_region_from_test.rs index e9f8398302..0cf2686fca 100644 --- a/src/mito2/src/engine/copy_region_from_test.rs +++ b/src/mito2/src/engine/copy_region_from_test.rs @@ -41,7 +41,7 @@ async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index: let mut env = TestEnv::with_prefix("copy-region-from").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) 
.await; @@ -156,7 +156,7 @@ async fn test_engine_copy_region_failure_with_format(flat_format: bool) { let mut env = TestEnv::new().await.with_mock_layer(mock_layer); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -283,7 +283,7 @@ async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -328,7 +328,7 @@ async fn test_engine_copy_region_unexpected_state_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/create_test.rs b/src/mito2/src/engine/create_test.rs index e5980d9442..6dff346539 100644 --- a/src/mito2/src/engine/create_test.rs +++ b/src/mito2/src/engine/create_test.rs @@ -36,7 +36,7 @@ async fn test_engine_create_new_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("new-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -61,7 +61,7 @@ async fn test_engine_create_existing_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("create-existing").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -91,7 +91,7 @@ async fn test_engine_create_close_create_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("create-close-create").await; let engine = env .create_engine(MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -131,7 +131,7 @@ async fn test_engine_create_with_different_id_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -160,7 +160,7 @@ async fn test_engine_create_with_different_schema_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -190,7 +190,7 @@ async fn test_engine_create_with_different_primary_key_with_format(flat_format: let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -220,7 +220,7 @@ async fn test_engine_create_with_options_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -253,7 +253,7 @@ async fn test_engine_create_with_custom_store_with_format(flat_format: bool) { let engine = env .create_engine_with_multiple_object_stores( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -301,7 +301,7 @@ async fn test_engine_create_with_memtable_opts_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -353,7 +353,7 @@ async fn create_with_partition_expr_persists_manifest_with_format(flat_format: b let mut env 
= TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -401,7 +401,7 @@ async fn test_engine_create_with_format_one_case(create_format: &str, default_fl let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: default_flat_format, + default_flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/drop_test.rs b/src/mito2/src/engine/drop_test.rs index b3da775117..a34a5d1172 100644 --- a/src/mito2/src/engine/drop_test.rs +++ b/src/mito2/src/engine/drop_test.rs @@ -45,7 +45,7 @@ async fn test_engine_drop_region_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -175,7 +175,7 @@ async fn test_engine_drop_region_for_custom_store_with_format(flat_format: bool) let engine = env .create_engine_with_multiple_object_stores( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, diff --git a/src/mito2/src/engine/edit_region_test.rs b/src/mito2/src/engine/edit_region_test.rs index 01bdf60070..4a92d3494f 100644 --- a/src/mito2/src/engine/edit_region_test.rs +++ b/src/mito2/src/engine/edit_region_test.rs @@ -54,7 +54,7 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) { let (tx, mut rx) = oneshot::channel(); let config = MitoConfig { min_compaction_interval: Duration::from_secs(60 * 60), - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }; let time_provider = Arc::new(MockTimeProvider::new(current_time_millis())); @@ -154,7 +154,7 @@ async fn test_edit_region_fill_cache_with_format(flat_format: bool) { MitoConfig { // Write cache must be enabled to download the 
ingested SST file. enable_write_cache: true, - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -268,7 +268,7 @@ async fn test_edit_region_concurrently_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, // Suppress the compaction to not impede the speed of this kinda stress testing. min_compaction_interval: Duration::from_secs(60 * 60), ..Default::default() diff --git a/src/mito2/src/engine/filter_deleted_test.rs b/src/mito2/src/engine/filter_deleted_test.rs index c40fc7ba02..497583b8bc 100644 --- a/src/mito2/src/engine/filter_deleted_test.rs +++ b/src/mito2/src/engine/filter_deleted_test.rs @@ -36,7 +36,7 @@ async fn test_scan_without_filtering_deleted_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/flush_test.rs b/src/mito2/src/engine/flush_test.rs index 78bae2b461..b86e75c72a 100644 --- a/src/mito2/src/engine/flush_test.rs +++ b/src/mito2/src/engine/flush_test.rs @@ -49,7 +49,7 @@ async fn test_manual_flush_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -112,7 +112,7 @@ async fn test_flush_engine_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -191,7 +191,7 @@ async fn test_write_stall_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -274,7 +274,7 @@ async fn test_flush_empty_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -447,7 +447,7 @@ async fn test_auto_flush_engine_with_format(flat_format: bool) { .create_engine_with_time( MitoConfig { auto_flush_interval: Duration::from_secs(60 * 5), - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -523,7 +523,7 @@ async fn test_flush_workers_with_format(flat_format: bool) { .create_engine_with( MitoConfig { num_workers: 2, - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), diff --git a/src/mito2/src/engine/merge_mode_test.rs b/src/mito2/src/engine/merge_mode_test.rs index 097d5e2b91..40a87642ae 100644 --- a/src/mito2/src/engine/merge_mode_test.rs +++ b/src/mito2/src/engine/merge_mode_test.rs @@ -39,7 +39,7 @@ async fn test_merge_mode_write_query_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -107,7 +107,7 @@ async fn test_merge_mode_compaction_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -220,7 +220,7 @@ async fn test_merge_mode_compaction_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + 
default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/engine/open_test.rs b/src/mito2/src/engine/open_test.rs index 5ee25fb9ff..28ad1de71e 100644 --- a/src/mito2/src/engine/open_test.rs +++ b/src/mito2/src/engine/open_test.rs @@ -48,7 +48,7 @@ async fn test_engine_open_empty_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("open-empty").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -87,7 +87,7 @@ async fn test_engine_open_existing_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("open-exiting").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -126,7 +126,7 @@ async fn test_engine_reopen_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("reopen-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -153,7 +153,7 @@ async fn test_engine_open_readonly_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -207,7 +207,7 @@ async fn test_engine_region_open_with_options_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -260,7 +260,7 @@ async fn test_engine_region_open_with_custom_store_with_format(flat_format: bool let engine = env .create_engine_with_multiple_object_stores( MitoConfig { - default_experimental_flat_format: flat_format, + 
default_flat_format: flat_format, ..Default::default() }, None, @@ -332,7 +332,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -376,7 +376,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -415,7 +415,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -462,7 +462,7 @@ async fn test_open_region_wait_for_opening_region_ok_with_format(flat_format: bo let mut env = TestEnv::with_prefix("wait-for-opening-region-ok").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -513,7 +513,7 @@ async fn test_open_region_wait_for_opening_region_err_with_format(flat_format: b let mut env = TestEnv::with_prefix("wait-for-opening-region-err").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -569,7 +569,7 @@ async fn test_open_compaction_region() { async fn test_open_compaction_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let mut mito_config = MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }; mito_config diff --git a/src/mito2/src/engine/parallel_test.rs b/src/mito2/src/engine/parallel_test.rs index cf5b6491a7..b88a60739b 100644 --- 
a/src/mito2/src/engine/parallel_test.rs +++ b/src/mito2/src/engine/parallel_test.rs @@ -33,13 +33,11 @@ async fn scan_in_parallel( region_id: RegionId, table_dir: &str, parallelism: usize, - channel_size: usize, flat_format: bool, ) { let engine = env .open_engine(MitoConfig { - default_experimental_flat_format: flat_format, - parallel_scan_channel_size: channel_size, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -85,7 +83,7 @@ async fn test_parallel_scan_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -146,15 +144,13 @@ async fn test_parallel_scan_with_format(flat_format: bool) { engine.stop().await.unwrap(); - scan_in_parallel(&mut env, region_id, &table_dir, 0, 1, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 0, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 1, 1, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 1, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 2, 1, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 2, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 2, 8, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 4, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 4, 8, flat_format).await; - - scan_in_parallel(&mut env, region_id, &table_dir, 8, 2, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 8, flat_format).await; } diff --git a/src/mito2/src/engine/partition_filter_test.rs b/src/mito2/src/engine/partition_filter_test.rs index fdea7d547f..61db52484e 100644 --- a/src/mito2/src/engine/partition_filter_test.rs +++ b/src/mito2/src/engine/partition_filter_test.rs @@ -58,7 +58,7 @@ async fn 
test_partition_filter_basic_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/projection_test.rs b/src/mito2/src/engine/projection_test.rs index 7726005b0b..afa505a3ee 100644 --- a/src/mito2/src/engine/projection_test.rs +++ b/src/mito2/src/engine/projection_test.rs @@ -84,7 +84,7 @@ async fn test_scan_projection_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -141,7 +141,7 @@ async fn test_scan_projection_without_primary_key_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/prune_test.rs b/src/mito2/src/engine/prune_test.rs index beb5e2644a..599547ec8d 100644 --- a/src/mito2/src/engine/prune_test.rs +++ b/src/mito2/src/engine/prune_test.rs @@ -32,7 +32,7 @@ async fn check_prune_row_groups(exprs: Vec, expected: &str, flat_format: b let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -180,7 +180,7 @@ async fn test_prune_memtable_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -264,7 +264,7 @@ async fn test_prune_memtable_complex_expr_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env 
.create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -327,7 +327,7 @@ async fn test_mem_range_prune_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -392,7 +392,7 @@ async fn test_scan_filter_field_after_delete_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/remap_manifests_test.rs b/src/mito2/src/engine/remap_manifests_test.rs index 339896450c..b893eb5b97 100644 --- a/src/mito2/src/engine/remap_manifests_test.rs +++ b/src/mito2/src/engine/remap_manifests_test.rs @@ -37,7 +37,7 @@ async fn test_remap_manifests_invalid_partition_expr_with_format(flat_format: bo let mut env = TestEnv::with_prefix("invalid-partition-expr").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -83,7 +83,7 @@ async fn test_remap_manifests_invalid_region_state_with_format(flat_format: bool let mut env = TestEnv::with_prefix("invalid-region-state").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -123,7 +123,7 @@ async fn test_remap_manifests_invalid_input_regions_with_format(flat_format: boo let mut env = TestEnv::with_prefix("invalid-input-regions").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -166,7 +166,7 @@ async fn 
test_remap_manifests_success_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("engine-stop").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/scan_test.rs b/src/mito2/src/engine/scan_test.rs index 6357f01775..119b4493fd 100644 --- a/src/mito2/src/engine/scan_test.rs +++ b/src/mito2/src/engine/scan_test.rs @@ -41,7 +41,7 @@ async fn test_scan_with_min_sst_sequence_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_scan_with_min_sst_sequence").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -176,7 +176,7 @@ async fn test_max_concurrent_scan_files() { async fn test_max_concurrent_scan_files_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_max_concurrent_scan_files").await; let config = MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, max_concurrent_scan_files: 2, ..Default::default() }; @@ -235,7 +235,7 @@ async fn test_series_scan_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_series_scan").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/set_role_state_test.rs b/src/mito2/src/engine/set_role_state_test.rs index fd90cd99f7..4fb15ab7fe 100644 --- a/src/mito2/src/engine/set_role_state_test.rs +++ b/src/mito2/src/engine/set_role_state_test.rs @@ -70,7 +70,7 @@ async fn test_set_role_state_gracefully_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, 
..Default::default() }) .await; @@ -141,7 +141,7 @@ async fn test_set_role_state_gracefully_not_exist_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -166,7 +166,7 @@ async fn test_write_downgrading_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("write-to-downgrading-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -220,7 +220,7 @@ async fn test_unified_state_transitions_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -329,7 +329,7 @@ async fn test_restricted_state_transitions_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/staging_test.rs b/src/mito2/src/engine/staging_test.rs index e47a77bea0..bd90779e0b 100644 --- a/src/mito2/src/engine/staging_test.rs +++ b/src/mito2/src/engine/staging_test.rs @@ -72,7 +72,7 @@ async fn test_staging_state_integration_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -130,7 +130,7 @@ async fn test_staging_blocks_alter_operations_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: 
flat_format, ..Default::default() }) .await; @@ -171,7 +171,7 @@ async fn test_staging_blocks_truncate_operations_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -308,7 +308,7 @@ async fn test_staging_write_partition_expr_version_with_format(flat_format: bool let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -505,7 +505,7 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -657,7 +657,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -883,7 +883,7 @@ async fn test_enter_staging_writes_partition_expr_change_action_with_format(flat let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -947,7 +947,7 @@ async fn test_staging_exit_conflict_partition_expr_change_and_change_with_format let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -1032,7 +1032,7 @@ async fn test_write_stall_on_enter_staging_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -1156,7 +1156,7 @@ async fn test_enter_staging_error(env: &mut TestEnv, flat_format: bool) { let partition_expr = default_partition_expr(); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/sync_test.rs b/src/mito2/src/engine/sync_test.rs index 6c3b91c130..17d73b1848 100644 --- a/src/mito2/src/engine/sync_test.rs +++ b/src/mito2/src/engine/sync_test.rs @@ -80,7 +80,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -112,7 +112,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { // Open the region on the follower engine let follower_engine = env .create_follower_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -189,7 +189,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -224,7 +224,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) { // Open the region on the follower engine let follower_engine = env .create_follower_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/truncate_test.rs b/src/mito2/src/engine/truncate_test.rs index 223cc2b488..818da17faa 100644 --- a/src/mito2/src/engine/truncate_test.rs +++ 
b/src/mito2/src/engine/truncate_test.rs @@ -41,7 +41,7 @@ async fn test_engine_truncate_region_basic_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-basic").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -104,7 +104,7 @@ async fn test_engine_put_data_after_truncate_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-put").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -180,7 +180,7 @@ async fn test_engine_truncate_after_flush_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-flush").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -270,7 +270,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-reopen").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -310,7 +310,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -355,7 +355,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager), @@ -436,7 +436,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: 
flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index fedac95d27..7be81dec8d 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -634,7 +634,7 @@ impl RegionFlushTask { .options .sst_format .map(|f| f == FormatType::Flat) - .unwrap_or(self.engine_config.default_experimental_flat_format); + .unwrap_or(self.engine_config.default_flat_format); SstWriteRequest { op_type: OperationType::Flush, metadata: version.metadata.clone(), diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index 154d062e07..e1494aa47b 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -421,7 +421,7 @@ impl MemtableBuilderProvider { let flat_format = options .sst_format .map(|format| format == FormatType::Flat) - .unwrap_or(self.config.default_experimental_flat_format); + .unwrap_or(self.config.default_flat_format); if flat_format { if options.memtable.is_some() { common_telemetry::info!( diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index f56c807af3..c447685822 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -46,7 +46,7 @@ use tokio_stream::wrappers::ReceiverStream; use crate::access_layer::AccessLayerRef; use crate::cache::CacheStrategy; -use crate::config::{DEFAULT_MAX_CONCURRENT_SCAN_FILES, DEFAULT_SCAN_CHANNEL_SIZE}; +use crate::config::DEFAULT_MAX_CONCURRENT_SCAN_FILES; use crate::error::{InvalidPartitionExprSnafu, InvalidRequestSnafu, Result}; #[cfg(feature = "enterprise")] use crate::extension::{BoxedExtensionRange, BoxedExtensionRangeProvider}; @@ -219,8 +219,6 @@ pub(crate) struct ScanRegion { request: ScanRequest, /// Cache. cache_strategy: CacheStrategy, - /// Capacity of the channel to send data from parallel scan tasks to the main task. - parallel_scan_channel_size: usize, /// Maximum number of SST files to scan concurrently. 
max_concurrent_scan_files: usize, /// Whether to ignore inverted index. @@ -251,7 +249,6 @@ impl ScanRegion { access_layer, request, cache_strategy, - parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, ignore_inverted_index: false, ignore_fulltext_index: false, @@ -263,16 +260,6 @@ impl ScanRegion { } } - /// Sets parallel scan task channel size. - #[must_use] - pub(crate) fn with_parallel_scan_channel_size( - mut self, - parallel_scan_channel_size: usize, - ) -> Self { - self.parallel_scan_channel_size = parallel_scan_channel_size; - self - } - /// Sets maximum number of SST files to scan concurrently. #[must_use] pub(crate) fn with_max_concurrent_scan_files( @@ -527,7 +514,6 @@ impl ScanRegion { .with_inverted_index_appliers(inverted_index_appliers) .with_bloom_filter_index_appliers(bloom_filter_appliers) .with_fulltext_index_appliers(fulltext_index_appliers) - .with_parallel_scan_channel_size(self.parallel_scan_channel_size) .with_max_concurrent_scan_files(self.max_concurrent_scan_files) .with_start_time(self.start_time) .with_append_mode(self.version.options.append_mode) @@ -814,8 +800,6 @@ pub struct ScanInput { pub(crate) cache_strategy: CacheStrategy, /// Ignores file not found error. ignore_file_not_found: bool, - /// Capacity of the channel to send data from parallel scan tasks to the main task. - pub(crate) parallel_scan_channel_size: usize, /// Maximum number of SST files to scan concurrently. pub(crate) max_concurrent_scan_files: usize, /// Index appliers. @@ -863,7 +847,6 @@ impl ScanInput { files: Vec::new(), cache_strategy: CacheStrategy::Disabled, ignore_file_not_found: false, - parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, inverted_index_appliers: [None, None], bloom_filter_index_appliers: [None, None], @@ -928,16 +911,6 @@ impl ScanInput { self } - /// Sets scan task channel size. 
- #[must_use] - pub(crate) fn with_parallel_scan_channel_size( - mut self, - parallel_scan_channel_size: usize, - ) -> Self { - self.parallel_scan_channel_size = parallel_scan_channel_size; - self - } - /// Sets maximum number of SST files to scan concurrently. #[must_use] pub(crate) fn with_max_concurrent_scan_files( @@ -1072,6 +1045,7 @@ impl ScanInput { &self, sources: Vec, semaphore: Arc, + channel_size: usize, ) -> Result> { if sources.len() <= 1 { return Ok(sources); @@ -1081,7 +1055,7 @@ impl ScanInput { let sources = sources .into_iter() .map(|source| { - let (sender, receiver) = mpsc::channel(self.parallel_scan_channel_size); + let (sender, receiver) = mpsc::channel(channel_size); self.spawn_scan_task(source, semaphore.clone(), sender); let stream = Box::pin(ReceiverStream::new(receiver)); Source::Stream(stream) @@ -1256,6 +1230,7 @@ impl ScanInput { &self, sources: Vec, semaphore: Arc, + channel_size: usize, ) -> Result> { if sources.len() <= 1 { return Ok(sources); @@ -1265,7 +1240,7 @@ impl ScanInput { let sources = sources .into_iter() .map(|source| { - let (sender, receiver) = mpsc::channel(self.parallel_scan_channel_size); + let (sender, receiver) = mpsc::channel(channel_size); self.spawn_flat_scan_task(source, semaphore.clone(), sender); let stream = Box::pin(ReceiverStream::new(receiver)); Box::pin(stream) as _ diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index eee32e7835..597f592de6 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -48,11 +48,11 @@ use crate::sst::file::{FileTimeRange, RegionFileId}; use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics; use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics; use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics; -use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE; use crate::sst::parquet::file_range::FileRange; use 
crate::sst::parquet::flat_format::time_index_column_index; use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics}; use crate::sst::parquet::row_group::ParquetFetchMetrics; +use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE}; /// Per-file scan metrics. #[derive(Default, Clone)] @@ -1231,15 +1231,19 @@ const NUM_SERIES_THRESHOLD: u64 = 10240; /// 60 samples per hour. const BATCH_SIZE_THRESHOLD: u64 = 50; -/// Returns true if splitting flat record batches may improve merge performance. +/// Returns the estimated rows per batch after splitting if splitting flat record batches +/// may improve merge performance. Returns `None` if splitting is not beneficial. pub(crate) fn should_split_flat_batches_for_merge( stream_ctx: &Arc, range_meta: &RangeMeta, -) -> bool { +) -> Option { // Number of files to split and scan. let mut num_files_to_split = 0; let mut num_mem_rows = 0; let mut num_mem_series = 0; + // Total rows and series for estimating batch size after splitting. + let mut total_rows: u64 = 0; + let mut total_series: u64 = 0; // Checks each file range, returns early if any range is not splittable. // For mem ranges, we collect the total number of rows and series because the number of rows in a // mem range may be too small. @@ -1261,23 +1265,49 @@ pub(crate) fn should_split_flat_batches_for_merge( debug_assert!(file.meta_ref().num_rows > 0); if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) { // We can't split batches in a file. - return false; + return None; } else { num_files_to_split += 1; + total_rows += file.meta_ref().num_rows; + total_series += file.meta_ref().num_series; } } // Skips non-file and non-mem ranges. } - if num_files_to_split > 0 { + let should_split = if num_files_to_split > 0 { // We mainly consider file ranges because they have enough data for sampling. 
true - } else if num_mem_series > 0 && num_mem_rows > 0 { - // If we don't have files to scan, we check whether to split by the memtable. - can_split_series(num_mem_rows as u64, num_mem_series as u64) + } else if num_mem_series > 0 + && num_mem_rows > 0 + && can_split_series(num_mem_rows as u64, num_mem_series as u64) + { + total_rows += num_mem_rows as u64; + total_series += num_mem_series as u64; + true } else { false + }; + + if !should_split { + return None; } + + // Estimate rows per batch after splitting. + let estimated_batch_size = if total_series > 0 && total_rows > 0 { + ((total_rows / total_series) as usize).clamp(1, DEFAULT_READ_BATCH_SIZE) + } else { + // No valid estimate available, use a conservative fallback. + DEFAULT_READ_BATCH_SIZE / 4 + }; + Some(estimated_batch_size) +} + +/// Computes the channel size for parallel scan based on the estimated rows per batch. +/// The channel should buffer approximately `2 * DEFAULT_READ_BATCH_SIZE` rows. +pub(crate) fn compute_parallel_channel_size(estimated_rows_per_batch: usize) -> usize { + let size = 2 * DEFAULT_READ_BATCH_SIZE / estimated_rows_per_batch.max(1); + size.clamp(2, 64) } fn can_split_series(num_rows: u64, num_series: u64) -> bool { @@ -1555,3 +1585,235 @@ pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeq batches.push_back(record_batch.slice(start, rows_in_batch)); } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::time::Instant; + + use common_time::Timestamp; + use smallvec::{SmallVec, smallvec}; + use store_api::storage::RegionId; + + use super::*; + use crate::cache::CacheStrategy; + use crate::memtable::{ + BoxedBatchIterator, BoxedRecordBatchIterator, IterBuilder, MemtableRange, + MemtableRangeContext, MemtableStats, + }; + use crate::read::projection::ProjectionMapper; + use crate::read::range::{MemRangeBuilder, SourceIndex}; + use crate::read::scan_region::ScanInput; + use crate::sst::file::{FileHandle, FileMeta}; + use 
crate::sst::file_purger::NoopFilePurger; + use crate::test_util::memtable_util::metadata_for_test; + use crate::test_util::scheduler_util::SchedulerEnv; + + struct EmptyIterBuilder; + + impl IterBuilder for EmptyIterBuilder { + fn build(&self, _metrics: Option) -> Result { + Ok(Box::new(std::iter::empty())) + } + + fn is_record_batch(&self) -> bool { + true + } + + fn build_record_batch( + &self, + _time_range: Option<(Timestamp, Timestamp)>, + _metrics: Option, + ) -> Result { + Ok(Box::new(std::iter::empty())) + } + } + + async fn new_test_stream_ctx( + files: Vec, + memtables: Vec, + ) -> Arc { + let env = SchedulerEnv::new().await; + let metadata = metadata_for_test(); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); + let input = ScanInput::new(env.access_layer.clone(), mapper) + .with_cache(CacheStrategy::Disabled) + .with_memtables(memtables) + .with_files(files); + + Arc::new(StreamContext { + input, + ranges: Vec::new(), + scan_fingerprint: None, + query_start: Instant::now(), + }) + } + + fn new_test_file(num_rows: u64, num_series: u64) -> FileHandle { + let meta = FileMeta { + region_id: RegionId::new(123, 456), + file_id: Default::default(), + time_range: ( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + ), + num_rows, + num_series, + ..Default::default() + }; + FileHandle::new(meta, Arc::new(NoopFilePurger)) + } + + fn new_test_memtable(num_rows: usize, series_count: usize) -> MemRangeBuilder { + let context = Arc::new(MemtableRangeContext::new( + 0, + Box::new(EmptyIterBuilder), + Default::default(), + )); + let stats = MemtableStats { + time_range: Some(( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + )), + num_rows, + num_ranges: 1, + series_count, + ..Default::default() + }; + let range = MemtableRange::new(context, stats.clone()); + MemRangeBuilder::new(range, stats) + } + + fn new_test_range_meta(row_group_indices: SmallVec<[RowGroupIndex; 2]>) -> RangeMeta { + let 
indices = row_group_indices + .iter() + .map(|row_group_index| SourceIndex { + index: row_group_index.index, + num_row_groups: 1, + }) + .collect(); + + RangeMeta { + time_range: ( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + ), + indices, + row_group_indices, + num_rows: 0, + } + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_uses_splittable_file_rows_per_series() { + let num_rows = SPLIT_ROW_THRESHOLD * 2; + let num_series = (num_rows / 100).max(1); + let stream_ctx = + new_test_stream_ctx(vec![new_test_file(num_rows, num_series)], vec![]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + Some((num_rows / num_series) as usize), + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_skips_small_or_unknown_series_files() { + let stream_ctx = new_test_stream_ctx( + vec![ + new_test_file(SPLIT_ROW_THRESHOLD.saturating_sub(1), 1), + new_test_file(SPLIT_ROW_THRESHOLD * 2, 0), + ], + vec![], + ) + .await; + let range_meta = new_test_range_meta(smallvec![ + RowGroupIndex { + index: 0, + row_group_index: 0, + }, + RowGroupIndex { + index: 1, + row_group_index: 0, + } + ]); + + assert_eq!( + None, + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_returns_none_for_unsplittable_file() { + let num_series = + (SPLIT_ROW_THRESHOLD / (BATCH_SIZE_THRESHOLD - 1)).max(NUM_SERIES_THRESHOLD) + 1; + let stream_ctx = + new_test_stream_ctx(vec![new_test_file(SPLIT_ROW_THRESHOLD, num_series)], vec![]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + None, + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn 
test_should_split_flat_batches_for_merge_falls_back_to_memtables() { + let stream_ctx = new_test_stream_ctx(vec![], vec![new_test_memtable(5_000, 100)]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + Some(50), + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_clamps_estimate() { + let stream_ctx = + new_test_stream_ctx(vec![new_test_file(SPLIT_ROW_THRESHOLD * 2, 1)], vec![]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + Some(DEFAULT_READ_BATCH_SIZE), + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[test] + fn test_compute_parallel_channel_size_clamps_to_max_for_small_batches() { + assert_eq!(64, compute_parallel_channel_size(0)); + assert_eq!(64, compute_parallel_channel_size(1)); + } + + #[test] + fn test_compute_parallel_channel_size_returns_expected_mid_range_size() { + assert_eq!( + 4, + compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE / 2) + ); + } + + #[test] + fn test_compute_parallel_channel_size_clamps_to_min_for_large_batches() { + assert_eq!(2, compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE)); + assert_eq!( + 2, + compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE * 2) + ); + } +} diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index 49f173e7c9..15ab435425 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -43,8 +43,8 @@ use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::range::RangeMeta; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_flat_file_ranges, - scan_flat_mem_ranges, should_split_flat_batches_for_merge, + PartitionMetrics, PartitionMetricsList, 
SplitRecordBatchStream, compute_parallel_channel_size, + scan_flat_file_ranges, scan_flat_mem_ranges, should_split_flat_batches_for_merge, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; use crate::read::{BoxedRecordBatchStream, ScannerMetrics, scan_util}; @@ -176,7 +176,14 @@ impl SeqScan { partition_ranges.len(), sources.len() ); - Self::build_flat_reader_from_sources(stream_ctx, sources, None, None).await + Self::build_flat_reader_from_sources( + stream_ctx, + sources, + None, + None, + compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE), + ) + .await } /// Builds a flat reader to read sources that returns RecordBatch. If `semaphore` is provided, reads sources in parallel @@ -187,13 +194,16 @@ impl SeqScan { mut sources: Vec, semaphore: Option>, part_metrics: Option<&PartitionMetrics>, + channel_size: usize, ) -> Result { if let Some(semaphore) = semaphore.as_ref() { // Read sources in parallel. if sources.len() > 1 { - sources = stream_ctx - .input - .create_parallel_flat_sources(sources, semaphore.clone())?; + sources = stream_ctx.input.create_parallel_flat_sources( + sources, + semaphore.clone(), + channel_size, + )?; } } @@ -322,7 +332,7 @@ impl SeqScan { // Scans each part. 
for part_range in partition_ranges { let mut sources = Vec::new(); - build_flat_sources( + let split_batch_size = build_flat_sources( &stream_ctx, &part_range, compaction, @@ -332,8 +342,11 @@ impl SeqScan { file_scan_semaphore.clone(), ).await?; + let channel_size = compute_parallel_channel_size( + split_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE), + ); let mut reader = - Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics)) + Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics), channel_size) .await?; let mut metrics = ScannerMetrics { @@ -529,6 +542,7 @@ impl fmt::Debug for SeqScan { } /// Builds flat sources for the partition range and push them to the `sources` vector. +/// Returns the estimated rows per batch after splitting if splitting is applied, or `None`. pub(crate) async fn build_flat_sources( stream_ctx: &Arc, part_range: &PartitionRange, @@ -537,7 +551,7 @@ pub(crate) async fn build_flat_sources( partition_pruner: Arc, sources: &mut Vec, semaphore: Option>, -) -> Result<()> { +) -> Result> { // Gets range meta. 
let range_meta = &stream_ctx.ranges[part_range.identifier]; #[cfg(debug_assertions)] @@ -561,10 +575,11 @@ pub(crate) async fn build_flat_sources( }; let num_indices = range_meta.row_group_indices.len(); if num_indices == 0 { - return Ok(()); + return Ok(None); } - let should_split = should_split_flat_batches_for_merge(stream_ctx, range_meta); + let split_batch_size = should_split_flat_batches_for_merge(stream_ctx, range_meta); + let should_split = split_batch_size.is_some(); sources.reserve(num_indices); let mut ordered_sources = Vec::with_capacity(num_indices); ordered_sources.resize_with(num_indices, || None); @@ -642,7 +657,7 @@ pub(crate) async fn build_flat_sources( ); } - Ok(()) + Ok(split_batch_size) } #[cfg(test)] diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index d2e37af66a..bf7ed072ab 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -47,9 +47,12 @@ use crate::error::{ use crate::read::ScannerMetrics; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::scan_region::{ScanInput, StreamContext}; -use crate::read::scan_util::{PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics}; +use crate::read::scan_util::{ + PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics, compute_parallel_channel_size, +}; use crate::read::seq_scan::{SeqScan, build_flat_sources}; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; +use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::PrimaryKeyArray; @@ -482,10 +485,11 @@ impl SeriesDistributor { // Scans all parts. 
let mut sources = Vec::with_capacity(self.partitions.len()); + let mut min_batch_size: Option = None; for partition in &self.partitions { sources.reserve(partition.len()); for part_range in partition { - build_flat_sources( + let split_batch_size = build_flat_sources( &self.stream_ctx, part_range, false, @@ -495,15 +499,21 @@ impl SeriesDistributor { self.semaphore.clone(), ) .await?; + if let Some(size) = split_batch_size { + min_batch_size = Some(min_batch_size.map_or(size, |cur| cur.min(size))); + } } } // Builds a flat reader that merge sources from all parts. + let channel_size = + compute_parallel_channel_size(min_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE)); let mut reader = SeqScan::build_flat_reader_from_sources( &self.stream_ctx, sources, self.semaphore.clone(), Some(&part_metrics), + channel_size, ) .await?; let mut metrics = SeriesDistributorMetrics::default(); diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 9aa6454f75..b23e73557d 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -269,7 +269,7 @@ impl RegionOpener { // Sets the sst_format based on options or flat_format flag let sst_format = if let Some(format) = options.sst_format { format - } else if config.default_experimental_flat_format { + } else if config.default_flat_format { options.sst_format = Some(FormatType::Flat); FormatType::Flat } else { @@ -309,7 +309,7 @@ impl RegionOpener { debug!( "Create region {} with options: {:?}, default_flat_format: {}", - region_id, options, config.default_experimental_flat_format + region_id, options, config.default_flat_format ); let version = VersionBuilder::new(metadata, mutable) @@ -626,8 +626,10 @@ pub(crate) fn sanitize_region_options(manifest: &RegionManifest, options: &mut R manifest.sst_format, manifest.metadata.region_id, ); - options.sst_format = Some(manifest.sst_format); } + // Always set sst_format from manifest to ensure it's explicitly stored, + // even when the default 
matches the manifest value. + options.sst_format = Some(manifest.sst_format); if let Some(manifest_append_mode) = manifest.append_mode && options.append_mode != manifest_append_mode { diff --git a/src/mito2/src/worker/handle_alter.rs b/src/mito2/src/worker/handle_alter.rs index 459aa8dd32..6fa560e90c 100644 --- a/src/mito2/src/worker/handle_alter.rs +++ b/src/mito2/src/worker/handle_alter.rs @@ -216,15 +216,6 @@ impl RegionWorkerLoop { // If the format is unchanged, we also consider the option is altered. if new_format != current_options.sst_format.unwrap_or_default() { all_options_altered = false; - - // Validates the format type. - ensure!( - new_format == FormatType::Flat, - store_api::metadata::InvalidRegionRequestSnafu { - region_id: region.region_id, - err: "Only allow changing format type to flat", - } - ); } } SetRegionOption::AppendMode(new_append_mode) => { @@ -274,8 +265,6 @@ fn new_region_options_on_empty_memtable( SetRegionOption::Format(format_str) => { // Safety: handle_alter_region_options_fast() has validated this. 
let new_format = format_str.parse::().unwrap(); - assert_eq!(FormatType::Flat, new_format); - current_options.sst_format = Some(new_format); } SetRegionOption::AppendMode(new_append_mode) => { diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 21e707e4d0..29d4256864 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1566,12 +1566,11 @@ index_cache_percent = 20 enable_refill_cache_on_read = true manifest_cache_size = "256MiB" sst_write_buffer_size = "8MiB" -parallel_scan_channel_size = 32 max_concurrent_scan_files = 384 allow_stale_entries = false scan_memory_on_exhausted = "fail" min_compaction_interval = "0s" -default_experimental_flat_format = false +default_flat_format = true [region_engine.mito.index] aux_path = "" diff --git a/tests/cases/standalone/common/alter/alter_format.result b/tests/cases/standalone/common/alter/alter_format.result index d38c63997d..a1019a8c93 100644 --- a/tests/cases/standalone/common/alter/alter_format.result +++ b/tests/cases/standalone/common/alter/alter_format.result @@ -42,6 +42,26 @@ ALTER TABLE test_alt_format SET 'sst_format' = 'flat'; Affected Rows: 0 +SHOW CREATE TABLE test_alt_format; + ++-----------------+------------------------------------------------+ +| Table | Create Table | ++-----------------+------------------------------------------------+ +| test_alt_format | CREATE TABLE IF NOT EXISTS "test_alt_format" ( | +| | "h" INT NULL, | +| | "i" INT NULL DEFAULT 0, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | "k" INT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("h") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | sst_format = 'flat' | +| | ) | ++-----------------+------------------------------------------------+ + -- SQLNESS SORT_RESULT 3 1 SELECT * FROM test_alt_format; @@ -116,11 +136,68 @@ SELECT i, h FROM test_alt_format; | 23 | 13 | +----+----+ --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED 
+-- allow to change from flat to primary_key ALTER TABLE test_alt_format SET 'sst_format' = 'primary_key'; -Error: 1004(InvalidArguments), Invalid region request, region_id: REDACTED, err: Only allow changing format type to flat +Affected Rows: 0 + +SHOW CREATE TABLE test_alt_format; + ++-----------------+------------------------------------------------+ +| Table | Create Table | ++-----------------+------------------------------------------------+ +| test_alt_format | CREATE TABLE IF NOT EXISTS "test_alt_format" ( | +| | "h" INT NULL, | +| | "i" INT NULL DEFAULT 0, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | "k" INT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("h") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | sst_format = 'primary_key' | +| | ) | ++-----------------+------------------------------------------------+ + +INSERT INTO test_alt_format (h, j, i) VALUES (14, 4, 34); + +Affected Rows: 1 + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + ++----+----+-------------------------+----+ +| h | i | j | k | ++----+----+-------------------------+----+ +| 10 | 0 | 1970-01-01T00:00:00 | | +| 11 | 0 | 1970-01-01T00:00:00.001 | | +| 12 | 0 | 1970-01-01T00:00:00.002 | | +| 13 | 23 | 1970-01-01T00:00:00.003 | 33 | +| 14 | 34 | 1970-01-01T00:00:00.004 | | ++----+----+-------------------------+----+ + +ADMIN flush_table('test_alt_format'); + ++--------------------------------------+ +| ADMIN flush_table('test_alt_format') | ++--------------------------------------+ +| 0 | ++--------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + ++----+----+-------------------------+----+ +| h | i | j | k | ++----+----+-------------------------+----+ +| 10 | 0 | 1970-01-01T00:00:00 | | +| 11 | 0 | 1970-01-01T00:00:00.001 | | +| 12 | 0 | 1970-01-01T00:00:00.002 | | +| 13 | 23 | 1970-01-01T00:00:00.003 | 33 | +| 14 | 34 | 1970-01-01T00:00:00.004 | | ++----+----+-------------------------+----+ DROP TABLE test_alt_format; 
@@ -167,6 +244,27 @@ ALTER TABLE alt_format_phy SET 'sst_format' = 'flat'; Affected Rows: 0 +SHOW CREATE TABLE alt_format_phy; + ++----------------+-----------------------------------------------+ +| Table | Create Table | ++----------------+-----------------------------------------------+ +| alt_format_phy | CREATE TABLE IF NOT EXISTS "alt_format_phy" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" DOUBLE NULL, | +| | "host" STRING NULL, | +| | "k" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("host", "k") | +| | ) | +| | | +| | ENGINE=metric | +| | WITH( | +| | physical_metric_table = '', | +| | sst_format = 'flat' | +| | ) | ++----------------+-----------------------------------------------+ + SELECT * FROM t1 ORDER BY ts ASC; +-------------+---+---------------------+------+ @@ -202,11 +300,47 @@ SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; | example.com | 2022-01-02T00:00:00 | 4.56 | +-------------+---------------------+------+ --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE alt_format_phy SET 'sst_format' = 'primary_key'; -Error: 1004(InvalidArguments), Invalid region request, region_id: REDACTED, err: Only allow changing format type to flat +Affected Rows: 0 + +SHOW CREATE TABLE alt_format_phy; + ++----------------+-----------------------------------------------+ +| Table | Create Table | ++----------------+-----------------------------------------------+ +| alt_format_phy | CREATE TABLE IF NOT EXISTS "alt_format_phy" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" DOUBLE NULL, | +| | "host" STRING NULL, | +| | "k" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("host", "k") | +| | ) | +| | | +| | ENGINE=metric | +| | WITH( | +| | physical_metric_table = '', | +| | sst_format = 'primary_key' | +| | ) | ++----------------+-----------------------------------------------+ + +INSERT INTO t1 (ts, val, 
host) VALUES + ('2022-01-01 00:00:02', 5.0, 'example.com'); + +Affected Rows: 1 + +SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; + ++-------------+---------------------+------+ +| host | ts | val | ++-------------+---------------------+------+ +| example.com | 2022-01-01T00:00:00 | 1.23 | +| example.com | 2022-01-01T00:00:01 | 3.0 | +| example.com | 2022-01-01T00:00:02 | 5.0 | +| example.com | 2022-01-02T00:00:00 | 4.56 | ++-------------+---------------------+------+ DROP TABLE t1; diff --git a/tests/cases/standalone/common/alter/alter_format.sql b/tests/cases/standalone/common/alter/alter_format.sql index e1472d28e1..c3b292875c 100644 --- a/tests/cases/standalone/common/alter/alter_format.sql +++ b/tests/cases/standalone/common/alter/alter_format.sql @@ -16,6 +16,8 @@ SELECT i, h FROM test_alt_format; ALTER TABLE test_alt_format SET 'sst_format' = 'flat'; +SHOW CREATE TABLE test_alt_format; + -- SQLNESS SORT_RESULT 3 1 SELECT * FROM test_alt_format; @@ -37,10 +39,21 @@ SELECT * FROM test_alt_format; -- SQLNESS SORT_RESULT 3 1 SELECT i, h FROM test_alt_format; --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE test_alt_format SET 'sst_format' = 'primary_key'; +SHOW CREATE TABLE test_alt_format; + +INSERT INTO test_alt_format (h, j, i) VALUES (14, 4, 34); + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + +ADMIN flush_table('test_alt_format'); + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + DROP TABLE test_alt_format; CREATE TABLE alt_format_phy (ts timestamp time index, val double) engine=metric with ("physical_metric_table" = "", "sst_format" = "primary_key"); @@ -62,6 +75,8 @@ SELECT * FROM t1 ORDER BY ts ASC; ALTER TABLE alt_format_phy SET 'sst_format' = 'flat'; +SHOW CREATE TABLE alt_format_phy; + SELECT * FROM t1 ORDER BY ts ASC; SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; @@ 
-72,10 +87,16 @@ INSERT INTO t1 (ts, val, host) VALUES SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE alt_format_phy SET 'sst_format' = 'primary_key'; +SHOW CREATE TABLE alt_format_phy; + +INSERT INTO t1 (ts, val, host) VALUES + ('2022-01-01 00:00:02', 5.0, 'example.com'); + +SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; + DROP TABLE t1; DROP TABLE alt_format_phy; diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template index 3ec8a2f695..e68a76cc9a 100644 --- a/tests/conf/datanode-test.toml.template +++ b/tests/conf/datanode-test.toml.template @@ -6,7 +6,7 @@ rpc_runtime_size = 8 [[region_engine]] [region_engine.mito] {{ if enable_flat_format }} -default_experimental_flat_format = true +default_flat_format = true {{ endif }} [wal] diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template index 50c014e991..bcd263d0b5 100644 --- a/tests/conf/standalone-test.toml.template +++ b/tests/conf/standalone-test.toml.template @@ -5,7 +5,7 @@ require_lease_before_startup = true [[region_engine]] [region_engine.mito] {{ if enable_flat_format }} -default_experimental_flat_format = true +default_flat_format = true {{ endif }} [wal] diff --git a/tests/runner/src/cmd/bare.rs b/tests/runner/src/cmd/bare.rs index e9a4ff8b79..58199f959e 100644 --- a/tests/runner/src/cmd/bare.rs +++ b/tests/runner/src/cmd/bare.rs @@ -103,7 +103,7 @@ pub struct BareCommand { #[clap(long)] extra_args: Vec, - /// Enable flat format for storage engine (sets default_experimental_flat_format = true). + /// Enable flat format for storage engine (sets default_flat_format = true). 
#[clap(long, default_value = "false")] enable_flat_format: bool, } From 1df9837538ceb30909ac7cd99e7f2d2ba036694b Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 7 Apr 2026 19:37:21 +0800 Subject: [PATCH 075/195] refactor!: update arrow-ipc output to stream format (#7922) * refactor!: update arrow-ipc output to stream format * chore: format --- src/servers/src/http.rs | 6 +++--- src/servers/src/http/result/arrow_result.rs | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index eb2086726a..d25be0485f 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -1312,7 +1312,7 @@ mod test { use std::io::Cursor; use std::sync::Arc; - use arrow_ipc::reader::FileReader; + use arrow_ipc::reader::StreamReader; use arrow_schema::DataType; use axum::handler::Handler; use axum::http::StatusCode; @@ -1684,8 +1684,8 @@ mod test { HttpResponse::Arrow(resp) => { let output = resp.data; - let mut reader = - FileReader::try_new(Cursor::new(output), None).expect("Arrow reader error"); + let mut reader = StreamReader::try_new(Cursor::new(output), None) + .expect("Arrow reader error"); let schema = reader.schema(); assert_eq!(schema.fields[0].name(), "numbers"); assert_eq!(schema.fields[0].data_type(), &DataType::UInt32); diff --git a/src/servers/src/http/result/arrow_result.rs b/src/servers/src/http/result/arrow_result.rs index d583c3a590..90f8513827 100644 --- a/src/servers/src/http/result/arrow_result.rs +++ b/src/servers/src/http/result/arrow_result.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use arrow::datatypes::Schema; use arrow_ipc::CompressionType; -use arrow_ipc::writer::{FileWriter, IpcWriteOptions}; +use arrow_ipc::writer::{IpcWriteOptions, StreamWriter}; use axum::http::{HeaderValue, header}; use axum::response::{IntoResponse, Response}; use common_error::status_code::StatusCode; @@ -48,7 +48,7 @@ async fn write_arrow_bytes( let options = IpcWriteOptions::default() 
.try_with_compression(compression) .context(error::ArrowSnafu)?; - let mut writer = FileWriter::try_new_with_options(&mut bytes, schema, options) + let mut writer = StreamWriter::try_new_with_options(&mut bytes, schema, options) .context(error::ArrowSnafu)?; while let Some(rb) = recordbatches.next().await { @@ -164,7 +164,7 @@ impl IntoResponse for ArrowResponse { mod test { use std::io::Cursor; - use arrow_ipc::reader::FileReader; + use arrow_ipc::reader::StreamReader; use arrow_schema::DataType; use common_recordbatch::{RecordBatch, RecordBatches}; use datatypes::prelude::*; @@ -200,8 +200,8 @@ mod test { match http_resp { HttpResponse::Arrow(resp) => { let output = resp.data; - let mut reader = - FileReader::try_new(Cursor::new(output), None).expect("Arrow reader error"); + let mut reader = StreamReader::try_new(Cursor::new(output), None) + .expect("Arrow reader error"); let schema = reader.schema(); assert_eq!(schema.fields[0].name(), "numbers"); assert_eq!(schema.fields[0].data_type(), &DataType::UInt32); From 6c72dc8e5754d0980247031bece373258ec2fd8b Mon Sep 17 00:00:00 2001 From: Yingwen Date: Wed, 8 Apr 2026 05:59:29 +0800 Subject: [PATCH 076/195] fix: add overflow check before interleave() (#7921) * fix: add overflow check before interleave() Signed-off-by: evenyag * refactor: pass batches and column index to check_interleave_bytes_overflow Refactor check_interleave_bytes_overflow to accept batches and a column index directly, avoiding the intermediate Vec collection of arrays. 
Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/mito2/src/read/flat_merge.rs | 63 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/src/mito2/src/read/flat_merge.rs b/src/mito2/src/read/flat_merge.rs index 90df227ae9..946f2a610c 100644 --- a/src/mito2/src/read/flat_merge.rs +++ b/src/mito2/src/read/flat_merge.rs @@ -19,9 +19,10 @@ use std::time::Instant; use async_stream::try_stream; use common_telemetry::debug; -use datatypes::arrow::array::{Int64Array, UInt64Array}; +use datatypes::arrow::array::{Array, AsArray, Int64Array, UInt64Array}; use datatypes::arrow::compute::interleave; -use datatypes::arrow::datatypes::SchemaRef; +use datatypes::arrow::datatypes::{ArrowNativeType, BinaryType, DataType, SchemaRef, Utf8Type}; +use datatypes::arrow::error::ArrowError; use datatypes::arrow::record_batch::RecordBatch; use datatypes::arrow_array::BinaryArray; use datatypes::timestamp::timestamp_array_to_primitive; @@ -39,6 +40,62 @@ use crate::sst::parquet::flat_format::{ }; use crate::sst::parquet::format::PrimaryKeyArray; +/// Checks whether interleaving the selected rows from byte columns would overflow +/// i32 offsets. Similar to arrow-rs `interleave_bytes()`, accumulates offsets and +/// returns an error if the capacity exceeds `i32::MAX`. +/// +/// TODO(yingwen): Remove this after upgrading to arrow >= 58.1.0, which handles +/// offset overflow in `interleave_bytes()` natively. +/// +/// See: +fn check_interleave_bytes_overflow( + batches: &[(usize, RecordBatch)], + col_idx: usize, + indices: &[(usize, usize)], +) -> std::result::Result<(), ArrowError> { + // Quick check: if concatenating all value data won't overflow, interleaving + // a subset of rows definitely won't either. 
+ let total: usize = batches + .iter() + .map(|(_, batch)| batch.column(col_idx).as_bytes::().value_data().len()) + .sum(); + if T::Offset::from_usize(total).is_some() { + return Ok(()); + } + // Total exceeds the offset limit, do the precise per-row check. + let mut capacity: usize = 0; + for &(a, b) in indices { + let array = batches[a].1.column(col_idx).as_bytes::(); + let o = array.value_offsets(); + let element_len = o[b + 1].as_usize() - o[b].as_usize(); + capacity += element_len; + T::Offset::from_usize(capacity).ok_or(ArrowError::OffsetOverflowError(capacity))?; + } + Ok(()) +} + +/// Checks whether `interleave()` would overflow i32 offsets for `Utf8` or `Binary` columns. +fn check_interleave_overflow( + batches: &[(usize, RecordBatch)], + schema: &SchemaRef, + indices: &[(usize, usize)], +) -> Result<()> { + for (col_idx, field) in schema.fields.iter().enumerate() { + match field.data_type() { + DataType::Utf8 => { + check_interleave_bytes_overflow::(batches, col_idx, indices) + .context(ComputeArrowSnafu)?; + } + DataType::Binary => { + check_interleave_bytes_overflow::(batches, col_idx, indices) + .context(ComputeArrowSnafu)?; + } + _ => continue, + } + } + Ok(()) +} + /// Keeps track of the current position in a batch #[derive(Debug, Copy, Clone, Default)] struct BatchCursor { @@ -121,6 +178,8 @@ impl BatchBuilder { return Ok(None); } + check_interleave_overflow(&self.batches, &self.schema, &self.indices)?; + let columns = (0..self.schema.fields.len()) .map(|column_idx| { let arrays: Vec<_> = self From 6e0f5c504216039ada760d0bb014df3770e50054 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Tue, 7 Apr 2026 17:37:03 -0700 Subject: [PATCH 077/195] chore: memory limit comment (#7914) * chore: memory limit comment Signed-off-by: jeremyhi * chore: by gemini comment Signed-off-by: jeremyhi --------- Signed-off-by: jeremyhi --- config/config.md | 4 ++-- config/datanode.example.toml | 4 +++- config/standalone.example.toml | 4 +++- src/mito2/src/config.rs | 4 +++- 
4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/config/config.md b/config/config.md index f28d09e28d..fb683eaaea 100644 --- a/config/config.md +++ b/config/config.md @@ -139,7 +139,7 @@ | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). | | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). | | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). | -| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. | +| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks.
Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").
Setting it to 0 or "unlimited" disables the limit. | | `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.
Options: "wait" (default, 10s), "wait()", "fail" | | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. | | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. | @@ -531,7 +531,7 @@ | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). | | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). | | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). | -| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. | +| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks.
Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").
Setting it to 0 or "unlimited" disables the limit. | | `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.
Options: "wait" (default, 10s), "wait()", "fail" | | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. | | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 10e6965b84..6effec4c87 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -446,7 +446,9 @@ compress_manifest = false ## @toml2docs:none-default="Auto" #+ max_background_purges = 8 -## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. +## Memory budget for compaction tasks. +## Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%"). +## Setting it to 0 or "unlimited" disables the limit. ## @toml2docs:none-default="0" #+ experimental_compaction_memory_limit = "0" diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 486bc74af2..7c7faa8a8b 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -538,7 +538,9 @@ compress_manifest = false ## @toml2docs:none-default="Auto" #+ max_background_purges = 8 -## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. +## Memory budget for compaction tasks. +## Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%"). +## Setting it to 0 or "unlimited" disables the limit. ## @toml2docs:none-default="0" #+ experimental_compaction_memory_limit = "0" diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index b3ddb023cb..120b5adbe3 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -91,7 +91,9 @@ pub struct MitoConfig { pub max_background_compactions: usize, /// Max number of running background purge jobs (default: number of cpu cores). 
pub max_background_purges: usize, - /// Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. + /// Memory budget for compaction tasks. + /// Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%"). + /// Setting it to 0 or "unlimited" disables the limit. pub experimental_compaction_memory_limit: MemoryLimit, /// Behavior when compaction cannot acquire memory from the budget. pub experimental_compaction_on_exhausted: OnExhaustedPolicy, From b623cb1aa2fcab9cf6a7c833e2c5ba2460df37da Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Wed, 8 Apr 2026 10:54:22 +0800 Subject: [PATCH 078/195] perf: no longer window sort when limit (#7912) * perf: no longer window sort when limit Signed-off-by: discord9 * test: confusing vector sqlness Signed-off-by: discord9 * chore: redact sqlness Signed-off-by: discord9 * chore: redact every thing Signed-off-by: discord9 * REDACTED Signed-off-by: discord9 * what Signed-off-by: discord9 --------- Signed-off-by: discord9 --- src/query/src/optimizer/windowed_sort.rs | 2 ++ src/query/src/part_sort.rs | 12 +++----- .../cases/distributed/explain/order_by.result | 6 ++-- .../vector/vector_index_explain.result | 18 ++++++++---- .../function/vector/vector_index_explain.sql | 8 ++++++ .../standalone/common/order/order_by.result | 9 +++--- .../standalone/common/order/order_by.sql | 1 + .../common/order/windowed_sort.result | 28 +++++++------------ .../standalone/optimizer/order_by.result | 6 ++-- 9 files changed, 46 insertions(+), 44 deletions(-) diff --git a/src/query/src/optimizer/windowed_sort.rs b/src/query/src/optimizer/windowed_sort.rs index 9365c8e1e8..3d3993d454 100644 --- a/src/query/src/optimizer/windowed_sort.rs +++ b/src/query/src/optimizer/windowed_sort.rs @@ -94,6 +94,8 @@ impl WindowedSortPhysicalRule { && scanner_info .time_index .contains(input_schema.field(column_expr.index()).name()) + && sort_exec.fetch().is_none() + // 
skip if there is a limit, as dyn filter alone is good enough in this case { } else { return Ok(Transformed::no(plan)); diff --git a/src/query/src/part_sort.rs b/src/query/src/part_sort.rs index e12479cc5a..19a114c8ce 100644 --- a/src/query/src/part_sort.rs +++ b/src/query/src/part_sort.rs @@ -237,14 +237,10 @@ impl ExecutionPlan for PartSortExec { } else { internal_err!("No children found")? }; - // create a new dynamic filter when with_new_children, as the old filter is bound to the old input and cannot be reused - let new = Self::try_new( - self.expression.clone(), - self.limit, - self.partition_ranges.clone(), - new_input.clone(), - )?; - Ok(Arc::new(new)) + let mut new_exec = self.as_ref().clone(); + new_exec.input = new_input.clone(); + new_exec.properties = new_input.properties().clone(); + Ok(Arc::new(new_exec)) } fn execute( diff --git a/tests/cases/distributed/explain/order_by.result b/tests/cases/distributed/explain/order_by.result index 362849afea..6ce8b4e170 100644 --- a/tests/cases/distributed/explain/order_by.result +++ b/tests/cases/distributed/explain/order_by.result @@ -126,8 +126,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_| | 1_| 0_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts] REDACTED |_|_|_SortPreservingMergeExec: [test_pk.t__temp__0@2 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 DESC], preserve_partitioning=[true], filter=[t@1 IS NULL OR t@1 > 2] REDACTED |_|_|_ProjectionExec: expr=[i@0 as i, t@1 as t, t@1 as test_pk.t__temp__0] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| @@ -150,8 +149,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY alias_ts DESC LIMI |_|_|_| | 1_| 0_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts] 
REDACTED |_|_|_SortPreservingMergeExec: [t@1 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 DESC], preserve_partitioning=[true], filter=[t@1 IS NULL OR t@1 > 2] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| |_|_| Total rows: 5_| diff --git a/tests/cases/standalone/common/function/vector/vector_index_explain.result b/tests/cases/standalone/common/function/vector/vector_index_explain.result index 246a49f405..10351cce11 100644 --- a/tests/cases/standalone/common/function/vector/vector_index_explain.result +++ b/tests/cases/standalone/common/function/vector/vector_index_explain.result @@ -41,6 +41,9 @@ ADMIN FLUSH_TABLE('vectors_explain'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE ,\s"dyn_filters":\s\[.* REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain @@ -56,7 +59,7 @@ LIMIT 2; | 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| |_|_|_SortPreservingMergeExec: [vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| |_|_|_SortExec: TopK(fetch=2), expr=[vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "dyn_filters": ["DynamicFilter [ vec_l2sq_distance(embedding@1, [1.0, 0.0]) < 0.010000004 OR vec_l2sq_distance(embedding@1, [1.0, 0.0]) = 0.010000004 AND vec_id@0 < 2 
]"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":893}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"]REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ @@ -122,6 +125,7 @@ ADMIN FLUSH_TABLE('vectors_explain_left'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT l.vec_id FROM vectors_explain_left l @@ -144,10 +148,10 @@ LIMIT 1; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_CooperativeExec metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "ts", "embedding"], "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":893}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: REDACTED |_|_|_| | 1_| 0_|_CooperativeExec metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "projection": ["vec_id", "note", "ts"], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED_| +|_|_|_SeqScan: REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ @@ -202,6 +206,9 @@ ADMIN FLUSH_TABLE('vectors_explain_metric'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ 
"index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric @@ -217,7 +224,7 @@ LIMIT 2; | 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| |_|_|_SortPreservingMergeExec: [vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| |_|_|_SortExec: TopK(fetch=2), expr=[vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "dyn_filters": ["DynamicFilter [ vec_cos_distance(embedding@1, [1.0, 0.0]) < 1 OR vec_cos_distance(embedding@1, [1.0, 0.0]) = 1 AND vec_id@0 < 4 ]"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":895}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ @@ -236,6 +243,7 @@ LIMIT 2; -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric @@ -251,7 +259,7 @@ LIMIT 2; | 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| |_|_|_SortPreservingMergeExec: [vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| |_|_|_SortExec: TopK(fetch=2), expr=[vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "dyn_filters": ["DynamicFilter [ 
vec_dot_product(embedding@1, [1.0, 0.0]) IS NULL OR vec_dot_product(embedding@1, [1.0, 0.0]) > 0 OR vec_dot_product(embedding@1, [1.0, 0.0]) = 0 AND vec_id@0 < 2 ]"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":895}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ diff --git a/tests/cases/standalone/common/function/vector/vector_index_explain.sql b/tests/cases/standalone/common/function/vector/vector_index_explain.sql index 50bf3cdbaa..7a0330b0ea 100644 --- a/tests/cases/standalone/common/function/vector/vector_index_explain.sql +++ b/tests/cases/standalone/common/function/vector/vector_index_explain.sql @@ -33,6 +33,9 @@ ADMIN FLUSH_TABLE('vectors_explain'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE ,\s"dyn_filters":\s\[.* REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain @@ -84,6 +87,7 @@ ADMIN FLUSH_TABLE('vectors_explain_left'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT l.vec_id FROM vectors_explain_left l @@ -126,6 +130,9 @@ ADMIN FLUSH_TABLE('vectors_explain_metric'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id 
FROM vectors_explain_metric @@ -146,6 +153,7 @@ LIMIT 2; -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric diff --git a/tests/cases/standalone/common/order/order_by.result b/tests/cases/standalone/common/order/order_by.result index 13ac8caebe..6a2807e4b6 100644 --- a/tests/cases/standalone/common/order/order_by.result +++ b/tests/cases/standalone/common/order/order_by.result @@ -288,6 +288,7 @@ select tag from t where num > 6 order by ts; -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED -- SQLNESS REPLACE num_ranges=\d+ num_ranges=REDACTED +-- SQLNESS REPLACE ,\sfilter=\[[^]]+\] explain analyze select tag from t where num > 6 order by ts desc limit 2; +-+-+-+ @@ -295,18 +296,16 @@ explain analyze select tag from t where num > 6 order by ts desc limit 2; +-+-+-+ | 0_| 0_|_ProjectionExec: expr=[tag@0 as tag] REDACTED |_|_|_SortPreservingMergeExec: [ts@1 DESC], fetch=2 REDACTED -|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true], filter=[ts@1 IS NULL OR ts@1 > 6000] REDACTED +|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [ts@1 DESC], fetch=2 REDACTED -|_|_|_WindowedSortExec: expr=ts@1 DESC num_ranges=REDACTED fetch=2 REDACTED -|_|_|_PartSortExec: expr=ts@1 DESC num_ranges=REDACTED limit=2 REDACTED +|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: num@2 > 6, projection=[tag@0, ts@1] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| | 1_| 1_|_SortPreservingMergeExec: [ts@1 DESC], fetch=2 REDACTED 
-|_|_|_WindowedSortExec: expr=ts@1 DESC num_ranges=REDACTED fetch=2 REDACTED -|_|_|_PartSortExec: expr=ts@1 DESC num_ranges=REDACTED limit=2 REDACTED +|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: num@2 > 6, projection=[tag@0, ts@1] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| diff --git a/tests/cases/standalone/common/order/order_by.sql b/tests/cases/standalone/common/order/order_by.sql index dd641613d9..95fd2c6f18 100644 --- a/tests/cases/standalone/common/order/order_by.sql +++ b/tests/cases/standalone/common/order/order_by.sql @@ -95,6 +95,7 @@ select tag from t where num > 6 order by ts; -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED -- SQLNESS REPLACE num_ranges=\d+ num_ranges=REDACTED +-- SQLNESS REPLACE ,\sfilter=\[[^]]+\] explain analyze select tag from t where num > 6 order by ts desc limit 2; drop table t; diff --git a/tests/cases/standalone/common/order/windowed_sort.result b/tests/cases/standalone/common/order/windowed_sort.result index 4e550bf311..f85aa8c04e 100644 --- a/tests/cases/standalone/common/order/windowed_sort.result +++ b/tests/cases/standalone/common/order/windowed_sort.result @@ -70,7 +70,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -103,8 +103,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t DESC LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 
DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -137,7 +136,7 @@ EXPLAIN ANALYZE SELECT * FROM test where i > 2 ORDER BY t LIMIT 4; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST], fetch=4 REDACTED -|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=REDACTED fetch=4 REDACTED +|_|_|_SortExec: TopK(fetch=4), expr=[t@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: i@0 > 2 REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| @@ -171,8 +170,7 @@ EXPLAIN ANALYZE SELECT * FROM test where i > 2 ORDER BY t DESC LIMIT 4; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 DESC], fetch=4 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=4 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=4 REDACTED +|_|_|_SortExec: TopK(fetch=4), expr=[t@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: i@0 > 2 REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| @@ -206,8 +204,7 @@ EXPLAIN ANALYZE SELECT * FROM test where t > 8 ORDER BY t DESC LIMIT 4; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 DESC], fetch=4 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=4 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=4 REDACTED +|_|_|_SortExec: TopK(fetch=4), expr=[t@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, 
"partition_count":{"count":2, "mem_ranges":1, "files":1, "file_ranges":1} REDACTED |_|_|_| |_|_| Total rows: 4_| @@ -289,8 +286,7 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -312,8 +308,7 @@ EXPLAIN ANALYZE VERBOSE SELECT * FROM test_pk ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":4, "mem_ranges":1, "REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -346,8 +341,7 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 DESC], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -381,8 +375,7 @@ EXPLAIN ANALYZE SELECT * FROM test_pk where pk > 7 ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 
0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -404,8 +397,7 @@ EXPLAIN ANALYZE VERBOSE SELECT * FROM test_pk where pk > 7 ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":4, "mem_ranges":1, "REDACTED |_|_|_| |_|_| Total rows: 5_| diff --git a/tests/cases/standalone/optimizer/order_by.result b/tests/cases/standalone/optimizer/order_by.result index 8bc4c14816..06b06ae442 100644 --- a/tests/cases/standalone/optimizer/order_by.result +++ b/tests/cases/standalone/optimizer/order_by.result @@ -142,8 +142,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_| | 1_| 0_|_ProjectionExec: expr=[i@0 as i, alias_ts@1 as alias_ts] REDACTED |_|_|_SortPreservingMergeExec: [t@2 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[alias_ts@1 DESC], preserve_partitioning=[true], filter=[alias_ts@1 IS NULL OR alias_ts@1 > 2] REDACTED |_|_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts, t@1 as t] REDACTED |_|_|_SeqScan: region=REDACTED, 
"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| @@ -165,8 +164,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY alias_ts DESC LIMI |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [alias_ts@1 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[alias_ts@1 DESC], preserve_partitioning=[true], filter=[alias_ts@1 IS NULL OR alias_ts@1 > 2] REDACTED |_|_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| From 2f8607138dc3f6d8d1ef47bfb1d73840838caf47 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Wed, 8 Apr 2026 10:59:39 +0800 Subject: [PATCH 079/195] docs(metric-engine): update prom_store example configs (#7920) docs: update prom_store example configs Signed-off-by: Lei, HUANG --- config/config.md | 10 ++++++++++ config/frontend.example.toml | 11 +++++++++++ config/standalone.example.toml | 11 +++++++++++ 3 files changed, 32 insertions(+) diff --git a/config/config.md b/config/config.md index fb683eaaea..82297d484e 100644 --- a/config/config.md +++ b/config/config.md @@ -69,6 +69,11 @@ | `prom_store` | -- | -- | Prometheus remote storage options | | `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. | | `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. | +| `prom_store.pending_rows_flush_interval` | String | `0s` | Interval to flush pending rows batcher.
Set to "0s" to disable batching mode in Prometheus Remote Write endpoint | +| `prom_store.max_batch_rows` | Integer | `100000` | Max rows per pending batch before triggering a flush. | +| `prom_store.max_concurrent_flushes` | Integer | `256` | Max number of concurrent batch flushes. | +| `prom_store.worker_channel_capacity` | Integer | `65526` | Capacity of the pending batch worker channel. | +| `prom_store.max_inflight_requests` | Integer | `3000` | Max inflight write requests before backpressure. | | `wal` | -- | -- | The WAL options. | | `wal.provider` | String | `raft_engine` | The provider of the WAL.
- `raft_engine`: the wal is stored in the local file system by raft-engine.
- `kafka`: it's remote wal that data is stored in Kafka. | | `wal.dir` | String | Unset | The directory to store the WAL files.
**It's only used when the provider is `raft_engine`**. | @@ -292,6 +297,11 @@ | `prom_store` | -- | -- | Prometheus remote storage options | | `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. | | `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. | +| `prom_store.pending_rows_flush_interval` | String | `0s` | Interval to flush pending rows batcher.
Set to "0s" to disable batching mode in Prometheus Remote Write endpoint | +| `prom_store.max_batch_rows` | Integer | `100000` | Max rows per pending batch before triggering a flush. | +| `prom_store.max_concurrent_flushes` | Integer | `256` | Max number of concurrent batch flushes. | +| `prom_store.worker_channel_capacity` | Integer | `65526` | Capacity of the pending batch worker channel. | +| `prom_store.max_inflight_requests` | Integer | `3000` | Max inflight write requests before backpressure. | | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 435504b122..97b5851672 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -214,6 +214,17 @@ enable = true enable = true ## Whether to store the data from Prometheus remote write in metric engine. with_metric_engine = true +## Interval to flush pending rows batcher. +## Set to "0s" to disable batching mode in Prometheus Remote Write endpoint +#+pending_rows_flush_interval = "0s" +## Max rows per pending batch before triggering a flush. +#+max_batch_rows = 100000 +## Max number of concurrent batch flushes. +#+max_concurrent_flushes = 256 +## Capacity of the pending batch worker channel. +#+worker_channel_capacity = 65526 +## Max inflight write requests before backpressure. +#+max_inflight_requests = 3000 ## The metasrv client options. [meta_client] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 7c7faa8a8b..d14bbe63d5 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -173,6 +173,17 @@ enable = true enable = true ## Whether to store the data from Prometheus remote write in metric engine. with_metric_engine = true +## Interval to flush pending rows batcher. 
+## Set to "0s" to disable batching mode in Prometheus Remote Write endpoint +#+pending_rows_flush_interval = "0s" +## Max rows per pending batch before triggering a flush. +#+max_batch_rows = 100000 +## Max number of concurrent batch flushes. +#+max_concurrent_flushes = 256 +## Capacity of the pending batch worker channel. +#+worker_channel_capacity = 65526 +## Max inflight write requests before backpressure. +#+max_inflight_requests = 3000 ## The WAL options. [wal] From 70ad41209237bd6657a05aea4fb2d6b86c4a1d4e Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Wed, 8 Apr 2026 14:41:19 +0800 Subject: [PATCH 080/195] fix: resolve postgres format and sync cleanup issues (#7928) --- Cargo.lock | 6 +++--- src/servers/Cargo.toml | 2 +- src/servers/src/postgres/handler.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 68c01a3c63..f0ca46b271 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7290,7 +7290,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -9586,9 +9586,9 @@ dependencies = [ [[package]] name = "pgwire" -version = "0.38.2" +version = "0.38.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1bdf05fc8231cc5024572fe056e3ce34eb6b9b755ba7aba110e1c64119cec3" +checksum = "24bd4e6b1bfddc5c6420dee6602ec80946700b4c31ddcb64ee190ad6d979c210" dependencies = [ "async-trait", "base64 0.22.1", diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 115636821b..2d68f17699 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -91,7 +91,7 @@ otel-arrow-rust.workspace = true parking_lot.workspace = true partition.workspace = true pg_interval = { version = "0.5.2", package = "pg_interval_2" } -pgwire = { version = "0.38.2", default-features = false, features = [ +pgwire = { version = 
"0.38.3", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 7e9b75c036..2b84b3aa30 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -529,7 +529,7 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner { .collect::>(); if let Some(schema) = &sql_plan.schema { - schema_to_pg(schema, &Format::UnifiedBinary, None) + schema_to_pg(schema, &Format::UnifiedText, None) .map(|fields| DescribeStatementResponse::new(param_types, fields)) .map_err(convert_err) } else { From 6cc68ee8e13b43256bb5c7943feb5da896750c1c Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Wed, 8 Apr 2026 20:06:11 +0800 Subject: [PATCH 081/195] fix(repartition): harden repartition rollback paths (#7918) * fix(meta-srv): restore repartition group metadata on rollback Signed-off-by: WenyXu * test(meta-srv): add repartition group rollback coverage * fix(meta-srv): rollback allocated regions on repartition failure * test(meta-srv): cover repartition parent rollback flow * test(meta-srv): cover repartition retry paths * fix: fix unit tests Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu * chore: apply suggestions Signed-off-by: WenyXu * test: add unit tests Signed-off-by: WenyXu * fix: persist repartition allocate state for retry and rollback Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu * fix: retry repartition mailbox channel close Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu * chore: refine logs * chore: add comments Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- src/meta-srv/src/bootstrap.rs | 3 +- src/meta-srv/src/error.rs | 38 + src/meta-srv/src/procedure/repartition.rs | 808 +++++++++++++++++- .../procedure/repartition/allocate_region.rs | 249 ++++-- .../src/procedure/repartition/collect.rs | 155 +++- 
.../repartition/deallocate_region.rs | 50 +- .../src/procedure/repartition/dispatch.rs | 8 +- .../src/procedure/repartition/group.rs | 405 ++++++++- .../group/apply_staging_manifest.rs | 9 +- .../repartition/group/enter_staging_region.rs | 9 +- .../repartition/group/remap_manifest.rs | 8 + .../repartition/group/sync_region.rs | 8 + .../update_metadata/apply_staging_region.rs | 5 +- .../rollback_staging_region.rs | 252 ++++-- .../repartition/repartition_start.rs | 2 +- .../src/procedure/repartition/test_util.rs | 202 ++++- src/meta-srv/src/procedure/utils.rs | 17 + 17 files changed, 2066 insertions(+), 162 deletions(-) diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index eadb7cdc75..51d2b4d37b 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -24,7 +24,6 @@ use common_base::Plugins; use common_config::Configurable; #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] use common_meta::distributed_time_constants::META_LEASE_SECS; -use common_meta::election::CANDIDATE_LEASE_SECS; use common_meta::election::etcd::EtcdElection; use common_meta::kv_backend::chroot::ChrootKvBackend; use common_meta::kv_backend::etcd::EtcdStore; @@ -290,6 +289,7 @@ pub async fn metasrv_builder( use std::time::Duration; use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS; + use common_meta::election::CANDIDATE_LEASE_SECS; use common_meta::election::rds::postgres::{ElectionPgClient, PgElection}; use common_meta::kv_backend::rds::PgStore; use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod}; @@ -354,6 +354,7 @@ pub async fn metasrv_builder( (None, BackendImpl::MysqlStore) => { use std::time::Duration; + use common_meta::election::CANDIDATE_LEASE_SECS; use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection}; use common_meta::kv_backend::rds::MySqlStore; diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index 7b7983b1ba..a0f800f981 100644 --- 
a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -1136,6 +1136,12 @@ impl Error { Error::RetryLater { .. } | Error::RetryLaterWithSource { .. } | Error::MailboxTimeout { .. } + ) || matches!( + self, + Error::AllocateRegions { source, .. } if source.is_retry_later() + ) || matches!( + self, + Error::DeallocateRegions { source, .. } if source.is_retry_later() ) } } @@ -1324,3 +1330,35 @@ pub(crate) fn match_for_io_error(err_status: &tonic::Status) -> Option<&std::io: err = err.source()?; } } + +#[cfg(test)] +mod tests { + use common_error::mock::MockError; + use common_error::status_code::StatusCode; + use snafu::ResultExt; + + use super::DeallocateRegionsSnafu; + + #[test] + fn test_deallocate_regions_is_retryable_when_source_is_retry_later() { + let source = common_meta::error::Error::retry_later(MockError::new(StatusCode::Internal)); + let err = Err::<(), _>(source) + .context(DeallocateRegionsSnafu { table_id: 1024_u32 }) + .unwrap_err(); + + assert!(err.is_retryable()); + } + + #[test] + fn test_deallocate_regions_is_not_retryable_when_source_is_not_retry_later() { + let source = common_meta::error::UnexpectedSnafu { + err_msg: "mock error", + } + .build(); + let err = Err::<(), _>(source) + .context(DeallocateRegionsSnafu { table_id: 1024_u32 }) + .unwrap_err(); + + assert!(!err.is_retryable()); + } +} diff --git a/src/meta-srv/src/procedure/repartition.rs b/src/meta-srv/src/procedure/repartition.rs index 37c7745ae5..db8bfeadc5 100644 --- a/src/meta-srv/src/procedure/repartition.rs +++ b/src/meta-srv/src/procedure/repartition.rs @@ -23,7 +23,7 @@ pub mod repartition_start; pub mod utils; use std::any::Any; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Display}; use std::time::{Duration, Instant}; @@ -40,15 +40,15 @@ use common_meta::key::table_route::TableRouteValue; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; use common_meta::lock_key::{CatalogLock, SchemaLock, 
TableLock, TableNameLock}; use common_meta::node_manager::NodeManagerRef; -use common_meta::region_keeper::MemoryRegionKeeperRef; +use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; use common_meta::region_registry::LeaderRegionRegistryRef; -use common_meta::rpc::router::RegionRoute; +use common_meta::rpc::router::{RegionRoute, operating_leader_regions}; use common_procedure::error::{FromJsonSnafu, ToJsonSnafu}; use common_procedure::{ BoxedProcedure, Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, ProcedureManagerRef, Result as ProcedureResult, Status, StringKey, UserMetadata, }; -use common_telemetry::{error, info}; +use common_telemetry::{error, info, warn}; use partition::expr::PartitionExpr; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -56,6 +56,8 @@ use store_api::storage::{RegionNumber, TableId}; use table::table_name::TableName; use crate::error::{self, Result}; +use crate::procedure::repartition::collect::ProcedureMeta; +use crate::procedure::repartition::deallocate_region::DeallocateRegion; use crate::procedure::repartition::group::{ Context as RepartitionGroupContext, RepartitionGroupProcedure, }; @@ -74,6 +76,12 @@ pub struct PersistentContext { pub table_name: String, pub table_id: TableId, pub plans: Vec, + /// Records failed sub-procedures for metadata rollback. + #[serde(default)] + pub failed_procedures: Vec, + #[serde(default)] + /// Records unknown sub-procedures for metadata rollback. + pub unknown_procedures: Vec, /// The timeout for repartition operations. 
#[serde(with = "humantime_serde", default = "default_timeout")] pub timeout: Duration, @@ -102,6 +110,8 @@ impl PersistentContext { table_name, table_id, plans: vec![], + failed_procedures: vec![], + unknown_procedures: vec![], timeout: timeout.unwrap_or_else(default_timeout), } } @@ -393,6 +403,23 @@ impl Context { .await; Ok(()) } + + pub fn register_operating_regions( + memory_region_keeper: &MemoryRegionKeeperRef, + region_routes: &[RegionRoute], + ) -> Result> { + let mut operating_guards = Vec::with_capacity(region_routes.len()); + for (region_id, datanode_id) in operating_leader_regions(region_routes) { + let guard = memory_region_keeper + .register(datanode_id, region_id) + .context(error::RegionOperatingRaceSnafu { + peer_id: datanode_id, + region_id, + })?; + operating_guards.push(guard); + } + Ok(operating_guards) + } } #[async_trait::async_trait] @@ -456,6 +483,131 @@ impl RepartitionProcedure { Ok(Self { state, context }) } + + /// Returns whether parent rollback should remove this repartition's allocated regions. + /// + /// This uses an "after AllocateRegion" semantic: once execution reaches + /// `AllocateRegion` or any later state, rollback must try to remove this round's + /// `allocated_region_ids` from table-route metadata when they exist. + /// + /// State flow: + /// `RepartitionStart -> AllocateRegion -> Dispatch -> Collect -> DeallocateRegion -> RepartitionEnd` + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// rollback allocated regions in metadata + /// + /// Notes: + /// - `RepartitionStart`: no-op, because allocation has not happened yet. + /// - `AllocateRegion` / `Dispatch` / `Collect` rollback-active. + /// - `DeallocateRegion`: is not rollback-active. + /// - `RepartitionEnd`: no-op. 
+ fn should_rollback_allocated_regions(&self) -> bool { + self.state.as_any().is::() + || self.state.as_any().is::() + || self.state.as_any().is::() + } + + fn rollback_allocated_region_ids(&self) -> HashSet { + if self.state.as_any().is::() + || self.state.as_any().is::() + { + return self + .context + .persistent_ctx + .plans + .iter() + .flat_map(|plan| plan.allocated_region_ids.iter().copied()) + .collect(); + } + + self.context + .persistent_ctx + .failed_procedures + .iter() + .chain(self.context.persistent_ctx.unknown_procedures.iter()) + .flat_map(|procedure_meta| { + let plan_index = procedure_meta.plan_index; + self.context.persistent_ctx.plans[plan_index] + .allocated_region_ids + .iter() + .copied() + }) + .collect() + } + + fn filter_allocated_region_routes( + region_routes: &[RegionRoute], + allocated_region_ids: &HashSet, + ) -> Vec { + region_routes + .iter() + .filter(|route| !allocated_region_ids.contains(&route.region.id)) + .cloned() + .collect() + } + + async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { + if !self.should_rollback_allocated_regions() { + return Ok(()); + } + + let table_id = self.context.persistent_ctx.table_id; + let allocated_region_ids = self.rollback_allocated_region_ids(); + if allocated_region_ids.is_empty() { + return Ok(()); + } + + let table_lock = TableLock::Write(table_id).into(); + let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; + let table_route_value = self.context.get_table_route_value().await?; + let current_region_routes = table_route_value.region_routes().unwrap(); + let allocated_region_routes = DeallocateRegion::filter_deallocatable_region_routes( + table_id, + current_region_routes, + &allocated_region_ids, + ); + if !allocated_region_routes.is_empty() { + let table = TableName { + catalog_name: self.context.persistent_ctx.catalog_name.clone(), + schema_name: self.context.persistent_ctx.schema_name.clone(), + table_name: 
self.context.persistent_ctx.table_name.clone(), + }; + // Memory guards are not required here, + // because the table metadata still contains routes for the deallocating regions. + if let Err(err) = DeallocateRegion::deallocate_regions( + &self.context.node_manager, + &self.context.leader_region_registry, + table, + table_id, + &allocated_region_routes, + ) + .await + { + warn!(err; "Failed to drop allocated regions during repartition rollback, table_id: {}, regions: {:?}", table_id, allocated_region_ids); + } + } + + let new_region_routes = + Self::filter_allocated_region_routes(current_region_routes, &allocated_region_ids); + + if new_region_routes.len() != current_region_routes.len() { + self.context + .update_table_route(&table_route_value, new_region_routes, HashMap::new()) + .await + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to rollback allocated region routes for repartition table: {}", + table_id + ), + })?; + } + + if let Err(err) = self.context.invalidate_table_cache().await { + warn!(err; "Failed to invalidate table cache during repartition rollback, table_id: {}", table_id); + } + + Ok(()) + } } #[async_trait::async_trait] @@ -497,9 +649,14 @@ impl Procedure for RepartitionProcedure { } } + async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { + self.rollback_inner(ctx) + .await + .map_err(ProcedureError::external) + } + fn rollback_supported(&self) -> bool { - // TODO(weny): support rollback. 
- false + true } fn dump(&self) -> ProcedureResult { @@ -624,3 +781,642 @@ impl RepartitionProcedureFactory for DefaultRepartitionProcedureFactory { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; + + use common_error::ext::BoxedError; + use common_error::mock::MockError; + use common_error::status_code::StatusCode; + use common_meta::ddl::test_util::datanode_handler::{ + DatanodeWatcher, NaiveDatanodeHandler, UnexpectedErrorDatanodeHandler, + }; + use common_meta::error; + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use common_meta::test_util::MockDatanodeManager; + use common_procedure::{Error as ProcedureError, Procedure, ProcedureId, ProcedureState}; + use store_api::storage::RegionId; + use table::table_name::TableName; + use tokio::sync::mpsc; + use uuid::Uuid; + + use super::*; + use crate::procedure::repartition::allocate_region::AllocateRegion; + use crate::procedure::repartition::collect::Collect; + use crate::procedure::repartition::deallocate_region::DeallocateRegion; + use crate::procedure::repartition::dispatch::Dispatch; + use crate::procedure::repartition::plan::RegionDescriptor; + use crate::procedure::repartition::repartition_end::RepartitionEnd; + use crate::procedure::repartition::test_util::{ + TestingEnv, assert_parent_state, current_parent_region_routes, extract_subprocedure_ids, + new_parent_context, procedure_context_with_receivers, procedure_state_receiver, range_expr, + test_region_route, test_region_wal_options, + }; + + fn test_plan(table_id: TableId) -> RepartitionPlanEntry { + RepartitionPlanEntry { + group_id: uuid::Uuid::new_v4(), + source_regions: vec![RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }], + target_regions: vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 50), + }, + 
RegionDescriptor { + region_id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100), + }, + ], + allocated_region_ids: vec![RegionId::new(table_id, 3)], + pending_deallocate_region_ids: vec![], + transition_map: vec![vec![0, 1]], + } + } + + fn test_procedure(state: Box, context: Context) -> RepartitionProcedure { + RepartitionProcedure { state, context } + } + + fn test_context(env: &TestingEnv, table_id: TableId) -> Context { + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + + Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ) + } + + #[test] + fn test_filter_allocated_region_routes() { + let table_id = 1024; + let region_routes = vec![ + test_region_route(RegionId::new(table_id, 1), "a"), + test_region_route(RegionId::new(table_id, 2), "b"), + ]; + let allocated_region_ids = HashSet::from([RegionId::new(table_id, 2)]); + + let new_region_routes = RepartitionProcedure::filter_allocated_region_routes( + ®ion_routes, + &allocated_region_ids, + ); + + assert_eq!(new_region_routes.len(), 1); + assert_eq!(new_region_routes[0].region.id, RegionId::new(table_id, 1)); + } + + #[test] + fn test_should_rollback_allocated_regions() { + let env = TestingEnv::new(); + let table_id = 1024; + + let procedure = test_procedure( + Box::new(RepartitionStart::new(vec![], vec![])), + test_context(&env, table_id), + ); + assert!(!procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure( + Box::new(AllocateRegion::new(vec![])), + test_context(&env, table_id), + ); + assert!(procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure(Box::new(Dispatch), test_context(&env, table_id)); + assert!(procedure.should_rollback_allocated_regions()); 
+ + let procedure = + test_procedure(Box::new(Collect::new(vec![])), test_context(&env, table_id)); + assert!(procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure(Box::new(DeallocateRegion), test_context(&env, table_id)); + assert!(!procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure(Box::new(RepartitionEnd), test_context(&env, table_id)); + assert!(!procedure.should_rollback_allocated_regions()); + } + + #[tokio::test] + async fn test_repartition_rollback_removes_allocated_routes_from_dispatch() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 50, 100).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + persistent_ctx.plans = vec![test_plan(table_id)]; + persistent_ctx.failed_procedures = vec![ProcedureMeta { + plan_index: 0, + group_id: Uuid::new_v4(), + procedure_id: ProcedureId::random(), + }]; + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(Dispatch), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!(region_routes.len(), 2); + 
assert_eq!(region_routes[0].region.id, RegionId::new(table_id, 1)); + assert_eq!(region_routes[1].region.id, RegionId::new(table_id, 2)); + } + + #[tokio::test] + async fn test_repartition_rollback_removes_allocated_routes_from_allocate() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 50, 100).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + persistent_ctx.plans = vec![test_plan(table_id)]; + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(AllocateRegion::new(vec![])), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!(region_routes.len(), 2); + assert_eq!(region_routes[0].region.id, RegionId::new(table_id, 1)); + assert_eq!(region_routes[1].region.id, RegionId::new(table_id, 2)); + } + + #[tokio::test] + async fn test_repartition_rollback_from_collect_only_removes_failed_allocated_routes() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + 
test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + test_region_route(RegionId::new(table_id, 4), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2, 3, 4]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + let failed_plan = test_plan(table_id); + let succeeded_plan = RepartitionPlanEntry { + group_id: Uuid::new_v4(), + source_regions: vec![RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 100, 200), + }], + target_regions: vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 100, 150), + }, + RegionDescriptor { + region_id: RegionId::new(table_id, 4), + partition_expr: range_expr("x", 150, 200), + }, + ], + allocated_region_ids: vec![RegionId::new(table_id, 4)], + pending_deallocate_region_ids: vec![], + transition_map: vec![vec![0]], + }; + persistent_ctx.plans = vec![failed_plan, succeeded_plan]; + persistent_ctx.failed_procedures = vec![ProcedureMeta { + plan_index: 0, + group_id: persistent_ctx.plans[0].group_id, + procedure_id: ProcedureId::random(), + }]; + + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(Collect::new(vec![])), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!(region_routes.len(), 3); + assert_eq!(region_routes[0].region.id, RegionId::new(table_id, 1)); + 
assert_eq!(region_routes[1].region.id, RegionId::new(table_id, 2)); + assert_eq!(region_routes[2].region.id, RegionId::new(table_id, 4)); + } + + #[tokio::test] + async fn test_repartition_rollback_is_idempotent() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 50, 100).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + persistent_ctx.plans = vec![test_plan(table_id)]; + persistent_ctx.failed_procedures = vec![ProcedureMeta { + plan_index: 0, + group_id: Uuid::new_v4(), + procedure_id: ProcedureId::random(), + }]; + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(Dispatch), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + let once = current_parent_region_routes(&procedure.context).await; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + let twice = current_parent_region_routes(&procedure.context).await; + + assert_eq!(once, twice); + assert_eq!(once.len(), 2); + assert_eq!(once[0].region.id, RegionId::new(table_id, 1)); + assert_eq!(once[1].region.id, RegionId::new(table_id, 2)); + } + + #[tokio::test] + async fn 
test_repartition_procedure_flow_split_failed_and_full_rollback() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); + + env.create_physical_table_metadata_for_repartition( + table_id, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ], + test_region_wal_options(&[1, 2]), + ) + .await; + + let context = new_parent_context(&env, node_manager, table_id); + let mut procedure = RepartitionProcedure::new( + vec![range_expr("x", 0, 100)], + vec![range_expr("x", 0, 50), range_expr("x", 50, 100)], + context, + ); + + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!start_status.need_persist()); + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(start_status.need_persist()); + assert_parent_state::(&procedure); + + let allocate_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(allocate_status.need_persist()); + assert_parent_state::(&procedure); + assert_eq!(procedure.context.persistent_ctx.plans.len(), 1); + let plan = &procedure.context.persistent_ctx.plans[0]; + let expected_plan = test_plan(table_id); + assert_eq!(plan.source_regions, expected_plan.source_regions); + assert_eq!(plan.target_regions, expected_plan.target_regions); + assert_eq!( + plan.allocated_region_ids, + expected_plan.allocated_region_ids + ); + assert_eq!( + plan.pending_deallocate_region_ids, + expected_plan.pending_deallocate_region_ids + ); + assert_eq!(plan.transition_map, expected_plan.transition_map); + assert_eq!( + current_parent_region_routes(&procedure.context).await, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + 
), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(0)), + ..Default::default() + }, + ] + ); + + let dispatch_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!dispatch_status.need_persist()); + let subprocedure_ids = extract_subprocedure_ids(dispatch_status); + assert_eq!(subprocedure_ids.len(), 1); + assert_parent_state::(&procedure); + + let failed_state = ProcedureState::failed(Arc::new(ProcedureError::external( + MockError::new(StatusCode::Internal), + ))); + let collect_ctx = procedure_context_with_receivers(HashMap::from([( + subprocedure_ids[0], + procedure_state_receiver(failed_state), + )])); + + let err = procedure.execute(&collect_ctx).await.unwrap_err(); + assert!(!err.is_retry_later()); + assert_parent_state::(&procedure); + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!( + region_routes, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ] + ); + } + + #[tokio::test] + async fn test_repartition_procedure_flow_split_allocate_retryable_then_resume() { + common_telemetry::init_default_ut_logging(); + let env = TestingEnv::new(); + let table_id = 1024; + let (tx, _rx) = mpsc::channel(8); + let should_retry = Arc::new(AtomicBool::new(true)); + let datanode_handler = DatanodeWatcher::new(tx).with_handler(move |_, _| { + if should_retry.swap(false, Ordering::SeqCst) { + return Err(error::Error::RetryLater { + source: BoxedError::new( + error::UnexpectedSnafu { + 
err_msg: "retry later", + } + .build(), + ), + clean_poisons: false, + }); + } + + Ok(api::region::RegionResponse::new(0)) + }); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); + + env.create_physical_table_metadata_for_repartition( + table_id, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ], + test_region_wal_options(&[1, 2]), + ) + .await; + + let context = new_parent_context(&env, node_manager, table_id); + let mut procedure = RepartitionProcedure::new( + vec![range_expr("x", 0, 100)], + vec![range_expr("x", 0, 50), range_expr("x", 50, 100)], + context, + ); + + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!start_status.need_persist()); + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(start_status.need_persist()); + assert_parent_state::(&procedure); + + let err = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap_err(); + assert!(err.is_retry_later()); + assert_parent_state::(&procedure); + assert!(!procedure.context.persistent_ctx.plans.is_empty()); + assert_eq!( + current_parent_region_routes(&procedure.context).await, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ] + ); + + let allocate_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(allocate_status.need_persist()); + assert_parent_state::(&procedure); + + assert_eq!(procedure.context.persistent_ctx.plans.len(), 1); + let plan = &procedure.context.persistent_ctx.plans[0]; + let expected_plan = test_plan(table_id); + 
assert_eq!(plan.source_regions, expected_plan.source_regions); + assert_eq!(plan.target_regions, expected_plan.target_regions); + assert_eq!( + plan.allocated_region_ids, + expected_plan.allocated_region_ids + ); + assert_eq!(plan.transition_map, expected_plan.transition_map); + assert_eq!( + current_parent_region_routes(&procedure.context).await, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(0)), + ..Default::default() + }, + ] + ); + + let dispatch_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!dispatch_status.need_persist()); + let subprocedure_ids = extract_subprocedure_ids(dispatch_status); + assert_eq!(subprocedure_ids.len(), 1); + assert_parent_state::(&procedure); + } +} diff --git a/src/meta-srv/src/procedure/repartition/allocate_region.rs b/src/meta-srv/src/procedure/repartition/allocate_region.rs index b1bf93d986..12ffac9918 100644 --- a/src/meta-srv/src/procedure/repartition/allocate_region.rs +++ b/src/meta-srv/src/procedure/repartition/allocate_region.rs @@ -21,12 +21,11 @@ use common_meta::ddl::create_table::template::{ }; use common_meta::lock_key::TableLock; use common_meta::node_manager::NodeManagerRef; -use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; -use common_meta::rpc::router::{RegionRoute, operating_leader_regions}; +use common_meta::rpc::router::RegionRoute; use common_procedure::{Context as ProcedureContext, Status}; -use common_telemetry::info; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt}; +use common_telemetry::{debug, info}; +use serde::{Deserialize, 
Deserializer, Serialize}; +use snafu::ResultExt; use store_api::storage::{RegionNumber, TableId}; use table::metadata::TableInfo; use table::table_reference::TableReference; @@ -40,14 +39,103 @@ use crate::procedure::repartition::plan::{ }; use crate::procedure::repartition::{Context, State}; +#[derive(Debug, Clone, Serialize)] +pub enum AllocateRegion { + Build(BuildPlan), + Execute(ExecutePlan), +} + +impl<'de> Deserialize<'de> for AllocateRegion { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + enum CurrentAllocateRegion { + Build(BuildPlan), + Execute(ExecutePlan), + } + + #[derive(Deserialize)] + struct LegacyAllocateRegion { + plan_entries: Vec, + } + + #[derive(Deserialize)] + #[serde(untagged)] + enum AllocateRegionRepr { + Current(CurrentAllocateRegion), + Legacy(LegacyAllocateRegion), + } + + match AllocateRegionRepr::deserialize(deserializer)? { + AllocateRegionRepr::Current(CurrentAllocateRegion::Build(build_plan)) => { + Ok(Self::Build(build_plan)) + } + AllocateRegionRepr::Current(CurrentAllocateRegion::Execute(execute_plan)) => { + Ok(Self::Execute(execute_plan)) + } + AllocateRegionRepr::Legacy(legacy) => Ok(Self::Build(BuildPlan { + plan_entries: legacy.plan_entries, + })), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AllocateRegion { +pub struct BuildPlan { plan_entries: Vec, } -#[async_trait::async_trait] -#[typetag::serde] -impl State for AllocateRegion { +impl BuildPlan { + async fn next( + &mut self, + ctx: &mut Context, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let timer = Instant::now(); + let table_id = ctx.persistent_ctx.table_id; + let table_route_value = ctx.get_table_route_value().await?; + let mut next_region_number = + AllocateRegion::get_next_region_number(table_route_value.max_region_number().unwrap()); + + // Converts allocation plan to repartition plan. 
+ let repartition_plan_entries = AllocateRegion::convert_to_repartition_plans( + table_id, + &mut next_region_number, + &self.plan_entries, + ); + let plan_count = repartition_plan_entries.len(); + let to_allocate = AllocateRegion::count_regions_to_allocate(&repartition_plan_entries); + info!( + "Repartition allocate regions start, table_id: {}, groups: {}, regions_to_allocate: {}", + table_id, plan_count, to_allocate + ); + + // If no region to allocate, directly dispatch the plan. + if AllocateRegion::count_regions_to_allocate(&repartition_plan_entries) == 0 { + ctx.persistent_ctx.plans = repartition_plan_entries; + ctx.update_allocate_region_elapsed(timer.elapsed()); + return Ok((Box::new(Dispatch), Status::executing(true))); + } + + ctx.persistent_ctx.plans = repartition_plan_entries; + debug!( + "Repartition allocate regions build plan completed, table_id: {}, elapsed: {:?}", + table_id, + timer.elapsed() + ); + Ok(( + Box::new(AllocateRegion::Execute(ExecutePlan)), + Status::executing(true), + )) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExecutePlan; + +impl ExecutePlan { async fn next( &mut self, ctx: &mut Context, @@ -55,36 +143,13 @@ impl State for AllocateRegion { ) -> Result<(Box, Status)> { let timer = Instant::now(); let table_id = ctx.persistent_ctx.table_id; + let allocate_regions = AllocateRegion::collect_allocate_regions(&ctx.persistent_ctx.plans); + let region_number_and_partition_exprs = + AllocateRegion::prepare_region_allocation_data(&allocate_regions)?; + let table_info_value = ctx.get_table_info_value().await?; let table_route_value = ctx.get_table_route_value().await?; // Safety: it is physical table route value. let region_routes = table_route_value.region_routes().unwrap(); - let mut next_region_number = - Self::get_next_region_number(table_route_value.max_region_number().unwrap()); - - // Converts allocation plan to repartition plan. 
- let repartition_plan_entries = Self::convert_to_repartition_plans( - table_id, - &mut next_region_number, - &self.plan_entries, - ); - let plan_count = repartition_plan_entries.len(); - let to_allocate = Self::count_regions_to_allocate(&repartition_plan_entries); - info!( - "Repartition allocate regions start, table_id: {}, groups: {}, regions_to_allocate: {}", - table_id, plan_count, to_allocate - ); - - // If no region to allocate, directly dispatch the plan. - if Self::count_regions_to_allocate(&repartition_plan_entries) == 0 { - ctx.persistent_ctx.plans = repartition_plan_entries; - ctx.update_allocate_region_elapsed(timer.elapsed()); - return Ok((Box::new(Dispatch), Status::executing(true))); - } - - let allocate_regions = Self::collect_allocate_regions(&repartition_plan_entries); - let region_number_and_partition_exprs = - Self::prepare_region_allocation_data(&allocate_regions)?; - let table_info_value = ctx.get_table_info_value().await?; let new_allocated_region_routes = ctx .region_routes_allocator .allocate( @@ -122,12 +187,13 @@ impl State for AllocateRegion { table_id, new_region_count, new_regions_brief ); - let _operating_guards = Self::register_operating_regions( + // The table route metadata is not updated yet; register it in memory for region lease renewal. + let _operating_guards = Context::register_operating_regions( &ctx.memory_region_keeper, &new_allocated_region_routes, )?; // Allocates the regions on datanodes. - Self::allocate_regions( + AllocateRegion::allocate_regions( &ctx.node_manager, &table_info_value.table_info, &new_allocated_region_routes, @@ -135,21 +201,33 @@ impl State for AllocateRegion { ) .await?; - // TODO(weny): for metric engine, sync logical regions from the the central region. - // Updates the table routes. 
let table_lock = TableLock::Write(table_id).into(); let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; let new_region_routes = - Self::generate_region_routes(region_routes, &new_allocated_region_routes); + AllocateRegion::generate_region_routes(region_routes, &new_allocated_region_routes); ctx.update_table_route(&table_route_value, new_region_routes, wal_options) .await?; ctx.invalidate_table_cache().await?; - ctx.persistent_ctx.plans = repartition_plan_entries; ctx.update_allocate_region_elapsed(timer.elapsed()); Ok((Box::new(Dispatch), Status::executing(true))) } +} + +#[async_trait::async_trait] +#[typetag::serde] +impl State for AllocateRegion { + async fn next( + &mut self, + ctx: &mut Context, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + match self { + AllocateRegion::Build(build_plan) => build_plan.next(ctx, procedure_ctx).await, + AllocateRegion::Execute(execute_plan) => execute_plan.next(ctx, procedure_ctx).await, + } + } fn as_any(&self) -> &dyn Any { self @@ -158,24 +236,7 @@ impl State for AllocateRegion { impl AllocateRegion { pub fn new(plan_entries: Vec) -> Self { - Self { plan_entries } - } - - fn register_operating_regions( - memory_region_keeper: &MemoryRegionKeeperRef, - region_routes: &[RegionRoute], - ) -> Result> { - let mut operating_guards = Vec::with_capacity(region_routes.len()); - for (region_id, datanode_id) in operating_leader_regions(region_routes) { - let guard = memory_region_keeper - .register(datanode_id, region_id) - .context(error::RegionOperatingRaceSnafu { - peer_id: datanode_id, - region_id, - })?; - operating_guards.push(guard); - } - Ok(operating_guards) + AllocateRegion::Build(BuildPlan { plan_entries }) } fn generate_region_routes( @@ -300,6 +361,7 @@ mod tests { use uuid::Uuid; use super::*; + use crate::procedure::repartition::State; use crate::procedure::repartition::test_util::range_expr; fn create_region_descriptor( @@ -488,4 +550,71 @@ mod tests { 
assert!(!result[0].1.is_empty()); assert!(!result[1].1.is_empty()); } + + #[test] + fn test_allocate_region_state_backward_compatibility() { + // Arrange + let serialized = r#"{"repartition_state":"AllocateRegion","plan_entries":[]}"#; + + // Act + let state: Box = serde_json::from_str(serialized).unwrap(); + + // Assert + let allocate_region = state + .as_any() + .downcast_ref::() + .expect("expected AllocateRegion state"); + match allocate_region { + AllocateRegion::Build(build_plan) => assert!(build_plan.plan_entries.is_empty()), + AllocateRegion::Execute(_) => panic!("expected build plan"), + } + } + + #[test] + fn test_allocate_region_state_round_trip() { + // Arrange + let state: Box = Box::new(AllocateRegion::new(vec![])); + + // Act + let serialized = serde_json::to_string(&state).unwrap(); + let deserialized: Box = serde_json::from_str(&serialized).unwrap(); + + // Assert + assert_eq!( + serialized, + r#"{"repartition_state":"AllocateRegion","Build":{"plan_entries":[]}}"# + ); + let allocate_region = deserialized + .as_any() + .downcast_ref::() + .expect("expected AllocateRegion state"); + match allocate_region { + AllocateRegion::Build(build_plan) => assert!(build_plan.plan_entries.is_empty()), + AllocateRegion::Execute(_) => panic!("expected build plan"), + } + } + + #[test] + fn test_allocate_region_execute_state_round_trip() { + // Arrange + let state: Box = Box::new(AllocateRegion::Execute(ExecutePlan)); + + // Act + let serialized = serde_json::to_string(&state).unwrap(); + let deserialized: Box = serde_json::from_str(&serialized).unwrap(); + + // Assert + assert_eq!( + serialized, + r#"{"repartition_state":"AllocateRegion","Execute":null}"# + ); + let allocate_region = deserialized + .as_any() + .downcast_ref::() + .expect("expected AllocateRegion state"); + match allocate_region { + AllocateRegion::Execute(_) => {} + AllocateRegion::Build(_) => panic!("expected execute plan"), + } + } } diff --git a/src/meta-srv/src/procedure/repartition/collect.rs 
b/src/meta-srv/src/procedure/repartition/collect.rs index d413158b94..1a6d0c6257 100644 --- a/src/meta-srv/src/procedure/repartition/collect.rs +++ b/src/meta-srv/src/procedure/repartition/collect.rs @@ -94,17 +94,28 @@ impl State for Collect { } } - let inflight = self.inflight_procedures.len(); let succeeded = self.succeeded_procedures.len(); let failed = self.failed_procedures.len(); let unknown = self.unknown_procedures.len(); info!( - "Collected repartition group results for table_id: {}, inflight: {}, succeeded: {}, failed: {}, unknown: {}", - table_id, inflight, succeeded, failed, unknown + "Collected repartition group results for table_id: {}, succeeded: {}, failed: {}, unknown: {}", + table_id, succeeded, failed, unknown ); if failed > 0 || unknown > 0 { - // TODO(weny): retry the failed or unknown procedures. + ctx.persistent_ctx + .failed_procedures + .extend(self.failed_procedures.iter()); + ctx.persistent_ctx + .unknown_procedures + .extend(self.unknown_procedures.iter()); + return crate::error::UnexpectedSnafu { + violated: format!( + "Repartition groups failed or became unknown, table_id: {}, failed: {}, unknown: {}", + table_id, failed, unknown + ), + } + .fail(); } if let Some(start_time) = ctx.volatile_ctx.dispatch_start_time.take() { @@ -118,3 +129,139 @@ impl State for Collect { self } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_error::mock::MockError; + use common_error::status_code::StatusCode; + use common_meta::test_util::MockDatanodeManager; + use common_procedure::{ + Context as ProcedureContext, ContextProvider, Error as ProcedureError, ProcedureId, + ProcedureState, + }; + use common_procedure_test::MockContextProvider; + use tokio::sync::watch; + + use super::*; + use crate::procedure::repartition::PersistentContext; + use crate::procedure::repartition::test_util::TestingEnv; + + struct FailedProcedureContextProvider { + receiver: watch::Receiver, + inner: MockContextProvider, + } + + #[async_trait::async_trait] 
+ impl ContextProvider for FailedProcedureContextProvider { + async fn procedure_state( + &self, + procedure_id: ProcedureId, + ) -> common_procedure::Result> { + self.inner.procedure_state(procedure_id).await + } + + async fn procedure_state_receiver( + &self, + _procedure_id: ProcedureId, + ) -> common_procedure::Result>> { + Ok(Some(self.receiver.clone())) + } + + async fn try_put_poison( + &self, + key: &common_procedure::PoisonKey, + procedure_id: ProcedureId, + ) -> common_procedure::Result<()> { + self.inner.try_put_poison(key, procedure_id).await + } + + async fn acquire_lock( + &self, + key: &common_procedure::StringKey, + ) -> common_procedure::local::DynamicKeyLockGuard { + self.inner.acquire_lock(key).await + } + } + + #[tokio::test] + async fn test_collect_returns_error_when_unknown_exists() { + let env = TestingEnv::new(); + let ddl_ctx = env.ddl_context(Arc::new(MockDatanodeManager::new(()))); + let persistent_ctx = PersistentContext::new( + table::table_name::TableName::new("test_catalog", "test_schema", "test_table"), + 1024, + None, + ); + let mut ctx = crate::procedure::repartition::Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut state = Collect { + inflight_procedures: vec![], + succeeded_procedures: vec![], + failed_procedures: vec![], + unknown_procedures: vec![ProcedureMeta { + plan_index: 0, + group_id: uuid::Uuid::new_v4(), + procedure_id: common_procedure::ProcedureId::random(), + }], + }; + + let err = state + .next(&mut ctx, &TestingEnv::procedure_context()) + .await + .unwrap_err(); + + assert!(!err.is_retryable()); + } + + #[tokio::test] + async fn test_collect_returns_error_when_failed_exists() { + let env = TestingEnv::new(); + let ddl_ctx = env.ddl_context(Arc::new(MockDatanodeManager::new(()))); + let persistent_ctx = PersistentContext::new( + table::table_name::TableName::new("test_catalog", "test_schema", "test_table"), + 1024, + None, + ); + let mut ctx = 
crate::procedure::repartition::Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let procedure_id = common_procedure::ProcedureId::random(); + let (tx, rx) = watch::channel(ProcedureState::Running); + tx.send(ProcedureState::failed(Arc::new(ProcedureError::external( + MockError::new(StatusCode::Internal), + )))) + .unwrap(); + let procedure_ctx = ProcedureContext { + procedure_id: ProcedureId::random(), + provider: Arc::new(FailedProcedureContextProvider { + receiver: rx, + inner: MockContextProvider::default(), + }), + }; + let mut state = Collect { + inflight_procedures: vec![ProcedureMeta { + plan_index: 0, + group_id: uuid::Uuid::new_v4(), + procedure_id, + }], + succeeded_procedures: vec![], + failed_procedures: vec![], + unknown_procedures: vec![], + }; + + let err = state.next(&mut ctx, &procedure_ctx).await.unwrap_err(); + + assert_eq!(state.failed_procedures.len(), 1); + assert_eq!(state.unknown_procedures.len(), 0); + assert!(!err.is_retryable()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/deallocate_region.rs b/src/meta-srv/src/procedure/repartition/deallocate_region.rs index 12233c27e7..3f5dc5bd8e 100644 --- a/src/meta-srv/src/procedure/repartition/deallocate_region.rs +++ b/src/meta-srv/src/procedure/repartition/deallocate_region.rs @@ -88,7 +88,8 @@ impl State for DeallocateRegion { &ctx.persistent_ctx.schema_name, &ctx.persistent_ctx.table_name, ); - // Deallocates the regions on datanodes. + // Memory guards are not required here, + // because the table metadata still contains routes for the deallocating regions. 
Self::deallocate_regions( &ctx.node_manager, &ctx.leader_region_registry, @@ -116,7 +117,7 @@ impl State for DeallocateRegion { } impl DeallocateRegion { - async fn deallocate_regions( + pub(crate) async fn deallocate_regions( node_manager: &NodeManagerRef, leader_region_registry: &LeaderRegionRegistryRef, table: TableName, @@ -141,7 +142,7 @@ impl DeallocateRegion { Ok(()) } - fn filter_deallocatable_region_routes( + pub(crate) fn filter_deallocatable_region_routes( table_id: TableId, region_routes: &[RegionRoute], pending_deallocate_region_ids: &HashSet, @@ -165,7 +166,7 @@ impl DeallocateRegion { .collect::>() } - fn generate_region_routes( + pub(crate) fn generate_region_routes( region_routes: &[RegionRoute], pending_deallocate_region_ids: &HashSet, ) -> Vec { @@ -181,12 +182,21 @@ impl DeallocateRegion { #[cfg(test)] mod tests { use std::collections::HashSet; + use std::sync::Arc; + use common_meta::ddl::test_util::datanode_handler::RetryErrorDatanodeHandler; use common_meta::peer::Peer; use common_meta::rpc::router::{Region, RegionRoute}; + use common_meta::test_util::MockDatanodeManager; use store_api::storage::{RegionId, TableId}; + use crate::error::Error; + use crate::procedure::repartition::State; use crate::procedure::repartition::deallocate_region::DeallocateRegion; + use crate::procedure::repartition::plan::RepartitionPlanEntry; + use crate::procedure::repartition::test_util::{ + TestingEnv, current_parent_region_routes, new_parent_context, + }; fn test_region_routes(table_id: TableId) -> Vec { vec![ @@ -238,4 +248,36 @@ mod tests { assert_eq!(new_region_routes.len(), 1); assert_eq!(new_region_routes[0].region.id, RegionId::new(table_id, 2)); } + + #[tokio::test] + async fn test_next_retryable_when_deallocate_regions_retry_later() { + let env = TestingEnv::new(); + let table_id = 1024; + let original_routes = test_region_routes(table_id); + + env.create_physical_table_metadata(table_id, original_routes.clone()) + .await; + + let node_manager = 
Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler)); + let mut ctx = new_parent_context(&env, node_manager, table_id); + ctx.persistent_ctx.plans = vec![RepartitionPlanEntry { + group_id: uuid::Uuid::new_v4(), + source_regions: vec![], + target_regions: vec![], + allocated_region_ids: vec![], + pending_deallocate_region_ids: vec![RegionId::new(table_id, 1)], + transition_map: vec![], + }]; + + let mut state = DeallocateRegion; + + let err = state + .next(&mut ctx, &TestingEnv::procedure_context()) + .await + .unwrap_err(); + + assert!(matches!(err, Error::DeallocateRegions { .. })); + assert!(err.is_retryable()); + assert_eq!(current_parent_region_routes(&ctx).await, original_routes); + } } diff --git a/src/meta-srv/src/procedure/repartition/dispatch.rs b/src/meta-srv/src/procedure/repartition/dispatch.rs index 02dc73362d..3a9f9376f1 100644 --- a/src/meta-srv/src/procedure/repartition/dispatch.rs +++ b/src/meta-srv/src/procedure/repartition/dispatch.rs @@ -31,7 +31,7 @@ use crate::procedure::repartition::{self, Context, State}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Dispatch; -fn build_region_mapping( +pub(crate) fn build_region_mapping( source_regions: &[RegionDescriptor], target_regions: &[RegionDescriptor], transition_map: &[Vec], @@ -106,7 +106,11 @@ impl State for Dispatch { Ok(( Box::new(Collect::new(procedure_metas)), - Status::suspended(procedures, true), + // The state is not persisted after sub-procedures are spawned. + // If metasrv restarts before all sub-procedures complete, + // it restores from the `Dispatch` state and re-dispatches them. + // This is safe because the sub-procedures are idempotent. 
+ Status::suspended(procedures, false), )) } diff --git a/src/meta-srv/src/procedure/repartition/group.rs b/src/meta-srv/src/procedure/repartition/group.rs index f0cb1c4dd0..e5a06f79a8 100644 --- a/src/meta-srv/src/procedure/repartition/group.rs +++ b/src/meta-srv/src/procedure/repartition/group.rs @@ -41,14 +41,18 @@ use common_procedure::{ Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, Result as ProcedureResult, Status, StringKey, UserMetadata, }; -use common_telemetry::{error, info}; +use common_telemetry::{error, info, warn}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; use store_api::storage::{RegionId, TableId}; use uuid::Uuid; use crate::error::{self, Result}; +use crate::procedure::repartition::group::apply_staging_manifest::ApplyStagingManifest; +use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion; +use crate::procedure::repartition::group::remap_manifest::RemapManifest; use crate::procedure::repartition::group::repartition_start::RepartitionStart; +use crate::procedure::repartition::group::update_metadata::UpdateMetadata; use crate::procedure::repartition::plan::RegionDescriptor; use crate::procedure::repartition::utils::get_datanode_table_value; use crate::procedure::repartition::{self}; @@ -192,6 +196,62 @@ impl RepartitionGroupProcedure { Ok(Self { state, context }) } + + async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { + if !self.should_rollback_metadata() { + return Ok(()); + } + + let table_lock = + common_meta::lock_key::TableLock::Write(self.context.persistent_ctx.table_id).into(); + let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; + UpdateMetadata::RollbackStaging + .rollback_staging_regions(&mut self.context) + .await?; + + if let Err(err) = self.context.invalidate_table_cache().await { + warn!( + err; + "Failed to broadcast the invalidate table cache message during repartition group rollback" + ); + } + + 
Ok(()) + } + + /// Returns whether group rollback should revert staging metadata. + /// + /// This uses an "after metadata apply, before exit staging" semantic. + /// Once execution reaches `UpdateMetadata::ApplyStaging` or any later staging state, + /// rollback must restore table-route metadata back to the pre-apply view. + /// + /// State flow: + /// `RepartitionStart -> SyncRegion -> UpdateMetadata::ApplyStaging -> EnterStagingRegion` + /// ` -> RemapManifest -> ApplyStagingManifest -> UpdateMetadata::ExitStaging -> RepartitionEnd` + /// ` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^` + /// ` rollback staging metadata` + /// + /// Notes: + /// - `RepartitionStart` / `SyncRegion`: no-op, metadata has not been staged yet. + /// - `UpdateMetadata::ApplyStaging` / `EnterStagingRegion` / `RemapManifest` / + /// `ApplyStagingManifest` / `UpdateMetadata::RollbackStaging`: rollback-active. + /// - `UpdateMetadata::ExitStaging` / `RepartitionEnd`: excluded, because metadata has + /// already moved into the post-commit exit path. 
+ fn should_rollback_metadata(&self) -> bool { + self.state.as_any().is::() + || self.state.as_any().is::() + || self.state.as_any().is::() + || self + .state + .as_any() + .downcast_ref::() + .is_some_and(|state| { + matches!( + state, + UpdateMetadata::ApplyStaging | UpdateMetadata::RollbackStaging + ) + }) + } } #[async_trait::async_trait] @@ -200,6 +260,12 @@ impl Procedure for RepartitionGroupProcedure { Self::TYPE_NAME } + async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { + self.rollback_inner(ctx) + .await + .map_err(ProcedureError::external) + } + #[tracing::instrument(skip_all, fields( state = %self.state.name(), table_id = self.context.persistent_ctx.table_id, @@ -238,7 +304,7 @@ impl Procedure for RepartitionGroupProcedure { } fn rollback_supported(&self) -> bool { - false + true } fn dump(&self) -> ProcedureResult { @@ -304,7 +370,7 @@ impl Context { pub struct GroupPrepareResult { /// The validated source region routes. pub source_routes: Vec, - /// The validated target region routes. + /// Validated target region routes used for metadata rollback (logical rollback). pub target_routes: Vec, /// The primary source region id (first source region), used for retrieving region options. 
pub central_region: RegionId, @@ -599,12 +665,149 @@ pub(crate) trait State: Sync + Send + Debug { mod tests { use std::assert_matches; use std::sync::Arc; + use std::time::Duration; use common_meta::key::TableMetadataManager; use common_meta::kv_backend::test_util::MockKvBackendBuilder; + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId}; + use common_procedure_test::MockContextProvider; + use partition::expr::PartitionExpr; + use store_api::storage::RegionId; + use super::{ + Context, PersistentContext, RepartitionGroupProcedure, RepartitionStart, State, + region_routes, + }; use crate::error::Error; - use crate::procedure::repartition::test_util::{TestingEnv, new_persistent_context}; + use crate::procedure::repartition::dispatch::build_region_mapping; + use crate::procedure::repartition::group::apply_staging_manifest::ApplyStagingManifest; + use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion; + use crate::procedure::repartition::group::remap_manifest::RemapManifest; + use crate::procedure::repartition::group::repartition_start::RepartitionStart as GroupRepartitionStart; + use crate::procedure::repartition::group::sync_region::SyncRegion; + use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::plan; + use crate::procedure::repartition::repartition_start::RepartitionStart as ParentRepartitionStart; + use crate::procedure::repartition::test_util::{ + TestingEnv, new_persistent_context, range_expr, + }; + + struct GroupRollbackFixture { + context: Context, + original_region_routes: Vec, + next_state: Option>, + } + + async fn new_group_rollback_fixture( + original_region_routes: Vec, + from_exprs: Vec, + to_exprs: Vec, + sync_region: bool, + ) -> GroupRollbackFixture { + let env = TestingEnv::new(); + let procedure_ctx = TestingEnv::procedure_context(); + let table_id 
= 1024; + let mut next_region_number = 10; + + env.create_physical_table_metadata(table_id, original_region_routes.clone()) + .await; + + let (_, physical_route) = env + .table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await + .unwrap(); + let allocation_plans = + ParentRepartitionStart::build_plan(&physical_route, &from_exprs, &to_exprs).unwrap(); + assert_eq!(allocation_plans.len(), 1); + + let repartition_plan = plan::convert_allocation_plan_to_repartition_plan( + table_id, + &mut next_region_number, + &allocation_plans[0], + ); + let region_mapping = build_region_mapping( + &repartition_plan.source_regions, + &repartition_plan.target_regions, + &repartition_plan.transition_map, + ); + let persistent_context = PersistentContext::new( + repartition_plan.group_id, + table_id, + "test_catalog".to_string(), + "test_schema".to_string(), + repartition_plan.source_regions, + repartition_plan.target_regions, + region_mapping, + sync_region, + repartition_plan.allocated_region_ids, + repartition_plan.pending_deallocate_region_ids, + Duration::from_secs(120), + ); + let mut context = env.create_context(persistent_context); + let (next_state, _) = GroupRepartitionStart + .next(&mut context, &procedure_ctx) + .await + .unwrap(); + + GroupRollbackFixture { + context, + original_region_routes, + next_state: Some(next_state), + } + } + + async fn new_split_group_rollback_fixture(sync_region: bool) -> GroupRollbackFixture { + new_group_rollback_fixture( + vec![ + new_region_route(RegionId::new(1024, 1), Some(range_expr("x", 0, 100))), + new_region_route(RegionId::new(1024, 2), Some(range_expr("x", 100, 200))), + new_region_route(RegionId::new(1024, 10), None), + ], + vec![range_expr("x", 0, 100)], + vec![range_expr("x", 0, 50), range_expr("x", 50, 100)], + sync_region, + ) + .await + } + + async fn new_merge_group_rollback_fixture(sync_region: bool) -> GroupRollbackFixture { + new_group_rollback_fixture( + vec![ + 
new_region_route(RegionId::new(1024, 1), Some(range_expr("x", 0, 100))), + new_region_route(RegionId::new(1024, 2), Some(range_expr("x", 100, 200))), + new_region_route(RegionId::new(1024, 3), Some(range_expr("x", 200, 300))), + ], + vec![range_expr("x", 0, 100), range_expr("x", 100, 200)], + vec![range_expr("x", 0, 200)], + sync_region, + ) + .await + } + + async fn stage_metadata(context: &mut Context) { + UpdateMetadata::ApplyStaging + .apply_staging_regions(context) + .await + .unwrap(); + } + + fn new_region_route(region_id: RegionId, partition_expr: Option) -> RegionRoute { + RegionRoute { + region: Region { + id: region_id, + partition_expr: partition_expr + .map(|expr| expr.as_json_str().unwrap()) + .unwrap_or_default(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + } + } #[tokio::test] async fn test_get_table_route_value_not_found_error() { @@ -653,4 +856,198 @@ mod tests { let err = ctx.get_datanode_table_value(1024, 1).await.unwrap_err(); assert!(err.is_retryable()); } + + #[tokio::test] + async fn test_group_rollback_supported() { + let env = TestingEnv::new(); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let procedure = RepartitionGroupProcedure { + state: Box::new(RepartitionStart), + context: env.create_context(persistent_context), + }; + + assert!(procedure.rollback_supported()); + } + + #[tokio::test] + async fn test_group_rollback_is_noop_before_apply_staging() { + let env = TestingEnv::new(); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context.clone()); + let mut procedure = RepartitionGroupProcedure { + state: Box::new(RepartitionStart), + context: ctx, + }; + let provider = Arc::new(MockContextProvider::new(Default::default())); + let procedure_ctx = ProcedureContext { + procedure_id: ProcedureId::random(), + provider, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + + 
assert!(procedure.state.as_any().is::()); + assert_eq!(procedure.context.persistent_ctx, persistent_context); + } + + async fn assert_noop_rollback( + fixture: GroupRollbackFixture, + state: Box, + assert_state: impl FnOnce(&dyn State), + ) { + let original_region_routes = fixture.original_region_routes.clone(); + let procedure_ctx = TestingEnv::procedure_context(); + let mut procedure = RepartitionGroupProcedure { + state, + context: fixture.context, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + + assert_state(&*procedure.state); + let table_route_value = procedure + .context + .get_table_route_value() + .await + .unwrap() + .into_inner(); + let region_routes = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap(); + assert_eq!(region_routes.clone(), original_region_routes); + } + + async fn assert_metadata_rollback_restores_table_route( + mut fixture: GroupRollbackFixture, + state: Box, + ) { + let original_region_routes = fixture.original_region_routes.clone(); + let procedure_ctx = TestingEnv::procedure_context(); + stage_metadata(&mut fixture.context).await; + let mut procedure = RepartitionGroupProcedure { + state, + context: fixture.context, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + + let table_route_value = procedure + .context + .get_table_route_value() + .await + .unwrap() + .into_inner(); + let region_routes = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap(); + assert_eq!(region_routes.clone(), original_region_routes); + } + + #[tokio::test] + async fn test_group_rollback_is_noop_in_sync_region() { + let mut fixture = new_split_group_rollback_fixture(true).await; + assert!( + fixture + .next_state + .as_ref() + .unwrap() + .as_any() + .is::() + ); + let state = fixture.next_state.take().unwrap(); + + assert_noop_rollback(fixture, state, |state| { + assert!(state.as_any().is::()); + }) + .await; + } + + #[tokio::test] + async fn 
test_group_rollback_is_noop_in_exit_staging() { + let fixture = new_split_group_rollback_fixture(false).await; + + assert_noop_rollback(fixture, Box::new(UpdateMetadata::ExitStaging), |state| { + assert!(state.as_any().is::()); + assert!(matches!( + state.as_any().downcast_ref::(), + Some(UpdateMetadata::ExitStaging) + )); + }) + .await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_apply_staging() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route( + fixture, + Box::new(UpdateMetadata::ApplyStaging), + ) + .await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_enter_staging_region() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route(fixture, Box::new(EnterStagingRegion)).await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_remap_manifest() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route(fixture, Box::new(RemapManifest)).await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_apply_staging_manifest() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route(fixture, Box::new(ApplyStagingManifest)) + .await; + } + + #[tokio::test] + async fn test_group_rollback_restores_merge_routes_and_is_idempotent() { + let mut fixture = new_merge_group_rollback_fixture(false).await; + let original_region_routes = fixture.original_region_routes.clone(); + let procedure_ctx = TestingEnv::procedure_context(); + stage_metadata(&mut fixture.context).await; + let mut procedure = RepartitionGroupProcedure { + state: Box::new(UpdateMetadata::ApplyStaging), + context: fixture.context, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + let table_route_value = procedure + .context + 
.get_table_route_value() + .await + .unwrap() + .into_inner(); + let once = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap() + .clone(); + procedure.rollback(&procedure_ctx).await.unwrap(); + let table_route_value = procedure + .context + .get_table_route_value() + .await + .unwrap() + .into_inner(); + let twice = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap() + .clone(); + + assert_eq!(once, original_region_routes); + assert_eq!(once, twice); + } } diff --git a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs index 2020e9e2f4..43e5ee31d9 100644 --- a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs +++ b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs @@ -332,7 +332,14 @@ impl ApplyStagingManifest { ); Ok(()) - } + }, + Err(error::Error::MailboxChannelClosed {..})=> error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending apply staging manifests to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + }.fail()?, Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( "Mailbox received timeout for apply staging manifests on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs index 59de569c13..911e881ac3 100644 --- a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs @@ -315,7 +315,14 @@ impl EnterStagingRegion { ); Ok(()) - } + }, + Err(error::Error::MailboxChannelClosed {..})=> error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending enter staging regions to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + }.fail()?, Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( "Mailbox received timeout for enter staging regions on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs index 6e3460c2ce..1d6a75100e 100644 --- a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs +++ b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs @@ -184,6 +184,14 @@ impl RemapManifest { Self::handle_remap_manifest_reply(remap.region_id, reply, &now, peer) } + Err(error::Error::MailboxChannelClosed { .. }) => error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending remap manifests to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + } + .fail()?, Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( "Mailbox received timeout for remap manifests on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/sync_region.rs b/src/meta-srv/src/procedure/repartition/group/sync_region.rs index dcd58c21e9..7422ae8607 100644 --- a/src/meta-srv/src/procedure/repartition/group/sync_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/sync_region.rs @@ -273,6 +273,14 @@ impl SyncRegion { } Ok(()) } + Err(error::Error::MailboxChannelClosed { .. }) => error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending sync region to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + } + .fail()?, Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( "Mailbox received timeout for sync regions on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs index ecde5f0507..ff01161ff5 100644 --- a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs @@ -30,7 +30,7 @@ impl UpdateMetadata { /// Abort: /// - Target region not found. /// - Source region not found. - fn apply_staging_region_routes( + pub(crate) fn apply_staging_region_routes( group_id: GroupId, sources: &[RegionDescriptor], targets: &[RegionDescriptor], @@ -50,10 +50,12 @@ impl UpdateMetadata { region_id: target.region_id, }, )?; + // Set the new partition expression for the target region route. region_route.region.partition_expr = target .partition_expr .as_json_str() .context(error::SerializePartitionExprSnafu)?; + // Set leader staging state and write route policy for the target region route. 
region_route.set_leader_staging(); region_route.clear_ignore_all_writes(); } @@ -65,6 +67,7 @@ impl UpdateMetadata { region_id: source.region_id, }, )?; + // Set leader staging state for the source region route. region_route.set_leader_staging(); if pending_deallocate_region_ids.contains(&source.region_id) { // When a region is pending deallocation, it should ignore all writes. diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs index e9bef4cf8e..4e6bf67fc8 100644 --- a/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs @@ -18,10 +18,12 @@ use common_error::ext::BoxedError; use common_meta::rpc::router::RegionRoute; use common_telemetry::{error, info}; use snafu::{OptionExt, ResultExt}; +use store_api::storage::RegionId; use crate::error::{self, Result}; use crate::procedure::repartition::group::update_metadata::UpdateMetadata; use crate::procedure::repartition::group::{Context, GroupId, region_routes}; +use crate::procedure::repartition::plan::RegionDescriptor; impl UpdateMetadata { /// Rolls back the staging regions. @@ -31,8 +33,9 @@ impl UpdateMetadata { /// - Target region not found. 
fn rollback_staging_region_routes( group_id: GroupId, - source_routes: &[RegionRoute], - target_routes: &[RegionRoute], + sources: &[RegionDescriptor], + original_target_routes: &[RegionRoute], + pending_deallocate_region_ids: &[RegionId], current_region_routes: &[RegionRoute], ) -> Result> { let mut region_routes = current_region_routes.to_vec(); @@ -40,26 +43,35 @@ impl UpdateMetadata { .iter_mut() .map(|route| (route.region.id, route)) .collect::>(); - - for source in source_routes { - let region_route = region_routes_map.get_mut(&source.region.id).context( + for source in sources { + let region_route = region_routes_map.get_mut(&source.region_id).context( error::RepartitionSourceRegionMissingSnafu { group_id, - region_id: source.region.id, + region_id: source.region_id, }, )?; - region_route.region.partition_expr = source.region.partition_expr.clone(); + // Clean leader staging state for source regions. region_route.clear_leader_staging(); - region_route.clear_ignore_all_writes(); + if pending_deallocate_region_ids.contains(&source.region_id) { + // Clean ignore all writes state for source regions if it's pending to be deallocated, + // which means the source region is merged into the target region. + region_route.clear_ignore_all_writes(); + } } - for target in target_routes { + for target in original_target_routes { let region_route = region_routes_map.get_mut(&target.region.id).context( error::RepartitionTargetRegionMissingSnafu { group_id, region_id: target.region.id, }, )?; + + // Revert the partition expression and write route policy to the original value for the target region. + region_route.region.partition_expr = target.region.partition_expr.clone(); + region_route.write_route_policy = target.write_route_policy; + + // Clean leader staging state for target regions. 
region_route.clear_leader_staging(); } @@ -83,8 +95,9 @@ impl UpdateMetadata { let prepare_result = ctx.persistent_ctx.group_prepare_result.as_ref().unwrap(); let new_region_routes = Self::rollback_staging_region_routes( group_id, - &prepare_result.source_routes, + &ctx.persistent_ctx.sources, &prepare_result.target_routes, + &ctx.persistent_ctx.pending_deallocate_region_ids, region_routes, )?; @@ -113,87 +126,176 @@ impl UpdateMetadata { #[cfg(test)] mod tests { + use std::collections::HashSet; + use common_meta::peer::Peer; use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; use store_api::storage::RegionId; use uuid::Uuid; use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::plan::RegionDescriptor; use crate::procedure::repartition::test_util::range_expr; + fn new_region_route( + region_id: RegionId, + partition_expr: &str, + leader_state: Option, + ignore_all_writes: bool, + ) -> RegionRoute { + let mut route = RegionRoute { + region: Region { + id: region_id, + partition_expr: partition_expr.to_string(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state, + ..Default::default() + }; + + if ignore_all_writes { + route.set_ignore_all_writes(); + } + + route + } + + fn original_target_routes( + region_routes: &[RegionRoute], + targets: &[RegionDescriptor], + ) -> Vec { + let target_ids = targets + .iter() + .map(|target| target.region_id) + .collect::>(); + region_routes + .iter() + .filter(|route| target_ids.contains(&route.region.id)) + .cloned() + .collect() + } + #[test] - fn test_rollback_staging_region_routes() { + fn test_rollback_staging_region_routes_split_case() { let group_id = Uuid::new_v4(); let table_id = 1024; - let region_routes = vec![ - { - let mut route = RegionRoute { - region: Region { - id: RegionId::new(table_id, 1), - partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), - ..Default::default() - }, - leader_peer: 
Some(Peer::empty(1)), - leader_state: Some(LeaderState::Staging), - ..Default::default() - }; - route.set_ignore_all_writes(); - route + let original_region_routes = vec![ + new_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + None, + false, + ), + new_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + None, + false, + ), + new_region_route(RegionId::new(table_id, 3), "", None, false), + ]; + let sources = vec![RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }]; + let targets = vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 50), }, - RegionRoute { - region: Region { - id: RegionId::new(table_id, 2), - partition_expr: String::new(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - leader_state: Some(LeaderState::Staging), - ..Default::default() - }, - RegionRoute { - region: Region { - id: RegionId::new(table_id, 3), - partition_expr: String::new(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - leader_state: Some(LeaderState::Downgrading), - ..Default::default() + RegionDescriptor { + region_id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100), }, ]; - let source_routes = vec![RegionRoute { - region: Region { - id: RegionId::new(table_id, 1), - partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - ..Default::default() - }]; - let target_routes = vec![RegionRoute { - region: Region { - id: RegionId::new(table_id, 2), - partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - ..Default::default() - }]; - let new_region_routes = UpdateMetadata::rollback_staging_region_routes( + let applied_region_routes = UpdateMetadata::apply_staging_region_routes( 
group_id, - &source_routes, - &target_routes, - ®ion_routes, + &sources, + &targets, + &[], + &original_region_routes, ) .unwrap(); - assert!(!new_region_routes[0].is_leader_staging()); - assert!(!new_region_routes[0].is_ignore_all_writes()); - assert_eq!( - new_region_routes[0].region.partition_expr, - range_expr("x", 0, 20).as_json_str().unwrap(), - ); - assert!(!new_region_routes[1].is_leader_staging()); - assert!(!new_region_routes[1].is_ignore_all_writes()); - assert!(new_region_routes[2].is_leader_downgrading()); + let target_routes = original_target_routes(&original_region_routes, &targets); + let new_region_routes = UpdateMetadata::rollback_staging_region_routes( + group_id, + &sources, + &target_routes, + &[], + &applied_region_routes, + ) + .unwrap(); + + assert_eq!(new_region_routes, original_region_routes); + } + + #[test] + fn test_rollback_staging_region_routes_merge_case_is_idempotent() { + let group_id = Uuid::new_v4(); + let table_id = 1024; + let original_region_routes = vec![ + new_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + None, + false, + ), + new_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + None, + false, + ), + new_region_route( + RegionId::new(table_id, 3), + &range_expr("x", 200, 300).as_json_str().unwrap(), + None, + false, + ), + ]; + let sources = vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }, + RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 100, 200), + }, + ]; + let targets = vec![RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 200), + }]; + let target_routes = original_target_routes(&original_region_routes, &targets); + let applied_region_routes = UpdateMetadata::apply_staging_region_routes( + group_id, + &sources, + &targets, + &[RegionId::new(table_id, 2)], + 
&original_region_routes, + ) + .unwrap(); + + let once = UpdateMetadata::rollback_staging_region_routes( + group_id, + &sources, + &target_routes, + &[RegionId::new(table_id, 2)], + &applied_region_routes, + ) + .unwrap(); + let twice = UpdateMetadata::rollback_staging_region_routes( + group_id, + &sources, + &target_routes, + &[RegionId::new(table_id, 2)], + &once, + ) + .unwrap(); + + assert_eq!(once, original_region_routes); + assert_eq!(once, twice); } } diff --git a/src/meta-srv/src/procedure/repartition/repartition_start.rs b/src/meta-srv/src/procedure/repartition/repartition_start.rs index 1f657d58f2..5c6bcfdb06 100644 --- a/src/meta-srv/src/procedure/repartition/repartition_start.rs +++ b/src/meta-srv/src/procedure/repartition/repartition_start.rs @@ -102,7 +102,7 @@ impl State for RepartitionStart { } impl RepartitionStart { - fn build_plan( + pub(crate) fn build_plan( physical_route: &PhysicalTableRouteValue, from_exprs: &[PartitionExpr], to_exprs: &[PartitionExpr], diff --git a/src/meta-srv/src/procedure/repartition/test_util.rs b/src/meta-srv/src/procedure/repartition/test_util.rs index 3cefd4a095..83856a49e6 100644 --- a/src/meta-srv/src/procedure/repartition/test_util.rs +++ b/src/meta-srv/src/procedure/repartition/test_util.rs @@ -16,22 +16,41 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; +use common_meta::ddl::DdlContext; +use common_meta::key::table_route::TableRouteValue; +use common_meta::key::test_utils::{new_test_table_info, new_test_table_info_with_name}; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; +use common_meta::kv_backend::KvBackendRef; use common_meta::kv_backend::memory::MemoryKvBackend; +use common_meta::node_manager::NodeManagerRef; +use common_meta::peer::Peer; +use common_meta::rpc::router::{Region, RegionRoute}; use common_meta::sequence::SequenceBuilder; +use common_meta::test_util::new_ddl_context_with_kv_backend; +use common_procedure::{ + Context as 
ProcedureContext, ContextProvider, ProcedureId, ProcedureState, Status, +}; +use common_procedure_test::MockContextProvider; +use common_wal::options::{KafkaWalOptions, WalOptions}; use datatypes::value::Value; use partition::expr::{PartitionExpr, col}; -use store_api::storage::TableId; +use store_api::storage::{RegionId, RegionNumber, TableId}; +use table::table_name::TableName; +use tokio::sync::watch; use uuid::Uuid; use crate::cache_invalidator::MetasrvCacheInvalidator; use crate::metasrv::MetasrvInfo; use crate::procedure::repartition::group::{Context, PersistentContext, VolatileContext}; use crate::procedure::repartition::plan::RegionDescriptor; +use crate::procedure::repartition::{ + Context as ParentContext, PersistentContext as ParentPersistentContext, RepartitionProcedure, +}; use crate::procedure::test_util::MailboxContext; /// `TestingEnv` provides components during the tests. pub struct TestingEnv { + pub kv_backend: KvBackendRef, pub table_metadata_manager: TableMetadataManagerRef, pub mailbox_ctx: MailboxContext, pub server_addr: String, @@ -45,13 +64,14 @@ impl Default for TestingEnv { impl TestingEnv { pub fn new() -> Self { - let kv_backend = Arc::new(MemoryKvBackend::new()); + let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef; let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); let mailbox_sequence = SequenceBuilder::new("test_heartbeat_mailbox", kv_backend.clone()).build(); let mailbox_ctx = MailboxContext::new(mailbox_sequence); Self { + kv_backend, table_metadata_manager, mailbox_ctx, server_addr: "localhost".to_string(), @@ -76,6 +96,65 @@ impl TestingEnv { volatile_ctx: VolatileContext::default(), } } + + pub fn procedure_context() -> ProcedureContext { + ProcedureContext { + procedure_id: ProcedureId::random(), + provider: Arc::new(MockContextProvider::default()), + } + } + + pub async fn create_physical_table_metadata( + &self, + table_id: TableId, + region_routes: Vec, + ) { + 
self.create_physical_table_metadata_with_wal_options( + table_id, + region_routes, + HashMap::default(), + ) + .await; + } + + pub async fn create_physical_table_metadata_with_wal_options( + &self, + table_id: TableId, + region_routes: Vec, + region_wal_options: HashMap, + ) { + self.table_metadata_manager + .create_table_metadata( + new_test_table_info(table_id), + TableRouteValue::physical(region_routes), + region_wal_options, + ) + .await + .unwrap(); + } + + pub async fn create_physical_table_metadata_for_repartition( + &self, + table_id: TableId, + region_routes: Vec, + region_wal_options: HashMap, + ) { + let mut table_info = new_test_table_info_with_name(table_id, "test_table"); + table_info.meta.column_ids = vec![0, 1, 2]; + + self.table_metadata_manager + .create_table_metadata( + table_info, + TableRouteValue::physical(region_routes), + region_wal_options, + ) + .await + .unwrap(); + } + + pub fn ddl_context(&self, node_manager: NodeManagerRef) -> DdlContext { + new_ddl_context_with_kv_backend(node_manager, self.kv_backend.clone()) + } } pub fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { @@ -84,6 +163,18 @@ pub fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { .and(col(col_name).lt(Value::Int64(end))) } +pub fn test_region_wal_options(region_numbers: &[RegionNumber]) -> HashMap { + let wal_options = serde_json::to_string(&WalOptions::Kafka(KafkaWalOptions { + topic: "test_topic".to_string(), + })) + .unwrap(); + + region_numbers + .iter() + .map(|region_number| (*region_number, wal_options.clone())) + .collect() +} + pub fn new_persistent_context( table_id: TableId, sources: Vec, @@ -105,3 +196,110 @@ pub fn new_persistent_context( timeout: Duration::from_secs(120), } } + +pub fn test_region_route(region_id: RegionId, partition_expr: &str) -> RegionRoute { + RegionRoute { + region: Region { + id: region_id, + partition_expr: partition_expr.to_string(), + ..Default::default() + }, + leader_peer: 
Some(Peer::empty(1)), + ..Default::default() + } +} + +pub async fn current_parent_region_routes(ctx: &ParentContext) -> Vec { + let table_route_value = ctx.get_table_route_value().await.unwrap().into_inner(); + table_route_value.region_routes().unwrap().clone() +} + +pub fn new_parent_context( + env: &TestingEnv, + node_manager: NodeManagerRef, + table_id: TableId, +) -> ParentContext { + let ddl_ctx = env.ddl_context(node_manager); + let persistent_ctx = ParentPersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + + ParentContext::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ) +} + +pub fn assert_parent_state(procedure: &RepartitionProcedure) { + assert!(procedure.state.as_any().is::()); +} + +pub fn extract_subprocedure_ids(status: Status) -> Vec { + let Status::Suspended { subprocedures, .. } = status else { + panic!("expected suspended status"); + }; + + subprocedures + .into_iter() + .map(|procedure| procedure.id) + .collect() +} + +pub fn procedure_state_receiver(state: ProcedureState) -> watch::Receiver { + let (tx, rx) = watch::channel(ProcedureState::Running); + tx.send(state).unwrap(); + rx +} + +pub fn procedure_context_with_receivers( + receivers: HashMap>, +) -> ProcedureContext { + ProcedureContext { + procedure_id: ProcedureId::random(), + provider: Arc::new(ProcedureStateReceiverProvider { + receivers, + inner: MockContextProvider::default(), + }), + } +} + +struct ProcedureStateReceiverProvider { + receivers: HashMap>, + inner: MockContextProvider, +} + +#[async_trait::async_trait] +impl ContextProvider for ProcedureStateReceiverProvider { + async fn procedure_state( + &self, + procedure_id: ProcedureId, + ) -> common_procedure::Result> { + self.inner.procedure_state(procedure_id).await + } + + async fn procedure_state_receiver( + &self, + procedure_id: ProcedureId, + ) -> common_procedure::Result>> { + 
Ok(self.receivers.get(&procedure_id).cloned()) + } + + async fn try_put_poison( + &self, + key: &common_procedure::PoisonKey, + procedure_id: ProcedureId, + ) -> common_procedure::Result<()> { + self.inner.try_put_poison(key, procedure_id).await + } + + async fn acquire_lock( + &self, + key: &common_procedure::StringKey, + ) -> common_procedure::local::DynamicKeyLockGuard { + self.inner.acquire_lock(key).await + } +} diff --git a/src/meta-srv/src/procedure/utils.rs b/src/meta-srv/src/procedure/utils.rs index bea2195573..5ea8e00038 100644 --- a/src/meta-srv/src/procedure/utils.rs +++ b/src/meta-srv/src/procedure/utils.rs @@ -190,6 +190,23 @@ pub(crate) async fn flush_region( operation: "Flush regions", } .fail(), + Err(error::Error::MailboxChannelClosed { .. }) => match error_strategy { + ErrorStrategy::Ignore => { + warn!( + "Failed to flush regions({:?}), the datanode({}) is unreachable(MailboxChannelClosed). Skip flush operation.", + region_ids, datanode + ); + Ok(()) + } + ErrorStrategy::Retry => error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending flush region to datanode {:?}, elapsed: {:?}", + datanode, + now.elapsed() + ), + } + .fail()?, + }, Err(err) => Err(err), } } From f3dbf34c74bab30becd79da283bfa4cc88ef1e8a Mon Sep 17 00:00:00 2001 From: Yingwen Date: Wed, 8 Apr 2026 23:15:27 +0800 Subject: [PATCH 082/195] chore: bump version to 1.0.0 (#7935) * chore: bump version to 1.0.0 Signed-off-by: evenyag * test: fix sqlness test Signed-off-by: evenyag * test: fix cluster info sqlness Signed-off-by: evenyag * test: reorder regex in cluster_info Signed-off-by: evenyag * chore: fix pg catalog Signed-off-by: evenyag --------- Signed-off-by: evenyag --- Cargo.lock | 152 +++++++++--------- Cargo.toml | 2 +- .../information_schema/cluster_info.result | 40 ++--- .../information_schema/cluster_info.sql | 40 ++--- .../standalone/common/function/system.result | 2 +- .../standalone/common/function/system.sql | 2 +- .../common/system/pg_catalog.result 
| 7 +- .../standalone/common/system/pg_catalog.sql | 1 + .../information_schema/cluster_info.result | 12 +- .../information_schema/cluster_info.sql | 12 +- 10 files changed, 134 insertions(+), 136 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f0ca46b271..edb8ce04d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -212,7 +212,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow-schema 57.3.0", "common-base", @@ -933,7 +933,7 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "auth" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -1523,7 +1523,7 @@ dependencies = [ [[package]] name = "cache" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "catalog", "common-error", @@ -1559,7 +1559,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow 57.3.0", @@ -1894,7 +1894,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "cli" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-stream", "async-trait", @@ -1951,7 +1951,7 @@ dependencies = [ [[package]] name = "client" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arc-swap", @@ -1983,7 +1983,7 @@ dependencies = [ "serde_json", "snafu 0.8.6", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "tokio", "tokio-stream", "tonic 0.14.2", @@ -2023,7 +2023,7 @@ dependencies = [ [[package]] name = "cmd" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2155,7 +2155,7 @@ dependencies = [ [[package]] name = "common-base" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "anymap2", @@ -2175,14 +2175,14 @@ dependencies = [ [[package]] 
name = "common-catalog" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "const_format", ] [[package]] name = "common-config" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-base", "common-error", @@ -2206,7 +2206,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow 57.3.0", "arrow-schema 57.3.0", @@ -2242,7 +2242,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "bigdecimal 0.4.8", "common-error", @@ -2255,7 +2255,7 @@ dependencies = [ [[package]] name = "common-error" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-macro", "http 1.3.1", @@ -2266,7 +2266,7 @@ dependencies = [ [[package]] name = "common-event-recorder" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2289,7 +2289,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2310,7 +2310,7 @@ dependencies = [ [[package]] name = "common-function" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -2373,7 +2373,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "common-runtime", @@ -2390,7 +2390,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-flight", @@ -2425,7 +2425,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "common-base", @@ -2445,7 +2445,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "greptime-proto", "once_cell", @@ -2456,7 +2456,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "1.0.0-rc.2" +version = "1.0.0" 
dependencies = [ "anyhow", "common-error", @@ -2472,7 +2472,7 @@ dependencies = [ [[package]] name = "common-memory-manager" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-error", "common-macro", @@ -2484,7 +2484,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "anymap2", "api", @@ -2555,7 +2555,7 @@ dependencies = [ [[package]] name = "common-options" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-grpc", "humantime-serde", @@ -2565,11 +2565,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "1.0.0-rc.2" +version = "1.0.0" [[package]] name = "common-pprof" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-error", "common-macro", @@ -2580,7 +2580,7 @@ dependencies = [ [[package]] name = "common-procedure" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-stream", @@ -2609,7 +2609,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "common-procedure", @@ -2619,7 +2619,7 @@ dependencies = [ [[package]] name = "common-query" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2645,7 +2645,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arc-swap", "common-base", @@ -2670,7 +2670,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "clap", @@ -2699,7 +2699,7 @@ dependencies = [ [[package]] name = "common-session" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "serde", "strum 0.27.1", @@ -2707,7 +2707,7 @@ dependencies = [ [[package]] name = "common-sql" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow-schema 57.3.0", "common-base", @@ -2727,7 +2727,7 @@ dependencies = [ [[package]] name = "common-stat" 
-version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-base", "common-runtime", @@ -2742,7 +2742,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "backtrace", "common-base", @@ -2771,7 +2771,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "client", "common-grpc", @@ -2784,7 +2784,7 @@ dependencies = [ [[package]] name = "common-time" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow 57.3.0", "chrono", @@ -2802,7 +2802,7 @@ dependencies = [ [[package]] name = "common-version" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "cargo-manifest", "const_format", @@ -2812,7 +2812,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-base", "common-error", @@ -2835,7 +2835,7 @@ dependencies = [ [[package]] name = "common-workload" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-telemetry", "serde", @@ -4197,7 +4197,7 @@ dependencies = [ [[package]] name = "datanode" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-flight", @@ -4265,7 +4265,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow 57.3.0", "arrow-array 57.3.0", @@ -4943,7 +4943,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-engine" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -5075,7 +5075,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" [[package]] name = "flow" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow 57.3.0", @@ -5144,7 +5144,7 @@ dependencies = [ "sql", "store-api", "strum 0.27.1", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tokio", "tonic 0.14.2", @@ -5205,7 +5205,7 @@ 
checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619" [[package]] name = "frontend" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arc-swap", @@ -6453,7 +6453,7 @@ dependencies = [ [[package]] name = "index" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "asynchronous-codec", @@ -7421,7 +7421,7 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "log-query" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "chrono", "common-error", @@ -7433,7 +7433,7 @@ dependencies = [ [[package]] name = "log-store" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-stream", "async-trait", @@ -7724,7 +7724,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -7755,7 +7755,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -7855,7 +7855,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "aquamarine", @@ -7956,7 +7956,7 @@ dependencies = [ [[package]] name = "mito-codec" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "bytes", @@ -7981,7 +7981,7 @@ dependencies = [ [[package]] name = "mito2" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "aquamarine", @@ -8705,7 +8705,7 @@ dependencies = [ [[package]] name = "object-store" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "anyhow", "bytes", @@ -9032,7 +9032,7 @@ dependencies = [ [[package]] name = "operator" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -9092,7 +9092,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tokio", "tokio-util", @@ -9368,7 +9368,7 @@ checksum = 
"e3c406c9e2aa74554e662d2c2ee11cd3e73756988800be7e6f5eddb16fed4699" [[package]] name = "partition" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -9724,7 +9724,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -9881,7 +9881,7 @@ dependencies = [ [[package]] name = "plugins" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "auth", "catalog", @@ -10199,7 +10199,7 @@ dependencies = [ [[package]] name = "promql" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "async-trait", @@ -10551,7 +10551,7 @@ dependencies = [ [[package]] name = "puffin" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-compression", "async-trait", @@ -10613,7 +10613,7 @@ dependencies = [ [[package]] name = "query" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -10680,7 +10680,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tokio", "tokio-stream", @@ -11984,7 +11984,7 @@ dependencies = [ [[package]] name = "servers" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -12118,7 +12118,7 @@ dependencies = [ [[package]] name = "session" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -12450,7 +12450,7 @@ dependencies = [ [[package]] name = "sql" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-buffer 57.3.0", @@ -12511,7 +12511,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "clap", @@ -12791,7 +12791,7 @@ dependencies = [ [[package]] name = "standalone" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "catalog", @@ -12835,7 +12835,7 @@ checksum = 
"a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "store-api" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "aquamarine", @@ -13027,7 +13027,7 @@ dependencies = [ [[package]] name = "substrait" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "bytes", @@ -13149,7 +13149,7 @@ dependencies = [ [[package]] name = "table" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arc-swap", @@ -13419,7 +13419,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" [[package]] name = "tests-fuzz" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arbitrary", "async-trait", @@ -13463,7 +13463,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-flight", @@ -13540,7 +13540,7 @@ dependencies = [ "sqlx", "standalone", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tempfile", "time", diff --git a/Cargo.toml b/Cargo.toml index 5041f167c3..227608bf64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,7 +75,7 @@ members = [ resolver = "2" [workspace.package] -version = "1.0.0-rc.2" +version = "1.0.0" edition = "2024" license = "Apache-2.0" diff --git a/tests/cases/distributed/information_schema/cluster_info.result b/tests/cases/distributed/information_schema/cluster_info.result index 83512e6ffb..662c73a1ea 100644 --- a/tests/cases/distributed/information_schema/cluster_info.result +++ b/tests/cases/distributed/information_schema/cluster_info.result @@ -24,55 +24,55 @@ DESC TABLE CLUSTER_INFO; +----------------------+----------------------+-----+------+---------+---------------+ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE 
(\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|0|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||1|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||2|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||0|FLOWNODE|Address|Version|Hash|Start_time|Duration|Duration||1|FRONTEND|Address|Version|Hash|Start_time|Duration|Duration||1|METASRV|Address|Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'METASRV' ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|1|METASRV|Address|Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'FRONTEND' ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|1|FRONTEND|Address|Version|Hash|Start_time|Duration|Duration|+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE != 'FRONTEND' ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|0|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||1|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||2|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||0|FLOWNODE|Address|Version|Hash|Start_time|Duration|Duration||1|METASRV|Address|Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID > 1 ORDER BY peer_type; diff --git a/tests/cases/distributed/information_schema/cluster_info.sql b/tests/cases/distributed/information_schema/cluster_info.sql index 3c2dcccaa0..9e85245859 100644 --- a/tests/cases/distributed/information_schema/cluster_info.sql +++ b/tests/cases/distributed/information_schema/cluster_info.sql @@ -3,47 +3,47 @@ USE INFORMATION_SCHEMA; DESC TABLE CLUSTER_INFO; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'METASRV' ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'FRONTEND' ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE != 'FRONTEND' ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID > 1 ORDER BY peer_type; diff --git a/tests/cases/standalone/common/function/system.result b/tests/cases/standalone/common/function/system.result index 0cb6839292..d2007b5240 100644 --- a/tests/cases/standalone/common/function/system.result +++ b/tests/cases/standalone/common/function/system.result @@ -8,7 +8,7 @@ SELECT build(); ++|build()|++|branch:BRANCH|commit:COMMIT|commit_short:COMMIT_SHORT|clean:CLEAN|version:VERSION++ --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) VERSION +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
VERSION -- SQLNESS REPLACE [\s\-]+ SELECT version(); diff --git a/tests/cases/standalone/common/function/system.sql b/tests/cases/standalone/common/function/system.sql index 8ae1475311..6504b48679 100644 --- a/tests/cases/standalone/common/function/system.sql +++ b/tests/cases/standalone/common/function/system.sql @@ -6,7 +6,7 @@ -- SQLNESS REPLACE [\s\-]+ SELECT build(); --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) VERSION +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) VERSION -- SQLNESS REPLACE [\s\-]+ SELECT version(); diff --git a/tests/cases/standalone/common/system/pg_catalog.result b/tests/cases/standalone/common/system/pg_catalog.result index ef0452e316..7a6b65c7c8 100644 --- a/tests/cases/standalone/common/system/pg_catalog.result +++ b/tests/cases/standalone/common/system/pg_catalog.result @@ -14,15 +14,12 @@ SELECT session_user is not null; +----------------------------+ -- SQLNESS REPLACE PostgreSQL.* VERSION +-- SQLNESS REPLACE [\s\-]+ -- current_schema -- SQLNESS PROTOCOL POSTGRES select current_schema(), current_schemas(true), current_schemas(false), version(), current_database(); -+------------------+---------------------------------------------------------+---------------------------------+---------------------------------------+--------------------+ -| current_schema() | current_schemas(Boolean(true)) | current_schemas(Boolean(false)) | version | current_database() | -+------------------+---------------------------------------------------------+---------------------------------+---------------------------------------+--------------------+ -| public | {public,information_schema,pg_catalog,greptime_private} | {public} | VERSION -+------------------+---------------------------------------------------------+---------------------------------+---------------------------------------+--------------------+ 
+++++++|current_schema()|current_schemas(Boolean(true))|current_schemas(Boolean(false))|version|current_database()|++++++|public|{public,information_schema,pg_catalog,greptime_private}|{public}|VERSION++++++ -- search_path for pg using schema for now FIXME when support real search_path -- SQLNESS PROTOCOL POSTGRES diff --git a/tests/cases/standalone/common/system/pg_catalog.sql b/tests/cases/standalone/common/system/pg_catalog.sql index ad59da372c..2e84ecd7ce 100644 --- a/tests/cases/standalone/common/system/pg_catalog.sql +++ b/tests/cases/standalone/common/system/pg_catalog.sql @@ -6,6 +6,7 @@ create database pg_catalog; SELECT session_user is not null; -- SQLNESS REPLACE PostgreSQL.* VERSION +-- SQLNESS REPLACE [\s\-]+ -- current_schema -- SQLNESS PROTOCOL POSTGRES select current_schema(), current_schemas(true), current_schemas(false), version(), current_database(); diff --git a/tests/cases/standalone/information_schema/cluster_info.result b/tests/cases/standalone/information_schema/cluster_info.result index 07fc4bd5c6..04567ff721 100644 --- a/tests/cases/standalone/information_schema/cluster_info.result +++ b/tests/cases/standalone/information_schema/cluster_info.result @@ -24,9 +24,9 @@ DESC TABLE CLUSTER_INFO; +----------------------+----------------------+-----+------+---------+---------------+ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO; @@ -34,9 +34,9 @@ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, a +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|0|STANDALONE||Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'STANDALONE'; @@ -49,9 +49,9 @@ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, a ++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID = 0; diff --git a/tests/cases/standalone/information_schema/cluster_info.sql b/tests/cases/standalone/information_schema/cluster_info.sql index 5e253fc43d..798e9eff28 100644 --- a/tests/cases/standalone/information_schema/cluster_info.sql +++ b/tests/cases/standalone/information_schema/cluster_info.sql @@ -3,17 +3,17 @@ USE INFORMATION_SCHEMA; DESC TABLE CLUSTER_INFO; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'STANDALONE'; @@ -21,9 +21,9 @@ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, a SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE != 'STANDALONE'; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID = 0; From 09b368c00aff85a468497c213100a27210f265fc Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 8 Apr 2026 16:34:13 -0700 Subject: [PATCH 083/195] feat: tune constants (#7851) * feat: tune constants Signed-off-by: Ruihang Xia * cap output batch size Signed-off-by: Ruihang Xia * handle empty input Signed-off-by: Ruihang Xia * one more ut for cr Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- .../datasource/src/file_format/parquet.rs | 37 ++- src/mito2/src/cache/file_cache.rs | 43 ++- src/mito2/src/memtable/bulk.rs | 2 +- src/mito2/src/memtable/partition_tree/data.rs | 7 +- src/mito2/src/sst/parquet.rs | 12 +- src/promql/src/extension_plan/absent.rs | 277 ++++++++++++++---- src/query/src/range_select/plan.rs | 195 +++++++++++- 7 files changed, 490 insertions(+), 83 deletions(-) diff --git a/src/common/datasource/src/file_format/parquet.rs b/src/common/datasource/src/file_format/parquet.rs index c2c14b4680..9c8e8d6ce8 100644 --- 
a/src/common/datasource/src/file_format/parquet.rs +++ b/src/common/datasource/src/file_format/parquet.rs @@ -23,7 +23,9 @@ use datafusion::error::Result as DatafusionResult; use datafusion::parquet::arrow::async_reader::AsyncFileReader; use datafusion::parquet::arrow::{ArrowWriter, parquet_to_arrow_schema}; use datafusion::parquet::errors::{ParquetError, Result as ParquetResult}; -use datafusion::parquet::file::metadata::ParquetMetaData; +use datafusion::parquet::file::metadata::{ + PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, +}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_datasource::PartitionedFile; @@ -94,35 +96,40 @@ impl DefaultParquetFileReaderFactory { } impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory { - // TODO(weny): Supports [`metadata_size_hint`]. - // The upstream has a implementation supports [`metadata_size_hint`], - // however it coupled with Box. 
fn create_reader( &self, _partition_index: usize, partitioned_file: PartitionedFile, - _metadata_size_hint: Option, + metadata_size_hint: Option, _metrics: &ExecutionPlanMetricsSet, ) -> DatafusionResult> { let path = partitioned_file.path().to_string(); let object_store = self.object_store.clone(); - Ok(Box::new(LazyParquetFileReader::new(object_store, path))) + Ok(Box::new(LazyParquetFileReader::new( + object_store, + path, + metadata_size_hint, + ))) } } pub struct LazyParquetFileReader { object_store: ObjectStore, reader: Option>, + file_size: Option, + metadata_size_hint: Option, path: String, } impl LazyParquetFileReader { - pub fn new(object_store: ObjectStore, path: String) -> Self { + pub fn new(object_store: ObjectStore, path: String, metadata_size_hint: Option) -> Self { LazyParquetFileReader { object_store, path, reader: None, + file_size: None, + metadata_size_hint, } } @@ -130,6 +137,7 @@ impl LazyParquetFileReader { async fn maybe_initialize(&mut self) -> result::Result<(), object_store::Error> { if self.reader.is_none() { let meta = self.object_store.stat(&self.path).await?; + self.file_size = Some(meta.content_length()); let reader = self .object_store .reader(&self.path) @@ -166,8 +174,19 @@ impl AsyncFileReader for LazyParquetFileReader { self.maybe_initialize() .await .map_err(|e| ParquetError::External(Box::new(e)))?; - // Safety: Must initialized - self.reader.as_mut().unwrap().get_metadata(options).await + + let metadata_opts = options.map(|o| o.metadata_options().clone()); + let metadata_reader = ParquetMetaDataReader::new() + .with_metadata_options(metadata_opts) + .with_page_index_policy(PageIndexPolicy::from( + options.is_some_and(|o| o.page_index()), + )) + .with_prefetch_hint(self.metadata_size_hint); + + let metadata = metadata_reader + .load_and_finish(self.reader.as_mut().unwrap(), self.file_size.unwrap()) + .await?; + Ok(Arc::new(metadata)) }) } } diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs 
index 278838b369..9b987c810b 100644 --- a/src/mito2/src/cache/file_cache.rs +++ b/src/mito2/src/cache/file_cache.rs @@ -288,6 +288,17 @@ pub(crate) struct FileCache { pub(crate) type FileCacheRef = Arc; impl FileCache { + /// Splits the configured total capacity between parquet and puffin caches + /// without exceeding the requested overall budget. + fn split_cache_capacities(total_capacity: u64, index_percent: u8) -> (u64, u64) { + let desired_puffin_capacity = total_capacity * u64::from(index_percent) / 100; + let min_cache_capacity = MIN_CACHE_CAPACITY.min(total_capacity / 2); + let puffin_capacity = + desired_puffin_capacity.clamp(min_cache_capacity, total_capacity - min_cache_capacity); + let parquet_capacity = total_capacity - puffin_capacity; + (parquet_capacity, puffin_capacity) + } + /// Creates a new file cache. pub(crate) fn new( local_store: ObjectStore, @@ -302,14 +313,8 @@ impl FileCache { .unwrap_or(DEFAULT_INDEX_CACHE_PERCENT); let total_capacity = capacity.as_bytes(); - // Convert percent to ratio and calculate capacity for each cache - let index_ratio = index_percent as f64 / 100.0; - let puffin_capacity = (total_capacity as f64 * index_ratio) as u64; - let parquet_capacity = total_capacity - puffin_capacity; - - // Ensure both capacities are at least 512MB - let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY); - let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY); + let (parquet_capacity, puffin_capacity) = + Self::split_cache_capacities(total_capacity, index_percent); info!( "Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}", @@ -1064,6 +1069,28 @@ mod tests { assert_eq!(data, bytes[3].as_ref()); } + #[test] + fn test_file_cache_capacity_respects_total_budget() { + let total_capacity = ReadableSize::mb(256).as_bytes(); + let (parquet_capacity, puffin_capacity) = + FileCache::split_cache_capacities(total_capacity, 20); + + assert_eq!(total_capacity, 
parquet_capacity + puffin_capacity); + assert_eq!(ReadableSize::mb(128).as_bytes(), parquet_capacity); + assert_eq!(ReadableSize::mb(128).as_bytes(), puffin_capacity); + } + + #[test] + fn test_file_cache_capacity_keeps_split_when_total_allows_it() { + let total_capacity = ReadableSize::gb(5).as_bytes(); + let (parquet_capacity, puffin_capacity) = + FileCache::split_cache_capacities(total_capacity, 20); + + assert_eq!(total_capacity, parquet_capacity + puffin_capacity); + assert_eq!(ReadableSize::gb(4).as_bytes(), parquet_capacity); + assert_eq!(ReadableSize::gb(1).as_bytes(), puffin_capacity); + } + #[test] fn test_cache_file_path() { let file_id = FileId::parse_str("3368731b-a556-42b8-a5df-9c31ce155095").unwrap(); diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index 9d25d0c39f..24b2bebaa9 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -137,7 +137,7 @@ struct CollectedParts { /// All parts in a bulk memtable. #[derive(Default)] struct BulkParts { - /// Unordered small parts (< 1024 rows). + /// Unordered small parts. unordered_part: UnorderedPart, /// All parts (raw and encoded). parts: Vec, diff --git a/src/mito2/src/memtable/partition_tree/data.rs b/src/mito2/src/memtable/partition_tree/data.rs index a6d40bdcbf..f6e2a59bec 100644 --- a/src/mito2/src/memtable/partition_tree/data.rs +++ b/src/mito2/src/memtable/partition_tree/data.rs @@ -50,6 +50,7 @@ use crate::memtable::partition_tree::merger::{DataBatchKey, DataNode, DataSource use crate::metrics::{ PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED, PARTITION_TREE_READ_STAGE_ELAPSED, }; +use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; const PK_INDEX_COLUMN_NAME: &str = "__pk_index"; @@ -821,7 +822,11 @@ impl DataPart { /// Reads frozen data part and yields [DataBatch]es. 
pub fn read(&self) -> Result { match self { - DataPart::Parquet(data_bytes) => DataPartReader::new(data_bytes.data.clone(), None), + // Keep encoded memtable scans aligned with mito/DataFusion batch sizing instead of + // parquet-rs's implicit 1024-row default. + DataPart::Parquet(data_bytes) => { + DataPartReader::new(data_bytes.data.clone(), Some(DEFAULT_READ_BATCH_SIZE)) + } } } diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 2ca83ca8cf..2447824ad9 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -41,9 +41,17 @@ pub mod writer; pub const PARQUET_METADATA_KEY: &str = "greptime:metadata"; /// Default batch size to read parquet files. -pub(crate) const DEFAULT_READ_BATCH_SIZE: usize = 1024; +/// +/// This is a runtime-only scan granularity, so we align it with DataFusion's +/// default execution batch size to reduce rebatching and concatenation in the +/// query pipeline. +pub(crate) const DEFAULT_READ_BATCH_SIZE: usize = 8 * 1024; /// Default row group size for parquet files. -pub const DEFAULT_ROW_GROUP_SIZE: usize = 100 * DEFAULT_READ_BATCH_SIZE; +/// +/// Keep the existing persisted/on-disk default stable. It intentionally stays +/// decoupled from [`DEFAULT_READ_BATCH_SIZE`] so we can tune runtime scan +/// batching without changing the row group layout of newly written SSTs. +pub const DEFAULT_ROW_GROUP_SIZE: usize = 100 * 1024; /// Parquet write options. 
#[derive(Debug, Clone)] diff --git a/src/promql/src/extension_plan/absent.rs b/src/promql/src/extension_plan/absent.rs index db31a3d901..71af413029 100644 --- a/src/promql/src/extension_plan/absent.rs +++ b/src/promql/src/extension_plan/absent.rs @@ -49,9 +49,6 @@ use snafu::ResultExt; use crate::error::DeserializeSnafu; use crate::extension_plan::{Millisecond, resolve_column_name, serialize_column_index}; -/// Maximum number of rows per output batch -const ABSENT_BATCH_SIZE: usize = 8192; - #[derive(Debug, PartialEq, Eq, Hash)] pub struct Absent { start: Millisecond, @@ -390,11 +387,13 @@ impl ExecutionPlan for AbsentExec { context: Arc, ) -> DataFusionResult { let baseline_metric = BaselineMetrics::new(&self.metric, partition); + let batch_size = context.session_config().batch_size(); let input = self.input.execute(partition, context)?; Ok(Box::pin(AbsentStream { end: self.end, step: self.step, + batch_size, time_index_column_index: self .input .schema() @@ -407,6 +406,8 @@ impl ExecutionPlan for AbsentExec { metric: baseline_metric, // Buffer for streaming output timestamps output_timestamps: Vec::new(), + input_timestamps: Vec::new(), + input_timestamp_offset: 0, // Current timestamp in the output range output_ts_cursor: self.start, input_finished: false, @@ -441,6 +442,7 @@ impl DisplayAs for AbsentExec { pub struct AbsentStream { end: Millisecond, step: Millisecond, + batch_size: usize, time_index_column_index: usize, output_schema: SchemaRef, fake_labels: Vec<(String, String)>, @@ -448,6 +450,9 @@ pub struct AbsentStream { metric: BaselineMetrics, // Buffer for streaming output timestamps output_timestamps: Vec, + // Current input timestamps being processed incrementally. 
+ input_timestamps: Vec, + input_timestamp_offset: usize, // Current timestamp in the output range output_ts_cursor: Millisecond, input_finished: bool, @@ -464,52 +469,53 @@ impl Stream for AbsentStream { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { loop { - if !self.input_finished { - match ready!(self.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - let timer = std::time::Instant::now(); - if let Err(e) = self.process_input_batch(&batch) { - return Poll::Ready(Some(Err(e))); - } - self.metric.elapsed_compute().add_elapsed(timer); - - // If we have enough data for a batch, output it - if self.output_timestamps.len() >= ABSENT_BATCH_SIZE { - let timer = std::time::Instant::now(); - let result = self.flush_output_batch(); - self.metric.elapsed_compute().add_elapsed(timer); - - match result { - Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))), - Ok(None) => continue, - Err(e) => return Poll::Ready(Some(Err(e))), - } - } - } - Some(Err(e)) => return Poll::Ready(Some(Err(e))), - None => { - self.input_finished = true; - - let timer = std::time::Instant::now(); - // Process any remaining absent timestamps - if let Err(e) = self.process_remaining_absent_timestamps() { - return Poll::Ready(Some(Err(e))); - } - let result = self.flush_output_batch(); - self.metric.elapsed_compute().add_elapsed(timer); - return Poll::Ready(result.transpose()); - } + if self.has_pending_input_timestamps() { + let timer = std::time::Instant::now(); + if let Err(e) = self.process_input_batch() { + return Poll::Ready(Some(Err(e))); + } + self.metric.elapsed_compute().add_elapsed(timer); + + match self.flush_output_batch() { + Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))), + Ok(None) => continue, + Err(e) => return Poll::Ready(Some(Err(e))), + } + } + + if self.input_finished { + let timer = std::time::Instant::now(); + if let Err(e) = self.process_remaining_absent_timestamps() { + return Poll::Ready(Some(Err(e))); + } + 
self.metric.elapsed_compute().add_elapsed(timer); + + match self.flush_output_batch() { + Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))), + Ok(None) => return Poll::Ready(None), + Err(e) => return Poll::Ready(Some(Err(e))), + } + } + + match ready!(self.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + let timer = std::time::Instant::now(); + if let Err(e) = self.buffer_input_timestamps(&batch) { + return Poll::Ready(Some(Err(e))); + } + self.metric.elapsed_compute().add_elapsed(timer); + } + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => { + self.input_finished = true; } - } else { - return Poll::Ready(None); } } } } impl AbsentStream { - fn process_input_batch(&mut self, batch: &RecordBatch) -> DataFusionResult<()> { - // Extract timestamps from this batch + fn buffer_input_timestamps(&mut self, batch: &RecordBatch) -> DataFusionResult<()> { let timestamp_array = batch.column(self.time_index_column_index); let milli_ts_array = arrow::compute::cast( timestamp_array, @@ -519,29 +525,52 @@ impl AbsentStream { .as_any() .downcast_ref::() .unwrap(); + self.input_timestamps.clear(); + self.input_timestamps + .extend_from_slice(timestamp_array.values()); + self.input_timestamp_offset = 0; + Ok(()) + } + + fn has_pending_input_timestamps(&self) -> bool { + self.input_timestamp_offset < self.input_timestamps.len() + } + + fn process_input_batch(&mut self) -> DataFusionResult<()> { + while self.input_timestamp_offset < self.input_timestamps.len() { + let input_ts = self.input_timestamps[self.input_timestamp_offset]; - // Process against current output cursor position - for &input_ts in timestamp_array.values() { // Generate absent timestamps up to this input timestamp while self.output_ts_cursor < input_ts && self.output_ts_cursor <= self.end { self.output_timestamps.push(self.output_ts_cursor); self.output_ts_cursor += self.step; + + if self.output_timestamps.len() >= self.batch_size { + return Ok(()); + } } // Skip the input timestamp if it 
matches our cursor if self.output_ts_cursor == input_ts { self.output_ts_cursor += self.step; } + + self.input_timestamp_offset += 1; } + self.input_timestamps.clear(); + self.input_timestamp_offset = 0; Ok(()) } fn process_remaining_absent_timestamps(&mut self) -> DataFusionResult<()> { - // Generate all remaining absent timestamps (input is finished) while self.output_ts_cursor <= self.end { self.output_timestamps.push(self.output_ts_cursor); self.output_ts_cursor += self.step; + + if self.output_timestamps.len() >= self.batch_size { + return Ok(()); + } } Ok(()) } @@ -551,11 +580,16 @@ impl AbsentStream { return Ok(None); } + let timestamps = if self.output_timestamps.len() <= self.batch_size { + std::mem::take(&mut self.output_timestamps) + } else { + let remaining = self.output_timestamps.split_off(self.batch_size); + std::mem::replace(&mut self.output_timestamps, remaining) + }; + let mut columns: Vec = Vec::with_capacity(self.output_schema.fields().len()); - let num_rows = self.output_timestamps.len(); - columns.push(Arc::new(TimestampMillisecondArray::from( - self.output_timestamps.clone(), - )) as _); + let num_rows = timestamps.len(); + columns.push(Arc::new(TimestampMillisecondArray::from(timestamps)) as _); columns.push(Arc::new(Float64Array::from(vec![1.0; num_rows])) as _); for (_, value) in self.fake_labels.iter() { @@ -567,7 +601,6 @@ impl AbsentStream { let batch = RecordBatch::try_new(self.output_schema.clone(), columns)?; - self.output_timestamps.clear(); Ok(Some(batch)) } } @@ -580,7 +613,7 @@ mod tests { use datafusion::arrow::record_batch::RecordBatch; use datafusion::catalog::memory::DataSourceExec; use datafusion::datasource::memory::MemorySourceConfig; - use datafusion::prelude::SessionContext; + use datafusion::prelude::{SessionConfig, SessionContext}; use datatypes::arrow::array::{Float64Array, TimestampMillisecondArray}; use super::*; @@ -725,4 +758,146 @@ mod tests { // Should output all timestamps in range: 0, 1000, 2000 
assert_eq!(output_timestamps, vec![0, 1000, 2000]); } + + #[tokio::test] + async fn test_absent_respects_session_batch_size_for_large_gap() { + let schema = Arc::new(Schema::new(vec![ + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new("value", DataType::Float64, true), + ])); + + let timestamp_array = Arc::new(TimestampMillisecondArray::from(vec![9])); + let value_array = Arc::new(Float64Array::from(vec![1.0])); + let batch = + RecordBatch::try_new(schema.clone(), vec![timestamp_array, value_array]).unwrap(); + + let memory_exec = DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[vec![batch]], schema, None).unwrap(), + )); + + let output_schema = Arc::new(Schema::new(vec![ + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new("value", DataType::Float64, true), + ])); + + let absent_exec = AbsentExec { + start: 0, + end: 10, + step: 1, + time_index_column: "timestamp".to_string(), + value_column: "value".to_string(), + fake_labels: vec![], + output_schema: output_schema.clone(), + input: Arc::new(memory_exec), + properties: Arc::new(PlanProperties::new( + EquivalenceProperties::new(output_schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + )), + metric: ExecutionPlanMetricsSet::new(), + }; + + let session_ctx = SessionContext::new_with_config(SessionConfig::new().with_batch_size(3)); + let task_ctx = session_ctx.task_ctx(); + let mut stream = absent_exec.execute(0, task_ctx).unwrap(); + + let mut batch_sizes = Vec::new(); + let mut output_timestamps = Vec::new(); + while let Some(batch_result) = stream.next().await { + let batch = batch_result.unwrap(); + batch_sizes.push(batch.num_rows()); + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..ts_array.len() { + if !ts_array.is_null(i) { + output_timestamps.push(ts_array.value(i)); + } + } + } + 
+ assert_eq!(batch_sizes, vec![3, 3, 3, 1]); + assert_eq!(output_timestamps, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10]); + } + + #[tokio::test] + async fn test_absent_resumes_same_input_timestamp_after_batch_flush() { + let schema = Arc::new(Schema::new(vec![ + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new("value", DataType::Float64, true), + ])); + + let timestamp_array = Arc::new(TimestampMillisecondArray::from(vec![9])); + let value_array = Arc::new(Float64Array::from(vec![1.0])); + let batch = + RecordBatch::try_new(schema.clone(), vec![timestamp_array, value_array]).unwrap(); + + let memory_exec = DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[vec![batch]], schema, None).unwrap(), + )); + + let output_schema = Arc::new(Schema::new(vec![ + Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new("value", DataType::Float64, true), + ])); + + let absent_exec = AbsentExec { + start: 0, + end: 9, + step: 1, + time_index_column: "timestamp".to_string(), + value_column: "value".to_string(), + fake_labels: vec![], + output_schema: output_schema.clone(), + input: Arc::new(memory_exec), + properties: Arc::new(PlanProperties::new( + EquivalenceProperties::new(output_schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + )), + metric: ExecutionPlanMetricsSet::new(), + }; + + let session_ctx = SessionContext::new_with_config(SessionConfig::new().with_batch_size(3)); + let task_ctx = session_ctx.task_ctx(); + let mut stream = absent_exec.execute(0, task_ctx).unwrap(); + + let mut output_timestamps = Vec::new(); + while let Some(batch_result) = stream.next().await { + let batch = batch_result.unwrap(); + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..ts_array.len() { + if !ts_array.is_null(i) { + output_timestamps.push(ts_array.value(i)); + } + } + } + + 
assert_eq!(output_timestamps, vec![0, 1, 2, 3, 4, 5, 6, 7, 8]); + } } diff --git a/src/query/src/range_select/plan.rs b/src/query/src/range_select/plan.rs index e863aaced0..d83514f435 100644 --- a/src/query/src/range_select/plan.rs +++ b/src/query/src/range_select/plan.rs @@ -836,6 +836,7 @@ impl ExecutionPlan for RangeSelectExec { context: Arc, ) -> DfResult { let baseline_metric = BaselineMetrics::new(&self.metric, partition); + let batch_size = context.session_config().batch_size(); let input = self.input.execute(partition, context)?; let schema = input.schema(); let time_index = schema @@ -852,6 +853,7 @@ impl ExecutionPlan for RangeSelectExec { .collect(), )?; Ok(Box::pin(RangeSelectStream { + batch_size, schema: self.schema.clone(), range_exec: self.range_exec.clone(), input, @@ -868,6 +870,8 @@ impl ExecutionPlan for RangeSelectExec { metric: baseline_metric, schema_project: self.schema_project.clone(), schema_before_project: self.schema_before_project.clone(), + output_batch: None, + output_batch_offset: 0, })) } @@ -881,6 +885,7 @@ impl ExecutionPlan for RangeSelectExec { } struct RangeSelectStream { + batch_size: usize, /// the schema of output column schema: SchemaRef, range_exec: Vec, @@ -907,6 +912,8 @@ struct RangeSelectStream { metric: BaselineMetrics, schema_project: Option>, schema_before_project: SchemaRef, + output_batch: Option, + output_batch_offset: usize, } #[derive(Debug)] @@ -1149,6 +1156,36 @@ impl RangeSelectStream { }; Ok(project_output) } + + fn next_output_batch(&mut self) -> DfResult> { + if self.output_batch.is_none() { + self.output_batch = Some(self.generate_output()?); + self.output_batch_offset = 0; + } + + let num_rows = self.output_batch.as_ref().unwrap().num_rows(); + if num_rows == 0 { + self.output_batch = None; + self.output_batch_offset = 0; + return Ok(None); + } + + if self.output_batch_offset == 0 && num_rows <= self.batch_size { + return Ok(self.output_batch.take()); + } + + let offset = self.output_batch_offset; + let 
len = (num_rows - offset).min(self.batch_size); + let batch = self.output_batch.as_ref().unwrap().slice(offset, len); + self.output_batch_offset += len; + + if self.output_batch_offset >= num_rows { + self.output_batch = None; + self.output_batch_offset = 0; + } + + Ok(Some(batch)) + } } enum ExecutionState { @@ -1191,13 +1228,19 @@ impl Stream for RangeSelectStream { } } ExecutionState::ProducingOutput => { - let result = self.generate_output(); + let result = self.next_output_batch(); return match result { // made output - Ok(batch) => { - self.exec_state = ExecutionState::Done; + Ok(Some(batch)) => { + if self.output_batch.is_none() { + self.exec_state = ExecutionState::Done; + } Poll::Ready(Some(Ok(batch))) } + Ok(None) => { + self.exec_state = ExecutionState::Done; + Poll::Ready(None) + } // error making output Err(error) => Poll::Ready(Some(Err(error))), }; @@ -1251,7 +1294,7 @@ mod test { use datafusion::prelude::SessionContext; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr::expressions::Column; - use datatypes::arrow::array::TimestampMillisecondArray; + use datatypes::arrow::array::{Float64Array, Int64Array, TimestampMillisecondArray}; use datatypes::arrow_array::StringArray; use super::*; @@ -1313,15 +1356,49 @@ mod test { )) } - async fn do_range_select_test( + fn prepare_empty_test_data(is_float: bool) -> DataSourceExec { + let schema = Arc::new(Schema::new(vec![ + Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true), + Field::new( + "value", + if is_float { + DataType::Float64 + } else { + DataType::Int64 + }, + true, + ), + Field::new("host", DataType::Utf8, true), + ])); + let timestamp_column: Arc = + Arc::new(TimestampMillisecondArray::from(Vec::::new())) as _; + let value_column: Arc = if is_float { + Arc::new(Float64Array::from(Vec::>::new())) as _ + } else { + Arc::new(Int64Array::from(Vec::>::new())) as _ + }; + let host_column: Arc = + Arc::new(StringArray::from(Vec::>::new())) as _; + let data 
= RecordBatch::try_new( + schema.clone(), + vec![timestamp_column, value_column, host_column], + ) + .unwrap(); + + DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[vec![data]], schema, None).unwrap(), + )) + } + + async fn collect_range_select_test( range1: Millisecond, range2: Millisecond, align: Millisecond, fill: Option, is_float: bool, is_gap: bool, - expected: String, - ) { + batch_size: usize, + ) -> Vec { let data_type = if is_float { DataType::Float64 } else { @@ -1412,11 +1489,25 @@ mod test { .into(), range_select_exec, ); - let session_context = SessionContext::default(); + let session_context = SessionContext::new_with_config( + datafusion::execution::config::SessionConfig::new().with_batch_size(batch_size), + ); + datafusion::physical_plan::collect(Arc::new(sort_exec), session_context.task_ctx()) + .await + .unwrap() + } + + async fn do_range_select_test( + range1: Millisecond, + range2: Millisecond, + align: Millisecond, + fill: Option, + is_float: bool, + is_gap: bool, + expected: String, + ) { let result = - datafusion::physical_plan::collect(Arc::new(sort_exec), session_context.task_ctx()) - .await - .unwrap(); + collect_range_select_test(range1, range2, align, fill, is_float, is_gap, 8192).await; let result_literal = arrow::util::pretty::pretty_format_batches(&result) .unwrap() @@ -1700,6 +1791,88 @@ mod test { .await; } + #[tokio::test] + async fn range_select_respects_session_batch_size() { + let result = + collect_range_select_test(10_000, 5_000, 5_000, Some(Fill::Null), true, false, 3).await; + + let row_counts = result + .iter() + .map(|batch| batch.num_rows()) + .collect::>(); + assert_eq!(vec![3, 3, 3, 3], row_counts); + } + + #[tokio::test] + async fn range_select_skips_empty_output_batch() { + let memory_exec = Arc::new(prepare_empty_test_data(true)); + let schema = Arc::new(Schema::new(vec![ + Field::new("MIN(value)", DataType::Float64, true), + Field::new("MAX(value)", DataType::Float64, true), + 
Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true), + Field::new("host", DataType::Utf8, true), + ])); + let cache = Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + )); + let input_schema = memory_exec.schema().clone(); + let range_select_exec = Arc::new(RangeSelectExec { + input: memory_exec, + range_exec: vec![ + RangeFnExec { + expr: Arc::new( + AggregateExprBuilder::new( + min_max::min_udaf(), + vec![Arc::new(Column::new("value", 1))], + ) + .schema(input_schema.clone()) + .alias("MIN(value)") + .build() + .unwrap(), + ), + range: 10_000, + fill: Some(Fill::Null), + need_cast: None, + }, + RangeFnExec { + expr: Arc::new( + AggregateExprBuilder::new( + min_max::max_udaf(), + vec![Arc::new(Column::new("value", 1))], + ) + .schema(input_schema) + .alias("MAX(value)") + .build() + .unwrap(), + ), + range: 5_000, + fill: Some(Fill::Null), + need_cast: None, + }, + ], + align: 5_000, + align_to: 0, + by: vec![Arc::new(Column::new("host", 2))], + time_index: TIME_INDEX_COLUMN.to_string(), + schema: schema.clone(), + schema_before_project: schema.clone(), + schema_project: None, + by_schema: Arc::new(Schema::new(vec![Field::new("host", DataType::Utf8, true)])), + metric: ExecutionPlanMetricsSet::new(), + cache, + }); + let session_context = SessionContext::new(); + let result = + datafusion::physical_plan::collect(range_select_exec, session_context.task_ctx()) + .await + .unwrap(); + + assert!(result.is_empty()); + } + #[test] fn fill_test() { assert!(Fill::try_from_str("", &DataType::UInt8).unwrap().is_none()); From dca451c4851fb00723122013ab95f5ba8c8301a3 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 9 Apr 2026 11:40:14 +0800 Subject: [PATCH 084/195] fix: remap peer addresses during retries (#7933) * fix: remap peer addresses during retries Signed-off-by: WenyXu * chore: styling Signed-off-by: WenyXu * test: add tests 
Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- src/common/meta/src/ddl/create_table.rs | 27 ++++++-- src/common/meta/src/ddl/drop_database.rs | 6 +- .../meta/src/ddl/drop_database/cursor.rs | 3 + .../meta/src/ddl/drop_database/executor.rs | 69 ++++++++++++++++++- .../meta/src/ddl/drop_database/metadata.rs | 2 + .../meta/src/ddl/drop_database/start.rs | 3 + src/common/meta/src/ddl/drop_table.rs | 26 ++++++- src/common/meta/src/ddl/test_util.rs | 19 +++++ src/common/meta/src/ddl/tests/create_table.rs | 23 ++++++- src/common/meta/src/ddl/tests/drop_table.rs | 48 ++++++++++++- src/common/meta/src/ddl/truncate_table.rs | 19 ++--- src/common/meta/src/ddl_manager.rs | 40 +++++------ src/common/meta/src/key.rs | 2 +- src/common/meta/src/key/table_route.rs | 53 +++++++++----- src/common/procedure/src/local/runner.rs | 54 +++++++++++++++ src/common/procedure/src/procedure.rs | 12 ++++ src/meta-srv/src/procedure/tests.rs | 2 +- 17 files changed, 346 insertions(+), 62 deletions(-) diff --git a/src/common/meta/src/ddl/create_table.rs b/src/common/meta/src/ddl/create_table.rs index a5b642f1a2..b377a60406 100644 --- a/src/common/meta/src/ddl/create_table.rs +++ b/src/common/meta/src/ddl/create_table.rs @@ -172,8 +172,24 @@ impl CreateTableProcedure { /// - [Code::Cancelled](tonic::status::Code::Cancelled) /// - [Code::DeadlineExceeded](tonic::status::Code::DeadlineExceeded) /// - [Code::Unavailable](tonic::status::Code::Unavailable) - pub async fn on_datanode_create_regions(&mut self) -> Result { - let table_route = self.table_route()?.clone(); + pub async fn on_datanode_create_regions(&mut self, retrying: bool) -> Result { + let mut table_route = self.table_route()?.clone(); + if retrying { + info!( + "Remapping region routes addresses for retrying create regions for table: {}", + self.data.table_ref() + ); + let storage = self + .context + .table_metadata_manager + .table_route_manager() + 
.table_route_storage(); + // The peer addresses may change during retries, + // so we always remap the region routes. + storage + .remap_region_routes(&mut table_route.region_routes) + .await?; + } // Registers opening regions let guards = self.register_opening_regions(&self.context, &table_route.region_routes)?; if !guards.is_empty() { @@ -301,7 +317,10 @@ impl Procedure for CreateTableProcedure { match state { CreateTableState::Prepare => self.on_prepare().await, - CreateTableState::DatanodeCreateRegions => self.on_datanode_create_regions().await, + CreateTableState::DatanodeCreateRegions => { + let retrying = ctx.is_retrying().await.unwrap_or(false); + self.on_datanode_create_regions(retrying).await + } CreateTableState::CreateMetadata => self.on_create_metadata(ctx.procedure_id).await, } .map_err(map_to_procedure_error) @@ -339,7 +358,7 @@ pub struct CreateTableData { #[serde(default)] pub column_metadatas: Vec, /// None stands for not allocated yet. - table_route: Option, + pub(crate) table_route: Option, /// None stands for not allocated yet. 
pub region_wal_options: Option>, } diff --git a/src/common/meta/src/ddl/drop_database.rs b/src/common/meta/src/ddl/drop_database.rs index d3c5c5831e..53b6190ff5 100644 --- a/src/common/meta/src/ddl/drop_database.rs +++ b/src/common/meta/src/ddl/drop_database.rs @@ -58,6 +58,7 @@ pub(crate) struct DropDatabaseContext { schema: String, drop_if_exists: bool, tables: Option>>, + retrying: bool, } #[async_trait::async_trait] @@ -90,6 +91,7 @@ impl DropDatabaseProcedure { schema, drop_if_exists, tables: None, + retrying: false, }, state: Box::new(DropDatabaseStart), } @@ -110,6 +112,7 @@ impl DropDatabaseProcedure { schema, drop_if_exists, tables: None, + retrying: false, }, state, }) @@ -136,9 +139,10 @@ impl Procedure for DropDatabaseProcedure { }) } - async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult { let state = &mut self.state; + self.context.retrying = ctx.is_retrying().await.unwrap_or(false); let (next, status) = state .next(&self.runtime_context, &mut self.context) .await diff --git a/src/common/meta/src/ddl/drop_database/cursor.rs b/src/common/meta/src/ddl/drop_database/cursor.rs index 0a1180a52e..e060567be5 100644 --- a/src/common/meta/src/ddl/drop_database/cursor.rs +++ b/src/common/meta/src/ddl/drop_database/cursor.rs @@ -224,6 +224,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; // Ticks let (mut state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); @@ -259,6 +260,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; // Ticks let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); @@ -287,6 +289,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; // Ticks let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); diff --git 
a/src/common/meta/src/ddl/drop_database/executor.rs b/src/common/meta/src/ddl/drop_database/executor.rs index c478cdb746..80c730f30a 100644 --- a/src/common/meta/src/ddl/drop_database/executor.rs +++ b/src/common/meta/src/ddl/drop_database/executor.rs @@ -96,10 +96,25 @@ impl State for DropDatabaseExecutor { async fn next( &mut self, ddl_ctx: &DdlContext, - _ctx: &mut DropDatabaseContext, + ctx: &mut DropDatabaseContext, ) -> Result<(Box, Status)> { self.register_dropping_regions(ddl_ctx)?; let executor = DropTableExecutor::new(self.table_name.clone(), self.table_id, true); + if ctx.retrying { + info!( + "Remapping region routes addresses for retrying drop regions for table_id: {}", + self.table_id + ); + let storage = ddl_ctx + .table_metadata_manager + .table_route_manager() + .table_route_storage(); + // The peer addresses may change during retries, + // so we always remap the region routes. + storage + .remap_region_routes(&mut self.physical_region_routes) + .await?; + } // Deletes metadata for table permanently. 
let table_route_value = TableRouteValue::new( self.table_id, @@ -157,7 +172,10 @@ mod tests { use crate::ddl::drop_database::cursor::DropDatabaseCursor; use crate::ddl::drop_database::executor::DropDatabaseExecutor; use crate::ddl::drop_database::{DropDatabaseContext, DropTableTarget, State}; - use crate::ddl::test_util::{create_logical_table, create_physical_table}; + use crate::ddl::test_util::datanode_handler::DatanodeWatcher; + use crate::ddl::test_util::{ + create_logical_table, create_physical_table, put_datanode_address, + }; use crate::error::{self, Error, Result}; use crate::key::datanode_table::DatanodeTableKey; use crate::peer::Peer; @@ -206,6 +224,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); assert!(!status.need_persist()); @@ -218,6 +237,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let mut state = DropDatabaseExecutor::new( physical_table_id, @@ -258,6 +278,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); assert!(!status.need_persist()); @@ -270,6 +291,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let mut state = DropDatabaseExecutor::new( logical_table_id, @@ -360,6 +382,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let err = state.next(&ddl_context, &mut ctx).await.unwrap_err(); assert!(err.is_retry_later()); @@ -389,6 +412,7 @@ mod tests { schema: DEFAULT_SCHEMA_NAME.to_string(), drop_if_exists: false, tables: None, + retrying: false, }; state.recover(&ddl_context).unwrap(); assert_eq!(state.dropping_regions.len(), 1); @@ -398,4 +422,45 @@ mod tests { 
assert_eq!(cursor.target, DropTableTarget::Physical); } } + + #[tokio::test] + async fn test_next_remaps_addresses_when_retrying() { + let (tx, mut rx) = tokio::sync::mpsc::channel(8); + let node_manager = Arc::new(MockDatanodeManager::new(DatanodeWatcher::new(tx))); + let ddl_context = new_ddl_context(node_manager); + let physical_table_id = create_physical_table(&ddl_context, "phy").await; + let (_, table_route) = ddl_context + .table_metadata_manager + .table_route_manager() + .get_physical_table_route(physical_table_id) + .await + .unwrap(); + + let mut state = DropDatabaseExecutor::new( + physical_table_id, + physical_table_id, + TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy"), + table_route.region_routes, + DropTableTarget::Physical, + ); + state.physical_region_routes[0] + .leader_peer + .as_mut() + .unwrap() + .addr = "old-addr".to_string(); + let mut ctx = DropDatabaseContext { + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + drop_if_exists: false, + tables: None, + retrying: true, + }; + + put_datanode_address(&ddl_context, 0, "new-addr").await; + + state.next(&ddl_context, &mut ctx).await.unwrap(); + + let (peer, _) = rx.try_recv().unwrap(); + assert_eq!(peer.addr, "new-addr"); + } } diff --git a/src/common/meta/src/ddl/drop_database/metadata.rs b/src/common/meta/src/ddl/drop_database/metadata.rs index c71eded7af..0306dd2a3b 100644 --- a/src/common/meta/src/ddl/drop_database/metadata.rs +++ b/src/common/meta/src/ddl/drop_database/metadata.rs @@ -122,6 +122,7 @@ mod tests { schema: "bar".to_string(), drop_if_exists: true, tables: None, + retrying: false, }; let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); state @@ -150,6 +151,7 @@ mod tests { schema: "bar".to_string(), drop_if_exists: true, tables: None, + retrying: false, }; let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); state diff --git a/src/common/meta/src/ddl/drop_database/start.rs 
b/src/common/meta/src/ddl/drop_database/start.rs index 4da83e367f..6b3e149877 100644 --- a/src/common/meta/src/ddl/drop_database/start.rs +++ b/src/common/meta/src/ddl/drop_database/start.rs @@ -93,6 +93,7 @@ mod tests { schema: "bar".to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let err = step.next(&ddl_context, &mut ctx).await.unwrap_err(); assert_matches!(err, error::Error::SchemaNotFound { .. }); @@ -108,6 +109,7 @@ mod tests { schema: "bar".to_string(), drop_if_exists: true, tables: None, + retrying: false, }; let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); state.as_any().downcast_ref::().unwrap(); @@ -130,6 +132,7 @@ mod tests { schema: "bar".to_string(), drop_if_exists: false, tables: None, + retrying: false, }; let (state, status) = state.next(&ddl_context, &mut ctx).await.unwrap(); state.as_any().downcast_ref::().unwrap(); diff --git a/src/common/meta/src/ddl/drop_table.rs b/src/common/meta/src/ddl/drop_table.rs index 55c33330c4..8bd7c7155c 100644 --- a/src/common/meta/src/ddl/drop_table.rs +++ b/src/common/meta/src/ddl/drop_table.rs @@ -154,7 +154,24 @@ impl DropTableProcedure { Ok(Status::executing(true)) } - pub async fn on_datanode_drop_regions(&mut self) -> Result { + pub async fn on_datanode_drop_regions(&mut self, retrying: bool) -> Result { + if retrying { + info!( + "Remapping region routes addresses for retrying drop regions for table_id: {}", + self.data.table_id() + ); + let storage = self + .context + .table_metadata_manager + .table_route_manager() + .table_route_storage(); + // The peer addresses may change during retries, + // so we always remap the region routes. 
+ storage + .remap_region_routes(&mut self.data.physical_region_routes) + .await?; + } + self.executor .on_drop_regions( &self.context.node_manager, @@ -215,7 +232,7 @@ impl Procedure for DropTableProcedure { Ok(()) } - async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult { let state = &self.data.state; let _timer = metrics::METRIC_META_PROCEDURE_DROP_TABLE .with_label_values(&[state.as_ref()]) @@ -225,7 +242,10 @@ impl Procedure for DropTableProcedure { DropTableState::Prepare => self.on_prepare().await, DropTableState::DeleteMetadata => self.on_delete_metadata().await, DropTableState::InvalidateTableCache => self.on_broadcast().await, - DropTableState::DatanodeDropRegions => self.on_datanode_drop_regions().await, + DropTableState::DatanodeDropRegions => { + let retrying = ctx.is_retrying().await.unwrap_or(false); + self.on_datanode_drop_regions(retrying).await + } DropTableState::DeleteTombstone => self.on_delete_metadata_tombstone().await, } .map_err(map_to_procedure_error) diff --git a/src/common/meta/src/ddl/test_util.rs b/src/common/meta/src/ddl/test_util.rs index 36d422216f..7d8cdd895b 100644 --- a/src/common/meta/src/ddl/test_util.rs +++ b/src/common/meta/src/ddl/test_util.rs @@ -41,8 +41,12 @@ use crate::ddl::test_util::create_table::{ TestCreateTableExprBuilder, build_raw_table_info_from_expr, }; use crate::ddl::{DdlContext, TableMetadata}; +use crate::key::node_address::{NodeAddressKey, NodeAddressValue}; use crate::key::table_route::TableRouteValue; +use crate::key::{MetadataKey, MetadataValue}; +use crate::peer::Peer; use crate::rpc::ddl::CreateTableTask; +use crate::rpc::store::PutRequest; pub async fn create_physical_table_metadata( ddl_context: &DdlContext, @@ -56,6 +60,21 @@ pub async fn create_physical_table_metadata( .unwrap(); } +pub async fn put_datanode_address(ddl_context: &DdlContext, node_id: u64, addr: &str) { + ddl_context + 
.table_metadata_manager + .kv_backend() + .put(PutRequest { + key: NodeAddressKey::with_datanode(node_id).to_bytes(), + value: NodeAddressValue::new(Peer::new(node_id, addr)) + .try_as_raw_value() + .unwrap(), + ..Default::default() + }) + .await + .unwrap(); +} + pub async fn create_physical_table(ddl_context: &DdlContext, name: &str) -> TableId { // Prepares physical table metadata. let mut create_physical_table_task = test_create_physical_table_task(name); diff --git a/src/common/meta/src/ddl/tests/create_table.rs b/src/common/meta/src/ddl/tests/create_table.rs index 5355ac8c7c..7f4a6bd716 100644 --- a/src/common/meta/src/ddl/tests/create_table.rs +++ b/src/common/meta/src/ddl/tests/create_table.rs @@ -42,7 +42,7 @@ use crate::ddl::test_util::datanode_handler::{ DatanodeWatcher, NaiveDatanodeHandler, RetryErrorDatanodeHandler, UnexpectedErrorDatanodeHandler, }; -use crate::ddl::test_util::{assert_column_name, get_raw_table_info}; +use crate::ddl::test_util::{assert_column_name, get_raw_table_info, put_datanode_address}; use crate::error::{Error, Result}; use crate::key::table_route::TableRouteValue; use crate::kv_backend::memory::MemoryKvBackend; @@ -244,6 +244,27 @@ async fn test_on_datanode_create_regions_should_not_retry() { assert!(!error.is_retry_later()); } +#[tokio::test] +async fn test_on_datanode_create_regions_remaps_addresses_when_retrying() { + let (tx, mut rx) = mpsc::channel(8); + let datanode_handler = DatanodeWatcher::new(tx).with_handler(create_request_handler); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); + let ddl_context = new_ddl_context(node_manager); + let task = test_create_table_task("foo"); + let mut procedure = CreateTableProcedure::new(task, ddl_context.clone()).unwrap(); + procedure.on_prepare().await.unwrap(); + + let table_route = procedure.data.table_route.as_mut().unwrap(); + let leader = table_route.region_routes[0].leader_peer.as_mut().unwrap(); + leader.addr = "old-addr".to_string(); + 
put_datanode_address(&ddl_context, leader.id, "new-addr").await; + + procedure.on_datanode_create_regions(true).await.unwrap(); + + let (peer, _) = rx.try_recv().unwrap(); + assert_eq!(peer.addr, "new-addr"); +} + #[tokio::test] async fn test_on_create_metadata_error() { common_telemetry::init_default_ut_logging(); diff --git a/src/common/meta/src/ddl/tests/drop_table.rs b/src/common/meta/src/ddl/tests/drop_table.rs index fb2c882da0..ae81bd7f52 100644 --- a/src/common/meta/src/ddl/tests/drop_table.rs +++ b/src/common/meta/src/ddl/tests/drop_table.rs @@ -34,7 +34,7 @@ use crate::ddl::test_util::create_table::test_create_table_task; use crate::ddl::test_util::datanode_handler::{DatanodeWatcher, NaiveDatanodeHandler}; use crate::ddl::test_util::{ create_logical_table, create_physical_table, create_physical_table_metadata, - test_create_logical_table_task, test_create_physical_table_task, + put_datanode_address, test_create_logical_table_task, test_create_physical_table_task, }; use crate::key::table_route::TableRouteValue; use crate::kv_backend::memory::MemoryKvBackend; @@ -146,7 +146,7 @@ async fn test_on_datanode_drop_regions() { // Drop table let mut procedure = DropTableProcedure::new(task, ddl_context); procedure.on_prepare().await.unwrap(); - procedure.on_datanode_drop_regions().await.unwrap(); + procedure.on_datanode_drop_regions(false).await.unwrap(); let check = |peer: Peer, request: RegionRequest, @@ -186,6 +186,50 @@ async fn test_on_datanode_drop_regions() { check(peer, request, 5, RegionId::new(table_id, 1), true); } +#[tokio::test] +async fn test_on_datanode_drop_regions_remaps_addresses_when_retrying() { + let (tx, mut rx) = mpsc::channel(8); + let datanode_handler = DatanodeWatcher::new(tx); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); + let ddl_context = new_ddl_context(node_manager); + let table_id = 1024; + let table_name = "foo"; + let task = test_create_table_task(table_name, table_id); + ddl_context + 
.table_metadata_manager + .create_table_metadata( + task.table_info.clone(), + TableRouteValue::physical(vec![RegionRoute { + region: Region::new_test(RegionId::new(table_id, 1)), + leader_peer: Some(Peer::new(1, "old-leader")), + follower_peers: vec![Peer::new(5, "old-follower")], + leader_state: None, + leader_down_since: None, + write_route_policy: None, + }]), + HashMap::new(), + ) + .await + .unwrap(); + + let task = new_drop_table_task(table_name, table_id, false); + let mut procedure = DropTableProcedure::new(task, ddl_context.clone()); + procedure.on_prepare().await.unwrap(); + + put_datanode_address(&ddl_context, 1, "new-leader").await; + put_datanode_address(&ddl_context, 5, "new-follower").await; + + procedure.on_datanode_drop_regions(true).await.unwrap(); + + let mut peers = Vec::new(); + for _ in 0..2 { + peers.push(rx.try_recv().unwrap().0); + } + peers.sort_unstable_by_key(|p| p.id); + assert_eq!(peers[0].addr, "new-leader"); + assert_eq!(peers[1].addr, "new-follower"); +} + #[tokio::test] async fn test_on_rollback() { let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); diff --git a/src/common/meta/src/ddl/truncate_table.rs b/src/common/meta/src/ddl/truncate_table.rs index 40e8980a1c..21491b6230 100644 --- a/src/common/meta/src/ddl/truncate_table.rs +++ b/src/common/meta/src/ddl/truncate_table.rs @@ -42,7 +42,7 @@ use crate::key::table_name::TableNameKey; use crate::lock_key::{CatalogLock, SchemaLock, TableLock}; use crate::metrics; use crate::rpc::ddl::TruncateTableTask; -use crate::rpc::router::{RegionRoute, find_leader_regions, find_leaders}; +use crate::rpc::router::{find_leader_regions, find_leaders}; pub struct TruncateTableProcedure { context: DdlContext, @@ -94,12 +94,11 @@ impl TruncateTableProcedure { pub(crate) fn new( task: TruncateTableTask, table_info_value: DeserializedValueWithBytes, - region_routes: Vec, context: DdlContext, ) -> Self { Self { context, - data: TruncateTableData::new(task, table_info_value, 
region_routes), + data: TruncateTableData::new(task, table_info_value), } } @@ -138,13 +137,18 @@ impl TruncateTableProcedure { async fn on_datanode_truncate_regions(&mut self) -> Result { let table_id = self.data.table_id(); - let region_routes = &self.data.region_routes; - let leaders = find_leaders(region_routes); + let (_, physical_table_route) = self + .context + .table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await?; + let leaders = find_leaders(&physical_table_route.region_routes); let mut truncate_region_tasks = Vec::with_capacity(leaders.len()); for datanode in leaders { let requester = self.context.node_manager.datanode(&datanode).await; - let regions = find_leader_regions(region_routes, &datanode); + let regions = find_leader_regions(&physical_table_route.region_routes, &datanode); for region in regions { let region_id = RegionId::new(table_id, region); @@ -201,20 +205,17 @@ pub struct TruncateTableData { state: TruncateTableState, task: TruncateTableTask, table_info_value: DeserializedValueWithBytes, - region_routes: Vec, } impl TruncateTableData { pub fn new( task: TruncateTableTask, table_info_value: DeserializedValueWithBytes, - region_routes: Vec, ) -> Self { Self { state: TruncateTableState::Prepare, task, table_info_value, - region_routes, } } diff --git a/src/common/meta/src/ddl_manager.rs b/src/common/meta/src/ddl_manager.rs index 0106add32f..d0619ca74f 100644 --- a/src/common/meta/src/ddl_manager.rs +++ b/src/common/meta/src/ddl_manager.rs @@ -45,7 +45,7 @@ use crate::ddl::drop_view::DropViewProcedure; use crate::ddl::truncate_table::TruncateTableProcedure; use crate::ddl::{DdlContext, utils}; use crate::error::{ - CreateRepartitionProcedureSnafu, EmptyDdlTasksSnafu, ProcedureOutputSnafu, + self, CreateRepartitionProcedureSnafu, EmptyDdlTasksSnafu, ProcedureOutputSnafu, RegisterProcedureLoaderSnafu, RegisterRepartitionProcedureLoaderSnafu, Result, SubmitProcedureSnafu, TableInfoNotFoundSnafu, 
TableNotFoundSnafu, TableRouteNotFoundSnafu, UnexpectedLogicalRouteTableSnafu, WaitProcedureSnafu, @@ -72,7 +72,6 @@ use crate::rpc::ddl::{ CreateTableTask, CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask, QueryContext, SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask, }; -use crate::rpc::router::RegionRoute; /// A configurator that customizes or enhances a [`DdlManager`]. #[async_trait::async_trait] @@ -521,15 +520,9 @@ impl DdlManager { &self, truncate_table_task: TruncateTableTask, table_info_value: DeserializedValueWithBytes, - region_routes: Vec, ) -> Result<(ProcedureId, Option)> { let context = self.create_context(); - let procedure = TruncateTableProcedure::new( - truncate_table_task, - table_info_value, - region_routes, - context, - ); + let procedure = TruncateTableProcedure::new(truncate_table_task, table_info_value, context); let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); @@ -658,19 +651,26 @@ async fn handle_truncate_table_task( let table_metadata_manager = &ddl_manager.table_metadata_manager(); let table_ref = truncate_table_task.table_ref(); - let (table_info_value, table_route_value) = - table_metadata_manager.get_full_table_info(table_id).await?; - - let table_info_value = table_info_value.with_context(|| TableInfoNotFoundSnafu { - table: table_ref.to_string(), - })?; - - let table_route_value = table_route_value.context(TableRouteNotFoundSnafu { table_id })?; - - let table_route = table_route_value.into_inner().region_routes()?.clone(); + let table_info_value = table_metadata_manager + .table_info_manager() + .get(table_id) + .await? + .with_context(|| TableInfoNotFoundSnafu { + table: table_ref.to_string(), + })?; + let physical_table_id = table_metadata_manager + .table_route_manager() + .get_physical_table_id(table_id) + .await?; + ensure!( + physical_table_id == table_id, + error::UnexpectedSnafu { + err_msg: "Truncate table is only supported for physical tables." 
+ } + ); let (id, _) = ddl_manager - .submit_truncate_table_task(truncate_table_task, table_info_value, table_route) + .submit_truncate_table_task(truncate_table_task, table_info_value) .await?; info!("Table: {table_id} is truncated via procedure_id {id:?}"); diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index 332c60f225..3a9217e710 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -663,7 +663,7 @@ impl TableMetadataManager { if let Some(table_route_value) = &mut table_route_value { self.table_route_manager() .table_route_storage() - .remap_route_address(table_route_value) + .remap_table_route(table_route_value) .await?; } Ok((table_info_value, table_route_value)) diff --git a/src/common/meta/src/key/table_route.rs b/src/common/meta/src/key/table_route.rs index a409b75fce..4c618addda 100644 --- a/src/common/meta/src/key/table_route.rs +++ b/src/common/meta/src/key/table_route.rs @@ -675,7 +675,7 @@ impl TableRouteStorage { pub async fn get(&self, table_id: TableId) -> Result> { let mut table_route = self.get_inner(table_id).await?; if let Some(table_route) = &mut table_route { - self.remap_route_address(table_route).await?; + self.remap_table_route(table_route).await?; }; Ok(table_route) @@ -697,7 +697,7 @@ impl TableRouteStorage { ) -> Result>> { let mut table_route = self.get_with_raw_bytes_inner(table_id).await?; if let Some(table_route) = &mut table_route { - self.remap_route_address(table_route).await?; + self.remap_table_route(table_route).await?; }; Ok(table_route) @@ -791,10 +791,7 @@ impl TableRouteStorage { Ok(()) } - pub(crate) async fn remap_route_address( - &self, - table_route: &mut TableRouteValue, - ) -> Result<()> { + pub(crate) async fn remap_table_route(&self, table_route: &mut TableRouteValue) -> Result<()> { let keys = extract_address_keys(table_route).into_iter().collect(); let node_addrs = self.get_node_addresses(keys).await?; set_addresses(&node_addrs, table_route)?; @@ -802,6 +799,17 @@ impl 
TableRouteStorage { Ok(()) } + pub(crate) async fn remap_region_routes( + &self, + region_routes: &mut [RegionRoute], + ) -> Result<()> { + let keys = extract_address_keys_from_region_routes(region_routes) + .into_iter() + .collect(); + let node_addrs = self.get_node_addresses(keys).await?; + set_addresses_for_region_routes(&node_addrs, region_routes) + } + async fn get_node_addresses( &self, keys: Vec>, @@ -824,15 +832,11 @@ impl TableRouteStorage { } } -fn set_addresses( +fn set_addresses_for_region_routes( node_addrs: &HashMap, - table_route: &mut TableRouteValue, + region_routes: &mut [RegionRoute], ) -> Result<()> { - let TableRouteValue::Physical(physical_table_route) = table_route else { - return Ok(()); - }; - - for region_route in &mut physical_table_route.region_routes { + for region_route in region_routes { if let Some(leader) = &mut region_route.leader_peer && let Some(node_addr) = node_addrs.get(&leader.id) { @@ -848,13 +852,18 @@ fn set_addresses( Ok(()) } -fn extract_address_keys(table_route: &TableRouteValue) -> HashSet> { +fn set_addresses( + node_addrs: &HashMap, + table_route: &mut TableRouteValue, +) -> Result<()> { let TableRouteValue::Physical(physical_table_route) = table_route else { - return HashSet::default(); + return Ok(()); }; + set_addresses_for_region_routes(node_addrs, &mut physical_table_route.region_routes) +} - physical_table_route - .region_routes +fn extract_address_keys_from_region_routes(region_routes: &[RegionRoute]) -> HashSet> { + region_routes .iter() .flat_map(|region_route| { region_route @@ -871,6 +880,14 @@ fn extract_address_keys(table_route: &TableRouteValue) -> HashSet> { .collect() } +fn extract_address_keys(table_route: &TableRouteValue) -> HashSet> { + let TableRouteValue::Physical(physical_table_route) = table_route else { + return HashSet::default(); + }; + + extract_address_keys_from_region_routes(&physical_table_route.region_routes) +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -1104,7 +1121,7 @@ mod 
tests { .unwrap(); table_route_storage - .remap_route_address(&mut table_route) + .remap_table_route(&mut table_route) .await .unwrap(); diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs index 2a974de889..ca3e221f43 100644 --- a/src/common/procedure/src/local/runner.rs +++ b/src/common/procedure/src/local/runner.rs @@ -1293,6 +1293,60 @@ mod tests { .await; } + #[tokio::test] + async fn test_retrying_state_visible_in_context_on_retry() { + let retrying_states = Arc::new(std::sync::Mutex::new(Vec::new())); + let captured = retrying_states.clone(); + let mut times = 0; + + let exec_fn = move |ctx: Context| { + times += 1; + let captured = captured.clone(); + async move { + let is_retrying = ctx.is_retrying().await; + captured.lock().unwrap().push(is_retrying); + if times == 1 { + Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) + } else { + Ok(Status::done()) + } + } + .boxed() + }; + + let procedure = ProcedureAdapter { + data: "retrying_state".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table"), + poison_keys: PoisonKeys::default(), + exec_fn, + rollback_fn: None, + }; + + let dir = create_temp_dir("retrying_state"); + let meta = procedure.new_meta(ROOT_ID); + let object_store = test_util::new_object_store(&dir); + let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store)); + let mut runner = new_runner(meta.clone(), Box::new(procedure), procedure_store); + let ctx = context_with_provider( + meta.id, + runner.manager_ctx.clone() as Arc, + ); + + runner + .manager_ctx + .procedures + .write() + .unwrap() + .insert(meta.id, runner.meta.clone()); + runner.manager_ctx.start(); + + runner.execute_once(&ctx).await; + runner.execute_once(&ctx).await; + + let states = retrying_states.lock().unwrap().clone(); + assert_eq!(states, vec![Some(false), Some(true)]); + } + #[tokio::test(flavor = "multi_thread")] async fn test_execute_on_retry_later_error_with_child() { 
common_telemetry::init_default_ut_logging(); diff --git a/src/common/procedure/src/procedure.rs b/src/common/procedure/src/procedure.rs index 843052ed97..8e34f4bb30 100644 --- a/src/common/procedure/src/procedure.rs +++ b/src/common/procedure/src/procedure.rs @@ -177,6 +177,18 @@ pub struct Context { pub provider: ContextProviderRef, } +impl Context { + /// Returns true if current procedure state is retrying. + pub async fn is_retrying(&self) -> Option { + self.provider + .procedure_state(self.procedure_id) + .await + .ok() + .flatten() + .map(|s| s.is_retrying()) + } +} + /// A `Procedure` represents an operation or a set of operations to be performed step-by-step. #[async_trait] pub trait Procedure: Send { diff --git a/src/meta-srv/src/procedure/tests.rs b/src/meta-srv/src/procedure/tests.rs index 105bfb7bc4..93879d3c08 100644 --- a/src/meta-srv/src/procedure/tests.rs +++ b/src/meta-srv/src/procedure/tests.rs @@ -219,7 +219,7 @@ async fn test_on_datanode_create_regions() { } }); - let status = procedure.on_datanode_create_regions().await.unwrap(); + let status = procedure.on_datanode_create_regions(false).await.unwrap(); assert!(matches!( status, Status::Executing { From 24ab861052f8baef6bb46f904e3bedeca4b201c2 Mon Sep 17 00:00:00 2001 From: Lanqing Yang Date: Thu, 9 Apr 2026 04:12:05 -0700 Subject: [PATCH 085/195] chore: move Tantivy fulltext search to blocking thread pool (#7919) perf: move Tantivy fulltext search to blocking thread pool Wrap the synchronous Tantivy search (query parsing, posting list traversal, stored field reads) in spawn_blocking_global to avoid starving the tokio async runtime with CPU-bound work. 
Signed-off-by: lyang24 --- .../src/fulltext_index/search/tantivy.rs | 123 ++++++++++-------- 1 file changed, 69 insertions(+), 54 deletions(-) diff --git a/src/index/src/fulltext_index/search/tantivy.rs b/src/index/src/fulltext_index/search/tantivy.rs index d06a5cd329..9fb0091868 100644 --- a/src/index/src/fulltext_index/search/tantivy.rs +++ b/src/index/src/fulltext_index/search/tantivy.rs @@ -14,6 +14,7 @@ use std::collections::{BTreeSet, HashMap}; use std::path::Path; +use std::sync::Arc; use std::time::Instant; use async_trait::async_trait; @@ -27,15 +28,19 @@ use tantivy::{Index, IndexReader, ReloadPolicy, TantivyDocument}; use crate::fulltext_index::Config; use crate::fulltext_index::create::{ROWID_FIELD_NAME, TEXT_FIELD_NAME}; use crate::fulltext_index::error::{ - Result, TantivyDocNotFoundSnafu, TantivyParserSnafu, TantivySnafu, + JoinSnafu, Result, TantivyDocNotFoundSnafu, TantivyParserSnafu, TantivySnafu, }; use crate::fulltext_index::search::{FulltextIndexSearcher, RowId}; /// `TantivyFulltextIndexSearcher` is a searcher using Tantivy. pub struct TantivyFulltextIndexSearcher { - /// Tanitvy index. + inner: Arc, +} + +struct TantivySearcherInner { + /// Tantivy index. index: Index, - /// Tanitvy index reader. + /// Tantivy index reader. 
reader: IndexReader, /// The default field used to build `QueryParser` default_field: Field, @@ -66,63 +71,73 @@ impl TantivyFulltextIndexSearcher { ); Ok(Self { - index, - reader, - default_field, + inner: Arc::new(TantivySearcherInner { + index, + reader, + default_field, + }), }) } } +fn search_sync(inner: &TantivySearcherInner, query: &str) -> Result> { + let searcher = inner.reader.searcher(); + let query_parser = QueryParser::for_index(&inner.index, vec![inner.default_field]); + let query = query_parser + .parse_query(query) + .context(TantivyParserSnafu)?; + let doc_addrs = searcher + .search(&query, &DocSetCollector) + .context(TantivySnafu)?; + + let seg_metas = inner + .index + .searchable_segment_metas() + .context(TantivySnafu)?; + + // FAST PATH: only one segment, the doc id is the same as the row id. + // Also for compatibility with the old version. + if seg_metas.len() == 1 { + return Ok(doc_addrs.into_iter().map(|d| d.doc_id).collect()); + } + + // SLOW PATH: multiple segments, need to calculate the row id. + let rowid_field = searcher + .schema() + .get_field(ROWID_FIELD_NAME) + .context(TantivySnafu)?; + let mut seg_offsets = HashMap::with_capacity(seg_metas.len()); + let mut res = BTreeSet::new(); + for doc_addr in doc_addrs { + let offset = if let Some(offset) = seg_offsets.get(&doc_addr.segment_ord) { + *offset + } else { + // Calculate the offset at the first time meeting the segment and cache it since + // the offset is the same for all rows in the same segment. 
+ let doc: TantivyDocument = searcher.doc(doc_addr).context(TantivySnafu)?; + let rowid = doc + .get_first(rowid_field) + .and_then(|v| v.as_u64()) + .context(TantivyDocNotFoundSnafu { doc_addr })?; + + let offset = rowid as u32 - doc_addr.doc_id; + seg_offsets.insert(doc_addr.segment_ord, offset); + offset + }; + + res.insert(doc_addr.doc_id + offset); + } + + Ok(res) +} + #[async_trait] impl FulltextIndexSearcher for TantivyFulltextIndexSearcher { async fn search(&self, query: &str) -> Result> { - let searcher = self.reader.searcher(); - let query_parser = QueryParser::for_index(&self.index, vec![self.default_field]); - let query = query_parser - .parse_query(query) - .context(TantivyParserSnafu)?; - let doc_addrs = searcher - .search(&query, &DocSetCollector) - .context(TantivySnafu)?; - - let seg_metas = self - .index - .searchable_segment_metas() - .context(TantivySnafu)?; - - // FAST PATH: only one segment, the doc id is the same as the row id. - // Also for compatibility with the old version. - if seg_metas.len() == 1 { - return Ok(doc_addrs.into_iter().map(|d| d.doc_id).collect()); - } - - // SLOW PATH: multiple segments, need to calculate the row id. - let rowid_field = searcher - .schema() - .get_field(ROWID_FIELD_NAME) - .context(TantivySnafu)?; - let mut seg_offsets = HashMap::with_capacity(seg_metas.len()); - let mut res = BTreeSet::new(); - for doc_addr in doc_addrs { - let offset = if let Some(offset) = seg_offsets.get(&doc_addr.segment_ord) { - *offset - } else { - // Calculate the offset at the first time meeting the segment and cache it since - // the offset is the same for all rows in the same segment. 
- let doc: TantivyDocument = searcher.doc(doc_addr).context(TantivySnafu)?; - let rowid = doc - .get_first(rowid_field) - .and_then(|v| v.as_u64()) - .context(TantivyDocNotFoundSnafu { doc_addr })?; - - let offset = rowid as u32 - doc_addr.doc_id; - seg_offsets.insert(doc_addr.segment_ord, offset); - offset - }; - - res.insert(doc_addr.doc_id + offset); - } - - Ok(res) + let inner = self.inner.clone(); + let query = query.to_string(); + common_runtime::spawn_blocking_global(move || search_sync(&inner, &query)) + .await + .context(JoinSnafu)? } } From fb5333e116d89fd073d2c40e84486caa7a5a83db Mon Sep 17 00:00:00 2001 From: Yingwen Date: Thu, 9 Apr 2026 20:37:24 +0800 Subject: [PATCH 086/195] ci: add standalone workflows for bumping helm charts and homebrew (#7941) ci: add standalone workflows for bumping helm charts and homebrew versions Signed-off-by: evenyag --- .../workflows/bump-helm-charts-version.yml | 29 +++++++++++++++++++ .../bump-homebrew-greptime-version.yml | 29 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 .github/workflows/bump-helm-charts-version.yml create mode 100644 .github/workflows/bump-homebrew-greptime-version.yml diff --git a/.github/workflows/bump-helm-charts-version.yml b/.github/workflows/bump-helm-charts-version.yml new file mode 100644 index 0000000000..5921ec8a8c --- /dev/null +++ b/.github/workflows/bump-helm-charts-version.yml @@ -0,0 +1,29 @@ +name: Bump helm charts version + +on: + workflow_dispatch: + inputs: + version: + description: The version to bump (e.g. 
v1.0.0) + required: true + type: string + +jobs: + bump-helm-charts-version: + name: Bump helm charts version + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Bump helm charts version + env: + GITHUB_TOKEN: ${{ secrets.HELM_CHARTS_REPO_TOKEN }} + VERSION: ${{ inputs.version }} + run: | + ./.github/scripts/update-helm-charts-version.sh diff --git a/.github/workflows/bump-homebrew-greptime-version.yml b/.github/workflows/bump-homebrew-greptime-version.yml new file mode 100644 index 0000000000..af8ca8fc99 --- /dev/null +++ b/.github/workflows/bump-homebrew-greptime-version.yml @@ -0,0 +1,29 @@ +name: Bump homebrew greptime version + +on: + workflow_dispatch: + inputs: + version: + description: The version to bump (e.g. v1.0.0) + required: true + type: string + +jobs: + bump-homebrew-greptime-version: + name: Bump homebrew greptime version + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Bump homebrew greptime version + env: + GITHUB_TOKEN: ${{ secrets.HOMEBREW_GREPTIME_REPO_TOKEN }} + VERSION: ${{ inputs.version }} + run: | + ./.github/scripts/update-homebrew-greptme-version.sh From e9d783cccf6b41596b4537ac102b9e757a462da8 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 10 Apr 2026 03:18:56 +0800 Subject: [PATCH 087/195] feat: execution timeout for prepared statement (#7932) * feat: execution timeout for prepared statement * fix: lint fix --- src/frontend/src/instance.rs | 58 +++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index ce589bb677..99444bb2a2 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -483,6 +483,27 @@ fn derive_timeout(stmt: &Statement, 
query_ctx: &QueryContextRef) -> Option, + query_ctx: &QueryContextRef, +) -> Option { + match stmt { + Some(s) => derive_timeout(s, query_ctx), + None => { + let query_timeout = query_ctx.query_timeout()?; + if query_timeout.is_zero() { + return None; + } + match query_ctx.channel() { + Channel::Postgres => Some(query_timeout), + _ => None, + } + } + } +} + fn attach_timeout(output: Output, mut timeout: Duration) -> Result { if timeout.is_zero() { return StatementTimeoutSnafu.fail(); @@ -588,6 +609,33 @@ impl Instance { } } + async fn exec_plan(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result { + self.query_engine + .execute(plan, query_ctx) + .await + .context(ExecLogicalPlanSnafu) + } + + async fn exec_plan_with_timeout( + &self, + stmt: Option, + plan: LogicalPlan, + query_ctx: QueryContextRef, + ) -> Result { + let timeout = derive_timeout_for_plan(stmt.as_ref(), &query_ctx); + match timeout { + Some(timeout) => { + let start = tokio::time::Instant::now(); + let output = tokio::time::timeout(timeout, self.exec_plan(plan, query_ctx)) + .await + .map_err(|_| StatementTimeoutSnafu.build())??; + let remaining_timeout = timeout.checked_sub(start.elapsed()).unwrap_or_default(); + attach_timeout(output, remaining_timeout) + } + None => self.exec_plan(plan, query_ctx).await, + } + } + async fn do_exec_plan_inner( &self, stmt: Option, @@ -624,7 +672,7 @@ impl Instance { slow_query_timer, ); - let query_fut = self.query_engine.execute(plan.clone(), query_ctx); + let query_fut = self.exec_plan_with_timeout(Some(stmt), plan, query_ctx); CancellableFuture::new(query_fut, ticket.cancellation_handle.clone()) .await @@ -640,14 +688,8 @@ impl Instance { }; Output { data, meta } }) - .context(ExecLogicalPlanSnafu) } else { - // plan should be prepared before exec - // we'll do check there - self.query_engine - .execute(plan.clone(), query_ctx) - .await - .context(ExecLogicalPlanSnafu) + self.exec_plan_with_timeout(stmt, plan, query_ctx).await } } From 
fd94f5519398af27f9fb7119abffa43ac0315e70 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 10 Apr 2026 11:12:33 +0800 Subject: [PATCH 088/195] refactor(mito2): remove dead scan code (#7925) * refactor(mito2): remove dead batch parallel scan helpers Signed-off-by: evenyag * refactor(mito2): remove dead merge reader path Signed-off-by: evenyag * refactor(mito2): remove dead batch dedup reader Signed-off-by: evenyag * test(mito2): remove obsolete batch source helper Signed-off-by: evenyag * refactor: remove unused plain batch Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/mito2/benches/simple_bulk_memtable.rs | 47 +- .../src/memtable/simple_bulk_memtable.rs | 106 +- src/mito2/src/read.rs | 2 - src/mito2/src/read/dedup.rs | 544 +--------- src/mito2/src/read/flat_merge.rs | 82 +- src/mito2/src/read/merge.rs | 982 ------------------ src/mito2/src/read/plain_batch.rs | 505 --------- src/mito2/src/read/scan_region.rs | 78 +- src/mito2/src/read/scan_util.rs | 2 +- src/mito2/src/sst.rs | 28 - src/mito2/src/test_util/sst_util.rs | 10 +- 11 files changed, 94 insertions(+), 2292 deletions(-) delete mode 100644 src/mito2/src/read/merge.rs delete mode 100644 src/mito2/src/read/plain_batch.rs diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs index 05035734de..8a199f46f1 100644 --- a/src/mito2/benches/simple_bulk_memtable.rs +++ b/src/mito2/benches/simple_bulk_memtable.rs @@ -21,11 +21,7 @@ use criterion::{Criterion, criterion_group, criterion_main}; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable; -use mito2::memtable::{IterBuilder, KeyValues, Memtable, MemtableRanges, RangesOptions}; -use mito2::read; -use mito2::read::Source; -use mito2::read::dedup::DedupReader; -use mito2::read::merge::MergeReaderBuilder; +use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions}; use mito2::region::options::MergeMode; use 
mito2::test_util::column_metadata_to_column_schema; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; @@ -126,36 +122,6 @@ fn create_memtable_with_rows(num_batches: usize) -> SimpleBulkMemtable { } async fn flush(mem: &SimpleBulkMemtable) { - let MemtableRanges { ranges, .. } = mem.ranges(None, RangesOptions::for_flush()).unwrap(); - - let mut source = if ranges.len() == 1 { - let only_range = ranges.into_values().next().unwrap(); - let iter = only_range.build_iter().unwrap(); - Source::Iter(iter) - } else { - let sources = ranges - .into_values() - .map(|r| r.build_iter().map(Source::Iter)) - .collect::>>() - .unwrap(); - let merge_reader = MergeReaderBuilder::from_sources(sources) - .build() - .await - .unwrap(); - let reader = Box::new(DedupReader::new( - merge_reader, - read::dedup::LastRow::new(true), - None, - )); - Source::Reader(reader) - }; - - while let Some(b) = source.next_batch().await.unwrap() { - black_box(b); - } -} - -async fn flush_original(mem: &SimpleBulkMemtable) { let iter = mem .ranges(None, RangesOptions::default()) .unwrap() @@ -179,19 +145,10 @@ fn bench_ranges_parallel_vs_sequential(c: &mut Criterion) { let total_rows_k = num_batch * 10; let memtable = create_memtable_with_rows(num_batch); - group.bench_with_input( - BenchmarkId::new("flush_by_merge_reader", format!("{}k_rows", total_rows_k)), - &memtable, - |b, memtable| b.to_async(&rt).iter(|| async { flush(memtable).await }), - ); - group.bench_with_input( BenchmarkId::new("flush_by_iter", format!("{}k_rows", total_rows_k)), &memtable, - |b, memtable| { - b.to_async(&rt) - .iter(|| async { flush_original(memtable).await }) - }, + |b, memtable| b.to_async(&rt).iter(|| async { flush(memtable).await }), ); } diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs index 1284741347..6ff799ebf5 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable.rs @@ -421,10 +421,6 
@@ mod tests { use store_api::storage::{RegionId, SequenceNumber, SequenceRange}; use super::*; - use crate::read; - use crate::read::dedup::DedupReader; - use crate::read::merge::MergeReaderBuilder; - use crate::read::{BatchReader, Source}; use crate::region::options::MergeMode; use crate::test_util::column_metadata_to_column_schema; @@ -621,81 +617,6 @@ mod tests { assert_eq!(1, batch.num_rows()); } - #[tokio::test] - async fn test_write_dedup() { - let memtable = new_test_memtable(true, MergeMode::LastRow); - let kvs = build_key_values( - &memtable.region_metadata, - 0, - &[(1, 1.0, "a".to_string())], - OpType::Put, - ); - let kv = kvs.iter().next().unwrap(); - memtable.write_one(kv).unwrap(); - memtable.freeze().unwrap(); - - let kvs = build_key_values( - &memtable.region_metadata, - 1, - &[(1, 1.0, "a".to_string())], - OpType::Delete, - ); - let kv = kvs.iter().next().unwrap(); - memtable.write_one(kv).unwrap(); - - let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); - let mut source = vec![]; - for r in ranges.ranges.values() { - source.push(Source::Iter(r.build_iter().unwrap())); - } - - let reader = MergeReaderBuilder::from_sources(source) - .build() - .await - .unwrap(); - - let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false), None); - let mut num_rows = 0; - while let Some(b) = reader.next_batch().await.unwrap() { - num_rows += b.num_rows(); - } - assert_eq!(num_rows, 1); - } - - #[tokio::test] - async fn test_delete_only() { - let memtable = new_test_memtable(true, MergeMode::LastRow); - let kvs = build_key_values( - &memtable.region_metadata, - 0, - &[(1, 1.0, "a".to_string())], - OpType::Delete, - ); - let kv = kvs.iter().next().unwrap(); - memtable.write_one(kv).unwrap(); - memtable.freeze().unwrap(); - - let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); - let mut source = vec![]; - for r in ranges.ranges.values() { - source.push(Source::Iter(r.build_iter().unwrap())); - } - - let reader = 
MergeReaderBuilder::from_sources(source) - .build() - .await - .unwrap(); - - let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false), None); - let mut num_rows = 0; - while let Some(b) = reader.next_batch().await.unwrap() { - num_rows += b.num_rows(); - assert_eq!(b.num_rows(), 1); - assert_eq!(b.op_types().get_data(0).unwrap(), OpType::Delete as u8); - } - assert_eq!(num_rows, 1); - } - #[tokio::test] async fn test_single_range() { let memtable = new_test_memtable(true, MergeMode::LastRow); @@ -902,8 +823,8 @@ mod tests { .unwrap() } - #[tokio::test] - async fn test_write_read_large_string() { + #[test] + fn test_write_read_large_string() { let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456)); builder .push_column_metadata(ColumnMetadata { @@ -948,25 +869,12 @@ mod tests { .unwrap(); let MemtableRanges { ranges, .. } = memtable.ranges(None, RangesOptions::default()).unwrap(); - let mut source = if ranges.len() == 1 { - let only_range = ranges.into_values().next().unwrap(); - Source::Iter(only_range.build_iter().unwrap()) - } else { - let sources = ranges - .into_values() - .map(|r| r.build_iter().map(Source::Iter)) - .collect::>>() - .unwrap(); - let merge_reader = MergeReaderBuilder::from_sources(sources) - .build() - .await - .unwrap(); - Source::Reader(Box::new(merge_reader)) - }; - let mut rows = 0; - while let Some(b) = source.next_batch().await.unwrap() { - rows += b.num_rows(); + for range in ranges.into_values() { + let iter = range.build_iter().unwrap(); + for batch in iter { + rows += batch.unwrap().num_rows(); + } } assert_eq!(rows, 2); } diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs index db7dfd1958..90eb9a3da7 100644 --- a/src/mito2/src/read.rs +++ b/src/mito2/src/read.rs @@ -21,8 +21,6 @@ pub mod flat_dedup; pub mod flat_merge; pub mod flat_projection; pub mod last_row; -pub mod merge; -pub mod plain_batch; pub mod projection; pub(crate) mod prune; pub(crate) mod pruner; diff --git 
a/src/mito2/src/read/dedup.rs b/src/mito2/src/read/dedup.rs index 5c881459b2..86f6b07ffc 100644 --- a/src/mito2/src/read/dedup.rs +++ b/src/mito2/src/read/dedup.rs @@ -19,17 +19,13 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use api::v1::OpType; -use async_trait::async_trait; -use common_telemetry::debug; -use common_time::Timestamp; use datatypes::data_type::DataType; use datatypes::prelude::ScalarVector; use datatypes::value::Value; use datatypes::vectors::MutableVector; use crate::error::Result; -use crate::metrics::MERGE_FILTER_ROWS_TOTAL; -use crate::read::{Batch, BatchColumn, BatchReader}; +use crate::read::{Batch, BatchColumn}; /// Trait for reporting dedup metrics. pub trait DedupMetricsReport: Send + Sync { @@ -37,80 +33,6 @@ pub trait DedupMetricsReport: Send + Sync { fn report(&self, metrics: &mut DedupMetrics); } -/// A reader that dedup sorted batches from a source based on the -/// dedup strategy. -pub struct DedupReader { - source: R, - strategy: S, - metrics: DedupMetrics, - /// Optional metrics reporter. - metrics_reporter: Option>, -} - -impl DedupReader { - /// Creates a new dedup reader. - pub fn new( - source: R, - strategy: S, - metrics_reporter: Option>, - ) -> Self { - Self { - source, - strategy, - metrics: DedupMetrics::default(), - metrics_reporter, - } - } -} - -impl DedupReader { - /// Returns the next deduplicated batch. - async fn fetch_next_batch(&mut self) -> Result> { - while let Some(batch) = self.source.next_batch().await? { - if let Some(batch) = self.strategy.push_batch(batch, &mut self.metrics)? 
{ - self.metrics.maybe_report(&self.metrics_reporter); - return Ok(Some(batch)); - } - } - - let result = self.strategy.finish(&mut self.metrics)?; - self.metrics.maybe_report(&self.metrics_reporter); - Ok(result) - } -} - -#[async_trait] -impl BatchReader for DedupReader { - async fn next_batch(&mut self) -> Result> { - self.fetch_next_batch().await - } -} - -impl Drop for DedupReader { - fn drop(&mut self) { - debug!("Dedup reader finished, metrics: {:?}", self.metrics); - - MERGE_FILTER_ROWS_TOTAL - .with_label_values(&["dedup"]) - .inc_by(self.metrics.num_unselected_rows as u64); - MERGE_FILTER_ROWS_TOTAL - .with_label_values(&["delete"]) - .inc_by(self.metrics.num_unselected_rows as u64); - - // Report any remaining metrics. - if let Some(reporter) = &self.metrics_reporter { - reporter.report(&mut self.metrics); - } - } -} - -#[cfg(test)] -impl DedupReader { - fn metrics(&self) -> &DedupMetrics { - &self.metrics - } -} - /// Strategy to remove duplicate rows from sorted batches. pub trait DedupStrategy: Send { /// Pushes a batch to the dedup strategy. @@ -124,114 +46,6 @@ pub trait DedupStrategy: Send { fn finish(&mut self, metrics: &mut DedupMetrics) -> Result>; } -/// State of the last row in a batch for dedup. -struct BatchLastRow { - primary_key: Vec, - /// The last timestamp of the batch. - timestamp: Timestamp, -} - -/// Dedup strategy that keeps the row with latest sequence of each key. -/// -/// This strategy is optimized specially based on the properties of the SST files, -/// memtables and the merge reader. It assumes that batches from files and memtables -/// don't contain duplicate rows and the merge reader never concatenates batches from -/// different source. -/// -/// We might implement a new strategy if we need to process files with duplicate rows. -pub struct LastRow { - /// Meta of the last row in the previous batch that has the same key - /// as the batch to push. - prev_batch: Option, - /// Filter deleted rows. 
- filter_deleted: bool, -} - -impl LastRow { - /// Creates a new strategy with the given `filter_deleted` flag. - pub fn new(filter_deleted: bool) -> Self { - Self { - prev_batch: None, - filter_deleted, - } - } -} - -impl DedupStrategy for LastRow { - fn push_batch( - &mut self, - mut batch: Batch, - metrics: &mut DedupMetrics, - ) -> Result> { - let start = Instant::now(); - - if batch.is_empty() { - return Ok(None); - } - debug_assert!(batch.first_timestamp().is_some()); - let prev_timestamp = match &self.prev_batch { - Some(prev_batch) => { - if prev_batch.primary_key != batch.primary_key() { - // The key has changed. This is the first batch of the - // new key. - None - } else { - Some(prev_batch.timestamp) - } - } - None => None, - }; - if batch.first_timestamp() == prev_timestamp { - metrics.num_unselected_rows += 1; - // This batch contains a duplicate row, skip it. - if batch.num_rows() == 1 { - // We don't need to update `prev_batch` because they have the same - // key and timestamp. - metrics.dedup_cost += start.elapsed(); - return Ok(None); - } - // Skips the first row. - batch = batch.slice(1, batch.num_rows() - 1); - } - - // Store current batch to `prev_batch` so we could compare the next batch - // with this batch. We store batch before filtering it as rows with `OpType::Delete` - // would be removed from the batch after filter, then we may store an incorrect `last row` - // of previous batch. - match &mut self.prev_batch { - Some(prev) => { - // Reuse the primary key buffer. - prev.primary_key.clone_from(&batch.primary_key); - prev.timestamp = batch.last_timestamp().unwrap(); - } - None => { - self.prev_batch = Some(BatchLastRow { - primary_key: batch.primary_key().to_vec(), - timestamp: batch.last_timestamp().unwrap(), - }) - } - } - - // Filters deleted rows. - if self.filter_deleted { - filter_deleted_from_batch(&mut batch, metrics)?; - } - - metrics.dedup_cost += start.elapsed(); - - // The batch can become empty if all rows are deleted. 
- if batch.is_empty() { - Ok(None) - } else { - Ok(Some(batch)) - } - } - - fn finish(&mut self, _metrics: &mut DedupMetrics) -> Result> { - Ok(None) - } -} - /// Removes deleted rows from the batch and updates metrics. fn filter_deleted_from_batch(batch: &mut Batch, metrics: &mut DedupMetrics) -> Result<()> { let num_rows = batch.num_rows(); @@ -672,137 +486,10 @@ impl>> Iterator for LastNonNullIter { mod tests { use std::sync::Arc; - use api::v1::OpType; use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array}; use super::*; use crate::read::BatchBuilder; - use crate::test_util::{VecBatchReader, check_reader_result, new_batch}; - - #[tokio::test] - async fn test_dedup_reader_no_duplications() { - let input = [ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Put, OpType::Put], - &[21, 22], - ), - new_batch(b"k1", &[3], &[13], &[OpType::Put], &[23]), - new_batch( - b"k2", - &[1, 2], - &[111, 112], - &[OpType::Put, OpType::Put], - &[31, 32], - ), - ]; - - // Test last row. - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastRow::new(true), None); - check_reader_result(&mut reader, &input).await; - assert_eq!(0, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - - // Test last non-null. - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); - check_reader_result(&mut reader, &input).await; - assert_eq!(0, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - } - - #[tokio::test] - async fn test_dedup_reader_duplications() { - let input = [ - new_batch( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[11, 12], - ), - // empty batch. - new_batch(b"k1", &[], &[], &[], &[]), - // Duplicate with the previous batch. 
- new_batch( - b"k1", - &[2, 3, 4], - &[10, 13, 13], - &[OpType::Put, OpType::Put, OpType::Delete], - &[2, 13, 14], - ), - new_batch( - b"k2", - &[1, 2], - &[20, 20], - &[OpType::Put, OpType::Delete], - &[101, 0], - ), - new_batch(b"k2", &[2], &[19], &[OpType::Put], &[102]), - new_batch(b"k3", &[2], &[20], &[OpType::Put], &[202]), - // This batch won't increase the deleted rows count as it - // is filtered out by the previous batch. - new_batch(b"k3", &[2], &[19], &[OpType::Delete], &[0]), - ]; - // Filter deleted. - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastRow::new(true), None); - check_reader_result( - &mut reader, - &[ - new_batch( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[11, 12], - ), - new_batch(b"k1", &[3], &[13], &[OpType::Put], &[13]), - new_batch(b"k2", &[1], &[20], &[OpType::Put], &[101]), - new_batch(b"k3", &[2], &[20], &[OpType::Put], &[202]), - ], - ) - .await; - assert_eq!(5, reader.metrics().num_unselected_rows); - assert_eq!(2, reader.metrics().num_deleted_rows); - - // Does not filter deleted. - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastRow::new(false), None); - check_reader_result( - &mut reader, - &[ - new_batch( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[11, 12], - ), - new_batch( - b"k1", - &[3, 4], - &[13, 13], - &[OpType::Put, OpType::Delete], - &[13, 14], - ), - new_batch( - b"k2", - &[1, 2], - &[20, 20], - &[OpType::Put, OpType::Delete], - &[101, 0], - ), - new_batch(b"k3", &[2], &[20], &[OpType::Put], &[202]), - ], - ) - .await; - assert_eq!(3, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - } /// Returns a new [Batch] whose field has column id 1, 2. 
fn new_batch_multi_fields( @@ -839,235 +526,6 @@ mod tests { builder.build().unwrap() } - #[tokio::test] - async fn test_last_non_null_merge() { - let input = [ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (None, None)], - ), - // empty batch. - new_batch_multi_fields(b"k1", &[], &[], &[], &[]), - // Duplicate with the previous batch. - new_batch_multi_fields(b"k1", &[2], &[10], &[OpType::Put], &[(Some(12), None)]), - new_batch_multi_fields( - b"k1", - &[2, 3, 4], - &[10, 13, 13], - &[OpType::Put, OpType::Put, OpType::Delete], - &[(Some(2), Some(22)), (Some(13), None), (None, Some(14))], - ), - new_batch_multi_fields( - b"k2", - &[1, 2], - &[20, 20], - &[OpType::Put, OpType::Delete], - &[(Some(101), Some(101)), (None, None)], - ), - new_batch_multi_fields( - b"k2", - &[2], - &[19], - &[OpType::Put], - &[(Some(102), Some(102))], - ), - new_batch_multi_fields( - b"k3", - &[2], - &[20], - &[OpType::Put], - &[(Some(202), Some(202))], - ), - // This batch won't increase the deleted rows count as it - // is filtered out by the previous batch. (All fields are null). - new_batch_multi_fields(b"k3", &[2], &[19], &[OpType::Delete], &[(None, None)]), - ]; - - // Filter deleted. 
- let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); - check_reader_result( - &mut reader, - &[ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (Some(12), Some(22))], - ), - new_batch_multi_fields(b"k1", &[3], &[13], &[OpType::Put], &[(Some(13), None)]), - new_batch_multi_fields( - b"k2", - &[1], - &[20], - &[OpType::Put], - &[(Some(101), Some(101))], - ), - new_batch_multi_fields( - b"k3", - &[2], - &[20], - &[OpType::Put], - &[(Some(202), Some(202))], - ), - ], - ) - .await; - assert_eq!(6, reader.metrics().num_unselected_rows); - assert_eq!(2, reader.metrics().num_deleted_rows); - - // Does not filter deleted. - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(false), None); - check_reader_result( - &mut reader, - &[ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (Some(12), Some(22))], - ), - new_batch_multi_fields( - b"k1", - &[3, 4], - &[13, 13], - &[OpType::Put, OpType::Delete], - &[(Some(13), None), (None, Some(14))], - ), - new_batch_multi_fields( - b"k2", - &[1, 2], - &[20, 20], - &[OpType::Put, OpType::Delete], - &[(Some(101), Some(101)), (None, None)], - ), - new_batch_multi_fields( - b"k3", - &[2], - &[20], - &[OpType::Put], - &[(Some(202), Some(202))], - ), - ], - ) - .await; - assert_eq!(4, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - } - - #[tokio::test] - async fn test_last_non_null_skip_merge_single() { - let input = [new_batch_multi_fields( - b"k1", - &[1, 2, 3], - &[13, 11, 13], - &[OpType::Put, OpType::Delete, OpType::Put], - &[(Some(11), Some(11)), (None, None), (Some(13), Some(13))], - )]; - - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); - check_reader_result( - &mut 
reader, - &[new_batch_multi_fields( - b"k1", - &[1, 3], - &[13, 13], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (Some(13), Some(13))], - )], - ) - .await; - assert_eq!(1, reader.metrics().num_unselected_rows); - assert_eq!(1, reader.metrics().num_deleted_rows); - - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(false), None); - check_reader_result(&mut reader, &input).await; - assert_eq!(0, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - } - - #[tokio::test] - async fn test_last_non_null_skip_merge_no_null() { - let input = [ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (Some(12), Some(12))], - ), - new_batch_multi_fields(b"k1", &[2], &[10], &[OpType::Put], &[(None, Some(22))]), - new_batch_multi_fields( - b"k1", - &[2, 3], - &[9, 13], - &[OpType::Put, OpType::Put], - &[(Some(32), None), (Some(13), Some(13))], - ), - ]; - - let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); - check_reader_result( - &mut reader, - &[ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (Some(12), Some(12))], - ), - new_batch_multi_fields(b"k1", &[3], &[13], &[OpType::Put], &[(Some(13), Some(13))]), - ], - ) - .await; - assert_eq!(2, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - } - - #[tokio::test] - async fn test_last_non_null_merge_null() { - let input = [ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (None, None)], - ), - new_batch_multi_fields(b"k1", &[2], &[10], &[OpType::Put], &[(None, Some(22))]), - new_batch_multi_fields(b"k1", &[3], &[13], &[OpType::Put], &[(Some(33), None)]), - ]; - - let reader = VecBatchReader::new(&input); - let mut reader = 
DedupReader::new(reader, LastNonNull::new(true), None); - check_reader_result( - &mut reader, - &[ - new_batch_multi_fields( - b"k1", - &[1, 2], - &[13, 11], - &[OpType::Put, OpType::Put], - &[(Some(11), Some(11)), (None, Some(22))], - ), - new_batch_multi_fields(b"k1", &[3], &[13], &[OpType::Put], &[(Some(33), None)]), - ], - ) - .await; - assert_eq!(1, reader.metrics().num_unselected_rows); - assert_eq!(0, reader.metrics().num_deleted_rows); - } - fn check_dedup_strategy(input: &[Batch], strategy: &mut dyn DedupStrategy, expect: &[Batch]) { let mut actual = Vec::new(); let mut metrics = DedupMetrics::default(); diff --git a/src/mito2/src/read/flat_merge.rs b/src/mito2/src/read/flat_merge.rs index 946f2a610c..b1c304f244 100644 --- a/src/mito2/src/read/flat_merge.rs +++ b/src/mito2/src/read/flat_merge.rs @@ -14,8 +14,9 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; +use std::fmt; use std::sync::Arc; -use std::time::Instant; +use std::time::{Duration, Instant}; use async_stream::try_stream; use common_telemetry::debug; @@ -34,7 +35,6 @@ use crate::error::{ComputeArrowSnafu, Result}; use crate::memtable::BoxedRecordBatchIterator; use crate::metrics::READ_STAGE_ELAPSED; use crate::read::BoxedRecordBatchStream; -use crate::read::merge::{MergeMetrics, MergeMetricsReport}; use crate::sst::parquet::flat_format::{ primary_key_column_index, sequence_column_index, time_index_column_index, }; @@ -105,6 +105,84 @@ struct BatchCursor { row_idx: usize, } +/// Trait for reporting merge metrics. +pub trait MergeMetricsReport: Send + Sync { + /// Reports and resets the metrics. + fn report(&self, metrics: &mut MergeMetrics); +} + +/// Metrics for the merge reader. +#[derive(Default)] +pub struct MergeMetrics { + /// Cost to initialize the reader. + pub(crate) init_cost: Duration, + /// Total scan cost of the reader. + pub(crate) scan_cost: Duration, + /// Number of times to fetch batches. + pub(crate) num_fetch_by_batches: usize, + /// Number of times to fetch rows. 
+ pub(crate) num_fetch_by_rows: usize, + /// Cost to fetch batches from sources. + pub(crate) fetch_cost: Duration, +} + +impl fmt::Debug for MergeMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.scan_cost.is_zero() { + return write!(f, "{{}}"); + } + + write!(f, r#"{{"scan_cost":"{:?}""#, self.scan_cost)?; + + if !self.init_cost.is_zero() { + write!(f, r#", "init_cost":"{:?}""#, self.init_cost)?; + } + if self.num_fetch_by_batches > 0 { + write!( + f, + r#", "num_fetch_by_batches":{}"#, + self.num_fetch_by_batches + )?; + } + if self.num_fetch_by_rows > 0 { + write!(f, r#", "num_fetch_by_rows":{}"#, self.num_fetch_by_rows)?; + } + if !self.fetch_cost.is_zero() { + write!(f, r#", "fetch_cost":"{:?}""#, self.fetch_cost)?; + } + + write!(f, "}}") + } +} + +impl MergeMetrics { + /// Merges metrics from another MergeMetrics instance. + pub(crate) fn merge(&mut self, other: &MergeMetrics) { + let MergeMetrics { + init_cost, + scan_cost, + num_fetch_by_batches, + num_fetch_by_rows, + fetch_cost, + } = other; + + self.init_cost += *init_cost; + self.scan_cost += *scan_cost; + self.num_fetch_by_batches += *num_fetch_by_batches; + self.num_fetch_by_rows += *num_fetch_by_rows; + self.fetch_cost += *fetch_cost; + } + + /// Reports the metrics if scan_cost exceeds 10ms and resets them. + pub(crate) fn maybe_report(&mut self, reporter: &Option>) { + if self.scan_cost.as_millis() > 10 + && let Some(r) = reporter + { + r.report(self); + } + } +} + /// Provides an API to incrementally build a [`RecordBatch`] from partitioned [`RecordBatch`] // Ports from https://github.com/apache/datafusion/blob/49.0.0/datafusion/physical-plan/src/sorts/builder.rs // Adds the `take_remaining_rows()` method. 
diff --git a/src/mito2/src/read/merge.rs b/src/mito2/src/read/merge.rs deleted file mode 100644 index 0470e4b01a..0000000000 --- a/src/mito2/src/read/merge.rs +++ /dev/null @@ -1,982 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Merge reader implementation. - -use std::cmp::Ordering; -use std::collections::BinaryHeap; -use std::sync::Arc; -use std::time::{Duration, Instant}; -use std::{fmt, mem}; - -use async_trait::async_trait; -use common_telemetry::debug; - -use crate::error::Result; -use crate::memtable::BoxedBatchIterator; -use crate::metrics::READ_STAGE_ELAPSED; -use crate::read::{Batch, BatchReader, BoxedBatchReader, Source}; - -/// Trait for reporting merge metrics. -pub trait MergeMetricsReport: Send + Sync { - /// Reports and resets the metrics. - fn report(&self, metrics: &mut MergeMetrics); -} - -/// Reader to merge sorted batches. -/// -/// The merge reader merges [Batch]es from multiple sources that yield sorted batches. -/// 1. Batch is ordered by primary key, time index, sequence desc, op type desc (we can -/// ignore op type as sequence is already unique). -/// 2. Batches from sources **must** not be empty. -/// -/// The reader won't concatenate batches. Each batch returned by the reader also doesn't -/// contain duplicate rows. But the last (primary key, timestamp) of a batch may be the same -/// as the first one in the next batch. 
-pub struct MergeReader { - /// Holds [Node]s whose key range of current batch **is** overlapped with the merge window. - /// Each node yields batches from a `source`. - /// - /// [Node] in this heap **must** not be empty. A `merge window` is the (primary key, timestamp) - /// range of the **root node** in the `hot` heap. - hot: BinaryHeap, - /// Holds `Node` whose key range of current batch **isn't** overlapped with the merge window. - /// - /// `Node` in this heap **must** not be empty. - cold: BinaryHeap, - /// Batch to output. - output_batch: Option, - /// Local metrics. - metrics: MergeMetrics, - /// Optional metrics reporter. - metrics_reporter: Option>, -} - -#[async_trait] -impl BatchReader for MergeReader { - async fn next_batch(&mut self) -> Result> { - let start = Instant::now(); - while !self.hot.is_empty() && self.output_batch.is_none() { - if self.hot.len() == 1 { - // No need to do merge sort if only one batch in the hot heap. - self.fetch_batch_from_hottest().await?; - self.metrics.num_fetch_by_batches += 1; - } else { - // We could only fetch rows that less than the next node from the hottest node. - self.fetch_rows_from_hottest().await?; - self.metrics.num_fetch_by_rows += 1; - } - } - - if let Some(batch) = self.output_batch.take() { - self.metrics.scan_cost += start.elapsed(); - self.metrics.maybe_report(&self.metrics_reporter); - Ok(Some(batch)) - } else { - // Nothing fetched. - self.metrics.scan_cost += start.elapsed(); - self.metrics.maybe_report(&self.metrics_reporter); - Ok(None) - } - } -} - -impl Drop for MergeReader { - fn drop(&mut self) { - debug!("Merge reader finished, metrics: {:?}", self.metrics); - - READ_STAGE_ELAPSED - .with_label_values(&["merge"]) - .observe(self.metrics.scan_cost.as_secs_f64()); - READ_STAGE_ELAPSED - .with_label_values(&["merge_fetch"]) - .observe(self.metrics.fetch_cost.as_secs_f64()); - - // Report any remaining metrics. 
- if let Some(reporter) = &self.metrics_reporter { - reporter.report(&mut self.metrics); - } - } -} - -impl MergeReader { - /// Creates and initializes a new [MergeReader]. - pub async fn new( - sources: Vec, - metrics_reporter: Option>, - ) -> Result { - let start = Instant::now(); - let mut metrics = MergeMetrics::default(); - - let mut cold = BinaryHeap::with_capacity(sources.len()); - let hot = BinaryHeap::with_capacity(sources.len()); - for source in sources { - let node = Node::new(source, &mut metrics).await?; - if !node.is_eof() { - // Ensure `cold` don't have eof nodes. - cold.push(node); - } - } - - let mut reader = MergeReader { - hot, - cold, - output_batch: None, - metrics, - metrics_reporter, - }; - // Initializes the reader. - reader.refill_hot(); - - let elapsed = start.elapsed(); - reader.metrics.init_cost += elapsed; - reader.metrics.scan_cost += elapsed; - Ok(reader) - } - - /// Moves nodes in `cold` heap, whose key range is overlapped with current merge - /// window to `hot` heap. - fn refill_hot(&mut self) { - while !self.cold.is_empty() { - if let Some(merge_window) = self.hot.peek() { - let warmest = self.cold.peek().unwrap(); - if warmest.is_behind(merge_window) { - // if the warmest node in the `cold` heap is totally after the - // `merge_window`, then no need to add more nodes into the `hot` - // heap for merge sorting. - break; - } - } - - let warmest = self.cold.pop().unwrap(); - self.hot.push(warmest); - } - } - - /// Fetches one batch from the hottest node. - async fn fetch_batch_from_hottest(&mut self) -> Result<()> { - assert_eq!(1, self.hot.len()); - - let mut hottest = self.hot.pop().unwrap(); - let batch = hottest.fetch_batch(&mut self.metrics).await?; - Self::maybe_output_batch(batch, &mut self.output_batch)?; - self.reheap(hottest) - } - - /// Fetches non-duplicated rows from the hottest node. 
- async fn fetch_rows_from_hottest(&mut self) -> Result<()> { - // Safety: `fetch_batches_to_output()` ensures the hot heap has more than 1 element. - // Pop hottest node. - let mut top_node = self.hot.pop().unwrap(); - let top = top_node.current_batch(); - // Min timestamp and its sequence in the next batch. - let next_min_ts = { - let next_node = self.hot.peek().unwrap(); - let next = next_node.current_batch(); - // top and next have overlapping rows so they must have same primary keys. - debug_assert_eq!(top.primary_key(), next.primary_key()); - // Safety: Batches in the heap is not empty, so we can use unwrap here. - next.first_timestamp().unwrap() - }; - - // Safety: Batches in the heap is not empty, so we can use unwrap here. - let timestamps = top.timestamps_native().unwrap(); - // Binary searches the timestamp in the top batch. - // Safety: Batches should have the same timestamp resolution so we can compare the native - // value directly. - let duplicate_pos = match timestamps.binary_search(&next_min_ts.value()) { - Ok(pos) => pos, - Err(pos) => { - // No duplicate timestamp. Outputs timestamp before `pos`. - Self::maybe_output_batch(top.slice(0, pos), &mut self.output_batch)?; - top_node.skip_rows(pos, &mut self.metrics).await?; - return self.reheap(top_node); - } - }; - - // No need to remove duplicate timestamps. - let output_end = if duplicate_pos == 0 { - // If the first timestamp of the top node is duplicate. We can simply return the first row - // as the heap ensure it is the one with largest sequence. - 1 - } else { - // We don't know which one has the larger sequence so we use the range before - // the duplicate pos. - duplicate_pos - }; - Self::maybe_output_batch(top.slice(0, output_end), &mut self.output_batch)?; - top_node.skip_rows(output_end, &mut self.metrics).await?; - self.reheap(top_node) - } - - /// Push the node popped from `hot` back to a proper heap. 
- fn reheap(&mut self, node: Node) -> Result<()> { - if node.is_eof() { - // If the node is EOF, don't put it into the heap again. - // The merge window would be updated, need to refill the hot heap. - self.refill_hot(); - } else { - // Find a proper heap for this node. - let node_is_cold = if let Some(hottest) = self.hot.peek() { - // If key range of this node is behind the hottest node's then we can - // push it to the cold heap. Otherwise we should push it to the hot heap. - node.is_behind(hottest) - } else { - // The hot heap is empty, but we don't known whether the current - // batch of this node is still the hottest. - true - }; - - if node_is_cold { - self.cold.push(node); - } else { - self.hot.push(node); - } - // Anyway, the merge window has been changed, we need to refill the hot heap. - self.refill_hot(); - } - - Ok(()) - } - - /// If `filter_deleted` is set to true, removes deleted entries and sets the `batch` to the `output_batch`. - /// - /// Ignores the `batch` if it is empty. - fn maybe_output_batch(batch: Batch, output_batch: &mut Option) -> Result<()> { - debug_assert!(output_batch.is_none()); - if batch.is_empty() { - return Ok(()); - } - *output_batch = Some(batch); - - Ok(()) - } -} - -/// Builder to build and initialize a [MergeReader]. -#[derive(Default)] -pub struct MergeReaderBuilder { - /// Input sources. - /// - /// All source must yield batches with the same schema. - sources: Vec, - /// Optional metrics reporter. - metrics_reporter: Option>, -} - -impl MergeReaderBuilder { - /// Returns an empty builder. - pub fn new() -> MergeReaderBuilder { - MergeReaderBuilder::default() - } - - /// Creates a builder from sources. - pub fn from_sources(sources: Vec) -> MergeReaderBuilder { - MergeReaderBuilder { - sources, - metrics_reporter: None, - } - } - - /// Pushes a batch reader to sources. 
- pub fn push_batch_reader(&mut self, reader: BoxedBatchReader) -> &mut Self { - self.sources.push(Source::Reader(reader)); - self - } - - /// Pushes a batch iterator to sources. - pub fn push_batch_iter(&mut self, iter: BoxedBatchIterator) -> &mut Self { - self.sources.push(Source::Iter(iter)); - self - } - - /// Sets the metrics reporter. - pub fn with_metrics_reporter( - &mut self, - reporter: Option>, - ) -> &mut Self { - self.metrics_reporter = reporter; - self - } - - /// Builds and initializes the reader, then resets the builder. - pub async fn build(&mut self) -> Result { - let sources = mem::take(&mut self.sources); - let metrics_reporter = self.metrics_reporter.take(); - MergeReader::new(sources, metrics_reporter).await - } -} - -/// Metrics for the merge reader. -#[derive(Default)] -pub struct MergeMetrics { - /// Cost to initialize the reader. - pub(crate) init_cost: Duration, - /// Total scan cost of the reader. - pub(crate) scan_cost: Duration, - /// Number of times to fetch batches. - pub(crate) num_fetch_by_batches: usize, - /// Number of times to fetch rows. - pub(crate) num_fetch_by_rows: usize, - /// Cost to fetch batches from sources. 
- pub(crate) fetch_cost: Duration, -} - -impl fmt::Debug for MergeMetrics { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // Skip output if scan_cost is zero - if self.scan_cost.is_zero() { - return write!(f, "{{}}"); - } - - write!(f, r#"{{"scan_cost":"{:?}""#, self.scan_cost)?; - - if !self.init_cost.is_zero() { - write!(f, r#", "init_cost":"{:?}""#, self.init_cost)?; - } - if self.num_fetch_by_batches > 0 { - write!( - f, - r#", "num_fetch_by_batches":{}"#, - self.num_fetch_by_batches - )?; - } - if self.num_fetch_by_rows > 0 { - write!(f, r#", "num_fetch_by_rows":{}"#, self.num_fetch_by_rows)?; - } - if !self.fetch_cost.is_zero() { - write!(f, r#", "fetch_cost":"{:?}""#, self.fetch_cost)?; - } - - write!(f, "}}") - } -} - -impl MergeMetrics { - /// Merges metrics from another MergeMetrics instance. - pub(crate) fn merge(&mut self, other: &MergeMetrics) { - let MergeMetrics { - init_cost, - scan_cost, - num_fetch_by_batches, - num_fetch_by_rows, - fetch_cost, - } = other; - - self.init_cost += *init_cost; - self.scan_cost += *scan_cost; - self.num_fetch_by_batches += *num_fetch_by_batches; - self.num_fetch_by_rows += *num_fetch_by_rows; - self.fetch_cost += *fetch_cost; - } - - /// Reports the metrics if scan_cost exceeds 10ms and resets them. - pub(crate) fn maybe_report(&mut self, reporter: &Option>) { - if self.scan_cost.as_millis() > 10 - && let Some(r) = reporter - { - r.report(self); - } - } -} - -/// A `Node` represent an individual input data source to be merged. -struct Node { - /// Data source of this `Node`. - source: Source, - /// Current batch to be read. The node ensures the batch is not empty. - /// - /// `None` means the `source` has reached EOF. - current_batch: Option, -} - -impl Node { - /// Initialize a node. - /// - /// It tries to fetch one batch from the `source`. - async fn new(mut source: Source, metrics: &mut MergeMetrics) -> Result { - // Ensures batch is not empty. 
- let start = Instant::now(); - let current_batch = source.next_batch().await?.map(CompareFirst); - metrics.fetch_cost += start.elapsed(); - - Ok(Node { - source, - current_batch, - }) - } - - /// Returns whether the node still has batch to read. - fn is_eof(&self) -> bool { - self.current_batch.is_none() - } - - /// Returns the primary key of current batch. - /// - /// # Panics - /// Panics if the node has reached EOF. - fn primary_key(&self) -> &[u8] { - self.current_batch().primary_key() - } - - /// Returns current batch. - /// - /// # Panics - /// Panics if the node has reached EOF. - fn current_batch(&self) -> &Batch { - &self.current_batch.as_ref().unwrap().0 - } - - /// Returns current batch and fetches next batch - /// from the source. - /// - /// # Panics - /// Panics if the node has reached EOF. - async fn fetch_batch(&mut self, metrics: &mut MergeMetrics) -> Result { - let current = self.current_batch.take().unwrap(); - let start = Instant::now(); - // Ensures batch is not empty. - self.current_batch = self.source.next_batch().await?.map(CompareFirst); - metrics.fetch_cost += start.elapsed(); - Ok(current.0) - } - - /// Returns true if the key range of current batch in `self` is behind (exclusive) current - /// batch in `other`. - /// - /// # Panics - /// Panics if either `self` or `other` is EOF. - fn is_behind(&self, other: &Node) -> bool { - debug_assert!(!self.current_batch().is_empty()); - debug_assert!(!other.current_batch().is_empty()); - - // We only compare pk and timestamp so nodes in the cold - // heap don't have overlapping timestamps with the hottest node - // in the hot heap. - self.primary_key().cmp(other.primary_key()).then_with(|| { - self.current_batch() - .first_timestamp() - .cmp(&other.current_batch().last_timestamp()) - }) == Ordering::Greater - } - - /// Skips first `num_to_skip` rows from node's current batch. If current batch is empty it fetches - /// next batch from the node. - /// - /// # Panics - /// Panics if the node is EOF. 
- async fn skip_rows(&mut self, num_to_skip: usize, metrics: &mut MergeMetrics) -> Result<()> { - let batch = self.current_batch(); - debug_assert!(batch.num_rows() >= num_to_skip); - - let remaining = batch.num_rows() - num_to_skip; - if remaining == 0 { - // Nothing remains, we need to fetch next batch to ensure the batch is not empty. - self.fetch_batch(metrics).await?; - } else { - debug_assert!(!batch.is_empty()); - self.current_batch = Some(CompareFirst(batch.slice(num_to_skip, remaining))); - } - - Ok(()) - } -} - -impl PartialEq for Node { - fn eq(&self, other: &Node) -> bool { - self.current_batch == other.current_batch - } -} - -impl Eq for Node {} - -impl PartialOrd for Node { - fn partial_cmp(&self, other: &Node) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Node { - fn cmp(&self, other: &Node) -> Ordering { - // The std binary heap is a max heap, but we want the nodes are ordered in - // ascend order, so we compare the nodes in reverse order. - other.current_batch.cmp(&self.current_batch) - } -} - -/// Type to compare [Batch] by first row. -/// -/// It ignores op type as sequence is enough to distinguish different rows. -struct CompareFirst(Batch); - -impl PartialEq for CompareFirst { - fn eq(&self, other: &Self) -> bool { - self.0.primary_key() == other.0.primary_key() - && self.0.first_timestamp() == other.0.first_timestamp() - && self.0.first_sequence() == other.0.first_sequence() - } -} - -impl Eq for CompareFirst {} - -impl PartialOrd for CompareFirst { - fn partial_cmp(&self, other: &CompareFirst) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for CompareFirst { - /// Compares by primary key, time index, sequence desc. 
- fn cmp(&self, other: &CompareFirst) -> Ordering { - self.0 - .primary_key() - .cmp(other.0.primary_key()) - .then_with(|| self.0.first_timestamp().cmp(&other.0.first_timestamp())) - .then_with(|| other.0.first_sequence().cmp(&self.0.first_sequence())) - } -} - -#[cfg(test)] -mod tests { - use api::v1::OpType; - - use super::*; - use crate::test_util::{VecBatchReader, check_reader_result, new_batch}; - - #[tokio::test] - async fn test_merge_reader_empty() { - let mut reader = MergeReaderBuilder::new().build().await.unwrap(); - assert!(reader.next_batch().await.unwrap().is_none()); - assert!(reader.next_batch().await.unwrap().is_none()); - } - - #[tokio::test] - async fn test_merge_non_overlapping() { - let reader1 = VecBatchReader::new(&[ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Put, OpType::Put], - &[21, 22], - ), - new_batch( - b"k1", - &[7, 8], - &[17, 18], - &[OpType::Put, OpType::Delete], - &[27, 28], - ), - new_batch( - b"k2", - &[2, 3], - &[12, 13], - &[OpType::Delete, OpType::Put], - &[22, 23], - ), - ]); - let reader2 = VecBatchReader::new(&[new_batch( - b"k1", - &[4, 5], - &[14, 15], - &[OpType::Put, OpType::Put], - &[24, 25], - )]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Put, OpType::Put], - &[21, 22], - ), - new_batch( - b"k1", - &[4, 5], - &[14, 15], - &[OpType::Put, OpType::Put], - &[24, 25], - ), - new_batch( - b"k1", - &[7, 8], - &[17, 18], - &[OpType::Put, OpType::Delete], - &[27, 28], - ), - new_batch( - b"k2", - &[2, 3], - &[12, 13], - &[OpType::Delete, OpType::Put], - &[22, 23], - ), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_reheap_hot() { - let reader1 = VecBatchReader::new(&[ - new_batch( - b"k1", - &[1, 3], - &[10, 10], - &[OpType::Put, OpType::Put], - &[21, 23], - ), - new_batch(b"k2", 
&[3], &[10], &[OpType::Put], &[23]), - ]); - let reader2 = VecBatchReader::new(&[new_batch( - b"k1", - &[2, 4], - &[11, 11], - &[OpType::Put, OpType::Put], - &[32, 34], - )]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch(b"k1", &[1], &[10], &[OpType::Put], &[21]), - new_batch(b"k1", &[2], &[11], &[OpType::Put], &[32]), - new_batch(b"k1", &[3], &[10], &[OpType::Put], &[23]), - new_batch(b"k1", &[4], &[11], &[OpType::Put], &[34]), - new_batch(b"k2", &[3], &[10], &[OpType::Put], &[23]), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_overlapping() { - let reader1 = VecBatchReader::new(&[ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Put, OpType::Put], - &[21, 22], - ), - new_batch( - b"k1", - &[4, 5], - &[14, 15], - // This override 4 and deletes 5. - &[OpType::Put, OpType::Delete], - &[24, 25], - ), - new_batch( - b"k2", - &[2, 3], - &[12, 13], - // This delete 2. 
- &[OpType::Delete, OpType::Put], - &[22, 23], - ), - ]); - let reader2 = VecBatchReader::new(&[ - new_batch( - b"k1", - &[3, 4, 5], - &[10, 10, 10], - &[OpType::Put, OpType::Put, OpType::Put], - &[33, 34, 35], - ), - new_batch( - b"k2", - &[1, 10], - &[11, 20], - &[OpType::Put, OpType::Put], - &[21, 30], - ), - ]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Put, OpType::Put], - &[21, 22], - ), - new_batch(b"k1", &[3], &[10], &[OpType::Put], &[33]), - new_batch(b"k1", &[4], &[14], &[OpType::Put], &[24]), - new_batch(b"k1", &[4], &[10], &[OpType::Put], &[34]), - new_batch(b"k1", &[5], &[15], &[OpType::Delete], &[25]), - new_batch(b"k1", &[5], &[10], &[OpType::Put], &[35]), - new_batch(b"k2", &[1], &[11], &[OpType::Put], &[21]), - new_batch( - b"k2", - &[2, 3], - &[12, 13], - &[OpType::Delete, OpType::Put], - &[22, 23], - ), - new_batch(b"k2", &[10], &[20], &[OpType::Put], &[30]), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_deleted() { - let reader1 = VecBatchReader::new(&[ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Delete, OpType::Delete], - &[21, 22], - ), - new_batch( - b"k2", - &[2, 3], - &[12, 13], - &[OpType::Delete, OpType::Put], - &[22, 23], - ), - ]); - let reader2 = VecBatchReader::new(&[new_batch( - b"k1", - &[4, 5], - &[14, 15], - &[OpType::Delete, OpType::Delete], - &[24, 25], - )]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Delete, OpType::Delete], - &[21, 22], - ), - new_batch( - b"k1", - &[4, 5], - &[14, 15], - &[OpType::Delete, OpType::Delete], - &[24, 25], - ), - new_batch( - b"k2", - &[2, 3], 
- &[12, 13], - &[OpType::Delete, OpType::Put], - &[22, 23], - ), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_next_node_empty() { - let reader1 = VecBatchReader::new(&[new_batch( - b"k1", - &[1, 2], - &[11, 12], - &[OpType::Put, OpType::Put], - &[21, 22], - )]); - let reader2 = VecBatchReader::new(&[new_batch(b"k1", &[1], &[10], &[OpType::Put], &[33])]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch(b"k1", &[1], &[11], &[OpType::Put], &[21]), - new_batch(b"k1", &[1], &[10], &[OpType::Put], &[33]), - new_batch(b"k1", &[2], &[12], &[OpType::Put], &[22]), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_top_node_empty() { - let reader1 = VecBatchReader::new(&[new_batch( - b"k1", - &[1, 2], - &[10, 10], - &[OpType::Put, OpType::Put], - &[21, 22], - )]); - let reader2 = VecBatchReader::new(&[new_batch( - b"k1", - &[2, 3], - &[11, 11], - &[OpType::Put, OpType::Put], - &[32, 33], - )]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch(b"k1", &[1], &[10], &[OpType::Put], &[21]), - new_batch(b"k1", &[2], &[11], &[OpType::Put], &[32]), - new_batch(b"k1", &[2], &[10], &[OpType::Put], &[22]), - new_batch(b"k1", &[3], &[11], &[OpType::Put], &[33]), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_large_range() { - let reader1 = VecBatchReader::new(&[new_batch( - b"k1", - &[1, 10], - &[10, 10], - &[OpType::Put, OpType::Put], - &[21, 30], - )]); - let reader2 = VecBatchReader::new(&[new_batch( - b"k1", - &[1, 20], - &[11, 11], - &[OpType::Put, OpType::Put], - &[31, 40], - )]); - // The hot heap have a node that doesn't have duplicate - // timestamps. 
- let reader3 = VecBatchReader::new(&[new_batch( - b"k1", - &[6, 8], - &[11, 11], - &[OpType::Put, OpType::Put], - &[36, 38], - )]); - let mut reader = MergeReaderBuilder::new() - .push_batch_reader(Box::new(reader1)) - .push_batch_iter(Box::new(reader2)) - .push_batch_reader(Box::new(reader3)) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch(b"k1", &[1], &[11], &[OpType::Put], &[31]), - new_batch(b"k1", &[1], &[10], &[OpType::Put], &[21]), - new_batch( - b"k1", - &[6, 8], - &[11, 11], - &[OpType::Put, OpType::Put], - &[36, 38], - ), - new_batch(b"k1", &[10], &[10], &[OpType::Put], &[30]), - new_batch(b"k1", &[20], &[11], &[OpType::Put], &[40]), - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_many_duplicates() { - let mut builder = MergeReaderBuilder::new(); - for i in 0..10 { - let batches: Vec<_> = (0..8) - .map(|ts| new_batch(b"k1", &[ts], &[i], &[OpType::Put], &[100])) - .collect(); - let reader = VecBatchReader::new(&batches); - builder.push_batch_reader(Box::new(reader)); - } - let mut reader = builder.build().await.unwrap(); - let mut expect = Vec::with_capacity(80); - for ts in 0..8 { - for i in 0..10 { - let batch = new_batch(b"k1", &[ts], &[9 - i], &[OpType::Put], &[100]); - expect.push(batch); - } - } - check_reader_result(&mut reader, &expect).await; - } - - #[tokio::test] - async fn test_merge_keep_duplicate() { - let reader1 = VecBatchReader::new(&[new_batch( - b"k1", - &[1, 2], - &[10, 10], - &[OpType::Put, OpType::Put], - &[21, 22], - )]); - let reader2 = VecBatchReader::new(&[new_batch( - b"k1", - &[2, 3], - &[11, 11], - &[OpType::Put, OpType::Put], - &[32, 33], - )]); - let sources = vec![ - Source::Reader(Box::new(reader1)), - Source::Iter(Box::new(reader2)), - ]; - let mut reader = MergeReaderBuilder::from_sources(sources) - .build() - .await - .unwrap(); - check_reader_result( - &mut reader, - &[ - new_batch(b"k1", &[1], &[10], &[OpType::Put], &[21]), - new_batch(b"k1", &[2], &[11], 
&[OpType::Put], &[32]), - new_batch(b"k1", &[2], &[10], &[OpType::Put], &[22]), - new_batch(b"k1", &[3], &[11], &[OpType::Put], &[33]), - ], - ) - .await; - } -} diff --git a/src/mito2/src/read/plain_batch.rs b/src/mito2/src/read/plain_batch.rs deleted file mode 100644 index f22b6688d6..0000000000 --- a/src/mito2/src/read/plain_batch.rs +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Batch without an encoded primary key. - -use std::collections::HashMap; -use std::sync::Arc; - -use api::v1::OpType; -use datatypes::arrow::array::{ArrayRef, BooleanArray, UInt8Array, UInt64Array}; -use datatypes::arrow::compute::filter_record_batch; -use datatypes::arrow::datatypes::SchemaRef; -use datatypes::arrow::record_batch::RecordBatch; -use snafu::{OptionExt, ResultExt}; -use store_api::metadata::{ColumnMetadata, RegionMetadata}; -use store_api::storage::{RegionId, SequenceNumber}; - -use crate::error::{ - ComputeArrowSnafu, CreateDefaultSnafu, InvalidRequestSnafu, NewRecordBatchSnafu, Result, - UnexpectedSnafu, -}; - -/// Number of columns that have fixed positions. -/// -/// Contains all internal columns. -pub(crate) const PLAIN_FIXED_POS_COLUMN_NUM: usize = 2; - -/// [PlainBatch] represents a batch of rows. -/// It is a wrapper around [RecordBatch]. -/// -/// The columns order is the same as the order of the columns read from the SST. -/// It always contains two internal columns now. 
We may change modify this behavior -/// in the future. -#[derive(Debug)] -pub struct PlainBatch { - /// The original record batch. - record_batch: RecordBatch, -} - -impl PlainBatch { - /// Creates a new [PlainBatch] from a [RecordBatch]. - pub fn new(record_batch: RecordBatch) -> Self { - assert!( - record_batch.num_columns() >= 2, - "record batch missing internal columns, num_columns: {}", - record_batch.num_columns() - ); - - Self { record_batch } - } - - /// Returns a new [PlainBatch] with the given columns. - pub fn with_new_columns(&self, columns: Vec) -> Result { - let record_batch = RecordBatch::try_new(self.record_batch.schema(), columns) - .context(NewRecordBatchSnafu)?; - Ok(Self::new(record_batch)) - } - - /// Returns the number of columns in the batch. - pub fn num_columns(&self) -> usize { - self.record_batch.num_columns() - } - - /// Returns the number of rows in the batch. - pub fn num_rows(&self) -> usize { - self.record_batch.num_rows() - } - - /// Returns true if the batch is empty. - pub fn is_empty(&self) -> bool { - self.num_rows() == 0 - } - - /// Returns all columns. - pub fn columns(&self) -> &[ArrayRef] { - self.record_batch.columns() - } - - /// Returns the array of column at index `idx`. - pub fn column(&self, idx: usize) -> &ArrayRef { - self.record_batch.column(idx) - } - - /// Returns the slice of internal columns. - pub fn internal_columns(&self) -> &[ArrayRef] { - &self.record_batch.columns()[self.record_batch.num_columns() - PLAIN_FIXED_POS_COLUMN_NUM..] - } - - /// Returns the inner record batch. - pub fn as_record_batch(&self) -> &RecordBatch { - &self.record_batch - } - - /// Converts this batch into a record batch. - pub fn into_record_batch(self) -> RecordBatch { - self.record_batch - } - - /// Filters this batch by the boolean array. 
- pub fn filter(&self, predicate: &BooleanArray) -> Result { - let record_batch = - filter_record_batch(&self.record_batch, predicate).context(ComputeArrowSnafu)?; - Ok(Self::new(record_batch)) - } - - /// Returns the column index of the sequence column. - #[allow(dead_code)] - pub(crate) fn sequence_column_index(&self) -> usize { - self.record_batch.num_columns() - PLAIN_FIXED_POS_COLUMN_NUM - } -} - -/// Helper struct to fill default values and internal columns. -pub struct ColumnFiller<'a> { - /// Region metadata information - metadata: &'a RegionMetadata, - /// Schema for the output record batch - schema: SchemaRef, - /// Map of column names to indices in the input record batch - name_to_index: HashMap, -} - -impl<'a> ColumnFiller<'a> { - /// Creates a new ColumnFiller - /// The `schema` is the sst schema of the `metadata`. - pub fn new( - metadata: &'a RegionMetadata, - schema: SchemaRef, - record_batch: &RecordBatch, - ) -> Self { - debug_assert_eq!(metadata.column_metadatas.len() + 2, schema.fields().len()); - - // Pre-construct the name to index map - let name_to_index: HashMap<_, _> = record_batch - .schema() - .fields() - .iter() - .enumerate() - .map(|(i, field)| (field.name().clone(), i)) - .collect(); - - Self { - metadata, - schema, - name_to_index, - } - } - - /// Fills default values and internal columns for a [RecordBatch]. - pub fn fill_missing_columns( - &self, - record_batch: &RecordBatch, - sequence: SequenceNumber, - op_type: OpType, - ) -> Result { - let num_rows = record_batch.num_rows(); - let mut new_columns = - Vec::with_capacity(record_batch.num_columns() + PLAIN_FIXED_POS_COLUMN_NUM); - - // Fills default values. - // Implementation based on `WriteRequest::fill_missing_columns()`. 
- for column in &self.metadata.column_metadatas { - let array = match self.name_to_index.get(&column.column_schema.name) { - Some(index) => record_batch.column(*index).clone(), - None => match op_type { - OpType::Put => { - // For put requests, we use the default value from column schema. - fill_column_put_default(self.metadata.region_id, column, num_rows)? - } - OpType::Delete => { - // For delete requests, we need default value for padding. - fill_column_delete_default(column, num_rows)? - } - }, - }; - - new_columns.push(array); - } - - // Adds internal columns. - // Adds the sequence number. - let sequence_array = Arc::new(UInt64Array::from(vec![sequence; num_rows])); - // Adds the op type. - let op_type_array = Arc::new(UInt8Array::from(vec![op_type as u8; num_rows])); - new_columns.push(sequence_array); - new_columns.push(op_type_array); - - RecordBatch::try_new(self.schema.clone(), new_columns).context(NewRecordBatchSnafu) - } -} - -fn fill_column_put_default( - region_id: RegionId, - column: &ColumnMetadata, - num_rows: usize, -) -> Result { - if column.column_schema.is_default_impure() { - return UnexpectedSnafu { - reason: format!( - "unexpected impure default value with region_id: {}, column: {}, default_value: {:?}", - region_id, - column.column_schema.name, - column.column_schema.default_constraint(), - ), - } - .fail(); - } - let vector = column - .column_schema - .create_default_vector(num_rows) - .context(CreateDefaultSnafu { - region_id, - column: &column.column_schema.name, - })? - // This column doesn't have default value. 
- .with_context(|| InvalidRequestSnafu { - region_id, - reason: format!( - "column {} does not have default value", - column.column_schema.name - ), - })?; - Ok(vector.to_arrow_array()) -} - -fn fill_column_delete_default(column: &ColumnMetadata, num_rows: usize) -> Result { - // For delete requests, we need a default value for padding - let vector = column - .column_schema - .create_default_vector_for_padding(num_rows); - Ok(vector.to_arrow_array()) -} - -#[cfg(test)] -mod tests { - use api::v1::SemanticType; - use datatypes::arrow::array::{ - Float64Array, Int32Array, StringArray, TimestampMillisecondArray, - }; - use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; - use datatypes::schema::ColumnSchema; - use datatypes::schema::constraint::ColumnDefaultConstraint; - use datatypes::value::Value; - use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; - use store_api::storage::consts::{OP_TYPE_COLUMN_NAME, SEQUENCE_COLUMN_NAME}; - use store_api::storage::{ConcreteDataType, RegionId}; - - use super::*; - use crate::sst::to_plain_sst_arrow_schema; - - /// Creates a test region metadata with schema: k0(string), ts(timestamp), v1(float64) - fn create_test_region_metadata() -> RegionMetadata { - let mut builder = RegionMetadataBuilder::new(RegionId::new(100, 200)); - builder - // Add string key column - .push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new("k0", ConcreteDataType::string_datatype(), false) - .with_default_constraint(None) - .unwrap(), - semantic_type: SemanticType::Tag, - column_id: 0, - }) - // Add timestamp column - .push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true) - .with_default_constraint(None) - .unwrap(), - semantic_type: SemanticType::Timestamp, - column_id: 1, - }) - // Add float value column with default - .push_column_metadata(ColumnMetadata { - column_schema: 
ColumnSchema::new("v1", ConcreteDataType::float64_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::Value(Value::Float64( - datatypes::value::OrderedFloat::from(42.0), - )))) - .unwrap(), - semantic_type: SemanticType::Field, - column_id: 2, - }) - .primary_key(vec![0]); - - builder.build().unwrap() - } - - #[test] - fn test_column_filler_put() { - let region_metadata = create_test_region_metadata(); - let output_schema = to_plain_sst_arrow_schema(®ion_metadata); - - // Create input record batch with only k0 and ts columns (v1 is missing) - let input_schema = Arc::new(Schema::new(vec![ - Field::new("k0", DataType::Utf8, false), - Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - ])); - - let k0_values: ArrayRef = Arc::new(StringArray::from(vec!["key1", "key2"])); - let ts_values: ArrayRef = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])); - - let input_batch = - RecordBatch::try_new(input_schema, vec![k0_values.clone(), ts_values.clone()]).unwrap(); - - // Create column filler - let filler = ColumnFiller::new(®ion_metadata, output_schema.clone(), &input_batch); - - // Fill missing columns with OpType::Put - let result = filler - .fill_missing_columns(&input_batch, 100, OpType::Put) - .unwrap(); - - // Verify the result - // Create an expected record batch to compare against - let expected_columns = vec![ - k0_values.clone(), - ts_values.clone(), - Arc::new(Float64Array::from(vec![42.0, 42.0])), - Arc::new(UInt64Array::from(vec![100, 100])), - Arc::new(UInt8Array::from(vec![OpType::Put as u8, OpType::Put as u8])), - ]; - let expected_batch = RecordBatch::try_new(output_schema.clone(), expected_columns).unwrap(); - assert_eq!(expected_batch, result); - } - - #[test] - fn test_column_filler_delete() { - let region_metadata = create_test_region_metadata(); - let output_schema = to_plain_sst_arrow_schema(®ion_metadata); - - // Create input record batch with only k0 and ts columns (v1 is missing) 
- let input_schema = Arc::new(Schema::new(vec![ - Field::new("k0", DataType::Utf8, false), - Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - ])); - - let k0_values: ArrayRef = Arc::new(StringArray::from(vec!["key1", "key2"])); - let ts_values: ArrayRef = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])); - - let input_batch = - RecordBatch::try_new(input_schema, vec![k0_values.clone(), ts_values.clone()]).unwrap(); - - // Create column filler - let filler = ColumnFiller::new(®ion_metadata, output_schema.clone(), &input_batch); - - // Fill missing columns with OpType::Delete - let result = filler - .fill_missing_columns(&input_batch, 200, OpType::Delete) - .unwrap(); - - // Verify the result by creating an expected record batch to compare against - let v1_default = Arc::new(Float64Array::from(vec![None, None])); - let expected_columns = vec![ - k0_values.clone(), - ts_values.clone(), - v1_default, - Arc::new(UInt64Array::from(vec![200, 200])), - Arc::new(UInt8Array::from(vec![ - OpType::Delete as u8, - OpType::Delete as u8, - ])), - ]; - let expected_batch = RecordBatch::try_new(output_schema.clone(), expected_columns).unwrap(); - assert_eq!(expected_batch, result); - } - - fn create_test_record_batch() -> RecordBatch { - let schema = Arc::new(Schema::new(vec![ - Field::new("col1", DataType::Int32, false), - Field::new("col2", DataType::Utf8, false), - Field::new(SEQUENCE_COLUMN_NAME, DataType::UInt64, false), - Field::new(OP_TYPE_COLUMN_NAME, DataType::UInt8, false), - ])); - - let col1 = Arc::new(Int32Array::from(vec![1, 2, 3])); - let col2 = Arc::new(StringArray::from(vec!["a", "b", "c"])); - let sequence = Arc::new(UInt64Array::from(vec![100, 101, 102])); - let op_type = Arc::new(UInt8Array::from(vec![1, 1, 1])); - - RecordBatch::try_new(schema, vec![col1, col2, sequence, op_type]).unwrap() - } - - #[test] - fn test_plain_batch_basic_methods() { - let record_batch = create_test_record_batch(); - let plain_batch = 
PlainBatch::new(record_batch.clone()); - - // Test basic properties - assert_eq!(plain_batch.num_columns(), 4); - assert_eq!(plain_batch.num_rows(), 3); - assert!(!plain_batch.is_empty()); - assert_eq!(plain_batch.columns().len(), 4); - - // Test internal columns access - let internal_columns = plain_batch.internal_columns(); - assert_eq!(internal_columns.len(), PLAIN_FIXED_POS_COLUMN_NUM); - assert_eq!(internal_columns[0].len(), 3); - assert_eq!(internal_columns[1].len(), 3); - - // Test column access - let col1 = plain_batch.column(0); - assert_eq!(col1.len(), 3); - assert_eq!( - col1.as_any().downcast_ref::().unwrap().value(0), - 1 - ); - - // Test sequence column index - assert_eq!(plain_batch.sequence_column_index(), 2); - - // Test to record batch. - assert_eq!(record_batch, *plain_batch.as_record_batch()); - assert_eq!(record_batch, plain_batch.into_record_batch()); - } - - #[test] - fn test_with_new_columns() { - let record_batch = create_test_record_batch(); - let plain_batch = PlainBatch::new(record_batch); - - // Create new columns - let col1 = Arc::new(Int32Array::from(vec![10, 20, 30])); - let col2 = Arc::new(StringArray::from(vec!["x", "y", "z"])); - let sequence = Arc::new(UInt64Array::from(vec![200, 201, 202])); - let op_type = Arc::new(UInt8Array::from(vec![0, 0, 0])); - - let new_batch = plain_batch - .with_new_columns(vec![col1, col2, sequence, op_type]) - .unwrap(); - - assert_eq!(new_batch.num_columns(), 4); - assert_eq!(new_batch.num_rows(), 3); - assert_eq!( - new_batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 10 - ); - assert_eq!( - new_batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - "x" - ); - } - - #[test] - fn test_filter() { - let record_batch = create_test_record_batch(); - let plain_batch = PlainBatch::new(record_batch); - - // Create a predicate that selects the first and third rows - let predicate = BooleanArray::from(vec![true, false, true]); - - let filtered_batch = 
plain_batch.filter(&predicate).unwrap(); - - assert_eq!(filtered_batch.num_rows(), 2); - assert_eq!( - filtered_batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0), - 1 - ); - assert_eq!( - filtered_batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(1), - 3 - ); - } -} diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index c447685822..f645e3dc26 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -60,7 +60,7 @@ use crate::read::seq_scan::SeqScan; use crate::read::series_scan::SeriesScan; use crate::read::stream::ScanBatchStream; use crate::read::unordered_scan::UnorderedScan; -use crate::read::{Batch, BoxedRecordBatchStream, RecordBatch, Source}; +use crate::read::{BoxedRecordBatchStream, RecordBatch}; use crate::region::options::MergeMode; use crate::region::version::VersionRef; use crate::sst::file::FileHandle; @@ -1031,39 +1031,6 @@ impl ScanInput { self } - /// Scans sources in parallel. - /// - /// # Panics if the input doesn't allow parallel scan. - #[tracing::instrument( - skip(self, sources, semaphore), - fields( - region_id = %self.region_metadata().region_id, - source_count = sources.len() - ) - )] - pub(crate) fn create_parallel_sources( - &self, - sources: Vec, - semaphore: Arc, - channel_size: usize, - ) -> Result> { - if sources.len() <= 1 { - return Ok(sources); - } - - // Spawn a task for each source. - let sources = sources - .into_iter() - .map(|source| { - let (sender, receiver) = mpsc::channel(channel_size); - self.spawn_scan_task(source, semaphore.clone(), sender); - let stream = Box::pin(ReceiverStream::new(receiver)); - Source::Stream(stream) - }) - .collect(); - Ok(sources) - } - /// Builds memtable ranges to scan by `index`. 
pub(crate) fn build_mem_ranges(&self, index: RowGroupIndex) -> SmallVec<[MemtableRange; 2]> { let memtable = &self.memtables[index.index]; @@ -1173,49 +1140,6 @@ impl ScanInput { Ok(FileRangeBuilder::new(Arc::new(file_range_ctx), selection)) } - /// Scans the input source in another task and sends batches to the sender. - #[tracing::instrument( - skip(self, input, semaphore, sender), - fields(region_id = %self.region_metadata().region_id) - )] - pub(crate) fn spawn_scan_task( - &self, - mut input: Source, - semaphore: Arc, - sender: mpsc::Sender>, - ) { - let region_id = self.region_metadata().region_id; - let span = tracing::info_span!( - "ScanInput::parallel_scan_task", - region_id = %region_id, - stream_kind = "batch" - ); - common_runtime::spawn_global( - async move { - loop { - // We release the permit before sending result to avoid the task waiting on - // the channel with the permit held. - let maybe_batch = { - // Safety: We never close the semaphore. - let _permit = semaphore.acquire().await.unwrap(); - input.next_batch().await - }; - match maybe_batch { - Ok(Some(batch)) => { - let _ = sender.send(Ok(batch)).await; - } - Ok(None) => break, - Err(e) => { - let _ = sender.send(Err(e)).await; - break; - } - } - } - } - .instrument(span), - ); - } - /// Scans flat sources (RecordBatch streams) in parallel. /// /// # Panics if the input doesn't allow parallel scan. 
diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 597f592de6..8fc946b3d3 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -39,7 +39,7 @@ use crate::metrics::{ READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_RETURN, READ_STAGE_ELAPSED, }; use crate::read::dedup::{DedupMetrics, DedupMetricsReport}; -use crate::read::merge::{MergeMetrics, MergeMetricsReport}; +use crate::read::flat_merge::{MergeMetrics, MergeMetricsReport}; use crate::read::pruner::PartitionPruner; use crate::read::range::{RangeMeta, RowGroupIndex}; use crate::read::scan_region::StreamContext; diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index 94bc1feea8..c769f78c6c 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -218,34 +218,6 @@ pub(crate) fn internal_fields() -> [FieldRef; 3] { ] } -/// Gets the arrow schema to store in parquet. -pub fn to_plain_sst_arrow_schema(metadata: &RegionMetadata) -> SchemaRef { - let fields = Fields::from_iter( - metadata - .schema - .arrow_schema() - .fields() - .iter() - .cloned() - .chain(plain_internal_fields()), - ); - - Arc::new(Schema::new(fields)) -} - -/// Fields for internal columns. -fn plain_internal_fields() -> [FieldRef; 2] { - // Internal columns are always not null. - [ - Arc::new(Field::new( - SEQUENCE_COLUMN_NAME, - ArrowDataType::UInt64, - false, - )), - Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)), - ] -} - /// Gets the estimated number of series from record batches. 
/// /// This struct tracks the last timestamp value to detect series boundaries diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index e9515030c0..84f15ad837 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs @@ -36,10 +36,10 @@ use store_api::metric_engine_consts::{ use store_api::storage::consts::ReservedColumnId; use store_api::storage::{FileId, RegionId}; -use crate::read::{Batch, FlatSource, Source}; +use crate::read::{Batch, FlatSource}; use crate::sst::file::{FileHandle, FileMeta}; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; -use crate::test_util::{VecBatchReader, new_batch_builder, new_noop_file_purger}; +use crate::test_util::{new_batch_builder, new_noop_file_purger}; /// Test region id. const REGION_ID: RegionId = RegionId::new(0, 0); @@ -190,12 +190,6 @@ pub fn new_sparse_primary_key( buffer } -/// Creates a [Source] from `batches`. -pub fn new_source(batches: &[Batch]) -> Source { - let reader = VecBatchReader::new(batches); - Source::Reader(Box::new(reader)) -} - /// Creates a SST file handle with provided file id pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64) -> FileHandle { let file_purger = new_noop_file_purger(); From 59021ce83b94e8457283f08ebb92898e00ae300f Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 10 Apr 2026 15:56:33 +0800 Subject: [PATCH 089/195] fix: using uint64 datatype for postgres prepared statement parameters (#7942) * feat: add support for decimal parameter type, remove string replacement fallback * chore: format * fix: add support for using unsigned bigint in postgres * chore: format toml * refactor: cleanup duplicated code * fix: rescale decimal --- Cargo.lock | 1 + src/servers/Cargo.toml | 1 + src/servers/src/postgres/handler.rs | 15 +-- src/servers/src/postgres/types.rs | 191 ++++++++++++++++++---------- tests-integration/tests/sql.rs | 52 ++++++++ 5 files changed, 184 insertions(+), 76 
deletions(-) diff --git a/Cargo.lock b/Cargo.lock index edb8ce04d4..872095752b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12080,6 +12080,7 @@ dependencies = [ "regex", "reqwest", "rust-embed", + "rust_decimal", "rustls", "rustls-pemfile", "rustls-pki-types", diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 2d68f17699..46a51f1280 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -107,6 +107,7 @@ rand.workspace = true regex.workspace = true reqwest.workspace = true rust-embed = { version = "6.6", optional = true, features = ["debug-embed"] } +rust_decimal = { workspace = true, features = ["db-postgres"] } rustls = { workspace = true, default-features = false, features = ["ring", "logging", "std", "tls12"] } rustls-pemfile = "2.0" rustls-pki-types = "1.0" diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 2b84b3aa30..94363b06eb 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -456,16 +456,13 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner { .do_exec_plan(sql_plan.statement.clone(), plan, query_ctx.clone()) .await } else { - // manually replace variables in prepared statement when no - // logical_plan is generated. This happens when logical plan is not - // supported for certain statements. - let mut sql = sql_plan.query.clone(); - for i in 0..portal.parameter_len() { - sql = sql.replace(&format!("${}", i + 1), &parameter_to_string(portal, i)?); - } - + // We won't replace params from statement manually any more. + // Newer version of datafusion can generate plan for SELECT/INSERT/UPDATE/DELETE. + // Only CREATE TABLE and others minor statements cannot generate sql plan, + // in this case, we assume these statements will not carry parameters + // and execute them directly. 
self.query_handler - .do_query(&sql, query_ctx.clone()) + .do_query(&sql_plan.query, query_ctx.clone()) .await .remove(0) }; diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs index d4d15ef64a..203e477c6f 100644 --- a/src/servers/src/postgres/types.rs +++ b/src/servers/src/postgres/types.rs @@ -33,7 +33,7 @@ use datatypes::arrow::datatypes::DataType as ArrowDataType; use datatypes::json::JsonStructureSettings; use datatypes::prelude::{ConcreteDataType, Value}; use datatypes::schema::{Schema, SchemaRef}; -use datatypes::types::{IntervalType, TimestampType, jsonb_to_string}; +use datatypes::types::{Decimal128Type, IntervalType, TimestampType, jsonb_to_string}; use datatypes::value::StructValue; use futures::Stream; use pg_interval::Interval as PgInterval; @@ -43,6 +43,8 @@ use pgwire::api::results::FieldInfo; use pgwire::error::{PgWireError, PgWireResult}; use pgwire::types::format::FormatOptions as PgFormatOptions; use query::planner::DfLogicalPlanner; +use rust_decimal::Decimal; +use rust_decimal::prelude::ToPrimitive; use session::context::QueryContextRef; use snafu::ResultExt; @@ -293,11 +295,11 @@ pub(super) fn type_pg_to_gt(origin: &Type) -> Result { // Note that we only support a small amount of pg data types match origin { &Type::BOOL => Ok(ConcreteDataType::boolean_datatype()), - &Type::CHAR => Ok(ConcreteDataType::int8_datatype()), &Type::INT2 => Ok(ConcreteDataType::int16_datatype()), &Type::INT4 => Ok(ConcreteDataType::int32_datatype()), &Type::INT8 => Ok(ConcreteDataType::int64_datatype()), - &Type::VARCHAR | &Type::TEXT => Ok(ConcreteDataType::string_datatype()), + &Type::NUMERIC => Ok(ConcreteDataType::uint64_datatype()), + &Type::VARCHAR | &Type::CHAR | &Type::TEXT => Ok(ConcreteDataType::string_datatype()), &Type::TIMESTAMP | &Type::TIMESTAMPTZ => Ok(ConcreteDataType::timestamp_datatype( common_time::timestamp::TimeUnit::Millisecond, )), @@ -305,9 +307,6 @@ pub(super) fn type_pg_to_gt(origin: &Type) -> Result { 
&Type::TIME => Ok(ConcreteDataType::timestamp_datatype( common_time::timestamp::TimeUnit::Microsecond, )), - &Type::CHAR_ARRAY => Ok(ConcreteDataType::list_datatype(Arc::new( - ConcreteDataType::int8_datatype(), - ))), &Type::INT2_ARRAY => Ok(ConcreteDataType::list_datatype(Arc::new( ConcreteDataType::int16_datatype(), ))), @@ -317,9 +316,12 @@ pub(super) fn type_pg_to_gt(origin: &Type) -> Result { &Type::INT8_ARRAY => Ok(ConcreteDataType::list_datatype(Arc::new( ConcreteDataType::int64_datatype(), ))), - &Type::VARCHAR_ARRAY => Ok(ConcreteDataType::list_datatype(Arc::new( - ConcreteDataType::string_datatype(), + &Type::NUMERIC_ARRAY => Ok(ConcreteDataType::list_datatype(Arc::new( + ConcreteDataType::uint64_datatype(), ))), + &Type::VARCHAR_ARRAY | &Type::CHAR_ARRAY | &Type::TEXT_ARRAY => Ok( + ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::string_datatype())), + ), _ => server_error::InternalSnafu { err_msg: format!("unimplemented datatype {origin:?}"), } @@ -327,63 +329,6 @@ pub(super) fn type_pg_to_gt(origin: &Type) -> Result { } } -pub(super) fn parameter_to_string(portal: &Portal, idx: usize) -> PgWireResult { - // the index is managed from portal's parameters count so it's safe to - // unwrap here. - let param_type = portal - .statement - .parameter_types - .get(idx) - .unwrap() - .as_ref() - .unwrap_or(&Type::UNKNOWN); - match param_type { - &Type::VARCHAR | &Type::TEXT => Ok(format!( - "'{}'", - portal - .parameter::(idx, param_type)? - .as_deref() - .unwrap_or("") - )), - &Type::BOOL => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::INT4 => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::INT8 => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::FLOAT4 => Ok(portal - .parameter::(idx, param_type)? 
- .map(|v| v.to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::FLOAT8 => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::DATE => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::TIMESTAMP => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.format("%Y-%m-%d %H:%M:%S%.6f").to_string()) - .unwrap_or_else(|| "".to_owned())), - &Type::INTERVAL => Ok(portal - .parameter::(idx, param_type)? - .map(|v| v.to_sql()) - .unwrap_or_else(|| "".to_owned())), - _ => Err(invalid_parameter_error( - "unsupported_parameter_type", - Some(param_type.to_string()), - )), - } -} - pub(super) fn invalid_parameter_error(msg: &str, detail: Option) -> PgWireError { let mut error_info = PgErrorCode::Ec22023.to_err_info(msg.to_string()); error_info.detail = detail; @@ -407,6 +352,17 @@ where } } +fn to_decimal_scalar_value(data: Option, ctype: &Decimal128Type) -> ScalarValue { + if let Some(data) = data { + let mut value = data; + value.rescale(ctype.scale() as u32); + + ScalarValue::Decimal128(Some(value.mantissa()), ctype.precision(), ctype.scale()) + } else { + ScalarValue::Decimal128(None, ctype.precision(), ctype.scale()) + } +} + pub(super) fn parameters_to_scalar_values( plan: &LogicalPlan, portal: &Portal, @@ -442,7 +398,7 @@ pub(super) fn parameters_to_scalar_values( }; let value = match &client_type { - &Type::VARCHAR | &Type::TEXT => { + &Type::VARCHAR | &Type::TEXT | &Type::CHAR => { let data = portal.parameter::(idx, &client_type)?; if let Some(server_type) = &server_type { match server_type { @@ -558,6 +514,24 @@ pub(super) fn parameters_to_scalar_values( ScalarValue::Int64(data) } } + &Type::NUMERIC => { + let data = portal.parameter::(idx, &client_type)?; + match &server_type { + Some(ConcreteDataType::Decimal128(dt)) => to_decimal_scalar_value(data, dt), + Some(st @ ConcreteDataType::Timestamp(unit)) => { + 
to_timestamp_scalar_value(data.and_then(|n| n.to_i64()), unit, st)? + } + Some(ConcreteDataType::UInt64(_)) | None => { + ScalarValue::UInt64(data.and_then(|n| n.to_u64())) + } + Some(st) => { + return Err(invalid_parameter_error( + "invalid_parameter_type", + Some(format!("Expected: {}, found: {}", st, client_type)), + )); + } + } + } &Type::FLOAT4 => { let data = portal.parameter::(idx, &client_type)?; if let Some(server_type) = &server_type { @@ -837,7 +811,67 @@ pub(super) fn parameters_to_scalar_values( ScalarValue::Null } } - &Type::VARCHAR_ARRAY => { + &Type::NUMERIC_ARRAY => { + let data = portal.parameter::>>(idx, &client_type)?; + if let Some(data) = data { + let build_u64_list = |data: Vec>| { + let values = data + .into_iter() + .map(|n| ScalarValue::UInt64(n.and_then(|n| n.to_u64()))) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::UInt64, + true, + )) + }; + if let Some(server_type) = &server_type { + match server_type { + ConcreteDataType::List(list_type) => match list_type.item_type() { + ConcreteDataType::UInt64(_) => build_u64_list(data), + ConcreteDataType::Decimal128(dt) => { + let values = data + .into_iter() + .map(|n| to_decimal_scalar_value(n, dt)) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Decimal128(dt.precision(), dt.scale()), + true, + )) + } + _ => { + // the server type is not a list of decimal or uint64 + return Err(invalid_parameter_error( + "invalid_parameter_type", + Some(format!( + "Expected: {}, found: {}", + list_type.item_type(), + client_type + )), + )); + } + }, + _ => { + // the server type is not a list + return Err(invalid_parameter_error( + "invalid_parameter_type", + Some(format!( + "Expected: {}, found: {}", + server_type, client_type + )), + )); + } + } + } else { + // server type not provided + build_u64_list(data) + } + } else { + ScalarValue::Null + } + } + &Type::VARCHAR_ARRAY | &Type::TEXT_ARRAY | &Type::CHAR_ARRAY => { let data 
= portal.parameter::>>(idx, &client_type)?; if let Some(data) = data { let values = data.into_iter().map(|i| i.into()).collect::>(); @@ -1098,6 +1132,7 @@ pub fn format_options_from_query_ctx(query_ctx: &QueryContextRef) -> Arc Date: Fri, 10 Apr 2026 16:37:52 +0800 Subject: [PATCH 090/195] fix: fix current version comparison logic for pre-releases (#7946) Signed-off-by: liyang --- .github/scripts/check-version.sh | 69 +++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/.github/scripts/check-version.sh b/.github/scripts/check-version.sh index 28c2812ded..1efa3bb4db 100755 --- a/.github/scripts/check-version.sh +++ b/.github/scripts/check-version.sh @@ -30,13 +30,72 @@ CLEAN_LATEST=$(echo "$LATEST_VERSION" | sed 's/^v//' | sed 's/-nightly-.*//') echo "Current version: $CLEAN_CURRENT" echo "Latest release version: $CLEAN_LATEST" -# Use sort -V to compare versions -HIGHER_VERSION=$(printf "%s\n%s" "$CLEAN_CURRENT" "$CLEAN_LATEST" | sort -V | tail -n1) +# Function to extract base version (without pre-release suffix) +get_base_version() { + echo "$1" | sed -E 's/-(alpha|beta|rc|pre).*//' +} -if [ "$HIGHER_VERSION" = "$CLEAN_CURRENT" ]; then +# Function to check if a version is pre-release +is_prerelease() { + [[ "$1" =~ -(alpha|beta|rc|pre) ]] +} + +# Compare versions properly considering pre-release +compare_versions() { + local current=$1 + local latest=$2 + + # Extract base versions + local current_base=$(get_base_version "$current") + local latest_base=$(get_base_version "$latest") + + # Compare base versions first + HIGHER_BASE=$(printf "%s\n%s" "$current_base" "$latest_base" | sort -V | tail -n1) + + if [ "$HIGHER_BASE" = "$latest_base" ] && [ "$current_base" != "$latest_base" ]; then + # Latest has higher base version + echo "current_older" + return + elif [ "$HIGHER_BASE" = "$current_base" ] && [ "$current_base" != "$latest_base" ]; then + # Current has higher base version + echo "current_newer" + return + fi + + # Base 
versions are equal, compare pre-release status + if [ "$current_base" = "$latest_base" ]; then + # If current is pre-release and latest is not, current is older + if is_prerelease "$current" && ! is_prerelease "$latest"; then + echo "current_older" + return + fi + + # If latest is pre-release and current is not, current is newer + if ! is_prerelease "$current" && is_prerelease "$latest"; then + echo "current_newer" + return + fi + fi + + # Both are same type or different base versions already handled, use sort -V + HIGHER_VERSION=$(printf "%s\n%s" "$current" "$latest" | sort -V | tail -n1) + if [ "$HIGHER_VERSION" = "$current" ]; then + echo "current_newer_or_equal" + else + echo "current_older" + fi +} + +RESULT=$(compare_versions "$CLEAN_CURRENT" "$CLEAN_LATEST") + +if [ "$RESULT" = "current_newer" ] || [ "$RESULT" = "current_newer_or_equal" ]; then echo "Current version ($CLEAN_CURRENT) is NEWER than or EQUAL to latest ($CLEAN_LATEST)" - echo "is-current-version-latest=true" >> $GITHUB_OUTPUT + if [ -n "$GITHUB_OUTPUT" ]; then + echo "is-current-version-latest=true" >> $GITHUB_OUTPUT + fi else echo "Current version ($CLEAN_CURRENT) is OLDER than latest ($CLEAN_LATEST)" - echo "is-current-version-latest=false" >> $GITHUB_OUTPUT + if [ -n "$GITHUB_OUTPUT" ]; then + echo "is-current-version-latest=false" >> $GITHUB_OUTPUT + fi fi From 06e49961c75a05a63e0c74e0fe8c909e25d6d134 Mon Sep 17 00:00:00 2001 From: cui Date: Fri, 10 Apr 2026 17:22:12 +0800 Subject: [PATCH 091/195] fix(index): intersect bitmaps before early exit in predicates applier (#7867) * fix(index): intersect bitmaps before early exit in predicates applier The loop skipped intersecting when the next bitmap was empty, which left the accumulator unchanged instead of zeroing it. Intersect first, then break when the result is empty. 
Signed-off-by: Weixie Cui * per gemini * style(index): format predicates applier loop * fix(index): remove unused mut in predicates applier --------- Signed-off-by: Weixie Cui Co-authored-by: discord9 <55937128+discord9@users.noreply.github.com> Co-authored-by: discord9 --- .../search/index_apply/predicates_apply.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs index 441a4b4304..eaea8cfb95 100644 --- a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs +++ b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs @@ -94,15 +94,15 @@ impl IndexApplier for PredicatesIndexApplier { .collect::>(); let mut mapper = ParallelFstValuesMapper::new(reader); - let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?; + let bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?; - let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty - for bm in bm_vec { - if bm.count_ones() == 0 { + let mut iter = bm_vec.into_iter(); + let mut bitmap = iter.next().unwrap(); // SAFETY: `fst_ranges` is not empty + for bm in iter { + bitmap.intersect(bm); + if bitmap.count_ones() == 0 { break; } - - bitmap.intersect(bm); } output.matched_segment_ids = bitmap; From 76cad696c6ce26d9377491ca7ae46e078d6efbea Mon Sep 17 00:00:00 2001 From: fys <40801205+fengys1996@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:41:48 +0800 Subject: [PATCH 092/195] feat: add parquet nested leaf projection (#7900) * feat: add parquet nested leaf projection * rename ParquetProjection related struct * add some apis * extract common build schema function for test * remove unsed method * keep only deduped parquet root projection constructor * add more unit tests * fix: typo * fix: cr * fast-path parquet root projection without nested fields * extract a 
build_projection_mask method * fix: cargo clippy --- src/mito2/src/sst/parquet.rs | 1 + src/mito2/src/sst/parquet/read_columns.rs | 316 ++++++++++++++++++++++ src/mito2/src/sst/parquet/reader.rs | 10 +- 3 files changed, 323 insertions(+), 4 deletions(-) create mode 100644 src/mito2/src/sst/parquet/read_columns.rs diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 2447824ad9..90395642b6 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -31,6 +31,7 @@ pub mod format; pub(crate) mod helper; pub(crate) mod metadata; pub mod prefilter; +pub mod read_columns; pub mod reader; pub mod row_group; pub mod row_selection; diff --git a/src/mito2/src/sst/parquet/read_columns.rs b/src/mito2/src/sst/parquet/read_columns.rs new file mode 100644 index 0000000000..f0f35a4099 --- /dev/null +++ b/src/mito2/src/sst/parquet/read_columns.rs @@ -0,0 +1,316 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use parquet::arrow::ProjectionMask; +use parquet::schema::types::SchemaDescriptor; + +/// A nested field access path inside one parquet root column. +pub type ParquetNestedPath = Vec; + +/// The parquet columns to read. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParquetReadColumns { + cols: Vec, + has_nested: bool, +} + +impl ParquetReadColumns { + /// Builds root-column projections from root indices that are already + /// deduplicated. 
+ /// + /// Note: this constructor does not check for duplicates. + pub fn from_deduped_root_indices(root_indices: impl IntoIterator) -> Self { + let cols = root_indices + .into_iter() + .map(ParquetReadColumn::new) + .collect(); + Self { + cols, + has_nested: false, + } + } + + pub fn columns(&self) -> &[ParquetReadColumn] { + &self.cols + } + + pub fn has_nested(&self) -> bool { + self.has_nested + } + + pub fn root_indices_iter(&self) -> impl Iterator + '_ { + self.cols.iter().map(|col| col.root_index) + } +} + +/// Read requirement for a single parquet root column. +/// +/// `root_index` identifies the root column in the parquet schema. +/// +/// If `nested_paths` is empty, the whole root column is read. Otherwise, only +/// leaves under the specified nested paths are read. +/// +/// To construct a [`ParquetReadColumn`]: +/// - `ParquetReadColumn::new(0)` reads the whole root column at index `0`. +/// - `ParquetReadColumn::new(0).with_nested_paths(vec![vec!["j".into(), "b".into()]])` +/// reads only leaves under `j.b`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParquetReadColumn { + /// Root field index in the parquet schema. + root_index: usize, + /// Nested paths to read under this root column. + /// + /// Each path includes the root column itself. For example, for a root + /// column `j`, path `["j", "a", "b"]` refers to `j.a.b`. + /// + /// If empty, the whole root column is read. + nested_paths: Vec, +} + +impl ParquetReadColumn { + pub fn new(root_index: usize) -> Self { + Self { + root_index, + nested_paths: vec![], + } + } + + pub fn with_nested_paths(self, nested_paths: Vec) -> Self { + Self { + nested_paths, + ..self + } + } + + pub fn root_index(&self) -> usize { + self.root_index + } + + pub fn nested_paths(&self) -> &[ParquetNestedPath] { + &self.nested_paths + } +} + +/// Builds a projection mask from parquet read columns. 
+pub fn build_projection_mask( + parquet_read_cols: &ParquetReadColumns, + parquet_schema_desc: &SchemaDescriptor, +) -> ProjectionMask { + if parquet_read_cols.has_nested() { + let leaf_indices = build_parquet_leaves_indices(parquet_schema_desc, parquet_read_cols); + ProjectionMask::leaves(parquet_schema_desc, leaf_indices) + } else { + ProjectionMask::roots(parquet_schema_desc, parquet_read_cols.root_indices_iter()) + } +} + +/// Builds parquet leaf-column indices from parquet read columns. +fn build_parquet_leaves_indices( + parquet_schema_desc: &SchemaDescriptor, + projection: &ParquetReadColumns, +) -> Vec { + let mut map = HashMap::with_capacity(projection.cols.len()); + for col in &projection.cols { + map.insert(col.root_index, &col.nested_paths); + } + + let mut leaf_indices = Vec::new(); + for (leaf_idx, leaf_col) in parquet_schema_desc.columns().iter().enumerate() { + let root_idx = parquet_schema_desc.get_column_root_idx(leaf_idx); + let Some(nested_paths) = map.get(&root_idx) else { + continue; + }; + if nested_paths.is_empty() { + leaf_indices.push(leaf_idx); + continue; + } + + let leaf_path = leaf_col.path().parts(); + if nested_paths + .iter() + .any(|nested_path| leaf_path.starts_with(nested_path)) + { + leaf_indices.push(leaf_idx); + } + } + leaf_indices +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use parquet::basic::Repetition; + use parquet::schema::types::Type; + + use super::*; + + #[test] + fn test_reads_whole_root() { + let parquet_schema_desc = build_test_nested_parquet_schema(); + + let projection = ParquetReadColumns { + cols: vec![ParquetReadColumn { + root_index: 0, + nested_paths: vec![], + }], + has_nested: false, + }; + + assert_eq!( + vec![0, 1, 2], + build_parquet_leaves_indices(&parquet_schema_desc, &projection) + ); + } + + #[test] + fn test_filters_nested_paths() { + let parquet_schema_desc = build_test_nested_parquet_schema(); + + let projection = ParquetReadColumns { + cols: vec![ + ParquetReadColumn { + 
root_index: 0, + nested_paths: vec![vec!["j".to_string(), "b".to_string()]], + }, + ParquetReadColumn { + root_index: 1, + nested_paths: vec![], + }, + ], + has_nested: true, + }; + + assert_eq!( + vec![1, 2, 3], + build_parquet_leaves_indices(&parquet_schema_desc, &projection) + ); + } + + #[test] + fn test_reads_middle_level_path() { + let parquet_schema_desc = build_test_nested_parquet_schema(); + + let projection = ParquetReadColumns { + cols: vec![ParquetReadColumn { + root_index: 0, + nested_paths: vec![vec!["j".to_string(), "b".to_string()]], + }], + has_nested: true, + }; + + assert_eq!( + vec![1, 2], + build_parquet_leaves_indices(&parquet_schema_desc, &projection) + ); + } + + #[test] + fn test_reads_leaf_level_path() { + let parquet_schema_desc = build_test_nested_parquet_schema(); + + let projection = ParquetReadColumns { + cols: vec![ParquetReadColumn { + root_index: 0, + nested_paths: vec![vec!["j".to_string(), "b".to_string(), "c".to_string()]], + }], + has_nested: true, + }; + + assert_eq!( + vec![1], + build_parquet_leaves_indices(&parquet_schema_desc, &projection) + ); + } + + #[test] + fn test_merges_mixed_paths() { + let parquet_schema_desc = build_test_nested_parquet_schema(); + + let projection = ParquetReadColumns { + cols: vec![ParquetReadColumn { + root_index: 0, + nested_paths: vec![ + vec!["j".to_string(), "a".to_string()], + vec!["j".to_string(), "b".to_string(), "d".to_string()], + ], + }], + has_nested: true, + }; + + assert_eq!( + vec![0, 2], + build_parquet_leaves_indices(&parquet_schema_desc, &projection) + ); + } + + // Test schema: + // schema + // |- j + // | |- a: INT64 + // | `- b + // | |- c: INT64 + // | `- d: INT64 + // `- k: INT64 + fn build_test_nested_parquet_schema() -> SchemaDescriptor { + let leaf_a = Arc::new( + Type::primitive_type_builder("a", parquet::basic::Type::INT64) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ); + let leaf_c = Arc::new( + Type::primitive_type_builder("c", 
parquet::basic::Type::INT64) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ); + let leaf_d = Arc::new( + Type::primitive_type_builder("d", parquet::basic::Type::INT64) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ); + let group_b = Arc::new( + Type::group_type_builder("b") + .with_repetition(Repetition::REQUIRED) + .with_fields(vec![leaf_c, leaf_d]) + .build() + .unwrap(), + ); + let root_j = Arc::new( + Type::group_type_builder("j") + .with_repetition(Repetition::REQUIRED) + .with_fields(vec![leaf_a, group_b]) + .build() + .unwrap(), + ); + let root_k = Arc::new( + Type::primitive_type_builder("k", parquet::basic::Type::INT64) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ); + let schema = Arc::new( + Type::group_type_builder("schema") + .with_fields(vec![root_j, root_k]) + .build() + .unwrap(), + ); + + SchemaDescriptor::new(schema) + } +} diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 73ca7748e9..6fdbb6f243 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -79,6 +79,7 @@ use crate::sst::parquet::metadata::MetadataLoader; use crate::sst::parquet::prefilter::{ PrefilterContextBuilder, execute_prefilter, is_usable_primary_key_filter, }; +use crate::sst::parquet::read_columns::{ParquetReadColumns, build_projection_mask}; use crate::sst::parquet::row_group::ParquetFetchMetrics; use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::stats::RowGroupPruningStats; @@ -406,10 +407,11 @@ impl ParquetReaderBuilder { // Computes the projection mask. let parquet_schema_desc = parquet_meta.file_metadata().schema_descr(); - let indices = read_format.projection_indices(); - // Now we assumes we don't have nested schemas. - // TODO(yingwen): Revisit this if we introduce nested types such as JSON type. 
- let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied()); + let parquet_read_cols = ParquetReadColumns::from_deduped_root_indices( + read_format.projection_indices().iter().copied(), + ); + + let projection_mask = build_projection_mask(&parquet_read_cols, parquet_schema_desc); let selection = self .row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics) .await; From d1b2a310975d1c4d54eac746563ca87b74b0b9d1 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:42:55 +0800 Subject: [PATCH 093/195] fix: randomize standalone test ports in cli export test (#7955) fix/flaky-test: ### Add Dynamic Port Selection for Standalone Tests - **`cli.rs`**: Implemented functions `random_standalone_addrs` and `choose_random_unused_port_offset` to dynamically select unused ports for standalone tests, enhancing test reliability. - Updated `test_export_create_table_with_quoted_names` to use dynamically assigned ports for HTTP, RPC, MySQL, and PostgreSQL addresses. 
Signed-off-by: Lei, HUANG --- src/cmd/src/cli.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/src/cmd/src/cli.rs b/src/cmd/src/cli.rs index 84e797c291..95c5f00b77 100644 --- a/src/cmd/src/cli.rs +++ b/src/cmd/src/cli.rs @@ -102,31 +102,79 @@ impl Command { #[cfg(test)] mod tests { + use std::net::TcpListener; + use std::ops::RangeInclusive; + use clap::Parser; use client::{Client, Database}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_telemetry::logging::LoggingOptions; + use rand::Rng; use crate::error::Result as CmdResult; use crate::options::GlobalOptions; use crate::{App, cli, standalone}; + fn random_standalone_addrs() -> (String, String, String, String) { + let offset = choose_random_unused_port_offset(14000..=24000, 10); + + ( + format!("127.0.0.1:{}", 4000 + offset), + format!("127.0.0.1:{}", 4001 + offset), + format!("127.0.0.1:{}", 4002 + offset), + format!("127.0.0.1:{}", 4003 + offset), + ) + } + + fn choose_random_unused_port_offset( + port_range: RangeInclusive, + max_attempts: usize, + ) -> u16 { + let mut rng = rand::rng(); + + for _ in 0..max_attempts { + let http_port = rng.random_range(port_range.clone()); + let offset = http_port - 4000; + let ports = [4000 + offset, 4001 + offset, 4002 + offset, 4003 + offset]; + + let listeners = ports + .into_iter() + .map(|port| TcpListener::bind(("127.0.0.1", port))) + .collect::, _>>(); + + if listeners.is_ok() { + return offset; + } + } + + panic!("failed to find unused standalone test ports"); + } + #[tokio::test(flavor = "multi_thread")] async fn test_export_create_table_with_quoted_names() -> CmdResult<()> { let output_dir = tempfile::tempdir().unwrap(); + let (http_addr, rpc_addr, mysql_addr, postgres_addr) = random_standalone_addrs(); let standalone = standalone::Command::parse_from([ "standalone", "start", "--data-home", &*output_dir.path().to_string_lossy(), + "--http-addr", + &http_addr, + 
"--rpc-bind-addr", + &rpc_addr, + "--mysql-addr", + &mysql_addr, + "--postgres-addr", + &postgres_addr, ]); let standalone_opts = standalone.load_options(&GlobalOptions::default()).unwrap(); let mut instance = standalone.build(standalone_opts).await?; instance.start().await?; - let client = Client::with_urls(["127.0.0.1:4001"]); + let client = Client::with_urls([rpc_addr.as_str()]); let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client); database .sql(r#"CREATE DATABASE "cli.export.create_table";"#) @@ -149,7 +197,7 @@ mod tests { "data", "export", "--addr", - "127.0.0.1:4000", + &http_addr, "--output-dir", &*output_dir.path().to_string_lossy(), "--target", From 9f7ffb4d26f498ceb8ea24349f6e341fe29f8365 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:12:11 +0800 Subject: [PATCH 094/195] feat(mito2): allow CompactionOutput to succeed independently (#7948) * refactor(mito2): improve compaction error handling and file removal Refactor compaction task execution to enhance error handling and robustness. - Implemented parallel execution of compaction tasks with proper error capture and logging for individual task failures. - Ensured JoinSnafu is no longer directly used in error propagation, instead handling errors within the task processing loop. - Adjusted file removal logic to correctly include expired SSTs after compaction merges. Signed-off-by: Lei, HUANG * refactor(mito2): extract SstMerger trait for testability in compaction Extract SstMerger trait and DefaultSstMerger implementation to improve the testability of DefaultCompactor. The DefaultCompactor is now generic over SstMerger, allowing mock implementations to be injected for unit testing without relying on the full object storage access layer. This refactoring separates the concerns of SST file merging from the overall compaction orchestration logic. 
Additionally: - Updated CompactionScheduler to use DefaultCompactor::default(). - Added unit tests for DefaultCompactor using a MockMerger. Signed-off-by: Lei, HUANG * fix(compaction): propagate join error during sst flush Correctly propagates the error when joining SST flush handles during compaction. Previously, the error was logged but not returned, leading to potential silent failures. Also reorders some imports for consistency. Signed-off-by: Lei, HUANG * perf(compaction): pre-allocate capacity for compacted_inputs Pre-allocates capacity for the compacted_inputs vector based on the estimated total size of inputs and expired SSTs. This optimization aims to reduce vector reallocations during the compaction process. Signed-off-by: Lei, HUANG * feat/allow-partial-compaction: ### Commit Message Enhance `DefaultCompactor` and `MockMerger` for Improved Flexibility - **`compactor.rs`**: - Added `Clone` trait to `DefaultSstMerger` and `MockMerger` to allow cloning. - Removed `Arc` wrapping from `DefaultCompactor`'s `merger` field for direct usage. - Updated `merge_ssts` method to require `Clone` trait for `SstMerger`. - Modified `MockMerger` to use `Arc` for `results` and `call_idx` to ensure thread safety. - Adjusted error handling to use `error::InvalidMetaSnafu` directly. 
Signed-off-by: Lei, HUANG --------- Signed-off-by: Lei, HUANG --- src/mito2/src/compaction.rs | 2 +- src/mito2/src/compaction/compactor.rs | 390 +++++++++++++++++++++++--- 2 files changed, 354 insertions(+), 38 deletions(-) diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 944c51ebd6..d2120690ac 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -562,7 +562,7 @@ impl CompactionScheduler { listener, picker_output, compaction_region, - compactor: Arc::new(DefaultCompactor {}), + compactor: Arc::new(DefaultCompactor::default()), memory_manager: self.memory_manager.clone(), memory_policy: self.memory_policy, estimated_memory_bytes: estimated_bytes, diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index ff4317331f..fd3d01b276 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -38,11 +38,10 @@ use crate::compaction::picker::{PickerOutput, new_picker}; use crate::compaction::{CompactionOutput, CompactionSstReaderBuilder, find_dynamic_options}; use crate::config::MitoConfig; use crate::error::{ - EmptyRegionDirSnafu, InvalidPartitionExprSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Result, + EmptyRegionDirSnafu, InvalidPartitionExprSnafu, ObjectStoreNotFoundSnafu, Result, }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; -use crate::metrics; use crate::read::FlatSource; use crate::region::options::RegionOptions; use crate::region::version::VersionRef; @@ -56,6 +55,7 @@ use crate::sst::index::puffin_manager::PuffinManagerFactory; use crate::sst::location::region_dir_from_table_dir; use crate::sst::parquet::WriteOptions; use crate::sst::version::{SstVersion, SstVersionRef}; +use crate::{error, metrics}; /// Region version for compaction that does not hold memtables. 
#[derive(Clone)] @@ -299,12 +299,28 @@ pub trait Compactor: Send + Sync + 'static { ) -> Result<()>; } -/// DefaultCompactor is the default implementation of Compactor. -pub struct DefaultCompactor; - -impl DefaultCompactor { - /// Merge a single compaction output into SST files. +/// Trait for merging a single compaction output into SST files. +/// +/// This is extracted from `DefaultCompactor` to allow injecting mock +/// implementations in tests. +#[async_trait::async_trait] +pub trait SstMerger: Send + Sync + 'static { async fn merge_single_output( + &self, + compaction_region: CompactionRegion, + output: CompactionOutput, + write_opts: WriteOptions, + ) -> Result>; +} + +/// The production [`SstMerger`] that reads, merges, and writes SST files. +#[derive(Clone)] +pub struct DefaultSstMerger; + +#[async_trait::async_trait] +impl SstMerger for DefaultSstMerger { + async fn merge_single_output( + &self, compaction_region: CompactionRegion, output: CompactionOutput, write_opts: WriteOptions, @@ -424,54 +440,113 @@ impl DefaultCompactor { } } +/// DefaultCompactor is the default implementation of Compactor. +/// +/// It is parameterized by an [`SstMerger`] to allow injecting mock +/// implementations in tests. 
+pub struct DefaultCompactor { + merger: M, +} + +impl Default for DefaultCompactor { + fn default() -> Self { + Self { + merger: DefaultSstMerger, + } + } +} + +impl DefaultCompactor { + pub fn with_merger(merger: M) -> Self { + Self { merger } + } +} + #[async_trait::async_trait] -impl Compactor for DefaultCompactor { +impl Compactor for DefaultCompactor +where + M: Clone, +{ async fn merge_ssts( &self, compaction_region: &CompactionRegion, mut picker_output: PickerOutput, ) -> Result { - let mut futs = Vec::with_capacity(picker_output.outputs.len()); - let mut compacted_inputs = - Vec::with_capacity(picker_output.outputs.iter().map(|o| o.inputs.len()).sum()); let internal_parallelism = compaction_region.max_parallelism.max(1); let compaction_time_window = picker_output.time_window_size; + let region_id = compaction_region.region_id; + + // Build tasks along with their input file metas so we can track which + // inputs correspond to each task. + let mut tasks: Vec<(Vec, _)> = Vec::with_capacity(picker_output.outputs.len()); for output in picker_output.outputs.drain(..) { let inputs_to_remove: Vec<_> = output.inputs.iter().map(|f| f.meta_ref().clone()).collect(); - compacted_inputs.extend(inputs_to_remove.iter().cloned()); let write_opts = WriteOptions { write_buffer_size: compaction_region.engine_config.sst_write_buffer_size, max_file_size: picker_output.max_file_size, ..Default::default() }; - futs.push(Self::merge_single_output( - compaction_region.clone(), - output, - write_opts, - )); - } - let mut output_files = Vec::with_capacity(futs.len()); - while !futs.is_empty() { - let mut task_chunk = Vec::with_capacity(internal_parallelism); - for _ in 0..internal_parallelism { - if let Some(task) = futs.pop() { - task_chunk.push(common_runtime::spawn_compact(task)); - } - } - let metas = futures::future::try_join_all(task_chunk) - .await - .context(JoinSnafu)? 
- .into_iter() - .collect::>>>()?; - output_files.extend(metas.into_iter().flatten()); + let merger = self.merger.clone(); + let compaction_region = compaction_region.clone(); + let fut = async move { + merger + .merge_single_output(compaction_region, output, write_opts) + .await + }; + tasks.push((inputs_to_remove, fut)); } - // In case of remote compaction, we still allow the region edit after merge to - // clean expired ssts. - let mut inputs: Vec<_> = compacted_inputs.into_iter().collect(); - inputs.extend( + let mut output_files = Vec::with_capacity(tasks.len()); + let mut compacted_inputs = Vec::with_capacity( + tasks.iter().map(|(inputs, _)| inputs.len()).sum::() + + picker_output.expired_ssts.len(), + ); + + while !tasks.is_empty() { + let mut chunk: Vec<(Vec, _)> = Vec::with_capacity(internal_parallelism); + for _ in 0..internal_parallelism { + if let Some(task) = tasks.pop() { + chunk.push(task); + } + } + let spawned: Vec<_> = chunk + .into_iter() + .map(|(inputs, fut)| { + let handle = common_runtime::spawn_compact(fut); + (inputs, handle) + }) + .collect(); + + for (inputs, handle) in spawned { + match handle.await { + Ok(Ok(files)) => { + output_files.extend(files); + compacted_inputs.extend(inputs); + } + Ok(Err(e)) => { + warn!( + e; "Region {} failed to merge compaction output with inputs: [{}], skipping", + region_id, + inputs.iter().map(|f| f.file_id.to_string()).join(",") + ); + } + Err(e) => { + warn!( + "Region {} compaction task join error for inputs: [{}], skipping: {}", + region_id, + inputs.iter().map(|f| f.file_id.to_string()).join(","), + e + ); + return Err(e).context(error::JoinSnafu); + } + } + } + } + + // Include expired SSTs in removals — these don't depend on merge success. 
+ compacted_inputs.extend( picker_output .expired_ssts .iter() @@ -480,7 +555,7 @@ impl Compactor for DefaultCompactor { Ok(MergeOutput { files_to_add: output_files, - files_to_remove: inputs, + files_to_remove: compacted_inputs, compaction_time_window: Some(compaction_time_window), }) } @@ -558,3 +633,244 @@ impl Compactor for DefaultCompactor { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::sync::Mutex; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use store_api::storage::{FileId, RegionId}; + + use super::*; + use crate::cache::CacheManager; + use crate::compaction::picker::PickerOutput; + use crate::sst::file::FileHandle; + use crate::sst::file_purger::NoopFilePurger; + use crate::sst::version::SstVersion; + use crate::test_util::memtable_util::metadata_for_test; + use crate::test_util::scheduler_util::SchedulerEnv; + + fn dummy_file_meta() -> FileMeta { + FileMeta { + region_id: RegionId::new(1, 1), + file_id: FileId::random(), + file_size: 100, + ..Default::default() + } + } + + fn new_file_handle(meta: FileMeta) -> FileHandle { + FileHandle::new(meta, Arc::new(NoopFilePurger)) + } + + /// Build a minimal [`CompactionRegion`] suitable for tests where the + /// [`SstMerger`] is mocked and never touches the access layer. 
+ async fn new_test_compaction_region() -> CompactionRegion { + let env = SchedulerEnv::new().await; + let metadata = metadata_for_test(); + let manifest_ctx = env.mock_manifest_context(metadata.clone()).await; + CompactionRegion { + region_id: RegionId::new(1, 1), + region_options: RegionOptions::default(), + engine_config: Arc::new(MitoConfig::default()), + region_metadata: metadata.clone(), + cache_manager: Arc::new(CacheManager::default()), + access_layer: env.access_layer.clone(), + manifest_ctx, + current_version: CompactionVersion { + metadata, + options: RegionOptions::default(), + ssts: Arc::new(SstVersion::new()), + compaction_time_window: None, + }, + file_purger: None, + ttl: None, + max_parallelism: 1, + } + } + + /// An [`SstMerger`] that returns pre-configured results per call index. + /// + /// Call 0 gets `results[0]`, call 1 gets `results[1]`, etc. + #[derive(Clone)] + struct MockMerger { + results: Arc>>>>, + call_idx: Arc, + } + + impl MockMerger { + fn new(results: Vec>>) -> Self { + Self { + results: Arc::new(Mutex::new(results)), + call_idx: Arc::new(AtomicUsize::new(0)), + } + } + } + + #[async_trait::async_trait] + impl SstMerger for MockMerger { + async fn merge_single_output( + &self, + _compaction_region: CompactionRegion, + _output: CompactionOutput, + _write_opts: WriteOptions, + ) -> Result> { + let idx = self.call_idx.fetch_add(1, Ordering::SeqCst); + match self.results.lock().unwrap().get(idx) { + Some(Ok(files)) => Ok(files.clone()), + Some(Err(_)) => error::InvalidMetaSnafu { + reason: format!("simulated failure at index {idx}"), + } + .fail(), + None => panic!("MockMerger: no result configured for call index {idx}"), + } + } + } + + #[tokio::test] + async fn test_partial_merge_failure_collects_only_successful_outputs() { + common_telemetry::init_default_ut_logging(); + + let compaction_region = new_test_compaction_region().await; + + // Prepare 3 compaction outputs: output 0 and 2 succeed, output 1 fails. 
+ let input_meta_0 = dummy_file_meta(); + let input_meta_1 = dummy_file_meta(); + let input_meta_2 = dummy_file_meta(); + + let output_meta_0 = vec![dummy_file_meta()]; + let output_meta_2 = vec![dummy_file_meta(), dummy_file_meta()]; + + let merger = MockMerger::new(vec![ + Ok(output_meta_0.clone()), + Err(error::InvalidMetaSnafu { + reason: "boom".to_string(), + } + .build()), + Ok(output_meta_2.clone()), + ]); + let compactor = DefaultCompactor::with_merger(merger); + + let picker_output = PickerOutput { + outputs: vec![ + CompactionOutput { + output_level: 1, + inputs: vec![new_file_handle(input_meta_0.clone())], + filter_deleted: false, + output_time_range: None, + }, + CompactionOutput { + output_level: 1, + inputs: vec![new_file_handle(input_meta_1.clone())], + filter_deleted: false, + output_time_range: None, + }, + CompactionOutput { + output_level: 1, + inputs: vec![new_file_handle(input_meta_2.clone())], + filter_deleted: false, + output_time_range: None, + }, + ], + expired_ssts: vec![], + time_window_size: 3600, + max_file_size: None, + }; + + let merge_output = compactor + .merge_ssts(&compaction_region, picker_output) + .await + .unwrap(); + + // Outputs 0 and 2 succeeded (1 + 2 = 3 files added). + assert_eq!(merge_output.files_to_add.len(), 3); + // Only inputs from successful merges should be removed. + assert_eq!(merge_output.files_to_remove.len(), 2); + + let removed_ids: Vec<_> = merge_output + .files_to_remove + .iter() + .map(|f| f.file_id) + .collect(); + assert!(removed_ids.contains(&input_meta_0.file_id)); + assert!(removed_ids.contains(&input_meta_2.file_id)); + // The failed output's input must NOT be removed. 
+ assert!(!removed_ids.contains(&input_meta_1.file_id)); + } + + #[tokio::test] + async fn test_all_outputs_succeed() { + common_telemetry::init_default_ut_logging(); + + let compaction_region = new_test_compaction_region().await; + let input_meta = dummy_file_meta(); + let output_meta = vec![dummy_file_meta()]; + + let merger = MockMerger::new(vec![Ok(output_meta.clone())]); + let compactor = DefaultCompactor::with_merger(merger); + + let picker_output = PickerOutput { + outputs: vec![CompactionOutput { + output_level: 1, + inputs: vec![new_file_handle(input_meta.clone())], + filter_deleted: false, + output_time_range: None, + }], + expired_ssts: vec![], + time_window_size: 3600, + max_file_size: None, + }; + + let merge_output = compactor + .merge_ssts(&compaction_region, picker_output) + .await + .unwrap(); + + assert_eq!(merge_output.files_to_add.len(), 1); + assert_eq!(merge_output.files_to_add[0].file_id, output_meta[0].file_id); + assert_eq!(merge_output.files_to_remove.len(), 1); + assert_eq!(merge_output.files_to_remove[0].file_id, input_meta.file_id); + } + + #[tokio::test] + async fn test_expired_ssts_always_removed() { + common_telemetry::init_default_ut_logging(); + + let compaction_region = new_test_compaction_region().await; + let input_meta = dummy_file_meta(); + let expired_meta = dummy_file_meta(); + + // The single merge output fails, but expired SSTs should still be removed. 
+ let merger = MockMerger::new(vec![Err(error::InvalidMetaSnafu { + reason: "fail".to_string(), + } + .build())]); + let compactor = DefaultCompactor::with_merger(merger); + + let picker_output = PickerOutput { + outputs: vec![CompactionOutput { + output_level: 1, + inputs: vec![new_file_handle(input_meta.clone())], + filter_deleted: false, + output_time_range: None, + }], + expired_ssts: vec![new_file_handle(expired_meta.clone())], + time_window_size: 3600, + max_file_size: None, + }; + + let merge_output = compactor + .merge_ssts(&compaction_region, picker_output) + .await + .unwrap(); + + // No files added (merge failed). + assert!(merge_output.files_to_add.is_empty()); + // Only the expired SST should be in files_to_remove (not the failed merge's input). + assert_eq!(merge_output.files_to_remove.len(), 1); + assert_eq!( + merge_output.files_to_remove[0].file_id, + expired_meta.file_id + ); + } +} From 01a73105b8e5441ee66170cf3816085f68d9019e Mon Sep 17 00:00:00 2001 From: Yingwen Date: Mon, 13 Apr 2026 16:27:53 +0800 Subject: [PATCH 095/195] feat: use partition range cache in scan (#7873) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: use range cache in scan Signed-off-by: evenyag * refactor: rename dedup to skip_dedup Signed-off-by: evenyag * feat: use background concat for buffered batches Signed-off-by: evenyag * chore: fmt Signed-off-by: evenyag * fix: store permits Signed-off-by: evenyag * fix: fix potential panic Signed-off-by: evenyag * fix: skip range-cache wrapping when cache is disabled Signed-off-by: evenyag * fix: avoid potential deadlock Deadlock Chain 1. Range-level merge tasks: Each concurrent build_flat_partition_range_read (line 494-506) calls build_flat_reader_from_sources → create_parallel_flat_sources → spawn_flat_scan_task. These background tasks loop: acquire permit → input.next() → release permit. 2. 
Final merge tasks: After all range tasks return streams (line 509-511), the distributor calls build_flat_reader_from_sources again (line 520-527) → create_parallel_flat_sources → more spawn_flat_scan_task tasks. These also loop: acquire permit → input.next() → release permit. 3. Circular wait: The final merge tasks' input.next() reads from ReceiverStreams backed by range-level merge tasks. If all num_partitions permits are held by final merge tasks blocked on input.next(), the range-level merge tasks can't acquire permits to produce data → deadlock. Signed-off-by: evenyag * test: add test for small permits Signed-off-by: evenyag * feat: use avg batch size for channel size Signed-off-by: evenyag * test: fix test Signed-off-by: evenyag * chore: address review comments Signed-off-by: evenyag --------- Signed-off-by: evenyag --- src/mito2/src/cache.rs | 70 ++++ src/mito2/src/engine/scan_test.rs | 96 +++++ src/mito2/src/read/range_cache.rs | 658 +++++++++++++++++++----------- src/mito2/src/read/scan_util.rs | 139 ++++++- src/mito2/src/read/seq_scan.rs | 83 +++- src/mito2/src/read/series_scan.rs | 85 ++-- 6 files changed, 845 insertions(+), 286 deletions(-) diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index 35db74eee6..5d2559cba1 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -28,6 +28,7 @@ use std::ops::Range; use std::sync::Arc; use bytes::Bytes; +use common_base::readable_size::ReadableSize; use common_telemetry::warn; use datatypes::arrow::record_batch::RecordBatch; use datatypes::value::Value; @@ -72,6 +73,46 @@ const INDEX_TYPE: &str = "index"; const SELECTOR_RESULT_TYPE: &str = "selector_result"; /// Metrics type key for range scan result cache. 
const RANGE_RESULT_TYPE: &str = "range_result"; +const RANGE_RESULT_CONCAT_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(512); +const RANGE_RESULT_CONCAT_MEMORY_PERMIT: ReadableSize = ReadableSize::kb(1); + +#[derive(Debug)] +pub(crate) struct RangeResultMemoryLimiter { + semaphore: Arc, + permit_bytes: usize, +} + +impl Default for RangeResultMemoryLimiter { + fn default() -> Self { + Self::new( + RANGE_RESULT_CONCAT_MEMORY_LIMIT.as_bytes() as usize, + RANGE_RESULT_CONCAT_MEMORY_PERMIT.as_bytes() as usize, + ) + } +} + +impl RangeResultMemoryLimiter { + pub(crate) fn new(limit_bytes: usize, permit_bytes: usize) -> Self { + let permit_bytes = permit_bytes.max(1); + let permits = limit_bytes.div_ceil(permit_bytes).max(1); + Self { + semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), + permit_bytes, + } + } + + pub(crate) fn permit_bytes(&self) -> usize { + self.permit_bytes + } + + pub(crate) async fn acquire( + &self, + bytes: usize, + ) -> std::result::Result, tokio::sync::AcquireError> { + let permits = bytes.div_ceil(self.permit_bytes()).max(1) as u32; + self.semaphore.acquire_many(permits).await + } +} /// Cached SST metadata combines the parquet footer with the decoded region metadata. /// @@ -373,6 +414,23 @@ impl CacheStrategy { } } + /// Returns true if the range result cache is enabled. + pub(crate) fn has_range_result_cache(&self) -> bool { + match self { + CacheStrategy::EnableAll(cache_manager) => cache_manager.has_range_result_cache(), + CacheStrategy::Compaction(_) | CacheStrategy::Disabled => false, + } + } + + pub(crate) fn range_result_memory_limiter(&self) -> Option<&Arc> { + match self { + CacheStrategy::EnableAll(cache_manager) => { + Some(cache_manager.range_result_memory_limiter()) + } + CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None, + } + } + /// Calls [CacheManager::write_cache()]. /// It returns None if the strategy is [CacheStrategy::Disabled]. 
pub fn write_cache(&self) -> Option<&WriteCacheRef> { @@ -476,6 +534,8 @@ pub struct CacheManager { selector_result_cache: Option, /// Cache for range scan outputs in flat format. range_result_cache: Option, + /// Shared memory limiter for async range-result cache tasks. + range_result_memory_limiter: Arc, /// Cache for index result. index_result_cache: Option, } @@ -735,6 +795,15 @@ impl CacheManager { } } + /// Returns true if the range result cache is enabled. + pub(crate) fn has_range_result_cache(&self) -> bool { + self.range_result_cache.is_some() + } + + pub(crate) fn range_result_memory_limiter(&self) -> &Arc { + &self.range_result_memory_limiter + } + /// Gets the write cache. pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> { self.write_cache.as_ref() @@ -969,6 +1038,7 @@ impl CacheManagerBuilder { puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)), selector_result_cache, range_result_cache, + range_result_memory_limiter: Arc::new(RangeResultMemoryLimiter::default()), index_result_cache, } } diff --git a/src/mito2/src/engine/scan_test.rs b/src/mito2/src/engine/scan_test.rs index 119b4493fd..a39761ad01 100644 --- a/src/mito2/src/engine/scan_test.rs +++ b/src/mito2/src/engine/scan_test.rs @@ -403,3 +403,99 @@ fn collect_and_assert_partition_rows( actual_rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.cmp(&b.2))); actual_rows } + +/// Tests series scan with multiple partition ranges (each with multiple overlapping sources) +/// and small semaphore permits (controlled by num_partitions). 
+#[tokio::test] +async fn test_series_scan_flat_small_permits() { + let mut env = TestEnv::with_prefix("test_series_scan_small_permits").await; + let engine = env + .create_engine(MitoConfig { + default_flat_format: true, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new() + .insert_option("compaction.type", "twcs") + .insert_option("compaction.twcs.time_window", "1h") + .build(); + let column_schemas = test_util::rows_schema(&request); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // Create overlapping SSTs in each time window so partition ranges have multiple sources. + let put_flush_rows = async |start, end| { + let rows = Rows { + schema: column_schemas.clone(), + rows: test_util::build_rows(start, end), + }; + test_util::put_rows(&engine, region_id, rows).await; + test_util::flush_region(&engine, region_id, None).await; + }; + // Window 0 (0s-999s): 3 overlapping SSTs + put_flush_rows(0, 3).await; + put_flush_rows(1, 5).await; + put_flush_rows(3, 7).await; + // Window 1 (3600s-4599s): 2 overlapping SSTs + put_flush_rows(3600, 3603).await; + put_flush_rows(3601, 3605).await; + // Window 2 (7200s-8199s): 2 overlapping SSTs + put_flush_rows(7200, 7203).await; + put_flush_rows(7201, 7204).await; + + let mut expected_rows = Vec::new(); + for value in [ + 0_i64, 1, 2, 3, 4, 5, 6, 3600, 3601, 3602, 3603, 3604, 7200, 7201, 7202, 7203, + ] { + expected_rows.push((value.to_string(), value as f64, value * 1000)); + } + expected_rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.cmp(&b.2))); + + // Test with different semaphore sizes (num_partitions controls Semaphore::new(num_partitions)). 
+ for num_partitions in [1, 2] { + let request = ScanRequest { + distribution: Some(TimeSeriesDistribution::PerSeries), + ..Default::default() + }; + let scanner = engine.scanner(region_id, request).await.unwrap(); + let Scanner::Series(mut scanner) = scanner else { + panic!("Scanner should be series scan"); + }; + + // Collect all partition ranges and redistribute into `num_partitions` partitions. + let raw_ranges: Vec<_> = scanner + .properties() + .partitions + .iter() + .flatten() + .cloned() + .collect(); + assert!( + raw_ranges.len() >= 3, + "expected at least 3 partition ranges, got {}", + raw_ranges.len() + ); + + let mut new_ranges = vec![vec![]; num_partitions]; + for (i, range) in raw_ranges.into_iter().enumerate() { + new_ranges[i % num_partitions].push(range); + } + scanner + .prepare(PrepareRequest { + ranges: Some(new_ranges), + ..Default::default() + }) + .unwrap(); + + let actual_rows = collect_partition_rows_round_robin(&scanner, num_partitions).await; + assert_eq!( + expected_rows, actual_rows, + "mismatch with num_partitions={num_partitions}" + ); + } +} diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs index 2431a21f6a..1daaa6399b 100644 --- a/src/mito2/src/read/range_cache.rs +++ b/src/mito2/src/read/range_cache.rs @@ -18,22 +18,28 @@ use std::mem; use std::sync::Arc; use async_stream::try_stream; +use common_telemetry::warn; use common_time::range::TimestampRange; -use datatypes::arrow::array::{Array, AsArray, DictionaryArray}; -use datatypes::arrow::datatypes::UInt32Type; +use datatypes::arrow::compute::concat_batches; use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::ConcreteDataType; use futures::TryStreamExt; +use snafu::ResultExt; use store_api::region_engine::PartitionRange; use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector}; +use tokio::sync::{mpsc, oneshot}; use crate::cache::CacheStrategy; +use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu}; 
use crate::read::BoxedRecordBatchStream; use crate::read::scan_region::StreamContext; use crate::read::scan_util::PartitionMetrics; use crate::region::options::MergeMode; use crate::sst::file::FileTimeRange; -use crate::sst::parquet::flat_format::primary_key_column_index; +use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; + +const RANGE_CACHE_COMPACT_THRESHOLD_BYTES: usize = 2 * 1024 * 1024; +const RANGE_CACHE_SKIP_BYTES: usize = 512 * 1024 * 1024; /// Fingerprint of the scan request fields that affect partition range cache reuse. /// @@ -187,29 +193,48 @@ impl RangeScanCacheKey { } /// Cached result for one range scan. +#[derive(Debug)] +pub(crate) struct CachedBatchSlice { + batch: RecordBatch, + slice_lengths: Vec, +} + +impl CachedBatchSlice { + fn metadata_size(&self) -> usize { + self.slice_lengths.capacity() * mem::size_of::() + } +} + pub(crate) struct RangeScanCacheValue { - pub(crate) batches: Vec, - /// Precomputed size of all batches, accounting for shared dictionary values. + cached_batches: Vec, + /// Precomputed size of all compacted batches. estimated_batches_size: usize, } impl RangeScanCacheValue { - pub(crate) fn new(batches: Vec, estimated_batches_size: usize) -> Self { + pub(crate) fn new( + cached_batches: Vec, + estimated_batches_size: usize, + ) -> Self { Self { - batches, + cached_batches, estimated_batches_size, } } pub(crate) fn estimated_size(&self) -> usize { mem::size_of::() - + self.batches.capacity() * mem::size_of::() + + self.cached_batches.capacity() * mem::size_of::() + + self + .cached_batches + .iter() + .map(CachedBatchSlice::metadata_size) + .sum::() + self.estimated_batches_size } } /// Row groups and whether all sources are file-only for a partition range. -#[allow(dead_code)] pub(crate) struct PartitionRangeRowGroups { /// Sorted (file_id, row_group_index) pairs. 
pub(crate) row_groups: Vec<(FileId, i64)>, @@ -217,7 +242,6 @@ pub(crate) struct PartitionRangeRowGroups { } /// Collects (file_id, row_group_index) pairs from a partition range's row group indices. -#[allow(dead_code)] pub(crate) fn collect_partition_range_row_groups( stream_ctx: &StreamContext, part_range: &PartitionRange, @@ -244,11 +268,14 @@ pub(crate) fn collect_partition_range_row_groups( } /// Builds a cache key for the given partition range if it is eligible for caching. -#[allow(dead_code)] pub(crate) fn build_range_cache_key( stream_ctx: &StreamContext, part_range: &PartitionRange, ) -> Option { + if !stream_ctx.input.cache_strategy.has_range_result_cache() { + return None; + } + let fingerprint = stream_ctx.scan_fingerprint.as_ref()?; // Dyn filters can change at runtime, so we can't cache when they're present. @@ -283,7 +310,6 @@ pub(crate) fn build_range_cache_key( }) } -#[allow(dead_code)] fn query_time_range_covers_partition_range( query_time_range: Option<&TimestampRange>, partition_time_range: FileTimeRange, @@ -297,117 +323,232 @@ fn query_time_range_covers_partition_range( } /// Returns a stream that replays cached record batches. -#[allow(dead_code)] pub(crate) fn cached_flat_range_stream(value: Arc) -> BoxedRecordBatchStream { - Box::pin(futures::stream::iter( - value.batches.clone().into_iter().map(Ok), - )) + Box::pin(try_stream! { + for cached_batch in &value.cached_batches { + let mut offset = 0; + for &len in &cached_batch.slice_lengths { + yield cached_batch.batch.slice(offset, len); + offset += len; + } + } + }) } -/// Returns true if two primary key dictionary arrays share the same underlying -/// values buffers by pointer comparison. -/// -/// The primary key column is always `DictionaryArray` with `Binary` values. 
-fn pk_values_ptr_eq(a: &DictionaryArray, b: &DictionaryArray) -> bool { - let a = a.values().as_binary::(); - let b = b.values().as_binary::(); - let values_eq = a.values().ptr_eq(b.values()) && a.offsets().ptr_eq(b.offsets()); - match (a.nulls(), b.nulls()) { - (Some(a), Some(b)) => values_eq && a.inner().ptr_eq(b.inner()), - (None, None) => values_eq, - _ => false, +enum CacheConcatCommand { + Compact(Vec), + Finish { + pending: Vec, + key: RangeScanCacheKey, + cache_strategy: CacheStrategy, + part_metrics: PartitionMetrics, + result_tx: Option>>>, + }, +} + +#[derive(Default)] +struct CacheConcatState { + cached_batches: Vec, + estimated_size: usize, +} + +impl CacheConcatState { + async fn compact( + &mut self, + batches: Vec, + limiter: &crate::cache::RangeResultMemoryLimiter, + ) -> Result<()> { + if batches.is_empty() { + return Ok(()); + } + + let input_size = batches + .iter() + .map(RecordBatch::get_array_memory_size) + .sum::(); + let _permit = limiter.acquire(input_size).await.map_err(|_| { + UnexpectedSnafu { + reason: "range result memory limiter is unexpectedly closed", + } + .build() + })?; + + let compacted = compact_record_batches(batches)?; + self.estimated_size += compacted.batch.get_array_memory_size(); + self.cached_batches.push(compacted); + Ok(()) + } + + fn finish(self) -> RangeScanCacheValue { + RangeScanCacheValue::new(self.cached_batches, self.estimated_size) } } -/// Buffers record batches for caching, tracking memory size while deduplicating -/// shared dictionary values across batches. -/// -/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK -/// column's dictionary values are pointer-equal across batches, we assume all -/// dictionary columns share their values and deduct the total dictionary values size. 
-struct CacheBatchBuffer { +fn compact_record_batches(batches: Vec) -> Result { + debug_assert!(!batches.is_empty()); + + let slice_lengths = batches.iter().map(RecordBatch::num_rows).collect(); + build_cached_batch_slice(batches, slice_lengths) +} + +fn build_cached_batch_slice( batches: Vec, - /// Running total of batch memory. - total_size: usize, - /// The first batch's PK dictionary array, for pointer comparison. - /// `None` if no dictionary PK column exists or no batch has been added yet. - first_pk_dict: Option>, - /// Sum of `get_array_memory_size()` of all dictionary value arrays from the first batch. - total_dict_values_size: usize, - /// Whether the PK dictionary is still shared across all batches seen so far. - shared: bool, + slice_lengths: Vec, +) -> Result { + let batch = if batches.len() == 1 { + batches.into_iter().next().unwrap() + } else { + let schema = batches[0].schema(); + concat_batches(&schema, &batches).context(ComputeArrowSnafu)? + }; + + Ok(CachedBatchSlice { + batch, + slice_lengths, + }) +} + +async fn run_cache_concat_task( + mut rx: mpsc::UnboundedReceiver, + limiter: Arc, +) { + let mut state = CacheConcatState::default(); + + while let Some(cmd) = rx.recv().await { + match cmd { + CacheConcatCommand::Compact(batches) => { + if let Err(err) = state.compact(batches, &limiter).await { + warn!(err; "Failed to compact range cache batches"); + return; + } + } + CacheConcatCommand::Finish { + pending, + key, + cache_strategy, + part_metrics, + result_tx, + } => { + let result = state + .compact(pending, &limiter) + .await + .map(|()| state.finish()); + if let Err(err) = &result { + warn!(err; "Failed to finalize range cache batches"); + } + + let value = result.ok().map(Arc::new); + if let Some(value) = &value { + part_metrics + .inc_range_cache_size(key.estimated_size() + value.estimated_size()); + cache_strategy.put_range_result(key, value.clone()); + } + if let Some(tx) = result_tx { + let _ = tx.send(value); + } + return; + } + } + } 
+} + +struct CacheBatchBuffer { + buffered_batches: Vec, + buffered_rows: usize, + buffered_size: usize, + total_weight: usize, + sender: Option>, } impl CacheBatchBuffer { - fn new() -> Self { + fn new(cache_strategy: &CacheStrategy) -> Self { + let sender = cache_strategy.range_result_memory_limiter().map(|limiter| { + let (tx, rx) = mpsc::unbounded_channel(); + common_runtime::spawn_global(run_cache_concat_task(rx, limiter.clone())); + tx + }); + Self { - batches: Vec::new(), - total_size: 0, - first_pk_dict: None, - total_dict_values_size: 0, - shared: true, + buffered_batches: Vec::new(), + buffered_rows: 0, + buffered_size: 0, + total_weight: 0, + sender, } } - fn push(&mut self, batch: RecordBatch) { - if self.batches.is_empty() { - self.init_first_batch(&batch); - } else { - self.add_subsequent_batch(&batch); + fn push(&mut self, batch: RecordBatch) -> Result<()> { + if self.sender.is_none() { + return Ok(()); } - self.batches.push(batch); - } - fn init_first_batch(&mut self, batch: &RecordBatch) { - self.total_size += batch.get_array_memory_size(); - - let pk_col_idx = primary_key_column_index(batch.num_columns()); - let mut total_dict_values_size = 0; - for col_idx in 0..batch.num_columns() { - let col = batch.column(col_idx); - if let Some(dict) = col.as_any().downcast_ref::>() { - total_dict_values_size += dict.values().get_array_memory_size(); - if col_idx == pk_col_idx { - self.first_pk_dict = Some(dict.clone()); - } - } - } - self.total_dict_values_size = total_dict_values_size; - } - - fn add_subsequent_batch(&mut self, batch: &RecordBatch) { let batch_size = batch.get_array_memory_size(); - - if self.shared - && let Some(first_pk_dict) = &self.first_pk_dict - { - let pk_col_idx = primary_key_column_index(batch.num_columns()); - let col = batch.column(pk_col_idx); - if let Some(dict) = col.as_any().downcast_ref::>() - && pk_values_ptr_eq(first_pk_dict, dict) - { - // PK dict is shared, deduct all dict values sizes. 
- self.total_size += batch_size - self.total_dict_values_size; - return; - } - // Dictionary diverged. - self.shared = false; + self.total_weight += batch_size; + if self.total_weight > RANGE_CACHE_SKIP_BYTES { + self.buffered_batches.clear(); + self.buffered_rows = 0; + self.buffered_size = 0; + self.sender = None; + return Ok(()); } - self.total_size += batch_size; + self.buffered_rows += batch.num_rows(); + self.buffered_size += batch_size; + self.buffered_batches.push(batch); + + if self.buffered_rows > DEFAULT_READ_BATCH_SIZE + || self.buffered_size > RANGE_CACHE_COMPACT_THRESHOLD_BYTES + { + self.notify_compact(); + } + + Ok(()) } - fn estimated_batches_size(&self) -> usize { - self.total_size + fn notify_compact(&mut self) { + if self.buffered_batches.is_empty() || self.sender.is_none() { + return; + } + + let batches = mem::take(&mut self.buffered_batches); + self.buffered_rows = 0; + self.buffered_size = 0; + + let Some(sender) = &self.sender else { + return; + }; + if sender.send(CacheConcatCommand::Compact(batches)).is_err() { + self.sender = None; + } } - fn into_batches(self) -> Vec { - self.batches + fn finish( + mut self, + key: RangeScanCacheKey, + cache_strategy: CacheStrategy, + part_metrics: PartitionMetrics, + result_tx: Option>>>, + ) { + let Some(sender) = self.sender.take() else { + return; + }; + + if sender + .send(CacheConcatCommand::Finish { + pending: mem::take(&mut self.buffered_batches), + key, + cache_strategy, + part_metrics, + result_tx, + }) + .is_err() + { + self.sender = None; + } } } /// Wraps a stream to cache its output for future range cache hits. -#[allow(dead_code)] pub(crate) fn cache_flat_range_stream( mut stream: BoxedRecordBatchStream, cache_strategy: CacheStrategy, @@ -415,17 +556,13 @@ pub(crate) fn cache_flat_range_stream( part_metrics: PartitionMetrics, ) -> BoxedRecordBatchStream { Box::pin(try_stream! 
{ - let mut buffer = CacheBatchBuffer::new(); + let mut buffer = CacheBatchBuffer::new(&cache_strategy); while let Some(batch) = stream.try_next().await? { - buffer.push(batch.clone()); + buffer.push(batch.clone())?; yield batch; } - let estimated_size = buffer.estimated_batches_size(); - let batches = buffer.into_batches(); - let value = Arc::new(RangeScanCacheValue::new(batches, estimated_size)); - part_metrics.inc_range_cache_size(key.estimated_size() + value.estimated_size()); - cache_strategy.put_range_result(key, value); + buffer.finish(key, cache_strategy, part_metrics, None); }) } @@ -486,10 +623,11 @@ mod tests { use common_time::Timestamp; use common_time::range::TimestampRange; use common_time::timestamp::TimeUnit; + use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_common::ScalarValue; use datafusion_expr::{Expr, col, lit}; use smallvec::smallvec; - use store_api::storage::FileId; + use store_api::storage::{FileId, RegionId}; use super::*; use crate::cache::CacheManager; @@ -508,6 +646,44 @@ mod tests { )) } + fn test_cache_context(strategy: &CacheStrategy) -> (RangeScanCacheKey, PartitionMetrics) { + let region_id = RegionId::new(1, 1); + let key = RangeScanCacheKey { + region_id, + row_groups: vec![], + scan: ScanRequestFingerprintBuilder { + read_column_ids: vec![], + read_column_types: vec![], + filters: vec![], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: false, + merge_mode: MergeMode::LastRow, + partition_expr_version: 0, + } + .build(), + }; + + let metrics_set = ExecutionPlanMetricsSet::new(); + let part_metrics = + PartitionMetrics::new(region_id, 0, "test", Instant::now(), false, &metrics_set); + + assert!(strategy.get_range_result(&key).is_none()); + (key, part_metrics) + } + + async fn finish_cache_batch_buffer( + buffer: CacheBatchBuffer, + key: RangeScanCacheKey, + cache_strategy: CacheStrategy, + part_metrics: PartitionMetrics, + ) -> Option> { + let (tx, rx) = 
oneshot::channel(); + buffer.finish(key, cache_strategy, part_metrics, Some(tx)); + rx.await.context(crate::error::RecvSnafu).ok().flatten() + } + async fn new_stream_context( filters: Vec, query_time_range: Option, @@ -687,169 +863,175 @@ mod tests { ); } - /// Creates a test schema with 5 columns where the primary key dictionary column - /// is at index 2 (`num_columns - 3`), matching the flat format layout. - /// - /// Layout: `[field0: Int64, field1: Int64, pk: Dictionary, ts: Int64, seq: Int64]` - fn dict_test_schema() -> Arc { + fn test_schema() -> Arc { use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; - Arc::new(Schema::new(vec![ - Field::new("field0", ArrowDataType::Int64, false), - Field::new("field1", ArrowDataType::Int64, false), - Field::new( - "pk", - ArrowDataType::Dictionary( - Box::new(ArrowDataType::UInt32), - Box::new(ArrowDataType::Binary), - ), - false, - ), - Field::new("ts", ArrowDataType::Int64, false), - Field::new("seq", ArrowDataType::Int64, false), - ])) + + Arc::new(Schema::new(vec![Field::new( + "value", + ArrowDataType::Int64, + false, + )])) } - /// Helper to create a record batch with a dictionary column at the primary key position. 
- fn make_dict_batch( - schema: Arc, - dict_values: &datatypes::arrow::array::BinaryArray, - keys: &[u32], - int_values: &[i64], - ) -> RecordBatch { - use datatypes::arrow::array::{Int64Array, UInt32Array}; + fn make_batch(values: &[i64]) -> RecordBatch { + use datatypes::arrow::array::Int64Array; - let key_array = UInt32Array::from(keys.to_vec()); - let dict_array: DictionaryArray = - DictionaryArray::new(key_array, Arc::new(dict_values.clone())); - let int_array = Int64Array::from(int_values.to_vec()); - let zeros = Int64Array::from(vec![0i64; int_values.len()]); RecordBatch::try_new( - schema, - vec![ - Arc::new(zeros.clone()), - Arc::new(int_array), - Arc::new(dict_array), - Arc::new(zeros.clone()), - Arc::new(zeros), - ], + test_schema(), + vec![Arc::new(Int64Array::from(values.to_vec()))], ) .unwrap() } - /// Computes the total `get_array_memory_size()` of all dictionary value arrays in a batch. - fn compute_total_dict_values_size(batch: &RecordBatch) -> usize { - batch - .columns() - .iter() - .filter_map(|col| { - col.as_any() - .downcast_ref::>() - .map(|dict| dict.values().get_array_memory_size()) - }) - .sum() - } - - #[test] - fn cache_batch_buffer_empty() { - let buffer = CacheBatchBuffer::new(); - assert_eq!(buffer.estimated_batches_size(), 0); - assert!(buffer.into_batches().is_empty()); - } - - #[test] - fn cache_batch_buffer_single_batch() { + fn make_large_binary_batch(rows: usize, bytes_per_row: usize) -> RecordBatch { use datatypes::arrow::array::BinaryArray; + use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; - let schema = dict_test_schema(); - let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]); - let batch = make_dict_batch(schema, &dict_values, &[0, 1, 2], &[10, 20, 30]); + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + ArrowDataType::Binary, + false, + )])); + let payload = vec![b'x'; bytes_per_row]; + let values = (0..rows).map(|_| payload.as_slice()).collect::>(); - let full_size = 
batch.get_array_memory_size(); - - let mut buffer = CacheBatchBuffer::new(); - buffer.push(batch); - assert_eq!(buffer.estimated_batches_size(), full_size); - assert_eq!(buffer.into_batches().len(), 1); + RecordBatch::try_new(schema, vec![Arc::new(BinaryArray::from_vec(values))]).unwrap() } #[test] - fn cache_batch_buffer_shared_dictionary() { - use datatypes::arrow::array::BinaryArray; + fn compact_record_batches_keeps_original_boundaries() { + let batches = vec![make_batch(&[1, 2]), make_batch(&[3]), make_batch(&[4, 5])]; - let schema = dict_test_schema(); - let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]); + let compacted = compact_record_batches(batches).unwrap(); - // Two batches sharing the same dictionary values array. - let batch1 = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]); - let batch2 = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]); + assert_eq!(compacted.batch.num_rows(), 5); + assert_eq!(compacted.slice_lengths, vec![2, 1, 2]); + } - let batch1_full = batch1.get_array_memory_size(); - let batch2_full = batch2.get_array_memory_size(); + #[tokio::test] + async fn cached_flat_range_stream_replays_original_batches() { + let value = Arc::new(RangeScanCacheValue::new( + vec![CachedBatchSlice { + batch: make_batch(&[1, 2, 3]), + slice_lengths: vec![2, 1], + }], + make_batch(&[1, 2, 3]).get_array_memory_size(), + )); - // The total dictionary values size that should be deduplicated for the second batch. - let dict_values_size = compute_total_dict_values_size(&batch2); + let replayed = cached_flat_range_stream(value) + .try_collect::>() + .await + .unwrap(); - let mut buffer = CacheBatchBuffer::new(); - buffer.push(batch1); - buffer.push(batch2); + assert_eq!(replayed.len(), 2); + assert_eq!(replayed[0].num_rows(), 2); + assert_eq!(replayed[1].num_rows(), 1); + } - // Second batch's dict values should not be counted again. 
+ #[tokio::test] + async fn cache_batch_buffer_finishes_pending_batches() { + let strategy = test_cache_strategy(); + let batch = make_batch(&[1, 2, 3]); + let expected_size = batch.get_array_memory_size(); + let (key, part_metrics) = test_cache_context(&strategy); + + let mut buffer = CacheBatchBuffer::new(&strategy); + buffer.push(batch).unwrap(); + + let value = finish_cache_batch_buffer(buffer, key.clone(), strategy.clone(), part_metrics) + .await + .unwrap(); + assert_eq!(value.cached_batches.len(), 1); + assert_eq!(value.cached_batches[0].slice_lengths, vec![3]); + assert_eq!(value.estimated_batches_size, expected_size); + assert!(Arc::ptr_eq( + &value, + &strategy.get_range_result(&key).unwrap() + )); + } + + #[tokio::test] + async fn cache_batch_buffer_compacts_when_rows_exceed_default_batch_size() { + let strategy = test_cache_strategy(); + let batch = make_batch(&vec![1; DEFAULT_READ_BATCH_SIZE / 2 + 1]); + let (key, part_metrics) = test_cache_context(&strategy); + + let mut buffer = CacheBatchBuffer::new(&strategy); + buffer.push(batch.clone()).unwrap(); + buffer.push(batch).unwrap(); + + assert_eq!(buffer.buffered_rows, 0); + assert!(buffer.buffered_batches.is_empty()); + + let value = finish_cache_batch_buffer(buffer, key, strategy, part_metrics) + .await + .unwrap(); + assert_eq!(value.cached_batches.len(), 1); assert_eq!( - buffer.estimated_batches_size(), - batch1_full + batch2_full - dict_values_size + value.cached_batches[0].slice_lengths, + vec![ + DEFAULT_READ_BATCH_SIZE / 2 + 1, + DEFAULT_READ_BATCH_SIZE / 2 + 1 + ] ); - assert_eq!(buffer.into_batches().len(), 2); } - #[test] - fn cache_batch_buffer_non_shared_dictionary() { - use datatypes::arrow::array::BinaryArray; + #[tokio::test] + async fn cache_batch_buffer_compacts_when_buffered_size_exceeds_threshold() { + let strategy = test_cache_strategy(); + let large_batch = make_large_binary_batch(DEFAULT_READ_BATCH_SIZE, 4096); + let (key, part_metrics) = test_cache_context(&strategy); - let 
schema = dict_test_schema(); - let dict_values1 = BinaryArray::from_vec(vec![b"a", b"b"]); - let dict_values2 = BinaryArray::from_vec(vec![b"x", b"y"]); + let mut buffer = CacheBatchBuffer::new(&strategy); + buffer.push(large_batch.clone()).unwrap(); - let batch1 = make_dict_batch(schema.clone(), &dict_values1, &[0, 1], &[10, 20]); - let batch2 = make_dict_batch(schema, &dict_values2, &[0, 1], &[30, 40]); + assert_eq!(buffer.buffered_rows, 0); + assert!(buffer.buffered_batches.is_empty()); - let batch1_full = batch1.get_array_memory_size(); - let batch2_full = batch2.get_array_memory_size(); - - let mut buffer = CacheBatchBuffer::new(); - buffer.push(batch1); - buffer.push(batch2); - - // Different dictionaries: full size for both. - assert_eq!(buffer.estimated_batches_size(), batch1_full + batch2_full); - } - - #[test] - fn cache_batch_buffer_shared_then_diverged() { - use datatypes::arrow::array::BinaryArray; - - let schema = dict_test_schema(); - let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]); - let different_values = BinaryArray::from_vec(vec![b"x", b"y"]); - - let batch1 = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]); - let batch2 = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]); - let batch3 = make_dict_batch(schema, &different_values, &[0], &[3]); - - let size1 = batch1.get_array_memory_size(); - let size2 = batch2.get_array_memory_size(); - let size3 = batch3.get_array_memory_size(); - - let dict_values_size = compute_total_dict_values_size(&batch2); - - let mut buffer = CacheBatchBuffer::new(); - buffer.push(batch1); - buffer.push(batch2); - buffer.push(batch3); - - // batch2 shares dict with batch1 (dedup), batch3 does not (full size). 
+ let value = finish_cache_batch_buffer(buffer, key, strategy, part_metrics) + .await + .unwrap(); + assert_eq!(value.cached_batches.len(), 1); assert_eq!( - buffer.estimated_batches_size(), - size1 + (size2 - dict_values_size) + size3 + value.cached_batches[0].slice_lengths, + vec![large_batch.num_rows()] + ); + } + + #[tokio::test] + async fn cache_batch_buffer_uses_compacted_size_for_weight() { + let strategy = test_cache_strategy(); + let batch1 = make_batch(&[1, 2]); + let batch2 = make_batch(&[3, 4]); + let (key, part_metrics) = test_cache_context(&strategy); + let expected = concat_batches(&test_schema(), &[batch1.clone(), batch2.clone()]) + .unwrap() + .get_array_memory_size(); + + let mut buffer = CacheBatchBuffer::new(&strategy); + buffer.push(batch1).unwrap(); + buffer.push(batch2).unwrap(); + + let value = finish_cache_batch_buffer(buffer, key, strategy, part_metrics) + .await + .unwrap(); + assert_eq!(value.estimated_batches_size, expected); + } + + #[tokio::test] + async fn cache_batch_buffer_skips_cache_when_weight_exceeds_limit() { + let strategy = test_cache_strategy(); + let (key, part_metrics) = test_cache_context(&strategy); + let mut buffer = CacheBatchBuffer::new(&strategy); + buffer.total_weight = RANGE_CACHE_SKIP_BYTES; + + buffer.push(make_batch(&[1])).unwrap(); + + assert!(buffer.sender.is_none()); + assert!( + finish_cache_batch_buffer(buffer, key, strategy, part_metrics) + .await + .is_none() ); } } diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 8fc946b3d3..80563f32a9 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -1258,13 +1258,25 @@ pub(crate) fn should_split_flat_batches_for_merge( // This is a file range. 
let file_index = index.index - stream_ctx.input.num_memtables(); let file = &stream_ctx.input.files[file_index]; - if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD || file.meta_ref().num_series == 0 { + let file_meta = file.meta_ref(); + if file_meta.level == 0 { + // Always split level 0 files. + num_files_to_split += 1; + continue; + } else if file_meta.num_rows < SPLIT_ROW_THRESHOLD || file_meta.num_series == 0 { // If the file doesn't have enough rows, or the number of series is unavailable, skips it. continue; } - debug_assert!(file.meta_ref().num_rows > 0); - if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) { + debug_assert!(file_meta.num_rows > 0); + if !can_split_series(file_meta.num_rows, file_meta.num_series) { // We can't split batches in a file. + common_telemetry::trace!( + "Can't split series for file {}, level: {}, num_rows: {}, num_series: {}", + file_meta.file_id, + file_meta.level, + file_meta.num_rows, + file_meta.num_series, + ); return None; } else { num_files_to_split += 1; @@ -1310,14 +1322,108 @@ pub(crate) fn compute_parallel_channel_size(estimated_rows_per_batch: usize) -> size.clamp(2, 64) } +/// Computes the average estimated rows per batch across multiple range readers. +pub(crate) fn compute_average_batch_size( + estimated_rows_per_batch: impl IntoIterator, +) -> usize { + let mut total = 0usize; + let mut count = 0usize; + for size in estimated_rows_per_batch { + total += size; + count += 1; + } + + if count == 0 { + return DEFAULT_READ_BATCH_SIZE; + } + + (total / count).clamp(1, DEFAULT_READ_BATCH_SIZE) +} + fn can_split_series(num_rows: u64, num_series: u64) -> bool { - assert!(num_series > 0); - assert!(num_rows > 0); + if num_rows == 0 || num_series == 0 { + return false; + } // It doesn't have too many series or it will have enough rows for each batch. 
num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD } +#[cfg(test)] +mod split_tests { + use std::sync::Arc; + + use common_time::Timestamp; + use smallvec::smallvec; + use store_api::storage::FileId; + + use super::*; + use crate::read::projection::ProjectionMapper; + use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex}; + use crate::read::scan_region::{ScanInput, StreamContext}; + use crate::sst::file::FileHandle; + use crate::test_util::memtable_util::metadata_with_primary_key; + use crate::test_util::scheduler_util::SchedulerEnv; + use crate::test_util::sst_util::sst_file_handle_with_file_id; + + async fn new_stream_context_with_files(files: Vec) -> StreamContext { + let env = SchedulerEnv::new().await; + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); + let input = ScanInput::new(env.access_layer.clone(), mapper).with_files(files); + + StreamContext { + input, + ranges: vec![], + scan_fingerprint: None, + query_start: std::time::Instant::now(), + } + } + + fn single_file_range_meta() -> RangeMeta { + RangeMeta { + time_range: ( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + ), + indices: smallvec![SourceIndex { + index: 0, + num_row_groups: 1, + }], + row_group_indices: smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }], + num_rows: 1024, + } + } + + #[tokio::test] + async fn should_split_level_zero_file_even_when_series_stats_are_missing() { + let mut file = sst_file_handle_with_file_id(FileId::random(), 0, 1000) + .meta_ref() + .clone(); + file.level = 0; + file.num_rows = DEFAULT_ROW_GROUP_SIZE as u64; + file.num_row_groups = 1; + file.num_series = 0; + + let file = FileHandle::new(file, crate::test_util::new_noop_file_purger()); + let stream_ctx = Arc::new(new_stream_context_with_files(vec![file]).await); + + assert!( + should_split_flat_batches_for_merge(&stream_ctx, 
&single_file_range_meta()).is_some() + ); + } + + #[test] + fn can_split_series_returns_false_for_zero_inputs() { + assert!(!can_split_series(0, 1)); + assert!(!can_split_series(1, 0)); + assert!(!can_split_series(0, 0)); + } +} + /// Creates a new [ReaderFilterMetrics] with optional apply metrics initialized /// based on the `explain_verbose` flag. fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics { @@ -1653,6 +1759,7 @@ mod tests { let meta = FileMeta { region_id: RegionId::new(123, 456), file_id: Default::default(), + level: 1, time_range: ( Timestamp::new_millisecond(0), Timestamp::new_millisecond(1000), @@ -1816,4 +1923,26 @@ mod tests { compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE * 2) ); } + + #[test] + fn test_compute_average_batch_size_uses_arithmetic_mean() { + assert_eq!(24, compute_average_batch_size([16, 24, 32])); + } + + #[test] + fn test_compute_average_batch_size_clamps_values() { + assert_eq!( + DEFAULT_READ_BATCH_SIZE, + compute_average_batch_size([DEFAULT_READ_BATCH_SIZE, DEFAULT_READ_BATCH_SIZE * 2]) + ); + assert_eq!(1, compute_average_batch_size([0, 1])); + } + + #[test] + fn test_compute_average_batch_size_falls_back_when_empty() { + assert_eq!( + DEFAULT_READ_BATCH_SIZE, + compute_average_batch_size(std::iter::empty()) + ); + } } diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index 15ab435425..932d382834 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -41,6 +41,9 @@ use crate::read::flat_merge::FlatMergeReader; use crate::read::last_row::FlatLastRowReader; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::range::RangeMeta; +use crate::read::range_cache::{ + build_range_cache_key, cache_flat_range_stream, cached_flat_range_stream, +}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, compute_parallel_channel_size, @@ -181,19 
+184,22 @@ impl SeqScan { sources, None, None, + false, compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE), ) .await } - /// Builds a flat reader to read sources that returns RecordBatch. If `semaphore` is provided, reads sources in parallel - /// if possible. + /// Builds a flat reader to read sources that returns RecordBatch. + /// If `semaphore` is provided, reads sources in parallel if possible. + /// If `skip_dedup` is true, the merged stream is returned without applying flat dedup. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] pub(crate) async fn build_flat_reader_from_sources( stream_ctx: &StreamContext, mut sources: Vec, semaphore: Option>, part_metrics: Option<&PartitionMetrics>, + skip_dedup: bool, channel_size: usize, ) -> Result { if let Some(semaphore) = semaphore.as_ref() { @@ -215,7 +221,7 @@ impl SeqScan { FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE, metrics_reporter) .await?; - let dedup = !stream_ctx.input.append_mode; + let dedup = !skip_dedup && !stream_ctx.input.append_mode; let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter()); let reader = if dedup { match stream_ctx.input.merge_mode { @@ -253,6 +259,62 @@ impl SeqScan { Ok(reader) } + /// Builds a flat read stream for one partition range. 
+ pub(crate) async fn build_flat_partition_range_read( + stream_ctx: &Arc, + part_range: &PartitionRange, + compaction: bool, + part_metrics: &PartitionMetrics, + partition_pruner: Arc, + file_scan_semaphore: Option>, + merge_semaphore: Option>, + ) -> Result<(BoxedRecordBatchStream, usize)> { + let cache_key = build_range_cache_key(stream_ctx, part_range); + + if let Some(key) = cache_key.as_ref() { + if let Some(value) = stream_ctx.input.cache_strategy.get_range_result(key) { + part_metrics.inc_range_cache_hit(); + return Ok((cached_flat_range_stream(value), DEFAULT_READ_BATCH_SIZE)); + } + part_metrics.inc_range_cache_miss(); + } + + let mut sources = Vec::new(); + let split_batch_size = build_flat_sources( + stream_ctx, + part_range, + compaction, + part_metrics, + partition_pruner, + &mut sources, + file_scan_semaphore, + ) + .await?; + let estimated_rows_per_batch = split_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE); + let channel_size = compute_parallel_channel_size(estimated_rows_per_batch); + let stream = Self::build_flat_reader_from_sources( + stream_ctx, + sources, + merge_semaphore, + Some(part_metrics), + false, + channel_size, + ) + .await?; + + let stream = match cache_key { + Some(key) => cache_flat_range_stream( + stream, + stream_ctx.input.cache_strategy.clone(), + key, + part_metrics.clone(), + ), + None => stream, + }; + + Ok((stream, estimated_rows_per_batch)) + } + /// Scans the given partition when the part list is set properly. /// Otherwise the returned stream might not contains any data. fn scan_partition_impl( @@ -331,23 +393,16 @@ impl SeqScan { // Scans each part. 
for part_range in partition_ranges { - let mut sources = Vec::new(); - let split_batch_size = build_flat_sources( + let (mut reader, _) = Self::build_flat_partition_range_read( &stream_ctx, &part_range, compaction, &part_metrics, partition_pruner.clone(), - &mut sources, file_scan_semaphore.clone(), - ).await?; - - let channel_size = compute_parallel_channel_size( - split_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE), - ); - let mut reader = - Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics), channel_size) - .await?; + semaphore.clone(), + ) + .await?; let mut metrics = ScannerMetrics { scan_cost: fetch_start.elapsed(), diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index bf7ed072ab..7883c1d553 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -41,18 +41,18 @@ use tokio::sync::mpsc::error::{SendTimeoutError, TrySendError}; use tokio::sync::mpsc::{self, Receiver, Sender}; use crate::error::{ - Error, InvalidSenderSnafu, PartitionOutOfRangeSnafu, Result, ScanMultiTimesSnafu, + Error, InvalidSenderSnafu, JoinSnafu, PartitionOutOfRangeSnafu, Result, ScanMultiTimesSnafu, ScanSeriesSnafu, TooManyFilesToReadSnafu, }; use crate::read::ScannerMetrics; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics, compute_parallel_channel_size, + PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics, compute_average_batch_size, + compute_parallel_channel_size, }; -use crate::read::seq_scan::{SeqScan, build_flat_sources}; +use crate::read::seq_scan::SeqScan; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::PrimaryKeyArray; @@ 
-227,7 +227,8 @@ impl SeriesScan { let (senders, receivers) = new_channel_list(self.properties.num_partitions()); let mut distributor = SeriesDistributor { stream_ctx: self.stream_ctx.clone(), - semaphore: Some(Arc::new(Semaphore::new(self.properties.num_partitions()))), + range_semaphore: Some(Arc::new(Semaphore::new(self.properties.num_partitions()))), + final_merge_semaphore: Some(Arc::new(Semaphore::new(self.properties.num_partitions()))), partitions: self.properties.partitions.clone(), pruner: self.pruner.clone(), senders, @@ -420,8 +421,13 @@ impl SeriesScan { struct SeriesDistributor { /// Context for the scan stream. stream_ctx: Arc, - /// Optional semaphore for limiting the number of concurrent scans. - semaphore: Option>, + /// Semaphore for file scanning and range-level merging. + range_semaphore: Option>, + /// Semaphore for the final merge across all range streams. + /// Must be separate from `range_semaphore` to avoid deadlock: final merge tasks + /// hold a permit while waiting for data from range-level merge tasks, which also + /// need permits to produce data. + final_merge_semaphore: Option>, /// Partition ranges to scan. partitions: Vec>, /// Shared pruner for file range building. @@ -483,36 +489,57 @@ impl SeriesDistributor { // build part cost. let mut fetch_start = Instant::now(); - // Scans all parts. - let mut sources = Vec::with_capacity(self.partitions.len()); - let mut min_batch_size: Option = None; + // Builds one deduped stream per partition range, then merges across ranges. 
+ let build_start = Instant::now(); + let mut tasks = Vec::new(); for partition in &self.partitions { - sources.reserve(partition.len()); for part_range in partition { - let split_batch_size = build_flat_sources( - &self.stream_ctx, - part_range, - false, - &part_metrics, - partition_pruner.clone(), - &mut sources, - self.semaphore.clone(), - ) - .await?; - if let Some(size) = split_batch_size { - min_batch_size = Some(min_batch_size.map_or(size, |cur| cur.min(size))); - } + let stream_ctx = self.stream_ctx.clone(); + let part_range = *part_range; + let part_metrics = part_metrics.clone(); + let partition_pruner = partition_pruner.clone(); + let file_scan_semaphore = self.range_semaphore.clone(); + let merge_semaphore = self.range_semaphore.clone(); + tasks.push(common_runtime::spawn_global(async move { + SeqScan::build_flat_partition_range_read( + &stream_ctx, + &part_range, + false, + &part_metrics, + partition_pruner, + file_scan_semaphore, + merge_semaphore, + ) + .await + })); } } - - // Builds a flat reader that merge sources from all parts. + let mut range_streams = Vec::with_capacity(tasks.len()); + let mut estimated_batch_sizes = Vec::with_capacity(tasks.len()); + for task in tasks { + let (stream, estimated_batch_size) = task.await.context(JoinSnafu)??; + range_streams.push(stream); + estimated_batch_sizes.push(estimated_batch_size); + } let channel_size = - compute_parallel_channel_size(min_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE)); + compute_parallel_channel_size(compute_average_batch_size(estimated_batch_sizes)); + common_telemetry::debug!( + "SeriesDistributor built {} range_streams, region: {}, build cost: {:?}, channel_size: {}", + range_streams.len(), + self.stream_ctx.input.region_metadata().region_id, + build_start.elapsed(), + channel_size, + ); + + // Each partition range stream is already deduped, so skip dedup here. 
+ // Use a separate semaphore for the final merge to avoid deadlock with + // range-level merge tasks that share the range_semaphore. let mut reader = SeqScan::build_flat_reader_from_sources( &self.stream_ctx, - sources, - self.semaphore.clone(), + range_streams, + self.final_merge_semaphore.clone(), Some(&part_metrics), + true, channel_size, ) .await?; From 57f19212534b2329b1418fd4297b3acf58287eb4 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Mon, 13 Apr 2026 17:04:02 +0800 Subject: [PATCH 096/195] feat: propagate staging leader through lease and heartbeat (#7950) * feat(mito): expose staging leader role state * fix(region): clear staging metadata on leader exit * feat: propagate staging leader role through heartbeat and metasrv * chore: update comments Signed-off-by: WenyXu * fix(region): unify staging exit role transitions * chore: update proto Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- Cargo.lock | 3 +- Cargo.toml | 2 +- src/common/meta/src/datanode.rs | 23 ++ src/datanode/src/alive_keeper.rs | 138 +++++++++ src/datanode/src/heartbeat.rs | 6 +- src/datanode/src/region_server.rs | 1 + .../handler/collect_cluster_info_handler.rs | 2 +- .../handler/collect_leader_region_handler.rs | 2 +- .../src/handler/persist_stats_handler.rs | 5 +- .../src/handler/region_lease_handler.rs | 59 ++++ src/meta-srv/src/region/lease_keeper.rs | 124 +++++++- src/mito2/src/engine.rs | 10 +- .../src/engine/apply_staging_manifest_test.rs | 2 +- src/mito2/src/engine/set_role_state_test.rs | 266 +++++++++++++++++- src/mito2/src/engine/staging_test.rs | 2 +- src/mito2/src/region.rs | 173 +++++++++--- src/mito2/src/region/opener.rs | 5 +- src/mito2/src/worker/handle_apply_staging.rs | 2 +- src/mito2/src/worker/handle_enter_staging.rs | 10 +- src/store-api/src/region_engine.rs | 12 +- 20 files changed, 782 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 872095752b..4f6339d83a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5681,7 +5681,7 @@ dependencies = 
[ [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=092ba1d01e2da676dca66cca7eebb55009da8ef8#092ba1d01e2da676dca66cca7eebb55009da8ef8" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=26a50f4069f50c37d65b45e0d39ae0cb42de5425#26a50f4069f50c37d65b45e0d39ae0cb42de5425" dependencies = [ "prost 0.14.1", "prost-types 0.14.1", @@ -5691,7 +5691,6 @@ dependencies = [ "strum_macros 0.25.3", "tonic 0.14.2", "tonic-prost", - "tonic-prost-build", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 227608bf64..34e10d9173 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -154,7 +154,7 @@ etcd-client = { version = "0.17", features = [ fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "092ba1d01e2da676dca66cca7eebb55009da8ef8" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "26a50f4069f50c37d65b45e0d39ae0cb42de5425" } hex = "0.4" http = "1" humantime = "2.1" diff --git a/src/common/meta/src/datanode.rs b/src/common/meta/src/datanode.rs index 8b521d8e43..d6c6229801 100644 --- a/src/common/meta/src/datanode.rs +++ b/src/common/meta/src/datanode.rs @@ -573,4 +573,27 @@ mod tests { let region_num = stat_val.region_num().unwrap(); assert_eq!(2, region_num); } + + #[test] + fn test_region_stat_from_heartbeat_preserves_staging_leader_role() { + let request = HeartbeatRequest { + header: Some(RequestHeader::default()), + peer: Some(api::v1::meta::Peer { + id: 1, + addr: "127.0.0.1:3001".to_string(), + }), + region_stats: vec![api::v1::meta::RegionStat { + region_id: RegionId::new(1024, 1).as_u64(), + engine: "mito".to_string(), + role: api::v1::meta::RegionRole::StagingLeader.into(), + ..Default::default() + }], + ..Default::default() + }; + + let stat = Stat::try_from(&request).unwrap(); + + assert_eq!(stat.region_stats.len(), 1); + assert_eq!(stat.region_stats[0].role, 
RegionRole::StagingLeader); + } } diff --git a/src/datanode/src/alive_keeper.rs b/src/datanode/src/alive_keeper.rs index 57f4e00aa2..dbf99fdb28 100644 --- a/src/datanode/src/alive_keeper.rs +++ b/src/datanode/src/alive_keeper.rs @@ -503,6 +503,7 @@ mod test { use mito2::config::MitoConfig; use mito2::test_util::{CreateRequestBuilder, TestEnv}; use store_api::region_engine::RegionEngine; + use store_api::region_request::{EnterStagingRequest, StagingPartitionDirective}; use super::*; use crate::tests::mock_region_server; @@ -621,4 +622,141 @@ mod test { > Instant::now() + Duration::from_millis(heartbeat_interval_millis * 4) ); } + + #[tokio::test(flavor = "multi_thread")] + async fn renew_staging_leader_keeps_region_in_staging() { + let mut region_server = mock_region_server(); + let mut engine_env = TestEnv::with_prefix("region-alive-keeper-staging").await; + let engine = engine_env.create_engine(MitoConfig::default()).await; + let engine = Arc::new(engine); + region_server.register_engine(engine.clone()); + + let alive_keeper = Arc::new(RegionAliveKeeper::new( + region_server.clone(), + None, + Duration::from_millis(100), + )); + + let region_id = RegionId::new(1024, 2); + region_server + .handle_request( + region_id, + RegionRequest::Create(CreateRequestBuilder::new().build()), + ) + .await + .unwrap(); + region_server + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + + alive_keeper.register_region(region_id).await; + alive_keeper + .renew_region_leases( + &[GrantedRegion { + region_id: region_id.as_u64(), + role: api::v1::meta::RegionRole::StagingLeader.into(), + extensions: HashMap::new(), + }], + Instant::now() + Duration::from_millis(3000), + ) + .await; + + assert_eq!(engine.role(region_id).unwrap(), RegionRole::StagingLeader); + } + + #[tokio::test(flavor = "multi_thread")] + async fn renew_staging_leader_exit_into_leader() { 
+ common_telemetry::init_default_ut_logging(); + let mut region_server = mock_region_server(); + let mut engine_env = TestEnv::with_prefix("region-alive-keeper-staging-exit").await; + let engine = engine_env.create_engine(MitoConfig::default()).await; + let engine = Arc::new(engine); + region_server.register_engine(engine.clone()); + + let alive_keeper = Arc::new(RegionAliveKeeper::new( + region_server.clone(), + None, + Duration::from_millis(100), + )); + + let region_id = RegionId::new(1024, 2); + region_server + .handle_request( + region_id, + RegionRequest::Create(CreateRequestBuilder::new().build()), + ) + .await + .unwrap(); + region_server + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + + alive_keeper.register_region(region_id).await; + alive_keeper + .renew_region_leases( + &[GrantedRegion { + region_id: region_id.as_u64(), + role: api::v1::meta::RegionRole::Leader.into(), + extensions: HashMap::new(), + }], + Instant::now() + Duration::from_millis(3000), + ) + .await; + + tokio::time::sleep(Duration::from_millis(100)).await; + assert_eq!(engine.role(region_id).unwrap(), RegionRole::Leader); + } + + #[tokio::test(flavor = "multi_thread")] + async fn renew_staging_leader_does_not_promote_normal_leader_into_staging() { + let mut region_server = mock_region_server(); + let mut engine_env = TestEnv::with_prefix("region-alive-keeper-non-staging").await; + let engine = engine_env.create_engine(MitoConfig::default()).await; + let engine = Arc::new(engine); + region_server.register_engine(engine.clone()); + + let alive_keeper = Arc::new(RegionAliveKeeper::new( + region_server.clone(), + None, + Duration::from_millis(100), + )); + + let region_id = RegionId::new(1024, 4); + region_server + .handle_request( + region_id, + RegionRequest::Create(CreateRequestBuilder::new().build()), + ) + .await + .unwrap(); + region_server + 
.set_region_role(region_id, RegionRole::Leader) + .unwrap(); + + alive_keeper.register_region(region_id).await; + alive_keeper + .renew_region_leases( + &[GrantedRegion { + region_id: region_id.as_u64(), + role: api::v1::meta::RegionRole::StagingLeader.into(), + extensions: HashMap::new(), + }], + Instant::now() + Duration::from_millis(3000), + ) + .await; + tokio::time::sleep(Duration::from_millis(100)).await; + assert_eq!(engine.role(region_id).unwrap(), RegionRole::Leader); + } } diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index be662dfe94..fe8866b7f9 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -148,9 +148,9 @@ impl HeartbeatTask { let mut follower_region_lease_count = 0; for lease in &lease.regions { match lease.role() { - RegionRole::Leader | RegionRole::DowngradingLeader => { - leader_region_lease_count += 1 - } + RegionRole::Leader + | RegionRole::StagingLeader + | RegionRole::DowngradingLeader => leader_region_lease_count += 1, RegionRole::Follower => follower_region_lease_count += 1, } } diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index ec10691bea..aa3ffbfe3a 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -360,6 +360,7 @@ impl RegionServer { engine.role(region_id).map(|role| match role { RegionRole::Follower => false, RegionRole::Leader => true, + RegionRole::StagingLeader => true, RegionRole::DowngradingLeader => true, }) }) diff --git a/src/meta-srv/src/handler/collect_cluster_info_handler.rs b/src/meta-srv/src/handler/collect_cluster_info_handler.rs index c96229f9cf..3fc785a1cb 100644 --- a/src/meta-srv/src/handler/collect_cluster_info_handler.rs +++ b/src/meta-srv/src/handler/collect_cluster_info_handler.rs @@ -129,7 +129,7 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler { let leader_regions = stat .region_stats .iter() - .filter(|s| s.role == RegionRole::Leader) + .filter(|s| 
matches!(s.role, RegionRole::Leader | RegionRole::StagingLeader)) .count(); let follower_regions = stat.region_stats.len() - leader_regions; diff --git a/src/meta-srv/src/handler/collect_leader_region_handler.rs b/src/meta-srv/src/handler/collect_leader_region_handler.rs index ddb4cd0ea3..95b03e3341 100644 --- a/src/meta-srv/src/handler/collect_leader_region_handler.rs +++ b/src/meta-srv/src/handler/collect_leader_region_handler.rs @@ -40,7 +40,7 @@ impl HeartbeatHandler for CollectLeaderRegionHandler { let mut key_values = Vec::with_capacity(current_stat.region_stats.len()); for stat in current_stat.region_stats.iter() { - if stat.role != RegionRole::Leader { + if !matches!(stat.role, RegionRole::Leader | RegionRole::StagingLeader) { continue; } diff --git a/src/meta-srv/src/handler/persist_stats_handler.rs b/src/meta-srv/src/handler/persist_stats_handler.rs index 75281f982a..d863070225 100644 --- a/src/meta-srv/src/handler/persist_stats_handler.rs +++ b/src/meta-srv/src/handler/persist_stats_handler.rs @@ -121,7 +121,10 @@ fn to_persisted_if_leader( datanode_id: DatanodeId, timestamp_millis: i64, ) -> Option<(Row, PersistedRegionStat)> { - if matches!(region_stat.role, RegionRole::Leader) { + if matches!( + region_stat.role, + RegionRole::Leader | RegionRole::StagingLeader + ) { let persisted_region_stat = last_persisted_region_stats.get(&region_stat.id).map(|s| *s); Some(( compute_persist_region_stat( diff --git a/src/meta-srv/src/handler/region_lease_handler.rs b/src/meta-srv/src/handler/region_lease_handler.rs index fef84ef0db..c6c1d44521 100644 --- a/src/meta-srv/src/handler/region_lease_handler.rs +++ b/src/meta-srv/src/handler/region_lease_handler.rs @@ -398,6 +398,65 @@ mod test { assert_eq!(acc.inactive_region_ids, HashSet::from([no_exist_region_id])); } + #[tokio::test] + async fn test_handle_staging_leader() { + let datanode_id = 1; + let region_number = 1u32; + let table_id = 10; + let region_id = RegionId::new(table_id, region_number); + let peer =
Peer::empty(datanode_id); + let table_info = new_test_table_info(table_id); + + let region_routes = vec![RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(peer.clone()), + leader_state: Some(LeaderState::Staging), + ..Default::default() + }]; + + let keeper = new_test_keeper(); + let table_metadata_manager = keeper.table_metadata_manager(); + + table_metadata_manager + .create_table_metadata( + table_info, + TableRouteValue::physical(region_routes), + HashMap::default(), + ) + .await + .unwrap(); + + let builder = MetasrvBuilder::new(); + let metasrv = builder.build().await.unwrap(); + let ctx = &mut metasrv.new_ctx(); + + let req = HeartbeatRequest { + duration_since_epoch: 1234, + ..Default::default() + }; + + let acc = &mut HeartbeatAccumulator::default(); + acc.stat = Some(Stat { + id: peer.id, + region_stats: vec![new_empty_region_stat(region_id, RegionRole::StagingLeader)], + ..Default::default() + }); + + let handler = RegionLeaseHandler::new( + default_distributed_time_constants().region_lease.as_secs(), + table_metadata_manager.clone(), + Default::default(), + None, + ); + + handler.handle(&req, ctx, acc).await.unwrap(); + + assert_region_lease( + acc, + vec![GrantedRegion::new(region_id, RegionRole::StagingLeader)], + ); + } + fn assert_region_lease(acc: &HeartbeatAccumulator, expected: Vec) { let region_lease = acc.region_lease.as_ref().unwrap().clone(); let granted: Vec = region_lease diff --git a/src/meta-srv/src/region/lease_keeper.rs b/src/meta-srv/src/region/lease_keeper.rs index 6d282fb49f..ac9f7d71b9 100644 --- a/src/meta-srv/src/region/lease_keeper.rs +++ b/src/meta-srv/src/region/lease_keeper.rs @@ -63,7 +63,9 @@ fn renew_region_lease_via_region_route( if let Some(leader) = &region_route.leader_peer && leader.id == datanode_id { - let region_role = if region_route.is_leader_downgrading() { + let region_role = if region_route.is_leader_staging() { + RegionRole::StagingLeader + } else if region_route.is_leader_downgrading() {
RegionRole::DowngradingLeader } else { RegionRole::Leader @@ -313,6 +315,12 @@ mod tests { renew_region_lease_via_region_route(&region_route, leader_peer_id, region_id), Some((region_id, RegionRole::DowngradingLeader)) ); + + region_route.leader_state = Some(LeaderState::Staging); + assert_eq!( + renew_region_lease_via_region_route(&region_route, leader_peer_id, region_id), + Some((region_id, RegionRole::StagingLeader)) + ); } #[tokio::test] @@ -581,4 +589,118 @@ mod tests { ); } } + + #[tokio::test] + async fn test_renew_region_leases_reported_staging_expected_leader() { + let table_id = 1024; + let table_info: TableInfo = new_test_table_info(table_id); + + let region_id = RegionId::new(table_id, 1); + let leader_peer_id = 1024; + let region_route = RegionRouteBuilder::default() + .region(Region::new_test(region_id)) + .leader_peer(Peer::empty(leader_peer_id)) + .build() + .unwrap(); + + let keeper = new_test_keeper(); + let table_metadata_manager = keeper.table_metadata_manager(); + table_metadata_manager + .create_table_metadata( + table_info, + TableRouteValue::physical(vec![region_route]), + HashMap::default(), + ) + .await + .unwrap(); + + let RenewRegionLeasesResponse { + non_exists, + renewed, + } = keeper + .renew_region_leases(leader_peer_id, &[(region_id, RegionRole::StagingLeader)]) + .await + .unwrap(); + + assert!(non_exists.is_empty()); + assert_eq!( + renewed, + HashMap::from([( + region_id, + RegionLeaseInfo::from((region_id, RegionRole::Leader)) + )]) + ); + } + + #[tokio::test] + async fn test_renew_region_leases_reported_staging_expected_staging() { + let table_id = 1024; + let table_info: TableInfo = new_test_table_info(table_id); + + let region_id = RegionId::new(table_id, 1); + let leader_peer_id = 1024; + let region_route = RegionRouteBuilder::default() + .region(Region::new_test(region_id)) + .leader_peer(Peer::empty(leader_peer_id)) + .leader_state(LeaderState::Staging) + .build() + .unwrap(); + + let keeper = new_test_keeper(); + let
table_metadata_manager = keeper.table_metadata_manager(); + table_metadata_manager + .create_table_metadata( + table_info, + TableRouteValue::physical(vec![region_route]), + HashMap::default(), + ) + .await + .unwrap(); + + let RenewRegionLeasesResponse { + non_exists, + renewed, + } = keeper + .renew_region_leases(leader_peer_id, &[(region_id, RegionRole::StagingLeader)]) + .await + .unwrap(); + + assert!(non_exists.is_empty()); + assert_eq!( + renewed, + HashMap::from([( + region_id, + RegionLeaseInfo::from((region_id, RegionRole::StagingLeader)) + )]) + ); + } + + #[tokio::test] + async fn test_renew_region_leases_operating_region_preserves_reported_role() { + let keeper = new_test_keeper(); + let datanode_id = 1024; + let region_id = RegionId::new(2048, 1); + + let _guard = keeper + .memory_region_keeper + .register(datanode_id, region_id) + .unwrap(); + + let RenewRegionLeasesResponse { + non_exists, + renewed, + } = keeper + .renew_region_leases(datanode_id, &[(region_id, RegionRole::StagingLeader)]) + .await + .unwrap(); + + assert!(non_exists.is_empty()); + assert_eq!( + renewed, + HashMap::from([( + region_id, + RegionLeaseInfo::operating(region_id, RegionRole::StagingLeader) + )]) + ); + } } diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index d006067f0d..5bd1002581 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -1114,13 +1114,9 @@ impl EngineInner { } fn role(&self, region_id: RegionId) -> Option { - self.workers.get_region(region_id).map(|region| { - if region.is_follower() { - RegionRole::Follower - } else { - RegionRole::Leader - } - }) + self.workers + .get_region(region_id) + .map(|region| region.region_role()) } } diff --git a/src/mito2/src/engine/apply_staging_manifest_test.rs b/src/mito2/src/engine/apply_staging_manifest_test.rs index a82fcfe049..efa0713cfc 100644 --- a/src/mito2/src/engine/apply_staging_manifest_test.rs +++ b/src/mito2/src/engine/apply_staging_manifest_test.rs @@ -333,7 +333,7 @@ async fn 
test_apply_staging_manifest_success_with_format(flat_format: bool) { let staging_manifest = region.manifest_ctx.staging_manifest().await; assert!(staging_manifest.is_none()); // The staging partition expr should be cleared. - assert!(region.staging_partition_info.lock().unwrap().is_none()); + assert!(region.manifest_ctx.staging_partition_info().is_none()); // The staging manifest directory should be empty. let data_home = env.data_home(); let region_dir = format!("{}/data/test/1_0000000001", data_home.display()); diff --git a/src/mito2/src/engine/set_role_state_test.rs b/src/mito2/src/engine/set_role_state_test.rs index 4fb15ab7fe..40e03b063a 100644 --- a/src/mito2/src/engine/set_role_state_test.rs +++ b/src/mito2/src/engine/set_role_state_test.rs @@ -19,7 +19,9 @@ use store_api::region_engine::{ RegionEngine, RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState, }; -use store_api::region_request::{RegionPutRequest, RegionRequest}; +use store_api::region_request::{ + EnterStagingRequest, RegionPutRequest, RegionRequest, StagingPartitionDirective, +}; use store_api::storage::RegionId; use crate::config::MitoConfig; @@ -241,12 +243,14 @@ async fn test_unified_state_transitions_with_format(flat_format: bool) { .await .unwrap(); assert_success_response(&result, 0); + assert_eq!(engine.role(region_id), Some(RegionRole::StagingLeader)); let result = engine .set_region_role_state_gracefully(region_id, SettableRegionRoleState::Leader) .await .unwrap(); assert_success_response(&result, 0); + assert_eq!(engine.role(region_id), Some(RegionRole::Leader)); // Leader -> StagingLeader -> Follower (exit staging via demotion) engine @@ -259,6 +263,7 @@ async fn test_unified_state_transitions_with_format(flat_format: bool) { .await .unwrap(); assert_success_response(&result, 0); + assert_eq!(engine.role(region_id), Some(RegionRole::Follower)); // Note: Direct Follower -> Leader promotion is no longer allowed // Use existing set_region_role 
method for follower -> leader promotion @@ -277,6 +282,7 @@ async fn test_unified_state_transitions_with_format(flat_format: bool) { .await .unwrap(); assert_success_response(&result, 0); + assert_eq!(engine.role(region_id), Some(RegionRole::DowngradingLeader)); // Note: Direct DowngradingLeader -> Leader is no longer allowed // Use existing set_region_role method for downgrading -> leader promotion @@ -325,6 +331,264 @@ async fn test_restricted_state_transitions() { test_restricted_state_transitions_with_format(true).await; } +#[tokio::test] +async fn test_direct_set_region_role_staging_leader_is_noop() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .set_region_role(region_id, RegionRole::StagingLeader) + .unwrap(); + + assert_eq!(engine.role(region_id), Some(RegionRole::Leader)); + + engine + .set_region_role(region_id, RegionRole::Follower) + .unwrap(); + engine + .set_region_role(region_id, RegionRole::StagingLeader) + .unwrap(); + + assert_eq!(engine.role(region_id), Some(RegionRole::Follower)); +} + +#[tokio::test] +async fn test_direct_set_region_role_exits_staging_state_only() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + assert_eq!(engine.role(region_id), Some(RegionRole::StagingLeader)); + assert!( + engine + .get_region(region_id) + .unwrap() + .manifest_ctx + 
.staging_partition_info() + .is_some() + ); + + engine + .set_region_role(region_id, RegionRole::Leader) + .unwrap(); + assert_eq!(engine.role(region_id), Some(RegionRole::Leader)); + assert!( + engine + .get_region(region_id) + .unwrap() + .manifest_ctx + .staging_partition_info() + .is_none() + ); +} + +#[tokio::test] +async fn test_set_region_role_can_exit_staging_to_leader() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .await + .unwrap(); + assert_eq!(engine.role(region_id), Some(RegionRole::StagingLeader)); + + engine + .set_region_role(region_id, RegionRole::Leader) + .unwrap(); + + assert_eq!(engine.role(region_id), Some(RegionRole::Leader)); + assert!( + engine + .get_region(region_id) + .unwrap() + .manifest_ctx + .staging_partition_info() + .is_none() + ); +} + +#[tokio::test] +async fn test_set_region_role_leader_clears_staging_partition_info() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert!(region.manifest_ctx.staging_partition_info().is_some()); + + engine + .set_region_role(region_id, RegionRole::Leader) + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert_eq!(engine.role(region_id), 
Some(RegionRole::Leader)); + assert!(region.manifest_ctx.staging_partition_info().is_none()); +} + +#[tokio::test] +async fn test_set_region_role_follower_clears_staging_partition_info() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert!(region.manifest_ctx.staging_partition_info().is_some()); + + engine + .set_region_role(region_id, RegionRole::Follower) + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert_eq!(engine.role(region_id), Some(RegionRole::Follower)); + assert!(region.manifest_ctx.staging_partition_info().is_none()); +} + +#[tokio::test] +async fn test_set_region_role_downgrading_leader_clears_staging_partition_info() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert!(region.manifest_ctx.staging_partition_info().is_some()); + + engine + .set_region_role(region_id, RegionRole::DowngradingLeader) + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert_eq!(engine.role(region_id), Some(RegionRole::DowngradingLeader)); + 
assert!(region.manifest_ctx.staging_partition_info().is_none()); +} + +#[tokio::test] +async fn test_can_reenter_staging_after_direct_exit_cleanup() { + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + engine + .set_region_role(region_id, RegionRole::Follower) + .unwrap(); + engine + .set_region_role(region_id, RegionRole::Leader) + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_directive: StagingPartitionDirective::RejectAllWrites, + }), + ) + .await + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + assert_eq!(engine.role(region_id), Some(RegionRole::StagingLeader)); + assert!(region.manifest_ctx.staging_partition_info().is_some()); +} + async fn test_restricted_state_transitions_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env diff --git a/src/mito2/src/engine/staging_test.rs b/src/mito2/src/engine/staging_test.rs index bd90779e0b..9846933d1f 100644 --- a/src/mito2/src/engine/staging_test.rs +++ b/src/mito2/src/engine/staging_test.rs @@ -547,7 +547,7 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) { .await .unwrap(); let region = engine.get_region(region_id).unwrap(); - let staging_partition_info = region.staging_partition_info.lock().unwrap().clone(); + let staging_partition_info = region.manifest_ctx.staging_partition_info(); assert_eq!( staging_partition_info .unwrap() diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index 26ab96c779..3804b28afb 100644 --- 
a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -156,11 +156,6 @@ pub struct MitoRegion { pub(crate) topic_latest_entry_id: AtomicU64, /// The total bytes written to the region. pub(crate) written_bytes: Arc, - /// Partition info of the region in staging mode. - /// - /// During the staging mode, the region metadata in [`VersionControlRef`] is not updated, - /// so we need to store the partition info separately. - pub(crate) staging_partition_info: Mutex>, /// manifest stats stats: ManifestStats, } @@ -333,6 +328,17 @@ impl MitoRegion { self.manifest_ctx.set_role(next_role, self.region_id); } + pub(crate) fn region_role(&self) -> RegionRole { + match self.state() { + RegionRoleState::Follower => RegionRole::Follower, + RegionRoleState::Leader(RegionLeaderState::Staging) => RegionRole::StagingLeader, + RegionRoleState::Leader(RegionLeaderState::Downgrading) => { + RegionRole::DowngradingLeader + } + RegionRoleState::Leader(_) => RegionRole::Leader, + } + } + /// Sets the altering state. /// You should call this method in the worker loop. pub(crate) fn set_altering(&self) -> Result<()> { @@ -393,9 +399,8 @@ impl MitoRegion { /// You should call this method in the worker loop. /// Transitions from Staging to Writable state. 
pub fn exit_staging(&self) -> Result<()> { - *self.staging_partition_info.lock().unwrap() = None; - self.compare_exchange_state( - RegionLeaderState::Staging, + self.manifest_ctx.exit_staging( + self.region_id, RegionRoleState::Leader(RegionLeaderState::Writable), ) } @@ -819,7 +824,7 @@ impl MitoRegion { pub fn maybe_staging_partition_expr_str(&self) -> Option { let is_staging = self.is_staging(); if is_staging { - let staging_partition_info = self.staging_partition_info.lock().unwrap(); + let staging_partition_info = self.manifest_ctx.staging_partition_info(); if staging_partition_info.is_none() { warn!( "Staging partition expr is none for region {} in staging state", @@ -837,8 +842,8 @@ impl MitoRegion { pub fn expected_partition_expr_version(&self) -> u64 { if self.is_staging() { - let staging_partition_info = self.staging_partition_info.lock().unwrap(); - staging_partition_info + self.manifest_ctx + .staging_partition_info() .as_ref() .map(|info| info.partition_rule_version) .unwrap_or_default() @@ -852,8 +857,8 @@ impl MitoRegion { if !self.is_staging() { return false; } - let staging_partition_info = self.staging_partition_info.lock().unwrap(); - staging_partition_info + self.manifest_ctx + .staging_partition_info() .as_ref() .map(|info| { matches!( @@ -873,6 +878,11 @@ pub(crate) struct ManifestContext { /// The state of the region. The region checks the state before updating /// manifest. state: AtomicCell, + /// Partition info of the region in staging mode. + /// + /// During the staging mode, the region metadata in [`VersionControlRef`] is not updated, + /// so we need to store the partition info separately. 
+ staging_partition_info: Mutex>, } impl ManifestContext { @@ -880,9 +890,46 @@ impl ManifestContext { ManifestContext { manifest_manager: tokio::sync::RwLock::new(manager), state: AtomicCell::new(state), + staging_partition_info: Mutex::new(None), } } + pub(crate) fn staging_partition_info(&self) -> Option { + self.staging_partition_info.lock().unwrap().clone() + } + + pub(crate) fn set_staging_partition_info(&self, staging_partition_info: StagingPartitionInfo) { + let mut current = self.staging_partition_info.lock().unwrap(); + debug_assert!(current.is_none()); + *current = Some(staging_partition_info); + } + + fn clear_staging_partition_info(&self) { + *self.staging_partition_info.lock().unwrap() = None; + } + + pub(crate) fn exit_staging( + &self, + region_id: RegionId, + next_state: RegionRoleState, + ) -> Result<()> { + self.state + .compare_exchange( + RegionRoleState::Leader(RegionLeaderState::Staging), + next_state, + ) + .map_err(|actual| { + RegionStateSnafu { + region_id, + state: actual, + expect: RegionRoleState::Leader(RegionLeaderState::Staging), + } + .build() + })?; + self.clear_staging_partition_info(); + Ok(()) + } + pub(crate) async fn manifest_version(&self) -> ManifestVersion { self.manifest_manager .read() @@ -1028,27 +1075,50 @@ impl ManifestContext { /// Sets the [`RegionRole`]. 
/// /// ```text - /// +------------------------------------------+ - /// | +-----------------+ | - /// | | | | - /// +---+------+ +-------+-----+ +--v-v---+ - /// | Follower | | Downgrading | | Leader | - /// +---^-^----+ +-----+-^-----+ +--+-+---+ - /// | | | | | | - /// | +------------------+ +-----------------+ | - /// +------------------------------------------+ - /// - /// Transition: - /// - Follower -> Leader - /// - Downgrading Leader -> Leader - /// - Leader -> Follower - /// - Downgrading Leader -> Follower - /// - Leader -> Downgrading Leader + /// +---------------------+ + /// | Staging Leader | + /// +----------+----------+ + /// | + /// v + /// +----------+ +------+-------+ +-------------+ + /// | Follower | <-> | Leader | <-> | Downgrading | + /// +-----+----+ +------+-------+ +------+------+ + /// ^ ^ | + /// +-----------------+--------------------+ /// /// ``` + /// + /// # State Transitions + /// + /// From `Follower`: + /// - `Follower -> Leader` + /// + /// From `Leader`: + /// - `Leader -> Follower` + /// - `Leader -> Downgrading Leader` + /// + /// From `Staging Leader`: + /// - `Staging Leader -> Leader` + /// - `Staging Leader -> Follower` + /// - `Staging Leader -> Downgrading Leader` + /// + /// From `Downgrading Leader`: + /// - `Downgrading Leader -> Leader` + /// - `Downgrading Leader -> Follower` pub(crate) fn set_role(&self, next_role: RegionRole, region_id: RegionId) { match next_role { RegionRole::Follower => { + if self + .exit_staging(region_id, RegionRoleState::Follower) + .is_ok() + { + info!( + "Convert region {} to follower, previous role state: {:?}", + region_id, + RegionRoleState::Leader(RegionLeaderState::Staging) + ); + return; + } match self.state.fetch_update(|state| { if !matches!(state, RegionRoleState::Follower) { Some(RegionRoleState::Follower) @@ -1071,6 +1141,20 @@ impl ManifestContext { } } RegionRole::Leader => { + if self + .exit_staging( + region_id, + RegionRoleState::Leader(RegionLeaderState::Writable), + ) 
+ .is_ok() + { + info!( + "Convert region {} to leader, previous role state: {:?}", + region_id, + RegionRoleState::Leader(RegionLeaderState::Staging) + ); + return; + } match self.state.fetch_update(|state| { if matches!( state, @@ -1096,7 +1180,27 @@ impl ManifestContext { } } } + RegionRole::StagingLeader => { + info!( + "Ignore direct conversion of region {} to staging leader; staging requires the dedicated workflow", + region_id + ); + } RegionRole::DowngradingLeader => { + if self + .exit_staging( + region_id, + RegionRoleState::Leader(RegionLeaderState::Downgrading), + ) + .is_ok() + { + info!( + "Convert region {} to downgrading region, previous role state: {:?}", + region_id, + RegionRoleState::Leader(RegionLeaderState::Staging) + ); + return; + } match self.state.compare_exchange( RegionRoleState::Leader(RegionLeaderState::Writable), RegionRoleState::Leader(RegionLeaderState::Downgrading), @@ -1438,8 +1542,8 @@ pub fn parse_partition_expr(partition_expr_str: Option<&str>) -> Result StagingLeader should be ignored. 
+ manifest_ctx.set_role(RegionRole::StagingLeader, region_id); + assert_eq!( + manifest_ctx.state.load(), + RegionRoleState::Leader(RegionLeaderState::Writable) + ); + // Leader -> Downgrading Leader manifest_ctx.set_role(RegionRole::DowngradingLeader, region_id); assert_eq!( @@ -1825,7 +1935,6 @@ mod tests { topic_latest_entry_id: Default::default(), written_bytes: Arc::new(AtomicU64::new(0)), stats: ManifestStats::default(), - staging_partition_info: Mutex::new(None), }; // Test initial state diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index b23e73557d..c1240c3829 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -17,7 +17,7 @@ use std::any::TypeId; use std::collections::HashMap; use std::sync::atomic::{AtomicI64, AtomicU64}; -use std::sync::{Arc, LazyLock, Mutex}; +use std::sync::{Arc, LazyLock}; use std::time::Instant; use common_telemetry::{debug, error, info, warn}; @@ -349,7 +349,6 @@ impl RegionOpener { topic_latest_entry_id: AtomicU64::new(0), written_bytes: Arc::new(AtomicU64::new(0)), stats: self.stats, - staging_partition_info: Mutex::new(None), })) } @@ -586,8 +585,6 @@ impl RegionOpener { topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id), written_bytes: Arc::new(AtomicU64::new(0)), stats: self.stats.clone(), - // TODO(weny): reload the staging partition info from the manifest. 
- staging_partition_info: Mutex::new(None), }; let region = Arc::new(region); diff --git a/src/mito2/src/worker/handle_apply_staging.rs b/src/mito2/src/worker/handle_apply_staging.rs index e773150356..876d5c3c31 100644 --- a/src/mito2/src/worker/handle_apply_staging.rs +++ b/src/mito2/src/worker/handle_apply_staging.rs @@ -75,7 +75,7 @@ impl RegionWorkerLoop { return; } - let staging_partition_info = region.staging_partition_info.lock().unwrap().clone(); + let staging_partition_info = region.manifest_ctx.staging_partition_info(); let staging_partition_expr = staging_partition_info .as_ref() diff --git a/src/mito2/src/worker/handle_enter_staging.rs b/src/mito2/src/worker/handle_enter_staging.rs index 8b75fdd24f..83bd51df15 100644 --- a/src/mito2/src/worker/handle_enter_staging.rs +++ b/src/mito2/src/worker/handle_enter_staging.rs @@ -42,7 +42,7 @@ impl RegionWorkerLoop { // If the region is already in staging mode, verify the partition directive matches. if region.is_staging() { - let staging_partition_info = region.staging_partition_info.lock().unwrap().clone(); + let staging_partition_info = region.manifest_ctx.staging_partition_info(); // If the partition directive mismatches, return error. 
if staging_partition_info .as_ref() @@ -279,10 +279,8 @@ impl RegionWorkerLoop { region: &MitoRegionRef, partition_directive: StagingPartitionDirective, ) { - let mut staging_partition_info = region.staging_partition_info.lock().unwrap(); - debug_assert!(staging_partition_info.is_none()); - *staging_partition_info = Some(StagingPartitionInfo::from_partition_directive( - partition_directive, - )); + region.manifest_ctx.set_staging_partition_info( + StagingPartitionInfo::from_partition_directive(partition_directive), + ); } } diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index 287f64d225..b235fcffc7 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -67,7 +67,7 @@ impl From<SettableRegionRoleState> for RegionRole { SettableRegionRoleState::Follower => RegionRole::Follower, SettableRegionRoleState::DowngradingLeader => RegionRole::DowngradingLeader, SettableRegionRoleState::Leader => RegionRole::Leader, - SettableRegionRoleState::StagingLeader => RegionRole::Leader, // Still a leader role + SettableRegionRoleState::StagingLeader => RegionRole::StagingLeader, } } } @@ -210,6 +210,11 @@ pub enum RegionRole { Follower, // Writable region(mito2), Readonly region(file). Leader, + // Leader is in staging mode. + // + // This is leader-like and writable, but it follows the staging workflow + // semantics instead of a normal leader's steady state. + StagingLeader, // Leader is downgrading to follower. // // This state is used to prevent new write requests.
@@ -221,6 +226,7 @@ impl Display for RegionRole { match self { RegionRole::Follower => write!(f, "Follower"), RegionRole::Leader => write!(f, "Leader"), + RegionRole::StagingLeader => write!(f, "Leader(Staging)"), RegionRole::DowngradingLeader => write!(f, "Leader(Downgrading)"), } } @@ -228,7 +234,7 @@ impl Display for RegionRole { impl RegionRole { pub fn writable(&self) -> bool { - matches!(self, RegionRole::Leader) + matches!(self, RegionRole::Leader | RegionRole::StagingLeader) } } @@ -237,6 +243,7 @@ impl From<RegionRole> for PbRegionRole { match value { RegionRole::Follower => PbRegionRole::Follower, RegionRole::Leader => PbRegionRole::Leader, + RegionRole::StagingLeader => PbRegionRole::StagingLeader, RegionRole::DowngradingLeader => PbRegionRole::DowngradingLeader, } } @@ -246,6 +253,7 @@ impl From<PbRegionRole> for RegionRole { fn from(value: PbRegionRole) -> Self { match value { PbRegionRole::Leader => RegionRole::Leader, + PbRegionRole::StagingLeader => RegionRole::StagingLeader, PbRegionRole::Follower => RegionRole::Follower, PbRegionRole::DowngradingLeader => RegionRole::DowngradingLeader, } From a24c58e25c5ab209d9034088073e894800e25a7b Mon Sep 17 00:00:00 2001 From: Yingwen Date: Mon, 13 Apr 2026 17:11:38 +0800 Subject: [PATCH 097/195] chore: fix git cliff errors in latest version (#7947) * chore: fix git cliff errors in latest version - Fix errors in v2.12.0 - Do not generate logs for beta/rc tags between the compared commits Signed-off-by: evenyag * chore: preserve blank line before release date in changelog Signed-off-by: evenyag --------- Signed-off-by: evenyag --- cliff.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cliff.toml b/cliff.toml index 4245203e92..2b35ddab5c 100644 --- a/cliff.toml +++ b/cliff.toml @@ -12,7 +12,9 @@ footer = "" body = """ # {{ version }} +{% if timestamp -%} Release date: {{ timestamp | date(format="%B %d, %Y") }} +{% endif -%} {%- set breakings = commits | filter(attribute="breaking", value=true) -%} {%- if 
breakings | length > 0 %} @@ -118,7 +120,10 @@ filter_commits = false # regex for skipping tags # skip_tags = "" # regex for ignoring tags -ignore_tags = ".*-nightly-.*" +# Ignore nightly tags and build-suffixed release tags such as +# v1.0.0-rc.2-13cdfa9b5-20260325-1774407105 so their commits are merged into +# the next visible release section instead of creating extra headings. +ignore_tags = ".*-nightly-.*|^v[0-9]+\\.[0-9]+\\.[0-9]+(-(alpha|beta|rc)\\.[0-9]+)?-[0-9a-f]{7,}-[0-9]{8}-[0-9]+$" # sort the tags topologically topo_order = false # sort the commits inside sections by oldest/newest order From 3750819f937d13408012ae36f621311c780fb7eb Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Mon, 13 Apr 2026 21:04:11 +0800 Subject: [PATCH 098/195] fix: match term zh (#7952) * fix: match term zh Signed-off-by: discord9 * chore: per gemini Signed-off-by: discord9 * chore: revert accident change Signed-off-by: discord9 * feat: unicode script han Signed-off-by: discord9 --------- Signed-off-by: discord9 --- Cargo.lock | 1 + Cargo.toml | 1 + src/common/function/Cargo.toml | 1 + .../function/src/scalars/matches_term.rs | 170 ++++++++++++++---- src/index/src/fulltext_index/tokenizer.rs | 20 +++ .../common/function/matches_term.result | 65 +++++++ .../common/function/matches_term.sql | 10 ++ 7 files changed, 238 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f6339d83a..68d1dac297 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2348,6 +2348,7 @@ dependencies = [ "geohash", "h3o", "hyperloglogplus", + "icu_properties", "jsonb", "jsonpath-rust 0.7.5", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 34e10d9173..66c35acee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ humantime = "2.1" humantime-serde = "1.1" hyper = "1.1" hyper-util = "0.1" +icu_properties = "2.0.1" itertools = "0.14" jsonb = { version = "0.4.4", default-features = false } lazy_static = "1.4" diff --git 
a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index d164b9285d..43ddf9ae0c 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -47,6 +47,7 @@ geo-types = { version = "0.7", optional = true } geohash = { version = "0.13", optional = true } h3o = { version = "0.6", optional = true } hyperloglogplus = "0.4" +icu_properties.workspace = true jsonb.workspace = true jsonpath-rust = "0.7.5" memchr = "2.7" diff --git a/src/common/function/src/scalars/matches_term.rs b/src/common/function/src/scalars/matches_term.rs index 8dfb25cbc0..ec1b34d408 100644 --- a/src/common/function/src/scalars/matches_term.rs +++ b/src/common/function/src/scalars/matches_term.rs @@ -20,6 +20,8 @@ use datafusion_common::arrow::compute; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use icu_properties::props::Script; +use icu_properties::{CodePointMapData, CodePointMapDataBorrowed}; use memchr::memmem; use crate::function::Function; @@ -27,10 +29,11 @@ use crate::function_registry::FunctionRegistry; /// Exact term/phrase matching function for text columns. /// -/// This function checks if a text column contains exact term/phrase matches -/// with non-alphanumeric boundaries. Designed for: -/// - Whole-word matching (e.g. "cat" in "cat!" but not in "category") +/// This function uses script-aware matching rules: +/// - ASCII-only terms keep whole-word style boundary matching, like Whole-word matching (e.g. "cat" in "cat!" but not in "category") /// - Phrase matching (e.g. 
"hello world" in "note:hello world!") +/// - Terms containing Han characters match as contiguous substrings +/// - Mixed-script identifiers and numeric terms remain searchable in Chinese text /// /// # Signature /// `matches_term(text: String, term: String) -> Boolean` @@ -43,9 +46,8 @@ use crate::function_registry::FunctionRegistry; /// BooleanVector where each element indicates if the corresponding text /// contains an exact match of the term, following these rules: /// 1. Exact substring match found (case-sensitive) -/// 2. Match boundaries are either: -/// - Start/end of text -/// - Any non-alphanumeric character (including spaces, hyphens, punctuation, etc.) +/// 2. For ASCII-only terms, adjacent ASCII word characters block the match +/// 3. For Han-containing terms, contiguous substring match is sufficient /// /// # Examples /// ``` @@ -60,6 +62,9 @@ use crate::function_registry::FunctionRegistry; /// SELECT matches_term(column, 'critical error') FROM logs; /// -- Match in: "ERROR:critical error!" /// -- No match: "critical_errors" +/// -- Chinese substring examples -- +/// SELECT matches_term(column, '手机') FROM table; +/// -- Text: "登录手机号18888888888的动态key" => true /// /// -- Empty string handling -- /// SELECT matches_term(column, '') FROM table; @@ -204,9 +209,8 @@ impl Function for MatchesTermFunction { /// /// A term is considered matched when: /// 1. The exact sequence appears in the text -/// 2. It is either: -/// - At the start/end of text with adjacent non-alphanumeric character -/// - Surrounded by non-alphanumeric characters +/// 2. ASCII-only terms are not adjacent to ASCII word characters +/// 3. 
Han-containing terms match as contiguous substrings /// /// # Examples /// ``` @@ -215,28 +219,105 @@ impl Function for MatchesTermFunction { /// assert!(finder.find("dog,cat")); // Term preceded by comma /// assert!(!finder.find("category")); // Partial match rejected /// -/// let finder = MatchesTermFinder::new("world"); -/// assert!(finder.find("hello-world")); // Hyphen boundary +/// let finder = MatchesTermFinder::new("手机"); +/// assert!(finder.find("登录手机号18888888888的动态key")); /// ``` #[derive(Clone, Debug)] pub struct MatchesTermFinder { finder: memmem::Finder<'static>, term: String, - starts_with_non_alnum: bool, - ends_with_non_alnum: bool, + term_kind: TermKind, + starts_with_other: bool, + ends_with_other: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum CharClass { + AsciiWord, + Han, + UnicodeWord, + Other, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum TermKind { + AsciiLike, + UnicodeWord, + HanContaining, +} + +fn classify_char(c: char) -> CharClass { + if c.is_ascii_alphanumeric() { + CharClass::AsciiWord + } else if is_han(c) { + CharClass::Han + } else if c.is_alphanumeric() { + CharClass::UnicodeWord + } else { + CharClass::Other + } +} + +static HAN_SCRIPT_DATA: CodePointMapDataBorrowed<'static, Script> = + CodePointMapData::