feat: add validate method to CreateExpr (#3772)

* feat: add validate method to CreateExpr

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness reproducer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* verify region create request

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix existing test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add tailing empty line

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more validation

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* disable metric table fuzz

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* minor refactor

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-04-24 15:29:10 +08:00
committed by GitHub
parent 659d34a170
commit df01ac05a1
9 changed files with 345 additions and 85 deletions

View File

@@ -25,7 +25,7 @@ use store_api::region_request::{
AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionPutRequest, RegionRequest,
};
use store_api::storage::consts::ReservedColumnId;
use store_api::storage::RegionId;
use store_api::storage::{ConcreteDataType, RegionId};
use crate::error::{
ColumnTypeMismatchSnafu, MitoReadOperationSnafu, MitoWriteOperationSnafu, Result,
@@ -128,7 +128,8 @@ impl DataRegion {
if c.semantic_type == SemanticType::Tag {
if !c.column_schema.data_type.is_string() {
return ColumnTypeMismatchSnafu {
column_type: c.column_schema.data_type.clone(),
expect: ConcreteDataType::string_datatype(),
actual: c.column_schema.data_type.clone(),
}
.fail();
}

View File

@@ -43,9 +43,11 @@ use crate::engine::options::{
};
use crate::engine::MetricEngineInner;
use crate::error::{
ColumnNotFoundSnafu, ConflictRegionOptionSnafu, CreateMitoRegionSnafu,
InternalColumnOccupiedSnafu, MissingRegionOptionSnafu, MitoReadOperationSnafu,
ParseRegionIdSnafu, PhysicalRegionNotFoundSnafu, Result, SerializeColumnMetadataSnafu,
AddingFieldColumnSnafu, ColumnNotFoundSnafu, ColumnTypeMismatchSnafu,
ConflictRegionOptionSnafu, CreateMitoRegionSnafu, InternalColumnOccupiedSnafu,
InvalidMetadataSnafu, MissingRegionOptionSnafu, MitoReadOperationSnafu,
MultipleFieldColumnSnafu, NoFieldColumnSnafu, ParseRegionIdSnafu, PhysicalRegionNotFoundSnafu,
Result, SerializeColumnMetadataSnafu,
};
use crate::metrics::{LOGICAL_REGION_COUNT, PHYSICAL_COLUMN_COUNT, PHYSICAL_REGION_COUNT};
use crate::utils::{to_data_region_id, to_metadata_region_id};
@@ -191,6 +193,14 @@ impl MetricEngineInner {
})?;
for col in &request.column_metadatas {
if !physical_columns.contains(&col.column_schema.name) {
// Multi-field on physical table is explicit forbidden at present
// TODO(ruihang): support multi-field on both logical and physical column
ensure!(
col.semantic_type != SemanticType::Field,
AddingFieldColumnSnafu {
name: col.column_schema.name.clone()
}
);
new_columns.push(col.clone());
} else {
existing_columns.push(col.column_schema.name.clone());
@@ -290,6 +300,8 @@ impl MetricEngineInner {
/// - required table option is present ([PHYSICAL_TABLE_METADATA_KEY] or
/// [LOGICAL_TABLE_METADATA_KEY])
fn verify_region_create_request(request: &RegionCreateRequest) -> Result<()> {
request.validate().context(InvalidMetadataSnafu)?;
let name_to_index = request
.column_metadatas
.iter()
@@ -323,6 +335,41 @@ impl MetricEngineInner {
ConflictRegionOptionSnafu {}
);
// check if only one field column is declared, and all tag columns are string
let mut field_col: Option<&ColumnMetadata> = None;
for col in &request.column_metadatas {
match col.semantic_type {
SemanticType::Tag => ensure!(
col.column_schema.data_type == ConcreteDataType::string_datatype(),
ColumnTypeMismatchSnafu {
expect: ConcreteDataType::string_datatype(),
actual: col.column_schema.data_type.clone(),
}
),
SemanticType::Field => {
if field_col.is_some() {
MultipleFieldColumnSnafu {
previous: field_col.unwrap().column_schema.name.clone(),
current: col.column_schema.name.clone(),
}
.fail()?;
}
field_col = Some(col)
}
SemanticType::Timestamp => {}
}
}
let field_col = field_col.context(NoFieldColumnSnafu)?;
// make sure the field column is float64 type
ensure!(
field_col.column_schema.data_type == ConcreteDataType::float64_datatype(),
ColumnTypeMismatchSnafu {
expect: ConcreteDataType::float64_datatype(),
actual: field_col.column_schema.data_type.clone(),
}
);
Ok(())
}
@@ -531,6 +578,15 @@ mod test {
false,
),
},
ColumnMetadata {
column_id: 2,
semantic_type: SemanticType::Field,
column_schema: ColumnSchema::new(
"column2".to_string(),
ConcreteDataType::float64_datatype(),
false,
),
},
],
region_dir: "test_dir".to_string(),
engine: METRIC_ENGINE_NAME.to_string(),
@@ -539,37 +595,51 @@ mod test {
.into_iter()
.collect(),
};
let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_ok());
MetricEngineInner::verify_region_create_request(&request).unwrap();
}
#[test]
fn test_verify_region_create_request_options() {
let mut request = RegionCreateRequest {
column_metadatas: vec![],
column_metadatas: vec![
ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Timestamp,
column_schema: ColumnSchema::new(
METADATA_SCHEMA_TIMESTAMP_COLUMN_NAME,
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
},
ColumnMetadata {
column_id: 1,
semantic_type: SemanticType::Field,
column_schema: ColumnSchema::new(
"val".to_string(),
ConcreteDataType::float64_datatype(),
false,
),
},
],
region_dir: "test_dir".to_string(),
engine: METRIC_ENGINE_NAME.to_string(),
primary_key: vec![],
options: HashMap::new(),
};
let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_err());
MetricEngineInner::verify_region_create_request(&request).unwrap_err();
let mut options = HashMap::new();
options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), "value".to_string());
request.options.clone_from(&options);
let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_ok());
MetricEngineInner::verify_region_create_request(&request).unwrap();
options.insert(LOGICAL_TABLE_METADATA_KEY.to_string(), "value".to_string());
request.options.clone_from(&options);
let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_err());
MetricEngineInner::verify_region_create_request(&request).unwrap_err();
options.remove(PHYSICAL_TABLE_METADATA_KEY).unwrap();
request.options = options;
let result = MetricEngineInner::verify_region_create_request(&request);
assert!(result.is_ok());
MetricEngineInner::verify_region_create_request(&request).unwrap();
}
#[tokio::test]

View File

@@ -133,9 +133,10 @@ pub enum Error {
location: Location,
},
#[snafu(display("Column type mismatch. Expect string, got {:?}", column_type))]
#[snafu(display("Column type mismatch. Expect {:?}, got {:?}", expect, actual))]
ColumnTypeMismatch {
column_type: ConcreteDataType,
expect: ConcreteDataType,
actual: ConcreteDataType,
location: Location,
},
@@ -169,6 +170,19 @@ pub enum Error {
request: RegionRequest,
location: Location,
},
#[snafu(display("Multiple field column found: {} and {}", previous, current))]
MultipleFieldColumn {
previous: String,
current: String,
location: Location,
},
#[snafu(display("Adding field column {} to physical table", name))]
AddingFieldColumn { name: String, location: Location },
#[snafu(display("No field column found"))]
NoFieldColumn { location: Location },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -182,7 +196,10 @@ impl ErrorExt for Error {
| MissingRegionOption { .. }
| ConflictRegionOption { .. }
| ColumnTypeMismatch { .. }
| PhysicalRegionBusy { .. } => StatusCode::InvalidArguments,
| PhysicalRegionBusy { .. }
| MultipleFieldColumn { .. }
| NoFieldColumn { .. }
| AddingFieldColumn { .. } => StatusCode::InvalidArguments,
ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => {
StatusCode::Unsupported

View File

@@ -210,9 +210,9 @@ pub fn create_logical_region_request(
),
},
];
for tag in tags {
for (bias, tag) in tags.iter().enumerate() {
column_metadatas.push(ColumnMetadata {
column_id: 2,
column_id: 2 + bias as ColumnId,
semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new(
tag.to_string(),

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use api::helper::ColumnDataTypeWrapper;
use api::v1::alter_expr::Kind;
@@ -31,7 +31,7 @@ use query::sql::{
};
use session::context::QueryContextRef;
use session::table_name::table_idents_to_full_name;
use snafu::{ensure, ResultExt};
use snafu::{ensure, OptionExt, ResultExt};
use sql::ast::{ColumnDef, ColumnOption, TableConstraint};
use sql::statements::alter::{AlterTable, AlterTableOperation};
use sql::statements::create::{CreateExternalTable, CreateTable, TIME_INDEX};
@@ -214,9 +214,72 @@ pub fn create_to_expr(create: &CreateTable, query_ctx: QueryContextRef) -> Resul
table_id: None,
engine: create.engine.to_string(),
};
validate_create_expr(&expr)?;
Ok(expr)
}
/// Validate the [`CreateTableExpr`] request.
pub fn validate_create_expr(create: &CreateTableExpr) -> Result<()> {
// construct column list
let mut column_to_indices = HashMap::with_capacity(create.column_defs.len());
for (idx, column) in create.column_defs.iter().enumerate() {
if let Some(indices) = column_to_indices.get(&column.name) {
return InvalidSqlSnafu {
err_msg: format!(
"column name `{}` is duplicated at index {} and {}",
column.name, indices, idx
),
}
.fail();
}
column_to_indices.insert(&column.name, idx);
}
// verify time_index exists
let _ = column_to_indices
.get(&create.time_index)
.with_context(|| InvalidSqlSnafu {
err_msg: format!(
"column name `{}` is not found in column list",
create.time_index
),
})?;
// verify primary_key exists
for pk in &create.primary_keys {
let _ = column_to_indices
.get(&pk)
.with_context(|| InvalidSqlSnafu {
err_msg: format!("column name `{}` is not found in column list", pk),
})?;
}
// construct primary_key set
let mut pk_set = HashSet::new();
for pk in &create.primary_keys {
if !pk_set.insert(pk) {
return InvalidSqlSnafu {
err_msg: format!("column name `{}` is duplicated in primary keys", pk),
}
.fail();
}
}
// verify time index is not primary key
if pk_set.contains(&create.time_index) {
return InvalidSqlSnafu {
err_msg: format!(
"column name `{}` is both primary key and time index",
create.time_index
),
}
.fail();
}
Ok(())
}
fn find_primary_keys(
columns: &[ColumnDef],
constraints: &[TableConstraint],
@@ -457,6 +520,33 @@ mod tests {
);
}
#[test]
fn test_invalid_create_to_expr() {
let cases = [
// duplicate column declaration
"CREATE TABLE monitor (host STRING primary key, ts TIMESTAMP TIME INDEX, some_column text, some_column string);",
// duplicate primary key
"CREATE TABLE monitor (host STRING, ts TIMESTAMP TIME INDEX, some_column STRING, PRIMARY KEY (some_column, host, some_column));",
// time index is primary key
"CREATE TABLE monitor (host STRING, ts TIMESTAMP TIME INDEX, PRIMARY KEY (host, ts));"
];
for sql in cases {
let stmt = ParserContext::create_with_dialect(
sql,
&GreptimeDbDialect {},
ParseOptions::default(),
)
.unwrap()
.pop()
.unwrap();
let Statement::CreateTable(create_table) = stmt else {
unreachable!()
};
create_to_expr(&create_table, QueryContext::arc()).unwrap_err();
}
}
#[test]
fn test_create_to_expr_with_default_timestamp_value() {
let sql = "CREATE TABLE monitor (v double,ts TIMESTAMP default '2024-01-30T00:01:01',TIME INDEX (ts)) engine=mito;";

View File

@@ -247,6 +247,53 @@ pub struct RegionCreateRequest {
pub region_dir: String,
}
impl RegionCreateRequest {
/// Checks whether the request is valid, returns an error if it is invalid.
pub fn validate(&self) -> Result<()> {
// time index must exist
ensure!(
self.column_metadatas
.iter()
.any(|x| x.semantic_type == SemanticType::Timestamp),
InvalidRegionRequestSnafu {
region_id: RegionId::new(0, 0),
err: "missing timestamp column in create region request".to_string(),
}
);
// build column id to indices
let mut column_id_to_indices = HashMap::with_capacity(self.column_metadatas.len());
for (i, c) in self.column_metadatas.iter().enumerate() {
if let Some(previous) = column_id_to_indices.insert(c.column_id, i) {
return InvalidRegionRequestSnafu {
region_id: RegionId::new(0, 0),
err: format!(
"duplicate column id {} (at position {} and {}) in create region request",
c.column_id, previous, i
),
}
.fail();
}
}
// primary key must exist
for column_id in &self.primary_key {
ensure!(
column_id_to_indices.contains_key(column_id),
InvalidRegionRequestSnafu {
region_id: RegionId::new(0, 0),
err: format!(
"missing primary key column {} in create region request",
column_id
),
}
);
}
Ok(())
}
}
#[derive(Debug, Clone, Default)]
pub struct RegionDropRequest {}
@@ -965,4 +1012,46 @@ mod tests {
metadata.schema_version = 1;
request.validate(&metadata).unwrap();
}
#[test]
fn test_validate_create_region() {
let column_metadatas = vec![
ColumnMetadata {
column_schema: ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
semantic_type: SemanticType::Timestamp,
column_id: 1,
},
ColumnMetadata {
column_schema: ColumnSchema::new(
"tag_0",
ConcreteDataType::string_datatype(),
true,
),
semantic_type: SemanticType::Tag,
column_id: 2,
},
ColumnMetadata {
column_schema: ColumnSchema::new(
"field_0",
ConcreteDataType::string_datatype(),
true,
),
semantic_type: SemanticType::Field,
column_id: 3,
},
];
let create = RegionCreateRequest {
engine: "mito".to_string(),
column_metadatas,
primary_key: vec![3, 4],
options: HashMap::new(),
region_dir: "path".to_string(),
};
assert!(create.validate().is_err());
}
}

View File

@@ -61,34 +61,19 @@ impl Arbitrary<'_> for FuzzInput {
fn generate_expr(input: FuzzInput) -> Result<CreateTableExpr> {
let mut rng = ChaChaRng::seed_from_u64(input.seed);
let metric_engine = rng.gen_bool(0.5);
let if_not_exists = rng.gen_bool(0.5);
if metric_engine {
let create_table_generator = CreateTableExprGeneratorBuilder::default()
.name_generator(Box::new(MappedGenerator::new(
WordGenerator,
merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map),
)))
.columns(input.columns)
.engine("metric")
.if_not_exists(if_not_exists)
.with_clause([("physical_metric_table".to_string(), "".to_string())])
.build()
.unwrap();
create_table_generator.generate(&mut rng)
} else {
let create_table_generator = CreateTableExprGeneratorBuilder::default()
.name_generator(Box::new(MappedGenerator::new(
WordGenerator,
merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map),
)))
.columns(input.columns)
.engine("mito")
.if_not_exists(if_not_exists)
.build()
.unwrap();
create_table_generator.generate(&mut rng)
}
let create_table_generator = CreateTableExprGeneratorBuilder::default()
.name_generator(Box::new(MappedGenerator::new(
WordGenerator,
merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map),
)))
.columns(input.columns)
.engine("mito")
.if_not_exists(if_not_exists)
.build()
.unwrap();
create_table_generator.generate(&mut rng)
}
async fn execute_create_table(ctx: FuzzContext, input: FuzzInput) -> Result<()> {

View File

@@ -20,6 +20,11 @@ DESC TABLE phy;
| val | Float64 | | YES | | FIELD |
+--------+----------------------+-----+------+---------+---------------+
-- create table with duplicate column def
CREATE TABLE t1(ts timestamp time index, val double, host text, host string) engine=metric with ("on_physical_table" = "phy");
Error: 1004(InvalidArguments), Invalid SQL, error: column name `host` is duplicated at index 2 and 3
CREATE TABLE t1 (ts timestamp time index, val double, host string primary key) engine = metric with ("on_physical_table" = "phy");
Affected Rows: 0
@@ -28,6 +33,21 @@ CREATE TABLE t2 (ts timestamp time index, job string primary key, val double) en
Affected Rows: 0
-- create logical table with different data type on field column
CREATE TABLE t3 (ts timestamp time index, val string, host string, primary key (host)) engine=metric with ("on_physical_table" = "phy");
Error: 1004(InvalidArguments), Column type mismatch. Expect Float64(Float64Type), got String(StringType)
-- create logical table with different data type on tag column
CREATE TABLE t4 (ts timestamp time index, val double, host double, primary key (host)) engine=metric with ("on_physical_table" = "phy");
Error: 1004(InvalidArguments), Column type mismatch. Expect String(StringType), got Float64(Float64Type)
-- create logical table with different column name on field column
CREATE TABLE t5 (ts timestamp time index, valval double, host string primary key) engine = metric with ("on_physical_table" = "phy");
Error: 1004(InvalidArguments), Adding field column valval to physical table
SELECT table_catalog, table_schema, table_name, table_type, engine FROM information_schema.tables WHERE engine = 'metric' order by table_name;
+---------------+--------------+------------+------------+--------+
@@ -126,18 +146,10 @@ Affected Rows: 0
-- fuzz test case https://github.com/GreptimeTeam/greptimedb/issues/3612
CREATE TABLE `auT`(
incidunt TIMESTAMP(3) TIME INDEX,
`QuaErAT` BOOLEAN,
`REPREHenDERIt` BOOLEAN DEFAULT true,
`Et` INT NULL,
`AutEM` INT,
esse DOUBLE,
`Tempore` BOOLEAN,
`reruM` BOOLEAN,
`eRrOR` BOOLEAN NULL,
`cOMmodi` BOOLEAN,
`PERfERENdIS` DOUBLE,
`eSt` FLOAT DEFAULT 0.70978713,
PRIMARY KEY(`cOMmodi`, `PERfERENdIS`, esse)
`REPREHenDERIt` double DEFAULT 0.70978713,
`cOMmodi` STRING,
`PERfERENdIS` STRING,
PRIMARY KEY(`cOMmodi`, `PERfERENdIS`)
) ENGINE = metric with ("physical_metric_table" = "");
Affected Rows: 0
@@ -148,17 +160,9 @@ DESC TABLE `auT`;
| Column | Type | Key | Null | Default | Semantic Type |
+---------------+----------------------+-----+------+------------+---------------+
| incidunt | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| QuaErAT | Boolean | | YES | | FIELD |
| REPREHenDERIt | Boolean | | YES | true | FIELD |
| Et | Int32 | | YES | | FIELD |
| AutEM | Int32 | | YES | | FIELD |
| esse | Float64 | PRI | YES | | TAG |
| Tempore | Boolean | | YES | | FIELD |
| reruM | Boolean | | YES | | FIELD |
| eRrOR | Boolean | | YES | | FIELD |
| cOMmodi | Boolean | PRI | YES | | TAG |
| PERfERENdIS | Float64 | PRI | YES | | TAG |
| eSt | Float32 | | YES | 0.70978713 | FIELD |
| REPREHenDERIt | Float64 | | YES | 0.70978713 | FIELD |
| cOMmodi | String | PRI | YES | | TAG |
| PERfERENdIS | String | PRI | YES | | TAG |
+---------------+----------------------+-----+------+------------+---------------+
DROP TABLE `auT`;

View File

@@ -4,10 +4,22 @@ SHOW TABLES;
DESC TABLE phy;
-- create table with duplicate column def
CREATE TABLE t1(ts timestamp time index, val double, host text, host string) engine=metric with ("on_physical_table" = "phy");
CREATE TABLE t1 (ts timestamp time index, val double, host string primary key) engine = metric with ("on_physical_table" = "phy");
CREATE TABLE t2 (ts timestamp time index, job string primary key, val double) engine = metric with ("on_physical_table" = "phy");
-- create logical table with different data type on field column
CREATE TABLE t3 (ts timestamp time index, val string, host string, primary key (host)) engine=metric with ("on_physical_table" = "phy");
-- create logical table with different data type on tag column
CREATE TABLE t4 (ts timestamp time index, val double, host double, primary key (host)) engine=metric with ("on_physical_table" = "phy");
-- create logical table with different column name on field column
CREATE TABLE t5 (ts timestamp time index, valval double, host string primary key) engine = metric with ("on_physical_table" = "phy");
SELECT table_catalog, table_schema, table_name, table_type, engine FROM information_schema.tables WHERE engine = 'metric' order by table_name;
DESC TABLE phy;
@@ -38,18 +50,10 @@ DROP TABLE phy2;
-- fuzz test case https://github.com/GreptimeTeam/greptimedb/issues/3612
CREATE TABLE `auT`(
incidunt TIMESTAMP(3) TIME INDEX,
`QuaErAT` BOOLEAN,
`REPREHenDERIt` BOOLEAN DEFAULT true,
`Et` INT NULL,
`AutEM` INT,
esse DOUBLE,
`Tempore` BOOLEAN,
`reruM` BOOLEAN,
`eRrOR` BOOLEAN NULL,
`cOMmodi` BOOLEAN,
`PERfERENdIS` DOUBLE,
`eSt` FLOAT DEFAULT 0.70978713,
PRIMARY KEY(`cOMmodi`, `PERfERENdIS`, esse)
`REPREHenDERIt` double DEFAULT 0.70978713,
`cOMmodi` STRING,
`PERfERENdIS` STRING,
PRIMARY KEY(`cOMmodi`, `PERfERENdIS`)
) ENGINE = metric with ("physical_metric_table" = "");
DESC TABLE `auT`;