From 25fab2ba7d0b17fbf739f96f981e150c8c0d25eb Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Fri, 21 Nov 2025 11:28:14 +0800 Subject: [PATCH] feat: don't validate external table's region schema (#7268) * feat: don't validate external table's region schema Signed-off-by: Ruihang Xia * fix format Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/file-engine/src/manifest.rs | 50 ++++++++++++++- src/store-api/src/metadata.rs | 106 +++++++++++++++++++++++++++++++- 2 files changed, 154 insertions(+), 2 deletions(-) diff --git a/src/file-engine/src/manifest.rs b/src/file-engine/src/manifest.rs index 7e8aa7a732..ac2732fe69 100644 --- a/src/file-engine/src/manifest.rs +++ b/src/file-engine/src/manifest.rs @@ -94,7 +94,9 @@ impl FileRegionManifest { builder.push_column_metadata(column.clone()); } builder.primary_key(self.primary_key.clone()); - let metadata = builder.build().context(InvalidMetadataSnafu)?; + let metadata = builder + .build_without_validation() + .context(InvalidMetadataSnafu)?; Ok(Arc::new(metadata)) } @@ -127,3 +129,49 @@ impl FileRegionManifest { .context(MissingRequiredFieldSnafu { name }) } } + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + + use super::*; + + #[test] + fn metadata_allows_internal_column_name() { + let manifest = FileRegionManifest { + region_id: RegionId::new(1, 0), + column_metadatas: vec![ + ColumnMetadata { + column_schema: ColumnSchema::new( + "__primary_key", + ConcreteDataType::string_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }, + ], + primary_key: vec![1], + options: HashMap::default(), + }; + + let metadata = manifest.metadata().unwrap(); + assert!( + metadata + .column_metadatas + .iter() + .any(|c| c.column_schema.name == "__primary_key") + ); + } +} diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs index 4c4b2f3fb9..5cb00004ef 100644 --- a/src/store-api/src/metadata.rs +++ b/src/store-api/src/metadata.rs @@ -640,6 +640,18 @@ impl RegionMetadataBuilder { /// Consumes the builder and build a [RegionMetadata]. pub fn build(self) -> Result { + self.build_with_options(true) + } + + /// Builds metadata without running validation. + /// + /// Intended for file/external engines that should accept arbitrary schemas + /// coming from files. + pub fn build_without_validation(self) -> Result { + self.build_with_options(false) + } + + fn build_with_options(self, validate: bool) -> Result { let skipped = SkippedFields::new(&self.column_metadatas)?; let meta = RegionMetadata { @@ -654,7 +666,9 @@ impl RegionMetadataBuilder { partition_expr: self.partition_expr, }; - meta.validate()?; + if validate { + meta.validate()?; + } Ok(meta) } @@ -1929,6 +1943,96 @@ mod test { ); } + #[test] + fn test_allow_internal_column_name() { + let mut builder = create_builder(); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "__primary_key", + ConcreteDataType::string_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }) + .primary_key(vec![1]); + + let metadata = builder.build_without_validation().unwrap(); + assert_eq!( + "__primary_key", + metadata.column_metadatas[0].column_schema.name + ); + } + + #[test] + fn test_build_without_validation() { + // Primary key points to a Field column, which would normally fail validation. + let mut builder = create_builder(); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .primary_key(vec![2]); + + // Unvalidated build should succeed. + let metadata = builder.build_without_validation().unwrap(); + assert_eq!(vec![2], metadata.primary_key); + + // Validated build still rejects it. + let mut builder = create_builder(); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .primary_key(vec![2]); + let err = builder.build().unwrap_err(); + assert!( + err.to_string() + .contains("semantic type of column field should be Tag"), + "unexpected err: {err}" + ); + } + #[test] fn test_debug_for_column_metadata() { let region_metadata = build_test_region_metadata();