fix: only skips auto convert when encoding is sparse (#7056)

* fix: only skips auto convert when encoding is sparse

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: address comment and add tests

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
This commit is contained in:
Yingwen
2025-10-09 14:50:48 +08:00
committed by GitHub
parent 4cdcf2ef39
commit 45b1458254
2 changed files with 29 additions and 7 deletions

View File

@@ -140,6 +140,8 @@ pub struct FlatReadFormat {
impl FlatReadFormat {
/// Creates a helper with existing `metadata` and `column_ids` to read.
///
/// If `skip_auto_convert` is true, skips auto conversion of format when the encoding is sparse encoding.
pub fn new(
metadata: RegionMetadataRef,
column_ids: impl Iterator<Item = ColumnId>,
@@ -154,11 +156,18 @@ impl FlatReadFormat {
let parquet_adapter = if is_legacy {
// Safety: is_legacy_format() ensures primary_key is not empty.
ParquetAdapter::PrimaryKeyToFlat(ParquetPrimaryKeyToFlat::new(
metadata,
column_ids,
skip_auto_convert,
))
if metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse {
// Only skip auto convert when the primary key encoding is sparse.
ParquetAdapter::PrimaryKeyToFlat(ParquetPrimaryKeyToFlat::new(
metadata,
column_ids,
skip_auto_convert,
))
} else {
ParquetAdapter::PrimaryKeyToFlat(ParquetPrimaryKeyToFlat::new(
metadata, column_ids, false,
))
}
} else {
ParquetAdapter::Flat(ParquetFlat::new(metadata, column_ids))
};
@@ -370,6 +379,12 @@ impl ParquetPrimaryKeyToFlat {
column_ids: impl Iterator<Item = ColumnId>,
skip_auto_convert: bool,
) -> ParquetPrimaryKeyToFlat {
assert!(if skip_auto_convert {
metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse
} else {
true
});
let column_ids: Vec<_> = column_ids.collect();
// Creates a map to lookup index based on the new format.

View File

@@ -1740,7 +1740,7 @@ mod tests {
.collect();
let format = FlatReadFormat::new(
metadata.clone(),
column_ids.into_iter(),
column_ids.clone().into_iter(),
None,
"test",
false,
@@ -1797,7 +1797,7 @@ mod tests {
let record_batch = RecordBatch::try_new(old_schema, columns).unwrap();
// Test conversion with sparse encoding
let result = format.convert_batch(record_batch, None).unwrap();
let result = format.convert_batch(record_batch.clone(), None).unwrap();
// Construct expected RecordBatch in flat format with decoded primary key columns
let tag0_array = Arc::new(DictionaryArray::new(
@@ -1826,5 +1826,12 @@ mod tests {
// Compare the actual result with the expected record batch
assert_eq!(expected_record_batch, result);
let format =
FlatReadFormat::new(metadata.clone(), column_ids.into_iter(), None, "test", true)
.unwrap();
// Test conversion with sparse encoding and skip convert.
let result = format.convert_batch(record_batch.clone(), None).unwrap();
assert_eq!(record_batch, result);
}
}