Skip to main content

mito2/read/
compat.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Utilities to adapt readers with different schema.
16
17use std::collections::HashMap;
18use std::sync::Arc;
19
20use api::v1::SemanticType;
21use datatypes::arrow::array::{
22    Array, ArrayRef, BinaryArray, BinaryBuilder, DictionaryArray, UInt32Array,
23};
24use datatypes::arrow::compute::{TakeOptions, take};
25use datatypes::arrow::datatypes::{FieldRef, Schema, SchemaRef};
26use datatypes::arrow::record_batch::RecordBatch;
27use datatypes::data_type::ConcreteDataType;
28use datatypes::prelude::DataType;
29use datatypes::value::Value;
30use datatypes::vectors::VectorRef;
31use datatypes::vectors::json::array::JsonArray;
32use mito_codec::row_converter::{
33    CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec,
34    build_primary_key_codec_with_fields,
35};
36use snafu::{OptionExt, ResultExt, ensure};
37use store_api::codec::PrimaryKeyEncoding;
38use store_api::metadata::{RegionMetadata, RegionMetadataRef};
39use store_api::storage::ColumnId;
40
41use crate::error::{
42    CompatReaderSnafu, ComputeArrowSnafu, ConvertValueSnafu, CreateDefaultSnafu, DecodeSnafu,
43    EncodeSnafu, NewRecordBatchSnafu, Result, UnsupportedOperationSnafu,
44};
45use crate::read::flat_projection::{FlatProjectionMapper, flat_projected_columns};
46use crate::sst::parquet::flat_format::primary_key_column_index;
47use crate::sst::parquet::format::{FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray};
48use crate::sst::{internal_fields, tag_maybe_to_dictionary_field};
49
50/// Returns true if `left` and `right` have same columns and primary key encoding.
51pub(crate) fn has_same_columns_and_pk_encoding(
52    left: &RegionMetadata,
53    right: &RegionMetadata,
54) -> bool {
55    if left.primary_key_encoding != right.primary_key_encoding {
56        return false;
57    }
58
59    if left.column_metadatas.len() != right.column_metadatas.len() {
60        return false;
61    }
62
63    for (left_col, right_col) in left.column_metadatas.iter().zip(&right.column_metadatas) {
64        if left_col.column_id != right_col.column_id || !left_col.is_same_datatype(right_col) {
65            return false;
66        }
67        debug_assert_eq!(
68            left_col.column_schema.data_type,
69            right_col.column_schema.data_type
70        );
71        debug_assert_eq!(left_col.semantic_type, right_col.semantic_type);
72    }
73
74    true
75}
76
/// A helper struct to adapt schema of the batch to an expected schema.
///
/// Built by [FlatCompatBatch::try_new] and applied to each batch with
/// [FlatCompatBatch::compat].
pub(crate) struct FlatCompatBatch {
    /// For each expected column (internal columns excluded), either the position of
    /// the column in the source batch or the default value used to fill it.
    index_or_defaults: Vec<IndexOrDefault>,
    /// Expected arrow schema, including the internal fields appended at the end.
    arrow_schema: SchemaRef,
    /// Primary key adapter, applied after the other columns are made compatible.
    compat_pk: FlatCompatPrimaryKey,
}
86
87impl FlatCompatBatch {
88    /// Creates a [FlatCompatBatch].
89    ///
90    /// - `mapper` is built from the metadata users expect to see.
91    /// - `actual` is the [RegionMetadata] of the input parquet.
92    /// - `format_projection` is the projection of the read format for the input parquet.
93    /// - `compaction` indicates whether the reader is for compaction.
94    pub(crate) fn try_new(
95        mapper: &FlatProjectionMapper,
96        actual: &RegionMetadataRef,
97        format_projection: &FormatProjection,
98        compaction: bool,
99    ) -> Result<Option<Self>> {
100        let actual_schema = flat_projected_columns(actual, format_projection);
101        let expect_schema = mapper.batch_schema();
102        if expect_schema == actual_schema {
103            // Although the SST has a different schema, but the schema after projection is the same
104            // as expected schema.
105            return Ok(None);
106        }
107
108        if actual.primary_key_encoding == PrimaryKeyEncoding::Sparse && compaction {
109            // Special handling for sparse encoding in compaction.
110            return FlatCompatBatch::try_new_compact_sparse(mapper, actual);
111        }
112
113        let (index_or_defaults, fields) =
114            Self::compute_index_and_fields(&actual_schema, expect_schema, mapper.metadata())?;
115
116        let compat_pk = FlatCompatPrimaryKey::new(mapper.metadata(), actual)?;
117
118        Ok(Some(Self {
119            index_or_defaults,
120            arrow_schema: Arc::new(Schema::new(fields)),
121            compat_pk,
122        }))
123    }
124
125    fn compute_index_and_fields(
126        actual_schema: &[(ColumnId, ConcreteDataType)],
127        expect_schema: &[(ColumnId, ConcreteDataType)],
128        expect_metadata: &RegionMetadata,
129    ) -> Result<(Vec<IndexOrDefault>, Vec<FieldRef>)> {
130        // Maps column id to the index and data type in the actual schema.
131        let actual_schema_index: HashMap<_, _> = actual_schema
132            .iter()
133            .enumerate()
134            .map(|(idx, (column_id, data_type))| (*column_id, (idx, data_type)))
135            .collect();
136
137        let mut index_or_defaults = Vec::with_capacity(expect_schema.len());
138        let mut fields = Vec::with_capacity(expect_schema.len());
139        for (column_id, expect_data_type) in expect_schema {
140            // Safety: expect_schema comes from the same mapper.
141            let column_index = expect_metadata.column_index_by_id(*column_id).unwrap();
142            let expect_column = &expect_metadata.column_metadatas[column_index];
143            let column_field = &expect_metadata.schema.arrow_schema().fields()[column_index];
144            // For tag columns, we need to create a dictionary field.
145            if expect_column.semantic_type == SemanticType::Tag {
146                fields.push(tag_maybe_to_dictionary_field(
147                    &expect_column.column_schema.data_type,
148                    column_field,
149                ));
150            } else {
151                fields.push(column_field.clone());
152            };
153
154            if let Some((index, actual_data_type)) = actual_schema_index.get(column_id) {
155                let mut cast_type = None;
156
157                // Same column different type.
158                if expect_data_type != *actual_data_type {
159                    cast_type = Some(expect_data_type.clone())
160                }
161                // Source has this column.
162                index_or_defaults.push(IndexOrDefault::Index {
163                    pos: *index,
164                    cast_type,
165                });
166            } else {
167                // Create a default vector with 1 element for that column.
168                let default_vector = expect_column
169                    .column_schema
170                    .create_default_vector(1)
171                    .context(CreateDefaultSnafu {
172                        region_id: expect_metadata.region_id,
173                        column: &expect_column.column_schema.name,
174                    })?
175                    .with_context(|| CompatReaderSnafu {
176                        region_id: expect_metadata.region_id,
177                        reason: format!(
178                            "column {} does not have a default value to read",
179                            expect_column.column_schema.name
180                        ),
181                    })?;
182                index_or_defaults.push(IndexOrDefault::DefaultValue {
183                    default_vector,
184                    semantic_type: expect_column.semantic_type,
185                });
186            };
187        }
188        fields.extend_from_slice(&internal_fields());
189
190        Ok((index_or_defaults, fields))
191    }
192
193    fn try_new_compact_sparse(
194        mapper: &FlatProjectionMapper,
195        actual: &RegionMetadataRef,
196    ) -> Result<Option<Self>> {
197        // Currently, we don't support converting sparse encoding back to dense encoding in
198        // flat format.
199        ensure!(
200            mapper.metadata().primary_key_encoding == PrimaryKeyEncoding::Sparse,
201            UnsupportedOperationSnafu {
202                err_msg: "Flat format doesn't support converting sparse encoding back to dense encoding"
203            }
204        );
205
206        // For sparse encoding, we don't need to check the primary keys.
207        // Since this is for compaction, we always read all columns.
208        let actual_schema: Vec<_> = actual
209            .field_columns()
210            .chain([actual.time_index_column()])
211            .map(|col| (col.column_id, col.column_schema.data_type.clone()))
212            .collect();
213        let expect_schema: Vec<_> = mapper
214            .metadata()
215            .field_columns()
216            .chain([mapper.metadata().time_index_column()])
217            .map(|col| (col.column_id, col.column_schema.data_type.clone()))
218            .collect();
219
220        let (index_or_defaults, fields) =
221            Self::compute_index_and_fields(&actual_schema, &expect_schema, mapper.metadata())?;
222
223        let compat_pk = FlatCompatPrimaryKey::default();
224
225        Ok(Some(Self {
226            index_or_defaults,
227            arrow_schema: Arc::new(Schema::new(fields)),
228            compat_pk,
229        }))
230    }
231
232    /// Make columns of the `batch` compatible.
233    pub(crate) fn compat(&self, batch: RecordBatch) -> Result<RecordBatch> {
234        let len = batch.num_rows();
235        let columns = self
236            .index_or_defaults
237            .iter()
238            .map(|index_or_default| match index_or_default {
239                IndexOrDefault::Index { pos, cast_type } => {
240                    let old_column = batch.column(*pos);
241
242                    if let Some(ty) = cast_type {
243                        let casted = if let Some(json_type) = ty.as_json()
244                            && json_type.is_json2()
245                        {
246                            JsonArray::from(old_column)
247                                .try_align(&json_type.as_arrow_type())
248                                .context(ConvertValueSnafu)?
249                        } else {
250                            datatypes::arrow::compute::cast(old_column, &ty.as_arrow_type())
251                                .context(ComputeArrowSnafu)?
252                        };
253                        Ok(casted)
254                    } else {
255                        Ok(old_column.clone())
256                    }
257                }
258                IndexOrDefault::DefaultValue {
259                    default_vector,
260                    semantic_type,
261                } => repeat_vector(default_vector, len, *semantic_type == SemanticType::Tag),
262            })
263            .chain(
264                // Adds internal columns.
265                batch.columns()[batch.num_columns() - INTERNAL_COLUMN_NUM..]
266                    .iter()
267                    .map(|col| Ok(col.clone())),
268            )
269            .collect::<Result<Vec<_>>>()?;
270
271        let compat_batch = RecordBatch::try_new(self.arrow_schema.clone(), columns)
272            .context(NewRecordBatchSnafu)?;
273
274        // Handles primary keys.
275        self.compat_pk.compat(compat_batch)
276    }
277}
278
279/// Repeats the vector value `to_len` times.
280fn repeat_vector(vector: &VectorRef, to_len: usize, is_tag: bool) -> Result<ArrayRef> {
281    assert_eq!(1, vector.len());
282    let data_type = vector.data_type();
283    if is_tag && data_type.is_string() {
284        let values = vector.to_arrow_array();
285        if values.is_null(0) {
286            // Creates a dictionary array with `to_len` null keys.
287            let keys = UInt32Array::new_null(to_len);
288            Ok(Arc::new(DictionaryArray::new(keys, values.slice(0, 0))))
289        } else {
290            let keys = UInt32Array::from_value(0, to_len);
291            Ok(Arc::new(DictionaryArray::new(keys, values)))
292        }
293    } else {
294        let keys = UInt32Array::from_value(0, to_len);
295        take(
296            &vector.to_arrow_array(),
297            &keys,
298            Some(TakeOptions {
299                check_bounds: false,
300            }),
301        )
302        .context(ComputeArrowSnafu)
303    }
304}
305
306/// Returns true if the actual primary keys is the same as expected.
307fn is_primary_key_same(expect: &RegionMetadata, actual: &RegionMetadata) -> Result<bool> {
308    ensure!(
309        actual.primary_key.len() <= expect.primary_key.len(),
310        CompatReaderSnafu {
311            region_id: expect.region_id,
312            reason: format!(
313                "primary key has more columns {} than expect {}",
314                actual.primary_key.len(),
315                expect.primary_key.len()
316            ),
317        }
318    );
319    ensure!(
320        actual.primary_key == expect.primary_key[..actual.primary_key.len()],
321        CompatReaderSnafu {
322            region_id: expect.region_id,
323            reason: format!(
324                "primary key has different prefix, expect: {:?}, actual: {:?}",
325                expect.primary_key, actual.primary_key
326            ),
327        }
328    );
329
330    Ok(actual.primary_key.len() == expect.primary_key.len())
331}
332
/// Index in source batch or a default value to fill a column.
///
/// One entry is kept per expected column.
#[derive(Debug)]
enum IndexOrDefault {
    /// Index of the column in source batch.
    Index {
        /// Position of the column in the source batch.
        pos: usize,
        /// Target type to cast the column to; `None` when the source type already
        /// matches the expected type.
        cast_type: Option<ConcreteDataType>,
    },
    /// Default value for the column.
    DefaultValue {
        /// Default value. The vector has only 1 element.
        default_vector: VectorRef,
        /// Semantic type of the column.
        semantic_type: SemanticType,
    },
}
349
/// Helper to rewrite primary key to another encoding for flat format.
struct FlatRewritePrimaryKey {
    /// New primary key encoder, built from the expected metadata.
    codec: Arc<dyn PrimaryKeyCodec>,
    /// Metadata of the expected region.
    metadata: RegionMetadataRef,
    /// Original primary key codec, built from the actual metadata.
    /// It decodes the existing keys before they are re-encoded with `codec`.
    old_codec: Arc<dyn PrimaryKeyCodec>,
}
360
361impl FlatRewritePrimaryKey {
362    fn new(
363        expect: &RegionMetadataRef,
364        actual: &RegionMetadataRef,
365    ) -> Option<FlatRewritePrimaryKey> {
366        if expect.primary_key_encoding == actual.primary_key_encoding {
367            return None;
368        }
369        let codec = build_primary_key_codec(expect);
370        let old_codec = build_primary_key_codec(actual);
371
372        Some(FlatRewritePrimaryKey {
373            codec,
374            metadata: expect.clone(),
375            old_codec,
376        })
377    }
378
379    /// Rewrites the primary key of the `batch`.
380    /// It also appends the values to the primary key.
381    fn rewrite_key(
382        &self,
383        append_values: &[(ColumnId, Value)],
384        batch: RecordBatch,
385    ) -> Result<RecordBatch> {
386        let old_pk_dict_array = batch
387            .column(primary_key_column_index(batch.num_columns()))
388            .as_any()
389            .downcast_ref::<PrimaryKeyArray>()
390            .unwrap();
391        let old_pk_values_array = old_pk_dict_array
392            .values()
393            .as_any()
394            .downcast_ref::<BinaryArray>()
395            .unwrap();
396        let mut builder = BinaryBuilder::with_capacity(
397            old_pk_values_array.len(),
398            old_pk_values_array.value_data().len(),
399        );
400
401        // Binary buffer for the primary key.
402        let mut buffer = Vec::with_capacity(
403            old_pk_values_array.value_data().len() / old_pk_values_array.len().max(1),
404        );
405        let mut column_id_values = Vec::new();
406        // Iterates the binary array and rewrites the primary key.
407        for value in old_pk_values_array.iter() {
408            let Some(old_pk) = value else {
409                builder.append_null();
410                continue;
411            };
412            // Decodes the old primary key.
413            let mut pk_values = self.old_codec.decode(old_pk).context(DecodeSnafu)?;
414            pk_values.extend(append_values);
415
416            buffer.clear();
417            column_id_values.clear();
418            // Encodes the new primary key.
419            match pk_values {
420                CompositeValues::Dense(dense_values) => {
421                    self.codec
422                        .encode_values(dense_values.as_slice(), &mut buffer)
423                        .context(EncodeSnafu)?;
424                }
425                CompositeValues::Sparse(sparse_values) => {
426                    for id in &self.metadata.primary_key {
427                        let value = sparse_values.get_or_null(*id);
428                        column_id_values.push((*id, value.clone()));
429                    }
430                    self.codec
431                        .encode_values(&column_id_values, &mut buffer)
432                        .context(EncodeSnafu)?;
433                }
434            }
435            builder.append_value(&buffer);
436        }
437        let new_pk_values_array = Arc::new(builder.finish());
438        let new_pk_dict_array =
439            PrimaryKeyArray::new(old_pk_dict_array.keys().clone(), new_pk_values_array);
440
441        let mut columns = batch.columns().to_vec();
442        columns[primary_key_column_index(batch.num_columns())] = Arc::new(new_pk_dict_array);
443
444        RecordBatch::try_new(batch.schema(), columns).context(NewRecordBatchSnafu)
445    }
446}
447
/// Helper to make primary key compatible for flat format.
///
/// The default value has no rewriter, no converter and no values, so it leaves
/// batches unchanged.
#[derive(Default)]
struct FlatCompatPrimaryKey {
    /// Primary key rewriter. Set when the expected and actual encodings differ.
    rewriter: Option<FlatRewritePrimaryKey>,
    /// Converter to append values to primary keys. Set when the expected primary
    /// key has more columns than the actual one.
    converter: Option<Arc<dyn PrimaryKeyCodec>>,
    /// Default values to append, one per missing primary key column.
    values: Vec<(ColumnId, Value)>,
}
458
459impl FlatCompatPrimaryKey {
460    fn new(expect: &RegionMetadataRef, actual: &RegionMetadataRef) -> Result<Self> {
461        let rewriter = FlatRewritePrimaryKey::new(expect, actual);
462
463        if is_primary_key_same(expect, actual)? {
464            return Ok(Self {
465                rewriter,
466                converter: None,
467                values: Vec::new(),
468            });
469        }
470
471        // We need to append default values to the primary key.
472        let to_add = &expect.primary_key[actual.primary_key.len()..];
473        let mut values = Vec::with_capacity(to_add.len());
474        let mut fields = Vec::with_capacity(to_add.len());
475        for column_id in to_add {
476            // Safety: The id comes from expect region metadata.
477            let column = expect.column_by_id(*column_id).unwrap();
478            fields.push((
479                *column_id,
480                SortField::new(column.column_schema.data_type.clone()),
481            ));
482            let default_value = column
483                .column_schema
484                .create_default()
485                .context(CreateDefaultSnafu {
486                    region_id: expect.region_id,
487                    column: &column.column_schema.name,
488                })?
489                .with_context(|| CompatReaderSnafu {
490                    region_id: expect.region_id,
491                    reason: format!(
492                        "key column {} does not have a default value to read",
493                        column.column_schema.name
494                    ),
495                })?;
496            values.push((*column_id, default_value));
497        }
498        // is_primary_key_same() is false so we have different number of primary key columns.
499        debug_assert!(!fields.is_empty());
500
501        // Create converter to append values.
502        let converter = Some(build_primary_key_codec_with_fields(
503            expect.primary_key_encoding,
504            fields.into_iter(),
505        ));
506
507        Ok(Self {
508            rewriter,
509            converter,
510            values,
511        })
512    }
513
514    /// Makes primary key of the `batch` compatible.
515    ///
516    /// Callers must ensure other columns except the `__primary_key` column is compatible.
517    fn compat(&self, batch: RecordBatch) -> Result<RecordBatch> {
518        if let Some(rewriter) = &self.rewriter {
519            // If we have different encoding, rewrite the whole primary key.
520            return rewriter.rewrite_key(&self.values, batch);
521        }
522
523        self.append_key(batch)
524    }
525
526    /// Appends values to the primary key of the `batch`.
527    fn append_key(&self, batch: RecordBatch) -> Result<RecordBatch> {
528        let Some(converter) = &self.converter else {
529            return Ok(batch);
530        };
531
532        let old_pk_dict_array = batch
533            .column(primary_key_column_index(batch.num_columns()))
534            .as_any()
535            .downcast_ref::<PrimaryKeyArray>()
536            .unwrap();
537        let old_pk_values_array = old_pk_dict_array
538            .values()
539            .as_any()
540            .downcast_ref::<BinaryArray>()
541            .unwrap();
542        let mut builder = BinaryBuilder::with_capacity(
543            old_pk_values_array.len(),
544            old_pk_values_array.value_data().len()
545                + converter.estimated_size().unwrap_or_default() * old_pk_values_array.len(),
546        );
547
548        // Binary buffer for the primary key.
549        let mut buffer = Vec::with_capacity(
550            old_pk_values_array.value_data().len() / old_pk_values_array.len().max(1)
551                + converter.estimated_size().unwrap_or_default(),
552        );
553
554        // Iterates the binary array and appends values to the primary key.
555        for value in old_pk_values_array.iter() {
556            let Some(old_pk) = value else {
557                builder.append_null();
558                continue;
559            };
560
561            buffer.clear();
562            buffer.extend_from_slice(old_pk);
563            converter
564                .encode_values(&self.values, &mut buffer)
565                .context(EncodeSnafu)?;
566
567            builder.append_value(&buffer);
568        }
569
570        let new_pk_values_array = Arc::new(builder.finish());
571        let new_pk_dict_array =
572            PrimaryKeyArray::new(old_pk_dict_array.keys().clone(), new_pk_values_array);
573
574        // Overrides the primary key column.
575        let mut columns = batch.columns().to_vec();
576        columns[primary_key_column_index(batch.num_columns())] = Arc::new(new_pk_dict_array);
577
578        RecordBatch::try_new(batch.schema(), columns).context(NewRecordBatchSnafu)
579    }
580}
581
582#[cfg(test)]
583mod tests {
584    use std::sync::Arc;
585
586    use api::v1::{OpType, SemanticType};
587    use datatypes::arrow::array::{
588        ArrayRef, BinaryDictionaryBuilder, Int64Array, StringDictionaryBuilder,
589        TimestampMillisecondArray, UInt8Array, UInt64Array,
590    };
591    use datatypes::arrow::datatypes::UInt32Type;
592    use datatypes::arrow::record_batch::RecordBatch;
593    use datatypes::prelude::ConcreteDataType;
594    use datatypes::schema::ColumnSchema;
595    use datatypes::value::ValueRef;
596    use mito_codec::row_converter::{
597        DensePrimaryKeyCodec, PrimaryKeyCodecExt, SparsePrimaryKeyCodec,
598    };
599    use store_api::codec::PrimaryKeyEncoding;
600    use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
601    use store_api::storage::RegionId;
602
603    use super::*;
604    use crate::read::flat_projection::FlatProjectionMapper;
605    use crate::sst::parquet::flat_format::FlatReadFormat;
606    use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
607
608    /// Creates a new [RegionMetadata].
609    fn new_metadata(
610        semantic_types: &[(ColumnId, SemanticType, ConcreteDataType)],
611        primary_key: &[ColumnId],
612    ) -> RegionMetadata {
613        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
614        for (id, semantic_type, data_type) in semantic_types {
615            let column_schema = match semantic_type {
616                SemanticType::Tag => {
617                    ColumnSchema::new(format!("tag_{id}"), data_type.clone(), true)
618                }
619                SemanticType::Field => {
620                    ColumnSchema::new(format!("field_{id}"), data_type.clone(), true)
621                }
622                SemanticType::Timestamp => ColumnSchema::new("ts", data_type.clone(), false),
623            };
624
625            builder.push_column_metadata(ColumnMetadata {
626                column_schema,
627                semantic_type: *semantic_type,
628                column_id: *id,
629            });
630        }
631        builder.primary_key(primary_key.to_vec());
632        builder.build().unwrap()
633    }
634
635    /// Encode primary key.
636    fn encode_key(keys: &[Option<&str>]) -> Vec<u8> {
637        let fields = (0..keys.len())
638            .map(|_| (0, SortField::new(ConcreteDataType::string_datatype())))
639            .collect();
640        let converter = DensePrimaryKeyCodec::with_fields(fields);
641        let row = keys.iter().map(|str_opt| match str_opt {
642            Some(v) => ValueRef::String(v),
643            None => ValueRef::Null,
644        });
645
646        converter.encode(row).unwrap()
647    }
648
649    /// Encode sparse primary key.
650    fn encode_sparse_key(keys: &[(ColumnId, Option<&str>)]) -> Vec<u8> {
651        let fields = (0..keys.len())
652            .map(|_| (1, SortField::new(ConcreteDataType::string_datatype())))
653            .collect();
654        let converter = SparsePrimaryKeyCodec::with_fields(fields);
655        let row = keys
656            .iter()
657            .map(|(id, str_opt)| match str_opt {
658                Some(v) => (*id, ValueRef::String(v)),
659                None => (*id, ValueRef::Null),
660            })
661            .collect::<Vec<_>>();
662        let mut buffer = vec![];
663        converter.encode_value_refs(&row, &mut buffer).unwrap();
664        buffer
665    }
666
667    /// Creates a primary key array for flat format testing.
668    fn build_flat_test_pk_array(primary_keys: &[&[u8]]) -> ArrayRef {
669        let mut builder = BinaryDictionaryBuilder::<UInt32Type>::new();
670        for &pk in primary_keys {
671            builder.append(pk).unwrap();
672        }
673        Arc::new(builder.finish())
674    }
675
    /// Reads a batch written without field 3 through a reader that expects field 3,
    /// and checks that the missing field is filled with nulls.
    #[test]
    fn test_flat_compat_batch_with_missing_columns() {
        // Metadata of the file: ts, one string tag and one int64 field.
        let actual_metadata = Arc::new(new_metadata(
            &[
                (
                    0,
                    SemanticType::Timestamp,
                    ConcreteDataType::timestamp_millisecond_datatype(),
                ),
                (1, SemanticType::Tag, ConcreteDataType::string_datatype()),
                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
            ],
            &[1],
        ));

        // Metadata the reader expects: same as above plus field 3.
        let expected_metadata = Arc::new(new_metadata(
            &[
                (
                    0,
                    SemanticType::Timestamp,
                    ConcreteDataType::timestamp_millisecond_datatype(),
                ),
                (1, SemanticType::Tag, ConcreteDataType::string_datatype()),
                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
                // Adds a new field.
                (3, SemanticType::Field, ConcreteDataType::int64_datatype()),
            ],
            &[1],
        ));

        let mapper = FlatProjectionMapper::all(&expected_metadata).unwrap();
        let read_format = FlatReadFormat::new(
            actual_metadata.clone(),
            [0, 1, 2, 3].into_iter(),
            None,
            "test",
            false,
        )
        .unwrap();
        let format_projection = read_format.format_projection();

        // The schemas differ, so an adapter must be returned.
        let compat_batch =
            FlatCompatBatch::try_new(&mapper, &actual_metadata, format_projection, false)
                .unwrap()
                .unwrap();

        let mut tag_builder = StringDictionaryBuilder::<UInt32Type>::new();
        tag_builder.append_value("tag1");
        tag_builder.append_value("tag1");
        let tag_dict_array = Arc::new(tag_builder.finish());

        // Input batch columns: tag, field 2, ts, primary key, then two more
        // columns (u64 values and op types).
        let k1 = encode_key(&[Some("tag1")]);
        let input_columns: Vec<ArrayRef> = vec![
            tag_dict_array.clone(),
            Arc::new(Int64Array::from(vec![100, 200])),
            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
            build_flat_test_pk_array(&[&k1, &k1]),
            Arc::new(UInt64Array::from_iter_values([1, 2])),
            Arc::new(UInt8Array::from_iter_values([
                OpType::Put as u8,
                OpType::Put as u8,
            ])),
        ];
        let input_schema =
            to_flat_sst_arrow_schema(&actual_metadata, &FlatSchemaOptions::default());
        let input_batch = RecordBatch::try_new(input_schema, input_columns).unwrap();

        let result = compat_batch.compat(input_batch).unwrap();

        let expected_schema =
            to_flat_sst_arrow_schema(&expected_metadata, &FlatSchemaOptions::default());

        // Same as the input, with an all-null int64 column for field 3 inserted
        // after field 2.
        let expected_columns: Vec<ArrayRef> = vec![
            tag_dict_array.clone(),
            Arc::new(Int64Array::from(vec![100, 200])),
            Arc::new(Int64Array::from(vec![None::<i64>, None::<i64>])),
            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
            build_flat_test_pk_array(&[&k1, &k1]),
            Arc::new(UInt64Array::from_iter_values([1, 2])),
            Arc::new(UInt8Array::from_iter_values([
                OpType::Put as u8,
                OpType::Put as u8,
            ])),
        ];
        let expected_batch = RecordBatch::try_new(expected_schema, expected_columns).unwrap();

        assert_eq!(expected_batch, result);
    }
764
765    #[test]
766    fn test_flat_compat_batch_with_read_projection_superset() {
767        let actual_metadata = Arc::new(new_metadata(
768            &[
769                (
770                    0,
771                    SemanticType::Timestamp,
772                    ConcreteDataType::timestamp_millisecond_datatype(),
773                ),
774                (1, SemanticType::Tag, ConcreteDataType::string_datatype()),
775                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
776            ],
777            &[1],
778        ));
779
780        let expected_metadata = Arc::new(new_metadata(
781            &[
782                (
783                    0,
784                    SemanticType::Timestamp,
785                    ConcreteDataType::timestamp_millisecond_datatype(),
786                ),
787                (1, SemanticType::Tag, ConcreteDataType::string_datatype()),
788                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
789                // Adds a new field.
790                (3, SemanticType::Field, ConcreteDataType::int64_datatype()),
791            ],
792            &[1],
793        ));
794
795        // Output projection: tag_1, field_2. Read also includes field_3.
796        let mapper = FlatProjectionMapper::new_with_read_columns(
797            &expected_metadata,
798            vec![1, 2],
799            vec![1, 2, 3],
800        )
801        .unwrap();
802        let read_format = FlatReadFormat::new(
803            actual_metadata.clone(),
804            [1, 2, 3].into_iter(),
805            None,
806            "test",
807            false,
808        )
809        .unwrap();
810        let format_projection = read_format.format_projection();
811
812        let compat_batch =
813            FlatCompatBatch::try_new(&mapper, &actual_metadata, format_projection, false)
814                .unwrap()
815                .unwrap();
816
817        let mut tag_builder = StringDictionaryBuilder::<UInt32Type>::new();
818        tag_builder.append_value("tag1");
819        tag_builder.append_value("tag1");
820        let tag_dict_array = Arc::new(tag_builder.finish());
821
822        let k1 = encode_key(&[Some("tag1")]);
823        let input_columns: Vec<ArrayRef> = vec![
824            tag_dict_array.clone(),
825            Arc::new(Int64Array::from(vec![100, 200])),
826            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
827            build_flat_test_pk_array(&[&k1, &k1]),
828            Arc::new(UInt64Array::from_iter_values([1, 2])),
829            Arc::new(UInt8Array::from_iter_values([
830                OpType::Put as u8,
831                OpType::Put as u8,
832            ])),
833        ];
834        let input_schema =
835            to_flat_sst_arrow_schema(&actual_metadata, &FlatSchemaOptions::default());
836        let input_batch = RecordBatch::try_new(input_schema, input_columns).unwrap();
837
838        let result = compat_batch.compat(input_batch).unwrap();
839
840        let expected_schema =
841            to_flat_sst_arrow_schema(&expected_metadata, &FlatSchemaOptions::default());
842        let expected_columns: Vec<ArrayRef> = vec![
843            tag_dict_array.clone(),
844            Arc::new(Int64Array::from(vec![100, 200])),
845            Arc::new(Int64Array::from(vec![None::<i64>, None::<i64>])),
846            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
847            build_flat_test_pk_array(&[&k1, &k1]),
848            Arc::new(UInt64Array::from_iter_values([1, 2])),
849            Arc::new(UInt8Array::from_iter_values([
850                OpType::Put as u8,
851                OpType::Put as u8,
852            ])),
853        ];
854        let expected_batch = RecordBatch::try_new(expected_schema, expected_columns).unwrap();
855
856        assert_eq!(expected_batch, result);
857    }
858
859    #[test]
860    fn test_flat_compat_batch_with_different_pk_encoding() {
861        let mut actual_metadata = new_metadata(
862            &[
863                (
864                    0,
865                    SemanticType::Timestamp,
866                    ConcreteDataType::timestamp_millisecond_datatype(),
867                ),
868                (1, SemanticType::Tag, ConcreteDataType::string_datatype()),
869                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
870            ],
871            &[1],
872        );
873        actual_metadata.primary_key_encoding = PrimaryKeyEncoding::Dense;
874        let actual_metadata = Arc::new(actual_metadata);
875
876        let mut expected_metadata = new_metadata(
877            &[
878                (
879                    0,
880                    SemanticType::Timestamp,
881                    ConcreteDataType::timestamp_millisecond_datatype(),
882                ),
883                (1, SemanticType::Tag, ConcreteDataType::string_datatype()),
884                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
885                (3, SemanticType::Tag, ConcreteDataType::string_datatype()),
886            ],
887            &[1, 3],
888        );
889        expected_metadata.primary_key_encoding = PrimaryKeyEncoding::Sparse;
890        let expected_metadata = Arc::new(expected_metadata);
891
892        let mapper = FlatProjectionMapper::all(&expected_metadata).unwrap();
893        let read_format = FlatReadFormat::new(
894            actual_metadata.clone(),
895            [0, 1, 2, 3].into_iter(),
896            None,
897            "test",
898            false,
899        )
900        .unwrap();
901        let format_projection = read_format.format_projection();
902
903        let compat_batch =
904            FlatCompatBatch::try_new(&mapper, &actual_metadata, format_projection, false)
905                .unwrap()
906                .unwrap();
907
908        // Tag array.
909        let mut tag1_builder = StringDictionaryBuilder::<UInt32Type>::new();
910        tag1_builder.append_value("tag1");
911        tag1_builder.append_value("tag1");
912        let tag1_dict_array = Arc::new(tag1_builder.finish());
913
914        let k1 = encode_key(&[Some("tag1")]);
915        let input_columns: Vec<ArrayRef> = vec![
916            tag1_dict_array.clone(),
917            Arc::new(Int64Array::from(vec![100, 200])),
918            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
919            build_flat_test_pk_array(&[&k1, &k1]),
920            Arc::new(UInt64Array::from_iter_values([1, 2])),
921            Arc::new(UInt8Array::from_iter_values([
922                OpType::Put as u8,
923                OpType::Put as u8,
924            ])),
925        ];
926        let input_schema =
927            to_flat_sst_arrow_schema(&actual_metadata, &FlatSchemaOptions::default());
928        let input_batch = RecordBatch::try_new(input_schema, input_columns).unwrap();
929
930        let result = compat_batch.compat(input_batch).unwrap();
931
932        let sparse_k1 = encode_sparse_key(&[(1, Some("tag1")), (3, None)]);
933        let mut null_tag_builder = StringDictionaryBuilder::<UInt32Type>::new();
934        null_tag_builder.append_nulls(2);
935        let null_tag_dict_array = Arc::new(null_tag_builder.finish());
936        let expected_columns: Vec<ArrayRef> = vec![
937            tag1_dict_array.clone(),
938            null_tag_dict_array,
939            Arc::new(Int64Array::from(vec![100, 200])),
940            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
941            build_flat_test_pk_array(&[&sparse_k1, &sparse_k1]),
942            Arc::new(UInt64Array::from_iter_values([1, 2])),
943            Arc::new(UInt8Array::from_iter_values([
944                OpType::Put as u8,
945                OpType::Put as u8,
946            ])),
947        ];
948        let output_schema =
949            to_flat_sst_arrow_schema(&expected_metadata, &FlatSchemaOptions::default());
950        let expected_batch = RecordBatch::try_new(output_schema, expected_columns).unwrap();
951
952        assert_eq!(expected_batch, result);
953    }
954
955    #[test]
956    fn test_flat_compat_batch_compact_sparse() {
957        let mut actual_metadata = new_metadata(
958            &[
959                (
960                    0,
961                    SemanticType::Timestamp,
962                    ConcreteDataType::timestamp_millisecond_datatype(),
963                ),
964                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
965            ],
966            &[],
967        );
968        actual_metadata.primary_key_encoding = PrimaryKeyEncoding::Sparse;
969        let actual_metadata = Arc::new(actual_metadata);
970
971        let mut expected_metadata = new_metadata(
972            &[
973                (
974                    0,
975                    SemanticType::Timestamp,
976                    ConcreteDataType::timestamp_millisecond_datatype(),
977                ),
978                (2, SemanticType::Field, ConcreteDataType::int64_datatype()),
979                (3, SemanticType::Field, ConcreteDataType::int64_datatype()),
980            ],
981            &[],
982        );
983        expected_metadata.primary_key_encoding = PrimaryKeyEncoding::Sparse;
984        let expected_metadata = Arc::new(expected_metadata);
985
986        let mapper = FlatProjectionMapper::all(&expected_metadata).unwrap();
987        let read_format = FlatReadFormat::new(
988            actual_metadata.clone(),
989            [0, 2, 3].into_iter(),
990            None,
991            "test",
992            true,
993        )
994        .unwrap();
995        let format_projection = read_format.format_projection();
996
997        let compat_batch =
998            FlatCompatBatch::try_new(&mapper, &actual_metadata, format_projection, true)
999                .unwrap()
1000                .unwrap();
1001
1002        let sparse_k1 = encode_sparse_key(&[]);
1003        let input_columns: Vec<ArrayRef> = vec![
1004            Arc::new(Int64Array::from(vec![100, 200])),
1005            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
1006            build_flat_test_pk_array(&[&sparse_k1, &sparse_k1]),
1007            Arc::new(UInt64Array::from_iter_values([1, 2])),
1008            Arc::new(UInt8Array::from_iter_values([
1009                OpType::Put as u8,
1010                OpType::Put as u8,
1011            ])),
1012        ];
1013        let input_schema =
1014            to_flat_sst_arrow_schema(&actual_metadata, &FlatSchemaOptions::default());
1015        let input_batch = RecordBatch::try_new(input_schema, input_columns).unwrap();
1016
1017        let result = compat_batch.compat(input_batch).unwrap();
1018
1019        let expected_columns: Vec<ArrayRef> = vec![
1020            Arc::new(Int64Array::from(vec![100, 200])),
1021            Arc::new(Int64Array::from(vec![None::<i64>, None::<i64>])),
1022            Arc::new(TimestampMillisecondArray::from_iter_values([1000, 2000])),
1023            build_flat_test_pk_array(&[&sparse_k1, &sparse_k1]),
1024            Arc::new(UInt64Array::from_iter_values([1, 2])),
1025            Arc::new(UInt8Array::from_iter_values([
1026                OpType::Put as u8,
1027                OpType::Put as u8,
1028            ])),
1029        ];
1030        let output_schema =
1031            to_flat_sst_arrow_schema(&expected_metadata, &FlatSchemaOptions::default());
1032        let expected_batch = RecordBatch::try_new(output_schema, expected_columns).unwrap();
1033
1034        assert_eq!(expected_batch, result);
1035    }
1036}