Skip to main content

mito2/read/
flat_projection.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Utilities for projection on flat format.
16
17use std::collections::HashMap;
18use std::sync::Arc;
19
20use api::v1::SemanticType;
21use common_error::ext::BoxedError;
22use common_recordbatch::error::{
23    ArrowComputeSnafu, DataTypesSnafu, ExternalSnafu, NewDfRecordBatchSnafu,
24};
25use common_recordbatch::{DfRecordBatch, RecordBatch};
26use datatypes::arrow::array::Array;
27use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
28use datatypes::extension::json::is_json_extension_type;
29use datatypes::prelude::{ConcreteDataType, DataType};
30use datatypes::schema::{Schema, SchemaRef};
31use datatypes::types::json_type::JsonNativeType;
32use datatypes::value::Value;
33use datatypes::vectors::Helper;
34use datatypes::vectors::json::array::JsonArray;
35use snafu::{OptionExt, ResultExt};
36use store_api::metadata::{RegionMetadata, RegionMetadataRef};
37use store_api::storage::ColumnId;
38
39use crate::cache::CacheStrategy;
40use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
41use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
42use crate::read::read_columns::ReadColumns;
43use crate::sst::parquet::flat_format::sst_column_id_indices;
44use crate::sst::parquet::format::FormatProjection;
45use crate::sst::{
46    FlatSchemaOptions, internal_fields, tag_maybe_to_dictionary_field, to_flat_sst_arrow_schema,
47    with_field_id,
48};
49
50/// Handles projection and converts batches in flat format with correct schema.
51///
52/// This mapper support duplicate and unsorted projection indices.
53/// The output schema is determined by the projection indices.
54pub struct FlatProjectionMapper {
55    /// Metadata of the region.
56    metadata: RegionMetadataRef,
57    /// Schema for converted [RecordBatch] to return.
58    output_schema: SchemaRef,
59    /// The columns to read from memtables and SSTs.
60    /// The mapper won't deduplicate the column ids.
61    ///
62    /// Note that this doesn't contain the `__table_id` and `__tsid`.
63    read_cols: ReadColumns,
64    /// Ids and DataTypes of columns of the expected batch.
65    /// We can use this to check if the batch is compatible with the expected schema.
66    ///
67    /// It doesn't contain internal columns but always contains the time index column.
68    batch_schema: Vec<(ColumnId, ConcreteDataType)>,
69    /// `true` If the original projection is empty.
70    is_empty_projection: bool,
71    /// The index in flat format [RecordBatch] for each column in the output [RecordBatch].
72    batch_indices: Vec<usize>,
73    /// Precomputed Arrow schema for input batches.
74    input_arrow_schema: datatypes::arrow::datatypes::SchemaRef,
75}
76
77impl FlatProjectionMapper {
78    /// Returns a new mapper with projection.
79    /// If `projection` is empty, it outputs [RecordBatch] without any column but only a row count.
80    /// `SELECT COUNT(*) FROM table` is an example that uses an empty projection. DataFusion accepts
81    /// empty `RecordBatch` and only use its row count in this query.
82    pub fn new(
83        metadata: &RegionMetadataRef,
84        projection: impl IntoIterator<Item = usize>,
85    ) -> Result<Self> {
86        let projection: Vec<_> = projection.into_iter().collect();
87        let read_column_ids = read_column_ids_from_projection(metadata, &projection)?;
88        let read_cols = ReadColumns::from_deduped_column_ids(read_column_ids);
89        Self::new_with_read_columns(metadata, projection, read_cols, None)
90    }
91
92    /// Returns a new mapper with output projection and explicit read columns.
93    pub fn new_with_read_columns(
94        metadata: &RegionMetadataRef,
95        projection: Vec<usize>,
96        read_cols: ReadColumns,
97        json_type_hint: Option<&HashMap<String, JsonNativeType>>,
98    ) -> Result<Self> {
99        // If the original projection is empty.
100        let is_empty_projection = projection.is_empty();
101
102        // Output column schemas for the projection.
103        let mut col_schemas = Vec::with_capacity(projection.len());
104        // Column ids of the output projection without deduplication.
105        let mut output_col_ids = Vec::with_capacity(projection.len());
106        for idx in &projection {
107            let col = metadata
108                .column_metadatas
109                .get(*idx)
110                .with_context(|| InvalidRequestSnafu {
111                    region_id: metadata.region_id,
112                    reason: format!("projection index {} is out of bound", idx),
113                })?;
114            output_col_ids.push(col.column_id);
115
116            let mut schema = col.column_schema.clone();
117            if let Some(concretized) = json_type_hint
118                .and_then(|x| x.get(&schema.name))
119                .cloned()
120                .map(ConcreteDataType::json2)
121                && schema.data_type.is_json()
122            {
123                schema.data_type = concretized;
124            }
125            col_schemas.push(schema);
126        }
127
128        // Creates a map to lookup index.
129        let id_to_index = sst_column_id_indices(metadata);
130
131        // TODO(yingwen): Support different flat schema options.
132        let format_projection = FormatProjection::compute_format_projection(
133            &id_to_index,
134            // All columns with internal columns.
135            metadata.column_metadatas.len() + 3,
136            read_cols.clone(),
137        );
138
139        let mut batch_schema = flat_projected_columns(metadata, &format_projection);
140
141        if let Some(json_type_hint) = json_type_hint
142            && !json_type_hint.is_empty()
143        {
144            for (column_id, data_type) in batch_schema.iter_mut() {
145                if let Some(concretized) = metadata
146                    .column_by_id(*column_id)
147                    .and_then(|x| json_type_hint.get(&x.column_schema.name).cloned())
148                    .map(ConcreteDataType::json2)
149                {
150                    *data_type = concretized;
151                }
152            }
153        }
154
155        // Safety: We get the column id from the metadata.
156        let input_arrow_schema = compute_input_arrow_schema(metadata, &batch_schema);
157
158        // If projection is empty, we don't output any column.
159        let output_schema = if is_empty_projection {
160            Arc::new(Schema::new(vec![]))
161        } else {
162            // Safety: Columns come from existing schema.
163            Arc::new(Schema::new(col_schemas))
164        };
165
166        let batch_indices = if is_empty_projection {
167            vec![]
168        } else {
169            output_col_ids
170                .iter()
171                .map(|id| {
172                    // Safety: The map is computed from the read projection.
173                    format_projection
174                        .column_id_to_projected_index
175                        .get(id)
176                        .copied()
177                        .with_context(|| {
178                            let name = metadata
179                                .column_by_id(*id)
180                                .map(|column| column.column_schema.name.clone())
181                                .unwrap_or_else(|| id.to_string());
182                            InvalidRequestSnafu {
183                                region_id: metadata.region_id,
184                                reason: format!(
185                                    "output column {} is missing in read projection",
186                                    name
187                                ),
188                            }
189                        })
190                })
191                .collect::<Result<Vec<_>>>()?
192        };
193
194        Ok(FlatProjectionMapper {
195            metadata: metadata.clone(),
196            output_schema,
197            read_cols,
198            batch_schema,
199            is_empty_projection,
200            batch_indices,
201            input_arrow_schema,
202        })
203    }
204
205    /// Returns a new mapper without projection.
206    pub fn all(metadata: &RegionMetadataRef) -> Result<Self> {
207        FlatProjectionMapper::new(metadata, 0..metadata.column_metadatas.len())
208    }
209
210    /// Returns the metadata that created the mapper.
211    pub(crate) fn metadata(&self) -> &RegionMetadataRef {
212        &self.metadata
213    }
214    /// Returns projected columns that we need to read from memtables and SSTs.
215    pub(crate) fn read_columns(&self) -> &ReadColumns {
216        &self.read_cols
217    }
218
219    /// Returns the field column start index in output batch.
220    pub(crate) fn field_column_start(&self) -> usize {
221        for (idx, column_id) in self
222            .batch_schema
223            .iter()
224            .map(|(column_id, _)| column_id)
225            .enumerate()
226        {
227            // Safety: We get the column id from the metadata in new().
228            if self
229                .metadata
230                .column_by_id(*column_id)
231                .unwrap()
232                .semantic_type
233                == SemanticType::Field
234            {
235                return idx;
236            }
237        }
238
239        self.batch_schema.len()
240    }
241
242    /// Returns ids of columns of the batch that the mapper expects to convert.
243    pub(crate) fn batch_schema(&self) -> &[(ColumnId, ConcreteDataType)] {
244        &self.batch_schema
245    }
246
247    /// Returns the input arrow schema from sources.
248    ///
249    /// The merge reader can use this schema.
250    pub(crate) fn input_arrow_schema(
251        &self,
252        compaction: bool,
253    ) -> datatypes::arrow::datatypes::SchemaRef {
254        if !compaction {
255            self.input_arrow_schema.clone()
256        } else {
257            // For compaction, we need to build a different schema from encoding.
258            let mut options = FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding);
259            options.concretized_json_types = self
260                .input_arrow_schema
261                .fields()
262                .iter()
263                .filter(|&field| is_json_extension_type(field))
264                .map(|field| (field.name().clone(), field.data_type().clone()))
265                .collect();
266            to_flat_sst_arrow_schema(&self.metadata, &options)
267        }
268    }
269
270    /// Returns the schema of converted [RecordBatch].
271    /// This is the schema that the stream will output. This schema may contain
272    /// less columns than [FlatProjectionMapper::column_ids()].
273    pub(crate) fn output_schema(&self) -> SchemaRef {
274        self.output_schema.clone()
275    }
276
277    /// Converts a flat format [RecordBatch] to a normal [RecordBatch].
278    ///
279    /// The batch must match the `projection` using to build the mapper.
280    pub(crate) fn convert(
281        &self,
282        batch: &datatypes::arrow::record_batch::RecordBatch,
283        cache_strategy: &CacheStrategy,
284    ) -> common_recordbatch::error::Result<RecordBatch> {
285        if self.is_empty_projection {
286            return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
287        }
288        // Construct output record batch directly from Arrow arrays to avoid
289        // Arrow -> Vector -> Arrow roundtrips in the hot path.
290        let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
291        for (output_idx, index) in self.batch_indices.iter().enumerate() {
292            let mut array = batch.column(*index).clone();
293            // Cast dictionary values to the target type.
294            if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
295                // When a string dictionary column contains only a single value, reuse a cached
296                // repeated vector to avoid repeatedly expanding the dictionary.
297                if let Some(dict_array) = single_value_string_dictionary(
298                    &array,
299                    &self.output_schema.column_schemas()[output_idx].data_type,
300                    value_type.as_ref(),
301                ) {
302                    let dict_values = dict_array.values();
303                    let value = if dict_values.is_null(0) {
304                        Value::Null
305                    } else {
306                        Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
307                    };
308
309                    let repeated = repeated_vector_with_cache(
310                        &self.output_schema.column_schemas()[output_idx].data_type,
311                        &value,
312                        batch.num_rows(),
313                        cache_strategy,
314                    )?;
315                    array = repeated.to_arrow_array();
316                } else {
317                    let casted = datatypes::arrow::compute::cast(&array, value_type)
318                        .context(ArrowComputeSnafu)?;
319                    array = casted;
320                }
321            }
322
323            let field = &self.output_schema.arrow_schema().fields()[output_idx];
324            if is_json_extension_type(field) {
325                array = JsonArray::from(&array)
326                    .try_align(field.data_type())
327                    .context(DataTypesSnafu)?;
328            }
329
330            arrays.push(array);
331        }
332
333        let df_record_batch =
334            DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
335                .context(NewDfRecordBatchSnafu)?;
336        Ok(RecordBatch::from_df_record_batch(
337            self.output_schema.clone(),
338            df_record_batch,
339        ))
340    }
341
342    /// Projects columns from the input batch and converts them into vectors.
343    pub(crate) fn project_vectors(
344        &self,
345        batch: &datatypes::arrow::record_batch::RecordBatch,
346    ) -> common_recordbatch::error::Result<Vec<datatypes::vectors::VectorRef>> {
347        let mut columns = Vec::with_capacity(self.output_schema.num_columns());
348        for index in &self.batch_indices {
349            let mut array = batch.column(*index).clone();
350            // Casts dictionary values to the target type.
351            if let datatypes::arrow::datatypes::DataType::Dictionary(_key_type, value_type) =
352                array.data_type()
353            {
354                let casted = datatypes::arrow::compute::cast(&array, value_type)
355                    .context(ArrowComputeSnafu)?;
356                array = casted;
357            }
358            let vector = Helper::try_into_vector(array)
359                .map_err(BoxedError::new)
360                .context(ExternalSnafu)?;
361            columns.push(vector);
362        }
363        Ok(columns)
364    }
365}
366
367fn single_value_string_dictionary<'a>(
368    array: &'a Arc<dyn Array>,
369    output_type: &ConcreteDataType,
370    value_type: &ArrowDataType,
371) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
372    if !matches!(
373        value_type,
374        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
375    ) || !output_type.is_string()
376    {
377        return None;
378    }
379
380    let dict_array = array
381        .as_any()
382        .downcast_ref::<datatypes::arrow::array::DictionaryArray<
383            datatypes::arrow::datatypes::UInt32Type,
384        >>()?;
385
386    (dict_array.values().len() == 1 && dict_array.null_count() == 0).then_some(dict_array)
387}
388
389/// Returns ids and datatypes of columns of the output batch after applying the `projection`.
390///
391/// It adds the time index column if it doesn't present in the projection.
392pub(crate) fn flat_projected_columns(
393    metadata: &RegionMetadata,
394    format_projection: &FormatProjection,
395) -> Vec<(ColumnId, ConcreteDataType)> {
396    let time_index = metadata.time_index_column();
397    let num_columns = if format_projection
398        .column_id_to_projected_index
399        .contains_key(&time_index.column_id)
400    {
401        format_projection.column_id_to_projected_index.len()
402    } else {
403        format_projection.column_id_to_projected_index.len() + 1
404    };
405    let mut schema = vec![None; num_columns];
406    for (column_id, index) in &format_projection.column_id_to_projected_index {
407        // Safety: FormatProjection ensures the id is valid.
408        schema[*index] = Some((
409            *column_id,
410            metadata
411                .column_by_id(*column_id)
412                .unwrap()
413                .column_schema
414                .data_type
415                .clone(),
416        ));
417    }
418    if num_columns != format_projection.column_id_to_projected_index.len() {
419        schema[num_columns - 1] = Some((
420            time_index.column_id,
421            time_index.column_schema.data_type.clone(),
422        ));
423    }
424
425    // Safety: FormatProjection ensures all indices can be unwrapped.
426    schema.into_iter().map(|id_type| id_type.unwrap()).collect()
427}
428
429/// Computes the Arrow schema for input batches.
430///
431/// # Panics
432/// Panics if it can't find the column by the column id in the batch_schema.
433pub(crate) fn compute_input_arrow_schema(
434    metadata: &RegionMetadata,
435    batch_schema: &[(ColumnId, ConcreteDataType)],
436) -> datatypes::arrow::datatypes::SchemaRef {
437    let mut new_fields = Vec::with_capacity(batch_schema.len() + 3);
438    for (column_id, data_type) in batch_schema {
439        let column_metadata = metadata.column_by_id(*column_id).unwrap();
440        let field = Field::new(
441            &column_metadata.column_schema.name,
442            data_type.as_arrow_type(),
443            column_metadata.column_schema.is_nullable(),
444        )
445        .with_metadata(column_metadata.column_schema.metadata().clone());
446        let field = with_field_id(field, *column_id);
447        if column_metadata.semantic_type == SemanticType::Tag {
448            new_fields.push(tag_maybe_to_dictionary_field(
449                &column_metadata.column_schema.data_type,
450                &Arc::new(field),
451            ));
452        } else {
453            new_fields.push(Arc::new(field));
454        }
455    }
456    new_fields.extend_from_slice(&internal_fields());
457
458    Arc::new(datatypes::arrow::datatypes::Schema::new(new_fields))
459}
460
461/// Helper to project compaction batches into flat format columns
462/// (fields + time index + __primary_key + __sequence + __op_type).
463pub(crate) struct CompactionProjectionMapper {
464    mapper: FlatProjectionMapper,
465    assembler: DfBatchAssembler,
466}
467
468impl CompactionProjectionMapper {
469    pub(crate) fn try_new(metadata: &RegionMetadataRef) -> Result<Self> {
470        let projection = metadata
471            .column_metadatas
472            .iter()
473            .enumerate()
474            .filter_map(|(idx, col)| {
475                if matches!(col.semantic_type, SemanticType::Field) {
476                    Some(idx)
477                } else {
478                    None
479                }
480            })
481            .chain([metadata.time_index_column_pos()])
482            .collect::<Vec<_>>();
483
484        let read_col_ids = metadata.column_metadatas.iter().map(|col| col.column_id);
485        let read_cols = ReadColumns::from_deduped_column_ids(read_col_ids);
486        let mapper =
487            FlatProjectionMapper::new_with_read_columns(metadata, projection, read_cols, None)?;
488        let assembler = DfBatchAssembler::new(mapper.output_schema());
489
490        Ok(Self { mapper, assembler })
491    }
492
493    /// Projects columns and appends internal columns for compaction output.
494    ///
495    /// The input batch is expected to be in flat format with internal columns appended.
496    pub(crate) fn project(&self, batch: DfRecordBatch) -> Result<DfRecordBatch> {
497        let columns = self
498            .mapper
499            .project_vectors(&batch)
500            .context(RecordBatchSnafu)?;
501        self.assembler
502            .build_df_record_batch_with_internal(&batch, columns)
503            .context(RecordBatchSnafu)
504    }
505}
506
507/// Builds [DfRecordBatch] with internal columns appended.
508pub(crate) struct DfBatchAssembler {
509    output_arrow_schema_with_internal: datatypes::arrow::datatypes::SchemaRef,
510}
511
512impl DfBatchAssembler {
513    /// Precomputes the output schema with internal columns.
514    pub(crate) fn new(output_schema: SchemaRef) -> Self {
515        let fields = output_schema
516            .arrow_schema()
517            .fields()
518            .into_iter()
519            .chain(internal_fields().iter())
520            .cloned()
521            .collect::<Vec<_>>();
522        let output_arrow_schema_with_internal =
523            Arc::new(datatypes::arrow::datatypes::Schema::new(fields));
524        Self {
525            output_arrow_schema_with_internal,
526        }
527    }
528
529    /// Builds a [DfRecordBatch] from projected vectors plus internal columns.
530    ///
531    /// Assumes the input batch already contains internal columns as the last three fields
532    /// ("__primary_key", "__sequence", "__op_type").
533    pub(crate) fn build_df_record_batch_with_internal(
534        &self,
535        batch: &datatypes::arrow::record_batch::RecordBatch,
536        mut columns: Vec<datatypes::vectors::VectorRef>,
537    ) -> common_recordbatch::error::Result<DfRecordBatch> {
538        let num_columns = batch.columns().len();
539        // The last 3 columns are the internal columns.
540        let internal_indices = [num_columns - 3, num_columns - 2, num_columns - 1];
541        for index in internal_indices.iter() {
542            let array = batch.column(*index).clone();
543            let vector = Helper::try_into_vector(array)
544                .map_err(BoxedError::new)
545                .context(ExternalSnafu)?;
546            columns.push(vector);
547        }
548        RecordBatch::to_df_record_batch(self.output_arrow_schema_with_internal.clone(), columns)
549    }
550}