Skip to main content

mito2/read/
flat_projection.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Utilities for projection on flat format.
16
17use std::sync::Arc;
18
19use api::v1::SemanticType;
20use common_error::ext::BoxedError;
21use common_recordbatch::error::{
22    ArrowComputeSnafu, DataTypesSnafu, ExternalSnafu, NewDfRecordBatchSnafu,
23};
24use common_recordbatch::{DfRecordBatch, RecordBatch};
25use datatypes::arrow::array::Array;
26use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
27use datatypes::extension::json::is_json_extension_type;
28use datatypes::prelude::{ConcreteDataType, DataType};
29use datatypes::schema::{Schema, SchemaRef};
30use datatypes::value::Value;
31use datatypes::vectors::Helper;
32use datatypes::vectors::json::array::JsonArray;
33use snafu::{OptionExt, ResultExt};
34use store_api::metadata::{RegionMetadata, RegionMetadataRef};
35use store_api::storage::ColumnId;
36
37use crate::cache::CacheStrategy;
38use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
39use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
40use crate::read::read_columns::ReadColumns;
41use crate::sst::parquet::flat_format::sst_column_id_indices;
42use crate::sst::parquet::format::FormatProjection;
43use crate::sst::{
44    FlatSchemaOptions, internal_fields, tag_maybe_to_dictionary_field, to_flat_sst_arrow_schema,
45    with_field_id,
46};
47
/// Handles projection and converts batches in flat format with correct schema.
///
/// This mapper supports duplicate and unsorted projection indices.
/// The output schema is determined by the projection indices.
pub struct FlatProjectionMapper {
    /// Metadata of the region.
    metadata: RegionMetadataRef,
    /// Schema of the converted [RecordBatch] to return.
    output_schema: SchemaRef,
    /// The columns to read from memtables and SSTs.
    /// The mapper won't deduplicate the column ids.
    ///
    /// Note that this doesn't contain the `__table_id` and `__tsid`.
    read_cols: ReadColumns,
    /// Ids and data types of the columns of the expected batch.
    /// We can use this to check if the batch is compatible with the expected schema.
    ///
    /// It doesn't contain internal columns but always contains the time index column.
    batch_schema: Vec<(ColumnId, ConcreteDataType)>,
    /// `true` if the original projection is empty.
    is_empty_projection: bool,
    /// The index in the flat format [RecordBatch] for each column in the output [RecordBatch].
    batch_indices: Vec<usize>,
    /// Precomputed Arrow schema for input batches.
    input_arrow_schema: datatypes::arrow::datatypes::SchemaRef,
}
74
75impl FlatProjectionMapper {
76    /// Returns a new mapper with projection.
77    /// If `projection` is empty, it outputs [RecordBatch] without any column but only a row count.
78    /// `SELECT COUNT(*) FROM table` is an example that uses an empty projection. DataFusion accepts
79    /// empty `RecordBatch` and only use its row count in this query.
80    pub fn new(
81        metadata: &RegionMetadataRef,
82        projection: impl IntoIterator<Item = usize>,
83    ) -> Result<Self> {
84        let projection: Vec<_> = projection.into_iter().collect();
85        let read_column_ids = read_column_ids_from_projection(metadata, &projection)?;
86        let read_cols = ReadColumns::from_deduped_column_ids(read_column_ids);
87        Self::new_with_read_columns(metadata, projection, read_cols)
88    }
89
90    /// Returns a new mapper with output projection and explicit read columns.
91    pub fn new_with_read_columns(
92        metadata: &RegionMetadataRef,
93        projection: Vec<usize>,
94        read_cols: ReadColumns,
95    ) -> Result<Self> {
96        // If the original projection is empty.
97        let is_empty_projection = projection.is_empty();
98
99        // Output column schemas for the projection.
100        let mut col_schemas = Vec::with_capacity(projection.len());
101        // Column ids of the output projection without deduplication.
102        let mut output_col_ids = Vec::with_capacity(projection.len());
103        for idx in &projection {
104            let col = metadata
105                .column_metadatas
106                .get(*idx)
107                .with_context(|| InvalidRequestSnafu {
108                    region_id: metadata.region_id,
109                    reason: format!("projection index {} is out of bound", idx),
110                })?;
111            output_col_ids.push(col.column_id);
112            col_schemas.push(col.column_schema.clone());
113        }
114
115        // Creates a map to lookup index.
116        let id_to_index = sst_column_id_indices(metadata);
117
118        // TODO(yingwen): Support different flat schema options.
119        let format_projection = FormatProjection::compute_format_projection(
120            &id_to_index,
121            // All columns with internal columns.
122            metadata.column_metadatas.len() + 3,
123            read_cols.clone(),
124        );
125
126        let batch_schema = flat_projected_columns(metadata, &format_projection);
127
128        // Safety: We get the column id from the metadata.
129        let input_arrow_schema = compute_input_arrow_schema(metadata, &batch_schema);
130
131        // If projection is empty, we don't output any column.
132        let output_schema = if is_empty_projection {
133            Arc::new(Schema::new(vec![]))
134        } else {
135            // Safety: Columns come from existing schema.
136            Arc::new(Schema::new(col_schemas))
137        };
138
139        let batch_indices = if is_empty_projection {
140            vec![]
141        } else {
142            output_col_ids
143                .iter()
144                .map(|id| {
145                    // Safety: The map is computed from the read projection.
146                    format_projection
147                        .column_id_to_projected_index
148                        .get(id)
149                        .copied()
150                        .with_context(|| {
151                            let name = metadata
152                                .column_by_id(*id)
153                                .map(|column| column.column_schema.name.clone())
154                                .unwrap_or_else(|| id.to_string());
155                            InvalidRequestSnafu {
156                                region_id: metadata.region_id,
157                                reason: format!(
158                                    "output column {} is missing in read projection",
159                                    name
160                                ),
161                            }
162                        })
163                })
164                .collect::<Result<Vec<_>>>()?
165        };
166
167        Ok(FlatProjectionMapper {
168            metadata: metadata.clone(),
169            output_schema,
170            read_cols,
171            batch_schema,
172            is_empty_projection,
173            batch_indices,
174            input_arrow_schema,
175        })
176    }
177
178    /// Returns a new mapper without projection.
179    pub fn all(metadata: &RegionMetadataRef) -> Result<Self> {
180        FlatProjectionMapper::new(metadata, 0..metadata.column_metadatas.len())
181    }
182
183    /// Returns the metadata that created the mapper.
184    pub(crate) fn metadata(&self) -> &RegionMetadataRef {
185        &self.metadata
186    }
187    /// Returns projected columns that we need to read from memtables and SSTs.
188    pub(crate) fn read_columns(&self) -> &ReadColumns {
189        &self.read_cols
190    }
191
192    /// Returns the field column start index in output batch.
193    pub(crate) fn field_column_start(&self) -> usize {
194        for (idx, column_id) in self
195            .batch_schema
196            .iter()
197            .map(|(column_id, _)| column_id)
198            .enumerate()
199        {
200            // Safety: We get the column id from the metadata in new().
201            if self
202                .metadata
203                .column_by_id(*column_id)
204                .unwrap()
205                .semantic_type
206                == SemanticType::Field
207            {
208                return idx;
209            }
210        }
211
212        self.batch_schema.len()
213    }
214
215    /// Returns ids of columns of the batch that the mapper expects to convert.
216    pub(crate) fn batch_schema(&self) -> &[(ColumnId, ConcreteDataType)] {
217        &self.batch_schema
218    }
219
220    /// Returns the input arrow schema from sources.
221    ///
222    /// The merge reader can use this schema.
223    pub(crate) fn input_arrow_schema(
224        &self,
225        compaction: bool,
226    ) -> datatypes::arrow::datatypes::SchemaRef {
227        if !compaction {
228            self.input_arrow_schema.clone()
229        } else {
230            // For compaction, we need to build a different schema from encoding.
231            to_flat_sst_arrow_schema(
232                &self.metadata,
233                &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
234            )
235        }
236    }
237
238    /// Returns the schema of converted [RecordBatch].
239    /// This is the schema that the stream will output. This schema may contain
240    /// less columns than [FlatProjectionMapper::column_ids()].
241    pub(crate) fn output_schema(&self) -> SchemaRef {
242        self.output_schema.clone()
243    }
244
245    pub(crate) fn with_output_schema(&mut self, schema: SchemaRef) {
246        self.output_schema = schema;
247    }
248
249    /// Converts a flat format [RecordBatch] to a normal [RecordBatch].
250    ///
251    /// The batch must match the `projection` using to build the mapper.
252    pub(crate) fn convert(
253        &self,
254        batch: &datatypes::arrow::record_batch::RecordBatch,
255        cache_strategy: &CacheStrategy,
256    ) -> common_recordbatch::error::Result<RecordBatch> {
257        if self.is_empty_projection {
258            return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
259        }
260        // Construct output record batch directly from Arrow arrays to avoid
261        // Arrow -> Vector -> Arrow roundtrips in the hot path.
262        let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
263        for (output_idx, index) in self.batch_indices.iter().enumerate() {
264            let mut array = batch.column(*index).clone();
265            // Cast dictionary values to the target type.
266            if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
267                // When a string dictionary column contains only a single value, reuse a cached
268                // repeated vector to avoid repeatedly expanding the dictionary.
269                if let Some(dict_array) = single_value_string_dictionary(
270                    &array,
271                    &self.output_schema.column_schemas()[output_idx].data_type,
272                    value_type.as_ref(),
273                ) {
274                    let dict_values = dict_array.values();
275                    let value = if dict_values.is_null(0) {
276                        Value::Null
277                    } else {
278                        Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
279                    };
280
281                    let repeated = repeated_vector_with_cache(
282                        &self.output_schema.column_schemas()[output_idx].data_type,
283                        &value,
284                        batch.num_rows(),
285                        cache_strategy,
286                    )?;
287                    array = repeated.to_arrow_array();
288                } else {
289                    let casted = datatypes::arrow::compute::cast(&array, value_type)
290                        .context(ArrowComputeSnafu)?;
291                    array = casted;
292                }
293            }
294
295            let field = &self.output_schema.arrow_schema().fields()[output_idx];
296            if is_json_extension_type(field) {
297                array = JsonArray::from(&array)
298                    .try_align(field.data_type())
299                    .context(DataTypesSnafu)?;
300            }
301
302            arrays.push(array);
303        }
304
305        let df_record_batch =
306            DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
307                .context(NewDfRecordBatchSnafu)?;
308        Ok(RecordBatch::from_df_record_batch(
309            self.output_schema.clone(),
310            df_record_batch,
311        ))
312    }
313
314    /// Projects columns from the input batch and converts them into vectors.
315    pub(crate) fn project_vectors(
316        &self,
317        batch: &datatypes::arrow::record_batch::RecordBatch,
318    ) -> common_recordbatch::error::Result<Vec<datatypes::vectors::VectorRef>> {
319        let mut columns = Vec::with_capacity(self.output_schema.num_columns());
320        for index in &self.batch_indices {
321            let mut array = batch.column(*index).clone();
322            // Casts dictionary values to the target type.
323            if let datatypes::arrow::datatypes::DataType::Dictionary(_key_type, value_type) =
324                array.data_type()
325            {
326                let casted = datatypes::arrow::compute::cast(&array, value_type)
327                    .context(ArrowComputeSnafu)?;
328                array = casted;
329            }
330            let vector = Helper::try_into_vector(array)
331                .map_err(BoxedError::new)
332                .context(ExternalSnafu)?;
333            columns.push(vector);
334        }
335        Ok(columns)
336    }
337}
338
339fn single_value_string_dictionary<'a>(
340    array: &'a Arc<dyn Array>,
341    output_type: &ConcreteDataType,
342    value_type: &ArrowDataType,
343) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
344    if !matches!(
345        value_type,
346        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
347    ) || !output_type.is_string()
348    {
349        return None;
350    }
351
352    let dict_array = array
353        .as_any()
354        .downcast_ref::<datatypes::arrow::array::DictionaryArray<
355            datatypes::arrow::datatypes::UInt32Type,
356        >>()?;
357
358    (dict_array.values().len() == 1 && dict_array.null_count() == 0).then_some(dict_array)
359}
360
361/// Returns ids and datatypes of columns of the output batch after applying the `projection`.
362///
363/// It adds the time index column if it doesn't present in the projection.
364pub(crate) fn flat_projected_columns(
365    metadata: &RegionMetadata,
366    format_projection: &FormatProjection,
367) -> Vec<(ColumnId, ConcreteDataType)> {
368    let time_index = metadata.time_index_column();
369    let num_columns = if format_projection
370        .column_id_to_projected_index
371        .contains_key(&time_index.column_id)
372    {
373        format_projection.column_id_to_projected_index.len()
374    } else {
375        format_projection.column_id_to_projected_index.len() + 1
376    };
377    let mut schema = vec![None; num_columns];
378    for (column_id, index) in &format_projection.column_id_to_projected_index {
379        // Safety: FormatProjection ensures the id is valid.
380        schema[*index] = Some((
381            *column_id,
382            metadata
383                .column_by_id(*column_id)
384                .unwrap()
385                .column_schema
386                .data_type
387                .clone(),
388        ));
389    }
390    if num_columns != format_projection.column_id_to_projected_index.len() {
391        schema[num_columns - 1] = Some((
392            time_index.column_id,
393            time_index.column_schema.data_type.clone(),
394        ));
395    }
396
397    // Safety: FormatProjection ensures all indices can be unwrapped.
398    schema.into_iter().map(|id_type| id_type.unwrap()).collect()
399}
400
401/// Computes the Arrow schema for input batches.
402///
403/// # Panics
404/// Panics if it can't find the column by the column id in the batch_schema.
405pub(crate) fn compute_input_arrow_schema(
406    metadata: &RegionMetadata,
407    batch_schema: &[(ColumnId, ConcreteDataType)],
408) -> datatypes::arrow::datatypes::SchemaRef {
409    let mut new_fields = Vec::with_capacity(batch_schema.len() + 3);
410    for (column_id, _) in batch_schema {
411        let column_metadata = metadata.column_by_id(*column_id).unwrap();
412        let field = Field::new(
413            &column_metadata.column_schema.name,
414            column_metadata.column_schema.data_type.as_arrow_type(),
415            column_metadata.column_schema.is_nullable(),
416        );
417        let field = with_field_id(field, *column_id);
418        if column_metadata.semantic_type == SemanticType::Tag {
419            new_fields.push(tag_maybe_to_dictionary_field(
420                &column_metadata.column_schema.data_type,
421                &Arc::new(field),
422            ));
423        } else {
424            new_fields.push(Arc::new(field));
425        }
426    }
427    new_fields.extend_from_slice(&internal_fields());
428
429    Arc::new(datatypes::arrow::datatypes::Schema::new(new_fields))
430}
431
/// Helper to project compaction batches into flat format columns
/// (fields + time index + __primary_key + __sequence + __op_type).
pub(crate) struct CompactionProjectionMapper {
    /// Mapper that projects the field columns and the time index column.
    mapper: FlatProjectionMapper,
    /// Assembler that appends the internal columns to the projected columns.
    assembler: DfBatchAssembler,
}
438
439impl CompactionProjectionMapper {
440    pub(crate) fn try_new(metadata: &RegionMetadataRef) -> Result<Self> {
441        let projection = metadata
442            .column_metadatas
443            .iter()
444            .enumerate()
445            .filter_map(|(idx, col)| {
446                if matches!(col.semantic_type, SemanticType::Field) {
447                    Some(idx)
448                } else {
449                    None
450                }
451            })
452            .chain([metadata.time_index_column_pos()])
453            .collect::<Vec<_>>();
454
455        let read_col_ids = metadata.column_metadatas.iter().map(|col| col.column_id);
456        let read_cols = ReadColumns::from_deduped_column_ids(read_col_ids);
457        let mapper = FlatProjectionMapper::new_with_read_columns(metadata, projection, read_cols)?;
458        let assembler = DfBatchAssembler::new(mapper.output_schema());
459
460        Ok(Self { mapper, assembler })
461    }
462
463    /// Projects columns and appends internal columns for compaction output.
464    ///
465    /// The input batch is expected to be in flat format with internal columns appended.
466    pub(crate) fn project(&self, batch: DfRecordBatch) -> Result<DfRecordBatch> {
467        let columns = self
468            .mapper
469            .project_vectors(&batch)
470            .context(RecordBatchSnafu)?;
471        self.assembler
472            .build_df_record_batch_with_internal(&batch, columns)
473            .context(RecordBatchSnafu)
474    }
475}
476
/// Builds [DfRecordBatch] with internal columns appended.
pub(crate) struct DfBatchAssembler {
    /// Arrow schema of the output: fields of the output schema followed by the internal fields.
    output_arrow_schema_with_internal: datatypes::arrow::datatypes::SchemaRef,
}
481
482impl DfBatchAssembler {
483    /// Precomputes the output schema with internal columns.
484    pub(crate) fn new(output_schema: SchemaRef) -> Self {
485        let fields = output_schema
486            .arrow_schema()
487            .fields()
488            .into_iter()
489            .chain(internal_fields().iter())
490            .cloned()
491            .collect::<Vec<_>>();
492        let output_arrow_schema_with_internal =
493            Arc::new(datatypes::arrow::datatypes::Schema::new(fields));
494        Self {
495            output_arrow_schema_with_internal,
496        }
497    }
498
499    /// Builds a [DfRecordBatch] from projected vectors plus internal columns.
500    ///
501    /// Assumes the input batch already contains internal columns as the last three fields
502    /// ("__primary_key", "__sequence", "__op_type").
503    pub(crate) fn build_df_record_batch_with_internal(
504        &self,
505        batch: &datatypes::arrow::record_batch::RecordBatch,
506        mut columns: Vec<datatypes::vectors::VectorRef>,
507    ) -> common_recordbatch::error::Result<DfRecordBatch> {
508        let num_columns = batch.columns().len();
509        // The last 3 columns are the internal columns.
510        let internal_indices = [num_columns - 3, num_columns - 2, num_columns - 1];
511        for index in internal_indices.iter() {
512            let array = batch.column(*index).clone();
513            let vector = Helper::try_into_vector(array)
514                .map_err(BoxedError::new)
515                .context(ExternalSnafu)?;
516            columns.push(vector);
517        }
518        RecordBatch::to_df_record_batch(self.output_arrow_schema_with_internal.clone(), columns)
519    }
520}