Skip to main content

mito2/read/
flat_projection.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Utilities for projection on flat format.
16
17use std::sync::Arc;
18
19use api::v1::SemanticType;
20use common_error::ext::BoxedError;
21use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu};
22use common_recordbatch::{DfRecordBatch, RecordBatch};
23use datatypes::arrow::array::Array;
24use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
25use datatypes::prelude::{ConcreteDataType, DataType};
26use datatypes::schema::{Schema, SchemaRef};
27use datatypes::value::Value;
28use datatypes::vectors::Helper;
29use snafu::{OptionExt, ResultExt};
30use store_api::metadata::{RegionMetadata, RegionMetadataRef};
31use store_api::storage::ColumnId;
32
33use crate::cache::CacheStrategy;
34use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
35use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
36use crate::sst::parquet::flat_format::sst_column_id_indices;
37use crate::sst::parquet::format::FormatProjection;
38use crate::sst::{
39    FlatSchemaOptions, internal_fields, tag_maybe_to_dictionary_field, to_flat_sst_arrow_schema,
40    with_field_id,
41};
42
/// Handles projection and converts batches in flat format with correct schema.
///
/// This mapper supports duplicate and unsorted projection indices.
/// The output schema is determined by the projection indices.
pub struct FlatProjectionMapper {
    /// Metadata of the region.
    metadata: RegionMetadataRef,
    /// Schema of the converted [RecordBatch] to return.
    output_schema: SchemaRef,
    /// Ids of columns to read from memtables and SSTs.
    /// The mapper won't deduplicate the column ids.
    ///
    /// Note that this doesn't contain the `__table_id` and `__tsid`.
    read_column_ids: Vec<ColumnId>,
    /// Ids and data types of the columns of the expected batch.
    /// We can use this to check if the batch is compatible with the expected schema.
    ///
    /// It doesn't contain internal columns but always contains the time index column.
    batch_schema: Vec<(ColumnId, ConcreteDataType)>,
    /// `true` if the original projection is empty.
    is_empty_projection: bool,
    /// The index in the flat format [RecordBatch] for each column in the output [RecordBatch].
    batch_indices: Vec<usize>,
    /// Precomputed Arrow schema for input batches.
    input_arrow_schema: datatypes::arrow::datatypes::SchemaRef,
}
69
70impl FlatProjectionMapper {
71    /// Returns a new mapper with projection.
72    /// If `projection` is empty, it outputs [RecordBatch] without any column but only a row count.
73    /// `SELECT COUNT(*) FROM table` is an example that uses an empty projection. DataFusion accepts
74    /// empty `RecordBatch` and only use its row count in this query.
75    pub fn new(
76        metadata: &RegionMetadataRef,
77        projection: impl Iterator<Item = usize>,
78    ) -> Result<Self> {
79        let projection: Vec<_> = projection.collect();
80        let read_column_ids = read_column_ids_from_projection(metadata, &projection)?;
81        Self::new_with_read_columns(metadata, projection, read_column_ids)
82    }
83
84    /// Returns a new mapper with output projection and explicit read columns.
85    pub fn new_with_read_columns(
86        metadata: &RegionMetadataRef,
87        projection: Vec<usize>,
88        read_column_ids: Vec<ColumnId>,
89    ) -> Result<Self> {
90        // If the original projection is empty.
91        let is_empty_projection = projection.is_empty();
92
93        // Output column schemas for the projection.
94        let mut column_schemas = Vec::with_capacity(projection.len());
95        // Column ids of the output projection without deduplication.
96        let mut output_column_ids = Vec::with_capacity(projection.len());
97        for idx in &projection {
98            // For each projection index, we get the column id for projection.
99            let column =
100                metadata
101                    .column_metadatas
102                    .get(*idx)
103                    .with_context(|| InvalidRequestSnafu {
104                        region_id: metadata.region_id,
105                        reason: format!("projection index {} is out of bound", idx),
106                    })?;
107
108            output_column_ids.push(column.column_id);
109            // Safety: idx is valid.
110            column_schemas.push(metadata.schema.column_schemas()[*idx].clone());
111        }
112
113        // Creates a map to lookup index.
114        let id_to_index = sst_column_id_indices(metadata);
115        // TODO(yingwen): Support different flat schema options.
116        let format_projection = FormatProjection::compute_format_projection(
117            &id_to_index,
118            // All columns with internal columns.
119            metadata.column_metadatas.len() + 3,
120            read_column_ids.iter().copied(),
121        );
122
123        let batch_schema = flat_projected_columns(metadata, &format_projection);
124
125        // Safety: We get the column id from the metadata.
126        let input_arrow_schema = compute_input_arrow_schema(metadata, &batch_schema);
127
128        // If projection is empty, we don't output any column.
129        let output_schema = if is_empty_projection {
130            Arc::new(Schema::new(vec![]))
131        } else {
132            // Safety: Columns come from existing schema.
133            Arc::new(Schema::new(column_schemas))
134        };
135
136        let batch_indices = if is_empty_projection {
137            vec![]
138        } else {
139            output_column_ids
140                .iter()
141                .map(|id| {
142                    // Safety: The map is computed from the read projection.
143                    format_projection
144                        .column_id_to_projected_index
145                        .get(id)
146                        .copied()
147                        .with_context(|| {
148                            let name = metadata
149                                .column_by_id(*id)
150                                .map(|column| column.column_schema.name.clone())
151                                .unwrap_or_else(|| id.to_string());
152                            InvalidRequestSnafu {
153                                region_id: metadata.region_id,
154                                reason: format!(
155                                    "output column {} is missing in read projection",
156                                    name
157                                ),
158                            }
159                        })
160                })
161                .collect::<Result<Vec<_>>>()?
162        };
163
164        Ok(FlatProjectionMapper {
165            metadata: metadata.clone(),
166            output_schema,
167            read_column_ids,
168            batch_schema,
169            is_empty_projection,
170            batch_indices,
171            input_arrow_schema,
172        })
173    }
174
175    /// Returns a new mapper without projection.
176    pub fn all(metadata: &RegionMetadataRef) -> Result<Self> {
177        FlatProjectionMapper::new(metadata, 0..metadata.column_metadatas.len())
178    }
179
180    /// Returns the metadata that created the mapper.
181    pub(crate) fn metadata(&self) -> &RegionMetadataRef {
182        &self.metadata
183    }
184
185    /// Returns ids of projected columns that we need to read
186    /// from memtables and SSTs.
187    pub(crate) fn column_ids(&self) -> &[ColumnId] {
188        &self.read_column_ids
189    }
190
191    /// Returns the field column start index in output batch.
192    pub(crate) fn field_column_start(&self) -> usize {
193        for (idx, column_id) in self
194            .batch_schema
195            .iter()
196            .map(|(column_id, _)| column_id)
197            .enumerate()
198        {
199            // Safety: We get the column id from the metadata in new().
200            if self
201                .metadata
202                .column_by_id(*column_id)
203                .unwrap()
204                .semantic_type
205                == SemanticType::Field
206            {
207                return idx;
208            }
209        }
210
211        self.batch_schema.len()
212    }
213
214    /// Returns ids of columns of the batch that the mapper expects to convert.
215    pub(crate) fn batch_schema(&self) -> &[(ColumnId, ConcreteDataType)] {
216        &self.batch_schema
217    }
218
219    /// Returns the input arrow schema from sources.
220    ///
221    /// The merge reader can use this schema.
222    pub(crate) fn input_arrow_schema(
223        &self,
224        compaction: bool,
225    ) -> datatypes::arrow::datatypes::SchemaRef {
226        if !compaction {
227            self.input_arrow_schema.clone()
228        } else {
229            // For compaction, we need to build a different schema from encoding.
230            to_flat_sst_arrow_schema(
231                &self.metadata,
232                &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
233            )
234        }
235    }
236
237    /// Returns the schema of converted [RecordBatch].
238    /// This is the schema that the stream will output. This schema may contain
239    /// less columns than [FlatProjectionMapper::column_ids()].
240    pub(crate) fn output_schema(&self) -> SchemaRef {
241        self.output_schema.clone()
242    }
243
244    /// Converts a flat format [RecordBatch] to a normal [RecordBatch].
245    ///
246    /// The batch must match the `projection` using to build the mapper.
247    pub(crate) fn convert(
248        &self,
249        batch: &datatypes::arrow::record_batch::RecordBatch,
250        cache_strategy: &CacheStrategy,
251    ) -> common_recordbatch::error::Result<RecordBatch> {
252        if self.is_empty_projection {
253            return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
254        }
255        // Construct output record batch directly from Arrow arrays to avoid
256        // Arrow -> Vector -> Arrow roundtrips in the hot path.
257        let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
258        for (output_idx, index) in self.batch_indices.iter().enumerate() {
259            let mut array = batch.column(*index).clone();
260            // Cast dictionary values to the target type.
261            if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
262                // When a string dictionary column contains only a single value, reuse a cached
263                // repeated vector to avoid repeatedly expanding the dictionary.
264                if let Some(dict_array) = single_value_string_dictionary(
265                    &array,
266                    &self.output_schema.column_schemas()[output_idx].data_type,
267                    value_type.as_ref(),
268                ) {
269                    let dict_values = dict_array.values();
270                    let value = if dict_values.is_null(0) {
271                        Value::Null
272                    } else {
273                        Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
274                    };
275
276                    let repeated = repeated_vector_with_cache(
277                        &self.output_schema.column_schemas()[output_idx].data_type,
278                        &value,
279                        batch.num_rows(),
280                        cache_strategy,
281                    )?;
282                    array = repeated.to_arrow_array();
283                } else {
284                    let casted = datatypes::arrow::compute::cast(&array, value_type)
285                        .context(ArrowComputeSnafu)?;
286                    array = casted;
287                }
288            }
289            arrays.push(array);
290        }
291
292        let df_record_batch =
293            DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
294                .context(NewDfRecordBatchSnafu)?;
295        Ok(RecordBatch::from_df_record_batch(
296            self.output_schema.clone(),
297            df_record_batch,
298        ))
299    }
300
301    /// Projects columns from the input batch and converts them into vectors.
302    pub(crate) fn project_vectors(
303        &self,
304        batch: &datatypes::arrow::record_batch::RecordBatch,
305    ) -> common_recordbatch::error::Result<Vec<datatypes::vectors::VectorRef>> {
306        let mut columns = Vec::with_capacity(self.output_schema.num_columns());
307        for index in &self.batch_indices {
308            let mut array = batch.column(*index).clone();
309            // Casts dictionary values to the target type.
310            if let datatypes::arrow::datatypes::DataType::Dictionary(_key_type, value_type) =
311                array.data_type()
312            {
313                let casted = datatypes::arrow::compute::cast(&array, value_type)
314                    .context(ArrowComputeSnafu)?;
315                array = casted;
316            }
317            let vector = Helper::try_into_vector(array)
318                .map_err(BoxedError::new)
319                .context(ExternalSnafu)?;
320            columns.push(vector);
321        }
322        Ok(columns)
323    }
324}
325
326fn single_value_string_dictionary<'a>(
327    array: &'a Arc<dyn Array>,
328    output_type: &ConcreteDataType,
329    value_type: &ArrowDataType,
330) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
331    if !matches!(
332        value_type,
333        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
334    ) || !output_type.is_string()
335    {
336        return None;
337    }
338
339    let dict_array = array
340        .as_any()
341        .downcast_ref::<datatypes::arrow::array::DictionaryArray<
342            datatypes::arrow::datatypes::UInt32Type,
343        >>()?;
344
345    (dict_array.values().len() == 1 && dict_array.null_count() == 0).then_some(dict_array)
346}
347
348/// Returns ids and datatypes of columns of the output batch after applying the `projection`.
349///
350/// It adds the time index column if it doesn't present in the projection.
351pub(crate) fn flat_projected_columns(
352    metadata: &RegionMetadata,
353    format_projection: &FormatProjection,
354) -> Vec<(ColumnId, ConcreteDataType)> {
355    let time_index = metadata.time_index_column();
356    let num_columns = if format_projection
357        .column_id_to_projected_index
358        .contains_key(&time_index.column_id)
359    {
360        format_projection.column_id_to_projected_index.len()
361    } else {
362        format_projection.column_id_to_projected_index.len() + 1
363    };
364    let mut schema = vec![None; num_columns];
365    for (column_id, index) in &format_projection.column_id_to_projected_index {
366        // Safety: FormatProjection ensures the id is valid.
367        schema[*index] = Some((
368            *column_id,
369            metadata
370                .column_by_id(*column_id)
371                .unwrap()
372                .column_schema
373                .data_type
374                .clone(),
375        ));
376    }
377    if num_columns != format_projection.column_id_to_projected_index.len() {
378        schema[num_columns - 1] = Some((
379            time_index.column_id,
380            time_index.column_schema.data_type.clone(),
381        ));
382    }
383
384    // Safety: FormatProjection ensures all indices can be unwrapped.
385    schema.into_iter().map(|id_type| id_type.unwrap()).collect()
386}
387
388/// Computes the Arrow schema for input batches.
389///
390/// # Panics
391/// Panics if it can't find the column by the column id in the batch_schema.
392pub(crate) fn compute_input_arrow_schema(
393    metadata: &RegionMetadata,
394    batch_schema: &[(ColumnId, ConcreteDataType)],
395) -> datatypes::arrow::datatypes::SchemaRef {
396    let mut new_fields = Vec::with_capacity(batch_schema.len() + 3);
397    for (column_id, _) in batch_schema {
398        let column_metadata = metadata.column_by_id(*column_id).unwrap();
399        let field = Field::new(
400            &column_metadata.column_schema.name,
401            column_metadata.column_schema.data_type.as_arrow_type(),
402            column_metadata.column_schema.is_nullable(),
403        );
404        let field = with_field_id(field, *column_id);
405        if column_metadata.semantic_type == SemanticType::Tag {
406            new_fields.push(tag_maybe_to_dictionary_field(
407                &column_metadata.column_schema.data_type,
408                &Arc::new(field),
409            ));
410        } else {
411            new_fields.push(Arc::new(field));
412        }
413    }
414    new_fields.extend_from_slice(&internal_fields());
415
416    Arc::new(datatypes::arrow::datatypes::Schema::new(new_fields))
417}
418
/// Helper to project compaction batches into flat format columns
/// (fields + time index + __primary_key + __sequence + __op_type).
pub(crate) struct CompactionProjectionMapper {
    /// Mapper that projects the field columns and the time index column.
    mapper: FlatProjectionMapper,
    /// Assembler that appends the internal columns to the projected columns.
    assembler: DfBatchAssembler,
}
425
426impl CompactionProjectionMapper {
427    pub(crate) fn try_new(metadata: &RegionMetadataRef) -> Result<Self> {
428        let projection = metadata
429            .column_metadatas
430            .iter()
431            .enumerate()
432            .filter_map(|(idx, col)| {
433                if matches!(col.semantic_type, SemanticType::Field) {
434                    Some(idx)
435                } else {
436                    None
437                }
438            })
439            .chain([metadata.time_index_column_pos()])
440            .collect::<Vec<_>>();
441
442        let mapper = FlatProjectionMapper::new_with_read_columns(
443            metadata,
444            projection,
445            metadata
446                .column_metadatas
447                .iter()
448                .map(|col| col.column_id)
449                .collect(),
450        )?;
451        let assembler = DfBatchAssembler::new(mapper.output_schema());
452
453        Ok(Self { mapper, assembler })
454    }
455
456    /// Projects columns and appends internal columns for compaction output.
457    ///
458    /// The input batch is expected to be in flat format with internal columns appended.
459    pub(crate) fn project(&self, batch: DfRecordBatch) -> Result<DfRecordBatch> {
460        let columns = self
461            .mapper
462            .project_vectors(&batch)
463            .context(RecordBatchSnafu)?;
464        self.assembler
465            .build_df_record_batch_with_internal(&batch, columns)
466            .context(RecordBatchSnafu)
467    }
468}
469
/// Builds [DfRecordBatch] with internal columns appended.
pub(crate) struct DfBatchAssembler {
    /// Precomputed Arrow schema of the output: the fields of the output schema
    /// followed by the internal fields.
    output_arrow_schema_with_internal: datatypes::arrow::datatypes::SchemaRef,
}
474
475impl DfBatchAssembler {
476    /// Precomputes the output schema with internal columns.
477    pub(crate) fn new(output_schema: SchemaRef) -> Self {
478        let fields = output_schema
479            .arrow_schema()
480            .fields()
481            .into_iter()
482            .chain(internal_fields().iter())
483            .cloned()
484            .collect::<Vec<_>>();
485        let output_arrow_schema_with_internal =
486            Arc::new(datatypes::arrow::datatypes::Schema::new(fields));
487        Self {
488            output_arrow_schema_with_internal,
489        }
490    }
491
492    /// Builds a [DfRecordBatch] from projected vectors plus internal columns.
493    ///
494    /// Assumes the input batch already contains internal columns as the last three fields
495    /// ("__primary_key", "__sequence", "__op_type").
496    pub(crate) fn build_df_record_batch_with_internal(
497        &self,
498        batch: &datatypes::arrow::record_batch::RecordBatch,
499        mut columns: Vec<datatypes::vectors::VectorRef>,
500    ) -> common_recordbatch::error::Result<DfRecordBatch> {
501        let num_columns = batch.columns().len();
502        // The last 3 columns are the internal columns.
503        let internal_indices = [num_columns - 3, num_columns - 2, num_columns - 1];
504        for index in internal_indices.iter() {
505            let array = batch.column(*index).clone();
506            let vector = Helper::try_into_vector(array)
507                .map_err(BoxedError::new)
508                .context(ExternalSnafu)?;
509            columns.push(vector);
510        }
511        RecordBatch::to_df_record_batch(self.output_arrow_schema_with_internal.clone(), columns)
512    }
513}