Skip to main content

mito2/memtable/bulk/
row_group_reader.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use bytes::Bytes;
18use parquet::arrow::ProjectionMask;
19use parquet::arrow::arrow_reader::{
20    ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader,
21    ParquetRecordBatchReaderBuilder, RowSelection,
22};
23use parquet::file::metadata::ParquetMetaData;
24use snafu::ResultExt;
25
26use crate::error;
27use crate::error::ReadDataPartSnafu;
28use crate::memtable::bulk::chunk_reader::MemtableChunkReader;
29use crate::memtable::bulk::context::BulkIterContextRef;
30use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
31
32pub(crate) struct MemtableRowGroupReaderBuilder {
33    projection: ProjectionMask,
34    parquet_metadata: Arc<ParquetMetaData>,
35    arrow_metadata: ArrowReaderMetadata,
36    data: Bytes,
37}
38
39impl MemtableRowGroupReaderBuilder {
40    pub(crate) fn try_new(
41        context: &BulkIterContextRef,
42        projection: ProjectionMask,
43        parquet_metadata: Arc<ParquetMetaData>,
44        data: Bytes,
45    ) -> error::Result<Self> {
46        // Create ArrowReaderMetadata for building the reader.
47        let arrow_reader_options =
48            ArrowReaderOptions::new().with_schema(context.read_format().arrow_schema().clone());
49        let arrow_metadata =
50            ArrowReaderMetadata::try_new(parquet_metadata.clone(), arrow_reader_options)
51                .context(ReadDataPartSnafu)?;
52        Ok(Self {
53            projection,
54            parquet_metadata,
55            arrow_metadata,
56            data,
57        })
58    }
59
60    /// Builds a reader to read the row group at `row_group_idx` from memory.
61    pub(crate) fn build_row_group_reader(
62        &self,
63        row_group_idx: usize,
64        row_selection: Option<RowSelection>,
65    ) -> error::Result<ParquetRecordBatchReader> {
66        let chunk_reader = MemtableChunkReader::new(self.data.clone());
67
68        let mut builder = ParquetRecordBatchReaderBuilder::new_with_metadata(
69            chunk_reader,
70            self.arrow_metadata.clone(),
71        )
72        .with_row_groups(vec![row_group_idx])
73        .with_projection(self.projection.clone())
74        .with_batch_size(DEFAULT_READ_BATCH_SIZE);
75
76        if let Some(selection) = row_selection {
77            builder = builder.with_row_selection(selection);
78        }
79
80        builder.build().context(ReadDataPartSnafu)
81    }
82
83    /// Computes whether to skip field filters for a specific row group based on PreFilterMode.
84    pub(crate) fn compute_skip_fields(
85        &self,
86        context: &BulkIterContextRef,
87        row_group_idx: usize,
88    ) -> bool {
89        use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete};
90
91        match context.pre_filter_mode() {
92            PreFilterMode::All => false,
93            PreFilterMode::SkipFields => true,
94            PreFilterMode::SkipFieldsOnDelete => {
95                // Check if this specific row group contains delete op
96                row_group_contains_delete(&self.parquet_metadata, row_group_idx, "memtable")
97                    .unwrap_or(true)
98            }
99        }
100    }
101}