Skip to main content

mito2/read/
prune.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::ops::BitAnd;
16use std::sync::Arc;
17
18use common_recordbatch::filter::SimpleFilterEvaluator;
19use common_time::Timestamp;
20use datatypes::arrow::array::BooleanArray;
21use datatypes::arrow::buffer::BooleanBuffer;
22use datatypes::arrow::record_batch::RecordBatch;
23use snafu::ResultExt;
24
25use crate::error::{RecordBatchSnafu, Result};
26use crate::memtable::BoxedBatchIterator;
27use crate::read::last_row::{FlatRowGroupLastRowCachedReader, RowGroupLastRowCachedReader};
28use crate::read::{Batch, BatchReader};
29use crate::sst::file::FileTimeRange;
30use crate::sst::parquet::file_range::FileRangeContextRef;
31use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupReader};
32
/// Batch source wrapped by [`PruneReader`].
///
/// Each variant holds one concrete reader; `Source::next_batch` forwards to
/// whichever reader is present.
pub enum Source {
    /// Reads batches from a [`RowGroupReader`].
    RowGroup(RowGroupReader),
    /// Reads batches from a [`RowGroupLastRowCachedReader`].
    LastRow(RowGroupLastRowCachedReader),
}
37
38impl Source {
39    async fn next_batch(&mut self) -> Result<Option<Batch>> {
40        match self {
41            Source::RowGroup(r) => r.next_batch().await,
42            Source::LastRow(r) => r.next_batch().await,
43        }
44    }
45}
46
/// Reader that pulls batches from a [`Source`] and prunes them with the
/// precise filter of its file range context.
pub struct PruneReader {
    /// Context for file ranges.
    context: FileRangeContextRef,
    /// Underlying reader that yields the batches to prune.
    source: Source,
    /// Metrics collected by this reader itself, e.g. the number of rows
    /// removed by the precise filter.
    metrics: ReaderMetrics,
    /// Whether to skip field filters for this row group.
    skip_fields: bool,
}
55
56impl PruneReader {
57    pub(crate) fn new_with_row_group_reader(
58        ctx: FileRangeContextRef,
59        reader: RowGroupReader,
60        skip_fields: bool,
61    ) -> Self {
62        Self {
63            context: ctx,
64            source: Source::RowGroup(reader),
65            metrics: Default::default(),
66            skip_fields,
67        }
68    }
69
70    pub(crate) fn new_with_last_row_reader(
71        ctx: FileRangeContextRef,
72        reader: RowGroupLastRowCachedReader,
73        skip_fields: bool,
74    ) -> Self {
75        Self {
76            context: ctx,
77            source: Source::LastRow(reader),
78            metrics: Default::default(),
79            skip_fields,
80        }
81    }
82
83    /// Merge metrics with the inner reader and return the merged metrics.
84    pub(crate) fn metrics(&self) -> ReaderMetrics {
85        let mut metrics = self.metrics.clone();
86        match &self.source {
87            Source::RowGroup(r) => {
88                metrics.merge_from(r.metrics());
89            }
90            Source::LastRow(r) => {
91                if let Some(inner_metrics) = r.metrics() {
92                    metrics.merge_from(inner_metrics);
93                }
94            }
95        }
96
97        metrics
98    }
99
100    pub(crate) async fn next_batch(&mut self) -> Result<Option<Batch>> {
101        while let Some(b) = self.source.next_batch().await? {
102            match self.prune(b)? {
103                Some(b) => {
104                    return Ok(Some(b));
105                }
106                None => {
107                    continue;
108                }
109            }
110        }
111        Ok(None)
112    }
113
114    /// Prunes batches by the pushed down predicate.
115    fn prune(&mut self, batch: Batch) -> Result<Option<Batch>> {
116        // fast path
117        if self.context.filters().is_empty() && !self.context.has_partition_filter() {
118            return Ok(Some(batch));
119        }
120
121        let num_rows_before_filter = batch.num_rows();
122        let Some(batch_filtered) = self.context.precise_filter(batch, self.skip_fields)? else {
123            // the entire batch is filtered out
124            self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
125            return Ok(None);
126        };
127
128        // update metric
129        let filtered_rows = num_rows_before_filter - batch_filtered.num_rows();
130        self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
131
132        if !batch_filtered.is_empty() {
133            Ok(Some(batch_filtered))
134        } else {
135            Ok(None)
136        }
137    }
138}
139
/// An iterator that prunes batches by time range.
///
/// Rows outside `time_range` are removed, the optional `time_filters` are
/// applied on top, and batches left without rows are not yielded.
pub(crate) struct PruneTimeIterator {
    /// Inner iterator that yields the batches to prune.
    iter: BoxedBatchIterator,
    /// Time range to keep; both bounds are inclusive.
    time_range: FileTimeRange,
    /// Precise time filters.
    /// Each one is evaluated against the timestamps of a batch and the
    /// results are combined with a bitwise AND.
    time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
}
147
148impl PruneTimeIterator {
149    /// Creates a new `PruneTimeIterator` with the given iterator and time range.
150    pub(crate) fn new(
151        iter: BoxedBatchIterator,
152        time_range: FileTimeRange,
153        time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
154    ) -> Self {
155        Self {
156            iter,
157            time_range,
158            time_filters,
159        }
160    }
161
162    /// Prune batch by time range.
163    fn prune(&self, batch: Batch) -> Result<Batch> {
164        if batch.is_empty() {
165            return Ok(batch);
166        }
167
168        // fast path, the batch is within the time range.
169        // Note that the time range is inclusive.
170        if self.time_range.0 <= batch.first_timestamp().unwrap()
171            && batch.last_timestamp().unwrap() <= self.time_range.1
172        {
173            return self.prune_by_time_filters(batch, Vec::new());
174        }
175
176        // slow path, prune the batch by time range.
177        // Note that the timestamp precision may be different from the time range.
178        // Safety: We know this is the timestamp type.
179        let unit = batch
180            .timestamps()
181            .data_type()
182            .as_timestamp()
183            .unwrap()
184            .unit();
185        let mut mask = Vec::with_capacity(batch.timestamps().len());
186        let timestamps = batch.timestamps_native().unwrap();
187        for ts in timestamps {
188            let ts = Timestamp::new(*ts, unit);
189            if self.time_range.0 <= ts && ts <= self.time_range.1 {
190                mask.push(true);
191            } else {
192                mask.push(false);
193            }
194        }
195
196        self.prune_by_time_filters(batch, mask)
197    }
198
199    /// Prunes the batch by time filters.
200    /// Also applies existing mask to the batch if the mask is not empty.
201    fn prune_by_time_filters(&self, mut batch: Batch, existing_mask: Vec<bool>) -> Result<Batch> {
202        if let Some(filters) = &self.time_filters {
203            let mut mask = BooleanBuffer::new_set(batch.num_rows());
204            for filter in filters.iter() {
205                let result = filter
206                    .evaluate_vector(batch.timestamps())
207                    .context(RecordBatchSnafu)?;
208                mask = mask.bitand(&result);
209            }
210
211            if !existing_mask.is_empty() {
212                mask = mask.bitand(&BooleanBuffer::from(existing_mask));
213            }
214
215            batch.filter(&BooleanArray::from(mask).into())?;
216        } else if !existing_mask.is_empty() {
217            batch.filter(&BooleanArray::from(existing_mask).into())?;
218        }
219
220        Ok(batch)
221    }
222
223    // Prune and return the next non-empty batch.
224    fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
225        while let Some(batch) = self.iter.next() {
226            let batch = batch?;
227            let pruned_batch = self.prune(batch)?;
228            if !pruned_batch.is_empty() {
229                return Ok(Some(pruned_batch));
230            }
231        }
232        Ok(None)
233    }
234}
235
236impl Iterator for PruneTimeIterator {
237    type Item = Result<Batch>;
238
239    fn next(&mut self) -> Option<Self::Item> {
240        self.next_non_empty_batch().transpose()
241    }
242}
243
/// Record batch source wrapped by [`FlatPruneReader`].
///
/// Each variant holds one concrete reader; `FlatSource::next_batch` forwards
/// to whichever reader is present.
pub enum FlatSource {
    /// Reads record batches from a [`FlatRowGroupReader`].
    RowGroup(FlatRowGroupReader),
    /// Reads record batches from a [`FlatRowGroupLastRowCachedReader`].
    LastRow(FlatRowGroupLastRowCachedReader),
}
248
249impl FlatSource {
250    async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
251        match self {
252            FlatSource::RowGroup(r) => r.next_batch().await,
253            FlatSource::LastRow(r) => r.next_batch().await,
254        }
255    }
256}
257
/// A flat format reader that returns RecordBatch instead of Batch.
///
/// It pulls record batches from a [`FlatSource`] and prunes them with the
/// precise filter of its file range context.
pub struct FlatPruneReader {
    /// Context for file ranges.
    context: FileRangeContextRef,
    /// Underlying reader that yields the record batches to prune.
    source: FlatSource,
    /// Metrics collected by this reader: scan cost, rows and batches read,
    /// and rows removed by the precise filter.
    metrics: ReaderMetrics,
    /// Whether to skip field filters for this row group.
    skip_fields: bool,
}
267
268impl FlatPruneReader {
269    pub(crate) fn new_with_row_group_reader(
270        ctx: FileRangeContextRef,
271        reader: FlatRowGroupReader,
272        skip_fields: bool,
273    ) -> Self {
274        Self {
275            context: ctx,
276            source: FlatSource::RowGroup(reader),
277            metrics: Default::default(),
278            skip_fields,
279        }
280    }
281
282    pub(crate) fn new_with_last_row_reader(
283        ctx: FileRangeContextRef,
284        reader: FlatRowGroupLastRowCachedReader,
285        skip_fields: bool,
286    ) -> Self {
287        Self {
288            context: ctx,
289            source: FlatSource::LastRow(reader),
290            metrics: Default::default(),
291            skip_fields,
292        }
293    }
294
295    /// Returns metrics.
296    pub(crate) fn metrics(&self) -> ReaderMetrics {
297        self.metrics.clone()
298    }
299
300    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
301        loop {
302            let start = std::time::Instant::now();
303            let batch = self.source.next_batch().await?;
304            self.metrics.scan_cost += start.elapsed();
305
306            let Some(record_batch) = batch else {
307                return Ok(None);
308            };
309
310            // Update metrics for the received batch
311            self.metrics.num_rows += record_batch.num_rows();
312            self.metrics.num_batches += 1;
313
314            match self.prune_flat(record_batch)? {
315                Some(filtered_batch) => {
316                    return Ok(Some(filtered_batch));
317                }
318                None => {
319                    continue;
320                }
321            }
322        }
323    }
324
325    /// Prunes batches by the pushed down predicate and returns RecordBatch.
326    fn prune_flat(&mut self, record_batch: RecordBatch) -> Result<Option<RecordBatch>> {
327        // fast path
328        if self.context.filters().is_empty() && !self.context.has_partition_filter() {
329            return Ok(Some(record_batch));
330        }
331
332        let num_rows_before_filter = record_batch.num_rows();
333        let Some(filtered_batch) = self
334            .context
335            .precise_filter_flat(record_batch, self.skip_fields)?
336        else {
337            // the entire batch is filtered out
338            self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
339            return Ok(None);
340        };
341
342        // update metric
343        let filtered_rows = num_rows_before_filter - filtered_batch.num_rows();
344        self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
345
346        if filtered_batch.num_rows() > 0 {
347            Ok(Some(filtered_batch))
348        } else {
349            Ok(None)
350        }
351    }
352}
353
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use datafusion_common::ScalarValue;
    use datafusion_expr::{Expr, col, lit};

    use super::*;
    use crate::test_util::new_batch;

    /// Two-row batch of key `k1` with timestamps 10 and 11.
    fn batch_10_11() -> Batch {
        new_batch(
            b"k1",
            &[10, 11],
            &[20, 20],
            &[OpType::Put, OpType::Put],
            &[110, 111],
        )
    }

    /// Two-row batch of key `k1` with timestamps 15 and 16.
    fn batch_15_16() -> Batch {
        new_batch(
            b"k1",
            &[15, 16],
            &[20, 20],
            &[OpType::Put, OpType::Put],
            &[115, 116],
        )
    }

    /// Two-row batch of key `k1` with timestamps 17 and 18.
    fn batch_17_18() -> Batch {
        new_batch(
            b"k1",
            &[17, 18],
            &[20, 20],
            &[OpType::Put, OpType::Put],
            &[117, 118],
        )
    }

    /// The three batches every filtering test feeds into the iterator.
    fn sample_input() -> Vec<Batch> {
        vec![batch_10_11(), batch_15_16(), batch_17_18()]
    }

    /// Runs `input` through a `PruneTimeIterator` bounded by the inclusive
    /// millisecond range `[start_ms, end_ms]` and collects every yielded batch.
    fn prune_all(
        input: Vec<Batch>,
        start_ms: i64,
        end_ms: i64,
        time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
    ) -> Vec<Batch> {
        let time_range = (
            Timestamp::new_millisecond(start_ms),
            Timestamp::new_millisecond(end_ms),
        );
        let source = input.into_iter().map(Ok);
        PruneTimeIterator::new(Box::new(source), time_range, time_filters)
            .map(|batch| batch.unwrap())
            .collect()
    }

    fn create_time_filters(expr: &[Expr]) -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        let evaluators = expr
            .iter()
            .map(|expr| SimpleFilterEvaluator::try_new(expr).unwrap())
            .collect();
        Some(Arc::new(evaluators))
    }

    /// Filters equivalent to `10 <= ts < 16` in milliseconds.
    fn ts_in_10_16_filters() -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        // We won't use the column name.
        create_time_filters(&[
            col("ts").gt_eq(lit(ScalarValue::TimestampMillisecond(Some(10), None))),
            col("ts").lt(lit(ScalarValue::TimestampMillisecond(Some(16), None))),
        ])
    }

    #[test]
    fn test_prune_time_iter_empty() {
        let actual = prune_all(Vec::new(), 0, 1000, None);
        assert!(actual.is_empty());
    }

    #[test]
    fn test_prune_time_iter_filter() {
        let input = sample_input();

        // The range ends inside the second batch.
        let actual = prune_all(input.clone(), 10, 15, None);
        assert_eq!(
            actual,
            [
                batch_10_11(),
                new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
            ]
        );

        // The range starts inside the first batch.
        let actual = prune_all(input.clone(), 11, 20, None);
        assert_eq!(
            actual,
            [
                new_batch(b"k1", &[11], &[20], &[OpType::Put], &[111]),
                batch_15_16(),
                batch_17_18(),
            ]
        );

        // The range covers every batch.
        let actual = prune_all(input, 10, 18, None);
        assert_eq!(actual, [batch_10_11(), batch_15_16(), batch_17_18()]);
    }

    #[test]
    fn test_prune_time_iter_with_time_filters() {
        let actual = prune_all(sample_input(), 10, 20, ts_in_10_16_filters());
        assert_eq!(
            actual,
            [
                batch_10_11(),
                new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
            ]
        );
    }

    #[test]
    fn test_prune_time_iter_in_range_with_time_filters() {
        let actual = prune_all(sample_input(), 5, 18, ts_in_10_16_filters());
        assert_eq!(
            actual,
            [
                batch_10_11(),
                new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
            ]
        );
    }
}