// mito2/read/prune.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15use std::ops::BitAnd;
16use std::sync::Arc;
17
18use common_recordbatch::filter::SimpleFilterEvaluator;
19use common_time::Timestamp;
20use datatypes::arrow::array::BooleanArray;
21use datatypes::arrow::buffer::BooleanBuffer;
22use datatypes::arrow::record_batch::RecordBatch;
23use snafu::ResultExt;
24
25use crate::error::{RecordBatchSnafu, Result};
26use crate::memtable::BoxedBatchIterator;
27use crate::read::last_row::{FlatRowGroupLastRowCachedReader, RowGroupLastRowCachedReader};
28use crate::read::{Batch, BatchReader};
29use crate::sst::file::FileTimeRange;
30use crate::sst::parquet::file_range::FileRangeContextRef;
31use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupReader};
32
33pub enum Source {
34    RowGroup(RowGroupReader),
35    LastRow(RowGroupLastRowCachedReader),
36}
37
38impl Source {
39    async fn next_batch(&mut self) -> Result<Option<Batch>> {
40        match self {
41            Source::RowGroup(r) => r.next_batch().await,
42            Source::LastRow(r) => r.next_batch().await,
43        }
44    }
45}
46
47pub struct PruneReader {
48    /// Context for file ranges.
49    context: FileRangeContextRef,
50    source: Source,
51    metrics: ReaderMetrics,
52    /// Whether to skip field filters for this row group.
53    skip_fields: bool,
54}
55
56impl PruneReader {
57    pub(crate) fn new_with_row_group_reader(
58        ctx: FileRangeContextRef,
59        reader: RowGroupReader,
60        skip_fields: bool,
61    ) -> Self {
62        Self {
63            context: ctx,
64            source: Source::RowGroup(reader),
65            metrics: Default::default(),
66            skip_fields,
67        }
68    }
69
70    pub(crate) fn new_with_last_row_reader(
71        ctx: FileRangeContextRef,
72        reader: RowGroupLastRowCachedReader,
73        skip_fields: bool,
74    ) -> Self {
75        Self {
76            context: ctx,
77            source: Source::LastRow(reader),
78            metrics: Default::default(),
79            skip_fields,
80        }
81    }
82
83    pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) {
84        self.source = source;
85        self.skip_fields = skip_fields;
86    }
87
88    /// Merge metrics with the inner reader and return the merged metrics.
89    pub(crate) fn metrics(&self) -> ReaderMetrics {
90        let mut metrics = self.metrics.clone();
91        match &self.source {
92            Source::RowGroup(r) => {
93                metrics.merge_from(r.metrics());
94            }
95            Source::LastRow(r) => {
96                if let Some(inner_metrics) = r.metrics() {
97                    metrics.merge_from(inner_metrics);
98                }
99            }
100        }
101
102        metrics
103    }
104
105    pub(crate) async fn next_batch(&mut self) -> Result<Option<Batch>> {
106        while let Some(b) = self.source.next_batch().await? {
107            match self.prune(b)? {
108                Some(b) => {
109                    return Ok(Some(b));
110                }
111                None => {
112                    continue;
113                }
114            }
115        }
116        Ok(None)
117    }
118
119    /// Prunes batches by the pushed down predicate.
120    fn prune(&mut self, batch: Batch) -> Result<Option<Batch>> {
121        // fast path
122        if self.context.filters().is_empty() && !self.context.has_partition_filter() {
123            return Ok(Some(batch));
124        }
125
126        let num_rows_before_filter = batch.num_rows();
127        let Some(batch_filtered) = self.context.precise_filter(batch, self.skip_fields)? else {
128            // the entire batch is filtered out
129            self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
130            return Ok(None);
131        };
132
133        // update metric
134        let filtered_rows = num_rows_before_filter - batch_filtered.num_rows();
135        self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
136
137        if !batch_filtered.is_empty() {
138            Ok(Some(batch_filtered))
139        } else {
140            Ok(None)
141        }
142    }
143}
144
145/// An iterator that prunes batches by time range.
146pub(crate) struct PruneTimeIterator {
147    iter: BoxedBatchIterator,
148    time_range: FileTimeRange,
149    /// Precise time filters.
150    time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
151}
152
153impl PruneTimeIterator {
154    /// Creates a new `PruneTimeIterator` with the given iterator and time range.
155    pub(crate) fn new(
156        iter: BoxedBatchIterator,
157        time_range: FileTimeRange,
158        time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
159    ) -> Self {
160        Self {
161            iter,
162            time_range,
163            time_filters,
164        }
165    }
166
167    /// Prune batch by time range.
168    fn prune(&self, batch: Batch) -> Result<Batch> {
169        if batch.is_empty() {
170            return Ok(batch);
171        }
172
173        // fast path, the batch is within the time range.
174        // Note that the time range is inclusive.
175        if self.time_range.0 <= batch.first_timestamp().unwrap()
176            && batch.last_timestamp().unwrap() <= self.time_range.1
177        {
178            return self.prune_by_time_filters(batch, Vec::new());
179        }
180
181        // slow path, prune the batch by time range.
182        // Note that the timestamp precision may be different from the time range.
183        // Safety: We know this is the timestamp type.
184        let unit = batch
185            .timestamps()
186            .data_type()
187            .as_timestamp()
188            .unwrap()
189            .unit();
190        let mut mask = Vec::with_capacity(batch.timestamps().len());
191        let timestamps = batch.timestamps_native().unwrap();
192        for ts in timestamps {
193            let ts = Timestamp::new(*ts, unit);
194            if self.time_range.0 <= ts && ts <= self.time_range.1 {
195                mask.push(true);
196            } else {
197                mask.push(false);
198            }
199        }
200
201        self.prune_by_time_filters(batch, mask)
202    }
203
204    /// Prunes the batch by time filters.
205    /// Also applies existing mask to the batch if the mask is not empty.
206    fn prune_by_time_filters(&self, mut batch: Batch, existing_mask: Vec<bool>) -> Result<Batch> {
207        if let Some(filters) = &self.time_filters {
208            let mut mask = BooleanBuffer::new_set(batch.num_rows());
209            for filter in filters.iter() {
210                let result = filter
211                    .evaluate_vector(batch.timestamps())
212                    .context(RecordBatchSnafu)?;
213                mask = mask.bitand(&result);
214            }
215
216            if !existing_mask.is_empty() {
217                mask = mask.bitand(&BooleanBuffer::from(existing_mask));
218            }
219
220            batch.filter(&BooleanArray::from(mask).into())?;
221        } else if !existing_mask.is_empty() {
222            batch.filter(&BooleanArray::from(existing_mask).into())?;
223        }
224
225        Ok(batch)
226    }
227
228    // Prune and return the next non-empty batch.
229    fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
230        while let Some(batch) = self.iter.next() {
231            let batch = batch?;
232            let pruned_batch = self.prune(batch)?;
233            if !pruned_batch.is_empty() {
234                return Ok(Some(pruned_batch));
235            }
236        }
237        Ok(None)
238    }
239}
240
241impl Iterator for PruneTimeIterator {
242    type Item = Result<Batch>;
243
244    fn next(&mut self) -> Option<Self::Item> {
245        self.next_non_empty_batch().transpose()
246    }
247}
248
249pub enum FlatSource {
250    RowGroup(FlatRowGroupReader),
251    LastRow(FlatRowGroupLastRowCachedReader),
252}
253
254impl FlatSource {
255    fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
256        match self {
257            FlatSource::RowGroup(r) => r.next_batch(),
258            FlatSource::LastRow(r) => r.next_batch(),
259        }
260    }
261}
262
263/// A flat format reader that returns RecordBatch instead of Batch.
264pub struct FlatPruneReader {
265    /// Context for file ranges.
266    context: FileRangeContextRef,
267    source: FlatSource,
268    metrics: ReaderMetrics,
269    /// Whether to skip field filters for this row group.
270    skip_fields: bool,
271}
272
273impl FlatPruneReader {
274    pub(crate) fn new_with_row_group_reader(
275        ctx: FileRangeContextRef,
276        reader: FlatRowGroupReader,
277        skip_fields: bool,
278    ) -> Self {
279        Self {
280            context: ctx,
281            source: FlatSource::RowGroup(reader),
282            metrics: Default::default(),
283            skip_fields,
284        }
285    }
286
287    pub(crate) fn new_with_last_row_reader(
288        ctx: FileRangeContextRef,
289        reader: FlatRowGroupLastRowCachedReader,
290        skip_fields: bool,
291    ) -> Self {
292        Self {
293            context: ctx,
294            source: FlatSource::LastRow(reader),
295            metrics: Default::default(),
296            skip_fields,
297        }
298    }
299
300    /// Returns metrics.
301    pub(crate) fn metrics(&self) -> ReaderMetrics {
302        self.metrics.clone()
303    }
304
305    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
306        while let Some(record_batch) = {
307            let start = std::time::Instant::now();
308            let batch = self.source.next_batch()?;
309            self.metrics.scan_cost += start.elapsed();
310            batch
311        } {
312            // Update metrics for the received batch
313            self.metrics.num_rows += record_batch.num_rows();
314            self.metrics.num_batches += 1;
315
316            match self.prune_flat(record_batch)? {
317                Some(filtered_batch) => {
318                    return Ok(Some(filtered_batch));
319                }
320                None => {
321                    continue;
322                }
323            }
324        }
325
326        Ok(None)
327    }
328
329    /// Prunes batches by the pushed down predicate and returns RecordBatch.
330    fn prune_flat(&mut self, record_batch: RecordBatch) -> Result<Option<RecordBatch>> {
331        // fast path
332        if self.context.filters().is_empty() && !self.context.has_partition_filter() {
333            return Ok(Some(record_batch));
334        }
335
336        let num_rows_before_filter = record_batch.num_rows();
337        let Some(filtered_batch) = self
338            .context
339            .precise_filter_flat(record_batch, self.skip_fields)?
340        else {
341            // the entire batch is filtered out
342            self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
343            return Ok(None);
344        };
345
346        // update metric
347        let filtered_rows = num_rows_before_filter - filtered_batch.num_rows();
348        self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
349
350        if filtered_batch.num_rows() > 0 {
351            Ok(Some(filtered_batch))
352        } else {
353            Ok(None)
354        }
355    }
356}
357
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use datafusion_common::ScalarValue;
    use datafusion_expr::{Expr, col, lit};

    use super::*;
    use crate::test_util::new_batch;

    /// The three input batches shared by the tests; their timestamps are
    /// 10, 11 / 15, 16 / 17, 18.
    fn sample_batches() -> Vec<Batch> {
        vec![
            new_batch(
                b"k1",
                &[10, 11],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[110, 111],
            ),
            new_batch(
                b"k1",
                &[15, 16],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[115, 116],
            ),
            new_batch(
                b"k1",
                &[17, 18],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[117, 118],
            ),
        ]
    }

    /// Expected output when only the rows with timestamps 10, 11 and 15 survive.
    fn expected_ts_10_11_15() -> Vec<Batch> {
        vec![
            new_batch(
                b"k1",
                &[10, 11],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[110, 111],
            ),
            new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
        ]
    }

    /// Builds an inclusive millisecond time range.
    fn ms_range(start: i64, end: i64) -> FileTimeRange {
        (
            Timestamp::new_millisecond(start),
            Timestamp::new_millisecond(end),
        )
    }

    /// Runs `input` through a `PruneTimeIterator` and collects every yielded batch.
    fn prune_all(
        input: Vec<Batch>,
        time_range: FileTimeRange,
        time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
    ) -> Vec<Batch> {
        let iter = input.into_iter().map(Ok);
        PruneTimeIterator::new(Box::new(iter), time_range, time_filters)
            .map(|batch| batch.unwrap())
            .collect()
    }

    fn create_time_filters(expr: &[Expr]) -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        let evaluators = expr
            .iter()
            .map(|expr| SimpleFilterEvaluator::try_new(expr).unwrap())
            .collect();
        Some(Arc::new(evaluators))
    }

    /// Time filters for `10 <= ts < 16` (milliseconds).
    fn ts_ge_10_lt_16_filters() -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        // We won't use the column name.
        create_time_filters(&[
            col("ts").gt_eq(lit(ScalarValue::TimestampMillisecond(Some(10), None))),
            col("ts").lt(lit(ScalarValue::TimestampMillisecond(Some(16), None))),
        ])
    }

    #[test]
    fn test_prune_time_iter_empty() {
        let actual = prune_all(Vec::new(), ms_range(0, 1000), None);
        assert!(actual.is_empty());
    }

    #[test]
    fn test_prune_time_iter_filter() {
        // The range cuts off the tail of the input.
        let actual = prune_all(sample_batches(), ms_range(10, 15), None);
        assert_eq!(actual, expected_ts_10_11_15());

        // The range cuts off the head of the input.
        let actual = prune_all(sample_batches(), ms_range(11, 20), None);
        assert_eq!(
            actual,
            [
                new_batch(b"k1", &[11], &[20], &[OpType::Put], &[111]),
                new_batch(
                    b"k1",
                    &[15, 16],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[115, 116],
                ),
                new_batch(
                    b"k1",
                    &[17, 18],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[117, 118],
                ),
            ]
        );

        // The range covers the whole input, nothing is removed.
        let actual = prune_all(sample_batches(), ms_range(10, 18), None);
        assert_eq!(actual, sample_batches());
    }

    #[test]
    fn test_prune_time_iter_with_time_filters() {
        let actual = prune_all(
            sample_batches(),
            ms_range(10, 20),
            ts_ge_10_lt_16_filters(),
        );
        assert_eq!(actual, expected_ts_10_11_15());
    }

    #[test]
    fn test_prune_time_iter_in_range_with_time_filters() {
        // Every batch lies within the time range, so only the filters remove rows.
        let actual = prune_all(sample_batches(), ms_range(5, 18), ts_ge_10_lt_16_filters());
        assert_eq!(actual, expected_ts_10_11_15());
    }
}