1use std::ops::BitAnd;
16use std::sync::Arc;
17
18use common_recordbatch::filter::SimpleFilterEvaluator;
19use common_time::Timestamp;
20use datatypes::arrow::array::BooleanArray;
21use datatypes::arrow::buffer::BooleanBuffer;
22use datatypes::arrow::record_batch::RecordBatch;
23use snafu::ResultExt;
24
25use crate::error::{RecordBatchSnafu, Result};
26use crate::memtable::BoxedBatchIterator;
27use crate::read::last_row::{FlatRowGroupLastRowCachedReader, RowGroupLastRowCachedReader};
28use crate::read::{Batch, BatchReader};
29use crate::sst::file::FileTimeRange;
30use crate::sst::parquet::file_range::FileRangeContextRef;
31use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupReader};
32
33pub enum Source {
34 RowGroup(RowGroupReader),
35 LastRow(RowGroupLastRowCachedReader),
36}
37
38impl Source {
39 async fn next_batch(&mut self) -> Result<Option<Batch>> {
40 match self {
41 Source::RowGroup(r) => r.next_batch().await,
42 Source::LastRow(r) => r.next_batch().await,
43 }
44 }
45}
46
47pub struct PruneReader {
48 context: FileRangeContextRef,
50 source: Source,
51 metrics: ReaderMetrics,
52 skip_fields: bool,
54}
55
56impl PruneReader {
57 pub(crate) fn new_with_row_group_reader(
58 ctx: FileRangeContextRef,
59 reader: RowGroupReader,
60 skip_fields: bool,
61 ) -> Self {
62 Self {
63 context: ctx,
64 source: Source::RowGroup(reader),
65 metrics: Default::default(),
66 skip_fields,
67 }
68 }
69
70 pub(crate) fn new_with_last_row_reader(
71 ctx: FileRangeContextRef,
72 reader: RowGroupLastRowCachedReader,
73 skip_fields: bool,
74 ) -> Self {
75 Self {
76 context: ctx,
77 source: Source::LastRow(reader),
78 metrics: Default::default(),
79 skip_fields,
80 }
81 }
82
83 pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) {
84 self.source = source;
85 self.skip_fields = skip_fields;
86 }
87
88 pub(crate) fn metrics(&self) -> ReaderMetrics {
90 let mut metrics = self.metrics.clone();
91 match &self.source {
92 Source::RowGroup(r) => {
93 metrics.merge_from(r.metrics());
94 }
95 Source::LastRow(r) => {
96 if let Some(inner_metrics) = r.metrics() {
97 metrics.merge_from(inner_metrics);
98 }
99 }
100 }
101
102 metrics
103 }
104
105 pub(crate) async fn next_batch(&mut self) -> Result<Option<Batch>> {
106 while let Some(b) = self.source.next_batch().await? {
107 match self.prune(b)? {
108 Some(b) => {
109 return Ok(Some(b));
110 }
111 None => {
112 continue;
113 }
114 }
115 }
116 Ok(None)
117 }
118
119 fn prune(&mut self, batch: Batch) -> Result<Option<Batch>> {
121 if self.context.filters().is_empty() && !self.context.has_partition_filter() {
123 return Ok(Some(batch));
124 }
125
126 let num_rows_before_filter = batch.num_rows();
127 let Some(batch_filtered) = self.context.precise_filter(batch, self.skip_fields)? else {
128 self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
130 return Ok(None);
131 };
132
133 let filtered_rows = num_rows_before_filter - batch_filtered.num_rows();
135 self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
136
137 if !batch_filtered.is_empty() {
138 Ok(Some(batch_filtered))
139 } else {
140 Ok(None)
141 }
142 }
143}
144
145pub(crate) struct PruneTimeIterator {
147 iter: BoxedBatchIterator,
148 time_range: FileTimeRange,
149 time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
151}
152
153impl PruneTimeIterator {
154 pub(crate) fn new(
156 iter: BoxedBatchIterator,
157 time_range: FileTimeRange,
158 time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
159 ) -> Self {
160 Self {
161 iter,
162 time_range,
163 time_filters,
164 }
165 }
166
167 fn prune(&self, batch: Batch) -> Result<Batch> {
169 if batch.is_empty() {
170 return Ok(batch);
171 }
172
173 if self.time_range.0 <= batch.first_timestamp().unwrap()
176 && batch.last_timestamp().unwrap() <= self.time_range.1
177 {
178 return self.prune_by_time_filters(batch, Vec::new());
179 }
180
181 let unit = batch
185 .timestamps()
186 .data_type()
187 .as_timestamp()
188 .unwrap()
189 .unit();
190 let mut mask = Vec::with_capacity(batch.timestamps().len());
191 let timestamps = batch.timestamps_native().unwrap();
192 for ts in timestamps {
193 let ts = Timestamp::new(*ts, unit);
194 if self.time_range.0 <= ts && ts <= self.time_range.1 {
195 mask.push(true);
196 } else {
197 mask.push(false);
198 }
199 }
200
201 self.prune_by_time_filters(batch, mask)
202 }
203
204 fn prune_by_time_filters(&self, mut batch: Batch, existing_mask: Vec<bool>) -> Result<Batch> {
207 if let Some(filters) = &self.time_filters {
208 let mut mask = BooleanBuffer::new_set(batch.num_rows());
209 for filter in filters.iter() {
210 let result = filter
211 .evaluate_vector(batch.timestamps())
212 .context(RecordBatchSnafu)?;
213 mask = mask.bitand(&result);
214 }
215
216 if !existing_mask.is_empty() {
217 mask = mask.bitand(&BooleanBuffer::from(existing_mask));
218 }
219
220 batch.filter(&BooleanArray::from(mask).into())?;
221 } else if !existing_mask.is_empty() {
222 batch.filter(&BooleanArray::from(existing_mask).into())?;
223 }
224
225 Ok(batch)
226 }
227
228 fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
230 while let Some(batch) = self.iter.next() {
231 let batch = batch?;
232 let pruned_batch = self.prune(batch)?;
233 if !pruned_batch.is_empty() {
234 return Ok(Some(pruned_batch));
235 }
236 }
237 Ok(None)
238 }
239}
240
241impl Iterator for PruneTimeIterator {
242 type Item = Result<Batch>;
243
244 fn next(&mut self) -> Option<Self::Item> {
245 self.next_non_empty_batch().transpose()
246 }
247}
248
249pub enum FlatSource {
250 RowGroup(FlatRowGroupReader),
251 LastRow(FlatRowGroupLastRowCachedReader),
252}
253
254impl FlatSource {
255 fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
256 match self {
257 FlatSource::RowGroup(r) => r.next_batch(),
258 FlatSource::LastRow(r) => r.next_batch(),
259 }
260 }
261}
262
263pub struct FlatPruneReader {
265 context: FileRangeContextRef,
267 source: FlatSource,
268 metrics: ReaderMetrics,
269 skip_fields: bool,
271}
272
273impl FlatPruneReader {
274 pub(crate) fn new_with_row_group_reader(
275 ctx: FileRangeContextRef,
276 reader: FlatRowGroupReader,
277 skip_fields: bool,
278 ) -> Self {
279 Self {
280 context: ctx,
281 source: FlatSource::RowGroup(reader),
282 metrics: Default::default(),
283 skip_fields,
284 }
285 }
286
287 pub(crate) fn new_with_last_row_reader(
288 ctx: FileRangeContextRef,
289 reader: FlatRowGroupLastRowCachedReader,
290 skip_fields: bool,
291 ) -> Self {
292 Self {
293 context: ctx,
294 source: FlatSource::LastRow(reader),
295 metrics: Default::default(),
296 skip_fields,
297 }
298 }
299
300 pub(crate) fn metrics(&self) -> ReaderMetrics {
302 self.metrics.clone()
303 }
304
305 pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
306 while let Some(record_batch) = {
307 let start = std::time::Instant::now();
308 let batch = self.source.next_batch()?;
309 self.metrics.scan_cost += start.elapsed();
310 batch
311 } {
312 self.metrics.num_rows += record_batch.num_rows();
314 self.metrics.num_batches += 1;
315
316 match self.prune_flat(record_batch)? {
317 Some(filtered_batch) => {
318 return Ok(Some(filtered_batch));
319 }
320 None => {
321 continue;
322 }
323 }
324 }
325
326 Ok(None)
327 }
328
329 fn prune_flat(&mut self, record_batch: RecordBatch) -> Result<Option<RecordBatch>> {
331 if self.context.filters().is_empty() && !self.context.has_partition_filter() {
333 return Ok(Some(record_batch));
334 }
335
336 let num_rows_before_filter = record_batch.num_rows();
337 let Some(filtered_batch) = self
338 .context
339 .precise_filter_flat(record_batch, self.skip_fields)?
340 else {
341 self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
343 return Ok(None);
344 };
345
346 let filtered_rows = num_rows_before_filter - filtered_batch.num_rows();
348 self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
349
350 if filtered_batch.num_rows() > 0 {
351 Ok(Some(filtered_batch))
352 } else {
353 Ok(None)
354 }
355 }
356}
357
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use datafusion_common::ScalarValue;
    use datafusion_expr::{Expr, col, lit};

    use super::*;
    use crate::test_util::new_batch;

    /// Two-row batch for key `k1` with timestamps 10 and 11.
    fn batch_10_11() -> Batch {
        new_batch(
            b"k1",
            &[10, 11],
            &[20, 20],
            &[OpType::Put, OpType::Put],
            &[110, 111],
        )
    }

    /// Two-row batch for key `k1` with timestamps 15 and 16.
    fn batch_15_16() -> Batch {
        new_batch(
            b"k1",
            &[15, 16],
            &[20, 20],
            &[OpType::Put, OpType::Put],
            &[115, 116],
        )
    }

    /// Two-row batch for key `k1` with timestamps 17 and 18.
    fn batch_17_18() -> Batch {
        new_batch(
            b"k1",
            &[17, 18],
            &[20, 20],
            &[OpType::Put, OpType::Put],
            &[117, 118],
        )
    }

    /// Single-row batch holding only the row at timestamp 11.
    fn batch_11() -> Batch {
        new_batch(b"k1", &[11], &[20], &[OpType::Put], &[111])
    }

    /// Single-row batch holding only the row at timestamp 15.
    fn batch_15() -> Batch {
        new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115])
    }

    /// The three input batches shared by the pruning tests.
    fn sample_input() -> Vec<Batch> {
        vec![batch_10_11(), batch_15_16(), batch_17_18()]
    }

    /// Runs `input` through a [`PruneTimeIterator`] bounded by the millisecond
    /// range `[start_ms, end_ms]` and collects every batch it yields.
    fn prune_all(
        input: Vec<Batch>,
        start_ms: i64,
        end_ms: i64,
        time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
    ) -> Vec<Batch> {
        let iter = PruneTimeIterator::new(
            Box::new(input.into_iter().map(Ok)),
            (
                Timestamp::new_millisecond(start_ms),
                Timestamp::new_millisecond(end_ms),
            ),
            time_filters,
        );
        iter.map(|batch| batch.unwrap()).collect()
    }

    /// Builds the evaluators for `expr` and wraps them the way the iterator expects.
    fn create_time_filters(expr: &[Expr]) -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        let evaluators: Vec<_> = expr
            .iter()
            .map(|e| SimpleFilterEvaluator::try_new(e).unwrap())
            .collect();
        Some(Arc::new(evaluators))
    }

    /// Time filters for `ts >= 10ms` and `ts < 16ms`.
    fn filters_ts_10_to_16() -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        create_time_filters(&[
            col("ts").gt_eq(lit(ScalarValue::TimestampMillisecond(Some(10), None))),
            col("ts").lt(lit(ScalarValue::TimestampMillisecond(Some(16), None))),
        ])
    }

    #[test]
    fn test_prune_time_iter_empty() {
        let actual = prune_all(Vec::new(), 0, 1000, None);
        assert!(actual.is_empty());
    }

    #[test]
    fn test_prune_time_iter_filter() {
        // Upper bound cuts into the second batch and drops the third.
        let actual = prune_all(sample_input(), 10, 15, None);
        assert_eq!(actual, [batch_10_11(), batch_15()]);

        // Lower bound cuts into the first batch.
        let actual = prune_all(sample_input(), 11, 20, None);
        assert_eq!(actual, [batch_11(), batch_15_16(), batch_17_18()]);

        // The range covers every row exactly.
        let actual = prune_all(sample_input(), 10, 18, None);
        assert_eq!(actual, [batch_10_11(), batch_15_16(), batch_17_18()]);
    }

    #[test]
    fn test_prune_time_iter_with_time_filters() {
        let actual = prune_all(sample_input(), 10, 20, filters_ts_10_to_16());
        assert_eq!(actual, [batch_10_11(), batch_15()]);
    }

    #[test]
    fn test_prune_time_iter_in_range_with_time_filters() {
        // Every batch lies inside the range, so only the time filters remove rows.
        let actual = prune_all(sample_input(), 5, 18, filters_ts_10_to_16());
        assert_eq!(actual, [batch_10_11(), batch_15()]);
    }
}