1use std::ops::BitAnd;
16use std::sync::Arc;
17
18use common_recordbatch::filter::SimpleFilterEvaluator;
19use common_time::Timestamp;
20use datatypes::arrow::array::BooleanArray;
21use datatypes::arrow::buffer::BooleanBuffer;
22use datatypes::arrow::record_batch::RecordBatch;
23use snafu::ResultExt;
24
25use crate::error::{RecordBatchSnafu, Result};
26use crate::memtable::BoxedBatchIterator;
27use crate::read::last_row::{FlatRowGroupLastRowCachedReader, RowGroupLastRowCachedReader};
28use crate::read::{Batch, BatchReader};
29use crate::sst::file::FileTimeRange;
30use crate::sst::parquet::file_range::FileRangeContextRef;
31use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupReader};
32
33pub enum Source {
34 RowGroup(RowGroupReader),
35 LastRow(RowGroupLastRowCachedReader),
36}
37
38impl Source {
39 async fn next_batch(&mut self) -> Result<Option<Batch>> {
40 match self {
41 Source::RowGroup(r) => r.next_batch().await,
42 Source::LastRow(r) => r.next_batch().await,
43 }
44 }
45}
46
47pub struct PruneReader {
48 context: FileRangeContextRef,
50 source: Source,
51 metrics: ReaderMetrics,
52 skip_fields: bool,
54}
55
56impl PruneReader {
57 pub(crate) fn new_with_row_group_reader(
58 ctx: FileRangeContextRef,
59 reader: RowGroupReader,
60 skip_fields: bool,
61 ) -> Self {
62 Self {
63 context: ctx,
64 source: Source::RowGroup(reader),
65 metrics: Default::default(),
66 skip_fields,
67 }
68 }
69
70 pub(crate) fn new_with_last_row_reader(
71 ctx: FileRangeContextRef,
72 reader: RowGroupLastRowCachedReader,
73 skip_fields: bool,
74 ) -> Self {
75 Self {
76 context: ctx,
77 source: Source::LastRow(reader),
78 metrics: Default::default(),
79 skip_fields,
80 }
81 }
82
83 pub(crate) fn metrics(&self) -> ReaderMetrics {
85 let mut metrics = self.metrics.clone();
86 match &self.source {
87 Source::RowGroup(r) => {
88 metrics.merge_from(r.metrics());
89 }
90 Source::LastRow(r) => {
91 if let Some(inner_metrics) = r.metrics() {
92 metrics.merge_from(inner_metrics);
93 }
94 }
95 }
96
97 metrics
98 }
99
100 pub(crate) async fn next_batch(&mut self) -> Result<Option<Batch>> {
101 while let Some(b) = self.source.next_batch().await? {
102 match self.prune(b)? {
103 Some(b) => {
104 return Ok(Some(b));
105 }
106 None => {
107 continue;
108 }
109 }
110 }
111 Ok(None)
112 }
113
114 fn prune(&mut self, batch: Batch) -> Result<Option<Batch>> {
116 if self.context.filters().is_empty() && !self.context.has_partition_filter() {
118 return Ok(Some(batch));
119 }
120
121 let num_rows_before_filter = batch.num_rows();
122 let Some(batch_filtered) = self.context.precise_filter(batch, self.skip_fields)? else {
123 self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
125 return Ok(None);
126 };
127
128 let filtered_rows = num_rows_before_filter - batch_filtered.num_rows();
130 self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
131
132 if !batch_filtered.is_empty() {
133 Ok(Some(batch_filtered))
134 } else {
135 Ok(None)
136 }
137 }
138}
139
140pub(crate) struct PruneTimeIterator {
142 iter: BoxedBatchIterator,
143 time_range: FileTimeRange,
144 time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
146}
147
148impl PruneTimeIterator {
149 pub(crate) fn new(
151 iter: BoxedBatchIterator,
152 time_range: FileTimeRange,
153 time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
154 ) -> Self {
155 Self {
156 iter,
157 time_range,
158 time_filters,
159 }
160 }
161
162 fn prune(&self, batch: Batch) -> Result<Batch> {
164 if batch.is_empty() {
165 return Ok(batch);
166 }
167
168 if self.time_range.0 <= batch.first_timestamp().unwrap()
171 && batch.last_timestamp().unwrap() <= self.time_range.1
172 {
173 return self.prune_by_time_filters(batch, Vec::new());
174 }
175
176 let unit = batch
180 .timestamps()
181 .data_type()
182 .as_timestamp()
183 .unwrap()
184 .unit();
185 let mut mask = Vec::with_capacity(batch.timestamps().len());
186 let timestamps = batch.timestamps_native().unwrap();
187 for ts in timestamps {
188 let ts = Timestamp::new(*ts, unit);
189 if self.time_range.0 <= ts && ts <= self.time_range.1 {
190 mask.push(true);
191 } else {
192 mask.push(false);
193 }
194 }
195
196 self.prune_by_time_filters(batch, mask)
197 }
198
199 fn prune_by_time_filters(&self, mut batch: Batch, existing_mask: Vec<bool>) -> Result<Batch> {
202 if let Some(filters) = &self.time_filters {
203 let mut mask = BooleanBuffer::new_set(batch.num_rows());
204 for filter in filters.iter() {
205 let result = filter
206 .evaluate_vector(batch.timestamps())
207 .context(RecordBatchSnafu)?;
208 mask = mask.bitand(&result);
209 }
210
211 if !existing_mask.is_empty() {
212 mask = mask.bitand(&BooleanBuffer::from(existing_mask));
213 }
214
215 batch.filter(&BooleanArray::from(mask).into())?;
216 } else if !existing_mask.is_empty() {
217 batch.filter(&BooleanArray::from(existing_mask).into())?;
218 }
219
220 Ok(batch)
221 }
222
223 fn next_non_empty_batch(&mut self) -> Result<Option<Batch>> {
225 while let Some(batch) = self.iter.next() {
226 let batch = batch?;
227 let pruned_batch = self.prune(batch)?;
228 if !pruned_batch.is_empty() {
229 return Ok(Some(pruned_batch));
230 }
231 }
232 Ok(None)
233 }
234}
235
236impl Iterator for PruneTimeIterator {
237 type Item = Result<Batch>;
238
239 fn next(&mut self) -> Option<Self::Item> {
240 self.next_non_empty_batch().transpose()
241 }
242}
243
244pub enum FlatSource {
245 RowGroup(FlatRowGroupReader),
246 LastRow(FlatRowGroupLastRowCachedReader),
247}
248
249impl FlatSource {
250 async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
251 match self {
252 FlatSource::RowGroup(r) => r.next_batch().await,
253 FlatSource::LastRow(r) => r.next_batch().await,
254 }
255 }
256}
257
258pub struct FlatPruneReader {
260 context: FileRangeContextRef,
262 source: FlatSource,
263 metrics: ReaderMetrics,
264 skip_fields: bool,
266}
267
268impl FlatPruneReader {
269 pub(crate) fn new_with_row_group_reader(
270 ctx: FileRangeContextRef,
271 reader: FlatRowGroupReader,
272 skip_fields: bool,
273 ) -> Self {
274 Self {
275 context: ctx,
276 source: FlatSource::RowGroup(reader),
277 metrics: Default::default(),
278 skip_fields,
279 }
280 }
281
282 pub(crate) fn new_with_last_row_reader(
283 ctx: FileRangeContextRef,
284 reader: FlatRowGroupLastRowCachedReader,
285 skip_fields: bool,
286 ) -> Self {
287 Self {
288 context: ctx,
289 source: FlatSource::LastRow(reader),
290 metrics: Default::default(),
291 skip_fields,
292 }
293 }
294
295 pub(crate) fn metrics(&self) -> ReaderMetrics {
297 self.metrics.clone()
298 }
299
300 pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
301 loop {
302 let start = std::time::Instant::now();
303 let batch = self.source.next_batch().await?;
304 self.metrics.scan_cost += start.elapsed();
305
306 let Some(record_batch) = batch else {
307 return Ok(None);
308 };
309
310 self.metrics.num_rows += record_batch.num_rows();
312 self.metrics.num_batches += 1;
313
314 match self.prune_flat(record_batch)? {
315 Some(filtered_batch) => {
316 return Ok(Some(filtered_batch));
317 }
318 None => {
319 continue;
320 }
321 }
322 }
323 }
324
325 fn prune_flat(&mut self, record_batch: RecordBatch) -> Result<Option<RecordBatch>> {
327 if self.context.filters().is_empty() && !self.context.has_partition_filter() {
329 return Ok(Some(record_batch));
330 }
331
332 let num_rows_before_filter = record_batch.num_rows();
333 let Some(filtered_batch) = self
334 .context
335 .precise_filter_flat(record_batch, self.skip_fields)?
336 else {
337 self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter;
339 return Ok(None);
340 };
341
342 let filtered_rows = num_rows_before_filter - filtered_batch.num_rows();
344 self.metrics.filter_metrics.rows_precise_filtered += filtered_rows;
345
346 if filtered_batch.num_rows() > 0 {
347 Ok(Some(filtered_batch))
348 } else {
349 Ok(None)
350 }
351 }
352}
353
#[cfg(test)]
mod tests {
    use api::v1::OpType;
    use datafusion_common::ScalarValue;
    use datafusion_expr::{Expr, col, lit};

    use super::*;
    use crate::test_util::new_batch;

    /// Shared fixture: three two-row batches for key `k1` with timestamps
    /// 10-11, 15-16 and 17-18.
    fn input_batches() -> [Batch; 3] {
        [
            new_batch(
                b"k1",
                &[10, 11],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[110, 111],
            ),
            new_batch(
                b"k1",
                &[15, 16],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[115, 116],
            ),
            new_batch(
                b"k1",
                &[17, 18],
                &[20, 20],
                &[OpType::Put, OpType::Put],
                &[117, 118],
            ),
        ]
    }

    /// Runs the shared fixture through a `PruneTimeIterator` and collects the
    /// batches it yields, panicking on any error.
    fn prune_input(
        time_range: FileTimeRange,
        time_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
    ) -> Vec<Batch> {
        let source = input_batches().into_iter().map(Ok);
        let pruner = PruneTimeIterator::new(Box::new(source), time_range, time_filters);
        pruner.map(|batch| batch.unwrap()).collect()
    }

    /// Wraps the given expressions into the filter form the iterator accepts.
    fn create_time_filters(expr: &[Expr]) -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        let evaluators = expr
            .iter()
            .map(|e| SimpleFilterEvaluator::try_new(e).unwrap())
            .collect::<Vec<_>>();
        Some(Arc::new(evaluators))
    }

    /// Time filters equivalent to `10 <= ts < 16` (milliseconds).
    fn ts_from_10_until_16() -> Option<Arc<Vec<SimpleFilterEvaluator>>> {
        create_time_filters(&[
            col("ts").gt_eq(lit(ScalarValue::TimestampMillisecond(Some(10), None))),
            col("ts").lt(lit(ScalarValue::TimestampMillisecond(Some(16), None))),
        ])
    }

    #[test]
    fn test_prune_time_iter_empty() {
        let no_batches = [];
        let source = no_batches.into_iter().map(Ok);
        let pruner = PruneTimeIterator::new(
            Box::new(source),
            (
                Timestamp::new_millisecond(0),
                Timestamp::new_millisecond(1000),
            ),
            None,
        );
        let actual: Vec<_> = pruner.map(|batch| batch.unwrap()).collect();
        assert!(actual.is_empty());
    }

    #[test]
    fn test_prune_time_iter_filter() {
        // The upper bound cuts the second batch in half and drops the third.
        let actual = prune_input(
            (
                Timestamp::new_millisecond(10),
                Timestamp::new_millisecond(15),
            ),
            None,
        );
        assert_eq!(
            actual,
            [
                new_batch(
                    b"k1",
                    &[10, 11],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[110, 111],
                ),
                new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
            ]
        );

        // The lower bound cuts the first batch in half; the rest is untouched.
        let actual = prune_input(
            (
                Timestamp::new_millisecond(11),
                Timestamp::new_millisecond(20),
            ),
            None,
        );
        assert_eq!(
            actual,
            [
                new_batch(b"k1", &[11], &[20], &[OpType::Put], &[111]),
                new_batch(
                    b"k1",
                    &[15, 16],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[115, 116],
                ),
                new_batch(
                    b"k1",
                    &[17, 18],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[117, 118],
                ),
            ]
        );

        // The range covers every row, so the input comes back unchanged.
        let actual = prune_input(
            (
                Timestamp::new_millisecond(10),
                Timestamp::new_millisecond(18),
            ),
            None,
        );
        assert_eq!(actual, input_batches());
    }

    #[test]
    fn test_prune_time_iter_with_time_filters() {
        let actual = prune_input(
            (
                Timestamp::new_millisecond(10),
                Timestamp::new_millisecond(20),
            ),
            ts_from_10_until_16(),
        );
        assert_eq!(
            actual,
            [
                new_batch(
                    b"k1",
                    &[10, 11],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[110, 111],
                ),
                new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
            ]
        );
    }

    #[test]
    fn test_prune_time_iter_in_range_with_time_filters() {
        // Every batch lies inside the range, so only the time filters prune rows.
        let actual = prune_input(
            (
                Timestamp::new_millisecond(5),
                Timestamp::new_millisecond(18),
            ),
            ts_from_10_until_16(),
        );
        assert_eq!(
            actual,
            [
                new_batch(
                    b"k1",
                    &[10, 11],
                    &[20, 20],
                    &[OpType::Put, OpType::Put],
                    &[110, 111],
                ),
                new_batch(b"k1", &[15], &[20], &[OpType::Put], &[115]),
            ]
        );
    }
}