1use std::fmt;
18use std::sync::Arc;
19use std::time::Instant;
20
21use async_stream::try_stream;
22use common_error::ext::BoxedError;
23use common_recordbatch::util::ChainedRecordBatchStream;
24use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
25use common_telemetry::tracing;
26use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
27use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
28use datatypes::schema::SchemaRef;
29use futures::{StreamExt, TryStreamExt};
30use snafu::{OptionExt, ensure};
31use store_api::metadata::RegionMetadataRef;
32use store_api::region_engine::{
33 PartitionRange, PrepareRequest, QueryScanContext, RegionScanner, ScannerProperties,
34};
35use store_api::storage::TimeSeriesRowSelector;
36use tokio::sync::Semaphore;
37
38use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, UnexpectedSnafu};
39use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
40use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow};
41use crate::read::flat_merge::FlatMergeReader;
42use crate::read::last_row::LastRowReader;
43use crate::read::merge::MergeReaderBuilder;
44use crate::read::pruner::{PartitionPruner, Pruner};
45use crate::read::range::RangeMeta;
46use crate::read::scan_region::{ScanInput, StreamContext};
47use crate::read::scan_util::{
48 PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_file_ranges,
49 scan_flat_file_ranges, scan_flat_mem_ranges, scan_mem_ranges,
50 should_split_flat_batches_for_merge,
51};
52use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream};
53use crate::read::{
54 Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream, ScannerMetrics, Source, scan_util,
55};
56use crate::region::options::MergeMode;
57use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
58
/// Scanner that reads and merges ranges sequentially within each partition.
///
/// It merges sources with a merge reader and dedups rows unless the region is
/// in append mode. Also used to build readers for compaction.
pub struct SeqScan {
    /// Properties of the scanner, including partition ranges.
    properties: ScannerProperties,
    /// Context shared by all partition streams.
    stream_ctx: Arc<StreamContext>,
    /// Pruner for file ranges, shared by all partitions.
    pruner: Arc<Pruner>,
    /// Per-partition metrics for EXPLAIN output.
    /// Only populated for queries; left empty during compaction (see `new_partition_metrics`).
    metrics_list: PartitionMetricsList,
}
74
75impl SeqScan {
76 pub(crate) fn new(input: ScanInput) -> Self {
79 let mut properties = ScannerProperties::default()
80 .with_append_mode(input.append_mode)
81 .with_total_rows(input.total_rows());
82 let stream_ctx = Arc::new(StreamContext::seq_scan_ctx(input));
83 properties.partitions = vec![stream_ctx.partition_ranges()];
84
85 let num_workers = common_stat::get_total_cpu_cores().max(1);
87 let pruner = Arc::new(Pruner::new(stream_ctx.clone(), num_workers));
88
89 Self {
90 properties,
91 stream_ctx,
92 pruner,
93 metrics_list: PartitionMetricsList::default(),
94 }
95 }
96
97 #[tracing::instrument(
102 skip_all,
103 fields(region_id = %self.stream_ctx.input.mapper.metadata().region_id)
104 )]
105 pub fn build_stream(&self) -> Result<SendableRecordBatchStream, BoxedError> {
106 let metrics_set = ExecutionPlanMetricsSet::new();
107 let streams = (0..self.properties.partitions.len())
108 .map(|partition: usize| {
109 self.scan_partition(&QueryScanContext::default(), &metrics_set, partition)
110 })
111 .collect::<Result<Vec<_>, _>>()?;
112
113 let aggr_stream = ChainedRecordBatchStream::new(streams).map_err(BoxedError::new)?;
114 Ok(Box::pin(aggr_stream))
115 }
116
117 pub(crate) fn scan_all_partitions(&self) -> Result<ScanBatchStream> {
119 let metrics_set = ExecutionPlanMetricsSet::new();
120
121 let streams = (0..self.properties.partitions.len())
122 .map(|partition| {
123 let metrics = self.new_partition_metrics(false, &metrics_set, partition);
124 self.scan_batch_in_partition(partition, metrics)
125 })
126 .collect::<Result<Vec<_>>>()?;
127
128 Ok(Box::pin(futures::stream::iter(streams).flatten()))
129 }
130
131 pub async fn build_reader_for_compaction(&self) -> Result<BoxedBatchReader> {
136 assert!(self.stream_ctx.input.compaction);
137
138 let metrics_set = ExecutionPlanMetricsSet::new();
139 let part_metrics = self.new_partition_metrics(false, &metrics_set, 0);
140 debug_assert_eq!(1, self.properties.partitions.len());
141 let partition_ranges = &self.properties.partitions[0];
142
143 let reader = Self::merge_all_ranges_for_compaction(
144 &self.stream_ctx,
145 partition_ranges,
146 &part_metrics,
147 self.pruner.clone(),
148 )
149 .await?;
150 Ok(Box::new(reader))
151 }
152
153 pub async fn build_flat_reader_for_compaction(&self) -> Result<BoxedRecordBatchStream> {
158 assert!(self.stream_ctx.input.compaction);
159
160 let metrics_set = ExecutionPlanMetricsSet::new();
161 let part_metrics = self.new_partition_metrics(false, &metrics_set, 0);
162 debug_assert_eq!(1, self.properties.partitions.len());
163 let partition_ranges = &self.properties.partitions[0];
164
165 let reader = Self::merge_all_flat_ranges_for_compaction(
166 &self.stream_ctx,
167 partition_ranges,
168 &part_metrics,
169 self.pruner.clone(),
170 )
171 .await?;
172 Ok(reader)
173 }
174
175 async fn merge_all_ranges_for_compaction(
178 stream_ctx: &Arc<StreamContext>,
179 partition_ranges: &[PartitionRange],
180 part_metrics: &PartitionMetrics,
181 pruner: Arc<Pruner>,
182 ) -> Result<BoxedBatchReader> {
183 pruner.add_partition_ranges(partition_ranges);
184 let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges));
185
186 let mut sources = Vec::new();
187 for part_range in partition_ranges {
188 build_sources(
189 stream_ctx,
190 part_range,
191 true,
192 part_metrics,
193 partition_pruner.clone(),
194 &mut sources,
195 None,
196 )
197 .await?;
198 }
199
200 common_telemetry::debug!(
201 "Build reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}",
202 stream_ctx.input.mapper.metadata().region_id,
203 partition_ranges.len(),
204 sources.len()
205 );
206 Self::build_reader_from_sources(stream_ctx, sources, None, None).await
207 }
208
209 async fn merge_all_flat_ranges_for_compaction(
212 stream_ctx: &Arc<StreamContext>,
213 partition_ranges: &[PartitionRange],
214 part_metrics: &PartitionMetrics,
215 pruner: Arc<Pruner>,
216 ) -> Result<BoxedRecordBatchStream> {
217 pruner.add_partition_ranges(partition_ranges);
218 let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges));
219
220 let mut sources = Vec::new();
221 for part_range in partition_ranges {
222 build_flat_sources(
223 stream_ctx,
224 part_range,
225 true,
226 part_metrics,
227 partition_pruner.clone(),
228 &mut sources,
229 None,
230 )
231 .await?;
232 }
233
234 common_telemetry::debug!(
235 "Build flat reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}",
236 stream_ctx.input.mapper.metadata().region_id,
237 partition_ranges.len(),
238 sources.len()
239 );
240 Self::build_flat_reader_from_sources(stream_ctx, sources, None, None).await
241 }
242
    /// Builds a reader that merges the given `sources`, dedups rows unless the
    /// region is in append mode, and optionally keeps only the last row per
    /// time series.
    ///
    /// When `semaphore` is provided and there is more than one source, the
    /// sources are converted into parallel sources first.
    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
    pub(crate) async fn build_reader_from_sources(
        stream_ctx: &StreamContext,
        mut sources: Vec<Source>,
        semaphore: Option<Arc<Semaphore>>,
        part_metrics: Option<&PartitionMetrics>,
    ) -> Result<BoxedBatchReader> {
        if let Some(semaphore) = semaphore.as_ref() {
            // Only parallelize when there is actually more than one source.
            if sources.len() > 1 {
                sources = stream_ctx
                    .input
                    .create_parallel_sources(sources, semaphore.clone())?;
            }
        }

        // Merge the sorted sources into a single reader.
        let mut builder = MergeReaderBuilder::from_sources(sources);
        if let Some(metrics) = part_metrics {
            builder.with_metrics_reporter(Some(metrics.merge_metrics_reporter()));
        }
        let reader = builder.build().await?;

        // Dedup is skipped in append mode since duplicate rows are allowed there.
        let dedup = !stream_ctx.input.append_mode;
        let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter());
        let reader = if dedup {
            // The dedup strategy depends on the region's merge mode.
            match stream_ctx.input.merge_mode {
                MergeMode::LastRow => Box::new(DedupReader::new(
                    reader,
                    LastRow::new(stream_ctx.input.filter_deleted),
                    dedup_metrics_reporter,
                )) as _,
                MergeMode::LastNonNull => Box::new(DedupReader::new(
                    reader,
                    LastNonNull::new(stream_ctx.input.filter_deleted),
                    dedup_metrics_reporter,
                )) as _,
            }
        } else {
            Box::new(reader) as _
        };

        // Optionally wrap with a reader that keeps the last row of each series.
        let reader = match &stream_ctx.input.series_row_selector {
            Some(TimeSeriesRowSelector::LastRow) => Box::new(LastRowReader::new(reader)) as _,
            None => reader,
        };

        Ok(reader)
    }
293
    /// Builds a flat (record batch) reader that merges the given `sources` and
    /// dedups rows unless the region is in append mode.
    ///
    /// When `semaphore` is provided and there is more than one source, the
    /// sources are converted into parallel flat sources first.
    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
    pub(crate) async fn build_flat_reader_from_sources(
        stream_ctx: &StreamContext,
        mut sources: Vec<BoxedRecordBatchStream>,
        semaphore: Option<Arc<Semaphore>>,
        part_metrics: Option<&PartitionMetrics>,
    ) -> Result<BoxedRecordBatchStream> {
        if let Some(semaphore) = semaphore.as_ref() {
            // Only parallelize when there is actually more than one source.
            if sources.len() > 1 {
                sources = stream_ctx
                    .input
                    .create_parallel_flat_sources(sources, semaphore.clone())?;
            }
        }

        // This path requires the flat mapper; `unwrap` panics on other formats.
        let mapper = stream_ctx.input.mapper.as_flat().unwrap();
        let schema = mapper.input_arrow_schema(stream_ctx.input.compaction);

        let metrics_reporter = part_metrics.map(|m| m.merge_metrics_reporter());
        let reader =
            FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE, metrics_reporter)
                .await?;

        // Dedup is skipped in append mode since duplicate rows are allowed there.
        let dedup = !stream_ctx.input.append_mode;
        let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter());
        let reader = if dedup {
            // The dedup strategy depends on the region's merge mode.
            match stream_ctx.input.merge_mode {
                MergeMode::LastRow => Box::pin(
                    FlatDedupReader::new(
                        reader.into_stream().boxed(),
                        FlatLastRow::new(stream_ctx.input.filter_deleted),
                        dedup_metrics_reporter,
                    )
                    .into_stream(),
                ) as _,
                MergeMode::LastNonNull => Box::pin(
                    FlatDedupReader::new(
                        reader.into_stream().boxed(),
                        FlatLastNonNull::new(
                            mapper.field_column_start(),
                            stream_ctx.input.filter_deleted,
                        ),
                        dedup_metrics_reporter,
                    )
                    .into_stream(),
                ) as _,
            }
        } else {
            Box::pin(reader.into_stream()) as _
        };

        Ok(reader)
    }
350
351 fn scan_partition_impl(
354 &self,
355 ctx: &QueryScanContext,
356 metrics_set: &ExecutionPlanMetricsSet,
357 partition: usize,
358 ) -> Result<SendableRecordBatchStream> {
359 if ctx.explain_verbose {
360 common_telemetry::info!(
361 "SeqScan partition {}, region_id: {}",
362 partition,
363 self.stream_ctx.input.region_metadata().region_id
364 );
365 }
366
367 let metrics = self.new_partition_metrics(ctx.explain_verbose, metrics_set, partition);
368 let input = &self.stream_ctx.input;
369
370 let batch_stream = if input.flat_format {
371 self.scan_flat_batch_in_partition(partition, metrics.clone())?
373 } else {
374 self.scan_batch_in_partition(partition, metrics.clone())?
376 };
377 let record_batch_stream = ConvertBatchStream::new(
378 batch_stream,
379 input.mapper.clone(),
380 input.cache_strategy.clone(),
381 metrics,
382 );
383
384 Ok(Box::pin(RecordBatchStreamWrapper::new(
385 input.mapper.output_schema(),
386 Box::pin(record_batch_stream),
387 )))
388 }
389
    /// Scans the given `partition` and returns a stream of [ScanBatch]es in
    /// the primary key format.
    ///
    /// Returns an error if `partition` is out of range or the input mapper is
    /// not in the primary key format.
    #[tracing::instrument(
        skip_all,
        fields(
            region_id = %self.stream_ctx.input.mapper.metadata().region_id,
            partition = partition
        )
    )]
    fn scan_batch_in_partition(
        &self,
        partition: usize,
        part_metrics: PartitionMetrics,
    ) -> Result<ScanBatchStream> {
        ensure!(
            partition < self.properties.partitions.len(),
            PartitionOutOfRangeSnafu {
                given: partition,
                all: self.properties.partitions.len(),
            }
        );

        // Nothing to scan for an empty partition.
        if self.properties.partitions[partition].is_empty() {
            return Ok(Box::pin(futures::stream::empty()));
        }

        let stream_ctx = self.stream_ctx.clone();
        let semaphore = self.new_semaphore();
        let partition_ranges = self.properties.partitions[partition].clone();
        let compaction = self.stream_ctx.input.compaction;
        let distinguish_range = self.properties.distinguish_partition_range;
        // Don't limit file scans by the semaphore during compaction.
        let file_scan_semaphore = if compaction { None } else { semaphore.clone() };
        let pruner = self.pruner.clone();
        pruner.add_partition_ranges(&partition_ranges);
        let partition_pruner = Arc::new(PartitionPruner::new(pruner, &partition_ranges));

        let stream = try_stream! {
            part_metrics.on_first_poll();
            let mut fetch_start = Instant::now();

            // This path only supports the primary key format.
            let _mapper = stream_ctx.input.mapper.as_primary_key().context(UnexpectedSnafu {
                reason: "Unexpected format",
            })?;
            for part_range in partition_ranges {
                // Builds sources of the range and merges them into one reader.
                let mut sources = Vec::new();
                build_sources(
                    &stream_ctx,
                    &part_range,
                    compaction,
                    &part_metrics,
                    partition_pruner.clone(),
                    &mut sources,
                    file_scan_semaphore.clone(),
                ).await?;

                let mut reader =
                    Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics))
                        .await?;
                // Debug-only check that batches stay inside the part range.
                #[cfg(debug_assertions)]
                let mut checker = crate::read::BatchChecker::default()
                    .with_start(Some(part_range.start))
                    .with_end(Some(part_range.end));

                let mut metrics = ScannerMetrics {
                    scan_cost: fetch_start.elapsed(),
                    ..Default::default()
                };
                fetch_start = Instant::now();

                while let Some(batch) = reader.next_batch().await? {
                    metrics.scan_cost += fetch_start.elapsed();
                    metrics.num_batches += 1;
                    metrics.num_rows += batch.num_rows();

                    // Skip empty batches (not expected in release builds).
                    debug_assert!(!batch.is_empty());
                    if batch.is_empty() {
                        fetch_start = Instant::now();
                        continue;
                    }

                    #[cfg(debug_assertions)]
                    checker.ensure_part_range_batch(
                        "SeqScan",
                        _mapper.metadata().region_id,
                        partition,
                        part_range,
                        &batch,
                    );

                    let yield_start = Instant::now();
                    yield ScanBatch::Normal(batch);
                    metrics.yield_cost += yield_start.elapsed();

                    fetch_start = Instant::now();
                }

                // Yield an empty batch to mark the end of this range if the
                // consumer distinguishes partition ranges.
                if distinguish_range {
                    let yield_start = Instant::now();
                    yield ScanBatch::Normal(Batch::empty());
                    metrics.yield_cost += yield_start.elapsed();
                }

                metrics.scan_cost += fetch_start.elapsed();
                fetch_start = Instant::now();
                part_metrics.merge_metrics(&metrics);
            }

            part_metrics.on_finish();
        };
        Ok(Box::pin(stream))
    }
508
    /// Scans the given `partition` and returns a stream of [ScanBatch]es in
    /// the flat (record batch) format.
    ///
    /// Returns an error if `partition` is out of range.
    #[tracing::instrument(
        skip_all,
        fields(
            region_id = %self.stream_ctx.input.mapper.metadata().region_id,
            partition = partition
        )
    )]
    fn scan_flat_batch_in_partition(
        &self,
        partition: usize,
        part_metrics: PartitionMetrics,
    ) -> Result<ScanBatchStream> {
        ensure!(
            partition < self.properties.partitions.len(),
            PartitionOutOfRangeSnafu {
                given: partition,
                all: self.properties.partitions.len(),
            }
        );

        // Nothing to scan for an empty partition.
        if self.properties.partitions[partition].is_empty() {
            return Ok(Box::pin(futures::stream::empty()));
        }

        let stream_ctx = self.stream_ctx.clone();
        let semaphore = self.new_semaphore();
        let partition_ranges = self.properties.partitions[partition].clone();
        let compaction = self.stream_ctx.input.compaction;
        // Don't limit file scans by the semaphore during compaction.
        let file_scan_semaphore = if compaction { None } else { semaphore.clone() };
        let pruner = self.pruner.clone();
        pruner.add_partition_ranges(&partition_ranges);
        let partition_pruner = Arc::new(PartitionPruner::new(pruner, &partition_ranges));

        let stream = try_stream! {
            part_metrics.on_first_poll();
            let mut fetch_start = Instant::now();

            for part_range in partition_ranges {
                // Builds flat sources of the range and merges them into one reader.
                let mut sources = Vec::new();
                build_flat_sources(
                    &stream_ctx,
                    &part_range,
                    compaction,
                    &part_metrics,
                    partition_pruner.clone(),
                    &mut sources,
                    file_scan_semaphore.clone(),
                ).await?;

                let mut reader =
                    Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics))
                        .await?;

                let mut metrics = ScannerMetrics {
                    scan_cost: fetch_start.elapsed(),
                    ..Default::default()
                };
                fetch_start = Instant::now();

                while let Some(record_batch) = reader.try_next().await? {
                    metrics.scan_cost += fetch_start.elapsed();
                    metrics.num_batches += 1;
                    metrics.num_rows += record_batch.num_rows();

                    // Skip empty batches (not expected in release builds).
                    debug_assert!(record_batch.num_rows() > 0);
                    if record_batch.num_rows() == 0 {
                        fetch_start = Instant::now();
                        continue;
                    }

                    let yield_start = Instant::now();
                    yield ScanBatch::RecordBatch(record_batch);
                    metrics.yield_cost += yield_start.elapsed();

                    fetch_start = Instant::now();
                }

                metrics.scan_cost += fetch_start.elapsed();
                fetch_start = Instant::now();
                part_metrics.merge_metrics(&metrics);
            }

            part_metrics.on_finish();
        };
        Ok(Box::pin(stream))
    }
602
603 fn new_semaphore(&self) -> Option<Arc<Semaphore>> {
604 if self.properties.target_partitions() > self.properties.num_partitions() {
605 Some(Arc::new(Semaphore::new(
611 self.properties.target_partitions() - self.properties.num_partitions() + 1,
612 )))
613 } else {
614 None
615 }
616 }
617
618 fn new_partition_metrics(
621 &self,
622 explain_verbose: bool,
623 metrics_set: &ExecutionPlanMetricsSet,
624 partition: usize,
625 ) -> PartitionMetrics {
626 let metrics = PartitionMetrics::new(
627 self.stream_ctx.input.mapper.metadata().region_id,
628 partition,
629 get_scanner_type(self.stream_ctx.input.compaction),
630 self.stream_ctx.query_start,
631 explain_verbose,
632 metrics_set,
633 );
634
635 if !self.stream_ctx.input.compaction {
636 self.metrics_list.set(partition, metrics.clone());
637 }
638
639 metrics
640 }
641
642 fn max_files_in_partition(ranges: &[RangeMeta], partition_ranges: &[PartitionRange]) -> usize {
644 partition_ranges
645 .iter()
646 .map(|part_range| {
647 let range_meta = &ranges[part_range.identifier];
648 range_meta.indices.len()
649 })
650 .max()
651 .unwrap_or(0)
652 }
653
654 pub(crate) fn check_scan_limit(&self) -> Result<()> {
656 let total_max_files: usize = self
658 .properties
659 .partitions
660 .iter()
661 .map(|partition| Self::max_files_in_partition(&self.stream_ctx.ranges, partition))
662 .sum();
663
664 let max_concurrent_files = self.stream_ctx.input.max_concurrent_scan_files;
665 if total_max_files > max_concurrent_files {
666 return TooManyFilesToReadSnafu {
667 actual: total_max_files,
668 max: max_concurrent_files,
669 }
670 .fail();
671 }
672
673 Ok(())
674 }
675}
676
677impl RegionScanner for SeqScan {
678 fn name(&self) -> &str {
679 "SeqScan"
680 }
681
682 fn properties(&self) -> &ScannerProperties {
683 &self.properties
684 }
685
686 fn schema(&self) -> SchemaRef {
687 self.stream_ctx.input.mapper.output_schema()
688 }
689
690 fn metadata(&self) -> RegionMetadataRef {
691 self.stream_ctx.input.mapper.metadata().clone()
692 }
693
694 fn scan_partition(
695 &self,
696 ctx: &QueryScanContext,
697 metrics_set: &ExecutionPlanMetricsSet,
698 partition: usize,
699 ) -> Result<SendableRecordBatchStream, BoxedError> {
700 self.scan_partition_impl(ctx, metrics_set, partition)
701 .map_err(BoxedError::new)
702 }
703
704 fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
705 self.properties.prepare(request);
706
707 self.check_scan_limit().map_err(BoxedError::new)?;
708
709 Ok(())
710 }
711
712 fn has_predicate_without_region(&self) -> bool {
713 let predicate = self
714 .stream_ctx
715 .input
716 .predicate_group()
717 .predicate_without_region();
718 predicate.is_some()
719 }
720
721 fn add_dyn_filter_to_predicate(
722 &mut self,
723 filter_exprs: Vec<Arc<dyn datafusion::physical_plan::PhysicalExpr>>,
724 ) -> Vec<bool> {
725 self.stream_ctx.add_dyn_filter_to_predicate(filter_exprs)
726 }
727
728 fn set_logical_region(&mut self, logical_region: bool) {
729 self.properties.set_logical_region(logical_region);
730 }
731}
732
733impl DisplayAs for SeqScan {
734 fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
735 write!(
736 f,
737 "SeqScan: region={}, ",
738 self.stream_ctx.input.mapper.metadata().region_id
739 )?;
740 match t {
741 DisplayFormatType::Default | DisplayFormatType::TreeRender => {
743 self.stream_ctx.format_for_explain(false, f)
744 }
745 DisplayFormatType::Verbose => {
746 self.stream_ctx.format_for_explain(true, f)?;
747 self.metrics_list.format_verbose_metrics(f)
748 }
749 }
750 }
751}
752
753impl fmt::Debug for SeqScan {
754 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
755 f.debug_struct("SeqScan")
756 .field("num_ranges", &self.stream_ctx.ranges.len())
757 .finish()
758 }
759}
760
/// Builds [Source]s for one `part_range` and appends them to `sources`.
///
/// Sources keep the order of `range_meta.row_group_indices` even when file
/// ranges are scanned concurrently under the optional `semaphore`.
pub(crate) async fn build_sources(
    stream_ctx: &Arc<StreamContext>,
    part_range: &PartitionRange,
    compaction: bool,
    part_metrics: &PartitionMetrics,
    partition_pruner: Arc<PartitionPruner>,
    sources: &mut Vec<Source>,
    semaphore: Option<Arc<Semaphore>>,
) -> Result<()> {
    let range_meta = &stream_ctx.ranges[part_range.identifier];
    #[cfg(debug_assertions)]
    if compaction {
        // Compaction expects indices and row group indices to match
        // one-to-one, and every range must scan all row groups (-1).
        debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len());
        for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() {
            debug_assert_eq!(
                -1, row_group_idx.row_group_index,
                "Expect {} range scan all row groups, given: {}",
                i, row_group_idx.row_group_index,
            );
        }
    }

    // Label used by the file scan metrics.
    let read_type = if compaction {
        "compaction"
    } else {
        "seq_scan_files"
    };
    let num_indices = range_meta.row_group_indices.len();
    if num_indices == 0 {
        return Ok(());
    }

    sources.reserve(num_indices);
    // Placeholders so concurrently built file sources can be written back at
    // their original position.
    let mut ordered_sources = Vec::with_capacity(num_indices);
    ordered_sources.resize_with(num_indices, || None);
    let mut file_scan_tasks = Vec::new();

    for (position, index) in range_meta.row_group_indices.iter().enumerate() {
        if stream_ctx.is_mem_range_index(*index) {
            // Memtable range: build the stream inline.
            let stream = scan_mem_ranges(
                stream_ctx.clone(),
                part_metrics.clone(),
                *index,
                range_meta.time_range,
            );
            ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _));
        } else if stream_ctx.is_file_range_index(*index) {
            if let Some(semaphore_ref) = semaphore.as_ref() {
                // File range with semaphore: build the stream concurrently,
                // bounded by the semaphore permits.
                let stream_ctx = stream_ctx.clone();
                let part_metrics = part_metrics.clone();
                let partition_pruner = partition_pruner.clone();
                let semaphore = Arc::clone(semaphore_ref);
                let row_group_index = *index;
                file_scan_tasks.push(async move {
                    // `acquire` only fails when the semaphore is closed, which
                    // we never do here.
                    let _permit = semaphore.acquire().await.unwrap();
                    let stream = scan_file_ranges(
                        stream_ctx,
                        part_metrics,
                        row_group_index,
                        read_type,
                        partition_pruner,
                    )
                    .await?;
                    Ok((position, Source::Stream(Box::pin(stream) as _)))
                });
            } else {
                // File range without semaphore: build the stream inline.
                let stream = scan_file_ranges(
                    stream_ctx.clone(),
                    part_metrics.clone(),
                    *index,
                    read_type,
                    partition_pruner.clone(),
                )
                .await?;
                ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _));
            }
        } else {
            // Other range kinds handled by the scan util helper.
            let stream =
                scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await?;
            ordered_sources[position] = Some(Source::Stream(stream));
        }
    }

    if !file_scan_tasks.is_empty() {
        // Wait for the concurrent file scans and restore their positions.
        let results = futures::future::try_join_all(file_scan_tasks).await?;
        for (position, source) in results {
            ordered_sources[position] = Some(source);
        }
    }

    for source in ordered_sources.into_iter().flatten() {
        sources.push(source);
    }
    Ok(())
}
862
/// Builds flat record batch sources for one `part_range` and appends them to
/// `sources`. This is the flat-format counterpart of [build_sources].
///
/// Sources keep the order of `range_meta.row_group_indices` even when file
/// ranges are scanned concurrently under the optional `semaphore`.
pub(crate) async fn build_flat_sources(
    stream_ctx: &Arc<StreamContext>,
    part_range: &PartitionRange,
    compaction: bool,
    part_metrics: &PartitionMetrics,
    partition_pruner: Arc<PartitionPruner>,
    sources: &mut Vec<BoxedRecordBatchStream>,
    semaphore: Option<Arc<Semaphore>>,
) -> Result<()> {
    let range_meta = &stream_ctx.ranges[part_range.identifier];
    #[cfg(debug_assertions)]
    if compaction {
        // Compaction expects indices and row group indices to match
        // one-to-one, and every range must scan all row groups (-1).
        debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len());
        for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() {
            debug_assert_eq!(
                -1, row_group_idx.row_group_index,
                "Expect {} range scan all row groups, given: {}",
                i, row_group_idx.row_group_index,
            );
        }
    }

    // Label used by the file scan metrics.
    let read_type = if compaction {
        "compaction"
    } else {
        "seq_scan_files"
    };
    let num_indices = range_meta.row_group_indices.len();
    if num_indices == 0 {
        return Ok(());
    }

    // Whether batches must be split before merging.
    let should_split = should_split_flat_batches_for_merge(stream_ctx, range_meta);
    sources.reserve(num_indices);
    // Placeholders so concurrently built file sources can be written back at
    // their original position.
    let mut ordered_sources = Vec::with_capacity(num_indices);
    ordered_sources.resize_with(num_indices, || None);
    let mut file_scan_tasks = Vec::new();

    for (position, index) in range_meta.row_group_indices.iter().enumerate() {
        if stream_ctx.is_mem_range_index(*index) {
            // Memtable range: build the stream inline.
            let stream = scan_flat_mem_ranges(
                stream_ctx.clone(),
                part_metrics.clone(),
                *index,
                range_meta.time_range,
            );
            ordered_sources[position] = Some(Box::pin(stream) as _);
        } else if stream_ctx.is_file_range_index(*index) {
            if let Some(semaphore_ref) = semaphore.as_ref() {
                // File range with semaphore: build the stream concurrently,
                // bounded by the semaphore permits.
                let stream_ctx = stream_ctx.clone();
                let part_metrics = part_metrics.clone();
                let partition_pruner = partition_pruner.clone();
                let semaphore = Arc::clone(semaphore_ref);
                let row_group_index = *index;
                file_scan_tasks.push(async move {
                    // `acquire` only fails when the semaphore is closed, which
                    // we never do here.
                    let _permit = semaphore.acquire().await.unwrap();
                    let stream = scan_flat_file_ranges(
                        stream_ctx,
                        part_metrics,
                        row_group_index,
                        read_type,
                        partition_pruner,
                    )
                    .await?;
                    Ok((position, Box::pin(stream) as _))
                });
            } else {
                // File range without semaphore: build the stream inline.
                let stream = scan_flat_file_ranges(
                    stream_ctx.clone(),
                    part_metrics.clone(),
                    *index,
                    read_type,
                    partition_pruner.clone(),
                )
                .await?;
                ordered_sources[position] = Some(Box::pin(stream) as _);
            }
        } else {
            // Other range kinds handled by the scan util helper.
            let stream =
                scan_util::maybe_scan_flat_other_ranges(stream_ctx, *index, part_metrics).await?;
            ordered_sources[position] = Some(stream);
        }
    }

    if !file_scan_tasks.is_empty() {
        // Wait for the concurrent file scans and restore their positions.
        let results = futures::future::try_join_all(file_scan_tasks).await?;
        for (position, stream) in results {
            ordered_sources[position] = Some(stream);
        }
    }

    for stream in ordered_sources.into_iter().flatten() {
        if should_split {
            // Wrap the stream so its batches are split before merging.
            sources.push(Box::pin(SplitRecordBatchStream::new(stream)));
        } else {
            sources.push(stream);
        }
    }

    if should_split {
        common_telemetry::debug!(
            "Splitting record batches, region: {}, sources: {}, part_range: {:?}",
            stream_ctx.input.region_metadata().region_id,
            sources.len(),
            part_range,
        );
    }

    Ok(())
}
979
#[cfg(test)]
impl SeqScan {
    /// Returns the scan input of this scanner (test only).
    pub(crate) fn input(&self) -> &ScanInput {
        &self.stream_ctx.input
    }
}
987
/// Returns the scanner display name depending on whether it scans for
/// compaction.
fn get_scanner_type(compaction: bool) -> &'static str {
    match compaction {
        true => "SeqScan(compaction)",
        false => "SeqScan",
    }
}