1use std::fmt;
18use std::sync::Arc;
19use std::time::Instant;
20
21use async_stream::try_stream;
22use common_error::ext::BoxedError;
23use common_recordbatch::util::ChainedRecordBatchStream;
24use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
25use common_telemetry::tracing;
26use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
27use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
28use datatypes::schema::SchemaRef;
29use futures::{StreamExt, TryStreamExt};
30use snafu::ensure;
31use store_api::metadata::RegionMetadataRef;
32use store_api::region_engine::{
33 PartitionRange, PrepareRequest, QueryScanContext, RegionScanner, ScannerProperties,
34};
35use store_api::storage::TimeSeriesRowSelector;
36use tokio::sync::Semaphore;
37
38use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu};
39use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow};
40use crate::read::flat_merge::FlatMergeReader;
41use crate::read::last_row::FlatLastRowReader;
42use crate::read::pruner::{PartitionPruner, Pruner};
43use crate::read::range::RangeMeta;
44use crate::read::range_cache::{
45 build_range_cache_key, cache_flat_range_stream, cached_flat_range_stream,
46};
47use crate::read::scan_region::{ScanInput, StreamContext};
48use crate::read::scan_util::{
49 PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, compute_parallel_channel_size,
50 scan_flat_file_ranges, scan_flat_mem_ranges, should_split_flat_batches_for_merge,
51};
52use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream};
53use crate::read::{BoxedRecordBatchStream, ScannerMetrics, scan_util};
54use crate::region::options::MergeMode;
55use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
56
57pub struct SeqScan {
62 properties: ScannerProperties,
64 stream_ctx: Arc<StreamContext>,
66 pruner: Arc<Pruner>,
68 metrics_list: PartitionMetricsList,
71}
72
73impl SeqScan {
74 pub(crate) fn new(input: ScanInput) -> Self {
77 let mut properties = ScannerProperties::default()
78 .with_append_mode(input.append_mode)
79 .with_total_rows(input.total_rows());
80 let stream_ctx = Arc::new(StreamContext::seq_scan_ctx(input));
81 properties.partitions = vec![stream_ctx.partition_ranges()];
82
83 let num_workers = common_stat::get_total_cpu_cores().max(1);
85 let pruner = Arc::new(Pruner::new(stream_ctx.clone(), num_workers));
86
87 Self {
88 properties,
89 stream_ctx,
90 pruner,
91 metrics_list: PartitionMetricsList::default(),
92 }
93 }
94
95 #[tracing::instrument(
100 skip_all,
101 fields(region_id = %self.stream_ctx.input.mapper.metadata().region_id)
102 )]
103 pub fn build_stream(&self) -> Result<SendableRecordBatchStream, BoxedError> {
104 let metrics_set = ExecutionPlanMetricsSet::new();
105 let streams = (0..self.properties.partitions.len())
106 .map(|partition: usize| {
107 self.scan_partition(&QueryScanContext::default(), &metrics_set, partition)
108 })
109 .collect::<Result<Vec<_>, _>>()?;
110
111 let aggr_stream = ChainedRecordBatchStream::new(streams).map_err(BoxedError::new)?;
112 Ok(Box::pin(aggr_stream))
113 }
114
115 pub(crate) fn scan_all_partitions(&self) -> Result<ScanBatchStream> {
117 let metrics_set = ExecutionPlanMetricsSet::new();
118
119 let streams = (0..self.properties.partitions.len())
120 .map(|partition| {
121 let metrics = self.new_partition_metrics(false, &metrics_set, partition);
122 self.scan_flat_batch_in_partition(partition, metrics)
123 })
124 .collect::<Result<Vec<_>>>()?;
125
126 Ok(Box::pin(futures::stream::iter(streams).flatten()))
127 }
128
129 pub async fn build_flat_reader_for_compaction(&self) -> Result<BoxedRecordBatchStream> {
134 assert!(self.stream_ctx.input.compaction);
135
136 let metrics_set = ExecutionPlanMetricsSet::new();
137 let part_metrics = self.new_partition_metrics(false, &metrics_set, 0);
138 debug_assert_eq!(1, self.properties.partitions.len());
139 let partition_ranges = &self.properties.partitions[0];
140
141 let reader = Self::merge_all_flat_ranges_for_compaction(
142 &self.stream_ctx,
143 partition_ranges,
144 &part_metrics,
145 self.pruner.clone(),
146 )
147 .await?;
148 Ok(reader)
149 }
150
151 async fn merge_all_flat_ranges_for_compaction(
154 stream_ctx: &Arc<StreamContext>,
155 partition_ranges: &[PartitionRange],
156 part_metrics: &PartitionMetrics,
157 pruner: Arc<Pruner>,
158 ) -> Result<BoxedRecordBatchStream> {
159 pruner.add_partition_ranges(partition_ranges);
160 let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges));
161
162 let mut sources = Vec::new();
163 for part_range in partition_ranges {
164 build_flat_sources(
165 stream_ctx,
166 part_range,
167 true,
168 part_metrics,
169 partition_pruner.clone(),
170 &mut sources,
171 None,
172 )
173 .await?;
174 }
175
176 common_telemetry::debug!(
177 "Build flat reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}",
178 stream_ctx.input.mapper.metadata().region_id,
179 partition_ranges.len(),
180 sources.len()
181 );
182 Self::build_flat_reader_from_sources(
183 stream_ctx,
184 sources,
185 None,
186 None,
187 false,
188 compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE),
189 )
190 .await
191 }
192
193 #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
197 pub(crate) async fn build_flat_reader_from_sources(
198 stream_ctx: &StreamContext,
199 mut sources: Vec<BoxedRecordBatchStream>,
200 semaphore: Option<Arc<Semaphore>>,
201 part_metrics: Option<&PartitionMetrics>,
202 skip_dedup: bool,
203 channel_size: usize,
204 ) -> Result<BoxedRecordBatchStream> {
205 if let Some(semaphore) = semaphore.as_ref() {
206 if sources.len() > 1 {
208 sources = stream_ctx.input.create_parallel_flat_sources(
209 sources,
210 semaphore.clone(),
211 channel_size,
212 )?;
213 }
214 }
215
216 let mapper = stream_ctx.input.mapper.as_flat().unwrap();
217 let schema = mapper.input_arrow_schema(stream_ctx.input.compaction);
218
219 let metrics_reporter = part_metrics.map(|m| m.merge_metrics_reporter());
220 let reader =
221 FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE, metrics_reporter)
222 .await?;
223
224 let dedup = !skip_dedup && !stream_ctx.input.append_mode;
225 let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter());
226 let reader = if dedup {
227 match stream_ctx.input.merge_mode {
228 MergeMode::LastRow => Box::pin(
229 FlatDedupReader::new(
230 reader.into_stream().boxed(),
231 FlatLastRow::new(stream_ctx.input.filter_deleted),
232 dedup_metrics_reporter,
233 )
234 .into_stream(),
235 ) as _,
236 MergeMode::LastNonNull => Box::pin(
237 FlatDedupReader::new(
238 reader.into_stream().boxed(),
239 FlatLastNonNull::new(
240 mapper.field_column_start(),
241 stream_ctx.input.filter_deleted,
242 ),
243 dedup_metrics_reporter,
244 )
245 .into_stream(),
246 ) as _,
247 }
248 } else {
249 Box::pin(reader.into_stream()) as _
250 };
251
252 let reader = match &stream_ctx.input.series_row_selector {
253 Some(TimeSeriesRowSelector::LastRow) => {
254 Box::pin(FlatLastRowReader::new(reader).into_stream()) as _
255 }
256 None => reader,
257 };
258
259 Ok(reader)
260 }
261
262 pub(crate) async fn build_flat_partition_range_read(
264 stream_ctx: &Arc<StreamContext>,
265 part_range: &PartitionRange,
266 compaction: bool,
267 part_metrics: &PartitionMetrics,
268 partition_pruner: Arc<PartitionPruner>,
269 file_scan_semaphore: Option<Arc<Semaphore>>,
270 merge_semaphore: Option<Arc<Semaphore>>,
271 ) -> Result<(BoxedRecordBatchStream, usize)> {
272 let cache_key = build_range_cache_key(stream_ctx, part_range);
273
274 if let Some(key) = cache_key.as_ref() {
275 if let Some(value) = stream_ctx.input.cache_strategy.get_range_result(key) {
276 part_metrics.inc_range_cache_hit();
277 return Ok((cached_flat_range_stream(value), DEFAULT_READ_BATCH_SIZE));
278 }
279 part_metrics.inc_range_cache_miss();
280 }
281
282 let mut sources = Vec::new();
283 let split_batch_size = build_flat_sources(
284 stream_ctx,
285 part_range,
286 compaction,
287 part_metrics,
288 partition_pruner,
289 &mut sources,
290 file_scan_semaphore,
291 )
292 .await?;
293 let estimated_rows_per_batch = split_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE);
294 let channel_size = compute_parallel_channel_size(estimated_rows_per_batch);
295 let stream = Self::build_flat_reader_from_sources(
296 stream_ctx,
297 sources,
298 merge_semaphore,
299 Some(part_metrics),
300 false,
301 channel_size,
302 )
303 .await?;
304
305 let stream = match cache_key {
306 Some(key) => cache_flat_range_stream(
307 stream,
308 stream_ctx.input.cache_strategy.clone(),
309 key,
310 part_metrics.clone(),
311 ),
312 None => stream,
313 };
314
315 Ok((stream, estimated_rows_per_batch))
316 }
317
318 fn scan_partition_impl(
321 &self,
322 ctx: &QueryScanContext,
323 metrics_set: &ExecutionPlanMetricsSet,
324 partition: usize,
325 ) -> Result<SendableRecordBatchStream> {
326 if ctx.explain_verbose {
327 common_telemetry::info!(
328 "SeqScan partition {}, region_id: {}",
329 partition,
330 self.stream_ctx.input.region_metadata().region_id
331 );
332 }
333
334 let metrics = self.new_partition_metrics(ctx.explain_verbose, metrics_set, partition);
335 let input = &self.stream_ctx.input;
336
337 let batch_stream = self.scan_flat_batch_in_partition(partition, metrics.clone())?;
338 let record_batch_stream = ConvertBatchStream::new(
339 batch_stream,
340 input.mapper.clone(),
341 input.cache_strategy.clone(),
342 metrics,
343 );
344
345 Ok(Box::pin(RecordBatchStreamWrapper::new(
346 input.mapper.output_schema(),
347 Box::pin(record_batch_stream),
348 )))
349 }
350
351 #[tracing::instrument(
352 skip_all,
353 fields(
354 region_id = %self.stream_ctx.input.mapper.metadata().region_id,
355 partition = partition
356 )
357 )]
358 fn scan_flat_batch_in_partition(
359 &self,
360 partition: usize,
361 part_metrics: PartitionMetrics,
362 ) -> Result<ScanBatchStream> {
363 ensure!(
364 partition < self.properties.partitions.len(),
365 PartitionOutOfRangeSnafu {
366 given: partition,
367 all: self.properties.partitions.len(),
368 }
369 );
370
371 if self.properties.partitions[partition].is_empty() {
372 return Ok(Box::pin(futures::stream::empty()));
373 }
374
375 let stream_ctx = self.stream_ctx.clone();
376 let semaphore = self.new_semaphore();
377 let partition_ranges = self.properties.partitions[partition].clone();
378 let compaction = self.stream_ctx.input.compaction;
379 let file_scan_semaphore = if compaction { None } else { semaphore.clone() };
380 let pruner = self.pruner.clone();
381 pruner.add_partition_ranges(&partition_ranges);
386 let partition_pruner = Arc::new(PartitionPruner::new(pruner, &partition_ranges));
387
388 let stream = try_stream! {
389 part_metrics.on_first_poll();
390 let mut fetch_start = Instant::now();
393
394 for part_range in partition_ranges {
396 let (mut reader, _) = Self::build_flat_partition_range_read(
397 &stream_ctx,
398 &part_range,
399 compaction,
400 &part_metrics,
401 partition_pruner.clone(),
402 file_scan_semaphore.clone(),
403 semaphore.clone(),
404 )
405 .await?;
406
407 let mut metrics = ScannerMetrics {
408 scan_cost: fetch_start.elapsed(),
409 ..Default::default()
410 };
411 fetch_start = Instant::now();
412
413 while let Some(record_batch) = reader.try_next().await? {
414 metrics.scan_cost += fetch_start.elapsed();
415 metrics.num_batches += 1;
416 metrics.num_rows += record_batch.num_rows();
417
418 debug_assert!(record_batch.num_rows() > 0);
419 if record_batch.num_rows() == 0 {
420 fetch_start = Instant::now();
421 continue;
422 }
423
424 let yield_start = Instant::now();
425 yield ScanBatch::RecordBatch(record_batch);
426 metrics.yield_cost += yield_start.elapsed();
427
428 fetch_start = Instant::now();
429 }
430
431 metrics.scan_cost += fetch_start.elapsed();
432 fetch_start = Instant::now();
433 part_metrics.merge_metrics(&metrics);
434 }
435
436 part_metrics.on_finish();
437 };
438 Ok(Box::pin(stream))
439 }
440
441 fn new_semaphore(&self) -> Option<Arc<Semaphore>> {
442 if self.properties.target_partitions() > self.properties.num_partitions() {
443 Some(Arc::new(Semaphore::new(
449 self.properties.target_partitions() - self.properties.num_partitions() + 1,
450 )))
451 } else {
452 None
453 }
454 }
455
456 fn new_partition_metrics(
459 &self,
460 explain_verbose: bool,
461 metrics_set: &ExecutionPlanMetricsSet,
462 partition: usize,
463 ) -> PartitionMetrics {
464 let metrics = PartitionMetrics::new(
465 self.stream_ctx.input.mapper.metadata().region_id,
466 partition,
467 get_scanner_type(self.stream_ctx.input.compaction),
468 self.stream_ctx.query_start,
469 explain_verbose,
470 metrics_set,
471 );
472
473 if !self.stream_ctx.input.compaction {
474 self.metrics_list.set(partition, metrics.clone());
475 }
476
477 metrics
478 }
479
480 fn max_files_in_partition(ranges: &[RangeMeta], partition_ranges: &[PartitionRange]) -> usize {
482 partition_ranges
483 .iter()
484 .map(|part_range| {
485 let range_meta = &ranges[part_range.identifier];
486 range_meta.indices.len()
487 })
488 .max()
489 .unwrap_or(0)
490 }
491
492 pub(crate) fn check_scan_limit(&self) -> Result<()> {
494 let total_max_files: usize = self
496 .properties
497 .partitions
498 .iter()
499 .map(|partition| Self::max_files_in_partition(&self.stream_ctx.ranges, partition))
500 .sum();
501
502 let max_concurrent_files = self.stream_ctx.input.max_concurrent_scan_files;
503 if total_max_files > max_concurrent_files {
504 return TooManyFilesToReadSnafu {
505 actual: total_max_files,
506 max: max_concurrent_files,
507 }
508 .fail();
509 }
510
511 Ok(())
512 }
513}
514
515impl RegionScanner for SeqScan {
516 fn name(&self) -> &str {
517 "SeqScan"
518 }
519
520 fn properties(&self) -> &ScannerProperties {
521 &self.properties
522 }
523
524 fn schema(&self) -> SchemaRef {
525 self.stream_ctx.input.mapper.output_schema()
526 }
527
528 fn metadata(&self) -> RegionMetadataRef {
529 self.stream_ctx.input.mapper.metadata().clone()
530 }
531
532 fn scan_partition(
533 &self,
534 ctx: &QueryScanContext,
535 metrics_set: &ExecutionPlanMetricsSet,
536 partition: usize,
537 ) -> Result<SendableRecordBatchStream, BoxedError> {
538 self.scan_partition_impl(ctx, metrics_set, partition)
539 .map_err(BoxedError::new)
540 }
541
542 fn prepare(&mut self, request: PrepareRequest) -> Result<(), BoxedError> {
543 self.properties.prepare(request);
544
545 self.check_scan_limit().map_err(BoxedError::new)?;
546
547 Ok(())
548 }
549
550 fn has_predicate_without_region(&self) -> bool {
551 let predicate = self
552 .stream_ctx
553 .input
554 .predicate_group()
555 .predicate_without_region();
556 predicate.is_some()
557 }
558
559 fn add_dyn_filter_to_predicate(
560 &mut self,
561 filter_exprs: Vec<Arc<dyn datafusion::physical_plan::PhysicalExpr>>,
562 ) -> Vec<bool> {
563 self.stream_ctx.add_dyn_filter_to_predicate(filter_exprs)
564 }
565
566 fn set_logical_region(&mut self, logical_region: bool) {
567 self.properties.set_logical_region(logical_region);
568 }
569}
570
571impl DisplayAs for SeqScan {
572 fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
573 write!(
574 f,
575 "SeqScan: region={}, ",
576 self.stream_ctx.input.mapper.metadata().region_id
577 )?;
578 match t {
579 DisplayFormatType::Default | DisplayFormatType::TreeRender => {
581 self.stream_ctx.format_for_explain(false, f)
582 }
583 DisplayFormatType::Verbose => {
584 self.stream_ctx.format_for_explain(true, f)?;
585 self.metrics_list.format_verbose_metrics(f)
586 }
587 }
588 }
589}
590
591impl fmt::Debug for SeqScan {
592 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
593 f.debug_struct("SeqScan")
594 .field("num_ranges", &self.stream_ctx.ranges.len())
595 .finish()
596 }
597}
598
599pub(crate) async fn build_flat_sources(
602 stream_ctx: &Arc<StreamContext>,
603 part_range: &PartitionRange,
604 compaction: bool,
605 part_metrics: &PartitionMetrics,
606 partition_pruner: Arc<PartitionPruner>,
607 sources: &mut Vec<BoxedRecordBatchStream>,
608 semaphore: Option<Arc<Semaphore>>,
609) -> Result<Option<usize>> {
610 let range_meta = &stream_ctx.ranges[part_range.identifier];
612 #[cfg(debug_assertions)]
613 if compaction {
614 debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len());
616 for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() {
617 debug_assert_eq!(
619 -1, row_group_idx.row_group_index,
620 "Expect {} range scan all row groups, given: {}",
621 i, row_group_idx.row_group_index,
622 );
623 }
624 }
625
626 let read_type = if compaction {
627 "compaction"
628 } else {
629 "seq_scan_files"
630 };
631 let num_indices = range_meta.row_group_indices.len();
632 if num_indices == 0 {
633 return Ok(None);
634 }
635
636 let split_batch_size = should_split_flat_batches_for_merge(stream_ctx, range_meta);
637 let should_split = split_batch_size.is_some();
638 sources.reserve(num_indices);
639 let mut ordered_sources = Vec::with_capacity(num_indices);
640 ordered_sources.resize_with(num_indices, || None);
641 let mut file_scan_tasks = Vec::new();
642
643 for (position, index) in range_meta.row_group_indices.iter().enumerate() {
644 if stream_ctx.is_mem_range_index(*index) {
645 let stream = scan_flat_mem_ranges(
646 stream_ctx.clone(),
647 part_metrics.clone(),
648 *index,
649 range_meta.time_range,
650 );
651 ordered_sources[position] = Some(Box::pin(stream) as _);
652 } else if stream_ctx.is_file_range_index(*index) {
653 if let Some(semaphore_ref) = semaphore.as_ref() {
654 let stream_ctx = stream_ctx.clone();
656 let part_metrics = part_metrics.clone();
657 let partition_pruner = partition_pruner.clone();
658 let semaphore = Arc::clone(semaphore_ref);
659 let row_group_index = *index;
660 file_scan_tasks.push(async move {
661 let _permit = semaphore.acquire().await.unwrap();
662 let stream = scan_flat_file_ranges(
663 stream_ctx,
664 part_metrics,
665 row_group_index,
666 read_type,
667 partition_pruner,
668 )
669 .await?;
670 Ok((position, Box::pin(stream) as _))
671 });
672 } else {
673 let stream = scan_flat_file_ranges(
675 stream_ctx.clone(),
676 part_metrics.clone(),
677 *index,
678 read_type,
679 partition_pruner.clone(),
680 )
681 .await?;
682 ordered_sources[position] = Some(Box::pin(stream) as _);
683 }
684 } else {
685 let stream =
686 scan_util::maybe_scan_flat_other_ranges(stream_ctx, *index, part_metrics).await?;
687 ordered_sources[position] = Some(stream);
688 }
689 }
690
691 if !file_scan_tasks.is_empty() {
692 let results = futures::future::try_join_all(file_scan_tasks).await?;
693 for (position, stream) in results {
694 ordered_sources[position] = Some(stream);
695 }
696 }
697
698 for stream in ordered_sources.into_iter().flatten() {
699 if should_split {
700 sources.push(Box::pin(SplitRecordBatchStream::new(stream)));
701 } else {
702 sources.push(stream);
703 }
704 }
705
706 if should_split {
707 common_telemetry::debug!(
708 "Splitting record batches, region: {}, sources: {}, part_range: {:?}",
709 stream_ctx.input.region_metadata().region_id,
710 sources.len(),
711 part_range,
712 );
713 }
714
715 Ok(split_batch_size)
716}
717
#[cfg(test)]
impl SeqScan {
    /// Returns the scan input of the scanner (test-only accessor).
    pub(crate) fn input(&self) -> &ScanInput {
        &self.stream_ctx.input
    }
}
725
/// Returns the scanner type label used for metrics: `"SeqScan(compaction)"`
/// when scanning for compaction, `"SeqScan"` otherwise.
fn get_scanner_type(compaction: bool) -> &'static str {
    if compaction {
        "SeqScan(compaction)"
    } else {
        "SeqScan"
    }
}