feat: Implements row group level parallel unordered scanner (#3992)

* feat: unordered scanner

* feat: support compat

* chore: update debug print

* fix: missing ranges in scan parts

* fix: ensure chunk size > 0

* fix: parallel is disabled if there is only one file and memtable

* chore: reader metrics

* chore: remove todo

* refactor: add ScanPartBuilder trait

* chore: pass file meta to the part builder

* chore: make part builder private

* docs: update comment

* chore: remove meta()

* refactor: only prune file ranges in ScanInput

Replaces ScanPartBuilder with FileRangeCollector, which only collects file ranges.

* chore: address typo

* fix: panic when no partition

* feat: Postpone part distribution

* chore: handle empty partition in mito

* style: fix clippy
Author: Yingwen
Date: 2024-05-29 19:06:08 +08:00
Committed by: GitHub
Parent: f0effd2680
Commit: 848bd7e553
14 changed files with 492 additions and 173 deletions
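
At a high level, the new scan path fits together as sketched below. This is a hedged summary reconstructed from the diff, not code in the commit; it only references types the commit introduces or touches.

```rust
// Hedged overview of the new row-group-level parallel scan (reconstructed from this diff):
//
// 1. ScanInput::prune_file_ranges() builds reader input per SST file, prunes row groups,
//    and feeds the surviving FileRanges to a FileRangeCollector.
// 2. UnorderedDistributor (the collector used by UnorderedScan) splits the collected
//    FileRanges and the memtables into ScanParts, lazily on the first scan_partition() call.
// 3. UnorderedScan::scan_partition(i) streams one ScanPart: memtables first, then each
//    FileRange through a RowGroupReader, adapting older schemas with CompatBatch.
```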

View File

@@ -656,7 +656,7 @@ mod tests {
let file_metas: Vec<_> = data.version.ssts.levels()[0]
.files
.values()
.map(|file| file.meta())
.map(|file| file.meta_ref().clone())
.collect();
// 5 files for next compaction and removes old files.

View File

@@ -110,7 +110,7 @@ impl CompactionTaskImpl {
Vec::with_capacity(self.outputs.iter().map(|o| o.inputs.len()).sum());
for output in self.outputs.drain(..) {
compacted_inputs.extend(output.inputs.iter().map(FileHandle::meta));
compacted_inputs.extend(output.inputs.iter().map(|f| f.meta_ref().clone()));
info!(
"Compaction region {} output [{}]-> {}",
@@ -229,7 +229,7 @@ impl CompactionTaskImpl {
return Err(e);
}
};
deleted.extend(self.expired_ssts.iter().map(FileHandle::meta));
deleted.extend(self.expired_ssts.iter().map(|f| f.meta_ref().clone()));
let merge_time = merge_timer.stop_and_record();
info!(
"Compacted SST files, region_id: {}, input: {:?}, output: {:?}, window: {:?}, waiter_num: {}, merge_time: {}s",

View File

@@ -126,12 +126,11 @@ impl MitoEngine {
&self,
region_id: RegionId,
request: ScanRequest,
) -> std::result::Result<SendableRecordBatchStream, BoxedError> {
) -> Result<SendableRecordBatchStream, BoxedError> {
self.scanner(region_id, request)
.map_err(BoxedError::new)?
.scan()
.await
.map_err(BoxedError::new)
}
/// Returns a scanner to scan for `request`.

View File

@@ -50,6 +50,7 @@ use crate::error::{
ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, InvalidBatchSnafu, Result,
};
use crate::memtable::BoxedBatchIterator;
use crate::sst::parquet::reader::RowGroupReader;
/// Storage internal representation of a batch of rows for a primary key (time series).
///
@@ -699,6 +700,8 @@ pub enum Source {
Iter(BoxedBatchIterator),
/// Source from a [BoxedBatchStream].
Stream(BoxedBatchStream),
/// Source from a [RowGroupReader].
RowGroupReader(RowGroupReader),
}
impl Source {
@@ -708,6 +711,7 @@ impl Source {
Source::Reader(reader) => reader.next_batch().await,
Source::Iter(iter) => iter.next().transpose(),
Source::Stream(stream) => stream.try_next().await,
Source::RowGroupReader(reader) => reader.next_batch().await,
}
}
}

View File

@@ -32,10 +32,8 @@ use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
pub struct CompatReader<R> {
/// Underlying reader.
reader: R,
/// Optional primary key adapter.
compat_pk: Option<CompatPrimaryKey>,
/// Optional fields adapter.
compat_fields: Option<CompatFields>,
/// Helper to compat batches.
compat: CompatBatch,
}
impl<R> CompatReader<R> {
@@ -48,13 +46,9 @@ impl<R> CompatReader<R> {
reader_meta: RegionMetadataRef,
reader: R,
) -> Result<CompatReader<R>> {
let compat_pk = may_compat_primary_key(mapper.metadata(), &reader_meta)?;
let compat_fields = may_compat_fields(mapper, &reader_meta)?;
Ok(CompatReader {
reader,
compat_pk,
compat_fields,
compat: CompatBatch::new(mapper, reader_meta)?,
})
}
}
@@ -66,6 +60,36 @@ impl<R: BatchReader> BatchReader for CompatReader<R> {
return Ok(None);
};
batch = self.compat.compat_batch(batch)?;
Ok(Some(batch))
}
}
/// A helper struct to adapt schema of the batch to an expected schema.
pub(crate) struct CompatBatch {
/// Optional primary key adapter.
compat_pk: Option<CompatPrimaryKey>,
/// Optional fields adapter.
compat_fields: Option<CompatFields>,
}
impl CompatBatch {
/// Creates a new [CompatBatch].
/// - `mapper` is built from the metadata users expect to see.
/// - `reader_meta` is the metadata of the input reader.
pub(crate) fn new(mapper: &ProjectionMapper, reader_meta: RegionMetadataRef) -> Result<Self> {
let compat_pk = may_compat_primary_key(mapper.metadata(), &reader_meta)?;
let compat_fields = may_compat_fields(mapper, &reader_meta)?;
Ok(Self {
compat_pk,
compat_fields,
})
}
/// Adapts the `batch` to the expected schema.
pub(crate) fn compat_batch(&self, mut batch: Batch) -> Result<Batch> {
if let Some(compat_pk) = &self.compat_pk {
batch = compat_pk.compat(batch)?;
}
@@ -73,7 +97,7 @@ impl<R: BatchReader> BatchReader for CompatReader<R> {
batch = compat_fields.compat(batch);
}
Ok(Some(batch))
Ok(batch)
}
}
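
A minimal usage sketch of the extracted `CompatBatch` helper (not part of the diff; `mapper`, `reader_meta`, and `reader` are assumed to be a `ProjectionMapper`, the reader's `RegionMetadataRef`, and a `BatchReader`):

```rust
// Build the helper once from the expected schema (the mapper's) and the reader's schema,
// then adapt every batch the reader yields. CompatReader does the same internally.
let compat = CompatBatch::new(&mapper, reader_meta.clone())?;
while let Some(batch) = reader.next_batch().await? {
    let batch = compat.compat_batch(batch)?;
    // ... hand the adapted batch to the projection mapper / caller
}
```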

View File

@@ -14,9 +14,11 @@
//! Scans a region according to the scan request.
use std::fmt;
use std::sync::Arc;
use std::time::Instant;
use common_error::ext::BoxedError;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, error, warn};
use common_time::range::TimestampRange;
@@ -32,15 +34,16 @@ use crate::cache::CacheManagerRef;
use crate::error::Result;
use crate::memtable::MemtableRef;
use crate::metrics::READ_SST_COUNT;
use crate::read::compat::CompatReader;
use crate::read::compat::{CompatBatch, CompatReader};
use crate::read::projection::ProjectionMapper;
use crate::read::seq_scan::SeqScan;
use crate::read::unordered_scan::UnorderedScan;
use crate::read::{compat, Batch, Source};
use crate::region::version::VersionRef;
use crate::sst::file::FileHandle;
use crate::sst::file::{FileHandle, FileMeta};
use crate::sst::index::applier::builder::SstIndexApplierBuilder;
use crate::sst::index::applier::SstIndexApplierRef;
use crate::sst::parquet::file_range::FileRange;
/// A scanner scans a region and returns a [SendableRecordBatchStream].
pub(crate) enum Scanner {
@@ -51,20 +54,24 @@ pub(crate) enum Scanner {
}
impl Scanner {
/// Returns a [SendableRecordBatchStream] to retrieve scan results.
pub(crate) async fn scan(&self) -> Result<SendableRecordBatchStream> {
/// Returns a [SendableRecordBatchStream] to retrieve scan results from all partitions.
pub(crate) async fn scan(&self) -> Result<SendableRecordBatchStream, BoxedError> {
match self {
Scanner::Seq(seq_scan) => seq_scan.build_stream().await,
Scanner::Seq(seq_scan) => seq_scan.build_stream().await.map_err(BoxedError::new),
Scanner::Unordered(unordered_scan) => unordered_scan.build_stream().await,
}
}
/// Returns a [RegionScanner] to scan the region.
pub(crate) async fn region_scanner(&self) -> Result<RegionScannerRef> {
let stream = self.scan().await?;
let scanner = SinglePartitionScanner::new(stream);
Ok(Arc::new(scanner))
pub(crate) async fn region_scanner(self) -> Result<RegionScannerRef> {
match self {
Scanner::Seq(seq_scan) => {
let stream = seq_scan.build_stream().await?;
let scanner = Arc::new(SinglePartitionScanner::new(stream));
Ok(scanner)
}
Scanner::Unordered(unordered_scan) => Ok(Arc::new(unordered_scan)),
}
}
}
@@ -222,9 +229,7 @@ impl ScanRegion {
/// Unordered scan.
pub(crate) fn unordered_scan(self) -> Result<UnorderedScan> {
let input = self.scan_input(true)?;
let scan = UnorderedScan::new(input);
Ok(scan)
Ok(UnorderedScan::new(input))
}
#[cfg(test)]
@@ -386,7 +391,7 @@ pub(crate) struct ScanInput {
/// Time range filter for time index.
time_range: Option<TimestampRange>,
/// Predicate to push down.
predicate: Option<Predicate>,
pub(crate) predicate: Option<Predicate>,
/// Memtables to scan.
pub(crate) memtables: Vec<MemtableRef>,
/// Handles to SST files to scan.
@@ -498,7 +503,6 @@ impl ScanInput {
}
/// Sets whether to remove deletion markers during scan.
#[allow(unused)]
#[must_use]
pub(crate) fn with_filter_deleted(mut self, filter_deleted: bool) -> Self {
self.filter_deleted = filter_deleted;
@@ -572,6 +576,61 @@ impl ScanInput {
Ok(sources)
}
/// Prunes file ranges to scan and adds them to the `collector`.
pub(crate) async fn prune_file_ranges(
&self,
collector: &mut impl FileRangeCollector,
) -> Result<()> {
for file in &self.files {
let res = self
.access_layer
.read_sst(file.clone())
.predicate(self.predicate.clone())
.time_range(self.time_range)
.projection(Some(self.mapper.column_ids().to_vec()))
.cache(self.cache_manager.clone())
.index_applier(self.index_applier.clone())
.expected_metadata(Some(self.mapper.metadata().clone()))
.build_reader_input()
.await;
let (mut file_range_ctx, row_groups) = match res {
Ok(x) => x,
Err(e) => {
if e.is_object_not_found() && self.ignore_file_not_found {
error!(e; "File to scan does not exist, region_id: {}, file: {}", file.region_id(), file.file_id());
continue;
} else {
return Err(e);
}
}
};
if !compat::has_same_columns(
self.mapper.metadata(),
file_range_ctx.read_format().metadata(),
) {
// They have different schema. We need to adapt the batch first so the
// mapper can convert it.
let compat = CompatBatch::new(
&self.mapper,
file_range_ctx.read_format().metadata().clone(),
)?;
file_range_ctx.set_compat_batch(Some(compat));
}
// Build ranges from row groups.
let file_range_ctx = Arc::new(file_range_ctx);
let file_ranges = row_groups
.into_iter()
.map(|(row_group_idx, row_selection)| {
FileRange::new(file_range_ctx.clone(), row_group_idx, row_selection)
});
collector.append_file_ranges(file.meta_ref(), file_ranges);
}
READ_SST_COUNT.observe(self.files.len() as f64);
Ok(())
}
/// Scans the input source in another task and sends batches to the sender.
pub(crate) fn spawn_scan_task(
&self,
@@ -620,3 +679,35 @@ impl ScanInput {
self.files.iter().map(|file| file.file_id()).collect()
}
}
/// A partition of a scanner to read.
/// It contains memtables and file ranges to scan.
#[derive(Default)]
pub(crate) struct ScanPart {
/// Memtables to scan.
/// We scan the whole memtable now. We might scan a range of the memtable in the future.
pub(crate) memtables: Vec<MemtableRef>,
/// File ranges to scan.
pub(crate) file_ranges: Vec<FileRange>,
}
impl fmt::Debug for ScanPart {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"ScanPart({} memtables, {} file ranges)",
self.memtables.len(),
self.file_ranges.len()
)
}
}
/// A trait to collect file ranges to scan.
pub(crate) trait FileRangeCollector {
/// Appends file ranges from the **same file** to the collector.
fn append_file_ranges(
&mut self,
file_meta: &FileMeta,
file_ranges: impl Iterator<Item = FileRange>,
);
}
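
A minimal sketch of a `FileRangeCollector` implementation (the name `FlatCollector` is hypothetical, not in the diff): it ignores the file metadata and flattens every pruned range into a single Vec, which is essentially what the `UnorderedDistributor` in `unordered_scan.rs` does before distributing ranges into `ScanPart`s.

```rust
#[derive(Default)]
struct FlatCollector {
    file_ranges: Vec<FileRange>,
}

impl FileRangeCollector for FlatCollector {
    fn append_file_ranges(
        &mut self,
        _file_meta: &FileMeta,
        file_ranges: impl Iterator<Item = FileRange>,
    ) {
        // Keep every pruned range; a smarter collector could group ranges per file here.
        self.file_ranges.extend(file_ranges);
    }
}

// Usage: ScanInput prunes row groups and pushes the surviving ranges into the collector.
// let mut collector = FlatCollector::default();
// input.prune_file_ranges(&mut collector).await?;
```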

View File

@@ -14,148 +14,101 @@
//! Unordered scanner.
use std::fmt;
use std::sync::Arc;
use std::time::{Duration, Instant};
use async_stream::try_stream;
use async_stream::{stream, try_stream};
use common_error::ext::BoxedError;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream};
use common_telemetry::debug;
use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
use datatypes::schema::SchemaRef;
use futures::StreamExt;
use snafu::ResultExt;
use tokio::sync::{mpsc, Semaphore};
use tokio_stream::wrappers::ReceiverStream;
use store_api::region_engine::{RegionScanner, ScannerPartitioning, ScannerProperties};
use tokio::sync::Mutex;
use crate::cache::CacheManager;
use crate::error::Result;
use crate::memtable::MemtableRef;
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_STAGE_ELAPSED};
use crate::read::compat::CompatBatch;
use crate::read::projection::ProjectionMapper;
use crate::read::scan_region::ScanInput;
use crate::read::scan_region::{FileRangeCollector, ScanInput, ScanPart};
use crate::read::Source;
use crate::sst::file::FileMeta;
use crate::sst::parquet::file_range::FileRange;
use crate::sst::parquet::reader::ReaderMetrics;
/// Scans a region without providing any output ordering guarantee.
///
/// Only an append only table should use this scanner.
pub struct UnorderedScan {
input: ScanInput,
/// Properties of the scanner.
properties: ScannerProperties,
/// Context of streams.
stream_ctx: Arc<StreamContext>,
}
impl UnorderedScan {
/// Creates a new [UnorderedScan].
pub(crate) fn new(input: ScanInput) -> Self {
Self { input }
let query_start = input.query_start.unwrap_or_else(Instant::now);
let prepare_scan_cost = query_start.elapsed();
let properties =
ScannerProperties::new(ScannerPartitioning::Unknown(input.parallelism.parallelism));
// Observes metrics.
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(prepare_scan_cost.as_secs_f64());
let stream_ctx = Arc::new(StreamContext {
input,
parts: Mutex::new(ScanPartList::default()),
query_start,
prepare_scan_cost,
});
Self {
properties,
stream_ctx,
}
}
/// Scans the region and returns a stream.
pub async fn build_stream(&self) -> Result<SendableRecordBatchStream> {
let enable_parallel = self.enable_parallel_scan();
if enable_parallel {
self.scan_in_parallel().await
} else {
self.scan_sources().await
}
}
/// Scans all sources one by one.
async fn scan_sources(&self) -> Result<SendableRecordBatchStream> {
let mut metrics = Metrics::default();
let build_start = Instant::now();
let query_start = self.input.query_start.unwrap_or(build_start);
metrics.prepare_scan_cost = query_start.elapsed();
// Scans all memtables and SSTs.
let sources = self.input.build_sources().await?;
metrics.build_source_cost = build_start.elapsed();
Self::observe_metrics_on_start(&metrics);
let mapper = self.input.mapper.clone();
let cache_manager = self.input.cache_manager.clone();
let stream = try_stream! {
for mut source in sources {
let cache = cache_manager.as_deref();
while let Some(batch) = Self::fetch_from_source(&mut source, &mapper, cache, &mut metrics).await? {
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
pub(crate) async fn build_stream(&self) -> Result<SendableRecordBatchStream, BoxedError> {
let part_num = self.properties.partitioning().num_partitions();
let streams = (0..part_num)
.map(|i| self.scan_partition(i))
.collect::<Result<Vec<_>, BoxedError>>()?;
let stream = stream! {
for mut stream in streams {
while let Some(rb) = stream.next().await {
yield rb;
}
}
metrics.total_cost = query_start.elapsed();
Self::observe_metrics_on_finish(&metrics);
debug!("Unordered scan finished, region_id: {}, metrics: {:?}", mapper.metadata().region_id, metrics);
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.input.mapper.output_schema(),
self.schema(),
Box::pin(stream),
));
Ok(stream)
}
/// Scans all sources in parallel.
async fn scan_in_parallel(&self) -> Result<SendableRecordBatchStream> {
debug_assert!(self.input.parallelism.allow_parallel_scan());
let mut metrics = Metrics::default();
let build_start = Instant::now();
let query_start = self.input.query_start.unwrap_or(build_start);
metrics.prepare_scan_cost = query_start.elapsed();
// Scans all memtables and SSTs.
let sources = self.input.build_sources().await?;
metrics.build_source_cost = build_start.elapsed();
Self::observe_metrics_on_start(&metrics);
let (sender, receiver) = mpsc::channel(self.input.parallelism.channel_size);
let semaphore = Arc::new(Semaphore::new(self.input.parallelism.parallelism));
// Spawn a task for each source.
for source in sources {
self.input
.spawn_scan_task(source, semaphore.clone(), sender.clone());
}
let stream = Box::pin(ReceiverStream::new(receiver));
let mapper = self.input.mapper.clone();
let cache_manager = self.input.cache_manager.clone();
// For simplicity, we wrap the receiver into a stream to reuse code. We can use the channel directly if it
// becomes a bottleneck.
let mut source = Source::Stream(stream);
let stream = try_stream! {
let cache = cache_manager.as_deref();
while let Some(batch) = Self::fetch_from_source(&mut source, &mapper, cache, &mut metrics).await? {
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
}
metrics.total_cost = query_start.elapsed();
Self::observe_metrics_on_finish(&metrics);
debug!("Unordered scan in parallel finished, region_id: {}, metrics: {:?}", mapper.metadata().region_id, metrics);
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.input.mapper.output_schema(),
Box::pin(stream),
));
Ok(stream)
}
/// Returns whether to scan in parallel.
fn enable_parallel_scan(&self) -> bool {
self.input.parallelism.allow_parallel_scan()
&& (self.input.files.len() + self.input.memtables.len()) > 1
}
/// Fetch a batch from the source and convert it into a record batch.
async fn fetch_from_source(
source: &mut Source,
mapper: &ProjectionMapper,
cache: Option<&CacheManager>,
compat_batch: Option<&CompatBatch>,
metrics: &mut Metrics,
) -> common_recordbatch::error::Result<Option<RecordBatch>> {
let start = Instant::now();
let Some(batch) = source
let Some(mut batch) = source
.next_batch()
.await
.map_err(BoxedError::new)
@@ -166,6 +119,13 @@ impl UnorderedScan {
return Ok(None);
};
if let Some(compat) = compat_batch {
batch = compat
.compat_batch(batch)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
}
let convert_start = Instant::now();
let record_batch = mapper.convert(&batch, cache)?;
metrics.convert_cost += convert_start.elapsed();
@@ -174,15 +134,6 @@ impl UnorderedScan {
Ok(Some(record_batch))
}
fn observe_metrics_on_start(metrics: &Metrics) {
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(metrics.prepare_scan_cost.as_secs_f64());
READ_STAGE_ELAPSED
.with_label_values(&["build_source"])
.observe(metrics.build_source_cost.as_secs_f64());
}
fn observe_metrics_on_finish(metrics: &Metrics) {
READ_STAGE_ELAPSED
.with_label_values(&["convert_rb"])
@@ -198,21 +149,168 @@ impl UnorderedScan {
}
}
impl RegionScanner for UnorderedScan {
fn properties(&self) -> &ScannerProperties {
&self.properties
}
fn schema(&self) -> SchemaRef {
self.stream_ctx.input.mapper.output_schema()
}
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError> {
let mut metrics = Metrics {
prepare_scan_cost: self.stream_ctx.prepare_scan_cost,
..Default::default()
};
let stream_ctx = self.stream_ctx.clone();
let stream = try_stream! {
let mut parts = stream_ctx.parts.lock().await;
parts
.maybe_init_parts(&stream_ctx.input, &mut metrics)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let Some(part) = parts.get_part(partition) else {
return;
};
let mapper = &stream_ctx.input.mapper;
let memtable_sources = part
.memtables
.iter()
.map(|mem| {
let iter = mem.iter(
Some(mapper.column_ids()),
stream_ctx.input.predicate.clone(),
)?;
Ok(Source::Iter(iter))
})
.collect::<Result<Vec<_>>>()
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let query_start = stream_ctx.query_start;
let cache = stream_ctx.input.cache_manager.as_deref();
// Scans memtables first.
for mut source in memtable_sources {
while let Some(batch) = Self::fetch_from_source(&mut source, mapper, cache, None, &mut metrics).await? {
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
}
}
// Then scans file ranges.
let mut reader_metrics = ReaderMetrics::default();
for file_range in &part.file_ranges {
let reader = file_range.reader().await.map_err(BoxedError::new).context(ExternalSnafu)?;
let compat_batch = file_range.compat_batch();
let mut source = Source::RowGroupReader(reader);
while let Some(batch) = Self::fetch_from_source(&mut source, mapper, cache, compat_batch, &mut metrics).await? {
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
}
if let Source::RowGroupReader(reader) = source {
reader_metrics.merge_from(reader.metrics());
}
}
metrics.total_cost = query_start.elapsed();
Self::observe_metrics_on_finish(&metrics);
debug!(
"Unordered scan partition {} finished, region_id: {}, metrics: {:?}, reader_metrics: {:?}",
partition, mapper.metadata().region_id, metrics, reader_metrics
);
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.stream_ctx.input.mapper.output_schema(),
Box::pin(stream),
));
Ok(stream)
}
}
impl DisplayAs for UnorderedScan {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "UnorderedScan: [{:?}]", self.stream_ctx.parts)
}
}
impl fmt::Debug for UnorderedScan {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("UnorderedScan")
.field("parts", &self.stream_ctx.parts)
.field("prepare_scan_cost", &self.stream_ctx.prepare_scan_cost)
.finish()
}
}
#[cfg(test)]
impl UnorderedScan {
/// Returns the input.
pub(crate) fn input(&self) -> &ScanInput {
&self.input
&self.stream_ctx.input
}
}
/// List of [ScanPart]s.
#[derive(Debug, Default)]
struct ScanPartList(Option<Vec<ScanPart>>);
impl ScanPartList {
/// Initializes parts if they are not built yet.
async fn maybe_init_parts(&mut self, input: &ScanInput, metrics: &mut Metrics) -> Result<()> {
if self.0.is_none() {
let now = Instant::now();
let mut distributor = UnorderedDistributor::default();
input.prune_file_ranges(&mut distributor).await?;
self.0 = Some(distributor.build_parts(&input.memtables, input.parallelism.parallelism));
metrics.build_parts_cost = now.elapsed();
READ_STAGE_ELAPSED
.with_label_values(&["build_parts"])
.observe(metrics.build_parts_cost.as_secs_f64());
}
Ok(())
}
/// Gets the part by index, returns None if the index is out of bound.
/// # Panics
/// Panics if parts are not initialized.
fn get_part(&mut self, index: usize) -> Option<&ScanPart> {
let parts = self.0.as_ref().unwrap();
parts.get(index)
}
}
/// Context shared by different streams.
/// It contains the input and distributes input to multiple parts
/// to scan.
struct StreamContext {
/// Input memtables and files.
input: ScanInput,
/// Parts to scan.
/// The scanner builds parts to scan from the input lazily.
/// The mutex is used to ensure the parts are only built once.
parts: Mutex<ScanPartList>,
// Metrics:
/// The start time of the query.
query_start: Instant,
/// Time elapsed before creating the scanner.
prepare_scan_cost: Duration,
}
/// Metrics for [UnorderedScan].
// We print all fields in logs so we disable the dead_code lint.
#[allow(dead_code)]
#[derive(Debug, Default)]
struct Metrics {
/// Duration to prepare the scan task.
prepare_scan_cost: Duration,
/// Duration to build sources.
build_source_cost: Duration,
/// Duration to build parts.
build_parts_cost: Duration,
/// Duration to scan data.
scan_cost: Duration,
/// Duration to convert batches.
@@ -224,3 +322,66 @@ struct Metrics {
/// Number of rows returned.
num_rows: usize,
}
/// Builds [ScanPart]s without preserving order. It distributes file ranges and memtables
/// across partitions. Each partition scans a subset of memtables and file ranges. There
/// is no output ordering guarantee of each partition.
#[derive(Default)]
struct UnorderedDistributor {
file_ranges: Vec<FileRange>,
}
impl FileRangeCollector for UnorderedDistributor {
fn append_file_ranges(
&mut self,
_file_meta: &FileMeta,
file_ranges: impl Iterator<Item = FileRange>,
) {
self.file_ranges.extend(file_ranges);
}
}
impl UnorderedDistributor {
/// Distributes file ranges and memtables across partitions according to the `parallelism`.
/// The output number of parts may be `<= parallelism`.
fn build_parts(self, memtables: &[MemtableRef], parallelism: usize) -> Vec<ScanPart> {
if parallelism <= 1 {
// Returns a single part.
let part = ScanPart {
memtables: memtables.to_vec(),
file_ranges: self.file_ranges,
};
return vec![part];
}
let mems_per_part = ((memtables.len() + parallelism - 1) / parallelism).max(1);
let ranges_per_part = ((self.file_ranges.len() + parallelism - 1) / parallelism).max(1);
common_telemetry::debug!(
"Parallel scan is enabled, parallelism: {}, {} memtables, {} file_ranges, mems_per_part: {}, ranges_per_part: {}",
parallelism,
memtables.len(),
self.file_ranges.len(),
mems_per_part,
ranges_per_part
);
let mut scan_parts = memtables
.chunks(mems_per_part)
.map(|mems| ScanPart {
memtables: mems.to_vec(),
file_ranges: Vec::new(),
})
.collect::<Vec<_>>();
for (i, ranges) in self.file_ranges.chunks(ranges_per_part).enumerate() {
if i == scan_parts.len() {
scan_parts.push(ScanPart {
memtables: Vec::new(),
file_ranges: ranges.to_vec(),
});
} else {
scan_parts[i].file_ranges = ranges.to_vec();
}
}
scan_parts
}
}
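
To make the chunking arithmetic in `build_parts` concrete, here is a hedged worked example (the numbers are illustrative, not from the commit):

```rust
// parallelism = 4, 2 memtables, 10 file ranges:
//   mems_per_part   = max(ceil(2 / 4), 1) = 1
//   ranges_per_part = max(ceil(10 / 4), 1) = 3
// build_parts yields 4 ScanParts (<= parallelism, as documented above):
//   part 0: 1 memtable,  file ranges 0..3
//   part 1: 1 memtable,  file ranges 3..6
//   part 2: 0 memtables, file ranges 6..9
//   part 3: 0 memtables, file range  9
```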

View File

@@ -175,8 +175,9 @@ impl FileHandle {
self.inner.compacting.store(compacting, Ordering::Relaxed);
}
pub fn meta(&self) -> FileMeta {
self.inner.meta.clone()
/// Returns a reference to the [FileMeta].
pub fn meta_ref(&self) -> &FileMeta {
&self.inner.meta
}
}
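
A small hedged sketch of the new accessor (`file` is assumed to be a `FileHandle`): callers borrow the `FileMeta` and only clone when they actually need ownership, as the compaction call sites above now do.

```rust
let file_size = file.meta_ref().file_size;     // borrow, no clone
let owned: FileMeta = file.meta_ref().clone(); // clone only when ownership is required
```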

View File

@@ -25,6 +25,7 @@ use parquet::arrow::arrow_reader::RowSelection;
use snafu::ResultExt;
use crate::error::{FieldTypeMismatchSnafu, FilterRecordBatchSnafu, Result};
use crate::read::compat::CompatBatch;
use crate::read::Batch;
use crate::row_converter::{McmpRowCodec, RowCodec};
use crate::sst::parquet::format::ReadFormat;
@@ -32,6 +33,7 @@ use crate::sst::parquet::reader::{RowGroupReader, RowGroupReaderBuilder, SimpleF
/// A range of a parquet SST. Now it is a row group.
/// We can read different file ranges in parallel.
#[derive(Clone)]
pub struct FileRange {
/// Shared context.
context: FileRangeContextRef,
@@ -56,7 +58,6 @@ impl FileRange {
}
/// Returns a reader to read the [FileRange].
#[allow(dead_code)]
pub(crate) async fn reader(&self) -> Result<RowGroupReader> {
let parquet_reader = self
.context
@@ -66,6 +67,11 @@ impl FileRange {
Ok(RowGroupReader::new(self.context.clone(), parquet_reader))
}
/// Returns the helper to compat batches.
pub(crate) fn compat_batch(&self) -> Option<&CompatBatch> {
self.context.compat_batch()
}
}
/// Context shared by ranges of the same parquet SST.
@@ -78,6 +84,8 @@ pub(crate) struct FileRangeContext {
read_format: ReadFormat,
/// Decoder for primary keys
codec: McmpRowCodec,
/// Optional helper to compat batches.
compat_batch: Option<CompatBatch>,
}
pub(crate) type FileRangeContextRef = Arc<FileRangeContext>;
@@ -95,6 +103,7 @@ impl FileRangeContext {
filters,
read_format,
codec,
compat_batch: None,
}
}
@@ -118,6 +127,16 @@ impl FileRangeContext {
&self.reader_builder
}
/// Returns the helper to compat batches.
pub(crate) fn compat_batch(&self) -> Option<&CompatBatch> {
self.compat_batch.as_ref()
}
/// Sets the `CompatBatch` to the context.
pub(crate) fn set_compat_batch(&mut self, compat: Option<CompatBatch>) {
self.compat_batch = compat;
}
/// TRY THE BEST to perform pushed down predicate precisely on the input batch.
/// Return the filtered batch. If the entire batch is filtered out, return None.
///

View File

@@ -53,7 +53,7 @@ use crate::read::{Batch, BatchReader};
use crate::row_converter::{McmpRowCodec, SortField};
use crate::sst::file::FileHandle;
use crate::sst::index::applier::SstIndexApplierRef;
use crate::sst::parquet::file_range::{FileRange, FileRangeContext, FileRangeContextRef};
use crate::sst::parquet::file_range::{FileRangeContext, FileRangeContextRef};
use crate::sst::parquet::format::ReadFormat;
use crate::sst::parquet::metadata::MetadataLoader;
use crate::sst::parquet::row_group::InMemoryRowGroup;
@@ -155,29 +155,17 @@ impl ParquetReaderBuilder {
/// This needs to perform IO operation.
pub async fn build(&self) -> Result<ParquetReader> {
let (context, row_groups) = self.build_reader_input().await?;
ParquetReader::new(context, row_groups).await
}
/// Builds [FileRange]s to read and pushes them to `file_ranges`.
#[allow(dead_code)]
pub async fn build_file_ranges(&self, file_ranges: &mut Vec<FileRange>) -> Result<()> {
let (context, row_groups) = self.build_reader_input().await?;
file_ranges.reserve_exact(row_groups.len());
for (row_group_idx, row_selection) in row_groups {
let file_range = FileRange::new(context.clone(), row_group_idx, row_selection);
file_ranges.push(file_range);
}
Ok(())
ParquetReader::new(Arc::new(context), row_groups).await
}
/// Builds a [FileRangeContext] and collects row groups to read.
///
/// This needs to perform IO operation.
async fn build_reader_input(&self) -> Result<(FileRangeContextRef, RowGroupMap)> {
pub(crate) async fn build_reader_input(&self) -> Result<(FileRangeContext, RowGroupMap)> {
let start = Instant::now();
let file_path = self.file_handle.file_path(&self.file_dir);
let file_size = self.file_handle.meta().file_size;
let file_size = self.file_handle.meta_ref().file_size;
// Loads parquet metadata of the file.
let parquet_meta = self.read_parquet_metadata(&file_path, file_size).await?;
// Decodes region metadata.
@@ -211,7 +199,7 @@ impl ParquetReaderBuilder {
parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint)
.context(ReadParquetSnafu { path: &file_path })?;
let mut metrics = Metrics::default();
let mut metrics = ReaderMetrics::default();
let row_groups = self
.row_groups_to_read(&read_format, &parquet_meta, &mut metrics)
@@ -258,7 +246,7 @@ impl ParquetReaderBuilder {
);
let context = FileRangeContext::new(reader_builder, filters, read_format, codec);
Ok((Arc::new(context), row_groups))
Ok((context, row_groups))
}
/// Decodes region metadata from key value.
@@ -324,7 +312,7 @@ impl ParquetReaderBuilder {
&self,
read_format: &ReadFormat,
parquet_meta: &ParquetMetaData,
metrics: &mut Metrics,
metrics: &mut ReaderMetrics,
) -> BTreeMap<usize, Option<RowSelection>> {
let num_row_groups = parquet_meta.num_row_groups();
if num_row_groups == 0 {
@@ -346,13 +334,13 @@ impl ParquetReaderBuilder {
async fn prune_row_groups_by_inverted_index(
&self,
parquet_meta: &ParquetMetaData,
metrics: &mut Metrics,
metrics: &mut ReaderMetrics,
) -> Option<BTreeMap<usize, Option<RowSelection>>> {
let Some(index_applier) = &self.index_applier else {
return None;
};
if !self.file_handle.meta().inverted_index_available() {
if !self.file_handle.meta_ref().inverted_index_available() {
return None;
}
@@ -428,7 +416,7 @@ impl ParquetReaderBuilder {
&self,
read_format: &ReadFormat,
parquet_meta: &ParquetMetaData,
metrics: &mut Metrics,
metrics: &mut ReaderMetrics,
) -> Option<BTreeMap<usize, Option<RowSelection>>> {
let Some(predicate) = &self.predicate else {
return None;
@@ -513,7 +501,7 @@ fn time_range_to_predicate(
/// Parquet reader metrics.
#[derive(Debug, Default)]
struct Metrics {
pub(crate) struct ReaderMetrics {
/// Number of row groups before filtering.
num_row_groups_before_filtering: usize,
/// Number of row groups filtered by inverted index.
@@ -538,6 +526,24 @@ struct Metrics {
num_rows: usize,
}
impl ReaderMetrics {
/// Adds `other` metrics to this metrics.
pub(crate) fn merge_from(&mut self, other: &ReaderMetrics) {
self.num_row_groups_before_filtering += other.num_row_groups_before_filtering;
self.num_row_groups_inverted_index_filtered += other.num_row_groups_inverted_index_filtered;
self.num_row_groups_min_max_filtered += other.num_row_groups_min_max_filtered;
self.num_rows_precise_filtered += other.num_rows_precise_filtered;
self.num_rows_in_row_group_before_filtering += other.num_rows_in_row_group_before_filtering;
self.num_rows_in_row_group_inverted_index_filtered +=
other.num_rows_in_row_group_inverted_index_filtered;
self.build_cost += other.build_cost;
self.scan_cost += other.scan_cost;
self.num_record_batches += other.num_record_batches;
self.num_batches += other.num_batches;
self.num_rows += other.num_rows;
}
}
/// Builder to build a [ParquetRecordBatchReader] for a row group.
pub(crate) struct RowGroupReaderBuilder {
/// SST file to read.
@@ -606,12 +612,12 @@ enum ReaderState {
/// The reader is reading a row group.
Readable(RowGroupReader),
/// The reader is exhausted.
Exhausted(Metrics),
Exhausted(ReaderMetrics),
}
impl ReaderState {
/// Returns the metrics of the reader.
fn metrics(&self) -> &Metrics {
fn metrics(&self) -> &ReaderMetrics {
match self {
ReaderState::Readable(reader) => &reader.metrics,
ReaderState::Exhausted(m) => m,
@@ -807,7 +813,7 @@ impl ParquetReader {
.await?;
ReaderState::Readable(RowGroupReader::new(context.clone(), parquet_reader))
} else {
ReaderState::Exhausted(Metrics::default())
ReaderState::Exhausted(ReaderMetrics::default())
};
Ok(ParquetReader {
@@ -829,7 +835,7 @@ impl ParquetReader {
}
/// Reader to read a row group of a parquet file.
pub(crate) struct RowGroupReader {
pub struct RowGroupReader {
/// Context for file ranges.
context: FileRangeContextRef,
/// Inner parquet reader.
@@ -837,7 +843,7 @@ pub(crate) struct RowGroupReader {
/// Buffered batches to return.
batches: VecDeque<Batch>,
/// Local scan metrics.
metrics: Metrics,
metrics: ReaderMetrics,
}
impl RowGroupReader {
@@ -847,17 +853,22 @@ impl RowGroupReader {
context,
reader,
batches: VecDeque::new(),
metrics: Metrics::default(),
metrics: ReaderMetrics::default(),
}
}
/// Gets the metrics.
pub(crate) fn metrics(&self) -> &ReaderMetrics {
&self.metrics
}
/// Resets the parquet reader.
fn reset_reader(&mut self, reader: ParquetRecordBatchReader) {
self.reader = reader;
}
/// Tries to fetch next [Batch] from the reader.
async fn next_batch(&mut self) -> Result<Option<Batch>> {
pub(crate) async fn next_batch(&mut self) -> Result<Option<Batch>> {
if let Some(batch) = self.batches.pop_front() {
self.metrics.num_rows += batch.num_rows();
return Ok(Some(batch));

View File

@@ -93,7 +93,7 @@ impl SstVersion {
.files
.values()
.map(|file_handle| {
let meta = file_handle.meta();
let meta = file_handle.meta_ref();
meta.file_size + meta.index_file_size
})
.sum::<u64>()

View File

@@ -170,6 +170,8 @@ pub trait RegionScanner: Debug + DisplayAs + Send + Sync {
fn schema(&self) -> SchemaRef;
/// Scans the partition and returns a stream of record batches.
/// # Panics
/// Panics if the `partition` is out of bound.
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError>;
}
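
For reference, a hedged sketch of how a caller can drive these per-partition streams, mirroring `UnorderedScan::build_stream` above (assumptions: `scanner` is a `RegionScannerRef`, `futures::StreamExt` is in scope, the code runs in an async context, and error conversion from `BoxedError` is glossed over):

```rust
let partitions = scanner.properties().partitioning().num_partitions();
for partition in 0..partitions {
    let mut stream = scanner.scan_partition(partition)?;
    while let Some(batch) = stream.next().await {
        let batch = batch?; // each stream item is a record batch result
        // ... consume `batch`
    }
}
```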