refactor: add scan_to_stream() to Table trait to postpone the stream generation (#1639)

* add scan_to_stream to Table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl parquet stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* reorganise adapters

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement scan_to_stream for mito table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add location info

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: table scan

* UT pass

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl project record batch

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix information schema

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* resolve CR comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove one todo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix errors generated by merge commit

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add output_ordering method to record batch stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix rustfmt

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* enhance error types

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>
This commit is contained in:
Ruihang Xia
2023-05-29 20:03:47 +08:00
committed by GitHub
parent 0eaae634fa
commit b27c569ae0
34 changed files with 824 additions and 327 deletions

View File

@@ -459,26 +459,20 @@ fn is_null(null_mask: &BitVec, idx: usize) -> Option<bool> {
#[cfg(test)]
mod tests {
use std::any::Any;
use std::sync::Arc;
use std::{assert_eq, unimplemented, vec};
use std::{assert_eq, vec};
use api::helper::ColumnDataTypeWrapper;
use api::v1::column::{self, SemanticType, Values};
use api::v1::{Column, ColumnDataType};
use common_base::BitVec;
use common_catalog::consts::MITO_ENGINE;
use common_query::physical_plan::PhysicalPlanRef;
use common_query::prelude::Expr;
use common_time::timestamp::Timestamp;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SchemaBuilder, SchemaRef};
use datatypes::schema::{ColumnSchema, SchemaBuilder};
use datatypes::types::{TimestampMillisecondType, TimestampSecondType, TimestampType};
use datatypes::value::Value;
use snafu::ResultExt;
use table::error::Result as TableResult;
use table::metadata::TableInfoRef;
use table::Table;
use super::*;
use crate::error;
@@ -733,49 +727,6 @@ mod tests {
assert_eq!(None, is_null(&null_mask, 99));
}
struct DemoTable;
#[async_trait::async_trait]
impl Table for DemoTable {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
let column_schemas = vec![
ColumnSchema::new("host", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("cpu", ConcreteDataType::float64_datatype(), true),
ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
true,
)
.with_time_index(true),
];
Arc::new(
SchemaBuilder::try_from(column_schemas)
.unwrap()
.build()
.unwrap(),
)
}
fn table_info(&self) -> TableInfoRef {
unimplemented!()
}
async fn scan(
&self,
_projection: Option<&Vec<usize>>,
_filters: &[Expr],
_limit: Option<usize>,
) -> TableResult<PhysicalPlanRef> {
unimplemented!();
}
}
fn mock_insert_batch() -> (Vec<Column>, u32) {
let row_count = 2;

View File

@@ -71,6 +71,7 @@ pub trait PhysicalPlan: Debug + Send + Sync {
) -> Result<SendableRecordBatchStream>;
}
/// Adapt DataFusion's [`ExecutionPlan`](DfPhysicalPlan) to GreptimeDB's [`PhysicalPlan`].
#[derive(Debug)]
pub struct PhysicalPlanAdapter {
schema: SchemaRef,

View File

@@ -111,7 +111,7 @@ impl Stream for DfRecordBatchStreamAdapter {
}
}
/// DataFusion SendableRecordBatchStream -> Greptime RecordBatchStream
/// DataFusion [SendableRecordBatchStream](DfSendableRecordBatchStream) -> Greptime [RecordBatchStream]
pub struct RecordBatchStreamAdapter {
schema: SchemaRef,
stream: DfSendableRecordBatchStream,

View File

@@ -70,6 +70,19 @@ pub enum Error {
location: Location,
},
#[snafu(display(
"Failed to project Arrow RecordBatch with schema {:?} and projection {:?}, source: {}",
schema,
projection,
source
))]
ProjectArrowRecordBatch {
source: datatypes::arrow::error::ArrowError,
location: Location,
schema: datatypes::schema::SchemaRef,
projection: Vec<usize>,
},
#[snafu(display("Column {} not exists in table {}", column_name, table_name))]
ColumnNotExists {
column_name: String,
@@ -101,7 +114,8 @@ impl ErrorExt for Error {
| Error::PollStream { .. }
| Error::Format { .. }
| Error::InitRecordbatchStream { .. }
| Error::ColumnNotExists { .. } => StatusCode::Internal,
| Error::ColumnNotExists { .. }
| Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,
Error::External { source } => source.status_code(),

View File

@@ -22,6 +22,7 @@ use std::sync::Arc;
use datafusion::physical_plan::memory::MemoryStream;
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::arrow::compute::SortOptions;
pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
use datatypes::arrow::util::pretty;
use datatypes::prelude::VectorRef;
@@ -34,10 +35,20 @@ use snafu::{ensure, ResultExt};
pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
fn schema(&self) -> SchemaRef;
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
}
pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;
#[derive(Debug, Clone, Copy)]
pub struct OrderOption {
pub index: usize,
pub options: SortOptions,
}
/// EmptyRecordBatchStream can be used to create a RecordBatchStream
/// that will produce no results
pub struct EmptyRecordBatchStream {
@@ -181,6 +192,26 @@ impl Stream for SimpleRecordBatchStream {
}
}
/// Adapt a [Stream] of [RecordBatch] to a [RecordBatchStream].
pub struct RecordBatchStreamAdaptor {
pub schema: SchemaRef,
pub stream: Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>,
}
impl RecordBatchStream for RecordBatchStreamAdaptor {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
}
impl Stream for RecordBatchStreamAdaptor {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
Pin::new(&mut self.stream).poll_next(ctx)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use datatypes::schema::SchemaRef;
use datatypes::value::Value;
@@ -21,7 +22,10 @@ use serde::ser::{Error, SerializeStruct};
use serde::{Serialize, Serializer};
use snafu::{OptionExt, ResultExt};
use crate::error::{self, CastVectorSnafu, ColumnNotExistsSnafu, Result};
use crate::error::{
self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
Result,
};
use crate::DfRecordBatch;
/// A two-dimensional batch of column-oriented data with a defined schema.
@@ -51,6 +55,26 @@ impl RecordBatch {
})
}
pub fn try_project(&self, indices: &[usize]) -> Result<Self> {
let schema = Arc::new(self.schema.try_project(indices).context(DataTypesSnafu)?);
let mut columns = Vec::with_capacity(indices.len());
for index in indices {
columns.push(self.columns[*index].clone());
}
let df_record_batch = self.df_record_batch.project(indices).with_context(|_| {
ProjectArrowRecordBatchSnafu {
schema: self.schema.clone(),
projection: indices.to_vec(),
}
})?;
Ok(Self {
schema,
columns,
df_record_batch,
})
}
/// Create a new [`RecordBatch`] from `schema` and `df_record_batch`.
///
/// This method doesn't check the schema.