feat: query engine impl on datafusion (#10)

* feat: query engine impl on datafusion

* feat: adds physical_optimizer, physical_planner and executor

* feat: impl adapters between datafusion and greptime query engine core APIs.

* feat: impl PhysicalPlanAdapter and ExecutionPlanAdapter

* feat: rename table datafusion mod to adapter

* fix: clippy warning

* fix: conflicts with develop branch

* feat: add database mod

* fix: CR comment

* fix: by CR comments

* fix: conflicts with develop branch

* fix: by CR comments
This commit is contained in:
dennis zhuang
2022-04-26 15:17:32 +08:00
committed by GitHub
parent e334e55bf7
commit 3a2f794f6c
35 changed files with 2597 additions and 8 deletions

View File

@@ -3,7 +3,19 @@ name = "table"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies.arrow]
package = "arrow2"
version="0.10"
features = ["io_csv", "io_json", "io_parquet", "io_parquet_compression", "io_ipc", "ahash", "compute"]
[dependencies]
async-trait = "0.1"
chrono = { version = "0.4", features = ["serde"] }
common-query = {path = "../common/query" }
common-recordbatch = {path = "../common/recordbatch" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git" , branch = "arrow2", features = ["simd"]}
datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git" , branch = "arrow2"}
datatypes = { path = "../datatypes" }
futures = "0.3"
serde = "1.0.136"
snafu = "0.7.0"

18
src/table/src/error.rs Normal file
View File

@@ -0,0 +1,18 @@
use datafusion::error::DataFusionError;
use snafu::Snafu;
/// Errors produced by the `table` crate.
///
/// The snafu context selectors generated for the variants are made public
/// through `#[snafu(visibility(pub))]`.
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum Error {
/// An error returned by datafusion, kept as `source`.
#[snafu(display("Datafusion error: {}", source))]
Datafusion { source: DataFusionError },
/// An execution plan that may only run once was executed again.
#[snafu(display("Not expected to run ExecutionPlan more than once."))]
ExecuteRepeatedly,
}
/// Result alias whose error type is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
impl From<Error> for DataFusionError {
fn from(e: Error) -> DataFusionError {
DataFusionError::External(Box::new(e))
}
}

View File

@@ -1,5 +1,5 @@
mod engine;
pub mod error;
pub mod table;
/// Table abstraction.
///
/// No methods are declared here; implementors only have to be `Send + Sync`.
#[async_trait::async_trait]
pub trait Table: Send + Sync {}
pub use crate::table::{Table, TableRef};

104
src/table/src/table.rs Normal file
View File

@@ -0,0 +1,104 @@
use std::any::Any;
use std::collections::HashMap;
use std::sync::Arc;
use chrono::DateTime;
use chrono::Utc;
use common_query::logical_plan::Expr;
use common_recordbatch::SendableRecordBatchStream;
use datatypes::schema::{Schema, SchemaRef};
use crate::error::Result;
pub mod adapter;
pub mod memory;
/// Numeric table identifier (see `TableIdent::table_id`).
pub type TableId = u64;
/// Numeric table version (see `TableIdent::version`).
pub type TableVersion = u64;
/// Indicates whether and how a filter expression can be handled by a
/// Table for table scans.
///
/// The enum carries no data, so equality is total; `Eq` is derived alongside
/// `PartialEq` to match the other identity-like types in this module.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TableProviderFilterPushDown {
    /// The expression cannot be used by the provider.
    Unsupported,
    /// The expression can be used to help minimise the data retrieved,
    /// but the provider cannot guarantee that all returned tuples
    /// satisfy the filter. The Filter plan node containing this expression
    /// will be preserved.
    Inexact,
    /// The provider guarantees that all returned data satisfies this
    /// filter expression. The Filter plan node containing this expression
    /// will be removed.
    Exact,
}
/// Indicates the type of this table for metadata/catalog purposes.
///
/// The enum carries no data, so equality is total; `Eq` is derived alongside
/// `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TableType {
    /// An ordinary physical table.
    Base,
    /// A non-materialised table that itself uses a query internally to provide data.
    View,
    /// A transient table.
    Temporary,
}
/// Identity of a table: its id paired with a version number.
///
/// Serializable through serde; `Default` yields zero for both fields.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq, Default)]
pub struct TableIdent {
/// Id of the table.
pub table_id: TableId,
/// Version of the table.
pub version: TableVersion,
}
/// Descriptive information about a table: identity, name, optional
/// description and metadata.
#[derive(Debug)]
pub struct TableInfo {
/// Id and version of the table.
pub ident: TableIdent,
/// Name of the table.
pub name: String,
/// Optional free-form description.
pub desc: Option<String>,
/// Schema, engine and option metadata.
pub meta: TableMeta,
}
/// Metadata attached to a table.
#[derive(Clone, Debug)]
pub struct TableMeta {
/// Shared schema of the table.
pub schema: Arc<Schema>,
/// Engine name — presumably the engine that owns the table; TODO confirm.
pub engine: String,
/// String key/value options — presumably engine-specific; TODO confirm.
pub engine_options: HashMap<String, String>,
/// String key/value options — presumably table-level; TODO confirm.
pub options: HashMap<String, String>,
/// UTC timestamp — presumably the table creation time; TODO confirm.
pub created_on: DateTime<Utc>,
}
/// Table abstraction.
///
/// Implementors must be `Send + Sync`. Only `as_any`, `schema` and `scan`
/// are required; `table_type` and `supports_filter_pushdown` have defaults.
#[async_trait::async_trait]
pub trait Table: Send + Sync {
/// Returns the table as [`Any`](std::any::Any) so that it can be
/// downcast to a specific implementation.
fn as_any(&self) -> &dyn Any;
/// Get a reference to the schema for this table
fn schema(&self) -> SchemaRef;
/// Get the type of this table for metadata/catalog purposes.
///
/// Defaults to [`TableType::Base`].
fn table_type(&self) -> TableType {
TableType::Base
}
/// Scan the table and returns a SendableRecordBatchStream.
///
/// * `projection` - optional list of column indices; presumably the
///   columns to return, with `None` meaning all of them (TODO confirm).
/// * `filters` - filter expressions handed to the table for the scan.
/// * `limit` - optional row-count hint, described inline below.
async fn scan(
&self,
projection: &Option<Vec<usize>>,
filters: &[Expr],
// limit can be used to reduce the amount scanned
// from the datasource as a performance optimization.
// If set, it contains the amount of rows needed by the `LogicalPlan`,
// The datasource should return *at least* this number of rows if available.
limit: Option<usize>,
) -> Result<SendableRecordBatchStream>;
/// Tests whether the table provider can make use of a filter expression
/// to optimise data retrieval.
///
/// The default implementation reports
/// [`TableProviderFilterPushDown::Unsupported`] for every filter.
fn supports_filter_pushdown(&self, _filter: &Expr) -> Result<TableProviderFilterPushDown> {
Ok(TableProviderFilterPushDown::Unsupported)
}
}
/// Shared, reference-counted handle to a [`Table`] trait object.
pub type TableRef = Arc<dyn Table>;

View File

@@ -0,0 +1,308 @@
use core::pin::Pin;
use core::task::{Context, Poll};
use std::any::Any;
use std::fmt;
use std::fmt::Debug;
use std::mem;
use std::sync::{Arc, Mutex};
use arrow::error::{ArrowError, Result as ArrowResult};
use common_query::logical_plan::Expr;
use common_recordbatch::error::{self as recordbatch_error, Result as RecordBatchResult};
use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
/// Datafusion table adapters
use datafusion::datasource::{
datasource::TableProviderFilterPushDown as DfTableProviderFilterPushDown, TableProvider,
TableType as DfTableType,
};
use datafusion::error::{DataFusionError, Result as DfResult};
use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
use datafusion::logical_plan::Expr as DfExpr;
use datafusion::physical_plan::{
expressions::PhysicalSortExpr, ExecutionPlan, Partitioning,
RecordBatchStream as DfRecordBatchStream,
SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics,
};
use datafusion_common::record_batch::RecordBatch as DfRecordBatch;
use datatypes::schema::SchemaRef as TableSchemaRef;
use datatypes::schema::{Schema, SchemaRef};
use futures::Stream;
use snafu::prelude::*;
use super::{Table, TableProviderFilterPushDown, TableRef, TableType};
use crate::error::{self, Result};
/// Greptime SendableRecordBatchStream -> datafusion ExecutionPlan.
///
/// Wraps a single stream. `execute` moves the stream out of the mutex, so
/// the plan can be executed only once; later calls report
/// `ExecuteRepeatedly`.
struct ExecutionPlanAdapter {
/// The wrapped stream; `None` once `execute` has handed it out.
stream: Mutex<Option<SendableRecordBatchStream>>,
/// Schema reported by this plan.
schema: SchemaRef,
}
/// Placeholder `Debug` output: prints a fixed label and none of the fields.
impl Debug for ExecutionPlanAdapter {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        //TODO(dennis) better debug info
        formatter.write_str("ExecutionPlan(PlaceHolder)")
    }
}
#[async_trait::async_trait]
impl ExecutionPlan for ExecutionPlanAdapter {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> DfSchemaRef {
self.schema.arrow_schema().clone()
}
fn output_partitioning(&self) -> Partitioning {
// FIXME(dennis)
Partitioning::UnknownPartitioning(1)
}
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
// FIXME(dennis)
None
}
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
// TODO(dennis)
vec![]
}
fn with_new_children(
&self,
_children: Vec<Arc<dyn ExecutionPlan>>,
) -> DfResult<Arc<dyn ExecutionPlan>> {
// TODO(dennis)
todo!();
}
async fn execute(
&self,
_partition: usize,
_runtime: Arc<RuntimeEnv>,
) -> DfResult<DfSendableRecordBatchStream> {
let mut stream = self.stream.lock().unwrap();
if stream.is_some() {
let stream = mem::replace(&mut *stream, None);
Ok(Box::pin(DfRecordBatchStreamAdapter::new(stream.unwrap())))
} else {
error::ExecuteRepeatedlySnafu
.fail()
.map_err(|e| DataFusionError::External(Box::new(e)))
}
}
fn statistics(&self) -> Statistics {
//TODO(dennis)
Statistics::default()
}
}
/// Greptime Table -> datafusion TableProvider
///
/// Holds a shared handle to the wrapped table.
pub struct DfTableProviderAdapter {
    table: TableRef,
}

impl DfTableProviderAdapter {
    /// Wraps `table` so it can be used as a datafusion `TableProvider`.
    pub fn new(table: TableRef) -> Self {
        DfTableProviderAdapter { table }
    }
}
/// Forwards every datafusion `TableProvider` call to the wrapped greptime
/// table, converting arguments and results between the two type families.
#[async_trait::async_trait]
impl TableProvider for DfTableProviderAdapter {
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Arrow schema of the wrapped table.
    fn schema(&self) -> DfSchemaRef {
        let table_schema = self.table.schema();
        table_schema.arrow_schema().clone()
    }

    /// Maps the wrapped table's type onto the datafusion equivalent.
    fn table_type(&self) -> DfTableType {
        match self.table.table_type() {
            TableType::Base => DfTableType::Base,
            TableType::View => DfTableType::View,
            TableType::Temporary => DfTableType::Temporary,
        }
    }

    /// Scans the wrapped table and wraps the resulting stream in an
    /// execution plan that can be run once.
    async fn scan(
        &self,
        projection: &Option<Vec<usize>>,
        filters: &[DfExpr],
        limit: Option<usize>,
    ) -> DfResult<Arc<dyn ExecutionPlan>> {
        let table_filters: Vec<Expr> = filters.iter().cloned().map(Expr::new).collect();
        // A table error is converted into a `DataFusionError` by `?`.
        let stream = self.table.scan(projection, &table_filters, limit).await?;
        let schema = stream.schema();

        Ok(Arc::new(ExecutionPlanAdapter {
            schema,
            stream: Mutex::new(Some(stream)),
        }))
    }

    /// Asks the wrapped table how it handles `filter` and translates the
    /// answer into the datafusion enum.
    fn supports_filter_pushdown(&self, filter: &DfExpr) -> DfResult<DfTableProviderFilterPushDown> {
        let table_filter = Expr::new(filter.clone());
        let pushdown = self.table.supports_filter_pushdown(&table_filter)?;

        Ok(match pushdown {
            TableProviderFilterPushDown::Unsupported => DfTableProviderFilterPushDown::Unsupported,
            TableProviderFilterPushDown::Inexact => DfTableProviderFilterPushDown::Inexact,
            TableProviderFilterPushDown::Exact => DfTableProviderFilterPushDown::Exact,
        })
    }
}
/// Datafusion TableProvider -> greptime Table
///
/// Holds a shared handle to the wrapped provider.
pub struct TableAdapter {
    table_provider: Arc<dyn TableProvider>,
}

impl TableAdapter {
    /// Wraps `table_provider` so it can be used as a greptime `Table`.
    pub fn new(table_provider: Arc<dyn TableProvider>) -> Self {
        TableAdapter { table_provider }
    }
}
#[async_trait::async_trait]
impl Table for TableAdapter {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> TableSchemaRef {
Arc::new(self.table_provider.schema().into())
}
fn table_type(&self) -> TableType {
match self.table_provider.table_type() {
DfTableType::Base => TableType::Base,
DfTableType::View => TableType::View,
DfTableType::Temporary => TableType::Temporary,
}
}
async fn scan(
&self,
projection: &Option<Vec<usize>>,
filters: &[Expr],
limit: Option<usize>,
) -> Result<SendableRecordBatchStream> {
let filters: Vec<DfExpr> = filters.iter().map(|e| e.df_expr().clone()).collect();
let execution_plan = self
.table_provider
.scan(projection, &filters, limit)
.await
.context(error::DatafusionSnafu)?;
// FIXME(dennis) Partitioning and runtime
let runtime = RuntimeEnv::new(RuntimeConfig::default()).context(error::DatafusionSnafu)?;
let df_stream = execution_plan
.execute(0, Arc::new(runtime))
.await
.context(error::DatafusionSnafu)?;
Ok(Box::pin(RecordBatchStreamAdapter::new(df_stream)))
}
fn supports_filter_pushdown(&self, filter: &Expr) -> Result<TableProviderFilterPushDown> {
match self
.table_provider
.supports_filter_pushdown(filter.df_expr())
.context(error::DatafusionSnafu)?
{
DfTableProviderFilterPushDown::Unsupported => {
Ok(TableProviderFilterPushDown::Unsupported)
}
DfTableProviderFilterPushDown::Inexact => Ok(TableProviderFilterPushDown::Inexact),
DfTableProviderFilterPushDown::Exact => Ok(TableProviderFilterPushDown::Exact),
}
}
}
/// Greptime SendableRecordBatchStream -> datafusion RecordBatchStream
///
/// Owns the wrapped greptime stream.
pub struct DfRecordBatchStreamAdapter {
    stream: SendableRecordBatchStream,
}

impl DfRecordBatchStreamAdapter {
    /// Wraps `stream` so it can be consumed as a datafusion stream.
    pub fn new(stream: SendableRecordBatchStream) -> Self {
        DfRecordBatchStreamAdapter { stream }
    }
}
impl DfRecordBatchStream for DfRecordBatchStreamAdapter {
    /// Arrow schema of the wrapped greptime stream.
    fn schema(&self) -> DfSchemaRef {
        let stream_schema = self.stream.schema();
        stream_schema.arrow_schema().clone()
    }
}
/// Polls the wrapped greptime stream and converts each item to the
/// datafusion item type.
impl Stream for DfRecordBatchStreamAdapter {
    type Item = ArrowResult<DfRecordBatch>;

    /// Passes `Pending` and end-of-stream through unchanged. A successful
    /// batch is unwrapped to its inner datafusion batch; an error is boxed
    /// into `ArrowError::External` with an empty message.
    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let polled = Pin::new(&mut self.stream).poll_next(cx);

        polled.map(|next| {
            next.map(|result| {
                result
                    .map(|batch| batch.df_recordbatch)
                    .map_err(|e| ArrowError::External("".to_owned(), Box::new(e)))
            })
        })
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.stream.size_hint()
    }
}
/// Datafusion SendableRecordBatchStream to greptime RecordBatchStream
///
/// The greptime schema is built once from the datafusion stream's schema at
/// construction time and then shared, rather than being rebuilt for every
/// `schema()` call and every polled batch.
pub struct RecordBatchStreamAdapter {
    /// The wrapped datafusion stream.
    stream: DfSendableRecordBatchStream,
    /// Greptime schema derived from `stream.schema()` in `new`.
    schema: SchemaRef,
}

impl RecordBatchStreamAdapter {
    /// Wraps `stream` and captures its schema as a greptime schema.
    pub fn new(stream: DfSendableRecordBatchStream) -> Self {
        let schema = Arc::new(Schema::new(stream.schema()));
        Self { stream, schema }
    }
}

impl RecordBatchStream for RecordBatchStreamAdapter {
    /// Returns a cheap clone of the schema captured at construction.
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
}

/// Polls the wrapped datafusion stream and converts each item to the
/// greptime item type.
impl Stream for RecordBatchStreamAdapter {
    type Item = RecordBatchResult<RecordBatch>;

    /// Passes `Pending` and end-of-stream through unchanged. A successful
    /// datafusion batch is paired with the cached schema; an arrow error is
    /// converted through the `ArrowSnafu` context and yielded as an item.
    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        match Pin::new(&mut self.stream).poll_next(cx) {
            Poll::Pending => Poll::Pending,
            Poll::Ready(None) => Poll::Ready(None),
            Poll::Ready(Some(df_result)) => {
                let batch = df_result
                    .context(recordbatch_error::ArrowSnafu)
                    .map(|df_recordbatch| RecordBatch {
                        schema: self.schema.clone(),
                        df_recordbatch,
                    });
                Poll::Ready(Some(batch))
            }
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.stream.size_hint()
    }
}

View File

@@ -0,0 +1 @@