refactor: add scan_to_stream() to Table trait to postpone the stream generation (#1639)

* add scan_to_stream to Table
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl parquet stream
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* reorganise adapters
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement scan_to_stream for mito table
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add location info
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: table scan

* UT pass
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl project record batch
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix information schema
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* resolve CR comments
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove one todo
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix errors generated by merge commit
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add output_ordering method to record batch stream
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix rustfmt
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* enhance error types
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>
2026-05-24 17:00:37 +00:00 · 2023-05-29 20:03:47 +08:00
parent 0eaae634fa
commit b27c569ae0
34 changed files with 824 additions and 327 deletions
--- a/src/common/grpc-expr/src/insert.rs
+++ b/src/common/grpc-expr/src/insert.rs
@@ -459,26 +459,20 @@ fn is_null(null_mask: &BitVec, idx: usize) -> Option<bool> {

 #[cfg(test)]
 mod tests {
-    use std::any::Any;
    use std::sync::Arc;
-    use std::{assert_eq, unimplemented, vec};
+    use std::{assert_eq, vec};

    use api::helper::ColumnDataTypeWrapper;
    use api::v1::column::{self, SemanticType, Values};
    use api::v1::{Column, ColumnDataType};
    use common_base::BitVec;
    use common_catalog::consts::MITO_ENGINE;
-    use common_query::physical_plan::PhysicalPlanRef;
-    use common_query::prelude::Expr;
    use common_time::timestamp::Timestamp;
    use datatypes::data_type::ConcreteDataType;
-    use datatypes::schema::{ColumnSchema, SchemaBuilder, SchemaRef};
+    use datatypes::schema::{ColumnSchema, SchemaBuilder};
    use datatypes::types::{TimestampMillisecondType, TimestampSecondType, TimestampType};
    use datatypes::value::Value;
    use snafu::ResultExt;
-    use table::error::Result as TableResult;
-    use table::metadata::TableInfoRef;
-    use table::Table;

    use super::*;
    use crate::error;
@@ -733,49 +727,6 @@ mod tests {
        assert_eq!(None, is_null(&null_mask, 99));
    }

-    struct DemoTable;
-
-    #[async_trait::async_trait]
-    impl Table for DemoTable {
-        fn as_any(&self) -> &dyn Any {
-            self
-        }
-
-        fn schema(&self) -> SchemaRef {
-            let column_schemas = vec![
-                ColumnSchema::new("host", ConcreteDataType::string_datatype(), false),
-                ColumnSchema::new("cpu", ConcreteDataType::float64_datatype(), true),
-                ColumnSchema::new("memory", ConcreteDataType::float64_datatype(), true),
-                ColumnSchema::new(
-                    "ts",
-                    ConcreteDataType::timestamp_millisecond_datatype(),
-                    true,
-                )
-                .with_time_index(true),
-            ];
-
-            Arc::new(
-                SchemaBuilder::try_from(column_schemas)
-                    .unwrap()
-                    .build()
-                    .unwrap(),
-            )
-        }
-
-        fn table_info(&self) -> TableInfoRef {
-            unimplemented!()
-        }
-
-        async fn scan(
-            &self,
-            _projection: Option<&Vec<usize>>,
-            _filters: &[Expr],
-            _limit: Option<usize>,
-        ) -> TableResult<PhysicalPlanRef> {
-            unimplemented!();
-        }
-    }
-
    fn mock_insert_batch() -> (Vec<Column>, u32) {
        let row_count = 2;

--- a/src/common/query/src/physical_plan.rs
+++ b/src/common/query/src/physical_plan.rs
@@ -71,6 +71,7 @@ pub trait PhysicalPlan: Debug + Send + Sync {
    ) -> Result<SendableRecordBatchStream>;
 }

+/// Adapt DataFusion's [`ExecutionPlan`](DfPhysicalPlan) to GreptimeDB's [`PhysicalPlan`].
 #[derive(Debug)]
 pub struct PhysicalPlanAdapter {
    schema: SchemaRef,
--- a/src/common/recordbatch/src/adapter.rs
+++ b/src/common/recordbatch/src/adapter.rs
@@ -111,7 +111,7 @@ impl Stream for DfRecordBatchStreamAdapter {
    }
 }

-/// DataFusion SendableRecordBatchStream -> Greptime RecordBatchStream
+/// DataFusion [SendableRecordBatchStream](DfSendableRecordBatchStream) -> Greptime [RecordBatchStream]
 pub struct RecordBatchStreamAdapter {
    schema: SchemaRef,
    stream: DfSendableRecordBatchStream,
--- a/src/common/recordbatch/src/error.rs
+++ b/src/common/recordbatch/src/error.rs
@@ -70,6 +70,19 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display(
+        "Failed to project Arrow RecordBatch with schema {:?} and projection {:?}, source: {}",
+        schema,
+        projection,
+        source
+    ))]
+    ProjectArrowRecordBatch {
+        source: datatypes::arrow::error::ArrowError,
+        location: Location,
+        schema: datatypes::schema::SchemaRef,
+        projection: Vec<usize>,
+    },
+
    #[snafu(display("Column {} not exists in table {}", column_name, table_name))]
    ColumnNotExists {
        column_name: String,
@@ -101,7 +114,8 @@ impl ErrorExt for Error {
            | Error::PollStream { .. }
            | Error::Format { .. }
            | Error::InitRecordbatchStream { .. }
-            | Error::ColumnNotExists { .. } => StatusCode::Internal,
+            | Error::ColumnNotExists { .. }
+            | Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,

            Error::External { source } => source.status_code(),

--- a/src/common/recordbatch/src/lib.rs
+++ b/src/common/recordbatch/src/lib.rs
@@ -22,6 +22,7 @@ use std::sync::Arc;

 use datafusion::physical_plan::memory::MemoryStream;
 pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
+use datatypes::arrow::compute::SortOptions;
 pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
 use datatypes::arrow::util::pretty;
 use datatypes::prelude::VectorRef;
@@ -34,10 +35,20 @@ use snafu::{ensure, ResultExt};

 pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
    fn schema(&self) -> SchemaRef;
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
 }

 pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;

+#[derive(Debug, Clone, Copy)]
+pub struct OrderOption {
+    pub index: usize,
+    pub options: SortOptions,
+}
+
 /// EmptyRecordBatchStream can be used to create a RecordBatchStream
 /// that will produce no results
 pub struct EmptyRecordBatchStream {
@@ -181,6 +192,26 @@ impl Stream for SimpleRecordBatchStream {
    }
 }

+/// Adapt a [Stream] of [RecordBatch] to a [RecordBatchStream].
+pub struct RecordBatchStreamAdaptor {
+    pub schema: SchemaRef,
+    pub stream: Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>,
+}
+
+impl RecordBatchStream for RecordBatchStreamAdaptor {
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+}
+
+impl Stream for RecordBatchStreamAdaptor {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        Pin::new(&mut self.stream).poll_next(ctx)
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::sync::Arc;
--- a/src/common/recordbatch/src/recordbatch.rs
+++ b/src/common/recordbatch/src/recordbatch.rs
@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::collections::HashMap;
+use std::sync::Arc;

 use datatypes::schema::SchemaRef;
 use datatypes::value::Value;
@@ -21,7 +22,10 @@ use serde::ser::{Error, SerializeStruct};
 use serde::{Serialize, Serializer};
 use snafu::{OptionExt, ResultExt};

-use crate::error::{self, CastVectorSnafu, ColumnNotExistsSnafu, Result};
+use crate::error::{
+    self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
+    Result,
+};
 use crate::DfRecordBatch;

 /// A two-dimensional batch of column-oriented data with a defined schema.
@@ -51,6 +55,26 @@ impl RecordBatch {
        })
    }

+    pub fn try_project(&self, indices: &[usize]) -> Result<Self> {
+        let schema = Arc::new(self.schema.try_project(indices).context(DataTypesSnafu)?);
+        let mut columns = Vec::with_capacity(indices.len());
+        for index in indices {
+            columns.push(self.columns[*index].clone());
+        }
+        let df_record_batch = self.df_record_batch.project(indices).with_context(|_| {
+            ProjectArrowRecordBatchSnafu {
+                schema: self.schema.clone(),
+                projection: indices.to_vec(),
+            }
+        })?;
+
+        Ok(Self {
+            schema,
+            columns,
+            df_record_batch,
+        })
+    }
+
    /// Create a new [`RecordBatch`] from `schema` and `df_record_batch`.
    ///
    /// This method doesn't check the schema.