refactor: add scan_to_stream() to Table trait to postpone the stream generation (#1639)

* add scan_to_stream to Table
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* impl parquet stream
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* reorganise adapters
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* implement scan_to_stream for mito table
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* clean up
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* add location info
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* fix: table scan
* UT pass
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* impl project record batch
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* fix information schema
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* fix clippy
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* resolve CR comments
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* remove one todo
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* fix errors generated by merge commit
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* add output_ordering method to record batch stream
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* fix rustfmt
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
* enhance error types
  Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>
2026-05-26 09:50:40 +00:00 · 2023-05-29 20:03:47 +08:00
parent 0eaae634fa
commit b27c569ae0
34 changed files with 824 additions and 327 deletions
--- a/src/common/recordbatch/src/adapter.rs
+++ b/src/common/recordbatch/src/adapter.rs
@@ -111,7 +111,7 @@ impl Stream for DfRecordBatchStreamAdapter {
    }
 }

-/// DataFusion SendableRecordBatchStream -> Greptime RecordBatchStream
+/// DataFusion [SendableRecordBatchStream](DfSendableRecordBatchStream) -> Greptime [RecordBatchStream]
 pub struct RecordBatchStreamAdapter {
    schema: SchemaRef,
    stream: DfSendableRecordBatchStream,
--- a/src/common/recordbatch/src/error.rs
+++ b/src/common/recordbatch/src/error.rs
@@ -70,6 +70,19 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display(
+        "Failed to project Arrow RecordBatch with schema {:?} and projection {:?}, source: {}",
+        schema,
+        projection,
+        source
+    ))]
+    ProjectArrowRecordBatch {
+        source: datatypes::arrow::error::ArrowError,
+        location: Location,
+        schema: datatypes::schema::SchemaRef,
+        projection: Vec<usize>,
+    },
+
    #[snafu(display("Column {} not exists in table {}", column_name, table_name))]
    ColumnNotExists {
        column_name: String,
@@ -101,7 +114,8 @@ impl ErrorExt for Error {
            | Error::PollStream { .. }
            | Error::Format { .. }
            | Error::InitRecordbatchStream { .. }
-            | Error::ColumnNotExists { .. } => StatusCode::Internal,
+            | Error::ColumnNotExists { .. }
+            | Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,

            Error::External { source } => source.status_code(),

--- a/src/common/recordbatch/src/lib.rs
+++ b/src/common/recordbatch/src/lib.rs
@@ -22,6 +22,7 @@ use std::sync::Arc;

 use datafusion::physical_plan::memory::MemoryStream;
 pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
+use datatypes::arrow::compute::SortOptions;
 pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
 use datatypes::arrow::util::pretty;
 use datatypes::prelude::VectorRef;
@@ -34,10 +35,20 @@ use snafu::{ensure, ResultExt};

 pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
    fn schema(&self) -> SchemaRef;
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        None
+    }
 }

 pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;

+#[derive(Debug, Clone, Copy)]
+pub struct OrderOption {
+    pub index: usize,
+    pub options: SortOptions,
+}
+
 /// EmptyRecordBatchStream can be used to create a RecordBatchStream
 /// that will produce no results
 pub struct EmptyRecordBatchStream {
@@ -181,6 +192,26 @@ impl Stream for SimpleRecordBatchStream {
    }
 }

+/// Adapt a [Stream] of [RecordBatch] to a [RecordBatchStream].
+pub struct RecordBatchStreamAdaptor {
+    pub schema: SchemaRef,
+    pub stream: Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>,
+}
+
+impl RecordBatchStream for RecordBatchStreamAdaptor {
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+}
+
+impl Stream for RecordBatchStreamAdaptor {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        Pin::new(&mut self.stream).poll_next(ctx)
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::sync::Arc;
--- a/src/common/recordbatch/src/recordbatch.rs
+++ b/src/common/recordbatch/src/recordbatch.rs
@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::collections::HashMap;
+use std::sync::Arc;

 use datatypes::schema::SchemaRef;
 use datatypes::value::Value;
@@ -21,7 +22,10 @@ use serde::ser::{Error, SerializeStruct};
 use serde::{Serialize, Serializer};
 use snafu::{OptionExt, ResultExt};

-use crate::error::{self, CastVectorSnafu, ColumnNotExistsSnafu, Result};
+use crate::error::{
+    self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
+    Result,
+};
 use crate::DfRecordBatch;

 /// A two-dimensional batch of column-oriented data with a defined schema.
@@ -51,6 +55,26 @@ impl RecordBatch {
        })
    }

+    pub fn try_project(&self, indices: &[usize]) -> Result<Self> {
+        let schema = Arc::new(self.schema.try_project(indices).context(DataTypesSnafu)?);
+        let mut columns = Vec::with_capacity(indices.len());
+        for index in indices {
+            columns.push(self.columns[*index].clone());
+        }
+        let df_record_batch = self.df_record_batch.project(indices).with_context(|_| {
+            ProjectArrowRecordBatchSnafu {
+                schema: self.schema.clone(),
+                projection: indices.to_vec(),
+            }
+        })?;
+
+        Ok(Self {
+            schema,
+            columns,
+            df_record_batch,
+        })
+    }
+
    /// Create a new [`RecordBatch`] from `schema` and `df_record_batch`.
    ///
    /// This method doesn't check the schema.