mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-26 09:50:40 +00:00
refactor: add scan_to_stream() to Table trait to postpone the stream generation (#1639)
* add scan_to_stream to Table Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * impl parquet stream Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * reorganise adapters Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * implement scan_to_stream for mito table Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * clean up Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * add location info Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix: table scan * UT pass Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * impl project record batch Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix information schema Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix clippy Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * resolve CR comments Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * remove one todo Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix errors generated by merge commit Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * add output_ordering method to record batch stream Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix rustfmt Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * enhance error types Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com> Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>
This commit is contained in:
@@ -111,7 +111,7 @@ impl Stream for DfRecordBatchStreamAdapter {
|
||||
}
|
||||
}
|
||||
|
||||
/// DataFusion SendableRecordBatchStream -> Greptime RecordBatchStream
|
||||
/// DataFusion [SendableRecordBatchStream](DfSendableRecordBatchStream) -> Greptime [RecordBatchStream]
|
||||
pub struct RecordBatchStreamAdapter {
|
||||
schema: SchemaRef,
|
||||
stream: DfSendableRecordBatchStream,
|
||||
|
||||
@@ -70,6 +70,19 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Failed to project Arrow RecordBatch with schema {:?} and projection {:?}, source: {}",
|
||||
schema,
|
||||
projection,
|
||||
source
|
||||
))]
|
||||
ProjectArrowRecordBatch {
|
||||
source: datatypes::arrow::error::ArrowError,
|
||||
location: Location,
|
||||
schema: datatypes::schema::SchemaRef,
|
||||
projection: Vec<usize>,
|
||||
},
|
||||
|
||||
#[snafu(display("Column {} not exists in table {}", column_name, table_name))]
|
||||
ColumnNotExists {
|
||||
column_name: String,
|
||||
@@ -101,7 +114,8 @@ impl ErrorExt for Error {
|
||||
| Error::PollStream { .. }
|
||||
| Error::Format { .. }
|
||||
| Error::InitRecordbatchStream { .. }
|
||||
| Error::ColumnNotExists { .. } => StatusCode::Internal,
|
||||
| Error::ColumnNotExists { .. }
|
||||
| Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,
|
||||
|
||||
Error::External { source } => source.status_code(),
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ use std::sync::Arc;
|
||||
|
||||
use datafusion::physical_plan::memory::MemoryStream;
|
||||
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
|
||||
use datatypes::arrow::compute::SortOptions;
|
||||
pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch;
|
||||
use datatypes::arrow::util::pretty;
|
||||
use datatypes::prelude::VectorRef;
|
||||
@@ -34,10 +35,20 @@ use snafu::{ensure, ResultExt};
|
||||
|
||||
pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
|
||||
fn schema(&self) -> SchemaRef;
|
||||
|
||||
fn output_ordering(&self) -> Option<&[OrderOption]> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct OrderOption {
|
||||
pub index: usize,
|
||||
pub options: SortOptions,
|
||||
}
|
||||
|
||||
/// EmptyRecordBatchStream can be used to create a RecordBatchStream
|
||||
/// that will produce no results
|
||||
pub struct EmptyRecordBatchStream {
|
||||
@@ -181,6 +192,26 @@ impl Stream for SimpleRecordBatchStream {
|
||||
}
|
||||
}
|
||||
|
||||
/// Adapt a [Stream] of [RecordBatch] to a [RecordBatchStream].
|
||||
pub struct RecordBatchStreamAdaptor {
|
||||
pub schema: SchemaRef,
|
||||
pub stream: Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>,
|
||||
}
|
||||
|
||||
impl RecordBatchStream for RecordBatchStreamAdaptor {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for RecordBatchStreamAdaptor {
|
||||
type Item = Result<RecordBatch>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
Pin::new(&mut self.stream).poll_next(ctx)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use datatypes::schema::SchemaRef;
|
||||
use datatypes::value::Value;
|
||||
@@ -21,7 +22,10 @@ use serde::ser::{Error, SerializeStruct};
|
||||
use serde::{Serialize, Serializer};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
|
||||
use crate::error::{self, CastVectorSnafu, ColumnNotExistsSnafu, Result};
|
||||
use crate::error::{
|
||||
self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
|
||||
Result,
|
||||
};
|
||||
use crate::DfRecordBatch;
|
||||
|
||||
/// A two-dimensional batch of column-oriented data with a defined schema.
|
||||
@@ -51,6 +55,26 @@ impl RecordBatch {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn try_project(&self, indices: &[usize]) -> Result<Self> {
|
||||
let schema = Arc::new(self.schema.try_project(indices).context(DataTypesSnafu)?);
|
||||
let mut columns = Vec::with_capacity(indices.len());
|
||||
for index in indices {
|
||||
columns.push(self.columns[*index].clone());
|
||||
}
|
||||
let df_record_batch = self.df_record_batch.project(indices).with_context(|_| {
|
||||
ProjectArrowRecordBatchSnafu {
|
||||
schema: self.schema.clone(),
|
||||
projection: indices.to_vec(),
|
||||
}
|
||||
})?;
|
||||
|
||||
Ok(Self {
|
||||
schema,
|
||||
columns,
|
||||
df_record_batch,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a new [`RecordBatch`] from `schema` and `df_record_batch`.
|
||||
///
|
||||
/// This method doesn't check the schema.
|
||||
|
||||
Reference in New Issue
Block a user