refactor: add scan_to_stream() to Table trait to postpone the stream generation (#1639)

* add scan_to_stream to Table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl parquet stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* reorganise adapters

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement scan_to_stream for mito table

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add location info

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: table scan

* UT pass

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl project record batch

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix information schema

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* resolve CR comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove one todo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix errors generated by merge commit

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add output_ordering method to record batch stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix rustfmt

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* enhance error types

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Lei, HUANG <mrsatangel@gmail.com>
This commit is contained in:
Ruihang Xia
2023-05-29 20:03:47 +08:00
committed by GitHub
parent 0eaae634fa
commit b27c569ae0
34 changed files with 824 additions and 327 deletions

View File

@@ -89,6 +89,12 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to project arrow schema, source: {}", source))]
ProjectArrowSchema {
source: arrow::error::ArrowError,
location: Location,
},
#[snafu(display("Unsupported column default constraint expression: {}", expr))]
UnsupportedDefaultExpr { expr: String, location: Location },

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(let_chains)]
pub mod arrow_array;
pub mod data_type;
pub mod error;

View File

@@ -24,7 +24,7 @@ use datafusion_common::DFSchemaRef;
use snafu::{ensure, ResultExt};
use crate::data_type::DataType;
use crate::error::{self, Error, Result};
use crate::error::{self, Error, ProjectArrowSchemaSnafu, Result};
pub use crate::schema::column_schema::{ColumnSchema, Metadata, COMMENT_KEY, TIME_INDEX_KEY};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;
@@ -70,12 +70,10 @@ impl Schema {
SchemaBuilder::try_from(column_schemas)?.build()
}
/// Returns a reference to the reference-counted arrow schema held by this schema.
#[inline]
pub fn arrow_schema(&self) -> &Arc<ArrowSchema> {
&self.arrow_schema
}
/// Returns the column schemas of this schema as a slice.
#[inline]
pub fn column_schemas(&self) -> &[ColumnSchema] {
&self.column_schemas
}
@@ -89,51 +87,75 @@ impl Schema {
/// Looks up the name of the column stored at position `idx`.
///
/// # Panics
///
/// Panics if `idx` is out of range of the column schemas.
#[inline]
pub fn column_name_by_index(&self, idx: usize) -> &str {
    let column = &self.column_schemas[idx];
    &column.name
}
/// Returns the position recorded for the column called `name`, or `None`
/// when `name` is not present in the name-to-index map.
#[inline]
pub fn column_index_by_name(&self, name: &str) -> Option<usize> {
    let position = self.name_to_index.get(name)?;
    Some(*position)
}
/// Returns `true` when the name-to-index map has an entry for `name`.
#[inline]
pub fn contains_column(&self, name: &str) -> bool {
    // Same map lookup as `column_index_by_name`; only presence matters here.
    self.column_index_by_name(name).is_some()
}
/// Returns the number of column schemas in this schema.
#[inline]
pub fn num_columns(&self) -> usize {
self.column_schemas.len()
}
/// Returns `true` when this schema holds no column schemas.
#[inline]
pub fn is_empty(&self) -> bool {
    self.num_columns() == 0
}
/// Returns the index of the timestamp key column, or `None` if no
/// timestamp index is recorded for this schema.
#[inline]
pub fn timestamp_index(&self) -> Option<usize> {
self.timestamp_index
}
/// Returns the column schema found at the recorded timestamp index, or
/// `None` when no timestamp index is recorded.
#[inline]
pub fn timestamp_column(&self) -> Option<&ColumnSchema> {
    let ts_pos = self.timestamp_index?;
    Some(&self.column_schemas[ts_pos])
}
/// Returns the version number stored in this schema.
#[inline]
pub fn version(&self) -> u32 {
self.version
}
/// Returns the metadata map stored on the underlying arrow schema.
#[inline]
pub fn metadata(&self) -> &HashMap<String, String> {
&self.arrow_schema.metadata
}
/// Generates a new schema that keeps only the columns at `indices`, in the
/// order given.
///
/// The timestamp index is remapped to the position the timestamp column
/// takes in the projected schema, and is `None` when that column is not
/// selected. The schema version is carried over unchanged.
///
/// # Errors
///
/// Returns a `ProjectArrowSchema` error when the underlying arrow schema
/// rejects the projection, e.g. because an index is out of bounds.
///
/// # Panics
///
/// Panics if an index accepted by the arrow schema is out of range of the
/// column schemas. NOTE(review): this presumes both are kept the same
/// length, in which case the panic is unreachable; confirm against the
/// schema builder.
pub fn try_project(&self, indices: &[usize]) -> Result<Self> {
    // Project the arrow schema first: it validates the indices and reports
    // a bad one as an error, so the direct indexing below is not the first
    // thing to see an out-of-bounds index.
    let arrow_schema = self
        .arrow_schema
        .project(indices)
        .context(ProjectArrowSchemaSnafu)?;

    let mut column_schemas = Vec::with_capacity(indices.len());
    let mut timestamp_index = None;
    for &index in indices {
        if self.timestamp_index == Some(index) {
            // The timestamp column lands at the slot about to be filled.
            timestamp_index = Some(column_schemas.len());
        }
        column_schemas.push(self.column_schemas[index].clone());
    }

    let name_to_index = column_schemas
        .iter()
        .enumerate()
        .map(|(pos, column_schema)| (column_schema.name.clone(), pos))
        .collect();

    Ok(Self {
        column_schemas,
        name_to_index,
        arrow_schema: Arc::new(arrow_schema),
        timestamp_index,
        version: self.version,
    })
}
}
#[derive(Default)]