mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-31 04:10:38 +00:00
feat: support setting time range in Copy From statement (#4405)
* feat: support setting time range in Copy From statement * test: add batch_filter_test * fix: ts data type inconsistent error * test: add sqlness test for copy from with statement * fix: sqlness result error * fix: cr comments
This commit is contained in:
@@ -25,7 +25,7 @@ use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder};
|
||||
use datafusion_common::Column;
|
||||
use datafusion_expr::col;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
pub use expr::build_filter_from_timestamp;
|
||||
pub use expr::{build_filter_from_timestamp, build_same_type_ts_filter};
|
||||
|
||||
pub use self::accumulator::{Accumulator, AggregateFunctionCreator, AggregateFunctionCreatorRef};
|
||||
pub use self::udaf::AggregateFunction;
|
||||
|
||||
@@ -18,6 +18,35 @@ use common_time::Timestamp;
|
||||
use datafusion_common::{Column, ScalarValue};
|
||||
use datafusion_expr::expr::Expr;
|
||||
use datafusion_expr::{and, binary_expr, Operator};
|
||||
use datatypes::data_type::DataType;
|
||||
use datatypes::schema::ColumnSchema;
|
||||
use datatypes::value::Value;
|
||||
|
||||
/// Builds a filter for a timestamp column with the same type as the timestamp column.
|
||||
/// Returns [None] if time range is [None] or full time range.
|
||||
pub fn build_same_type_ts_filter(
|
||||
ts_schema: &ColumnSchema,
|
||||
time_range: Option<TimestampRange>,
|
||||
) -> Option<Expr> {
|
||||
let ts_type = ts_schema.data_type.clone();
|
||||
let time_range = time_range?;
|
||||
let start = time_range
|
||||
.start()
|
||||
.and_then(|start| ts_type.try_cast(Value::Timestamp(start)));
|
||||
let end = time_range
|
||||
.end()
|
||||
.and_then(|end| ts_type.try_cast(Value::Timestamp(end)));
|
||||
|
||||
let time_range = match (start, end) {
|
||||
(Some(Value::Timestamp(start)), Some(Value::Timestamp(end))) => {
|
||||
TimestampRange::new(start, end)
|
||||
}
|
||||
(Some(Value::Timestamp(start)), None) => Some(TimestampRange::from_start(start)),
|
||||
(None, Some(Value::Timestamp(end))) => Some(TimestampRange::until_end(end, false)),
|
||||
_ => return None,
|
||||
};
|
||||
build_filter_from_timestamp(&ts_schema.name, time_range.as_ref())
|
||||
}
|
||||
|
||||
/// Builds an `Expr` that filters timestamp column from given timestamp range.
|
||||
/// Returns [None] if time range is [None] or full time range.
|
||||
|
||||
@@ -22,19 +22,25 @@ use std::task::{Context, Poll};
|
||||
use datafusion::arrow::compute::cast;
|
||||
use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
|
||||
use datafusion::error::Result as DfResult;
|
||||
use datafusion::execution::context::ExecutionProps;
|
||||
use datafusion::logical_expr::utils::conjunction;
|
||||
use datafusion::logical_expr::Expr;
|
||||
use datafusion::physical_expr::create_physical_expr;
|
||||
use datafusion::physical_plan::metrics::{BaselineMetrics, MetricValue};
|
||||
use datafusion::physical_plan::{
|
||||
accept, displayable, ExecutionPlan, ExecutionPlanVisitor,
|
||||
accept, displayable, ExecutionPlan, ExecutionPlanVisitor, PhysicalExpr,
|
||||
RecordBatchStream as DfRecordBatchStream,
|
||||
};
|
||||
use datafusion_common::arrow::error::ArrowError;
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_common::{DataFusionError, ToDFSchema};
|
||||
use datatypes::arrow::array::Array;
|
||||
use datatypes::schema::{Schema, SchemaRef};
|
||||
use futures::ready;
|
||||
use pin_project::pin_project;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::filter::batch_filter;
|
||||
use crate::{
|
||||
DfRecordBatch, DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream,
|
||||
SendableRecordBatchStream, Stream,
|
||||
@@ -50,6 +56,7 @@ pub struct RecordBatchStreamTypeAdapter<T, E> {
|
||||
stream: T,
|
||||
projected_schema: DfSchemaRef,
|
||||
projection: Vec<usize>,
|
||||
predicate: Option<Arc<dyn PhysicalExpr>>,
|
||||
phantom: PhantomData<E>,
|
||||
}
|
||||
|
||||
@@ -69,9 +76,28 @@ where
|
||||
stream,
|
||||
projected_schema,
|
||||
projection,
|
||||
predicate: None,
|
||||
phantom: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_filter(mut self, filters: Vec<Expr>) -> Result<Self> {
|
||||
let filters = if let Some(expr) = conjunction(filters) {
|
||||
let df_schema = self
|
||||
.projected_schema
|
||||
.clone()
|
||||
.to_dfschema_ref()
|
||||
.context(error::PhysicalExprSnafu)?;
|
||||
|
||||
let filters = create_physical_expr(&expr, &df_schema, &ExecutionProps::new())
|
||||
.context(error::PhysicalExprSnafu)?;
|
||||
Some(filters)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
self.predicate = filters;
|
||||
Ok(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, E> DfRecordBatchStream for RecordBatchStreamTypeAdapter<T, E>
|
||||
@@ -99,6 +125,8 @@ where
|
||||
|
||||
let projected_schema = this.projected_schema.clone();
|
||||
let projection = this.projection.clone();
|
||||
let predicate = this.predicate.clone();
|
||||
|
||||
let batch = batch.map(|b| {
|
||||
b.and_then(|b| {
|
||||
let projected_column = b.project(&projection)?;
|
||||
@@ -121,6 +149,11 @@ where
|
||||
}
|
||||
}
|
||||
let record_batch = DfRecordBatch::try_new(projected_schema, columns)?;
|
||||
let record_batch = if let Some(predicate) = predicate {
|
||||
batch_filter(&record_batch, &predicate)?
|
||||
} else {
|
||||
record_batch
|
||||
};
|
||||
Ok(record_batch)
|
||||
})
|
||||
});
|
||||
|
||||
@@ -73,6 +73,14 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Create physical expr error"))]
|
||||
PhysicalExpr {
|
||||
#[snafu(source)]
|
||||
error: datafusion::error::DataFusionError,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Fail to format record batch"))]
|
||||
Format {
|
||||
#[snafu(source)]
|
||||
@@ -167,7 +175,8 @@ impl ErrorExt for Error {
|
||||
| Error::PollStream { .. }
|
||||
| Error::Format { .. }
|
||||
| Error::ToArrowScalar { .. }
|
||||
| Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,
|
||||
| Error::ProjectArrowRecordBatch { .. }
|
||||
| Error::PhysicalExpr { .. } => StatusCode::Internal,
|
||||
|
||||
Error::ArrowCompute { .. } => StatusCode::IllegalState,
|
||||
|
||||
|
||||
@@ -14,11 +14,18 @@
|
||||
|
||||
//! Util record batch stream wrapper that can perform precise filter.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::error::Result as DfResult;
|
||||
use datafusion::logical_expr::{Expr, Literal, Operator};
|
||||
use datafusion::physical_plan::PhysicalExpr;
|
||||
use datafusion_common::arrow::array::{ArrayRef, Datum, Scalar};
|
||||
use datafusion_common::arrow::buffer::BooleanBuffer;
|
||||
use datafusion_common::arrow::compute::kernels::cmp;
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_common::cast::{as_boolean_array, as_null_array};
|
||||
use datafusion_common::{internal_err, DataFusionError, ScalarValue};
|
||||
use datatypes::arrow::array::{Array, BooleanArray, RecordBatch};
|
||||
use datatypes::arrow::compute::filter_record_batch;
|
||||
use datatypes::vectors::VectorRef;
|
||||
use snafu::ResultExt;
|
||||
|
||||
@@ -144,13 +151,43 @@ impl SimpleFilterEvaluator {
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate the predicate on the input [RecordBatch], and return a new [RecordBatch].
|
||||
/// Copy from datafusion::physical_plan::src::filter.rs
|
||||
pub fn batch_filter(
|
||||
batch: &RecordBatch,
|
||||
predicate: &Arc<dyn PhysicalExpr>,
|
||||
) -> DfResult<RecordBatch> {
|
||||
predicate
|
||||
.evaluate(batch)
|
||||
.and_then(|v| v.into_array(batch.num_rows()))
|
||||
.and_then(|array| {
|
||||
let filter_array = match as_boolean_array(&array) {
|
||||
Ok(boolean_array) => Ok(boolean_array.clone()),
|
||||
Err(_) => {
|
||||
let Ok(null_array) = as_null_array(&array) else {
|
||||
return internal_err!(
|
||||
"Cannot create filter_array from non-boolean predicates"
|
||||
);
|
||||
};
|
||||
|
||||
// if the predicate is null, then the result is also null
|
||||
Ok::<BooleanArray, DataFusionError>(BooleanArray::new_null(null_array.len()))
|
||||
}
|
||||
}?;
|
||||
Ok(filter_record_batch(batch, &filter_array)?)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use datafusion::logical_expr::BinaryExpr;
|
||||
use datafusion_common::Column;
|
||||
use datafusion::execution::context::ExecutionProps;
|
||||
use datafusion::logical_expr::{col, lit, BinaryExpr};
|
||||
use datafusion::physical_expr::create_physical_expr;
|
||||
use datafusion_common::{Column, DFSchema};
|
||||
use datatypes::arrow::datatypes::{DataType, Field, Schema};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -281,4 +318,35 @@ mod test {
|
||||
let result = evaluator.evaluate_scalar(&input_3).unwrap();
|
||||
assert!(!result);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn batch_filter_test() {
|
||||
let expr = col("ts").gt(lit(123456u64));
|
||||
let schema = Schema::new(vec![
|
||||
Field::new("a", DataType::Int32, true),
|
||||
Field::new("ts", DataType::UInt64, false),
|
||||
]);
|
||||
let df_schema = DFSchema::try_from(schema.clone()).unwrap();
|
||||
let props = ExecutionProps::new();
|
||||
let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
|
||||
let batch = RecordBatch::try_new(
|
||||
Arc::new(schema),
|
||||
vec![
|
||||
Arc::new(datatypes::arrow::array::Int32Array::from(vec![4, 5, 6])),
|
||||
Arc::new(datatypes::arrow::array::UInt64Array::from(vec![
|
||||
123456, 123457, 123458,
|
||||
])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
let new_batch = batch_filter(&batch, &physical_expr).unwrap();
|
||||
assert_eq!(new_batch.num_rows(), 2);
|
||||
let first_column_values = new_batch
|
||||
.column(0)
|
||||
.as_any()
|
||||
.downcast_ref::<datatypes::arrow::array::Int32Array>()
|
||||
.unwrap();
|
||||
let expected = datatypes::arrow::array::Int32Array::from(vec![5, 6]);
|
||||
assert_eq!(first_column_values, &expected);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user