feat: support setting time range in Copy From statement (#4405)

* feat: support setting time range in Copy From statement

* test: add batch_filter_test

* fix: inconsistent ts data type error

* test: add sqlness test for copy from with statement

* fix: sqlness result error

* fix: cr comments
This commit is contained in:
taobo
2024-07-30 00:55:19 +08:00
committed by GitHub
parent 53fc14a50b
commit 1138f32af9
13 changed files with 294 additions and 96 deletions

View File

@@ -25,7 +25,7 @@ use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder};
use datafusion_common::Column;
use datafusion_expr::col;
use datatypes::prelude::ConcreteDataType;
pub use expr::build_filter_from_timestamp;
pub use expr::{build_filter_from_timestamp, build_same_type_ts_filter};
pub use self::accumulator::{Accumulator, AggregateFunctionCreator, AggregateFunctionCreatorRef};
pub use self::udaf::AggregateFunction;

View File

@@ -18,6 +18,35 @@ use common_time::Timestamp;
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::expr::Expr;
use datafusion_expr::{and, binary_expr, Operator};
use datatypes::data_type::DataType;
use datatypes::schema::ColumnSchema;
use datatypes::value::Value;
/// Builds a filter on the timestamp column described by `ts_schema`, with the
/// bounds of `time_range` cast to that column's own data type.
///
/// Each bound that is present is cast through the column's data type with
/// `try_cast`. A bound that is absent, or whose cast yields `None`, is treated
/// as missing:
///
/// * both bounds usable: `TimestampRange::new(start, end)`
/// * only the start usable: `TimestampRange::from_start(start)`
/// * only the end usable: `TimestampRange::until_end(end, false)`
///   (NOTE(review): `false` presumably means an exclusive end — confirm)
/// * anything else: `None`
///
/// The resulting range is handed to [`build_filter_from_timestamp`] together
/// with the column name.
///
/// Returns [None] if time range is [None] or full time range.
pub fn build_same_type_ts_filter(
    ts_schema: &ColumnSchema,
    time_range: Option<TimestampRange>,
) -> Option<Expr> {
    let range = time_range?;

    // Casts one bound to the column's type; `None` when the cast is not possible.
    let cast_bound = |bound| ts_schema.data_type.try_cast(Value::Timestamp(bound));
    let lower = range.start().and_then(cast_bound);
    let upper = range.end().and_then(cast_bound);

    let casted = match (lower, upper) {
        (None, Some(Value::Timestamp(upper))) => Some(TimestampRange::until_end(upper, false)),
        (Some(Value::Timestamp(lower)), None) => Some(TimestampRange::from_start(lower)),
        (Some(Value::Timestamp(lower)), Some(Value::Timestamp(upper))) => {
            TimestampRange::new(lower, upper)
        }
        // No usable bound, or a cast produced a non-timestamp value.
        _ => return None,
    };

    build_filter_from_timestamp(&ts_schema.name, casted.as_ref())
}
/// Builds an `Expr` that filters the timestamp column by the given timestamp range.
/// Returns [None] if time range is [None] or full time range.

View File

@@ -22,19 +22,25 @@ use std::task::{Context, Poll};
use datafusion::arrow::compute::cast;
use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
use datafusion::error::Result as DfResult;
use datafusion::execution::context::ExecutionProps;
use datafusion::logical_expr::utils::conjunction;
use datafusion::logical_expr::Expr;
use datafusion::physical_expr::create_physical_expr;
use datafusion::physical_plan::metrics::{BaselineMetrics, MetricValue};
use datafusion::physical_plan::{
accept, displayable, ExecutionPlan, ExecutionPlanVisitor,
accept, displayable, ExecutionPlan, ExecutionPlanVisitor, PhysicalExpr,
RecordBatchStream as DfRecordBatchStream,
};
use datafusion_common::arrow::error::ArrowError;
use datafusion_common::DataFusionError;
use datafusion_common::{DataFusionError, ToDFSchema};
use datatypes::arrow::array::Array;
use datatypes::schema::{Schema, SchemaRef};
use futures::ready;
use pin_project::pin_project;
use snafu::ResultExt;
use crate::error::{self, Result};
use crate::filter::batch_filter;
use crate::{
DfRecordBatch, DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream,
SendableRecordBatchStream, Stream,
@@ -50,6 +56,7 @@ pub struct RecordBatchStreamTypeAdapter<T, E> {
stream: T,
projected_schema: DfSchemaRef,
projection: Vec<usize>,
predicate: Option<Arc<dyn PhysicalExpr>>,
phantom: PhantomData<E>,
}
@@ -69,9 +76,28 @@ where
stream,
projected_schema,
projection,
predicate: None,
phantom: Default::default(),
}
}
/// Attaches a row filter built from `filters` to this adapter.
///
/// The expressions are merged with `conjunction`. When that yields an
/// expression, it is compiled into a physical expression against the
/// projected schema and stored as the predicate. When it yields `None`
/// (NOTE(review): presumably only for an empty `filters` — confirm), the
/// predicate is reset to `None`.
///
/// # Errors
///
/// Returns a `PhysicalExpr` error if the projected schema cannot be converted
/// to a DataFusion schema, or if the physical expression cannot be created.
pub fn with_filter(mut self, filters: Vec<Expr>) -> Result<Self> {
    self.predicate = match conjunction(filters) {
        None => None,
        Some(combined) => {
            let df_schema = self
                .projected_schema
                .clone()
                .to_dfschema_ref()
                .context(error::PhysicalExprSnafu)?;
            let physical = create_physical_expr(&combined, &df_schema, &ExecutionProps::new())
                .context(error::PhysicalExprSnafu)?;
            Some(physical)
        }
    };
    Ok(self)
}
}
impl<T, E> DfRecordBatchStream for RecordBatchStreamTypeAdapter<T, E>
@@ -99,6 +125,8 @@ where
let projected_schema = this.projected_schema.clone();
let projection = this.projection.clone();
let predicate = this.predicate.clone();
let batch = batch.map(|b| {
b.and_then(|b| {
let projected_column = b.project(&projection)?;
@@ -121,6 +149,11 @@ where
}
}
let record_batch = DfRecordBatch::try_new(projected_schema, columns)?;
let record_batch = if let Some(predicate) = predicate {
batch_filter(&record_batch, &predicate)?
} else {
record_batch
};
Ok(record_batch)
})
});

View File

@@ -73,6 +73,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Create physical expr error"))]
PhysicalExpr {
#[snafu(source)]
error: datafusion::error::DataFusionError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Fail to format record batch"))]
Format {
#[snafu(source)]
@@ -167,7 +175,8 @@ impl ErrorExt for Error {
| Error::PollStream { .. }
| Error::Format { .. }
| Error::ToArrowScalar { .. }
| Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,
| Error::ProjectArrowRecordBatch { .. }
| Error::PhysicalExpr { .. } => StatusCode::Internal,
Error::ArrowCompute { .. } => StatusCode::IllegalState,

View File

@@ -14,11 +14,18 @@
//! Util record batch stream wrapper that can perform precise filter.
use std::sync::Arc;
use datafusion::error::Result as DfResult;
use datafusion::logical_expr::{Expr, Literal, Operator};
use datafusion::physical_plan::PhysicalExpr;
use datafusion_common::arrow::array::{ArrayRef, Datum, Scalar};
use datafusion_common::arrow::buffer::BooleanBuffer;
use datafusion_common::arrow::compute::kernels::cmp;
use datafusion_common::ScalarValue;
use datafusion_common::cast::{as_boolean_array, as_null_array};
use datafusion_common::{internal_err, DataFusionError, ScalarValue};
use datatypes::arrow::array::{Array, BooleanArray, RecordBatch};
use datatypes::arrow::compute::filter_record_batch;
use datatypes::vectors::VectorRef;
use snafu::ResultExt;
@@ -144,13 +151,43 @@ impl SimpleFilterEvaluator {
}
}
/// Applies `predicate` to `batch` and returns a new [RecordBatch] produced by
/// filtering `batch` with the resulting boolean mask.
///
/// The predicate is evaluated and materialized as an array with one entry per
/// input row. A boolean array is used as the mask directly; a null array is
/// replaced by an all-null boolean mask of the same length.
///
/// # Errors
///
/// Propagates any error from evaluating the predicate or from filtering the
/// batch, and returns an internal error when the predicate result is neither
/// a boolean nor a null array.
///
/// Copy from datafusion::physical_plan::src::filter.rs
pub fn batch_filter(
    batch: &RecordBatch,
    predicate: &Arc<dyn PhysicalExpr>,
) -> DfResult<RecordBatch> {
    let evaluated = predicate.evaluate(batch)?.into_array(batch.num_rows())?;

    let mask = if let Ok(booleans) = as_boolean_array(&evaluated) {
        booleans.clone()
    } else if let Ok(nulls) = as_null_array(&evaluated) {
        // if the predicate is null, then the result is also null
        BooleanArray::new_null(nulls.len())
    } else {
        return internal_err!("Cannot create filter_array from non-boolean predicates");
    };

    Ok(filter_record_batch(batch, &mask)?)
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use datafusion::logical_expr::BinaryExpr;
use datafusion_common::Column;
use datafusion::execution::context::ExecutionProps;
use datafusion::logical_expr::{col, lit, BinaryExpr};
use datafusion::physical_expr::create_physical_expr;
use datafusion_common::{Column, DFSchema};
use datatypes::arrow::datatypes::{DataType, Field, Schema};
use super::*;
@@ -281,4 +318,35 @@ mod test {
let result = evaluator.evaluate_scalar(&input_3).unwrap();
assert!(!result);
}
/// Checks that `batch_filter` keeps only the rows whose `ts` value is strictly
/// greater than the literal in the predicate.
#[test]
fn batch_filter_test() {
    use datatypes::arrow::array::{Int32Array, UInt64Array};

    // Two-column schema; the predicate only looks at `ts`.
    let schema = Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("ts", DataType::UInt64, false),
    ]);
    let df_schema = DFSchema::try_from(schema.clone()).unwrap();

    // Compile `ts > 123456` into a physical expression.
    let predicate = create_physical_expr(
        &col("ts").gt(lit(123456u64)),
        &df_schema,
        &ExecutionProps::new(),
    )
    .unwrap();

    // Three rows; only the last two have `ts` above the threshold.
    let input = RecordBatch::try_new(
        Arc::new(schema),
        vec![
            Arc::new(Int32Array::from(vec![4, 5, 6])),
            Arc::new(UInt64Array::from(vec![123456, 123457, 123458])),
        ],
    )
    .unwrap();

    let filtered = batch_filter(&input, &predicate).unwrap();
    assert_eq!(filtered.num_rows(), 2);

    let kept = filtered
        .column(0)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();
    assert_eq!(kept, &Int32Array::from(vec![5, 6]));
}
}