feat(flow): add eval_batch for ScalarExpr (#4551)

* refactor: better perf flow

* feat(WIP): batching proc

* feat: UnaryFunc::eval_batch untested

* feat: BinaryFunc::eval_batch untested

* feat: VariadicFunc::eval_batch untested

* feat: literal eval_batch

* refactor: move DfScalarFunc to separate file

* chore: remove unused imports

* feat: eval_batch df func&ifthen

* chore: remove unused file

* refactor: use Batch type

* chore: remove unused

* chore: remove a done TODO

* refactor: per review

* chore: import

* refactor: eval_batch if then

* chore: typo
discord9
2024-08-14 19:29:30 +08:00
committed by GitHub
parent c1b1be47ba
commit 2c3fccb516
30 changed files with 980 additions and 389 deletions

Cargo.lock generated
View File

@@ -3798,6 +3798,7 @@ name = "flow"
version = "0.9.1"
dependencies = [
"api",
"arrow",
"arrow-schema",
"async-recursion",
"async-trait",

View File

@@ -9,6 +9,7 @@ workspace = true
[dependencies]
api.workspace = true
arrow.workspace = true
arrow-schema.workspace = true
async-recursion = "1.0"
async-trait.workspace = true

View File

@@ -16,32 +16,21 @@
//!
//! And the [`Context`] is the environment for the render process; it contains all the information the render process needs
use std::cell::RefCell;
use std::collections::{BTreeMap, VecDeque};
use std::ops::Range;
use std::rc::Rc;
use std::collections::BTreeMap;
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{ListValue, Value};
use hydroflow::futures::SinkExt;
use hydroflow::lattices::cc_traits::Get;
use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::port::{PortCtx, SEND};
use itertools::Itertools;
use snafu::{ensure, OptionExt, ResultExt};
use snafu::OptionExt;
use super::state::Scheduler;
use crate::compute::state::DataflowState;
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::error::{Error, EvalSnafu, InvalidQuerySnafu, NotImplementedSnafu, PlanSnafu};
use crate::expr::error::{DataTypeSnafu, InternalSnafu};
use crate::expr::{
self, EvalError, GlobalId, LocalId, MapFilterProject, MfpPlan, SafeMfpPlan, ScalarExpr,
};
use crate::plan::{AccumulablePlan, KeyValPlan, Plan, ReducePlan, TypedPlan};
use crate::repr::{self, DiffRow, KeyValDiffRow, Row};
use crate::utils::{ArrangeHandler, ArrangeReader, ArrangeWriter, Arrangement};
use crate::compute::types::{Collection, CollectionBundle, ErrCollector, Toff};
use crate::error::{Error, InvalidQuerySnafu, NotImplementedSnafu};
use crate::expr::{self, GlobalId, LocalId};
use crate::plan::{Plan, TypedPlan};
use crate::repr::{self, DiffRow};
mod map;
mod reduce;
@@ -218,20 +207,17 @@ mod test {
use std::cell::RefCell;
use std::rc::Rc;
use common_time::DateTime;
use datatypes::data_type::ConcreteDataType;
use hydroflow::scheduled::graph::Hydroflow;
use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::handoff::VecHandoff;
use pretty_assertions::{assert_eq, assert_ne};
use pretty_assertions::assert_eq;
use super::*;
use crate::expr::BinaryFunc;
use crate::repr::Row;
pub fn run_and_check(
state: &mut DataflowState,
df: &mut Hydroflow,
time_range: Range<i64>,
time_range: std::ops::Range<i64>,
expected: BTreeMap<i64, Vec<DiffRow>>,
output: Rc<RefCell<Vec<DiffRow>>>,
) {

View File

@@ -24,7 +24,7 @@ use crate::compute::state::Scheduler;
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::error::{Error, PlanSnafu};
use crate::expr::{EvalError, MapFilterProject, MfpPlan, ScalarExpr};
use crate::plan::{Plan, TypedPlan};
use crate::plan::TypedPlan;
use crate::repr::{self, DiffRow, KeyValDiffRow, Row};
use crate::utils::ArrangeHandler;
@@ -206,8 +206,6 @@ fn eval_mfp_core(
#[cfg(test)]
mod test {
use std::cell::RefCell;
use std::rc::Rc;
use datatypes::data_type::ConcreteDataType;
use hydroflow::scheduled::graph::Hydroflow;
@@ -216,6 +214,7 @@ mod test {
use crate::compute::render::test::{get_output_handle, harness_test_ctx, run_and_check};
use crate::compute::state::DataflowState;
use crate::expr::{self, BinaryFunc, GlobalId};
use crate::plan::Plan;
use crate::repr::{ColumnType, RelationType};
/// Test if the temporal filter works properly

View File

@@ -18,17 +18,15 @@ use std::ops::Range;
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{ListValue, Value};
use hydroflow::scheduled::graph_ext::GraphExt;
use hydroflow::scheduled::port::{PortCtx, SEND};
use itertools::Itertools;
use snafu::{ensure, OptionExt, ResultExt};
use crate::compute::render::{Context, SubgraphArg};
use crate::compute::state::Scheduler;
use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff};
use crate::error::{Error, PlanSnafu};
use crate::expr::error::{DataAlreadyExpiredSnafu, DataTypeSnafu, InternalSnafu};
use crate::expr::{AggregateExpr, EvalError, ScalarExpr};
use crate::plan::{AccumulablePlan, AggrWithIndex, KeyValPlan, Plan, ReducePlan, TypedPlan};
use crate::expr::{EvalError, ScalarExpr};
use crate::plan::{AccumulablePlan, AggrWithIndex, KeyValPlan, ReducePlan, TypedPlan};
use crate::repr::{self, DiffRow, KeyValDiffRow, RelationType, Row};
use crate::utils::{ArrangeHandler, ArrangeReader, ArrangeWriter, KeyExpiryManager};
@@ -790,8 +788,6 @@ fn from_val_to_slice_idx(
// TODO(discord9): add tests for accum ser/de
#[cfg(test)]
mod test {
use std::cell::RefCell;
use std::rc::Rc;
use common_time::{DateTime, Interval, Timestamp};
use datatypes::data_type::{ConcreteDataType, ConcreteDataType as CDT};
@@ -800,7 +796,10 @@ mod test {
use super::*;
use crate::compute::render::test::{get_output_handle, harness_test_ctx, run_and_check};
use crate::compute::state::DataflowState;
use crate::expr::{self, AggregateFunc, BinaryFunc, GlobalId, MapFilterProject, UnaryFunc};
use crate::expr::{
self, AggregateExpr, AggregateFunc, BinaryFunc, GlobalId, MapFilterProject, UnaryFunc,
};
use crate::plan::Plan;
use crate::repr::{ColumnType, RelationType};
/// SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00')
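
The tumble window math these tests exercise floors each timestamp to a window boundary (the ceiling variant then adds one window size). A hand-rolled sketch of that arithmetic, using a hypothetical `window_floor` helper standing in for the `get_window_start` function whose body is elided from this diff:

// Hypothetical illustration only; not part of the commit.
fn window_floor(ts: i64, window_size: i64, start: i64) -> i64 {
// div_euclid keeps timestamps earlier than `start` on the correct boundary
start + (ts - start).div_euclid(window_size) * window_size
}

assert_eq!(window_floor(1_500, 1_000, 0), 1_000); // TumbleWindowFloor
assert_eq!(window_floor(1_500, 1_000, 0) + 1_000, 2_000); // TumbleWindowCeiling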

View File

@@ -16,7 +16,7 @@
use std::collections::{BTreeMap, VecDeque};
use common_telemetry::{debug, info};
use common_telemetry::debug;
use hydroflow::scheduled::graph_ext::GraphExt;
use itertools::Itertools;
use snafu::OptionExt;
@@ -27,7 +27,7 @@ use crate::compute::render::Context;
use crate::compute::types::{Arranged, Collection, CollectionBundle, Toff};
use crate::error::{Error, PlanSnafu};
use crate::expr::error::InternalSnafu;
use crate::expr::{EvalError, GlobalId};
use crate::expr::EvalError;
use crate::repr::{DiffRow, Row, BROADCAST_CAP};
#[allow(clippy::mutable_key_type)]

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use std::cell::RefCell;
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::collections::{BTreeMap, VecDeque};
use std::rc::Rc;
use hydroflow::scheduled::graph::Hydroflow;

View File

@@ -22,12 +22,11 @@ use hydroflow::scheduled::handoff::TeeingHandoff;
use hydroflow::scheduled::port::RecvPort;
use hydroflow::scheduled::SubgraphId;
use itertools::Itertools;
use tokio::sync::{Mutex, RwLock};
use tokio::sync::Mutex;
use crate::compute::render::Context;
use crate::expr::{EvalError, ScalarExpr};
use crate::repr::DiffRow;
use crate::utils::{ArrangeHandler, Arrangement};
use crate::utils::ArrangeHandler;
pub type Toff<T = DiffRow> = TeeingHandoff<T>;

View File

@@ -14,6 +14,7 @@
//! for declaring Expressions in dataflow, including map, reduce, id and join (TODO!) etc.
mod df_func;
pub(crate) mod error;
mod func;
mod id;
@@ -22,9 +23,92 @@ mod relation;
mod scalar;
mod signature;
pub(crate) use error::{EvalError, InvalidArgumentSnafu, OptimizeSnafu};
use datatypes::prelude::DataType;
use datatypes::vectors::VectorRef;
pub(crate) use df_func::{DfScalarFunction, RawDfScalarFn};
pub(crate) use error::{EvalError, InvalidArgumentSnafu};
pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
pub(crate) use id::{GlobalId, Id, LocalId};
use itertools::Itertools;
pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc};
pub(crate) use scalar::{DfScalarFunction, RawDfScalarFn, ScalarExpr, TypedExpr};
pub(crate) use scalar::{ScalarExpr, TypedExpr};
use snafu::{ensure, ResultExt};
use crate::expr::error::DataTypeSnafu;
/// A batch of vectors of the same length but without a schema; only useful in dataflow
pub struct Batch {
batch: Vec<VectorRef>,
row_count: usize,
}
impl Batch {
pub fn new(batch: Vec<VectorRef>, row_count: usize) -> Self {
Self { batch, row_count }
}
pub fn batch(&self) -> &[VectorRef] {
&self.batch
}
pub fn row_count(&self) -> usize {
self.row_count
}
/// Slices the `Batch`, returning a new `Batch`.
///
/// # Panics
/// This function panics if `offset + length > self.row_count()`.
pub fn slice(&self, offset: usize, length: usize) -> Batch {
let batch = self
.batch()
.iter()
.map(|v| v.slice(offset, length))
.collect_vec();
Batch::new(batch, length)
}
/// Append another batch to `self`
pub fn append_batch(&mut self, other: Batch) -> Result<(), EvalError> {
ensure!(
self.batch.len() == other.batch.len(),
InvalidArgumentSnafu {
reason: format!(
"Expect two batch to have same numbers of column, found {} and {} columns",
self.batch.len(),
other.batch.len()
)
}
);
let batch_builders = self
.batch
.iter()
.map(|v| {
v.data_type()
.create_mutable_vector(self.row_count() + other.row_count())
})
.collect_vec();
let mut result = vec![];
let zelf_row_count = self.row_count();
let other_row_count = other.row_count();
for (idx, mut builder) in batch_builders.into_iter().enumerate() {
builder
.extend_slice_of(self.batch()[idx].as_ref(), 0, zelf_row_count)
.context(DataTypeSnafu {
msg: "Failed to extend vector",
})?;
builder
.extend_slice_of(other.batch()[idx].as_ref(), 0, other_row_count)
.context(DataTypeSnafu {
msg: "Failed to extend vector",
})?;
result.push(builder.to_vector());
}
self.batch = result;
self.row_count = zelf_row_count + other_row_count;
Ok(())
}
}
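
A minimal usage sketch of the new `Batch` type (illustrative only; building a `VectorRef` via `Int32Vector::from(..).slice(..)` follows the pattern this commit's own tests use):

use datatypes::vectors::Int32Vector;

let col = Int32Vector::from(vec![Some(1), Some(2), None]).slice(0, 3);
let mut batch = Batch::new(vec![col], 3);
let tail = batch.slice(1, 2); // rows 1..3 as a new Batch
batch.append_batch(tail).unwrap(); // column counts match, so this succeeds
assert_eq!(batch.row_count(), 5);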

View File

@@ -0,0 +1,293 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Ports DataFusion scalar functions to our scalar functions for use in dataflow
use std::sync::Arc;
use arrow::array::RecordBatchOptions;
use bytes::BytesMut;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::debug;
use datafusion_physical_expr::PhysicalExpr;
use datatypes::data_type::DataType;
use datatypes::value::Value;
use datatypes::vectors::VectorRef;
use prost::Message;
use snafu::{IntoError, ResultExt};
use substrait::error::{DecodeRelSnafu, EncodeRelSnafu};
use substrait::substrait_proto_df::proto::expression::ScalarFunction;
use crate::error::Error;
use crate::expr::error::{
ArrowSnafu, DatafusionSnafu as EvalDatafusionSnafu, EvalError, ExternalSnafu,
InvalidArgumentSnafu,
};
use crate::expr::{Batch, ScalarExpr};
use crate::repr::RelationDesc;
use crate::transform::{from_scalar_fn_to_df_fn_impl, FunctionExtensions};
/// A way to represent a scalar function that is implemented in Datafusion
#[derive(Debug, Clone)]
pub struct DfScalarFunction {
/// The raw bytes-encoded DataFusion scalar function
pub(crate) raw_fn: RawDfScalarFn,
// TODO(discord9): directly from datafusion expr
/// The implementation of the function
pub(crate) fn_impl: Arc<dyn PhysicalExpr>,
/// The input schema of the function
pub(crate) df_schema: Arc<datafusion_common::DFSchema>,
}
impl DfScalarFunction {
pub fn new(raw_fn: RawDfScalarFn, fn_impl: Arc<dyn PhysicalExpr>) -> Result<Self, Error> {
Ok(Self {
df_schema: Arc::new(raw_fn.input_schema.to_df_schema()?),
raw_fn,
fn_impl,
})
}
pub async fn try_from_raw_fn(raw_fn: RawDfScalarFn) -> Result<Self, Error> {
Ok(Self {
fn_impl: raw_fn.get_fn_impl().await?,
df_schema: Arc::new(raw_fn.input_schema.to_df_schema()?),
raw_fn,
})
}
/// Evaluate the function over a whole batch, using `exprs` to compute its input columns
pub fn eval_batch(&self, batch: &Batch, exprs: &[ScalarExpr]) -> Result<VectorRef, EvalError> {
let row_count = batch.row_count();
let batch: Vec<_> = exprs
.iter()
.map(|expr| expr.eval_batch(batch))
.collect::<Result<_, _>>()?;
let schema = self.df_schema.inner().clone();
let arrays = batch
.iter()
.map(|array| array.to_arrow_array())
.collect::<Vec<_>>();
let rb = DfRecordBatch::try_new_with_options(
schema,
arrays,
&RecordBatchOptions::new().with_row_count(Some(row_count)),
)
.map_err(|err| {
ArrowSnafu {
context:
"Failed to create RecordBatch from values when eval_batch datafusion scalar function",
}
.into_error(err)
})?;
let len = rb.num_rows();
let res = self.fn_impl.evaluate(&rb).map_err(|err| {
EvalDatafusionSnafu {
raw: err,
context: "Failed to evaluate datafusion scalar function",
}
.build()
})?;
let res = common_query::columnar_value::ColumnarValue::try_from(&res)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let res_vec = res
.try_into_vector(len)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
Ok(res_vec)
}
/// eval a list of expressions using input values
fn eval_args(values: &[Value], exprs: &[ScalarExpr]) -> Result<Vec<Value>, EvalError> {
exprs
.iter()
.map(|expr| expr.eval(values))
.collect::<Result<_, _>>()
}
// TODO(discord9): add RecordBatch support
pub fn eval(&self, values: &[Value], exprs: &[ScalarExpr]) -> Result<Value, EvalError> {
// first eval exprs to construct values to feed to datafusion
let values: Vec<_> = Self::eval_args(values, exprs)?;
if values.is_empty() {
return InvalidArgumentSnafu {
reason: "values is empty".to_string(),
}
.fail();
}
// TODO(discord9): make cols all arrays of length one
let mut cols = vec![];
for (idx, typ) in self
.raw_fn
.input_schema
.typ()
.column_types
.iter()
.enumerate()
{
let typ = typ.scalar_type();
let mut array = typ.create_mutable_vector(1);
array.push_value_ref(values[idx].as_value_ref());
cols.push(array.to_vector().to_arrow_array());
}
let schema = self.df_schema.inner().clone();
let rb = DfRecordBatch::try_new_with_options(
schema,
cols,
&RecordBatchOptions::new().with_row_count(Some(1)),
)
.map_err(|err| {
ArrowSnafu {
context:
"Failed to create RecordBatch from values when eval datafusion scalar function",
}
.into_error(err)
})?;
let res = self.fn_impl.evaluate(&rb).map_err(|err| {
EvalDatafusionSnafu {
raw: err,
context: "Failed to evaluate datafusion scalar function",
}
.build()
})?;
let res = common_query::columnar_value::ColumnarValue::try_from(&res)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let res_vec = res
.try_into_vector(1)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let res_val = res_vec
.try_get(0)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
Ok(res_val)
}
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RawDfScalarFn {
/// The raw bytes-encoded DataFusion scalar function
pub(crate) f: bytes::BytesMut,
/// The input schema of the function
pub(crate) input_schema: RelationDesc,
/// Extension contains mapping from function reference to function name
pub(crate) extensions: FunctionExtensions,
}
impl RawDfScalarFn {
pub fn from_proto(
f: &substrait::substrait_proto_df::proto::expression::ScalarFunction,
input_schema: RelationDesc,
extensions: FunctionExtensions,
) -> Result<Self, Error> {
let mut buf = BytesMut::new();
f.encode(&mut buf)
.context(EncodeRelSnafu)
.map_err(BoxedError::new)
.context(crate::error::ExternalSnafu)?;
Ok(Self {
f: buf,
input_schema,
extensions,
})
}
async fn get_fn_impl(&self) -> Result<Arc<dyn PhysicalExpr>, Error> {
let f = ScalarFunction::decode(&mut self.f.as_ref())
.context(DecodeRelSnafu)
.map_err(BoxedError::new)
.context(crate::error::ExternalSnafu)?;
debug!("Decoded scalar function: {:?}", f);
let input_schema = &self.input_schema;
let extensions = &self.extensions;
from_scalar_fn_to_df_fn_impl(&f, input_schema, extensions).await
}
}
impl std::cmp::PartialEq for DfScalarFunction {
fn eq(&self, other: &Self) -> bool {
self.raw_fn.eq(&other.raw_fn)
}
}
// can't derive Eq because Arc<dyn PhysicalExpr> is not Eq, so implement it manually
impl std::cmp::Eq for DfScalarFunction {}
impl std::cmp::PartialOrd for DfScalarFunction {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl std::cmp::Ord for DfScalarFunction {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.raw_fn.cmp(&other.raw_fn)
}
}
impl std::hash::Hash for DfScalarFunction {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.raw_fn.hash(state);
}
}
#[cfg(test)]
mod test {
use datatypes::prelude::ConcreteDataType;
use substrait::substrait_proto_df::proto::expression::literal::LiteralType;
use substrait::substrait_proto_df::proto::expression::{Literal, RexType};
use substrait::substrait_proto_df::proto::function_argument::ArgType;
use substrait::substrait_proto_df::proto::{Expression, FunctionArgument};
use super::*;
use crate::repr::{ColumnType, RelationType};
#[tokio::test]
async fn test_df_scalar_function() {
let raw_scalar_func = ScalarFunction {
function_reference: 0,
arguments: vec![FunctionArgument {
arg_type: Some(ArgType::Value(Expression {
rex_type: Some(RexType::Literal(Literal {
nullable: false,
type_variation_reference: 0,
literal_type: Some(LiteralType::I64(-1)),
})),
})),
}],
output_type: None,
..Default::default()
};
let input_schema = RelationDesc::try_new(
RelationType::new(vec![ColumnType::new_nullable(
ConcreteDataType::null_datatype(),
)]),
vec!["null_column".to_string()],
)
.unwrap();
let extensions = FunctionExtensions::from_iter(vec![(0, "abs")]);
let raw_fn = RawDfScalarFn::from_proto(&raw_scalar_func, input_schema, extensions).unwrap();
let df_func = DfScalarFunction::try_from_raw_fn(raw_fn).await.unwrap();
assert_eq!(
df_func
.eval(&[Value::Null], &[ScalarExpr::Column(0)])
.unwrap(),
Value::Int64(1)
);
}
}
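
One detail worth noting in `eval_batch` above: it uses `try_new_with_options` with an explicit row count because an arrow `RecordBatch` with zero columns can only carry its row count through `RecordBatchOptions`. A standalone arrow-rs sketch (not commit code):

use std::sync::Arc;
use arrow::array::RecordBatchOptions;
use arrow::datatypes::Schema;
use arrow::record_batch::RecordBatch;

// a zero-column batch still reports three rows via the options
let schema = Arc::new(Schema::empty());
let opts = RecordBatchOptions::new().with_row_count(Some(3));
let rb = RecordBatch::try_new_with_options(schema, vec![], &opts).unwrap();
assert_eq!(rb.num_rows(), 3);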

View File

@@ -14,17 +14,12 @@
//! Error handling for expression evaluation.
use std::any::Any;
use arrow_schema::ArrowError;
use common_error::ext::BoxedError;
use common_macro::stack_trace_debug;
use common_telemetry::common_error::ext::ErrorExt;
use common_telemetry::common_error::status_code::StatusCode;
use datafusion_common::DataFusionError;
use datatypes::data_type::ConcreteDataType;
use serde::{Deserialize, Serialize};
use snafu::{Location, ResultExt, Snafu};
use snafu::{Location, Snafu};
fn is_send_sync() {
fn check<T: Send + Sync>() {}
@@ -113,6 +108,7 @@ pub enum EvalError {
#[snafu(display("Arrow error: {raw:?}, context: {context}"))]
Arrow {
#[snafu(source)]
raw: ArrowError,
context: String,
#[snafu(implicit)]

View File

@@ -15,17 +15,20 @@
//! This module contains the definition of functions that can be used in expressions.
use std::collections::HashMap;
use std::sync::OnceLock;
use std::sync::{Arc, OnceLock};
use arrow::array::{ArrayRef, BooleanArray};
use common_error::ext::BoxedError;
use common_telemetry::debug;
use common_time::timestamp::TimeUnit;
use common_time::{DateTime, Timestamp};
use datafusion_expr::Operator;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::DataType;
use datatypes::types::cast;
use datatypes::types::cast::CastOption;
use datatypes::value::Value;
use datatypes::vectors::{
BooleanVector, DateTimeVector, Helper, TimestampMillisecondVector, VectorRef,
};
use serde::{Deserialize, Serialize};
use smallvec::smallvec;
use snafu::{ensure, OptionExt, ResultExt};
@@ -34,12 +37,12 @@ use substrait::df_logical_plan::consumer::name_to_op;
use crate::error::{Error, ExternalSnafu, InvalidQuerySnafu, PlanSnafu};
use crate::expr::error::{
CastValueSnafu, DivisionByZeroSnafu, EvalError, InternalSnafu, OverflowSnafu,
ArrowSnafu, CastValueSnafu, DataTypeSnafu, DivisionByZeroSnafu, EvalError, OverflowSnafu,
TryFromValueSnafu, TypeMismatchSnafu,
};
use crate::expr::signature::{GenericFn, Signature};
use crate::expr::{InvalidArgumentSnafu, ScalarExpr, TypedExpr};
use crate::repr::{self, value_to_internal_ts, Row};
use crate::expr::{Batch, InvalidArgumentSnafu, ScalarExpr, TypedExpr};
use crate::repr::{self, value_to_internal_ts};
/// UnmaterializableFunc is a function that can't be eval independently,
/// and require special handling
@@ -221,6 +224,129 @@ impl UnaryFunc {
}
}
pub fn eval_batch(&self, batch: &Batch, expr: &ScalarExpr) -> Result<VectorRef, EvalError> {
let arg_col = expr.eval_batch(batch)?;
match self {
Self::Not => {
let arrow_array = arg_col.to_arrow_array();
let bool_array = arrow_array
.as_any()
.downcast_ref::<BooleanArray>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: arg_col.data_type(),
}
})?;
let ret = arrow::compute::not(bool_array).context(ArrowSnafu { context: "not" })?;
let ret = BooleanVector::from(ret);
Ok(Arc::new(ret))
}
Self::IsNull => {
let arrow_array = arg_col.to_arrow_array();
let ret = arrow::compute::is_null(&arrow_array)
.context(ArrowSnafu { context: "is_null" })?;
let ret = BooleanVector::from(ret);
Ok(Arc::new(ret))
}
Self::IsTrue | Self::IsFalse => {
let arrow_array = arg_col.to_arrow_array();
let bool_array = arrow_array
.as_any()
.downcast_ref::<BooleanArray>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: arg_col.data_type(),
}
})?;
if matches!(self, Self::IsTrue) {
Ok(Arc::new(BooleanVector::from(bool_array.clone())))
} else {
let ret =
arrow::compute::not(bool_array).context(ArrowSnafu { context: "not" })?;
Ok(Arc::new(BooleanVector::from(ret)))
}
}
Self::StepTimestamp => {
let datetime_array = get_datetime_array(&arg_col)?;
let date_array_ref = datetime_array
.as_any()
.downcast_ref::<arrow::array::Date64Array>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::datetime_datatype(),
actual: ConcreteDataType::from_arrow_type(datetime_array.data_type()),
}
})?;
let ret = arrow::compute::unary(date_array_ref, |arr| arr + 1);
let ret = DateTimeVector::from(ret);
Ok(Arc::new(ret))
}
Self::Cast(to) => {
let arrow_array = arg_col.to_arrow_array();
let ret = arrow::compute::cast(&arrow_array, &to.as_arrow_type())
.context(ArrowSnafu { context: "cast" })?;
let vector = Helper::try_into_vector(ret).context(DataTypeSnafu {
msg: "Fail to convert to Vector",
})?;
Ok(vector)
}
Self::TumbleWindowFloor {
window_size,
start_time,
} => {
let datetime_array = get_datetime_array(&arg_col)?;
let date_array_ref = datetime_array
.as_any()
.downcast_ref::<arrow::array::Date64Array>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::datetime_datatype(),
actual: ConcreteDataType::from_arrow_type(datetime_array.data_type()),
}
})?;
let start_time = start_time.map(|t| t.val());
let window_size = (window_size.to_nanosecond() / 1_000_000) as repr::Duration; // nanosecond to millisecond
let ret = arrow::compute::unary(date_array_ref, |ts| {
get_window_start(ts, window_size, start_time)
});
let ret = TimestampMillisecondVector::from(ret);
Ok(Arc::new(ret))
}
Self::TumbleWindowCeiling {
window_size,
start_time,
} => {
let datetime_array = get_datetime_array(&arg_col)?;
let date_array_ref = datetime_array
.as_any()
.downcast_ref::<arrow::array::Date64Array>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::datetime_datatype(),
actual: ConcreteDataType::from_arrow_type(datetime_array.data_type()),
}
})?;
let start_time = start_time.map(|t| t.val());
let window_size = (window_size.to_nanosecond() / 1_000_000) as repr::Duration; // nanosecond to millisecond
let ret = arrow::compute::unary(date_array_ref, |ts| {
get_window_start(ts, window_size, start_time) + window_size
});
let ret = TimestampMillisecondVector::from(ret);
Ok(Arc::new(ret))
}
}
}
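// The arms above map one-to-one onto arrow-rs compute kernels; a standalone
// sketch of the boolean ones (illustrative only, not commit code):
//
// use arrow::array::BooleanArray;
// let b = BooleanArray::from(vec![Some(true), None, Some(false)]);
// let negated = arrow::compute::not(&b)?; // [false, null, true]
// let nulls = arrow::compute::is_null(&b)?; // [false, true, false]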
/// Evaluate the function with given values and expression
///
/// # Arguments
@@ -314,6 +440,23 @@ impl UnaryFunc {
}
}
fn get_datetime_array(vector: &VectorRef) -> Result<arrow::array::ArrayRef, EvalError> {
let arrow_array = vector.to_arrow_array();
let datetime_array =
if *arrow_array.data_type() == ConcreteDataType::datetime_datatype().as_arrow_type() {
arrow_array
} else {
arrow::compute::cast(
&arrow_array,
&ConcreteDataType::datetime_datatype().as_arrow_type(),
)
.context(ArrowSnafu {
context: "Trying to cast to datetime in StepTimestamp",
})?
};
Ok(datetime_array)
}
fn get_window_start(
ts: repr::Timestamp,
window_size: repr::Duration,
@@ -692,6 +835,98 @@ impl BinaryFunc {
Ok((spec_fn, signature))
}
pub fn eval_batch(
&self,
batch: &Batch,
expr1: &ScalarExpr,
expr2: &ScalarExpr,
) -> Result<VectorRef, EvalError> {
let left = expr1.eval_batch(batch)?;
let left = left.to_arrow_array();
let right = expr2.eval_batch(batch)?;
let right = right.to_arrow_array();
let arrow_array: ArrayRef = match self {
Self::Eq => Arc::new(
arrow::compute::kernels::cmp::eq(&left, &right)
.context(ArrowSnafu { context: "eq" })?,
),
Self::NotEq => Arc::new(
arrow::compute::kernels::cmp::neq(&left, &right)
.context(ArrowSnafu { context: "neq" })?,
),
Self::Lt => Arc::new(
arrow::compute::kernels::cmp::lt(&left, &right)
.context(ArrowSnafu { context: "lt" })?,
),
Self::Lte => Arc::new(
arrow::compute::kernels::cmp::lt_eq(&left, &right)
.context(ArrowSnafu { context: "lte" })?,
),
Self::Gt => Arc::new(
arrow::compute::kernels::cmp::gt(&left, &right)
.context(ArrowSnafu { context: "gt" })?,
),
Self::Gte => Arc::new(
arrow::compute::kernels::cmp::gt_eq(&left, &right)
.context(ArrowSnafu { context: "gte" })?,
),
Self::AddInt16
| Self::AddInt32
| Self::AddInt64
| Self::AddUInt16
| Self::AddUInt32
| Self::AddUInt64
| Self::AddFloat32
| Self::AddFloat64 => arrow::compute::kernels::numeric::add(&left, &right)
.context(ArrowSnafu { context: "add" })?,
Self::SubInt16
| Self::SubInt32
| Self::SubInt64
| Self::SubUInt16
| Self::SubUInt32
| Self::SubUInt64
| Self::SubFloat32
| Self::SubFloat64 => arrow::compute::kernels::numeric::sub(&left, &right)
.context(ArrowSnafu { context: "sub" })?,
Self::MulInt16
| Self::MulInt32
| Self::MulInt64
| Self::MulUInt16
| Self::MulUInt32
| Self::MulUInt64
| Self::MulFloat32
| Self::MulFloat64 => arrow::compute::kernels::numeric::mul(&left, &right)
.context(ArrowSnafu { context: "mul" })?,
Self::DivInt16
| Self::DivInt32
| Self::DivInt64
| Self::DivUInt16
| Self::DivUInt32
| Self::DivUInt64
| Self::DivFloat32
| Self::DivFloat64 => arrow::compute::kernels::numeric::mul(&left, &right)
.context(ArrowSnafu { context: "div" })?,
Self::ModInt16
| Self::ModInt32
| Self::ModInt64
| Self::ModUInt16
| Self::ModUInt32
| Self::ModUInt64 => arrow::compute::kernels::numeric::rem(&left, &right)
.context(ArrowSnafu { context: "rem" })?,
};
let vector = Helper::try_into_vector(arrow_array).context(DataTypeSnafu {
msg: "Fail to convert to Vector",
})?;
Ok(vector)
}
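// Each comparison/arithmetic arm above delegates to an arrow-rs kernel that
// operates on whole columns at once; a standalone sketch (illustrative only):
//
// use arrow::array::Int32Array;
// let l = Int32Array::from(vec![1, 2, 3]);
// let r = Int32Array::from(vec![3, 2, 1]);
// let eq = arrow::compute::kernels::cmp::eq(&l, &r)?; // [false, true, false]
// let sum = arrow::compute::kernels::numeric::add(&l, &r)?; // [4, 4, 4]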
/// Evaluate the function with given values and expression
///
/// # Arguments
@@ -824,6 +1059,51 @@ impl VariadicFunc {
}
}
pub fn eval_batch(&self, batch: &Batch, exprs: &[ScalarExpr]) -> Result<VectorRef, EvalError> {
ensure!(
!exprs.is_empty(),
InvalidArgumentSnafu {
reason: format!("Variadic function {:?} requires at least 1 arguments", self)
}
);
let args = exprs
.iter()
.map(|expr| expr.eval_batch(batch).map(|v| v.to_arrow_array()))
.collect::<Result<Vec<_>, _>>()?;
let mut iter = args.into_iter();
let first = iter.next().unwrap();
let mut left = first
.as_any()
.downcast_ref::<BooleanArray>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: ConcreteDataType::from_arrow_type(first.data_type()),
}
})?
.clone();
for right in iter {
let right = right.as_any().downcast_ref::<BooleanArray>().context({
TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: ConcreteDataType::from_arrow_type(right.data_type()),
}
})?;
left = match self {
Self::And => {
arrow::compute::and(&left, right).context(ArrowSnafu { context: "and" })?
}
Self::Or => {
arrow::compute::or(&left, right).context(ArrowSnafu { context: "or" })?
}
}
}
Ok(Arc::new(BooleanVector::from(left)))
}
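// And/Or fold left-to-right over boolean columns using arrow's kernels;
// a standalone sketch (illustrative only):
//
// use arrow::array::BooleanArray;
// let a = BooleanArray::from(vec![true, true, false]);
// let b = BooleanArray::from(vec![true, false, false]);
// let folded = arrow::compute::and(&a, &b)?; // [true, false, false]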
/// Evaluate the function with given values and expressions
pub fn eval(&self, values: &[Value], exprs: &[ScalarExpr]) -> Result<Value, EvalError> {
match self {

View File

@@ -14,17 +14,15 @@
//! define MapFilterProject which is a compound operator that can be applied row-by-row.
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::collections::{BTreeMap, BTreeSet};
use common_telemetry::debug;
use datatypes::value::Value;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt};
use snafu::ensure;
use crate::error::{Error, InvalidQuerySnafu};
use crate::expr::error::{EvalError, InternalSnafu};
use crate::expr::{Id, InvalidArgumentSnafu, LocalId, ScalarExpr};
use crate::expr::{InvalidArgumentSnafu, ScalarExpr};
use crate::repr::{self, value_to_internal_ts, Diff, Row};
/// A compound operator that can be applied row-by-row.
@@ -738,7 +736,6 @@ impl MfpPlan {
#[cfg(test)]
mod test {
use datatypes::data_type::ConcreteDataType;
use itertools::Itertools;
use super::*;
use crate::expr::{BinaryFunc, UnaryFunc, UnmaterializableFunc};

View File

@@ -15,7 +15,6 @@
//! Describes an aggregation function and its input expression.
pub(crate) use func::AggregateFunc;
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;

View File

@@ -24,11 +24,9 @@ use std::any::type_name;
use std::fmt::Display;
use common_decimal::Decimal128;
use common_time::{Date, DateTime};
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, OrderedFloat, Value};
use enum_dispatch::enum_dispatch;
use hydroflow::futures::stream::Concat;
use serde::{Deserialize, Serialize};
use snafu::ensure;
@@ -761,7 +759,10 @@ fn ty_eq_without_precision(left: ConcreteDataType, right: ConcreteDataType) -> b
#[allow(clippy::too_many_lines)]
#[cfg(test)]
mod test {
use common_time::DateTime;
use super::*;
#[test]
fn test_accum() {
let testcases = vec![

View File

@@ -16,16 +16,15 @@ use std::collections::HashMap;
use std::str::FromStr;
use std::sync::OnceLock;
use common_time::{Date, DateTime};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use smallvec::smallvec;
use snafu::{IntoError, OptionExt, ResultExt};
use snafu::{IntoError, OptionExt};
use strum::{EnumIter, IntoEnumIterator};
use crate::error::{DatafusionSnafu, Error, InvalidQuerySnafu};
use crate::expr::error::{EvalError, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::error::EvalError;
use crate::expr::relation::accum::{Accum, Accumulator};
use crate::expr::signature::{GenericFn, Signature};
use crate::repr::Diff;

View File

@@ -15,34 +15,22 @@
//! Scalar expressions.
use std::collections::{BTreeMap, BTreeSet};
use std::sync::{Arc, Mutex};
use bytes::BytesMut;
use common_error::ext::BoxedError;
use common_recordbatch::DfRecordBatch;
use common_telemetry::debug;
use datafusion_physical_expr::PhysicalExpr;
use datatypes::data_type::DataType;
use datatypes::prelude::ConcreteDataType;
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::value::Value;
use datatypes::{arrow_array, value};
use prost::Message;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use substrait::error::{DecodeRelSnafu, EncodeRelSnafu};
use substrait::substrait_proto_df::proto::expression::{RexType, ScalarFunction};
use substrait::substrait_proto_df::proto::Expression;
use datatypes::vectors::{BooleanVector, Helper, NullVector, Vector, VectorRef};
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{
DatafusionSnafu, Error, InvalidQuerySnafu, UnexpectedSnafu, UnsupportedTemporalFilterSnafu,
};
use crate::expr::error::{
ArrowSnafu, DatafusionSnafu as EvalDatafusionSnafu, EvalError, ExternalSnafu,
InvalidArgumentSnafu, OptimizeSnafu,
DataTypeSnafu, EvalError, InternalSnafu, InvalidArgumentSnafu, OptimizeSnafu, TypeMismatchSnafu,
};
use crate::expr::func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
use crate::repr::{ColumnType, RelationDesc, RelationType};
use crate::transform::{from_scalar_fn_to_df_fn_impl, FunctionExtensions};
use crate::expr::{Batch, DfScalarFunction};
use crate::repr::{ColumnType, RelationType};
/// A scalar expression with a known type.
#[derive(Ord, PartialOrd, Clone, Debug, Eq, PartialEq, Hash)]
pub struct TypedExpr {
@@ -174,163 +162,6 @@ pub enum ScalarExpr {
},
}
/// A way to represent a scalar function that is implemented in Datafusion
#[derive(Debug, Clone)]
pub struct DfScalarFunction {
raw_fn: RawDfScalarFn,
// TODO(discord9): directly from datafusion expr
fn_impl: Arc<dyn PhysicalExpr>,
df_schema: Arc<datafusion_common::DFSchema>,
}
impl DfScalarFunction {
pub fn new(raw_fn: RawDfScalarFn, fn_impl: Arc<dyn PhysicalExpr>) -> Result<Self, Error> {
Ok(Self {
df_schema: Arc::new(raw_fn.input_schema.to_df_schema()?),
raw_fn,
fn_impl,
})
}
pub async fn try_from_raw_fn(raw_fn: RawDfScalarFn) -> Result<Self, Error> {
Ok(Self {
fn_impl: raw_fn.get_fn_impl().await?,
df_schema: Arc::new(raw_fn.input_schema.to_df_schema()?),
raw_fn,
})
}
/// eval a list of expressions using input values
fn eval_args(values: &[Value], exprs: &[ScalarExpr]) -> Result<Vec<Value>, EvalError> {
exprs
.iter()
.map(|expr| expr.eval(values))
.collect::<Result<_, _>>()
}
// TODO(discord9): add RecordBatch support
pub fn eval(&self, values: &[Value], exprs: &[ScalarExpr]) -> Result<Value, EvalError> {
// first eval exprs to construct values to feed to datafusion
let values: Vec<_> = Self::eval_args(values, exprs)?;
if values.is_empty() {
return InvalidArgumentSnafu {
reason: "values is empty".to_string(),
}
.fail();
}
// TODO(discord9): make cols all array length of one
let mut cols = vec![];
for (idx, typ) in self
.raw_fn
.input_schema
.typ()
.column_types
.iter()
.enumerate()
{
let typ = typ.scalar_type();
let mut array = typ.create_mutable_vector(1);
array.push_value_ref(values[idx].as_value_ref());
cols.push(array.to_vector().to_arrow_array());
}
let schema = self.df_schema.inner().clone();
let rb = DfRecordBatch::try_new(schema, cols).map_err(|err| {
ArrowSnafu {
raw: err,
context:
"Failed to create RecordBatch from values when eval datafusion scalar function",
}
.build()
})?;
let res = self.fn_impl.evaluate(&rb).map_err(|err| {
EvalDatafusionSnafu {
raw: err,
context: "Failed to evaluate datafusion scalar function",
}
.build()
})?;
let res = common_query::columnar_value::ColumnarValue::try_from(&res)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let res_vec = res
.try_into_vector(1)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let res_val = res_vec
.try_get(0)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
Ok(res_val)
}
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RawDfScalarFn {
/// The raw bytes encoded datafusion scalar function
pub(crate) f: bytes::BytesMut,
/// The input schema of the function
pub(crate) input_schema: RelationDesc,
/// Extension contains mapping from function reference to function name
pub(crate) extensions: FunctionExtensions,
}
impl RawDfScalarFn {
pub fn from_proto(
f: &substrait::substrait_proto_df::proto::expression::ScalarFunction,
input_schema: RelationDesc,
extensions: FunctionExtensions,
) -> Result<Self, Error> {
let mut buf = BytesMut::new();
f.encode(&mut buf)
.context(EncodeRelSnafu)
.map_err(BoxedError::new)
.context(crate::error::ExternalSnafu)?;
Ok(Self {
f: buf,
input_schema,
extensions,
})
}
async fn get_fn_impl(&self) -> Result<Arc<dyn PhysicalExpr>, Error> {
let f = ScalarFunction::decode(&mut self.f.as_ref())
.context(DecodeRelSnafu)
.map_err(BoxedError::new)
.context(crate::error::ExternalSnafu)?;
debug!("Decoded scalar function: {:?}", f);
let input_schema = &self.input_schema;
let extensions = &self.extensions;
from_scalar_fn_to_df_fn_impl(&f, input_schema, extensions).await
}
}
impl std::cmp::PartialEq for DfScalarFunction {
fn eq(&self, other: &Self) -> bool {
self.raw_fn.eq(&other.raw_fn)
}
}
// can't derive Eq because of Arc<dyn PhysicalExpr> not eq, so implement it manually
impl std::cmp::Eq for DfScalarFunction {}
impl std::cmp::PartialOrd for DfScalarFunction {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl std::cmp::Ord for DfScalarFunction {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.raw_fn.cmp(&other.raw_fn)
}
}
impl std::hash::Hash for DfScalarFunction {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.raw_fn.hash(state);
}
}
impl ScalarExpr {
pub fn with_type(self, typ: ColumnType) -> TypedExpr {
TypedExpr::new(self, typ)
@@ -428,6 +259,177 @@ impl ScalarExpr {
}
}
pub fn eval_batch(&self, batch: &Batch) -> Result<VectorRef, EvalError> {
match self {
ScalarExpr::Column(i) => Ok(batch.batch()[*i].clone()),
ScalarExpr::Literal(val, dt) => Ok(Helper::try_from_scalar_value(
val.try_to_scalar_value(dt).context(DataTypeSnafu {
msg: "Failed to convert literal to scalar value",
})?,
batch.row_count(),
)
.context(DataTypeSnafu {
msg: "Failed to convert scalar value to vector ref when parsing literal",
})?),
ScalarExpr::CallUnmaterializable(_) => OptimizeSnafu {
reason: "Can't eval unmaterializable function",
}
.fail()?,
ScalarExpr::CallUnary { func, expr } => func.eval_batch(batch, expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => func.eval_batch(batch, expr1, expr2),
ScalarExpr::CallVariadic { func, exprs } => func.eval_batch(batch, exprs),
ScalarExpr::CallDf {
df_scalar_fn,
exprs,
} => df_scalar_fn.eval_batch(batch, exprs),
ScalarExpr::If { cond, then, els } => Self::eval_if_then(batch, cond, then, els),
}
}
fn eval_if_then(
batch: &Batch,
cond: &ScalarExpr,
then: &ScalarExpr,
els: &ScalarExpr,
) -> Result<VectorRef, EvalError> {
let conds = cond.eval_batch(batch)?;
let bool_conds = conds
.as_any()
.downcast_ref::<BooleanVector>()
.context({
TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: conds.data_type(),
}
})?
.as_boolean_array();
let mut then_input_batch = None;
let mut else_input_batch = None;
let mut null_input_batch = None;
// instructions for how to reassemble the result vector:
// iterate over (cond value, offset, length) and append to the resulting vec
let mut assembly_idx = vec![];
// append a batch, returning the appended batch's slice as (offset, length)
fn append_batch(
batch: &mut Option<Batch>,
to_be_append: Batch,
) -> Result<(usize, usize), EvalError> {
let len = to_be_append.row_count();
if let Some(batch) = batch {
let offset = batch.row_count();
batch.append_batch(to_be_append)?;
Ok((offset, len))
} else {
*batch = Some(to_be_append);
Ok((0, len))
}
}
let mut prev_cond: Option<Option<bool>> = None;
let mut prev_start_idx: Option<usize> = None;
// first, route rows with different cond values into different batches
for (idx, cond) in bool_conds.iter().enumerate() {
// if this row belongs to the same run as the previous one, continue
if prev_cond == Some(cond) {
continue;
} else if let Some(prev_cond_idx) = prev_start_idx {
let prev_cond = prev_cond.unwrap();
// append the finished run to its corresponding batch
let slice_offset = prev_cond_idx;
let slice_length = idx - prev_cond_idx;
let to_be_append = batch.slice(slice_offset, slice_length);
let to_put_back = match prev_cond {
Some(true) => (
Some(true),
append_batch(&mut then_input_batch, to_be_append)?,
),
Some(false) => (
Some(false),
append_batch(&mut else_input_batch, to_be_append)?,
),
None => (None, append_batch(&mut null_input_batch, to_be_append)?),
};
assembly_idx.push(to_put_back);
}
prev_cond = Some(cond);
prev_start_idx = Some(idx);
}
// handle the last run (and the empty-input case)
if let Some(slice_offset) = prev_start_idx {
let prev_cond = prev_cond.unwrap();
let slice_length = bool_conds.len() - slice_offset;
let to_be_append = batch.slice(slice_offset, slice_length);
let to_put_back = match prev_cond {
Some(true) => (
Some(true),
append_batch(&mut then_input_batch, to_be_append)?,
),
Some(false) => (
Some(false),
append_batch(&mut else_input_batch, to_be_append)?,
),
None => (None, append_batch(&mut null_input_batch, to_be_append)?),
};
assembly_idx.push(to_put_back);
}
let then_output_vec = then_input_batch
.map(|batch| then.eval_batch(&batch))
.transpose()?;
let else_output_vec = else_input_batch
.map(|batch| els.eval_batch(&batch))
.transpose()?;
let null_output_vec = null_input_batch
.map(|null| NullVector::new(null.row_count()).slice(0, null.row_count()));
let dt = then_output_vec
.as_ref()
.map(|v| v.data_type())
.or(else_output_vec.as_ref().map(|v| v.data_type()))
.unwrap_or(ConcreteDataType::null_datatype());
let mut builder = dt.create_mutable_vector(conds.len());
for (cond, (offset, length)) in assembly_idx {
let slice = match cond {
Some(true) => then_output_vec.as_ref(),
Some(false) => else_output_vec.as_ref(),
None => null_output_vec.as_ref(),
}
.context(InternalSnafu {
reason: "Expect corresponding output vector to exist",
})?;
// TODO(discord9): it seems `extend_slice_of` doesn't support NullVector or ConstantVector;
// consider adding support for them?
if slice.data_type().is_null() {
builder.push_nulls(length);
} else if slice.is_const() {
let arr = slice.slice(offset, length).to_arrow_array();
let vector = Helper::try_into_vector(arr).context(DataTypeSnafu {
msg: "Failed to convert arrow array to vector",
})?;
builder
.extend_slice_of(vector.as_ref(), 0, vector.len())
.context(DataTypeSnafu {
msg: "Failed to build result vector for if-then expression",
})?;
} else {
builder
.extend_slice_of(slice.as_ref(), offset, length)
.context(DataTypeSnafu {
msg: "Failed to build result vector for if-then expression",
})?;
}
}
let result_vec = builder.to_vector();
Ok(result_vec)
}
/// Eval this expression with the given values.
pub fn eval(&self, values: &[Value]) -> Result<Value, EvalError> {
match self {
@@ -747,18 +749,11 @@ impl ScalarExpr {
#[cfg(test)]
mod test {
use datatypes::arrow::array::Scalar;
use query::parser::QueryLanguageParser;
use query::QueryEngine;
use session::context::QueryContext;
use substrait::extension_serializer;
use substrait::substrait_proto_df::proto::expression::literal::LiteralType;
use substrait::substrait_proto_df::proto::expression::Literal;
use substrait::substrait_proto_df::proto::function_argument::ArgType;
use substrait::substrait_proto_df::proto::r#type::Kind;
use substrait::substrait_proto_df::proto::{r#type, FunctionArgument, Type};
use datatypes::vectors::Int32Vector;
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_extract_bound() {
let test_list: [(ScalarExpr, Result<_, EvalError>); 5] = [
@@ -849,37 +844,68 @@ mod test {
assert!(matches!(res, Err(Error::InvalidQuery { .. })));
}
#[tokio::test]
async fn test_df_scalar_function() {
let raw_scalar_func = ScalarFunction {
function_reference: 0,
arguments: vec![FunctionArgument {
arg_type: Some(ArgType::Value(Expression {
rex_type: Some(RexType::Literal(Literal {
nullable: false,
type_variation_reference: 0,
literal_type: Some(LiteralType::I64(-1)),
})),
})),
}],
output_type: None,
..Default::default()
};
let input_schema = RelationDesc::try_new(
RelationType::new(vec![ColumnType::new_nullable(
ConcreteDataType::null_datatype(),
)]),
vec!["null_column".to_string()],
)
.unwrap();
let extensions = FunctionExtensions::from_iter(vec![(0, "abs")]);
let raw_fn = RawDfScalarFn::from_proto(&raw_scalar_func, input_schema, extensions).unwrap();
let df_func = DfScalarFunction::try_from_raw_fn(raw_fn).await.unwrap();
assert_eq!(
df_func
.eval(&[Value::Null], &[ScalarExpr::Column(0)])
.unwrap(),
Value::Int64(1)
);
#[test]
fn test_eval_batch() {
// TODO(discord9): add more tests
{
let expr = ScalarExpr::If {
cond: Box::new(ScalarExpr::Column(0).call_binary(
ScalarExpr::literal(Value::from(0), ConcreteDataType::int32_datatype()),
BinaryFunc::Eq,
)),
then: Box::new(ScalarExpr::literal(
Value::from(42),
ConcreteDataType::int32_datatype(),
)),
els: Box::new(ScalarExpr::literal(
Value::from(37),
ConcreteDataType::int32_datatype(),
)),
};
let raw = vec![
None,
Some(0),
Some(1),
None,
None,
Some(0),
Some(0),
Some(1),
Some(1),
];
let raw_len = raw.len();
let vectors = vec![Int32Vector::from(raw).slice(0, raw_len)];
let batch = Batch::new(vectors, raw_len);
let expected = Int32Vector::from(vec![
None,
Some(42),
Some(37),
None,
None,
Some(42),
Some(42),
Some(37),
Some(37),
])
.slice(0, raw_len);
assert_eq!(expr.eval_batch(&batch).unwrap(), expected);
let raw = vec![Some(0)];
let raw_len = raw.len();
let vectors = vec![Int32Vector::from(raw).slice(0, raw_len)];
let batch = Batch::new(vectors, raw_len);
let expected = Int32Vector::from(vec![Some(42)]).slice(0, raw_len);
assert_eq!(expr.eval_batch(&batch).unwrap(), expected);
let raw: Vec<Option<i32>> = vec![];
let raw_len = raw.len();
let vectors = vec![Int32Vector::from(raw).slice(0, raw_len)];
let batch = Batch::new(vectors, raw_len);
let expected = NullVector::new(raw_len).slice(0, raw_len);
assert_eq!(expr.eval_batch(&batch).unwrap(), expected);
}
}
}
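
To make the run-splitting in `eval_if_then` concrete, here is how the first test case above decomposes (a hand-worked trace, not part of the commit):

// raw column: [None, 0, 1, None, None, 0, 0, 1, 1]
// conds (== 0): [None, true, false, None, None, true, true, false, false]
// runs: (None, 0..1) (true, 1..2) (false, 2..3) (None, 3..5) (true, 5..7) (false, 7..9)
// then branch runs once over rows {1, 5, 6} -> all 42
// else branch runs once over rows {2, 7, 8} -> all 37
// null rows {0, 3, 4} come from a NullVector
// assembly_idx then stitches the three outputs back in original row order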

View File

@@ -19,24 +19,21 @@ use std::sync::Arc;
use api::v1::meta::{HeartbeatRequest, Peer};
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
use common_meta::heartbeat::handler::{
HandlerGroupExecutor, HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
};
use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
use common_telemetry::{debug, error, info, warn};
use greptime_proto::v1::meta::NodeInfo;
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient, MetaClientBuilder};
use meta_client::{MetaClientOptions, MetaClientType};
use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
use servers::addrs;
use servers::heartbeat_options::HeartbeatOptions;
use snafu::ResultExt;
use tokio::sync::mpsc;
use tokio::time::{Duration, Instant};
use tokio::time::Duration;
use crate::error::{ExternalSnafu, MetaClientInitSnafu};
use crate::error::ExternalSnafu;
use crate::{Error, FlownodeOptions};
/// The flownode heartbeat task, which sends `[HeartbeatRequest]`s to Metasrv periodically in the background.

View File

@@ -19,7 +19,6 @@
#![feature(let_chains)]
#![feature(duration_abs_diff)]
#![allow(dead_code)]
#![allow(unused_imports)]
#![warn(clippy::missing_docs_in_private_items)]
#![warn(clippy::too_many_lines)]
// allow unused for now because it should be use later

View File

@@ -20,17 +20,11 @@ mod reduce;
use std::collections::BTreeSet;
use datatypes::arrow::ipc::Map;
use serde::{Deserialize, Serialize};
use crate::error::Error;
use crate::expr::{
AggregateExpr, EvalError, GlobalId, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
TypedExpr,
};
use crate::expr::{GlobalId, Id, LocalId, MapFilterProject, SafeMfpPlan, TypedExpr};
use crate::plan::join::JoinPlan;
pub(crate) use crate::plan::reduce::{AccumulablePlan, AggrWithIndex, KeyValPlan, ReducePlan};
use crate::repr::{ColumnType, DiffRow, RelationDesc, RelationType};
use crate::repr::{DiffRow, RelationDesc};
/// A plan for a dataflow component. But with type to indicate the output type of the relation.
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;
use crate::plan::SafeMfpPlan;

View File

@@ -12,9 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::{AggregateExpr, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr};
use crate::expr::{AggregateExpr, SafeMfpPlan};
/// Describe how to extract key-value pair from a `Row`
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]

View File

@@ -17,14 +17,10 @@
mod relation;
use std::borrow::Borrow;
use std::slice::SliceIndex;
use api::helper::{pb_value_to_value_ref, value_to_grpc_value};
use api::v1::Row as ProtoRow;
use datatypes::data_type::ConcreteDataType;
use datatypes::types::cast;
use datatypes::types::cast::CastOption;
use datatypes::value::Value;
use itertools::Itertools;
pub(crate) use relation::{ColumnType, Key, RelationDesc, RelationType};

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap};
use datafusion_common::DFSchema;
use datatypes::data_type::DataType;
use datatypes::prelude::ConcreteDataType;
@@ -22,7 +20,7 @@ use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{DatafusionSnafu, InternalSnafu, InvalidQuerySnafu, Result, UnexpectedSnafu};
use crate::expr::{MapFilterProject, SafeMfpPlan, ScalarExpr};
use crate::expr::{SafeMfpPlan, ScalarExpr};
/// a set of column indices that are "keys" for the collection.
#[derive(Default, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]

View File

@@ -20,35 +20,27 @@ use std::sync::Arc;
use api::v1::{RowDeleteRequests, RowInsertRequests};
use cache::{TABLE_FLOWNODE_SET_CACHE_NAME, TABLE_ROUTE_CACHE_NAME};
use catalog::CatalogManagerRef;
use client::client_manager::NodeClients;
use common_base::Plugins;
use common_error::ext::BoxedError;
use common_grpc::channel_manager::ChannelConfig;
use common_meta::cache::{
LayeredCacheRegistry, LayeredCacheRegistryRef, TableFlownodeSetCacheRef, TableRouteCacheRef,
};
use common_meta::ddl::{table_meta, ProcedureExecutorRef};
use common_meta::heartbeat::handler::HandlerGroupExecutor;
use common_meta::cache::{LayeredCacheRegistryRef, TableFlownodeSetCacheRef, TableRouteCacheRef};
use common_meta::ddl::ProcedureExecutorRef;
use common_meta::key::flow::FlowMetadataManagerRef;
use common_meta::key::TableMetadataManagerRef;
use common_meta::kv_backend::KvBackendRef;
use common_meta::node_manager::{self, Flownode, NodeManagerRef};
use common_meta::node_manager::{Flownode, NodeManagerRef};
use common_query::Output;
use common_telemetry::tracing::info;
use futures::{FutureExt, StreamExt, TryStreamExt};
use futures::{FutureExt, TryStreamExt};
use greptime_proto::v1::flow::{flow_server, FlowRequest, FlowResponse, InsertRequests};
use itertools::Itertools;
use meta_client::client::MetaClient;
use operator::delete::Deleter;
use operator::insert::Inserter;
use operator::statement::StatementExecutor;
use partition::manager::PartitionRuleManager;
use query::{QueryEngine, QueryEngineFactory};
use serde::de::Unexpected;
use servers::error::{AlreadyStartedSnafu, StartGrpcSnafu, TcpBindSnafu, TcpIncomingSnafu};
use servers::heartbeat_options::HeartbeatOptions;
use servers::server::Server;
use session::context::{QueryContext, QueryContextBuilder, QueryContextRef};
use session::context::{QueryContextBuilder, QueryContextRef};
use snafu::{ensure, OptionExt, ResultExt};
use tokio::net::TcpListener;
use tokio::sync::{broadcast, oneshot, Mutex};

View File

@@ -16,37 +16,25 @@
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use bytes::buf::IntoIter;
use common_error::ext::BoxedError;
use common_telemetry::info;
use datafusion::optimizer::simplify_expressions::SimplifyExpressions;
use datafusion::optimizer::{OptimizerContext, OptimizerRule};
use datatypes::data_type::ConcreteDataType as CDT;
use literal::{from_substrait_literal, from_substrait_type};
use prost::Message;
use query::parser::QueryLanguageParser;
use query::plan::LogicalPlan;
use query::query_engine::DefaultSerializer;
use query::QueryEngine;
use serde::{Deserialize, Serialize};
use session::context::QueryContext;
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
/// note that here we use the `substrait_proto_df` crate from the `substrait` module,
/// renamed to `substrait_proto`
use substrait::{
substrait_proto_df as substrait_proto, DFLogicalSubstraitConvertor, SubstraitPlan,
};
use substrait::{substrait_proto_df as substrait_proto, DFLogicalSubstraitConvertor};
use substrait_proto::proto::extensions::simple_extension_declaration::MappingType;
use substrait_proto::proto::extensions::SimpleExtensionDeclaration;
use crate::adapter::FlownodeContext;
use crate::error::{
DatafusionSnafu, Error, ExternalSnafu, InvalidQueryProstSnafu, NotImplementedSnafu,
TableNotFoundSnafu, UnexpectedSnafu,
};
use crate::expr::GlobalId;
use crate::error::{DatafusionSnafu, Error, ExternalSnafu, NotImplementedSnafu, UnexpectedSnafu};
use crate::plan::TypedPlan;
use crate::repr::RelationType;
/// A simple macro to generate a "not implemented" error
macro_rules! not_impl_err {
($($arg:tt)*) => {
@@ -202,7 +190,7 @@ mod test {
use catalog::RegisterTableRequest;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID};
use common_time::{Date, DateTime};
use common_time::DateTime;
use datatypes::prelude::*;
use datatypes::schema::Schema;
use datatypes::vectors::VectorRef;
@@ -219,7 +207,8 @@ mod test {
use super::*;
use crate::adapter::node_context::IdToNameMap;
use crate::repr::ColumnType;
use crate::expr::GlobalId;
use crate::repr::{ColumnType, RelationType};
pub fn create_test_ctx() -> FlownodeContext {
let mut schemas = HashMap::new();

View File

@@ -12,49 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use common_decimal::Decimal128;
use common_time::{Date, Timestamp};
use datatypes::arrow::compute::kernels::window;
use datatypes::arrow::ipc::Binary;
use datatypes::data_type::{ConcreteDataType as CDT, DataType};
use datatypes::data_type::DataType;
use datatypes::value::Value;
use hydroflow::futures::future::Map;
use itertools::Itertools;
use snafu::{OptionExt, ResultExt};
use substrait::variation_const::{
DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
TIMESTAMP_MICRO_TYPE_VARIATION_REF, TIMESTAMP_MILLI_TYPE_VARIATION_REF,
TIMESTAMP_NANO_TYPE_VARIATION_REF, TIMESTAMP_SECOND_TYPE_VARIATION_REF,
UNSIGNED_INTEGER_TYPE_VARIATION_REF,
};
use snafu::OptionExt;
use substrait_proto::proto::aggregate_function::AggregationInvocation;
use substrait_proto::proto::aggregate_rel::{Grouping, Measure};
use substrait_proto::proto::expression::field_reference::ReferenceType::DirectReference;
use substrait_proto::proto::expression::literal::LiteralType;
use substrait_proto::proto::expression::reference_segment::ReferenceType::StructField;
use substrait_proto::proto::expression::{
IfThen, Literal, MaskExpression, RexType, ScalarFunction,
};
use substrait_proto::proto::extensions::simple_extension_declaration::MappingType;
use substrait_proto::proto::extensions::SimpleExtensionDeclaration;
use substrait_proto::proto::function_argument::ArgType;
use substrait_proto::proto::r#type::Kind;
use substrait_proto::proto::read_rel::ReadType;
use substrait_proto::proto::rel::RelType;
use substrait_proto::proto::{self, plan_rel, Expression, Plan as SubPlan, Rel};
use substrait_proto::proto::{self};
use crate::error::{
DatatypesSnafu, Error, EvalSnafu, InvalidQuerySnafu, NotImplementedSnafu, PlanSnafu,
TableNotFoundSnafu,
};
use crate::error::{Error, NotImplementedSnafu, PlanSnafu};
use crate::expr::{
AggregateExpr, AggregateFunc, BinaryFunc, GlobalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
TypedExpr, UnaryFunc, UnmaterializableFunc, VariadicFunc,
AggregateExpr, AggregateFunc, BinaryFunc, MapFilterProject, ScalarExpr, TypedExpr, UnaryFunc,
};
use crate::plan::{AccumulablePlan, AggrWithIndex, KeyValPlan, Plan, ReducePlan, TypedPlan};
use crate::repr::{self, ColumnType, RelationDesc, RelationType};
use crate::repr::{ColumnType, RelationDesc, RelationType};
use crate::transform::{substrait_proto, FlownodeContext, FunctionExtensions};
impl TypedExpr {
@@ -472,13 +446,14 @@ mod test {
use bytes::BytesMut;
use common_time::{DateTime, Interval};
use datatypes::prelude::ConcreteDataType;
use pretty_assertions::{assert_eq, assert_ne};
use pretty_assertions::assert_eq;
use super::*;
use crate::expr::{DfScalarFunction, RawDfScalarFn};
use crate::expr::{DfScalarFunction, GlobalId, RawDfScalarFn};
use crate::plan::{Plan, TypedPlan};
use crate::repr::{self, ColumnType, RelationType};
use crate::repr::{ColumnType, RelationType};
use crate::transform::test::{create_test_ctx, create_test_query_engine, sql_to_substrait};
use crate::transform::CDT;
/// TODO(discord9): add more illegal sql tests
#[tokio::test]
async fn test_missing_key_check() {

View File

@@ -34,8 +34,7 @@ use substrait::variation_const::{
};
use substrait_proto::proto::expression::literal::LiteralType;
use substrait_proto::proto::expression::Literal;
use substrait_proto::proto::r#type::{self, parameter, Kind, Parameter};
use substrait_proto::proto::Type;
use substrait_proto::proto::r#type::Kind;
use crate::error::{Error, NotImplementedSnafu, PlanSnafu, UnexpectedSnafu};
use crate::transform::substrait_proto;

View File

@@ -22,11 +22,9 @@ use substrait_proto::proto::read_rel::ReadType;
use substrait_proto::proto::rel::RelType;
use substrait_proto::proto::{plan_rel, Plan as SubPlan, ProjectRel, Rel};
use crate::error::{
Error, InternalSnafu, InvalidQuerySnafu, NotImplementedSnafu, PlanSnafu, UnexpectedSnafu,
};
use crate::error::{Error, InvalidQuerySnafu, NotImplementedSnafu, PlanSnafu, UnexpectedSnafu};
use crate::expr::{MapFilterProject, ScalarExpr, TypedExpr, UnaryFunc};
use crate::plan::{KeyValPlan, Plan, ReducePlan, TypedPlan};
use crate::plan::{KeyValPlan, Plan, TypedPlan};
use crate::repr::{self, RelationDesc, RelationType};
use crate::transform::{substrait_proto, FlownodeContext, FunctionExtensions};
@@ -350,7 +348,7 @@ mod test {
use super::*;
use crate::expr::{GlobalId, ScalarExpr};
use crate::plan::{Plan, TypedPlan};
use crate::repr::{self, ColumnType, RelationType};
use crate::repr::{ColumnType, RelationType};
use crate::transform::test::{create_test_ctx, create_test_query_engine, sql_to_substrait};
use crate::transform::CDT;

View File

@@ -19,14 +19,11 @@ use std::ops::Bound;
use std::sync::Arc;
use common_telemetry::debug;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use smallvec::{smallvec, SmallVec};
use tokio::sync::{Mutex, RwLock};
use tokio::sync::RwLock;
use crate::expr::error::InternalSnafu;
use crate::expr::{EvalError, ScalarExpr};
use crate::repr::{value_to_internal_ts, Diff, DiffRow, Duration, KeyValDiffRow, Row, Timestamp};
use crate::repr::{value_to_internal_ts, DiffRow, Duration, KeyValDiffRow, Row, Timestamp};
/// A batch of updates, arranged by key
pub type Batch = BTreeMap<Row, SmallVec<[DiffRow; 2]>>;
@@ -585,6 +582,7 @@ mod test {
use std::borrow::Borrow;
use datatypes::value::Value;
use itertools::Itertools;
use super::*;