refactor: rewrite some UDFs to DataFusion style (part 4) (#7011)

Signed-off-by: luofucong <luofc@foxmail.com>
This commit is contained in:
LFC
2025-09-25 03:50:58 +08:00
committed by GitHub
parent a14c01a807
commit 6d0dd2540e
12 changed files with 710 additions and 507 deletions

View File

@@ -17,6 +17,9 @@ use std::slice;
use std::sync::Arc;
use datafusion::arrow::util::pretty::pretty_format_batches;
use datafusion_common::arrow::array::ArrayRef;
use datafusion_common::arrow::compute;
use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef};
use datatypes::arrow::array::RecordBatchOptions;
use datatypes::prelude::DataType;
use datatypes::schema::SchemaRef;
@@ -28,8 +31,8 @@ use snafu::{OptionExt, ResultExt, ensure};
use crate::DfRecordBatch;
use crate::error::{
self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
Result,
self, ArrowComputeSnafu, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu,
ProjectArrowRecordBatchSnafu, Result,
};
/// A two-dimensional batch of column-oriented data with a defined schema.
@@ -49,6 +52,15 @@ impl RecordBatch {
let columns: Vec<_> = columns.into_iter().collect();
let arrow_arrays = columns.iter().map(|v| v.to_arrow_array()).collect();
// Casting the arrays here to match the schema, is a temporary solution to support Arrow's
// view array types (`StringViewArray` and `BinaryViewArray`).
// As to "support": the arrays here are created from vectors, which do not have types
// corresponding to view arrays. What we can do is to only cast them.
// As to "temporary": we are planing to use Arrow's RecordBatch directly in the read path.
// the casting here will be removed in the end.
// TODO(LFC): Remove the casting here once `Batch` is no longer used.
let arrow_arrays = Self::cast_view_arrays(schema.arrow_schema(), arrow_arrays)?;
let df_record_batch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays)
.context(error::NewDfRecordBatchSnafu)?;
@@ -59,6 +71,24 @@ impl RecordBatch {
})
}
fn cast_view_arrays(
schema: &ArrowSchemaRef,
mut arrays: Vec<ArrayRef>,
) -> Result<Vec<ArrayRef>> {
for (f, a) in schema.fields().iter().zip(arrays.iter_mut()) {
let expected = f.data_type();
let actual = a.data_type();
if matches!(
(expected, actual),
(ArrowDataType::Utf8View, ArrowDataType::Utf8)
| (ArrowDataType::BinaryView, ArrowDataType::Binary)
) {
*a = compute::cast(a, expected).context(ArrowComputeSnafu)?;
}
}
Ok(arrays)
}
/// Create an empty [`RecordBatch`] from `schema`.
pub fn new_empty(schema: SchemaRef) -> RecordBatch {
let df_record_batch = DfRecordBatch::new_empty(schema.arrow_schema().clone());