refactor: rewrite some UDFs to DataFusion style (part 4) (#7011)

Signed-off-by: luofucong <luofc@foxmail.com>
2026-05-26 18:00:41 +00:00 · 2025-09-25 03:50:58 +08:00
parent a14c01a807
commit 6d0dd2540e
12 changed files with 710 additions and 507 deletions
--- a/src/common/recordbatch/src/recordbatch.rs
+++ b/src/common/recordbatch/src/recordbatch.rs
@@ -17,6 +17,9 @@ use std::slice;
 use std::sync::Arc;

 use datafusion::arrow::util::pretty::pretty_format_batches;
+use datafusion_common::arrow::array::ArrayRef;
+use datafusion_common::arrow::compute;
+use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef};
 use datatypes::arrow::array::RecordBatchOptions;
 use datatypes::prelude::DataType;
 use datatypes::schema::SchemaRef;
@@ -28,8 +31,8 @@ use snafu::{OptionExt, ResultExt, ensure};

 use crate::DfRecordBatch;
 use crate::error::{
-    self, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, ProjectArrowRecordBatchSnafu,
-    Result,
+    self, ArrowComputeSnafu, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu,
+    ProjectArrowRecordBatchSnafu, Result,
 };

 /// A two-dimensional batch of column-oriented data with a defined schema.
@@ -49,6 +52,15 @@ impl RecordBatch {
        let columns: Vec<_> = columns.into_iter().collect();
        let arrow_arrays = columns.iter().map(|v| v.to_arrow_array()).collect();

+        // Casting the arrays here to match the schema is a temporary solution to support Arrow's
+        // view array types (`StringViewArray` and `BinaryViewArray`).
+        // As to "support": the arrays here are created from vectors, which do not have types
+        // corresponding to view arrays. All we can do is cast them.
+        // As to "temporary": we are planning to use Arrow's RecordBatch directly in the read path,
+        // so the casting here will be removed in the end.
+        // TODO(LFC): Remove the casting here once `Batch` is no longer used.
+        let arrow_arrays = Self::cast_view_arrays(schema.arrow_schema(), arrow_arrays)?;
+
        let df_record_batch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays)
            .context(error::NewDfRecordBatchSnafu)?;

@@ -59,6 +71,24 @@ impl RecordBatch {
        })
    }

+    /// Casts arrays to Arrow's view types where the schema asks for them.
+    ///
+    /// Fields of `schema` and `arrays` are paired by position. An array is cast to its
+    /// field's data type only when the field is `Utf8View` and the array is `Utf8`, or the
+    /// field is `BinaryView` and the array is `Binary`; every other array is returned as is.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error (with `ArrowComputeSnafu` context) if a cast fails.
+    fn cast_view_arrays(
+        schema: &ArrowSchemaRef,
+        mut arrays: Vec<ArrayRef>,
+    ) -> Result<Vec<ArrayRef>> {
+        // NOTE: `zip` stops at the shorter side, so a mismatch between the number of fields
+        // and the number of arrays is not detected here.
+        for (f, a) in schema.fields().iter().zip(arrays.iter_mut()) {
+            let expected = f.data_type();
+            let actual = a.data_type();
+            // Only the two "plain -> view" pairs are handled; any other type difference is
+            // left untouched by this function.
+            if matches!(
+                (expected, actual),
+                (ArrowDataType::Utf8View, ArrowDataType::Utf8)
+                    | (ArrowDataType::BinaryView, ArrowDataType::Binary)
+            ) {
+                // Replace the array in place with its view-typed counterpart.
+                *a = compute::cast(a, expected).context(ArrowComputeSnafu)?;
+            }
+        }
+        Ok(arrays)
+    }
+
    /// Create an empty [`RecordBatch`] from `schema`.
    pub fn new_empty(schema: SchemaRef) -> RecordBatch {
        let df_record_batch = DfRecordBatch::new_empty(schema.arrow_schema().clone());