feat: adds ConcretDataType and more datatypes impl (#31)

* feat: adds ConcretDataType and impl binary/boolean/null types and vectors

* feat: adds String to ConcretDataType

* docs: ConcretDataType::from_arrow_type may panic
This commit is contained in:
dennis zhuang
2022-05-19 11:21:11 +08:00
committed by GitHub
parent 5777732fde
commit b0d2e2e91b
17 changed files with 541 additions and 32 deletions

View File

@@ -11,6 +11,7 @@ features = ["io_csv", "io_json", "io_parquet", "io_parquet_compression", "io_ipc
[dependencies]
common-base = { path = "../common/base" }
common-error = { path = "../common/error" }
enum_dispatch = "0.3"
paste = "1.0"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"

View File

@@ -0,0 +1,6 @@
use arrow::array::{BinaryArray, MutableBinaryArray, MutableUtf8Array, Utf8Array};
/// [BinaryArray] instantiated with the `i64` type parameter.
pub type LargeBinaryArray = BinaryArray<i64>;
/// [MutableBinaryArray] instantiated with the `i64` type parameter.
pub type MutableLargeBinaryArray = MutableBinaryArray<i64>;
/// [MutableUtf8Array] instantiated with the `i32` type parameter.
pub type MutableStringArray = MutableUtf8Array<i32>;
/// [Utf8Array] instantiated with the `i32` type parameter.
pub type StringArray = Utf8Array<i32>;

View File

@@ -3,8 +3,68 @@ use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use crate::type_id::LogicalTypeId;
use crate::types::{
BinaryType, BooleanType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
NullType, StringType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
use crate::value::Value;
/// Enumeration that wraps each concrete data type struct in its own variant.
///
/// The `enum_dispatch` attribute below is applied for the [DataType] trait.
#[derive(Clone, Debug)]
#[enum_dispatch::enum_dispatch(DataType)]
pub enum ConcretDataType {
/// Wraps [NullType].
Null(NullType),
/// Wraps [BooleanType].
Boolean(BooleanType),
// Numeric types:
/// Wraps [Int8Type].
Int8(Int8Type),
/// Wraps [Int16Type].
Int16(Int16Type),
/// Wraps [Int32Type].
Int32(Int32Type),
/// Wraps [Int64Type].
Int64(Int64Type),
/// Wraps [UInt8Type].
UInt8(UInt8Type),
/// Wraps [UInt16Type].
UInt16(UInt16Type),
/// Wraps [UInt32Type].
UInt32(UInt32Type),
/// Wraps [UInt64Type].
UInt64(UInt64Type),
/// Wraps [Float32Type].
Float32(Float32Type),
/// Wraps [Float64Type].
Float64(Float64Type),
// String types
/// Wraps [BinaryType].
Binary(BinaryType),
/// Wraps [StringType].
String(StringType),
}
impl ConcretDataType {
/// Convert arrow data type to [ConcretDataType].
///
/// # Panics
/// Panic if given arrow data type is not supported.
pub fn from_arrow_type(dt: &ArrowDataType) -> Self {
match dt {
ArrowDataType::Null => ConcretDataType::Null(NullType::default()),
ArrowDataType::Boolean => ConcretDataType::Boolean(BooleanType::default()),
ArrowDataType::Binary | ArrowDataType::LargeBinary => {
ConcretDataType::Binary(BinaryType::default())
}
ArrowDataType::UInt8 => ConcretDataType::UInt8(UInt8Type::default()),
ArrowDataType::UInt16 => ConcretDataType::UInt16(UInt16Type::default()),
ArrowDataType::UInt32 => ConcretDataType::UInt32(UInt32Type::default()),
ArrowDataType::UInt64 => ConcretDataType::UInt64(UInt64Type::default()),
ArrowDataType::Int8 => ConcretDataType::Int8(Int8Type::default()),
ArrowDataType::Int16 => ConcretDataType::Int16(Int16Type::default()),
ArrowDataType::Int32 => ConcretDataType::Int32(Int32Type::default()),
ArrowDataType::Int64 => ConcretDataType::Int64(Int64Type::default()),
ArrowDataType::Float32 => ConcretDataType::Float32(Float32Type::default()),
ArrowDataType::Float64 => ConcretDataType::Float64(Float64Type::default()),
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => {
ConcretDataType::String(StringType::default())
}
_ => {
unimplemented!("arrow data_type: {:?}", dt)
}
}
}
}
/// Data type abstraction.
pub trait DataType: std::fmt::Debug + Send + Sync {
/// Name of this data type.
@@ -21,3 +81,76 @@ pub trait DataType: std::fmt::Debug + Send + Sync {
}
/// Shared pointer ([Arc]) to a [DataType] trait object.
pub type DataTypeRef = Arc<dyn DataType>;
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `from_arrow_type` turns the given arrow variant into the
    /// expected [ConcretDataType] variant.
    macro_rules! assert_converts {
        ($arrow:ident => $concrete:ident) => {
            assert!(matches!(
                ConcretDataType::from_arrow_type(&ArrowDataType::$arrow),
                ConcretDataType::$concrete(_)
            ));
        };
    }

    #[test]
    fn test_from_arrow_type() {
        assert_converts!(Null => Null);
        assert_converts!(Boolean => Boolean);
        assert_converts!(Binary => Binary);
        assert_converts!(LargeBinary => Binary);
        assert_converts!(Int8 => Int8);
        assert_converts!(Int16 => Int16);
        assert_converts!(Int32 => Int32);
        assert_converts!(Int64 => Int64);
        assert_converts!(UInt8 => UInt8);
        assert_converts!(UInt16 => UInt16);
        assert_converts!(UInt32 => UInt32);
        assert_converts!(UInt64 => UInt64);
        assert_converts!(Float32 => Float32);
        assert_converts!(Float64 => Float64);
        assert_converts!(Utf8 => String);
        assert_converts!(LargeUtf8 => String);
    }
}

View File

@@ -1,25 +1,14 @@
#![feature(generic_associated_types)]
use arrow::array;
use arrow::array::{BinaryArray, MutableBinaryArray, Utf8Array};
pub mod arrow_array;
mod data_type;
pub mod deserialize;
pub mod error;
pub mod prelude;
mod scalars;
pub mod schema;
pub mod serialize;
pub mod type_id;
mod types;
pub mod value;
pub mod vectors;
pub type LargeBinaryArray = BinaryArray<i64>;
pub type MutableLargeBinaryArray = MutableBinaryArray<i64>;
pub type StringArray = Utf8Array<i32>;
pub type MutableStringArray = array::MutableUtf8Array<i32>;
pub mod schema;
pub mod deserialize;
pub mod serialize;
pub mod error;

View File

@@ -1,4 +1,4 @@
pub use crate::data_type::{DataType, DataTypeRef};
pub use crate::data_type::{ConcretDataType, DataType, DataTypeRef};
pub use crate::scalars::{ScalarVector, ScalarVectorBuilder};
pub use crate::type_id::LogicalTypeId;
pub use crate::value::Value;

View File

@@ -1,9 +1,16 @@
mod binary_type;
mod boolean_type;
mod null_type;
mod primitive_traits;
mod primitive_type;
mod string_type;
pub use binary_type::BinaryType;
pub use boolean_type::BooleanType;
pub use null_type::NullType;
pub use primitive_traits::Primitive;
pub use primitive_type::{DataTypeBuilder, PrimitiveType};
pub use primitive_type::{
DataTypeBuilder, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
PrimitiveType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
pub use string_type::StringType;

View File

@@ -7,7 +7,7 @@ use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId;
use crate::value::Value;
#[derive(Debug, Default)]
#[derive(Debug, Default, Clone)]
pub struct BinaryType;
impl BinaryType {

View File

@@ -0,0 +1,34 @@
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId;
use crate::value::Value;
/// Field-less struct representing the boolean data type.
#[derive(Debug, Default, Clone)]
pub struct BooleanType;
impl BooleanType {
pub fn arc() -> DataTypeRef {
Arc::new(Self)
}
}
impl DataType for BooleanType {
    /// Name of this data type: `"Boolean"`.
    fn name(&self) -> &str {
        "Boolean"
    }

    /// Logical type id: [LogicalTypeId::Boolean].
    fn logical_type_id(&self) -> LogicalTypeId {
        LogicalTypeId::Boolean
    }

    /// Arrow counterpart: [ArrowDataType::Boolean].
    fn as_arrow_type(&self) -> ArrowDataType {
        ArrowDataType::Boolean
    }

    /// Default value: the default `bool` (`false`) converted into a [Value].
    fn default_value(&self) -> Value {
        false.into()
    }
}

View File

@@ -0,0 +1,34 @@
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId;
use crate::value::Value;
/// Field-less struct representing the null data type.
#[derive(Debug, Default, Clone)]
pub struct NullType;
impl NullType {
pub fn arc() -> DataTypeRef {
Arc::new(Self)
}
}
impl DataType for NullType {
    /// Name of this data type: `"Null"`.
    fn name(&self) -> &str {
        "Null"
    }

    /// Logical type id: [LogicalTypeId::Null].
    fn logical_type_id(&self) -> LogicalTypeId {
        LogicalTypeId::Null
    }

    /// Arrow counterpart: [ArrowDataType::Null].
    fn as_arrow_type(&self) -> ArrowDataType {
        ArrowDataType::Null
    }

    /// Default value: [Value::Null].
    fn default_value(&self) -> Value {
        Value::Null
    }
}

View File

@@ -2,24 +2,18 @@ use std::marker::PhantomData;
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use paste::paste;
use crate::data_type::{DataType, DataTypeRef};
use crate::type_id::LogicalTypeId;
use crate::types::primitive_traits::Primitive;
use crate::value::Value;
/// Data type parameterized by a [Primitive] `T`; its only field is a
/// [PhantomData] marker for `T`.
#[derive(Clone)]
pub struct PrimitiveType<T: Primitive> {
// Marker tying the struct to `T`.
_phantom: PhantomData<T>,
}
impl<T: Primitive> PrimitiveType<T> {
pub fn new() -> Self {
Self {
_phantom: PhantomData,
}
}
}
/// Create a new [DataTypeRef] from a primitive type.
pub trait DataTypeBuilder {
fn build_data_type() -> DataTypeRef;
@@ -30,7 +24,7 @@ macro_rules! impl_build_data_type {
paste::paste! {
impl DataTypeBuilder for $Type {
fn build_data_type() -> DataTypeRef {
Arc::new(PrimitiveType::<$Type>::new())
Arc::new(PrimitiveType::<$Type>::default())
}
}
}
@@ -63,7 +57,19 @@ macro_rules! impl_numeric {
}
}
impl Default for PrimitiveType<$Type> {
fn default() -> Self {
Self {
_phantom: PhantomData,
}
}
}
impl_build_data_type!($Type);
paste! {
pub type [<$TypeId Type>]=PrimitiveType<$Type>;
}
};
}

View File

@@ -6,7 +6,7 @@ use common_base::bytes::StringBytes;
use crate::data_type::DataType;
use crate::prelude::{DataTypeRef, LogicalTypeId, Value};
#[derive(Debug, Default)]
#[derive(Debug, Default, Clone)]
pub struct StringType;
impl StringType {

View File

@@ -1,4 +1,6 @@
pub mod binary;
pub mod boolean;
pub mod null;
pub mod primitive;
mod string;

View File

@@ -6,6 +6,7 @@ use arrow::array::BinaryValueIter;
use arrow::bitmap::utils::ZipValidity;
use snafu::ResultExt;
use crate::arrow_array::{LargeBinaryArray, MutableLargeBinaryArray};
use crate::data_type::DataTypeRef;
use crate::error::Result;
use crate::error::SerializeSnafu;
@@ -13,7 +14,6 @@ use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::BinaryType;
use crate::vectors::Vector;
use crate::{LargeBinaryArray, MutableLargeBinaryArray};
/// Vector of binary strings.
#[derive(Debug)]
@@ -99,8 +99,8 @@ mod tests {
use serde::*;
use super::BinaryVector;
use crate::arrow_array::LargeBinaryArray;
use crate::serialize::Serializable;
use crate::LargeBinaryArray;
#[test]
pub fn test_serialize_binary_vector_to_json() {

View File

@@ -0,0 +1,207 @@
use std::any::Any;
use std::borrow::Borrow;
use std::sync::Arc;
use arrow::array::{ArrayRef, BooleanArray, MutableBooleanArray};
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
use snafu::ResultExt;
use crate::data_type::DataTypeRef;
use crate::error::Result;
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::BooleanType;
use crate::vectors::Vector;
/// Vector of boolean values, stored in an arrow [BooleanArray].
#[derive(Debug)]
pub struct BooleanVector {
// Underlying arrow array holding the values.
array: BooleanArray,
}
impl From<Vec<bool>> for BooleanVector {
    /// Builds a vector from plain `bool` values via `BooleanArray::from_slice`.
    fn from(data: Vec<bool>) -> Self {
        let array = BooleanArray::from_slice(&data);
        Self { array }
    }
}
impl From<BooleanArray> for BooleanVector {
fn from(array: BooleanArray) -> Self {
Self { array }
}
}
impl From<Vec<Option<bool>>> for BooleanVector {
    /// Builds a vector from optional `bool` values via `BooleanArray::from`.
    fn from(data: Vec<Option<bool>>) -> Self {
        let array = BooleanArray::from(data);
        Self { array }
    }
}
impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanVector {
    /// Collects items that borrow as `Option<bool>` into a vector by
    /// delegating to `BooleanArray::from_iter`.
    fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
        let array = BooleanArray::from_iter(iter);
        Self { array }
    }
}
impl Vector for BooleanVector {
    /// Data type of this vector, obtained from [BooleanType::arc].
    fn data_type(&self) -> DataTypeRef {
        BooleanType::arc()
    }

    /// Number of elements, as reported by the wrapped array.
    fn len(&self) -> usize {
        self.array.len()
    }

    /// Exposes the vector as [Any].
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Clones the wrapped array and returns the clone behind an [Arc].
    fn to_arrow_array(&self) -> ArrayRef {
        let cloned = self.array.clone();
        Arc::new(cloned)
    }
}
impl ScalarVector for BooleanVector {
type RefItem<'a> = bool;
type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>;
type Builder = BooleanVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
if idx < self.len() {
Some(self.array.value(idx))
} else {
None
}
}
fn iter_data(&self) -> Self::Iter<'_> {
self.array.iter()
}
}
pub struct BooleanVectorBuilder {
mutable_array: MutableBooleanArray,
}
impl ScalarVectorBuilder for BooleanVectorBuilder {
type VectorType = BooleanVector;
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutableBooleanArray::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.mutable_array.push(value);
}
fn finish(self) -> Self::VectorType {
BooleanVector {
array: self.mutable_array.into(),
}
}
}
impl Serializable for BooleanVector {
fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
self.iter_data()
.map(serde_json::to_value)
.collect::<serde_json::Result<_>>()
.context(crate::error::SerializeSnafu)
}
}
#[cfg(test)]
mod tests {
use serde::*;
use super::*;
use crate::serialize::Serializable;
#[test]
pub fn test_serialize_boolean_vector_to_json() {
let vector = BooleanVector {
array: BooleanArray::from_slice(&vec![true, false, true, true, false, false]),
};
let json_value = vector.serialize_to_json().unwrap();
let mut output = vec![];
let mut serializer = serde_json::ser::Serializer::new(&mut output);
json_value.serialize(&mut serializer).unwrap();
assert_eq!(
"[true,false,true,true,false,false]",
String::from_utf8_lossy(&output)
);
}
#[test]
fn test_boolean_vector_from_vec() {
let vec = BooleanVector::from(vec![false, true, false, true]);
assert_eq!(4, vec.len());
for i in 0..4 {
assert_eq!(
i == 1 || i == 3,
vec.get_data(i).unwrap(),
"failed at {}",
i
)
}
}
#[test]
fn test_boolean_vector_from_iter() {
let v = vec![Some(false), Some(true), Some(false), Some(true)];
let vec = v.into_iter().collect::<BooleanVector>();
assert_eq!(4, vec.len());
for i in 0..3 {
assert_eq!(
i == 1 || i == 3,
vec.get_data(i).unwrap(),
"failed at {}",
i
)
}
}
#[test]
fn test_boolean_vector_from_vec_option() {
let vec = BooleanVector::from(vec![Some(false), Some(true), None, Some(true)]);
assert_eq!(4, vec.len());
for i in 0..4 {
assert_eq!(
i == 1 || i == 3,
vec.get_data(i).unwrap(),
"failed at {}",
i
)
}
}
#[test]
fn test_boolean_vector_builder() {
let mut builder = BooleanVectorBuilder::with_capacity(4);
builder.push(Some(false));
builder.push(Some(true));
builder.push(Some(false));
builder.push(Some(true));
let vec = builder.finish();
assert_eq!(4, vec.len());
for i in 0..4 {
assert_eq!(
i == 1 || i == 3,
vec.get_data(i).unwrap(),
"failed at {}",
i
)
}
}
}

View File

@@ -0,0 +1,77 @@
use std::any::Any;
use std::fmt;
use std::sync::Arc;
use arrow::array::ArrayRef;
use arrow::array::{Array, NullArray};
use arrow::datatypes::DataType as ArrowDataType;
use crate::data_type::DataTypeRef;
use crate::types::NullType;
use crate::vectors::Vector;
/// Vector backed by an arrow [NullArray].
pub struct NullVector {
// Underlying arrow array.
array: NullArray,
}
impl NullVector {
    /// Creates a vector backed by a [NullArray] built from
    /// `ArrowDataType::Null` and the length `n`.
    pub fn new(n: usize) -> Self {
        let array = NullArray::new(ArrowDataType::Null, n);
        Self { array }
    }
}
impl From<NullArray> for NullVector {
fn from(array: NullArray) -> Self {
Self { array }
}
}
impl Vector for NullVector {
    /// Data type of this vector, obtained from [NullType::arc].
    fn data_type(&self) -> DataTypeRef {
        NullType::arc()
    }

    /// Number of elements, as reported by the wrapped array.
    fn len(&self) -> usize {
        self.array.len()
    }

    /// Exposes the vector as [Any].
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Clones the wrapped array and returns the clone behind an [Arc].
    fn to_arrow_array(&self) -> ArrayRef {
        let cloned = self.array.clone();
        Arc::new(cloned)
    }
}
impl fmt::Debug for NullVector {
    /// Formats the vector as `NullVector(<len>)`, printing only its length.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let len = self.len();
        f.write_fmt(format_args!("NullVector({})", len))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built vector reports its length, and its arrow array (and a
    /// slice of it) counts every element as null.
    #[test]
    fn test_null_array() {
        let vector = NullVector::new(32);
        assert_eq!(vector.len(), 32);

        let arrow_array = vector.to_arrow_array();
        assert_eq!(arrow_array.null_count(), 32);

        let sliced = arrow_array.slice(8, 16);
        assert_eq!(sliced.len(), 16);
        assert_eq!(sliced.null_count(), 16);
    }

    /// The `Debug` output contains only the length.
    #[test]
    fn test_debug_null_array() {
        let vector = NullVector::new(1024 * 1024);
        let rendered = format!("{:?}", vector);
        assert_eq!(rendered, "NullVector(1048576)");
    }
}

View File

@@ -6,13 +6,13 @@ use arrow::bitmap::utils::ZipValidity;
use serde_json::Value;
use snafu::ResultExt;
use crate::arrow_array::{MutableStringArray, StringArray};
use crate::data_type::DataTypeRef;
use crate::error::SerializeSnafu;
use crate::prelude::{ScalarVectorBuilder, Vector};
use crate::scalars::ScalarVector;
use crate::serialize::Serializable;
use crate::types::StringType;
use crate::{MutableStringArray, StringArray};
/// String array wrapper
#[derive(Clone)]