feat: Switch to datatypes2

This commit is contained in:
evenyag
2022-12-05 20:30:47 +08:00
parent 504059a699
commit cc1ec26416
86 changed files with 4336 additions and 4356 deletions

19
Cargo.lock generated
View File

@@ -2043,25 +2043,6 @@ dependencies = [
"snafu",
]
[[package]]
name = "datatypes2"
version = "0.1.0"
dependencies = [
"arrow",
"common-base",
"common-error",
"common-time",
"datafusion-common",
"enum_dispatch",
"num",
"num-traits",
"ordered-float 3.4.0",
"paste",
"serde",
"serde_json",
"snafu",
]
[[package]]
name = "derive-new"
version = "0.5.9"

View File

@@ -20,7 +20,6 @@ members = [
"src/common/time",
"src/datanode",
"src/datatypes",
"src/datatypes2",
"src/frontend",
"src/log-store",
"src/meta-client",

View File

@@ -9,11 +9,10 @@ default = []
test = []
[dependencies]
arrow = "26.0.0"
common-base = { path = "../common/base" }
common-error = { path = "../common/error" }
common-time = { path = "../common/time" }
datafusion-common = "14.0.0"
datafusion-common = "14.0"
enum_dispatch = "0.3"
num = "0.4"
num-traits = "0.2"
@@ -22,3 +21,4 @@ paste = "1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
snafu = { version = "0.7", features = ["backtraces"] }
arrow = "26.0"

View File

@@ -12,13 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow::array::{self, Array, ListArray, PrimitiveArray};
use arrow::array::{
Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array,
UInt8Array,
};
use arrow::datatypes::DataType;
use common_time::timestamp::Timestamp;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{ConversionSnafu, Result};
use crate::prelude::ConcreteDataType;
use crate::value::{ListValue, Value};
pub type BinaryArray = arrow::array::LargeBinaryArray;
@@ -36,6 +41,7 @@ macro_rules! cast_array {
};
}
// TODO(yingwen): Remove this function.
pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result<Value> {
if array.is_null(idx) {
return Ok(Value::Null);
@@ -43,42 +49,46 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result<Value> {
let result = match array.data_type() {
DataType::Null => Value::Null,
DataType::Boolean => Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)),
DataType::Binary | DataType::LargeBinary => {
Value::Binary(cast_array!(array, BinaryArray).value(idx).into())
}
DataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::<i8>).value(idx)),
DataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::<i16>).value(idx)),
DataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::<i32>).value(idx)),
DataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::<i64>).value(idx)),
DataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::<u8>).value(idx)),
DataType::UInt16 => Value::UInt16(cast_array!(array, PrimitiveArray::<u16>).value(idx)),
DataType::UInt32 => Value::UInt32(cast_array!(array, PrimitiveArray::<u32>).value(idx)),
DataType::UInt64 => Value::UInt64(cast_array!(array, PrimitiveArray::<u64>).value(idx)),
DataType::Float32 => {
Value::Float32(cast_array!(array, PrimitiveArray::<f32>).value(idx).into())
}
DataType::Float64 => {
Value::Float64(cast_array!(array, PrimitiveArray::<f64>).value(idx).into())
}
DataType::Utf8 | DataType::LargeUtf8 => {
Value::String(cast_array!(array, StringArray).value(idx).into())
}
DataType::Timestamp(t, _) => {
let value = cast_array!(array, PrimitiveArray::<i64>).value(idx);
let unit = match ConcreteDataType::from_arrow_time_unit(t) {
ConcreteDataType::Timestamp(t) => t.unit,
_ => unreachable!(),
};
Value::Timestamp(Timestamp::new(value, unit))
}
DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)),
DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()),
DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)),
DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)),
DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)),
DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)),
DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)),
DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)),
DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)),
DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)),
DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()),
DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()),
DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()),
DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()),
DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()),
DataType::Timestamp(t, _) => match t {
arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampSecondArray).value(idx),
TimeUnit::Second,
)),
arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx),
TimeUnit::Millisecond,
)),
arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx),
TimeUnit::Microsecond,
)),
arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx),
TimeUnit::Nanosecond,
)),
},
DataType::List(_) => {
let array = cast_array!(array, ListArray::<i32>).value(idx);
let inner_datatype = ConcreteDataType::try_from(array.data_type())?;
let array = cast_array!(array, ListArray).value(idx);
let item_type = ConcreteDataType::try_from(array.data_type())?;
let values = (0..array.len())
.map(|i| arrow_array_get(&*array, i))
.collect::<Result<Vec<Value>>>()?;
Value::List(ListValue::new(Some(Box::new(values)), inner_datatype))
Value::List(ListValue::new(Some(Box::new(values)), item_type))
}
_ => unimplemented!("Arrow array datatype: {:?}", array.data_type()),
};
@@ -88,45 +98,74 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result<Value> {
#[cfg(test)]
mod test {
use std::sync::Arc;
use arrow::array::{
BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
MutableListArray, MutablePrimitiveArray, TryExtend, UInt16Array, UInt32Array, UInt64Array,
LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray,
TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array,
UInt8Array,
};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit};
use arrow::datatypes::Int32Type;
use common_time::timestamp::{TimeUnit, Timestamp};
use paste::paste;
use super::*;
use crate::prelude::Vector;
use crate::vectors::TimestampVector;
use crate::data_type::ConcreteDataType;
use crate::types::TimestampType;
// Test helper: for every time unit identifier passed in, builds an arrow
// `Timestamp<Unit>Array`, reads one element back through `arrow_array_get`,
// and asserts that the returned value reports the timestamp data type of
// that same unit.
macro_rules! test_arrow_array_get_for_timestamps {
( $($unit: ident), *) => {
$(
paste! {
// `paste!` glues `Timestamp`, `$unit` and `Array` into a single type name.
let mut builder = arrow::array::[<Timestamp $unit Array>]::builder(3);
builder.append_value(1);
builder.append_value(0);
builder.append_value(-1);
let ts_array = Arc::new(builder.finish()) as Arc<dyn Array>;
// Index 1 holds the value 0 appended above; only its data type is checked.
let v = arrow_array_get(&ts_array, 1).unwrap();
assert_eq!(
ConcreteDataType::Timestamp(TimestampType::$unit(
$crate::types::[<Timestamp $unit Type>]::default(),
)),
v.data_type()
);
}
)*
};
}
/// Runs the timestamp round-trip check for each of the four time units.
#[test]
fn test_timestamp_array() {
    test_arrow_array_get_for_timestamps!(Second, Millisecond, Microsecond, Nanosecond);
}
#[test]
fn test_arrow_array_access() {
let array1 = BooleanArray::from_slice(vec![true, true, false, false]);
let array1 = BooleanArray::from(vec![true, true, false, false]);
assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap());
let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]);
let array1 = Int8Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap());
let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]);
let array1 = UInt8Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap());
let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]);
let array1 = Int16Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap());
let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]);
let array1 = UInt16Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap());
let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]);
let array1 = Int32Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap());
let array1 = UInt32Array::from_vec(vec![1, 2, 3, 4]);
let array1 = UInt32Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap());
let array = Int64Array::from_vec(vec![1, 2, 3, 4]);
let array = Int64Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap());
let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]);
let array1 = UInt64Array::from(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap());
let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]);
let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]);
assert_eq!(
Value::Float32(2f32.into()),
arrow_array_get(&array1, 1).unwrap()
);
let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]);
let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]);
assert_eq!(
Value::Float64(2f64.into()),
arrow_array_get(&array1, 1).unwrap()
@@ -139,55 +178,42 @@ mod test {
);
assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap());
let array3 = super::BinaryArray::from(vec![
let array3 = LargeBinaryArray::from(vec![
Some("hello".as_bytes()),
None,
Some("world".as_bytes()),
]);
assert_eq!(
Value::Binary("hello".as_bytes().into()),
arrow_array_get(&array3, 0).unwrap()
);
assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap());
let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4]));
let array = vector.to_boxed_arrow_array();
let value = arrow_array_get(&*array, 1).unwrap();
let array = TimestampSecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second)));
let array = TimestampMillisecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(
value,
Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond))
);
let array4 = PrimitiveArray::<i64>::from_data(
DataType::Timestamp(ArrowTimeUnit::Millisecond, None),
Buffer::from_slice(&vec![1, 2, 3, 4]),
None,
);
let array = TimestampMicrosecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(
Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)),
arrow_array_get(&array4, 0).unwrap()
);
let array4 = PrimitiveArray::<i64>::from_data(
DataType::Timestamp(ArrowTimeUnit::Nanosecond, None),
Buffer::from_slice(&vec![1, 2, 3, 4]),
None,
value,
Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond))
);
let array = TimestampNanosecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(
Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)),
arrow_array_get(&array4, 0).unwrap()
value,
Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond))
);
// test list array
let data = vec![
Some(vec![Some(1i32), Some(2), Some(3)]),
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ListArray<i32> = arrow_array.into();
let arrow_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
let v0 = arrow_array_get(&arrow_array, 0).unwrap();
match v0 {

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
use common_time::timestamp::TimeUnit;
use paste::paste;
use serde::{Deserialize, Serialize};
@@ -23,13 +23,14 @@ use crate::error::{self, Error, Result};
use crate::type_id::LogicalTypeId;
use crate::types::{
BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type,
Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampType, UInt16Type,
UInt32Type, UInt64Type, UInt8Type,
Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
use crate::value::Value;
use crate::vectors::MutableVector;
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[enum_dispatch::enum_dispatch(DataType)]
pub enum ConcreteDataType {
Null(NullType),
@@ -47,17 +48,21 @@ pub enum ConcreteDataType {
Float32(Float32Type),
Float64(Float64Type),
// String types
// String types:
Binary(BinaryType),
String(StringType),
// Date types:
Date(DateType),
DateTime(DateTimeType),
Timestamp(TimestampType),
// Compound types:
List(ListType),
}
// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method
// returning all these properties to the `DataType` trait
impl ConcreteDataType {
pub fn is_float(&self) -> bool {
matches!(
@@ -70,7 +75,7 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Boolean(_))
}
pub fn stringifiable(&self) -> bool {
pub fn is_stringifiable(&self) -> bool {
matches!(
self,
ConcreteDataType::String(_)
@@ -103,13 +108,6 @@ impl ConcreteDataType {
)
}
pub fn is_timestamp(&self) -> bool {
matches!(
self,
ConcreteDataType::Timestamp(_) | ConcreteDataType::Int64(_)
)
}
pub fn numerics() -> Vec<ConcreteDataType> {
vec![
ConcreteDataType::int8_datatype(),
@@ -161,7 +159,7 @@ impl TryFrom<&ArrowDataType> for ConcreteDataType {
ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(),
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(),
ArrowDataType::List(field) => Self::List(ListType::new(
ConcreteDataType::from_arrow_type(&field.data_type),
ConcreteDataType::from_arrow_type(field.data_type()),
)),
_ => {
return error::UnsupportedArrowTypeSnafu {
@@ -191,38 +189,52 @@ macro_rules! impl_new_concrete_type_functions {
impl_new_concrete_type_functions!(
Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64,
Binary, String, Date, DateTime
Binary, Date, DateTime, String
);
impl ConcreteDataType {
pub fn list_datatype(inner_type: ConcreteDataType) -> ConcreteDataType {
ConcreteDataType::List(ListType::new(inner_type))
pub fn timestamp_second_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default()))
}
/// Returns the `Timestamp` data type wrapping the default
/// [`TimestampMillisecondType`].
pub fn timestamp_millisecond_datatype() -> Self {
    let millisecond = TimestampMillisecondType::default();
    ConcreteDataType::Timestamp(TimestampType::Millisecond(millisecond))
}
/// Returns the `Timestamp` data type wrapping the default
/// [`TimestampMicrosecondType`].
pub fn timestamp_microsecond_datatype() -> Self {
    let microsecond = TimestampMicrosecondType::default();
    ConcreteDataType::Timestamp(TimestampType::Microsecond(microsecond))
}
/// Returns the `Timestamp` data type wrapping the default
/// [`TimestampNanosecondType`].
pub fn timestamp_nanosecond_datatype() -> Self {
    let nanosecond = TimestampNanosecondType::default();
    ConcreteDataType::Timestamp(TimestampType::Nanosecond(nanosecond))
}
pub fn timestamp_datatype(unit: TimeUnit) -> Self {
ConcreteDataType::Timestamp(TimestampType::new(unit))
}
pub fn timestamp_millis_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::new(TimeUnit::Millisecond))
match unit {
TimeUnit::Second => Self::timestamp_second_datatype(),
TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(),
TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(),
TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(),
}
}
/// Converts from arrow timestamp unit to the corresponding timestamp [`ConcreteDataType`].
// TODO(hl): maybe impl From<ArrowTimestamp> for our timestamp ?
pub fn from_arrow_time_unit(t: &arrow::datatypes::TimeUnit) -> Self {
pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self {
match t {
arrow::datatypes::TimeUnit::Second => Self::timestamp_datatype(TimeUnit::Second),
arrow::datatypes::TimeUnit::Millisecond => {
Self::timestamp_datatype(TimeUnit::Millisecond)
}
arrow::datatypes::TimeUnit::Microsecond => {
Self::timestamp_datatype(TimeUnit::Microsecond)
}
arrow::datatypes::TimeUnit::Nanosecond => {
Self::timestamp_datatype(TimeUnit::Nanosecond)
}
ArrowTimeUnit::Second => Self::timestamp_second_datatype(),
ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(),
ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(),
ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(),
}
}
/// Returns the `List` data type wrapping a [`ListType`] created from `item_type`.
pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType {
    let list_type = ListType::new(item_type);
    ConcreteDataType::List(list_type)
}
}
/// Data type abstraction.
@@ -237,11 +249,15 @@ pub trait DataType: std::fmt::Debug + Send + Sync {
/// Returns the default value of this type.
fn default_value(&self) -> Value;
/// Convert this type as [arrow2::datatypes::DataType].
/// Convert this type as [arrow::datatypes::DataType].
fn as_arrow_type(&self) -> ArrowDataType;
/// Create a mutable vector with given `capacity` of this type.
/// Creates a mutable vector with given `capacity` of this type.
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector>;
/// Returns true if the data type is compatible with timestamp type so we can
/// use it as a timestamp.
fn is_timestamp_compatible(&self) -> bool;
}
pub type DataTypeRef = Arc<dyn DataType>;
@@ -324,10 +340,6 @@ mod tests {
ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8),
ConcreteDataType::String(_)
));
assert!(matches!(
ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8),
ConcreteDataType::String(_)
));
assert_eq!(
ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new(
"item",
@@ -345,31 +357,48 @@ mod tests {
#[test]
fn test_from_arrow_timestamp() {
assert_eq!(
ConcreteDataType::timestamp_millis_datatype(),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Millisecond)
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond)
);
assert_eq!(
ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Microsecond)
ConcreteDataType::timestamp_microsecond_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond)
);
assert_eq!(
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Nanosecond)
ConcreteDataType::timestamp_nanosecond_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond)
);
assert_eq!(
ConcreteDataType::timestamp_datatype(TimeUnit::Second),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Second)
ConcreteDataType::timestamp_second_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second)
);
}
#[test]
fn test_is_timestamp() {
assert!(ConcreteDataType::timestamp_millis_datatype().is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp());
assert!(ConcreteDataType::int64_datatype().is_timestamp());
fn test_is_timestamp_compatible() {
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible());
assert!(
ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible()
);
assert!(
ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible()
);
assert!(
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible()
);
assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible());
}
#[test]
@@ -377,4 +406,81 @@ mod tests {
assert!(ConcreteDataType::null_datatype().is_null());
assert!(!ConcreteDataType::int32_datatype().is_null());
}
/// `is_float()` must hold for the two float types and not for `int32`.
#[test]
fn test_is_float() {
    let int32 = ConcreteDataType::int32_datatype();
    assert!(!int32.is_float());

    let floats = [
        ConcreteDataType::float32_datatype(),
        ConcreteDataType::float64_datatype(),
    ];
    for data_type in &floats {
        assert!(data_type.is_float(), "{:?} should be float", data_type);
    }
}
/// `is_boolean()` must hold for the boolean type only.
#[test]
fn test_is_boolean() {
    let non_booleans = [
        ConcreteDataType::int32_datatype(),
        ConcreteDataType::float32_datatype(),
    ];
    for data_type in &non_booleans {
        assert!(!data_type.is_boolean(), "{:?} should not be boolean", data_type);
    }

    let boolean = ConcreteDataType::boolean_datatype();
    assert!(boolean.is_boolean());
}
/// Checks `is_stringifiable()` against string, date, datetime and timestamp
/// types (expected true) and against `int32`/`float32` (expected false).
#[test]
fn test_is_stringifiable() {
    let not_stringifiable = [
        ConcreteDataType::int32_datatype(),
        ConcreteDataType::float32_datatype(),
    ];
    for data_type in &not_stringifiable {
        assert!(
            !data_type.is_stringifiable(),
            "{:?} should not be stringifiable",
            data_type
        );
    }

    let stringifiable = [
        ConcreteDataType::string_datatype(),
        ConcreteDataType::date_datatype(),
        ConcreteDataType::datetime_datatype(),
        ConcreteDataType::timestamp_second_datatype(),
        ConcreteDataType::timestamp_millisecond_datatype(),
        ConcreteDataType::timestamp_microsecond_datatype(),
        ConcreteDataType::timestamp_nanosecond_datatype(),
    ];
    for data_type in &stringifiable {
        assert!(
            data_type.is_stringifiable(),
            "{:?} should be stringifiable",
            data_type
        );
    }
}
/// Checks `is_signed()`: true for signed integers, date, datetime and
/// timestamp types; false for unsigned integers and floats.
#[test]
fn test_is_signed() {
    let signed = [
        ConcreteDataType::int8_datatype(),
        ConcreteDataType::int16_datatype(),
        ConcreteDataType::int32_datatype(),
        ConcreteDataType::int64_datatype(),
        ConcreteDataType::date_datatype(),
        ConcreteDataType::datetime_datatype(),
        ConcreteDataType::timestamp_second_datatype(),
        ConcreteDataType::timestamp_millisecond_datatype(),
        ConcreteDataType::timestamp_microsecond_datatype(),
        ConcreteDataType::timestamp_nanosecond_datatype(),
    ];
    for data_type in &signed {
        assert!(data_type.is_signed(), "{:?} should be signed", data_type);
    }

    let not_signed = [
        ConcreteDataType::uint8_datatype(),
        ConcreteDataType::uint16_datatype(),
        ConcreteDataType::uint32_datatype(),
        ConcreteDataType::uint64_datatype(),
        ConcreteDataType::float32_datatype(),
        ConcreteDataType::float64_datatype(),
    ];
    for data_type in &not_signed {
        assert!(!data_type.is_signed(), "{:?} should not be signed", data_type);
    }
}
/// Checks `is_unsigned()`: true for the unsigned integer types only; false
/// for signed integers, date, datetime, timestamp and float types.
#[test]
fn test_is_unsigned() {
    let not_unsigned = [
        ConcreteDataType::int8_datatype(),
        ConcreteDataType::int16_datatype(),
        ConcreteDataType::int32_datatype(),
        ConcreteDataType::int64_datatype(),
        ConcreteDataType::date_datatype(),
        ConcreteDataType::datetime_datatype(),
        ConcreteDataType::timestamp_second_datatype(),
        ConcreteDataType::timestamp_millisecond_datatype(),
        ConcreteDataType::timestamp_microsecond_datatype(),
        ConcreteDataType::timestamp_nanosecond_datatype(),
        ConcreteDataType::float32_datatype(),
        ConcreteDataType::float64_datatype(),
    ];
    for data_type in &not_unsigned {
        assert!(
            !data_type.is_unsigned(),
            "{:?} should not be unsigned",
            data_type
        );
    }

    let unsigned = [
        ConcreteDataType::uint8_datatype(),
        ConcreteDataType::uint16_datatype(),
        ConcreteDataType::uint32_datatype(),
        ConcreteDataType::uint64_datatype(),
    ];
    for data_type in &unsigned {
        assert!(data_type.is_unsigned(), "{:?} should be unsigned", data_type);
    }
}
/// `numerics()` is expected to list exactly ten data types.
#[test]
fn test_numerics() {
    let numeric_count = ConcreteDataType::numerics().len();
    assert_eq!(10, numeric_count);
}
}

View File

@@ -23,6 +23,7 @@ pub mod prelude;
mod scalars;
pub mod schema;
pub mod serialize;
mod timestamp;
pub mod type_id;
pub mod types;
pub mod value;

View File

@@ -12,27 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
///! Some helper macros for datatypes, copied from databend.
#[macro_export]
macro_rules! for_all_scalar_types {
($macro:tt $(, $x:tt)*) => {
$macro! {
[$($x),*],
{ i8 },
{ i16 },
{ i32 },
{ i64 },
{ u8 },
{ u16 },
{ u32 },
{ u64 },
{ f32 },
{ f64 },
{ bool },
}
};
}
//! Some helper macros for datatypes, copied from databend.
/// Apply the macro rules to all primitive types.
#[macro_export]
macro_rules! for_all_primitive_types {
($macro:tt $(, $x:tt)*) => {
@@ -52,6 +34,8 @@ macro_rules! for_all_primitive_types {
};
}
/// Match the logical type and apply `$body` to all primitive types and
/// `$nbody` to other types.
#[macro_export]
macro_rules! with_match_primitive_type_id {
($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{
@@ -62,17 +46,21 @@ macro_rules! with_match_primitive_type_id {
}
use $crate::type_id::LogicalTypeId;
use $crate::types::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type,
UInt32Type, UInt64Type, UInt8Type,
};
match $key_type {
LogicalTypeId::Int8 => __with_ty__! { i8 },
LogicalTypeId::Int16 => __with_ty__! { i16 },
LogicalTypeId::Int32 => __with_ty__! { i32 },
LogicalTypeId::Int64 => __with_ty__! { i64 },
LogicalTypeId::UInt8 => __with_ty__! { u8 },
LogicalTypeId::UInt16 => __with_ty__! { u16 },
LogicalTypeId::UInt32 => __with_ty__! { u32 },
LogicalTypeId::UInt64 => __with_ty__! { u64 },
LogicalTypeId::Float32 => __with_ty__! { f32 },
LogicalTypeId::Float64 => __with_ty__! { f64 },
LogicalTypeId::Int8 => __with_ty__! { Int8Type },
LogicalTypeId::Int16 => __with_ty__! { Int16Type },
LogicalTypeId::Int32 => __with_ty__! { Int32Type },
LogicalTypeId::Int64 => __with_ty__! { Int64Type },
LogicalTypeId::UInt8 => __with_ty__! { UInt8Type },
LogicalTypeId::UInt16 => __with_ty__! { UInt16Type },
LogicalTypeId::UInt32 => __with_ty__! { UInt32Type },
LogicalTypeId::UInt64 => __with_ty__! { UInt64Type },
LogicalTypeId::Float32 => __with_ty__! { Float32Type },
LogicalTypeId::Float64 => __with_ty__! { Float64Type },
_ => $nbody,
}

View File

@@ -16,8 +16,5 @@ pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef};
pub use crate::macros::*;
pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
pub use crate::type_id::LogicalTypeId;
pub use crate::types::Primitive;
pub use crate::value::{Value, ValueRef};
pub use crate::vectors::{
Helper as VectorHelper, MutableVector, Validity, Vector, VectorBuilder, VectorRef,
};
pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef};

View File

@@ -14,11 +14,17 @@
use std::any::Any;
use common_time::{Date, DateTime, Timestamp};
use common_time::{Date, DateTime};
use crate::prelude::*;
use crate::value::{ListValue, ListValueRef};
use crate::vectors::*;
use crate::types::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type,
UInt64Type, UInt8Type,
};
use crate::value::{ListValue, ListValueRef, Value};
use crate::vectors::{
BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector,
PrimitiveVector, StringVector, Vector,
};
fn get_iter_capacity<T, I: Iterator<Item = T>>(iter: &I) -> usize {
match iter.size_hint() {
@@ -35,7 +41,7 @@ where
for<'a> Self::VectorType: ScalarVector<RefItem<'a> = Self::RefType<'a>>,
{
type VectorType: ScalarVector<OwnedItem = Self>;
type RefType<'a>: ScalarRef<'a, ScalarType = Self, VectorType = Self::VectorType>
type RefType<'a>: ScalarRef<'a, ScalarType = Self>
where
Self: 'a;
/// Get a reference of the current value.
@@ -46,7 +52,6 @@ where
}
pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a {
type VectorType: ScalarVector<RefItem<'a> = Self>;
/// The corresponding [`Scalar`] type.
type ScalarType: Scalar<RefType<'a> = Self>;
@@ -63,7 +68,7 @@ where
{
type OwnedItem: Scalar<VectorType = Self>;
/// The reference item of this vector.
type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem, VectorType = Self>
type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem>
where
Self: 'a;
@@ -137,47 +142,46 @@ pub trait ScalarVectorBuilder: MutableVector {
fn finish(&mut self) -> Self::VectorType;
}
macro_rules! impl_primitive_scalar_type {
($native:ident) => {
impl Scalar for $native {
type VectorType = PrimitiveVector<$native>;
type RefType<'a> = $native;
macro_rules! impl_scalar_for_native {
($Native: ident, $DataType: ident) => {
impl Scalar for $Native {
type VectorType = PrimitiveVector<$DataType>;
type RefType<'a> = $Native;
#[inline]
fn as_scalar_ref(&self) -> $native {
fn as_scalar_ref(&self) -> $Native {
*self
}
#[allow(clippy::needless_lifetimes)]
#[inline]
fn upcast_gat<'short, 'long: 'short>(long: $native) -> $native {
fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native {
long
}
}
/// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`].
impl<'a> ScalarRef<'a> for $native {
type VectorType = PrimitiveVector<$native>;
type ScalarType = $native;
impl<'a> ScalarRef<'a> for $Native {
type ScalarType = $Native;
#[inline]
fn to_owned_scalar(&self) -> $native {
fn to_owned_scalar(&self) -> $Native {
*self
}
}
};
}
impl_primitive_scalar_type!(u8);
impl_primitive_scalar_type!(u16);
impl_primitive_scalar_type!(u32);
impl_primitive_scalar_type!(u64);
impl_primitive_scalar_type!(i8);
impl_primitive_scalar_type!(i16);
impl_primitive_scalar_type!(i32);
impl_primitive_scalar_type!(i64);
impl_primitive_scalar_type!(f32);
impl_primitive_scalar_type!(f64);
impl_scalar_for_native!(u8, UInt8Type);
impl_scalar_for_native!(u16, UInt16Type);
impl_scalar_for_native!(u32, UInt32Type);
impl_scalar_for_native!(u64, UInt64Type);
impl_scalar_for_native!(i8, Int8Type);
impl_scalar_for_native!(i16, Int16Type);
impl_scalar_for_native!(i32, Int32Type);
impl_scalar_for_native!(i64, Int64Type);
impl_scalar_for_native!(f32, Float32Type);
impl_scalar_for_native!(f64, Float64Type);
impl Scalar for bool {
type VectorType = BooleanVector;
@@ -196,7 +200,6 @@ impl Scalar for bool {
}
impl<'a> ScalarRef<'a> for bool {
type VectorType = BooleanVector;
type ScalarType = bool;
#[inline]
@@ -221,7 +224,6 @@ impl Scalar for String {
}
impl<'a> ScalarRef<'a> for &'a str {
type VectorType = StringVector;
type ScalarType = String;
#[inline]
@@ -246,7 +248,6 @@ impl Scalar for Vec<u8> {
}
impl<'a> ScalarRef<'a> for &'a [u8] {
type VectorType = BinaryVector;
type ScalarType = Vec<u8>;
#[inline]
@@ -269,7 +270,6 @@ impl Scalar for Date {
}
impl<'a> ScalarRef<'a> for Date {
type VectorType = DateVector;
type ScalarType = Date;
fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -291,7 +291,6 @@ impl Scalar for DateTime {
}
impl<'a> ScalarRef<'a> for DateTime {
type VectorType = DateTimeVector;
type ScalarType = DateTime;
fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -299,27 +298,7 @@ impl<'a> ScalarRef<'a> for DateTime {
}
}
// Lets `Timestamp` be used as the owned scalar element of a `TimestampVector`.
impl Scalar for Timestamp {
type VectorType = TimestampVector;
// The reference form of a `Timestamp` is `Timestamp` itself, not a borrowed view.
type RefType<'a> = Timestamp;
// Returns the reference form by dereferencing `self`.
fn as_scalar_ref(&self) -> Self::RefType<'_> {
*self
}
// Shortens the lifetime attached to the reference form; the value is passed through unchanged.
fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> {
long
}
}
// Reference-side counterpart of `impl Scalar for Timestamp`: the reference form and the owned
// form are the same type.
impl<'a> ScalarRef<'a> for Timestamp {
type VectorType = TimestampVector;
type ScalarType = Timestamp;
// Converts back to the owned scalar by dereferencing `self`.
fn to_owned_scalar(&self) -> Self::ScalarType {
*self
}
}
// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`.
impl Scalar for ListValue {
type VectorType = ListVector;
@@ -335,7 +314,6 @@ impl Scalar for ListValue {
}
impl<'a> ScalarRef<'a> for ListValueRef<'a> {
type VectorType = ListVector;
type ScalarType = ListValue;
fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -357,8 +335,9 @@ impl<'a> ScalarRef<'a> for ListValueRef<'a> {
#[cfg(test)]
mod tests {
use super::*;
use crate::vectors::binary::BinaryVector;
use crate::vectors::primitive::Int32Vector;
use crate::data_type::ConcreteDataType;
use crate::timestamp::TimestampSecond;
use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector};
fn build_vector_from_slice<T: ScalarVector>(items: &[Option<T::RefItem<'_>>]) -> T {
let mut builder = T::Builder::with_capacity(items.len());
@@ -454,11 +433,11 @@ mod tests {
#[test]
fn test_build_timestamp_vector() {
let expect: Vec<Option<Timestamp>> = vec![Some(10.into()), None, Some(42.into())];
let vector: TimestampVector = build_vector_from_slice(&expect);
let expect: Vec<Option<TimestampSecond>> = vec![Some(10.into()), None, Some(42.into())];
let vector: TimestampSecondVector = build_vector_from_slice(&expect);
assert_vector_eq(&expect, &vector);
let val = vector.get_data(0).unwrap();
assert_eq!(val, val.as_scalar_ref());
assert_eq!(10, val.to_owned_scalar().value());
assert_eq!(TimestampSecond::from(10), val.to_owned_scalar());
}
}

View File

@@ -12,128 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod column_schema;
mod constraint;
mod raw;
use std::collections::HashMap;
use std::sync::Arc;
pub use arrow::datatypes::Metadata;
use arrow::datatypes::{Field, Schema as ArrowSchema};
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, DeserializeSnafu, Error, Result, SerializeSnafu};
use crate::data_type::DataType;
use crate::error::{self, Error, Result};
pub use crate::schema::column_schema::{ColumnSchema, Metadata};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;
use crate::vectors::VectorRef;
/// Key used to store whether the column is time index in arrow field's metadata.
const TIME_INDEX_KEY: &str = "greptime:time_index";
/// Key used to store version number of the schema in metadata.
const VERSION_KEY: &str = "greptime:version";
/// Key used to store default constraint in arrow field's metadata.
const ARROW_FIELD_DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint";
/// Schema of a column, used as an immutable struct.
///
/// Only `name` and `data_type` are public fields; the remaining fields are read through
/// accessor methods and changed through the consuming `with_*` methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ColumnSchema {
/// Name of the column.
pub name: String,
/// Data type of the column.
pub data_type: ConcreteDataType,
/// Whether the column is nullable.
is_nullable: bool,
/// Whether the column is the time index. `with_time_index` also mirrors this flag into
/// `metadata` under `TIME_INDEX_KEY`.
is_time_index: bool,
/// Optional default constraint; validated against the data type and nullability when set
/// through `with_default_constraint`.
default_constraint: Option<ColumnDefaultConstraint>,
/// Additional key-value metadata attached to the column.
metadata: Metadata,
}
impl ColumnSchema {
    /// Builds a column schema from a name, a data type and a nullability flag.
    ///
    /// The result is not a time index, carries no default constraint and starts with
    /// empty metadata.
    pub fn new<T: Into<String>>(
        name: T,
        data_type: ConcreteDataType,
        is_nullable: bool,
    ) -> ColumnSchema {
        let name = name.into();
        Self {
            name,
            data_type,
            is_nullable,
            is_time_index: false,
            default_constraint: None,
            metadata: Metadata::new(),
        }
    }

    /// Returns whether this column is the time index.
    #[inline]
    pub fn is_time_index(&self) -> bool {
        self.is_time_index
    }

    /// Returns whether this column is nullable.
    #[inline]
    pub fn is_nullable(&self) -> bool {
        self.is_nullable
    }

    /// Returns the default constraint of this column, if one is set.
    #[inline]
    pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> {
        self.default_constraint.as_ref()
    }

    /// Returns the metadata attached to this column.
    #[inline]
    pub fn metadata(&self) -> &Metadata {
        &self.metadata
    }

    /// Sets the time index flag and keeps the `TIME_INDEX_KEY` metadata entry in sync:
    /// the entry is inserted with value `"true"` when the flag is set and removed otherwise.
    pub fn with_time_index(mut self, is_time_index: bool) -> Self {
        self.is_time_index = is_time_index;
        if !is_time_index {
            self.metadata.remove(TIME_INDEX_KEY);
            return self;
        }
        self.metadata
            .insert(TIME_INDEX_KEY.to_string(), "true".to_string());
        self
    }

    /// Replaces the default constraint.
    ///
    /// A `Some` constraint is validated against this column's data type and nullability
    /// first; on validation failure the error is returned and the schema is dropped.
    pub fn with_default_constraint(
        mut self,
        default_constraint: Option<ColumnDefaultConstraint>,
    ) -> Result<Self> {
        default_constraint
            .as_ref()
            .map(|constraint| constraint.validate(&self.data_type, self.is_nullable))
            .transpose()?;
        self.default_constraint = default_constraint;
        Ok(self)
    }

    /// Creates a new [`ColumnSchema`] with given metadata.
    ///
    /// The previous metadata map is replaced as a whole.
    pub fn with_metadata(mut self, metadata: Metadata) -> Self {
        self.metadata = metadata;
        self
    }

    /// Creates a vector of `num_rows` default values for this column.
    ///
    /// Returns `Ok(None)` when the column has no default constraint and is not nullable.
    pub fn create_default_vector(&self, num_rows: usize) -> Result<Option<VectorRef>> {
        if let Some(constraint) = &self.default_constraint {
            return constraint
                .create_default_vector(&self.data_type, self.is_nullable, num_rows)
                .map(Some);
        }
        if !self.is_nullable {
            return Ok(None);
        }
        // No default constraint, use null as default value.
        // TODO(yingwen): Use NullVector once it supports setting logical type.
        let nulls = ColumnDefaultConstraint::null_value().create_default_vector(
            &self.data_type,
            self.is_nullable,
            num_rows,
        )?;
        Ok(Some(nulls))
    }
}
/// A common schema, should be immutable.
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Schema {
column_schemas: Vec<ColumnSchema>,
name_to_index: HashMap<String, usize>,
@@ -231,7 +130,7 @@ impl Schema {
}
#[inline]
pub fn metadata(&self) -> &Metadata {
pub fn metadata(&self) -> &HashMap<String, String> {
&self.arrow_schema.metadata
}
}
@@ -243,7 +142,7 @@ pub struct SchemaBuilder {
fields: Vec<Field>,
timestamp_index: Option<usize>,
version: u32,
metadata: Metadata,
metadata: HashMap<String, String>,
}
impl TryFrom<Vec<ColumnSchema>> for SchemaBuilder {
@@ -292,7 +191,7 @@ impl SchemaBuilder {
self.metadata
.insert(VERSION_KEY.to_string(), self.version.to_string());
let arrow_schema = ArrowSchema::from(self.fields).with_metadata(self.metadata);
let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata);
Ok(Schema {
column_schemas: self.column_schemas,
@@ -347,7 +246,7 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us
let column_schema = &column_schemas[timestamp_index];
ensure!(
column_schema.data_type.is_timestamp(),
column_schema.data_type.is_timestamp_compatible(),
error::InvalidTimestampIndexSnafu {
index: timestamp_index,
}
@@ -364,58 +263,6 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us
pub type SchemaRef = Arc<Schema>;
impl TryFrom<&Field> for ColumnSchema {
type Error = Error;
fn try_from(field: &Field) -> Result<ColumnSchema> {
let data_type = ConcreteDataType::try_from(&field.data_type)?;
let mut metadata = field.metadata.clone();
let default_constraint = match metadata.remove(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) {
Some(json) => Some(serde_json::from_str(&json).context(DeserializeSnafu { json })?),
None => None,
};
let is_time_index = metadata.contains_key(TIME_INDEX_KEY);
Ok(ColumnSchema {
name: field.name.clone(),
data_type,
is_nullable: field.is_nullable,
is_time_index,
default_constraint,
metadata,
})
}
}
impl TryFrom<&ColumnSchema> for Field {
    type Error = Error;

    /// Converts a [`ColumnSchema`] into an arrow field.
    ///
    /// The column metadata is copied to the field. A default constraint, if present, is
    /// serialized to JSON and stored under `ARROW_FIELD_DEFAULT_CONSTRAINT_KEY`; the
    /// conversion fails if the column metadata already uses that key.
    fn try_from(column_schema: &ColumnSchema) -> Result<Field> {
        let mut metadata = column_schema.metadata.clone();

        if let Some(constraint) = &column_schema.default_constraint {
            // Adds an additional metadata to store the default constraint.
            let json = serde_json::to_string(&constraint).context(SerializeSnafu)?;
            let previous = metadata.insert(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), json);
            ensure!(
                previous.is_none(),
                error::DuplicateMetaSnafu {
                    key: ARROW_FIELD_DEFAULT_CONSTRAINT_KEY,
                }
            );
        }

        let field = Field::new(
            column_schema.name.clone(),
            column_schema.data_type.as_arrow_type(),
            column_schema.is_nullable(),
        );
        Ok(field.with_metadata(metadata))
    }
}
impl TryFrom<Arc<ArrowSchema>> for Schema {
type Error = Error;
@@ -424,7 +271,7 @@ impl TryFrom<Arc<ArrowSchema>> for Schema {
let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len());
for field in &arrow_schema.fields {
let column_schema = ColumnSchema::try_from(field)?;
name_to_index.insert(field.name.clone(), column_schemas.len());
name_to_index.insert(field.name().to_string(), column_schemas.len());
column_schemas.push(column_schema);
}
@@ -465,7 +312,7 @@ impl TryFrom<ArrowSchema> for Schema {
}
}
fn try_parse_version(metadata: &Metadata, key: &str) -> Result<u32> {
fn try_parse_version(metadata: &HashMap<String, String>, key: &str) -> Result<u32> {
if let Some(value) = metadata.get(key) {
let version = value
.parse()
@@ -479,127 +326,8 @@ fn try_parse_version(metadata: &Metadata, key: &str) -> Result<u32> {
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use super::*;
use crate::value::Value;
// A plain column schema converts to an arrow field and back without loss.
#[test]
fn test_column_schema() {
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true);
let field = Field::try_from(&column_schema).unwrap();
assert_eq!("test", field.name);
assert_eq!(ArrowDataType::Int32, field.data_type);
assert!(field.is_nullable);
// Round trip back to a column schema.
let new_column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!(column_schema, new_column_schema);
}
// The default constraint is not stored in the column metadata; it only appears (as JSON) in the
// metadata of the converted arrow field, and is restored on the way back.
#[test]
fn test_column_schema_with_default_constraint() {
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
.with_default_constraint(Some(ColumnDefaultConstraint::Value(Value::from(99))))
.unwrap();
assert!(column_schema
.metadata()
.get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
.is_none());
let field = Field::try_from(&column_schema).unwrap();
assert_eq!("test", field.name);
assert_eq!(ArrowDataType::Int32, field.data_type);
assert!(field.is_nullable);
// Pins the exact JSON encoding of the constraint.
assert_eq!(
"{\"Value\":{\"Int32\":99}}",
field
.metadata
.get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
.unwrap()
);
let new_column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!(column_schema, new_column_schema);
}
// User metadata survives the conversion to an arrow field and back, alongside the entry that
// carries the default constraint.
#[test]
fn test_column_schema_with_metadata() {
let mut metadata = Metadata::new();
metadata.insert("k1".to_string(), "v1".to_string());
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
.with_metadata(metadata)
.with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
.unwrap();
assert_eq!("v1", column_schema.metadata().get("k1").unwrap());
// The constraint lives in its own field, not in the column metadata.
assert!(column_schema
.metadata()
.get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
.is_none());
let field = Field::try_from(&column_schema).unwrap();
assert_eq!("v1", field.metadata.get("k1").unwrap());
assert!(field
.metadata
.get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
.is_some());
let new_column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!(column_schema, new_column_schema);
}
// Converting to an arrow field fails when the user metadata already occupies the key reserved
// for the default constraint.
#[test]
fn test_column_schema_with_duplicate_metadata() {
let mut metadata = Metadata::new();
metadata.insert(
ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(),
"v1".to_string(),
);
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
.with_metadata(metadata)
.with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
.unwrap();
Field::try_from(&column_schema).unwrap_err();
}
// A null default constraint is rejected for a column created as not nullable.
#[test]
fn test_column_schema_invalid_default_constraint() {
ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false)
.with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
.unwrap_err();
}
// A default constraint converts to bytes and back to an equal value.
#[test]
fn test_column_default_constraint_try_into_from() {
let default_constraint = ColumnDefaultConstraint::Value(Value::from(42i64));
let bytes: Vec<u8> = default_constraint.clone().try_into().unwrap();
let from_value = ColumnDefaultConstraint::try_from(&bytes[..]).unwrap();
assert_eq!(default_constraint, from_value);
}
// A nullable column yields an all-null default vector, whether the null default is implicit
// (no constraint) or given explicitly.
#[test]
fn test_column_schema_create_default_null() {
// Implicit default null.
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true);
let v = column_schema.create_default_vector(5).unwrap().unwrap();
assert_eq!(5, v.len());
assert!(v.only_null());
// Explicit default null.
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
.with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
.unwrap();
let v = column_schema.create_default_vector(5).unwrap().unwrap();
assert_eq!(5, v.len());
assert!(v.only_null());
}
// A non-nullable column without a default constraint has no default vector.
#[test]
fn test_column_schema_no_default() {
let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false);
assert!(column_schema.create_default_vector(5).unwrap().is_none());
}
use crate::data_type::ConcreteDataType;
#[test]
fn test_build_empty_schema() {
@@ -654,8 +382,12 @@ mod tests {
fn test_schema_with_timestamp() {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
.with_time_index(true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
];
let schema = SchemaBuilder::try_from(column_schemas.clone())
.unwrap()

View File

@@ -22,7 +22,7 @@ use snafu::{ensure, ResultExt};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Result};
use crate::value::Value;
use crate::vectors::{Int64Vector, TimestampVector, VectorRef};
use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
const CURRENT_TIMESTAMP: &str = "current_timestamp()";
@@ -81,7 +81,7 @@ impl ColumnDefaultConstraint {
error::UnsupportedDefaultExprSnafu { expr }
);
ensure!(
data_type.is_timestamp(),
data_type.is_timestamp_compatible(),
error::DefaultValueTypeSnafu {
reason: "return value of the function must has timestamp type",
}
@@ -162,8 +162,10 @@ fn create_current_timestamp_vector(
data_type: &ConcreteDataType,
num_rows: usize,
) -> Result<VectorRef> {
// FIXME(yingwen): We should implement cast in VectorOp so that we can cast the millisecond
// vector to other data types and avoid this match.
match data_type {
ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampVector::from_values(
ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values(
std::iter::repeat(util::current_time_millis()).take(num_rows),
))),
ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values(
@@ -217,7 +219,7 @@ mod tests {
fn test_validate_function_constraint() {
let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string());
constraint
.validate(&ConcreteDataType::timestamp_millis_datatype(), false)
.validate(&ConcreteDataType::timestamp_millisecond_datatype(), false)
.unwrap();
constraint
.validate(&ConcreteDataType::boolean_datatype(), false)
@@ -225,7 +227,7 @@ mod tests {
let constraint = ColumnDefaultConstraint::Function("hello()".to_string());
constraint
.validate(&ConcreteDataType::timestamp_millis_datatype(), false)
.validate(&ConcreteDataType::timestamp_millisecond_datatype(), false)
.unwrap_err();
}
@@ -262,7 +264,7 @@ mod tests {
fn test_create_default_vector_by_func() {
let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string());
// Timestamp type.
let data_type = ConcreteDataType::timestamp_millis_datatype();
let data_type = ConcreteDataType::timestamp_millisecond_datatype();
let v = constraint
.create_default_vector(&data_type, false, 4)
.unwrap();
@@ -286,7 +288,7 @@ mod tests {
);
let constraint = ColumnDefaultConstraint::Function("no".to_string());
let data_type = ConcreteDataType::timestamp_millis_datatype();
let data_type = ConcreteDataType::timestamp_millisecond_datatype();
constraint
.create_default_vector(&data_type, false, 4)
.unwrap_err();

View File

@@ -20,7 +20,7 @@ use crate::schema::{ColumnSchema, Schema, SchemaBuilder};
/// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema).
///
/// This struct only contains necessary data to recover the Schema.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RawSchema {
pub column_schemas: Vec<ColumnSchema>,
pub timestamp_index: Option<usize>,
@@ -56,8 +56,12 @@ mod tests {
fn test_raw_convert() {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
.with_time_index(true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
];
let schema = SchemaBuilder::try_from(column_schemas)
.unwrap()

View File

@@ -42,7 +42,10 @@ pub enum LogicalTypeId {
/// seconds/milliseconds/microseconds/nanoseconds, determined by precision.
DateTime,
Timestamp,
TimestampSecond,
TimestampMillisecond,
TimestampMicrosecond,
TimestampNanosecond,
List,
}
@@ -74,7 +77,14 @@ impl LogicalTypeId {
LogicalTypeId::Binary => ConcreteDataType::binary_datatype(),
LogicalTypeId::Date => ConcreteDataType::date_datatype(),
LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(),
LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit
LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(),
LogicalTypeId::TimestampMillisecond => {
ConcreteDataType::timestamp_millisecond_datatype()
}
LogicalTypeId::TimestampMicrosecond => {
ConcreteDataType::timestamp_microsecond_datatype()
}
LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(),
LogicalTypeId::List => {
ConcreteDataType::list_datatype(ConcreteDataType::null_datatype())
}

View File

@@ -14,25 +14,24 @@
mod binary_type;
mod boolean_type;
mod date;
mod datetime;
mod date_type;
mod datetime_type;
mod list_type;
mod null_type;
mod primitive_traits;
mod primitive_type;
mod string_type;
mod timestamp;
mod timestamp_type;
pub use binary_type::BinaryType;
pub use boolean_type::BooleanType;
pub use date::DateType;
pub use datetime::DateTimeType;
pub use date_type::DateType;
pub use datetime_type::DateTimeType;
pub use list_type::ListType;
pub use null_type::NullType;
pub use primitive_traits::{OrdPrimitive, Primitive};
pub use primitive_type::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, PrimitiveElement,
PrimitiveType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
};
pub use string_type::StringType;
pub use timestamp::TimestampType;
pub use timestamp_type::*;

View File

@@ -53,4 +53,8 @@ impl DataType for BinaryType {
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BinaryVectorBuilder::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -52,4 +52,8 @@ impl DataType for BooleanType {
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BooleanVectorBuilder::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -15,15 +15,17 @@
use arrow::datatypes::{DataType as ArrowDataType, Field};
use serde::{Deserialize, Serialize};
use crate::prelude::*;
use crate::value::ListValue;
use crate::data_type::{ConcreteDataType, DataType};
use crate::type_id::LogicalTypeId;
use crate::value::{ListValue, Value};
use crate::vectors::{ListVectorBuilder, MutableVector};
/// Used to represent the List datatype.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ListType {
/// The type of List's inner data.
inner: Box<ConcreteDataType>,
/// The type of List's item.
// Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType.
item_type: Box<ConcreteDataType>,
}
impl Default for ListType {
@@ -33,9 +35,10 @@ impl Default for ListType {
}
impl ListType {
pub fn new(datatype: ConcreteDataType) -> Self {
/// Create a new `ListType` whose item's data type is `item_type`.
pub fn new(item_type: ConcreteDataType) -> Self {
ListType {
inner: Box::new(datatype),
item_type: Box::new(item_type),
}
}
}
@@ -50,20 +53,24 @@ impl DataType for ListType {
}
fn default_value(&self) -> Value {
Value::List(ListValue::new(None, *self.inner.clone()))
Value::List(ListValue::new(None, *self.item_type.clone()))
}
fn as_arrow_type(&self) -> ArrowDataType {
let field = Box::new(Field::new("item", self.inner.as_arrow_type(), true));
let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true));
ArrowDataType::List(field)
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(ListVectorBuilder::with_type_capacity(
*self.inner.clone(),
*self.item_type.clone(),
capacity,
))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}
#[cfg(test)]

View File

@@ -27,7 +27,7 @@ pub struct NullType;
impl NullType {
pub fn arc() -> DataTypeRef {
Arc::new(Self)
Arc::new(NullType)
}
}
@@ -51,4 +51,8 @@ impl DataType for NullType {
fn create_mutable_vector(&self, _capacity: usize) -> Box<dyn MutableVector> {
Box::new(NullVectorBuilder::default())
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -12,12 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::TypeId;
use std::marker::PhantomData;
use std::cmp::Ordering;
use arrow::array::PrimitiveArray;
use arrow::datatypes::DataType as ArrowDataType;
use paste::paste;
use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType};
use common_time::{Date, DateTime};
use num::NumCast;
use serde::{Deserialize, Serialize};
use snafu::OptionExt;
@@ -25,92 +24,226 @@ use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Result};
use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder};
use crate::type_id::LogicalTypeId;
use crate::types::primitive_traits::Primitive;
use crate::types::{DateTimeType, DateType};
use crate::value::{Value, ValueRef};
use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector};
#[derive(Clone, Serialize, Deserialize)]
pub struct PrimitiveType<T: Primitive> {
#[serde(skip)]
_phantom: PhantomData<T>,
/// Data types that can be used as arrow's native type.
///
/// Besides being an [ArrowNativeType], a native type must support numeric casts ([NumCast]).
pub trait NativeType: ArrowNativeType + NumCast {
/// Largest numeric type this primitive type can be cast to.
type LargestType: NativeType;
}
impl<T: Primitive, U: Primitive> PartialEq<PrimitiveType<U>> for PrimitiveType<T> {
fn eq(&self, _other: &PrimitiveType<U>) -> bool {
TypeId::of::<T>() == TypeId::of::<U>()
}
// Implements `NativeType` for `$Type`, with `$LargestType` as its `LargestType`.
macro_rules! impl_native_type {
($Type: ident, $LargestType: ident) => {
impl NativeType for $Type {
type LargestType = $LargestType;
}
};
}
impl<T: Primitive> Eq for PrimitiveType<T> {}
impl_native_type!(u8, u64);
impl_native_type!(u16, u64);
impl_native_type!(u32, u64);
impl_native_type!(u64, u64);
impl_native_type!(i8, i64);
impl_native_type!(i16, i64);
impl_native_type!(i32, i64);
impl_native_type!(i64, i64);
impl_native_type!(f32, f64);
impl_native_type!(f64, f64);
/// A trait that provides helper methods for a primitive type to implement the [PrimitiveVector].
pub trait PrimitiveElement
where
for<'a> Self: Primitive
+ Scalar<VectorType = PrimitiveVector<Self>>
+ ScalarRef<'a, ScalarType = Self, VectorType = PrimitiveVector<Self>>
+ Scalar<RefType<'a> = Self>,
/// Represents a wrapper type that wraps a native type using the newtype pattern. For example,
/// [Date](`common_time::Date`) is a wrapper type for the underlying native type `i32`.
///
/// A wrapper type is a copyable [Scalar] that can be compared for equality, serialized, and
/// converted into a [Value], a `'static` [ValueRef] and a [serde_json::Value].
pub trait WrapperType:
Copy
+ Scalar
+ PartialEq
+ Into<Value>
+ Into<ValueRef<'static>>
+ Serialize
+ Into<serde_json::Value>
{
/// Logical primitive type that this wrapper type belongs to.
type LogicalType: LogicalPrimitiveType<Wrapper = Self, Native = Self::Native>;
/// The underlying native type.
type Native: NativeType;
/// Convert native type into this wrapper type.
fn from_native(value: Self::Native) -> Self;
/// Convert this wrapper type into native type.
fn into_native(self) -> Self::Native;
}
/// Trait bridging the logical primitive type with [ArrowPrimitiveType].
pub trait LogicalPrimitiveType: 'static + Sized {
/// Arrow primitive type of this logical type.
type ArrowPrimitive: ArrowPrimitiveType<Native = Self::Native>;
/// Native (physical) type of this logical type.
type Native: NativeType;
/// Wrapper type that the vector returns.
type Wrapper: WrapperType<LogicalType = Self, Native = Self::Native>
+ for<'a> Scalar<VectorType = PrimitiveVector<Self>, RefType<'a> = Self::Wrapper>
+ for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>;
/// Construct the data type struct.
fn build_data_type() -> ConcreteDataType;
/// Returns the name of the type id.
fn type_name() -> String;
/// Return the name of the type.
fn type_name() -> &'static str;
/// Dynamic cast the vector to the concrete vector type.
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<Self>>;
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<Self>>;
/// Cast value ref to the primitive type.
fn cast_value_ref(value: ValueRef) -> Result<Option<Self>>;
fn cast_value_ref(value: ValueRef) -> Result<Option<Self::Wrapper>>;
}
macro_rules! impl_primitive_element {
($Type:ident, $TypeId:ident) => {
paste::paste! {
impl PrimitiveElement for $Type {
fn build_data_type() -> ConcreteDataType {
ConcreteDataType::$TypeId(PrimitiveType::<$Type>::default())
}
/// A new type for [WrapperType], complement the `Ord` feature for it. Wrapping non ordered
/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that
/// require `Ord`. For example, in `Median` or `Percentile` UDAFs.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct OrdPrimitive<T: WrapperType>(pub T);
fn type_name() -> String {
stringify!($TypeId).to_string()
}
impl<T: WrapperType> OrdPrimitive<T> {
/// Returns a copy of the wrapped value.
pub fn as_primitive(&self) -> T {
self.0
}
}
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<$Type>> {
let primitive_vector = vector
.as_any()
.downcast_ref::<PrimitiveVector<$Type>>()
.with_context(|| error::CastTypeSnafu {
msg: format!(
"Failed to cast {} to vector of primitive type {}",
vector.vector_type_name(),
stringify!($TypeId)
),
})?;
Ok(&primitive_vector.array)
}
// Marker impl required by `Ord`: declares equality on `OrdPrimitive` to be a full equivalence
// relation.
impl<T: WrapperType> Eq for OrdPrimitive<T> {}
fn cast_value_ref(value: ValueRef) -> Result<Option<Self>> {
match value {
ValueRef::Null => Ok(None),
ValueRef::$TypeId(v) => Ok(Some(v.into())),
other => error::CastTypeSnafu {
msg: format!(
"Failed to cast value {:?} to primitive type {}",
other,
stringify!($TypeId),
),
}.fail(),
impl<T: WrapperType> PartialOrd for OrdPrimitive<T> {
// Delegates to the total order of the `Ord` impl, so the result is always `Some`.
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: WrapperType> Ord for OrdPrimitive<T> {
    /// Orders two wrapped values by converting both into [Value] and comparing the results.
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs: Value = self.0.into();
        let rhs: Value = other.0.into();
        lhs.cmp(&rhs)
    }
}
impl<T: WrapperType> From<OrdPrimitive<T>> for Value {
fn from(p: OrdPrimitive<T>) -> Self {
p.0.into()
}
}
// Implements `WrapperType` for the native type `$Type` itself: the type acts as its own
// wrapper, so both conversions return the value unchanged. `$LogicalType` is the logical
// primitive type the native type belongs to.
macro_rules! impl_wrapper {
($Type: ident, $LogicalType: ident) => {
impl WrapperType for $Type {
type LogicalType = $LogicalType;
type Native = $Type;
fn from_native(value: Self::Native) -> Self {
value
}
fn into_native(self) -> Self::Native {
self
}
}
};
}
impl_wrapper!(u8, UInt8Type);
impl_wrapper!(u16, UInt16Type);
impl_wrapper!(u32, UInt32Type);
impl_wrapper!(u64, UInt64Type);
impl_wrapper!(i8, Int8Type);
impl_wrapper!(i16, Int16Type);
impl_wrapper!(i32, Int32Type);
impl_wrapper!(i64, Int64Type);
impl_wrapper!(f32, Float32Type);
impl_wrapper!(f64, Float64Type);
/// `Date` wraps the native type `i32` and belongs to the logical type [DateType].
impl WrapperType for Date {
    type LogicalType = DateType;
    type Native = i32;

    /// Builds a `Date` from its native representation.
    fn from_native(value: Self::Native) -> Self {
        Date::new(value)
    }

    /// Returns the native representation of this `Date`.
    fn into_native(self) -> Self::Native {
        self.val()
    }
}
// `DateTime` wraps the native type `i64` and belongs to the logical type `DateTimeType`.
impl WrapperType for DateTime {
type LogicalType = DateTimeType;
type Native = i64;
// Builds a `DateTime` from its native representation.
fn from_native(value: Self::Native) -> Self {
DateTime::new(value)
}
// Returns the native representation of this `DateTime`.
fn into_native(self) -> Self::Native {
self.val()
}
}
// Defines the logical primitive type struct `$DataType` and implements `LogicalPrimitiveType`
// for it.
//
// - `$Native`: the native Rust type, used as both the `Native` and the `Wrapper` type.
// - `$TypeId`: the variant name used for `ConcreteDataType` and `ValueRef`; also the type name.
// - `$DataType`: the name of the generated struct; the arrow primitive type with the same name
//   is used as `ArrowPrimitive`.
macro_rules! define_logical_primitive_type {
($Native: ident, $TypeId: ident, $DataType: ident) => {
// We need to define it as an empty struct `struct DataType {}` instead of a struct-unit
// `struct DataType;` to ensure the serialized JSON string is compatible with previous
// implementation.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct $DataType {}
impl LogicalPrimitiveType for $DataType {
type ArrowPrimitive = arrow::datatypes::$DataType;
type Native = $Native;
type Wrapper = $Native;
fn build_data_type() -> ConcreteDataType {
ConcreteDataType::$TypeId($DataType::default())
}
fn type_name() -> &'static str {
stringify!($TypeId)
}
// Downcasts the dynamic vector; a vector of any other concrete type is reported as a
// cast error that names both the actual vector type and the expected primitive type.
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> {
vector
.as_any()
.downcast_ref::<PrimitiveVector<$DataType>>()
.with_context(|| error::CastTypeSnafu {
msg: format!(
"Failed to cast {} to vector of primitive type {}",
vector.vector_type_name(),
stringify!($TypeId)
),
})
}
// A null value maps to `None`, a value of the matching variant to `Some`, and any other
// variant is a cast error.
fn cast_value_ref(value: ValueRef) -> Result<Option<$Native>> {
match value {
ValueRef::Null => Ok(None),
ValueRef::$TypeId(v) => Ok(Some(v.into())),
other => error::CastTypeSnafu {
msg: format!(
"Failed to cast value {:?} to primitive type {}",
other,
stringify!($TypeId),
),
}
.fail(),
}
}
}
};
}
macro_rules! impl_numeric {
($Type:ident, $TypeId:ident) => {
impl DataType for PrimitiveType<$Type> {
macro_rules! define_non_timestamp_primitive {
($Native: ident, $TypeId: ident, $DataType: ident) => {
define_logical_primitive_type!($Native, $TypeId, $DataType);
impl DataType for $DataType {
fn name(&self) -> &str {
stringify!($TypeId)
}
@@ -120,7 +253,7 @@ macro_rules! impl_numeric {
}
fn default_value(&self) -> Value {
$Type::default().into()
$Native::default().into()
}
fn as_arrow_type(&self) -> ArrowDataType {
@@ -128,61 +261,98 @@ macro_rules! impl_numeric {
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(PrimitiveVectorBuilder::<$Type>::with_capacity(capacity))
Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity))
}
}
impl std::fmt::Debug for PrimitiveType<$Type> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.name())
fn is_timestamp_compatible(&self) -> bool {
false
}
}
impl Default for PrimitiveType<$Type> {
fn default() -> Self {
Self {
_phantom: PhantomData,
}
}
}
impl_primitive_element!($Type, $TypeId);
paste! {
pub type [<$TypeId Type>]=PrimitiveType<$Type>;
}
};
}
impl_numeric!(u8, UInt8);
impl_numeric!(u16, UInt16);
impl_numeric!(u32, UInt32);
impl_numeric!(u64, UInt64);
impl_numeric!(i8, Int8);
impl_numeric!(i16, Int16);
impl_numeric!(i32, Int32);
impl_numeric!(i64, Int64);
impl_numeric!(f32, Float32);
impl_numeric!(f64, Float64);
define_non_timestamp_primitive!(u8, UInt8, UInt8Type);
define_non_timestamp_primitive!(u16, UInt16, UInt16Type);
define_non_timestamp_primitive!(u32, UInt32, UInt32Type);
define_non_timestamp_primitive!(u64, UInt64, UInt64Type);
define_non_timestamp_primitive!(i8, Int8, Int8Type);
define_non_timestamp_primitive!(i16, Int16, Int16Type);
define_non_timestamp_primitive!(i32, Int32, Int32Type);
define_non_timestamp_primitive!(f32, Float32, Float32Type);
define_non_timestamp_primitive!(f64, Float64, Float64Type);
// `Int64` is defined outside `define_non_timestamp_primitive!` because it is timestamp compatible:
define_logical_primitive_type!(i64, Int64, Int64Type);
// Hand-written `DataType` impl for `Int64Type`. It differs from the impls generated by
// `define_non_timestamp_primitive!` in that `is_timestamp_compatible` returns `true`.
impl DataType for Int64Type {
fn name(&self) -> &str {
"Int64"
}
fn logical_type_id(&self) -> LogicalTypeId {
LogicalTypeId::Int64
}
// The default value is zero.
fn default_value(&self) -> Value {
Value::Int64(0)
}
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Int64
}
// Returns a builder for an `Int64Type` primitive vector with the requested capacity.
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(PrimitiveVectorBuilder::<Int64Type>::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
true
}
}
#[cfg(test)]
mod tests {
use std::collections::BinaryHeap;
use super::*;
#[test]
fn test_eq() {
assert_eq!(UInt8Type::default(), UInt8Type::default());
assert_eq!(UInt16Type::default(), UInt16Type::default());
assert_eq!(UInt32Type::default(), UInt32Type::default());
assert_eq!(UInt64Type::default(), UInt64Type::default());
assert_eq!(Int8Type::default(), Int8Type::default());
assert_eq!(Int16Type::default(), Int16Type::default());
assert_eq!(Int32Type::default(), Int32Type::default());
assert_eq!(Int64Type::default(), Int64Type::default());
assert_eq!(Float32Type::default(), Float32Type::default());
assert_eq!(Float64Type::default(), Float64Type::default());
fn test_ord_primitive() {
struct Foo<T>
where
T: WrapperType,
{
heap: BinaryHeap<OrdPrimitive<T>>,
}
assert_ne!(Float32Type::default(), Float64Type::default());
assert_ne!(Float32Type::default(), Int32Type::default());
impl<T> Foo<T>
where
T: WrapperType,
{
fn push(&mut self, value: T) {
let value = OrdPrimitive::<T>(value);
self.heap.push(value);
}
}
macro_rules! test {
($Type:ident) => {
let mut foo = Foo::<$Type> {
heap: BinaryHeap::new(),
};
foo.push($Type::default());
};
}
test!(u8);
test!(u16);
test!(u32);
test!(u64);
test!(i8);
test!(i16);
test!(i32);
test!(i64);
test!(f32);
test!(f64);
}
}

View File

@@ -18,9 +18,10 @@ use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::StringBytes;
use serde::{Deserialize, Serialize};
use crate::data_type::DataType;
use crate::prelude::{DataTypeRef, LogicalTypeId, Value};
use crate::scalars::ScalarVectorBuilder;
use crate::data_type::{DataType, DataTypeRef};
use crate::prelude::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId;
use crate::value::Value;
use crate::vectors::{MutableVector, StringVectorBuilder};
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -52,4 +53,8 @@ impl DataType for StringType {
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(StringVectorBuilder::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -110,6 +110,7 @@ impl Value {
/// # Panics
/// Panics if the data type is not supported.
pub fn data_type(&self) -> ConcreteDataType {
// TODO(yingwen): Implement this once all data types are implemented.
match self {
Value::Null => ConcreteDataType::null_datatype(),
Value::Boolean(_) => ConcreteDataType::boolean_datatype(),
@@ -125,10 +126,10 @@ impl Value {
Value::Float64(_) => ConcreteDataType::float64_datatype(),
Value::String(_) => ConcreteDataType::string_datatype(),
Value::Binary(_) => ConcreteDataType::binary_datatype(),
Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()),
Value::Date(_) => ConcreteDataType::date_datatype(),
Value::DateTime(_) => ConcreteDataType::datetime_datatype(),
Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()),
Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()),
}
}
@@ -193,7 +194,12 @@ impl Value {
Value::List(_) => LogicalTypeId::List,
Value::Date(_) => LogicalTypeId::Date,
Value::DateTime(_) => LogicalTypeId::DateTime,
Value::Timestamp(_) => LogicalTypeId::Timestamp,
Value::Timestamp(t) => match t.unit() {
TimeUnit::Second => LogicalTypeId::TimestampSecond,
TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond,
TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond,
TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond,
},
}
}
}
@@ -277,6 +283,9 @@ impl_value_from!(Float32, f32);
impl_value_from!(Float64, f64);
impl_value_from!(String, StringBytes);
impl_value_from!(Binary, Bytes);
impl_value_from!(Date, Date);
impl_value_from!(DateTime, DateTime);
impl_value_from!(Timestamp, Timestamp);
impl From<String> for Value {
fn from(string: String) -> Value {
@@ -296,12 +305,6 @@ impl From<Vec<u8>> for Value {
}
}
impl From<Timestamp> for Value {
fn from(v: Timestamp) -> Self {
Value::Timestamp(v)
}
}
impl From<&[u8]> for Value {
fn from(bytes: &[u8]) -> Value {
Value::Binary(bytes.into())
@@ -337,6 +340,7 @@ impl TryFrom<Value> for serde_json::Value {
}
}
// TODO(yingwen): Consider removing the `datatype` field from `ListValue`.
/// List value.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ListValue {
@@ -391,6 +395,7 @@ impl TryFrom<ScalarValue> for Value {
fn try_from(v: ScalarValue) -> Result<Self> {
let v = match v {
ScalarValue::Null => Value::Null,
ScalarValue::Boolean(b) => Value::from(b),
ScalarValue::Float32(f) => Value::from(f),
ScalarValue::Float64(f) => Value::from(f),
@@ -405,8 +410,10 @@ impl TryFrom<ScalarValue> for Value {
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => {
Value::from(s.map(StringBytes::from))
}
ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)),
ScalarValue::List(vs, t) => {
ScalarValue::Binary(b)
| ScalarValue::LargeBinary(b)
| ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)),
ScalarValue::List(vs, field) => {
let items = if let Some(vs) = vs {
let vs = vs
.into_iter()
@@ -416,7 +423,7 @@ impl TryFrom<ScalarValue> for Value {
} else {
None
};
let datatype = t.as_ref().try_into()?;
let datatype = ConcreteDataType::try_from(field.data_type())?;
Value::List(ListValue::new(items, datatype))
}
ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null),
@@ -435,7 +442,13 @@ impl TryFrom<ScalarValue> for Value {
ScalarValue::TimestampNanosecond(t, _) => t
.map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond)))
.unwrap_or(Value::Null),
_ => {
ScalarValue::Decimal128(_, _, _)
| ScalarValue::Time64(_)
| ScalarValue::IntervalYearMonth(_)
| ScalarValue::IntervalDayTime(_)
| ScalarValue::IntervalMonthDayNano(_)
| ScalarValue::Struct(_, _)
| ScalarValue::Dictionary(_, _) => {
return error::UnsupportedArrowTypeSnafu {
arrow_type: v.get_datatype(),
}
@@ -545,15 +558,6 @@ impl<'a> Ord for ValueRef<'a> {
}
}
/// A helper trait that converts copyable types into a `ValueRef`.
///
/// It can be used in place of `Into<ValueRef<'a>>`, which avoids confusion between `Into<Value>`
/// and `Into<ValueRef<'a>>` in generic code. One typical user is the [`Primitive`](crate::primitive_traits::Primitive) trait.
pub trait IntoValueRef<'a> {
/// Converts `self` into a [ValueRef], consuming it.
fn into_value_ref(self) -> ValueRef<'a>;
}
macro_rules! impl_value_ref_from {
($Variant:ident, $Type:ident) => {
impl From<$Type> for ValueRef<'_> {
@@ -562,12 +566,6 @@ macro_rules! impl_value_ref_from {
}
}
impl<'a> IntoValueRef<'a> for $Type {
fn into_value_ref(self) -> ValueRef<'a> {
ValueRef::$Variant(self.into())
}
}
impl From<Option<$Type>> for ValueRef<'_> {
fn from(value: Option<$Type>) -> Self {
match value {
@@ -576,15 +574,6 @@ macro_rules! impl_value_ref_from {
}
}
}
impl<'a> IntoValueRef<'a> for Option<$Type> {
fn into_value_ref(self) -> ValueRef<'a> {
match self {
Some(v) => ValueRef::$Variant(v.into()),
None => ValueRef::Null,
}
}
}
};
}
@@ -599,6 +588,9 @@ impl_value_ref_from!(Int32, i32);
impl_value_ref_from!(Int64, i64);
impl_value_ref_from!(Float32, f32);
impl_value_ref_from!(Float64, f64);
impl_value_ref_from!(Date, Date);
impl_value_ref_from!(DateTime, DateTime);
impl_value_ref_from!(Timestamp, Timestamp);
impl<'a> From<&'a str> for ValueRef<'a> {
fn from(string: &'a str) -> ValueRef<'a> {
@@ -628,6 +620,7 @@ impl<'a> From<Option<ListValueRef<'a>>> for ValueRef<'a> {
/// if it becomes bottleneck.
#[derive(Debug, Clone, Copy)]
pub enum ListValueRef<'a> {
// TODO(yingwen): Consider replacing this with VectorRef.
/// A list addressed by index `idx` within a borrowed [`ListVector`].
Indexed { vector: &'a ListVector, idx: usize },
/// A borrowed [`ListValue`].
Ref { val: &'a ListValue },
}
@@ -785,19 +778,16 @@ mod tests {
Some(Box::new(vec![Value::Int32(1), Value::Null])),
ConcreteDataType::int32_datatype()
)),
ScalarValue::List(
Some(Box::new(vec![
ScalarValue::Int32(Some(1)),
ScalarValue::Int32(None)
])),
Box::new(ArrowDataType::Int32)
ScalarValue::new_list(
Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]),
ArrowDataType::Int32,
)
.try_into()
.unwrap()
);
assert_eq!(
Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())),
ScalarValue::List(None, Box::new(ArrowDataType::UInt32))
ScalarValue::new_list(None, ArrowDataType::UInt32)
.try_into()
.unwrap()
);
@@ -980,6 +970,10 @@ mod tests {
ConcreteDataType::int32_datatype(),
)),
);
check_type_and_value(
&ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()),
&Value::List(ListValue::default()),
);
check_type_and_value(
&ConcreteDataType::date_datatype(),
&Value::Date(Date::new(1)),
@@ -989,7 +983,7 @@ mod tests {
&Value::DateTime(DateTime::new(1)),
);
check_type_and_value(
&ConcreteDataType::timestamp_millis_datatype(),
&ConcreteDataType::timestamp_millisecond_datatype(),
&Value::Timestamp(Timestamp::from_millis(1)),
);
}
@@ -1208,59 +1202,6 @@ mod tests {
assert!(wrong_value.as_list().is_err());
}
// Checks that `into_value_ref()` and `ValueRef::from()` agree for plain and
// optional values of each primitive type, and that `None` maps to `ValueRef::Null`.
#[test]
fn test_into_value_ref() {
// Asserts the conversions for one type: `$Variant` is the expected `ValueRef`
// variant and `$Wrapper` the type stored inside that variant.
macro_rules! check_into_value_ref {
($Variant: ident, $data: expr, $PrimitiveType: ident, $Wrapper: ident) => {
let data: $PrimitiveType = $data;
// Plain value through both conversion paths.
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
data.into_value_ref()
);
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
ValueRef::from(data)
);
// `Some(value)` must convert to the same variant as the plain value.
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
Some(data).into_value_ref()
);
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
ValueRef::from(Some(data))
);
// `None` converts to `ValueRef::Null` through both paths.
let x: Option<$PrimitiveType> = None;
assert_eq!(ValueRef::Null, x.into_value_ref());
assert_eq!(ValueRef::Null, x.into());
};
}
// Shortcut for types whose `ValueRef` variant stores the primitive itself.
macro_rules! check_primitive_into_value_ref {
($Variant: ident, $data: expr, $PrimitiveType: ident) => {
check_into_value_ref!($Variant, $data, $PrimitiveType, $PrimitiveType)
};
}
check_primitive_into_value_ref!(Boolean, true, bool);
check_primitive_into_value_ref!(UInt8, 10, u8);
check_primitive_into_value_ref!(UInt16, 20, u16);
check_primitive_into_value_ref!(UInt32, 30, u32);
check_primitive_into_value_ref!(UInt64, 40, u64);
check_primitive_into_value_ref!(Int8, -10, i8);
check_primitive_into_value_ref!(Int16, -20, i16);
check_primitive_into_value_ref!(Int32, -30, i32);
check_primitive_into_value_ref!(Int64, -40, i64);
// Floats are compared through their ordered wrapper types.
check_into_value_ref!(Float32, 10.0, f32, OrderedF32);
check_into_value_ref!(Float64, 10.0, f64, OrderedF64);
// Byte slices and string slices convert to the borrowed Binary/String variants.
let hello = "hello";
assert_eq!(
ValueRef::Binary(hello.as_bytes()),
ValueRef::from(hello.as_bytes())
);
assert_eq!(ValueRef::String(hello), ValueRef::from(hello));
}
#[test]
fn test_display() {
assert_eq!(Value::Null.to_string(), "Null");
@@ -1301,10 +1242,34 @@ mod tests {
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond),
ConcreteDataType::timestamp_second_datatype(),
))
.to_string(),
"Timestamp[]"
"TimestampSecondType[]"
);
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_millisecond_datatype(),
))
.to_string(),
"TimestampMillisecondType[]"
);
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_microsecond_datatype(),
))
.to_string(),
"TimestampMicrosecondType[]"
);
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_nanosecond_datatype(),
))
.to_string(),
"TimestampNanosecondType[]"
);
}
}

View File

@@ -12,68 +12,59 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod binary;
pub mod boolean;
mod builder;
pub mod constant;
pub mod date;
pub mod datetime;
mod eq;
mod helper;
mod list;
pub mod mutable;
pub mod null;
mod operations;
pub mod primitive;
mod string;
mod timestamp;
use std::any::Any;
use std::fmt::Debug;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef};
use arrow::bitmap::Bitmap;
pub use binary::*;
pub use boolean::*;
pub use builder::VectorBuilder;
pub use constant::*;
pub use date::*;
pub use datetime::*;
pub use helper::Helper;
pub use list::*;
pub use mutable::MutableVector;
pub use null::*;
pub use operations::VectorOp;
pub use primitive::*;
use snafu::ensure;
pub use string::*;
pub use timestamp::*;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::serialize::Serializable;
use crate::value::{Value, ValueRef};
use crate::vectors::operations::VectorOp;
/// Describes which slots of a vector are valid (non-null).
///
/// Either a borrowed per-slot bitmap, or one of two shortcuts for the
/// all-valid and all-null cases.
#[derive(Debug, PartialEq)]
pub enum Validity<'a> {
/// Whether the array slot is valid or not (null).
Slots(&'a Bitmap),
/// All slots are valid.
AllValid,
/// All slots are null.
AllNull,
}
mod binary;
mod boolean;
mod constant;
mod date;
mod datetime;
mod eq;
mod helper;
mod list;
mod null;
mod operations;
mod primitive;
mod string;
mod timestamp;
mod validity;
impl<'a> Validity<'a> {
    /// Returns the per-slot bitmap when this is `Validity::Slots`, and `None`
    /// for every other variant.
    pub fn slots(&self) -> Option<&Bitmap> {
        if let Validity::Slots(bitmap) = self {
            Some(bitmap)
        } else {
            None
        }
    }
}
pub use binary::{BinaryVector, BinaryVectorBuilder};
pub use boolean::{BooleanVector, BooleanVectorBuilder};
pub use constant::ConstantVector;
pub use date::{DateVector, DateVectorBuilder};
pub use datetime::{DateTimeVector, DateTimeVectorBuilder};
pub use helper::Helper;
pub use list::{ListIter, ListVector, ListVectorBuilder};
pub use null::{NullVector, NullVectorBuilder};
pub use primitive::{
Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector,
Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder,
Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector,
UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder,
};
pub use string::{StringVector, StringVectorBuilder};
pub use timestamp::{
TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
TimestampSecondVector, TimestampSecondVectorBuilder,
};
pub use validity::Validity;
// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify
// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
/// Vector of data values.
pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
/// Returns the data type of the vector.
@@ -110,13 +101,7 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
/// The number of null slots on this [`Vector`].
/// # Implementation
/// This is `O(1)`.
fn null_count(&self) -> usize {
match self.validity() {
Validity::Slots(bitmap) => bitmap.null_count(),
Validity::AllValid => 0,
Validity::AllNull => self.len(),
}
}
fn null_count(&self) -> usize;
/// Returns true when it's a ConstantColumn
fn is_const(&self) -> bool {
@@ -165,6 +150,42 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
pub type VectorRef = Arc<dyn Vector>;
/// Mutable vector that could be used to build an immutable vector.
pub trait MutableVector: Send + Sync {
/// Returns the data type of the vector.
fn data_type(&self) -> ConcreteDataType;
/// Returns the length of the vector.
fn len(&self) -> usize;
/// Returns whether the vector is empty, i.e. `len()` is zero.
fn is_empty(&self) -> bool {
self.len() == 0
}
/// Convert to Any, to enable dynamic casting.
fn as_any(&self) -> &dyn Any;
/// Convert to mutable Any, to enable dynamic casting.
fn as_mut_any(&mut self) -> &mut dyn Any;
/// Convert `self` to an (immutable) [VectorRef] and reset `self`.
fn to_vector(&mut self) -> VectorRef;
/// Pushes a value ref to this mutable vector.
///
/// Returns an error if the data type of `value` does not match.
fn push_value_ref(&mut self, value: ValueRef) -> Result<()>;
/// Extends this mutable vector with the `length` elements of `vector` starting at `offset`.
///
/// Returns an error if the data type of `vector` does not match.
///
/// # Panics
/// Panics if `offset + length > vector.len()`.
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
}
/// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function.
macro_rules! impl_try_from_arrow_array_for_vector {
($Array: ident, $Vector: ident) => {
@@ -172,16 +193,20 @@ macro_rules! impl_try_from_arrow_array_for_vector {
pub fn try_from_arrow_array(
array: impl AsRef<dyn arrow::array::Array>,
) -> crate::error::Result<$Vector> {
Ok($Vector::from(
array
.as_ref()
.as_any()
.downcast_ref::<$Array>()
.with_context(|| crate::error::ConversionSnafu {
from: std::format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
use snafu::OptionExt;
let data = array
.as_ref()
.as_any()
.downcast_ref::<$Array>()
.with_context(|| crate::error::ConversionSnafu {
from: std::format!("{:?}", array.as_ref().data_type()),
})?
.data()
.clone();
let concrete_array = $Array::from(data);
Ok($Vector::from(concrete_array))
}
}
};
@@ -189,10 +214,7 @@ macro_rules! impl_try_from_arrow_array_for_vector {
macro_rules! impl_validity_for_vector {
($array: expr) => {
match $array.validity() {
Some(bitmap) => Validity::Slots(bitmap),
None => Validity::AllValid,
}
Validity::from_array_data($array.data())
};
}
@@ -219,10 +241,11 @@ macro_rules! impl_get_ref_for_vector {
}
macro_rules! impl_extend_for_builder {
($mutable_array: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
use snafu::OptionExt;
let concrete_vector = $vector
let sliced_vector = $vector.slice($offset, $length);
let concrete_vector = sliced_vector
.as_any()
.downcast_ref::<$VectorType>()
.with_context(|| crate::error::CastTypeSnafu {
@@ -232,8 +255,9 @@ macro_rules! impl_extend_for_builder {
stringify!($VectorType)
),
})?;
let slice = concrete_vector.array.slice($offset, $length);
$mutable_array.extend_trusted_len(slice.iter());
for value in concrete_vector.iter_data() {
$mutable_vector.push(value);
}
Ok(())
}};
}
@@ -245,27 +269,27 @@ pub(crate) use {
#[cfg(test)]
pub mod tests {
use arrow::array::{Array, PrimitiveArray};
use arrow::array::{Array, Int32Array, UInt8Array};
use serde_json;
use super::helper::Helper;
use super::*;
use crate::data_type::DataType;
use crate::types::PrimitiveElement;
use crate::types::{Int32Type, LogicalPrimitiveType};
use crate::vectors::helper::Helper;
#[test]
fn test_df_columns_to_vector() {
let df_column: Arc<dyn Array> = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3]));
let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
let vector = Helper::try_into_vector(df_column).unwrap();
assert_eq!(
i32::build_data_type().as_arrow_type(),
Int32Type::build_data_type().as_arrow_type(),
vector.data_type().as_arrow_type()
);
}
#[test]
fn test_serialize_i32_vector() {
let df_column: Arc<dyn Array> = Arc::new(PrimitiveArray::<i32>::from_slice(vec![1, 2, 3]));
let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
let json_value = Helper::try_into_vector(df_column)
.unwrap()
.serialize_to_json()
@@ -275,7 +299,7 @@ pub mod tests {
#[test]
fn test_serialize_i8_vector() {
let df_column: Arc<dyn Array> = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8]));
let df_column: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
let json_value = Helper::try_into_vector(df_column)
.unwrap()
.serialize_to_json()

View File

@@ -15,9 +15,8 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef};
use arrow::array::{ArrayIter, GenericByteArray};
use snafu::{OptionExt, ResultExt};
use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef};
use snafu::ResultExt;
use crate::arrow_array::{BinaryArray, MutableBinaryArray};
use crate::data_type::ConcreteDataType;
@@ -37,6 +36,16 @@ impl BinaryVector {
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
/// Returns a clone of the `ArrayData` behind the inner arrow array.
fn to_array_data(&self) -> ArrayData {
    let data = self.array.data();
    data.clone()
}
/// Builds a `BinaryVector` whose inner array is constructed from `data`.
fn from_array_data(data: ArrayData) -> BinaryVector {
    let array = BinaryArray::from(data);
    BinaryVector { array }
}
}
impl From<BinaryArray> for BinaryVector {
@@ -48,7 +57,7 @@ impl From<BinaryArray> for BinaryVector {
impl From<Vec<Option<Vec<u8>>>> for BinaryVector {
fn from(data: Vec<Option<Vec<u8>>>) -> Self {
Self {
array: BinaryArray::from(data),
array: BinaryArray::from_iter(data),
}
}
}
@@ -71,11 +80,13 @@ impl Vector for BinaryVector {
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
let data = self.to_array_data();
Arc::new(BinaryArray::from(data))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
Box::new(self.array.clone())
let data = self.to_array_data();
Box::new(BinaryArray::from(data))
}
fn validity(&self) -> Validity {
@@ -83,7 +94,11 @@ impl Vector for BinaryVector {
}
fn memory_size(&self) -> usize {
self.array.values().len() + self.array.offsets().len() * std::mem::size_of::<i64>()
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, row: usize) -> bool {
@@ -91,7 +106,8 @@ impl Vector for BinaryVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(Self::from(self.array.slice(offset, length)))
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
}
fn get(&self, index: usize) -> Value {
@@ -148,12 +164,15 @@ impl MutableVector for BinaryVectorBuilder {
}
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
self.mutable_array.push(value.as_binary()?);
match value.as_binary()? {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
vectors::impl_extend_for_builder!(self.mutable_array, vector, BinaryVector, offset, length)
vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length)
}
}
@@ -162,17 +181,20 @@ impl ScalarVectorBuilder for BinaryVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutableBinaryArray::with_capacity(capacity),
mutable_array: MutableBinaryArray::with_capacity(capacity, 0),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.mutable_array.push(value);
match value {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
}
fn finish(&mut self) -> Self::VectorType {
BinaryVector {
array: std::mem::take(&mut self.mutable_array).into(),
array: self.mutable_array.finish(),
}
}
}
@@ -205,14 +227,17 @@ mod tests {
#[test]
fn test_binary_vector_misc() {
let v = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]));
let v = BinaryVector::from(BinaryArray::from_iter_values(&[
vec![1, 2, 3],
vec![1, 2, 3],
]));
assert_eq!(2, v.len());
assert_eq!("BinaryVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(v.validity().is_all_valid());
assert!(!v.only_null());
assert_eq!(30, v.memory_size());
assert_eq!(128, v.memory_size());
for i in 0..2 {
assert!(!v.is_null(i));
@@ -227,7 +252,10 @@ mod tests {
#[test]
fn test_serialize_binary_vector_to_json() {
let vector = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]));
let vector = BinaryVector::from(BinaryArray::from_iter_values(&[
vec![1, 2, 3],
vec![1, 2, 3],
]));
let json_value = vector.serialize_to_json().unwrap();
assert_eq!(
@@ -253,8 +281,8 @@ mod tests {
#[test]
fn test_from_arrow_array() {
let arrow_array = BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]);
let original = arrow_array.clone();
let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]);
let original = BinaryArray::from(arrow_array.data().clone());
let vector = BinaryVector::from(arrow_array);
assert_eq!(original, vector.array);
}
@@ -289,7 +317,7 @@ mod tests {
builder.push(Some(b"world"));
let vector = builder.finish();
assert_eq!(0, vector.null_count());
assert_eq!(Validity::AllValid, vector.validity());
assert!(vector.validity().is_all_valid());
let mut builder = BinaryVectorBuilder::with_capacity(3);
builder.push(Some(b"hello"));
@@ -298,9 +326,10 @@ mod tests {
let vector = builder.finish();
assert_eq!(1, vector.null_count());
let validity = vector.validity();
let slots = validity.slots().unwrap();
assert_eq!(1, slots.null_count());
assert!(!slots.get_bit(1));
assert!(!validity.is_set(1));
assert_eq!(1, validity.null_count());
assert!(!validity.is_set(1));
}
#[test]

View File

@@ -16,9 +16,10 @@ use std::any::Any;
use std::borrow::Borrow;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray};
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
use snafu::{OptionExt, ResultExt};
use arrow::array::{
Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder,
};
use snafu::ResultExt;
use crate::data_type::ConcreteDataType;
use crate::error::Result;
@@ -41,12 +42,26 @@ impl BooleanVector {
pub(crate) fn as_boolean_array(&self) -> &BooleanArray {
&self.array
}
/// Returns a clone of the `ArrayData` behind the inner `BooleanArray`.
fn to_array_data(&self) -> ArrayData {
    let data = self.array.data();
    data.clone()
}
/// Builds a `BooleanVector` whose inner array is constructed from `data`.
fn from_array_data(data: ArrayData) -> BooleanVector {
    let array = BooleanArray::from(data);
    BooleanVector { array }
}
/// Returns the inner array's `false_count()`.
///
/// NOTE(review): whether null slots are included in this count depends on the
/// arrow `BooleanArray::false_count` semantics — confirm before relying on it
/// for length arithmetic.
pub(crate) fn false_count(&self) -> usize {
self.array.false_count()
}
}
impl From<Vec<bool>> for BooleanVector {
fn from(data: Vec<bool>) -> Self {
BooleanVector {
array: BooleanArray::from_slice(&data),
array: BooleanArray::from(data),
}
}
}
@@ -91,11 +106,13 @@ impl Vector for BooleanVector {
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
let data = self.to_array_data();
Arc::new(BooleanArray::from(data))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
Box::new(self.array.clone())
let data = self.to_array_data();
Box::new(BooleanArray::from(data))
}
fn validity(&self) -> Validity {
@@ -103,7 +120,11 @@ impl Vector for BooleanVector {
}
fn memory_size(&self) -> usize {
self.array.values().as_slice().0.len()
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, row: usize) -> bool {
@@ -111,7 +132,8 @@ impl Vector for BooleanVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(Self::from(self.array.slice(offset, length)))
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
}
fn get(&self, index: usize) -> Value {
@@ -126,7 +148,7 @@ impl Vector for BooleanVector {
impl ScalarVector for BooleanVector {
type OwnedItem = bool;
type RefItem<'a> = bool;
type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>;
type Iter<'a> = ArrayIter<&'a BooleanArray>;
type Builder = BooleanVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -143,7 +165,7 @@ impl ScalarVector for BooleanVector {
}
pub struct BooleanVectorBuilder {
mutable_array: MutableBooleanArray,
mutable_array: BooleanBuilder,
}
impl MutableVector for BooleanVectorBuilder {
@@ -168,12 +190,15 @@ impl MutableVector for BooleanVectorBuilder {
}
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
self.mutable_array.push(value.as_boolean()?);
match value.as_boolean()? {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
vectors::impl_extend_for_builder!(self.mutable_array, vector, BooleanVector, offset, length)
vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length)
}
}
@@ -182,17 +207,20 @@ impl ScalarVectorBuilder for BooleanVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutableBooleanArray::with_capacity(capacity),
mutable_array: BooleanBuilder::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.mutable_array.push(value);
match value {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
}
fn finish(&mut self) -> Self::VectorType {
BooleanVector {
array: std::mem::take(&mut self.mutable_array).into(),
array: self.mutable_array.finish(),
}
}
}
@@ -225,9 +253,9 @@ mod tests {
assert_eq!(9, v.len());
assert_eq!("BooleanVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(v.validity().is_all_valid());
assert!(!v.only_null());
assert_eq!(2, v.memory_size());
assert_eq!(64, v.memory_size());
for (i, b) in bools.iter().enumerate() {
assert!(!v.is_null(i));
@@ -316,13 +344,12 @@ mod tests {
let vector = BooleanVector::from(vec![Some(true), None, Some(false)]);
assert_eq!(1, vector.null_count());
let validity = vector.validity();
let slots = validity.slots().unwrap();
assert_eq!(1, slots.null_count());
assert!(!slots.get_bit(1));
assert_eq!(1, validity.null_count());
assert!(!validity.is_set(1));
let vector = BooleanVector::from(vec![true, false, false]);
assert_eq!(0, vector.null_count());
assert_eq!(Validity::AllValid, vector.validity());
assert!(vector.validity().is_all_valid());
}
#[test]

View File

@@ -55,6 +55,27 @@ impl ConstantVector {
pub fn get_constant_ref(&self) -> ValueRef {
self.vector.get_ref(0)
}
/// Replicates this constant vector according to `offsets`.
///
/// With empty `offsets` an empty slice of `self` is returned; otherwise the
/// result is a new `ConstantVector` over the same inner vector, sized by the
/// last entry of `offsets`.
///
/// # Panics
/// Panics if `offsets.len() != self.len()`.
pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef {
    assert_eq!(offsets.len(), self.len());

    match offsets.last() {
        None => self.slice(0, 0),
        Some(&total) => Arc::new(ConstantVector::new(self.vector.clone(), total)),
    }
}
/// Filters this constant vector with `filter`.
///
/// The resulting length is `self.len()` minus `filter.false_count()`. When
/// nothing is removed a clone of `self` is returned; otherwise a new
/// `ConstantVector` over the same inner vector with the reduced length.
///
/// NOTE(review): this assumes `filter.false_count()` covers every slot that
/// should be dropped; confirm how null slots in `filter` are counted.
pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result<VectorRef> {
    let kept = self.len() - filter.false_count();
    let filtered = if kept == self.len() {
        self.clone()
    } else {
        ConstantVector::new(self.inner().clone(), kept)
    };

    Ok(Arc::new(filtered))
}
}
impl Vector for ConstantVector {
@@ -90,9 +111,9 @@ impl Vector for ConstantVector {
fn validity(&self) -> Validity {
if self.vector.is_null(0) {
Validity::AllNull
Validity::all_null(self.length)
} else {
Validity::AllValid
Validity::all_valid(self.length)
}
}
@@ -122,6 +143,14 @@ impl Vector for ConstantVector {
fn get_ref(&self, _index: usize) -> ValueRef {
self.vector.get_ref(0)
}
/// Returns `self.len()` when `only_null()` reports true, and zero otherwise.
fn null_count(&self) -> usize {
    if !self.only_null() {
        return 0;
    }

    self.len()
}
}
impl fmt::Debug for ConstantVector {
@@ -140,33 +169,6 @@ impl Serializable for ConstantVector {
}
}
/// Replicates the constant `vector` according to `offsets`.
///
/// With empty `offsets` an empty slice of `vector` is returned; otherwise the
/// result is a new `ConstantVector` over the same inner vector, sized by the
/// last entry of `offsets`.
///
/// # Panics
/// Panics if `offsets.len() != vector.len()`.
pub(crate) fn replicate_constant(vector: &ConstantVector, offsets: &[usize]) -> VectorRef {
    assert_eq!(offsets.len(), vector.len());

    match offsets.last() {
        None => vector.slice(0, 0),
        Some(&total) => Arc::new(ConstantVector::new(vector.vector.clone(), total)),
    }
}
/// Filters the constant `vector` with `filter`.
///
/// The resulting length is `filter.len()` minus the `null_count()` of the
/// filter's value bitmap (presumably the number of unset, i.e. `false`,
/// bits — confirm against the bitmap API). When that equals `vector.len()` a
/// clone of `vector` is returned; otherwise a new `ConstantVector` over the
/// same inner vector with the reduced length.
pub(crate) fn filter_constant(
    vector: &ConstantVector,
    filter: &BooleanVector,
) -> Result<VectorRef> {
    let unset = filter.as_boolean_array().values().null_count();
    let kept = filter.len() - unset;
    let filtered = if kept == vector.len() {
        vector.clone()
    } else {
        ConstantVector::new(vector.inner().clone(), kept)
    };

    Ok(Arc::new(filtered))
}
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
@@ -182,9 +184,9 @@ mod tests {
assert_eq!("ConstantVector", c.vector_type_name());
assert!(c.is_const());
assert_eq!(10, c.len());
assert_eq!(Validity::AllValid, c.validity());
assert!(c.validity().is_all_valid());
assert!(!c.only_null());
assert_eq!(4, c.memory_size());
assert_eq!(64, c.memory_size());
for i in 0..10 {
assert!(!c.is_null(i));

View File

@@ -12,258 +12,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::sync::Arc;
use crate::types::DateType;
use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use common_time::date::Date;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::prelude::*;
use crate::scalars::ScalarVector;
use crate::serialize::Serializable;
use crate::vectors::{MutableVector, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
/// Vector of dates, stored as a wrapped `PrimitiveVector<i32>`.
#[derive(Debug, Clone, PartialEq)]
pub struct DateVector {
// Underlying i32 storage; each element is turned into a `Date` via `Date::new` on access.
array: PrimitiveVector<i32>,
}
impl DateVector {
    /// Creates a `DateVector` that wraps the given arrow `i32` array.
    pub fn new(array: PrimitiveArray<i32>) -> Self {
        let array = PrimitiveVector { array };
        Self { array }
    }

    /// Tries to build a `DateVector` from a dynamically typed arrow array.
    ///
    /// # Errors
    /// Returns a conversion error (carrying the array's data type) when the
    /// array cannot be downcast to `PrimitiveArray<i32>`.
    pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
        let array = array.as_ref();
        let primitive = array
            .as_any()
            .downcast_ref::<PrimitiveArray<i32>>()
            .with_context(|| error::ConversionSnafu {
                from: format!("{:?}", array.data_type()),
            })?;

        Ok(Self::new(primitive.clone()))
    }

    /// Borrows the underlying storage as a dynamically typed arrow array.
    pub(crate) fn as_arrow(&self) -> &dyn Array {
        self.array.as_arrow()
    }
}
/// `Vector` implementation that delegates to the wrapped `PrimitiveVector<i32>`
/// and converts the raw `i32` values into `Date`s on access.
impl Vector for DateVector {
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::date_datatype()
    }

    fn vector_type_name(&self) -> String {
        String::from("DateVector")
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn len(&self) -> usize {
        self.array.len()
    }

    /// Rebuilds the storage as an arrow array typed `Date32`, sharing the
    /// cloned value buffer and validity of the inner array.
    fn to_arrow_array(&self) -> ArrayRef {
        let inner = &self.array.array;
        Arc::new(PrimitiveArray::new(
            arrow::datatypes::DataType::Date32,
            inner.values().clone(),
            inner.validity().cloned(),
        ))
    }

    /// Same as `to_arrow_array`, but returns the array in a `Box`.
    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
        let inner = &self.array.array;
        Box::new(PrimitiveArray::new(
            arrow::datatypes::DataType::Date32,
            inner.values().clone(),
            inner.validity().cloned(),
        ))
    }

    fn validity(&self) -> Validity {
        self.array.validity()
    }

    fn memory_size(&self) -> usize {
        self.array.memory_size()
    }

    fn is_null(&self, row: usize) -> bool {
        self.array.is_null(row)
    }

    fn slice(&self, offset: usize, length: usize) -> VectorRef {
        let sliced = self.array.array.slice(offset, length);
        Arc::new(Self {
            array: PrimitiveVector::new(sliced),
        })
    }

    /// Returns the value at `index` as `Value::Date`, or `Value::Null`.
    fn get(&self, index: usize) -> Value {
        match self.array.get(index) {
            Value::Null => Value::Null,
            Value::Int32(days) => Value::Date(Date::new(days)),
            // The inner vector is i32-typed, so no other variant is expected.
            _ => unreachable!(),
        }
    }

    /// Returns the value at `index` as `ValueRef::Date`, or `ValueRef::Null`.
    fn get_ref(&self, index: usize) -> ValueRef {
        match self.array.get(index) {
            Value::Null => ValueRef::Null,
            Value::Int32(days) => ValueRef::Date(Date::new(days)),
            // The inner vector is i32-typed, so no other variant is expected.
            _ => unreachable!(),
        }
    }
}
impl From<Vec<Option<i32>>> for DateVector {
    /// Builds a `DateVector` from optional raw `i32` values; `None` becomes a null row.
    fn from(data: Vec<Option<i32>>) -> Self {
        let array = PrimitiveVector::<i32>::from(data);
        Self { array }
    }
}
/// Iterator over the rows of a [`DateVector`], yielding `Option<Date>` per row.
pub struct DateIter<'a> {
// Iterator over the underlying raw `i32` values.
iter: PrimitiveIter<'a, i32>,
}
impl<'a> Iterator for DateIter<'a> {
    type Item = Option<Date>;

    /// Advances the inner iterator and wraps a present raw value into a `Date`.
    fn next(&mut self) -> Option<Self::Item> {
        let raw = self.iter.next()?;
        Some(raw.map(Date::new))
    }
}
impl ScalarVector for DateVector {
    type OwnedItem = Date;
    type RefItem<'a> = Date;
    type Iter<'a> = DateIter<'a>;
    type Builder = DateVectorBuilder;

    /// Returns the date at `idx`, or `None` when the inner vector yields no value.
    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
        let raw = self.array.get_data(idx)?;
        Some(Date::new(raw))
    }

    /// Returns an iterator producing `Option<Date>` for every row.
    fn iter_data(&self) -> Self::Iter<'_> {
        let iter = self.array.iter_data();
        DateIter { iter }
    }
}
impl Serializable for DateVector {
    /// Serializes every row to JSON: null rows become `Value::Null`, other rows
    /// use the `Date` to JSON conversion.
    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
        let json_values: Vec<serde_json::Value> = self
            .array
            .iter_data()
            .map(|raw| match raw.map(Date::new) {
                Some(date) => date.into(),
                None => serde_json::Value::Null,
            })
            .collect();
        Ok(json_values)
    }
}
/// Builder that accumulates dates and produces a [`DateVector`].
pub struct DateVectorBuilder {
// Builder for the underlying raw `i32` values.
buffer: PrimitiveVectorBuilder<i32>,
}
impl MutableVector for DateVectorBuilder {
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::date_datatype()
    }

    fn len(&self) -> usize {
        self.buffer.len()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }

    /// Finishes the builder and returns the result as a shared vector.
    fn to_vector(&mut self) -> VectorRef {
        let vector = self.finish();
        Arc::new(vector)
    }

    /// Appends `value` after converting it to a date.
    ///
    /// # Errors
    /// Propagates the error from `ValueRef::as_date` when the conversion fails.
    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
        let raw = value.as_date()?.map(|date| date.val());
        self.buffer.push(raw);
        Ok(())
    }

    /// Appends `length` rows of `vector`, starting at `offset`.
    ///
    /// # Errors
    /// Returns a cast-type error when `vector` is not a `DateVector`, and
    /// propagates any error from the inner builder.
    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
        let source = vector
            .as_any()
            .downcast_ref::<DateVector>()
            .with_context(|| error::CastTypeSnafu {
                msg: format!(
                    "Failed to convert vector from {} to DateVector",
                    vector.vector_type_name()
                ),
            })?;
        self.buffer.extend_slice_of(&source.array, offset, length)?;
        Ok(())
    }
}
impl ScalarVectorBuilder for DateVectorBuilder {
    type VectorType = DateVector;

    /// Creates a builder whose inner buffer is created with the given capacity.
    fn with_capacity(capacity: usize) -> Self {
        let buffer = PrimitiveVectorBuilder::with_capacity(capacity);
        Self { buffer }
    }

    /// Appends an optional date, storing its raw value (`None` is pushed as-is).
    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
        let raw = value.map(|date| date.val());
        self.buffer.push(raw)
    }

    /// Finishes the inner builder and wraps the result in a `DateVector`.
    fn finish(&mut self) -> Self::VectorType {
        let array = self.buffer.finish();
        Self::VectorType { array }
    }
}
/// Replicates `vector` according to `offsets` by delegating to
/// `replicate_primitive_with_type`, and wraps the result in a `DateVector`.
pub(crate) fn replicate_date(vector: &DateVector, offsets: &[usize]) -> VectorRef {
    let data_type = vector.data_type();
    let array =
        crate::vectors::primitive::replicate_primitive_with_type(&vector.array, offsets, data_type);
    Arc::new(DateVector { array })
}
/// Vector for [`Date`](common_time::Date).
pub type DateVector = PrimitiveVector<DateType>;
/// Builder to build [`DateVector`].
pub type DateVectorBuilder = PrimitiveVectorBuilder<DateType>;
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::Array;
use common_time::date::Date;
use super::*;
use crate::data_type::DataType;
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::DateType;
use crate::value::{Value, ValueRef};
use crate::vectors::{Vector, VectorRef};
#[test]
fn test_build_date_vector() {
@@ -288,7 +58,7 @@ mod tests {
#[test]
fn test_date_scalar() {
let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]);
let vector = DateVector::from_slice(&[1, 2]);
assert_eq!(2, vector.len());
assert_eq!(Some(Date::new(1)), vector.get_data(0));
assert_eq!(Some(Date::new(2)), vector.get_data(1));
@@ -296,7 +66,7 @@ mod tests {
#[test]
fn test_date_vector_builder() {
let input = DateVector::from_slice(&[Date::new(1), Date::new(2), Date::new(3)]);
let input = DateVector::from_slice(&[1, 2, 3]);
let mut builder = DateType::default().create_mutable_vector(3);
builder
@@ -309,19 +79,25 @@ mod tests {
.is_err());
let vector = builder.to_vector();
let expect: VectorRef = Arc::new(DateVector::from_slice(&[
Date::new(5),
Date::new(2),
Date::new(3),
]));
let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3]));
assert_eq!(expect, vector);
}
#[test]
fn test_date_from_arrow() {
let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]);
let vector = DateVector::from_slice(&[1, 2]);
let arrow = vector.as_arrow().slice(0, vector.len());
let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap();
assert_eq!(vector, vector2);
}
#[test]
fn test_serialize_date_vector() {
    // -1, 0 and 1 are expected to serialize to the day before, on, and after 1970-01-01.
    let vector = DateVector::from_slice(&[-1, 0, 1]);
    let json_values = vector.serialize_to_json().unwrap();
    let serialized_json = serde_json::to_string(&json_values).unwrap();
    let expected = r#"["1969-12-31","1970-01-01","1970-01-02"]"#;
    assert_eq!(expected, serialized_json);
}
}

View File

@@ -12,264 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::sync::Arc;
use crate::types::DateTimeType;
use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use common_time::datetime::DateTime;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::prelude::{
MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef,
};
use crate::serialize::Serializable;
use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
/// Vector of datetimes, stored as raw `i64` values inside a [`PrimitiveVector`].
///
/// Each raw value is wrapped into a `DateTime` via `DateTime::new` when read back.
#[derive(Debug, Clone, PartialEq)]
pub struct DateTimeVector {
// Underlying primitive vector holding the raw `i64` representation.
array: PrimitiveVector<i64>,
}
impl DateTimeVector {
    /// Creates a `DateTimeVector` that takes ownership of the given `i64` arrow array.
    pub fn new(array: PrimitiveArray<i64>) -> Self {
        let array = PrimitiveVector { array };
        Self { array }
    }

    /// Builds a `DateTimeVector` from any arrow array that is physically a
    /// `PrimitiveArray<i64>`.
    ///
    /// # Errors
    /// Returns a conversion error carrying the array's arrow data type when the
    /// downcast to `PrimitiveArray<i64>` fails.
    pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
        let arrow_array = array.as_ref();
        let primitive = arrow_array
            .as_any()
            .downcast_ref::<PrimitiveArray<i64>>()
            .with_context(|| error::ConversionSnafu {
                from: format!("{:?}", arrow_array.data_type()),
            })?;
        Ok(Self::new(primitive.clone()))
    }

    /// Borrows the underlying arrow array as a trait object.
    pub(crate) fn as_arrow(&self) -> &dyn Array {
        self.array.as_arrow()
    }
}
impl Vector for DateTimeVector {
    /// The logical type of this vector is always the datetime type.
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::datetime_datatype()
    }

    fn vector_type_name(&self) -> String {
        String::from("DateTimeVector")
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn len(&self) -> usize {
        self.array.len()
    }

    /// Re-tags the stored `i64` values as an arrow `Date64` array behind an `Arc`.
    fn to_arrow_array(&self) -> ArrayRef {
        let inner = &self.array.array;
        let datetime_array = PrimitiveArray::new(
            arrow::datatypes::DataType::Date64,
            inner.values().clone(),
            inner.validity().cloned(),
        );
        Arc::new(datetime_array)
    }

    /// Same as [`Vector::to_arrow_array`], but returns the array in a `Box`.
    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
        let inner = &self.array.array;
        let datetime_array = PrimitiveArray::new(
            arrow::datatypes::DataType::Date64,
            inner.values().clone(),
            inner.validity().cloned(),
        );
        Box::new(datetime_array)
    }

    fn validity(&self) -> Validity {
        self.array.validity()
    }

    fn memory_size(&self) -> usize {
        self.array.memory_size()
    }

    fn is_null(&self, row: usize) -> bool {
        self.array.is_null(row)
    }

    /// Returns a new `DateTimeVector` over `length` rows starting at `offset`.
    fn slice(&self, offset: usize, length: usize) -> VectorRef {
        let sliced = self.array.array.slice(offset, length);
        Arc::new(Self {
            array: PrimitiveVector::new(sliced),
        })
    }

    /// Reads the row at `index` and converts the raw `i64` into a `Value::DateTime`.
    fn get(&self, index: usize) -> Value {
        match self.array.get(index) {
            Value::Null => Value::Null,
            Value::Int64(raw) => Value::DateTime(DateTime::new(raw)),
            // The inner vector only stores `i64`, so no other variant can appear.
            _ => unreachable!(),
        }
    }

    /// Reads the row at `index` and converts the raw `i64` into a `ValueRef::DateTime`.
    fn get_ref(&self, index: usize) -> ValueRef {
        match self.array.get(index) {
            Value::Null => ValueRef::Null,
            Value::Int64(raw) => ValueRef::DateTime(DateTime::new(raw)),
            // The inner vector only stores `i64`, so no other variant can appear.
            _ => unreachable!(),
        }
    }
}
impl Serializable for DateTimeVector {
    /// Serializes every row to JSON: null rows become `Value::Null`, other rows
    /// use the `DateTime` to JSON conversion.
    fn serialize_to_json(&self) -> crate::Result<Vec<serde_json::Value>> {
        let json_values: Vec<serde_json::Value> = self
            .array
            .iter_data()
            .map(|raw| match raw.map(DateTime::new) {
                Some(datetime) => datetime.into(),
                None => serde_json::Value::Null,
            })
            .collect();
        Ok(json_values)
    }
}
impl From<Vec<Option<i64>>> for DateTimeVector {
    /// Builds a `DateTimeVector` from optional raw `i64` values; `None` becomes a null row.
    fn from(data: Vec<Option<i64>>) -> Self {
        let array = PrimitiveVector::<i64>::from(data);
        Self { array }
    }
}
/// Builder that accumulates datetimes and produces a [`DateTimeVector`].
pub struct DateTimeVectorBuilder {
// Builder for the underlying raw `i64` values.
buffer: PrimitiveVectorBuilder<i64>,
}
impl ScalarVectorBuilder for DateTimeVectorBuilder {
    type VectorType = DateTimeVector;

    /// Creates a builder whose inner buffer is created with the given capacity.
    fn with_capacity(capacity: usize) -> Self {
        let buffer = PrimitiveVectorBuilder::with_capacity(capacity);
        Self { buffer }
    }

    /// Appends an optional datetime, storing its raw value (`None` is pushed as-is).
    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
        let raw = value.map(|datetime| datetime.val());
        self.buffer.push(raw)
    }

    /// Finishes the inner builder and wraps the result in a `DateTimeVector`.
    fn finish(&mut self) -> Self::VectorType {
        let array = self.buffer.finish();
        Self::VectorType { array }
    }
}
impl MutableVector for DateTimeVectorBuilder {
    /// The logical type produced by this builder is always the datetime type.
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::datetime_datatype()
    }

    /// Number of values pushed into the inner buffer so far.
    fn len(&self) -> usize {
        self.buffer.len()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }

    /// Finishes the builder and returns the result as a shared vector.
    fn to_vector(&mut self) -> VectorRef {
        Arc::new(self.finish())
    }

    /// Appends `value` after converting it to a datetime.
    ///
    /// # Errors
    /// Propagates the error from `ValueRef::as_datetime` when the conversion fails.
    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
        self.buffer.push(value.as_datetime()?.map(|d| d.val()));
        Ok(())
    }

    /// Appends `length` rows of `vector`, starting at `offset`.
    ///
    /// # Errors
    /// Returns a cast-type error when `vector` is not a `DateTimeVector`, and
    /// propagates any error from the inner builder.
    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
        let concrete_vector = vector
            .as_any()
            .downcast_ref::<DateTimeVector>()
            .with_context(|| error::CastTypeSnafu {
                // Name the actual downcast target; the message previously said
                // "DateVector", which misreported the expected type.
                msg: format!(
                    "Failed to convert vector from {} to DateTimeVector",
                    vector.vector_type_name()
                ),
            })?;
        self.buffer
            .extend_slice_of(&concrete_vector.array, offset, length)?;
        Ok(())
    }
}
/// Iterator over the rows of a [`DateTimeVector`], yielding `Option<DateTime>` per row.
pub struct DateTimeIter<'a> {
// Iterator over the underlying raw `i64` values.
iter: PrimitiveIter<'a, i64>,
}
impl<'a> Iterator for DateTimeIter<'a> {
    type Item = Option<DateTime>;

    /// Advances the inner iterator and wraps a present raw value into a `DateTime`.
    fn next(&mut self) -> Option<Self::Item> {
        let raw = self.iter.next()?;
        Some(raw.map(DateTime::new))
    }
}
impl ScalarVector for DateTimeVector {
    type OwnedItem = DateTime;
    type RefItem<'a> = DateTime;
    type Iter<'a> = DateTimeIter<'a>;
    type Builder = DateTimeVectorBuilder;

    /// Returns the datetime at `idx`, or `None` when the inner vector yields no value.
    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
        let raw = self.array.get_data(idx)?;
        Some(DateTime::new(raw))
    }

    /// Returns an iterator producing `Option<DateTime>` for every row.
    fn iter_data(&self) -> Self::Iter<'_> {
        let iter = self.array.iter_data();
        DateTimeIter { iter }
    }
}
/// Replicates `vector` according to `offsets` by delegating to
/// `replicate_primitive_with_type`, and wraps the result in a `DateTimeVector`.
pub(crate) fn replicate_datetime(vector: &DateTimeVector, offsets: &[usize]) -> VectorRef {
    let data_type = vector.data_type();
    let array =
        crate::vectors::primitive::replicate_primitive_with_type(&vector.array, offsets, data_type);
    Arc::new(DateTimeVector { array })
}
/// Vector of [`DateTime`](common_time::DateTime)
pub type DateTimeVector = PrimitiveVector<DateTimeType>;
/// Builder for [`DateTimeVector`].
pub type DateTimeVectorBuilder = PrimitiveVectorBuilder<DateTimeType>;
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::sync::Arc;
use arrow::array::{Array, PrimitiveArray};
use common_time::DateTime;
use datafusion_common::from_slice::FromSlice;
use super::*;
use crate::data_type::DataType;
use crate::types::DateTimeType;
use crate::prelude::{
ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef,
};
use crate::serialize::Serializable;
#[test]
fn test_datetime_vector() {
let v = DateTimeVector::new(PrimitiveArray::from_vec(vec![1, 2, 3]));
let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3]));
assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type());
assert_eq!(3, v.len());
assert_eq!("DateTimeVector", v.vector_type_name());
@@ -287,9 +55,8 @@ mod tests {
assert_eq!(Some(DateTime::new(2)), iter.next().unwrap());
assert_eq!(Some(DateTime::new(3)), iter.next().unwrap());
assert!(!v.is_null(0));
assert_eq!(24, v.memory_size()); // size of i64 * 3
assert_eq!(64, v.memory_size());
assert_matches!(v.validity(), Validity::AllValid);
if let Value::DateTime(d) = v.get(0) {
assert_eq!(1, d.val());
} else {
@@ -314,8 +81,11 @@ mod tests {
assert_eq!(Value::Null, v.get(1));
assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2));
let input =
DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2), DateTime::new(3)]);
let input = DateTimeVector::from_wrapper_slice(&[
DateTime::new(1),
DateTime::new(2),
DateTime::new(3),
]);
let mut builder = DateTimeType::default().create_mutable_vector(3);
builder
@@ -328,7 +98,7 @@ mod tests {
.is_err());
let vector = builder.to_vector();
let expect: VectorRef = Arc::new(DateTimeVector::from_slice(&[
let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[
DateTime::new(5),
DateTime::new(2),
DateTime::new(3),
@@ -338,7 +108,7 @@ mod tests {
#[test]
fn test_datetime_from_arrow() {
let vector = DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2)]);
let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]);
let arrow = vector.as_arrow().slice(0, vector.len());
let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap();
assert_eq!(vector, vector2);

View File

@@ -15,9 +15,12 @@
use std::sync::Arc;
use crate::data_type::DataType;
use crate::types::TimestampType;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{
BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector,
PrimitiveVector, StringVector, TimestampVector, Vector,
BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector,
StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
TimestampNanosecondVector, TimestampSecondVector, Vector,
};
use crate::with_match_primitive_type_id;
@@ -76,7 +79,20 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
String(_) => is_vector_eq!(StringVector, lhs, rhs),
Date(_) => is_vector_eq!(DateVector, lhs, rhs),
DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs),
Timestamp(_) => is_vector_eq!(TimestampVector, lhs, rhs),
Timestamp(t) => match t {
TimestampType::Second(_) => {
is_vector_eq!(TimestampSecondVector, lhs, rhs)
}
TimestampType::Millisecond(_) => {
is_vector_eq!(TimestampMillisecondVector, lhs, rhs)
}
TimestampType::Microsecond(_) => {
is_vector_eq!(TimestampMicrosecondVector, lhs, rhs)
}
TimestampType::Nanosecond(_) => {
is_vector_eq!(TimestampNanosecondVector, lhs, rhs)
}
},
List(_) => is_vector_eq!(ListVector, lhs, rhs),
UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_)
| Float32(_) | Float64(_) => {
@@ -95,13 +111,10 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
#[cfg(test)]
mod tests {
use arrow::array::{ListArray, MutableListArray, MutablePrimitiveArray, TryExtend};
use super::*;
use crate::vectors::{
Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector,
NullVector, TimestampVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector,
VectorRef,
list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector,
NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef,
};
fn assert_vector_ref_eq(vector: VectorRef) {
@@ -132,14 +145,21 @@ mod tests {
assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false])));
assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)])));
assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)])));
assert_vector_ref_eq(Arc::new(TimestampVector::from_values([100, 120])));
assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120])));
assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([
100, 120,
])));
assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([
100, 120,
])));
assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120])));
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new();
arrow_array
.try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])])
.unwrap();
let arrow_array: ListArray<i32> = arrow_array.into();
assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array)));
let list_vector = list::tests::new_list_vector(&[
Some(vec![Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
assert_vector_ref_eq(Arc::new(list_vector));
assert_vector_ref_eq(Arc::new(NullVector::new(4)));
assert_vector_ref_eq(Arc::new(StringVector::from(vec![

View File

@@ -17,19 +17,26 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::Array;
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::compute;
use arrow::datatypes::DataType as ArrowDataType;
use arrow::compute::kernels::comparison;
use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
use datafusion_common::ScalarValue;
use snafu::{OptionExt, ResultExt};
use crate::arrow_array::StringArray;
use crate::error::{ConversionSnafu, Result, UnknownVectorSnafu};
use crate::scalars::*;
use crate::vectors::date::DateVector;
use crate::vectors::datetime::DateTimeVector;
use crate::vectors::*;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::scalars::{Scalar, ScalarVectorBuilder};
use crate::value::{ListValue, ListValueRef};
use crate::vectors::{
BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector,
Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector,
ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector,
TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector,
UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef,
};
/// Helper functions for `Vector`.
pub struct Helper;
impl Helper {
@@ -47,7 +54,7 @@ impl Helper {
let arr = vector
.as_any()
.downcast_ref::<<T as Scalar>::VectorType>()
.with_context(|| UnknownVectorSnafu {
.with_context(|| error::UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
vector.vector_type_name(),
@@ -61,7 +68,7 @@ impl Helper {
let arr = vector
.as_any()
.downcast_ref::<T>()
.with_context(|| UnknownVectorSnafu {
.with_context(|| error::UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
vector.vector_type_name(),
@@ -78,7 +85,7 @@ impl Helper {
let arr = vector
.as_mut_any()
.downcast_mut()
.with_context(|| UnknownVectorSnafu {
.with_context(|| error::UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
ty,
@@ -94,7 +101,7 @@ impl Helper {
let arr = vector
.as_any()
.downcast_ref::<<T as Scalar>::VectorType>()
.with_context(|| UnknownVectorSnafu {
.with_context(|| error::UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
vector.vector_type_name(),
@@ -105,11 +112,9 @@ impl Helper {
}
/// Try to cast an arrow scalar value into vector
///
/// # Panics
/// Panic if given scalar value is not supported.
pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result<VectorRef> {
let vector = match value {
ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length),
ScalarValue::Boolean(v) => {
ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length)
}
@@ -143,17 +148,29 @@ impl Helper {
ScalarValue::UInt64(v) => {
ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length)
}
ScalarValue::Utf8(v) => {
ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => {
ConstantVector::new(Arc::new(StringVector::from(vec![v])), length)
}
ScalarValue::LargeUtf8(v) => {
ConstantVector::new(Arc::new(StringVector::from(vec![v])), length)
}
ScalarValue::Binary(v) => {
ScalarValue::Binary(v)
| ScalarValue::LargeBinary(v)
| ScalarValue::FixedSizeBinary(_, v) => {
ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length)
}
ScalarValue::LargeBinary(v) => {
ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length)
ScalarValue::List(v, field) => {
let item_type = ConcreteDataType::try_from(field.data_type())?;
let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1);
if let Some(values) = v {
let values = values
.into_iter()
.map(ScalarValue::try_into)
.collect::<Result<_>>()?;
let list_value = ListValue::new(Some(Box::new(values)), item_type);
builder.push(Some(ListValueRef::Ref { val: &list_value }));
} else {
builder.push(None);
}
let list_vector = builder.to_vector();
ConstantVector::new(list_vector, length)
}
ScalarValue::Date32(v) => {
ConstantVector::new(Arc::new(DateVector::from(vec![v])), length)
@@ -161,8 +178,30 @@ impl Helper {
ScalarValue::Date64(v) => {
ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length)
}
_ => {
return ConversionSnafu {
ScalarValue::TimestampSecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length)
}
ScalarValue::TimestampMillisecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length)
}
ScalarValue::TimestampMicrosecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length)
}
ScalarValue::TimestampNanosecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length)
}
ScalarValue::Decimal128(_, _, _)
| ScalarValue::Time64(_)
| ScalarValue::IntervalYearMonth(_)
| ScalarValue::IntervalDayTime(_)
| ScalarValue::IntervalMonthDayNano(_)
| ScalarValue::Struct(_, _)
| ScalarValue::Dictionary(_, _) => {
return error::ConversionSnafu {
from: format!("Unsupported scalar value: {}", value),
}
.fail()
@@ -180,9 +219,7 @@ impl Helper {
Ok(match array.as_ref().data_type() {
ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?),
ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?),
ArrowDataType::Binary | ArrowDataType::LargeBinary => {
Arc::new(BinaryVector::try_from_arrow_array(array)?)
}
ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?),
ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?),
ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?),
ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?),
@@ -193,48 +230,80 @@ impl Helper {
ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?),
ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?),
ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?),
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => {
Arc::new(StringVector::try_from_arrow_array(array)?)
}
ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?),
ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?),
ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?),
ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?),
ArrowDataType::Timestamp(_, _) => {
Arc::new(TimestampVector::try_from_arrow_array(array)?)
ArrowDataType::Timestamp(unit, _) => match unit {
TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?),
TimeUnit::Millisecond => {
Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?)
}
TimeUnit::Microsecond => {
Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?)
}
TimeUnit::Nanosecond => {
Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?)
}
},
ArrowDataType::Float16
| ArrowDataType::Time32(_)
| ArrowDataType::Time64(_)
| ArrowDataType::Duration(_)
| ArrowDataType::Interval(_)
| ArrowDataType::Binary
| ArrowDataType::FixedSizeBinary(_)
| ArrowDataType::LargeUtf8
| ArrowDataType::LargeList(_)
| ArrowDataType::FixedSizeList(_, _)
| ArrowDataType::Struct(_)
| ArrowDataType::Union(_, _, _)
| ArrowDataType::Dictionary(_, _)
| ArrowDataType::Decimal128(_, _)
| ArrowDataType::Decimal256(_, _)
| ArrowDataType::Map(_, _) => {
unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type())
}
_ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()),
})
}
/// Converts every arrow array in `arrays` into a vector, preserving order.
///
/// # Errors
/// Stops at, and returns, the first error reported by `try_into_vector`.
pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result<Vec<VectorRef>> {
    let mut vectors = Vec::with_capacity(arrays.len());
    for array in arrays {
        vectors.push(Self::try_into_vector(array)?);
    }
    Ok(vectors)
}
/// Perform SQL like operation on `names` and a scalar `s`.
pub fn like_utf8(names: Vec<String>, s: &str) -> Result<VectorRef> {
let array = StringArray::from_slice(&names);
let array = StringArray::from(names);
let filter =
compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
Helper::try_into_vector(result)
}
}
#[cfg(test)]
mod tests {
use arrow::array::Int32Array;
use common_time::date::Date;
use common_time::datetime::DateTime;
use arrow::array::{
ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow::datatypes::{Field, Int32Type};
use common_time::{Date, DateTime};
use super::*;
use crate::value::Value;
use crate::vectors::ConcreteDataType;
#[test]
fn test_try_into_vectors() {
let arrays: Vec<ArrayRef> = vec![
Arc::new(Int32Array::from_vec(vec![1])),
Arc::new(Int32Array::from_vec(vec![2])),
Arc::new(Int32Array::from_vec(vec![3])),
Arc::new(Int32Array::from(vec![1])),
Arc::new(Int32Array::from(vec![2])),
Arc::new(Int32Array::from(vec![3])),
];
let vectors = Helper::try_into_vectors(&arrays);
assert!(vectors.is_ok());
@@ -246,10 +315,10 @@ mod tests {
}
#[test]
pub fn test_try_into_date_vector() {
fn test_try_into_date_vector() {
let vector = DateVector::from(vec![Some(1), Some(2), None]);
let arrow_array = vector.to_arrow_array();
assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type());
assert_eq!(&ArrowDataType::Date32, arrow_array.data_type());
let vector_converted = Helper::try_into_vector(arrow_array).unwrap();
assert_eq!(vector.len(), vector_converted.len());
for i in 0..vector_converted.len() {
@@ -258,7 +327,7 @@ mod tests {
}
#[test]
pub fn test_try_from_scalar_date_value() {
fn test_try_from_scalar_date_value() {
let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap();
assert_eq!(ConcreteDataType::date_datatype(), vector.data_type());
assert_eq!(3, vector.len());
@@ -268,7 +337,7 @@ mod tests {
}
#[test]
pub fn test_try_from_scalar_datetime_value() {
fn test_try_from_scalar_datetime_value() {
let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap();
assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type());
assert_eq!(3, vector.len());
@@ -277,6 +346,28 @@ mod tests {
}
}
#[test]
fn test_try_from_list_value() {
    // A scalar list [1, 2] of nullable Int32 items.
    let item_field = Field::new("item", ArrowDataType::Int32, true);
    let scalars = vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(Some(2))];
    let value = ScalarValue::List(Some(scalars), Box::new(item_field));

    let vector = Helper::try_from_scalar_value(value, 3).unwrap();

    let expected_type = ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype());
    assert_eq!(expected_type, vector.data_type());
    assert_eq!(3, vector.len());

    // Every row of the resulting vector must hold the same list.
    for row in 0..vector.len() {
        let v = vector.get(row);
        let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap();
        assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items);
    }
}
#[test]
fn test_like_utf8() {
fn assert_vector(expected: Vec<&str>, actual: &VectorRef) {
@@ -301,4 +392,40 @@ mod tests {
let ret = Helper::like_utf8(names, "%").unwrap();
assert_vector(vec!["greptime", "hello", "public", "world"], &ret);
}
/// Converts `array` into a vector and asserts that converting back to arrow
/// yields an array equal to the input.
fn check_try_into_vector(array: impl Array + 'static) {
    let expected: ArrayRef = Arc::new(array);
    let vector = Helper::try_into_vector(expected.clone()).unwrap();
    let actual = vector.to_arrow_array();
    assert_eq!(&expected, &actual);
}
#[test]
fn test_try_into_vector() {
    // Null, boolean and binary arrays.
    check_try_into_vector(NullArray::new(2));
    check_try_into_vector(BooleanArray::from(vec![true, false]));
    let binary_array = LargeBinaryArray::from(vec!["hello".as_bytes(), "world".as_bytes()]);
    check_try_into_vector(binary_array);

    // Signed integers.
    check_try_into_vector(Int8Array::from(vec![1, 2, 3]));
    check_try_into_vector(Int16Array::from(vec![1, 2, 3]));
    check_try_into_vector(Int32Array::from(vec![1, 2, 3]));
    check_try_into_vector(Int64Array::from(vec![1, 2, 3]));

    // Unsigned integers.
    check_try_into_vector(UInt8Array::from(vec![1, 2, 3]));
    check_try_into_vector(UInt16Array::from(vec![1, 2, 3]));
    check_try_into_vector(UInt32Array::from(vec![1, 2, 3]));
    check_try_into_vector(UInt64Array::from(vec![1, 2, 3]));

    // Floats and strings.
    check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0]));
    check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0]));
    check_try_into_vector(StringArray::from(vec!["hello", "world"]));

    // Dates.
    check_try_into_vector(Date32Array::from(vec![1, 2, 3]));
    check_try_into_vector(Date64Array::from(vec![1, 2, 3]));

    // A list array with one null row and one two-element row.
    let rows = vec![None, Some(vec![Some(6), Some(7)])];
    check_try_into_vector(ListArray::from_iter_primitive::<Int32Type, _, _>(rows));

    // Timestamps in every supported unit.
    check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3]));
    check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3]));
    check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3]));
    check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3]));
}
}

View File

@@ -13,39 +13,48 @@
// limitations under the License.
use std::any::Any;
use std::ops::Range;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, ListArray};
use arrow::bitmap::utils::ZipValidity;
use arrow::bitmap::MutableBitmap;
use arrow::array::{
Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray,
};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType as ArrowDataType;
use serde_json::Value as JsonValue;
use snafu::prelude::*;
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::Result;
use crate::prelude::*;
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::ListType;
use crate::value::{ListValue, ListValueRef};
use crate::vectors::{impl_try_from_arrow_array_for_vector, impl_validity_for_vector};
type ArrowListArray = ListArray<i32>;
use crate::value::{ListValue, ListValueRef, Value, ValueRef};
use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef};
/// Vector of Lists, basically backed by Arrow's `ListArray`.
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, PartialEq)]
pub struct ListVector {
array: ArrowListArray,
inner_datatype: ConcreteDataType,
array: ListArray,
/// The datatype of the items in the list.
item_type: ConcreteDataType,
}
impl ListVector {
/// Only iterate values in the [ListVector].
///
/// Be careful to use this method as it would ignore validity and replace null
/// by empty vector.
pub fn values_iter(&self) -> Box<dyn Iterator<Item = Result<VectorRef>> + '_> {
Box::new(self.array.values_iter().map(VectorHelper::try_into_vector))
/// Iterate elements as [VectorRef].
pub fn values_iter(&self) -> impl Iterator<Item = Result<Option<VectorRef>>> + '_ {
self.array
.iter()
.map(|value_opt| value_opt.map(Helper::try_into_vector).transpose())
}
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self {
Self {
array: ListArray::from(data),
item_type,
}
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
@@ -55,7 +64,7 @@ impl ListVector {
impl Vector for ListVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::List(ListType::new(self.inner_datatype.clone()))
ConcreteDataType::List(ListType::new(self.item_type.clone()))
}
fn vector_type_name(&self) -> String {
@@ -71,21 +80,25 @@ impl Vector for ListVector {
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
let data = self.to_array_data();
Arc::new(ListArray::from(data))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
Box::new(self.array.clone())
let data = self.to_array_data();
Box::new(ListArray::from(data))
}
fn validity(&self) -> Validity {
impl_validity_for_vector!(self.array)
vectors::impl_validity_for_vector!(self.array)
}
fn memory_size(&self) -> usize {
let offsets_bytes = self.array.offsets().len() * std::mem::size_of::<i64>();
let value_refs_bytes = self.array.values().len() * std::mem::size_of::<Arc<dyn Array>>();
offsets_bytes + value_refs_bytes
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, row: usize) -> bool {
@@ -93,7 +106,8 @@ impl Vector for ListVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(ListVector::from(self.array.slice(offset, length)))
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data_and_type(data, self.item_type.clone()))
}
fn get(&self, index: usize) -> Value {
@@ -102,7 +116,7 @@ impl Vector for ListVector {
}
let array = &self.array.value(index);
let vector = VectorHelper::try_into_vector(array).unwrap_or_else(|_| {
let vector = Helper::try_into_vector(array).unwrap_or_else(|_| {
panic!(
"arrow array with datatype {:?} cannot converted to our vector",
array.data_type()
@@ -113,7 +127,7 @@ impl Vector for ListVector {
.collect::<Vec<Value>>();
Value::List(ListValue::new(
Some(Box::new(values)),
self.inner_datatype.clone(),
self.item_type.clone(),
))
}
@@ -131,7 +145,7 @@ impl Serializable for ListVector {
.iter()
.map(|v| match v {
None => Ok(JsonValue::Null),
Some(v) => VectorHelper::try_into_vector(v)
Some(v) => Helper::try_into_vector(v)
.and_then(|v| v.serialize_to_json())
.map(JsonValue::Array),
})
@@ -139,70 +153,64 @@ impl Serializable for ListVector {
}
}
impl From<ArrowListArray> for ListVector {
fn from(array: ArrowListArray) -> Self {
let inner_datatype = ConcreteDataType::from_arrow_type(match array.data_type() {
ArrowDataType::List(field) => &field.data_type,
_ => unreachable!(),
impl From<ListArray> for ListVector {
fn from(array: ListArray) -> Self {
let item_type = ConcreteDataType::from_arrow_type(match array.data_type() {
ArrowDataType::List(field) => field.data_type(),
other => panic!(
"Try to create ListVector from an arrow array with type {:?}",
other
),
});
Self {
array,
inner_datatype,
}
Self { array, item_type }
}
}
impl_try_from_arrow_array_for_vector!(ArrowListArray, ListVector);
vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector);
pub struct ListVectorIter<'a> {
pub struct ListIter<'a> {
vector: &'a ListVector,
iter: ZipValidity<'a, usize, Range<usize>>,
idx: usize,
}
impl<'a> ListVectorIter<'a> {
pub fn new(vector: &'a ListVector) -> ListVectorIter<'a> {
let iter = ZipValidity::new(
0..vector.len(),
vector.array.validity().as_ref().map(|x| x.iter()),
);
Self { vector, iter }
impl<'a> ListIter<'a> {
fn new(vector: &'a ListVector) -> ListIter {
ListIter { vector, idx: 0 }
}
}
impl<'a> Iterator for ListVectorIter<'a> {
impl<'a> Iterator for ListIter<'a> {
type Item = Option<ListValueRef<'a>>;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
self.iter.next().map(|idx_opt| {
idx_opt.map(|idx| ListValueRef::Indexed {
vector: self.vector,
idx,
})
})
if self.idx >= self.vector.len() {
return None;
}
let idx = self.idx;
self.idx += 1;
if self.vector.is_null(idx) {
return Some(None);
}
Some(Some(ListValueRef::Indexed {
vector: self.vector,
idx,
}))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
#[inline]
fn nth(&mut self, n: usize) -> Option<Self::Item> {
self.iter.nth(n).map(|idx_opt| {
idx_opt.map(|idx| ListValueRef::Indexed {
vector: self.vector,
idx,
})
})
(self.vector.len(), Some(self.vector.len()))
}
}
impl ScalarVector for ListVector {
type OwnedItem = ListValue;
type RefItem<'a> = ListValueRef<'a>;
type Iter<'a> = ListVectorIter<'a>;
type Iter<'a> = ListIter<'a>;
type Builder = ListVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -214,86 +222,68 @@ impl ScalarVector for ListVector {
}
fn iter_data(&self) -> Self::Iter<'_> {
ListVectorIter::new(self)
ListIter::new(self)
}
}
// Some codes are ported from arrow2's MutableListArray.
// Ports from arrow's GenericListBuilder.
// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs
/// [ListVector] builder.
pub struct ListVectorBuilder {
inner_type: ConcreteDataType,
offsets: Vec<i32>,
values: Box<dyn MutableVector>,
validity: Option<MutableBitmap>,
item_type: ConcreteDataType,
offsets_builder: Int32BufferBuilder,
null_buffer_builder: NullBufferBuilder,
values_builder: Box<dyn MutableVector>,
}
impl ListVectorBuilder {
pub fn with_type_capacity(inner_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
let mut offsets = Vec::with_capacity(capacity + 1);
offsets.push(0);
// The actual required capacity might greater than the capacity of the `ListVector`
// if there exists child vector that has more than one element.
let values = inner_type.create_mutable_vector(capacity);
/// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity`
/// is the number of items to pre-allocate space for in this builder.
pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
let mut offsets_builder = Int32BufferBuilder::new(capacity + 1);
offsets_builder.append(0);
// The actual required capacity might be greater than the capacity of the `ListVector`
// if the child vector has more than one element.
let values_builder = item_type.create_mutable_vector(capacity);
ListVectorBuilder {
inner_type,
offsets,
values,
validity: None,
item_type,
offsets_builder,
null_buffer_builder: NullBufferBuilder::new(capacity),
values_builder,
}
}
#[inline]
fn last_offset(&self) -> i32 {
*self.offsets.last().unwrap()
/// Finish the current variable-length list vector slot.
fn finish_list(&mut self, is_valid: bool) {
self.offsets_builder
.append(i32::try_from(self.values_builder.len()).unwrap());
self.null_buffer_builder.append(is_valid);
}
fn push_null(&mut self) {
self.offsets.push(self.last_offset());
match &mut self.validity {
Some(validity) => validity.push(false),
None => self.init_validity(),
}
}
fn init_validity(&mut self) {
let len = self.offsets.len() - 1;
let mut validity = MutableBitmap::with_capacity(self.offsets.capacity());
validity.extend_constant(len, true);
validity.set(len - 1, false);
self.validity = Some(validity)
self.finish_list(false);
}
fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> {
if let Some(items) = list_value.items() {
for item in &**items {
self.values.push_value_ref(item.as_value_ref())?;
self.values_builder.push_value_ref(item.as_value_ref())?;
}
}
self.push_valid();
self.finish_list(true);
Ok(())
}
/// Needs to be called when a valid value was extended to this builder.
fn push_valid(&mut self) {
let size = self.values.len();
let size = i32::try_from(size).unwrap();
assert!(size >= *self.offsets.last().unwrap());
self.offsets.push(size);
if let Some(validity) = &mut self.validity {
validity.push(true)
}
}
}
impl MutableVector for ListVectorBuilder {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::list_datatype(self.inner_type.clone())
ConcreteDataType::list_datatype(self.item_type.clone())
}
fn len(&self) -> usize {
self.offsets.len() - 1
self.null_buffer_builder.len()
}
fn as_any(&self) -> &dyn Any {
@@ -348,51 +338,181 @@ impl ScalarVectorBuilder for ListVectorBuilder {
self.push_value_ref(value.into()).unwrap_or_else(|e| {
panic!(
"Failed to push value, expect value type {:?}, err:{}",
self.inner_type, e
self.item_type, e
);
});
}
fn finish(&mut self) -> Self::VectorType {
let array = ArrowListArray::try_new(
ConcreteDataType::list_datatype(self.inner_type.clone()).as_arrow_type(),
std::mem::take(&mut self.offsets).into(),
self.values.to_vector().to_arrow_array(),
std::mem::take(&mut self.validity).map(|x| x.into()),
)
.unwrap(); // The `ListVectorBuilder` itself should ensure it always builds a valid array.
let len = self.len();
let values_vector = self.values_builder.to_vector();
let values_arr = values_vector.to_arrow_array();
let values_data = values_arr.data();
let offset_buffer = self.offsets_builder.finish();
let null_bit_buffer = self.null_buffer_builder.finish();
// Re-initialize the offsets_builder.
self.offsets_builder.append(0);
let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type();
let array_data_builder = ArrayData::builder(data_type)
.len(len)
.add_buffer(offset_buffer)
.add_child_data(values_data.clone())
.null_bit_buffer(null_bit_buffer);
let array_data = unsafe { array_data_builder.build_unchecked() };
let array = ListArray::from(array_data);
ListVector {
array,
inner_datatype: self.inner_type.clone(),
item_type: self.item_type.clone(),
}
}
}
// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs
/// Builder for the null (validity) bit buffer of an array.
///
/// The underlying bitmap is allocated lazily: as long as only non-null slots
/// are appended the builder merely counts them, and the bitmap is created the
/// first time a null is appended. A builder that never saw a null therefore
/// yields `None` from [`finish`](#method.finish).
#[derive(Debug)]
struct NullBufferBuilder {
    /// The bitmap, present only once it has been materialized.
    bitmap_builder: Option<BooleanBufferBuilder>,
    /// Number of slots appended while the bitmap is not materialized.
    len: usize,
    /// Requested capacity in bits, used as a lower bound when allocating the bitmap.
    capacity: usize,
}

impl NullBufferBuilder {
    /// Creates an empty builder; `capacity` is the expected number of bits.
    fn new(capacity: usize) -> Self {
        Self {
            bitmap_builder: None,
            len: 0,
            capacity,
        }
    }

    /// Returns the number of slots appended since creation or the last `finish`.
    fn len(&self) -> usize {
        match &self.bitmap_builder {
            Some(bitmap) => bitmap.len(),
            None => self.len,
        }
    }

    /// Records one non-null slot.
    ///
    /// Without a materialized bitmap this only bumps the counter.
    #[inline]
    fn append_non_null(&mut self) {
        match self.bitmap_builder.as_mut() {
            Some(bitmap) => bitmap.append(true),
            None => self.len += 1,
        }
    }

    /// Records one null slot, materializing the bitmap first if necessary.
    #[inline]
    fn append_null(&mut self) {
        self.materialize_if_needed();
        // The bitmap is guaranteed to exist after `materialize_if_needed`.
        let bitmap = self.bitmap_builder.as_mut().unwrap();
        bitmap.append(false);
    }

    /// Records one slot; `not_null` tells whether the slot holds a value.
    #[inline]
    fn append(&mut self, not_null: bool) {
        if !not_null {
            self.append_null();
            return;
        }
        self.append_non_null();
    }

    /// Takes the accumulated null buffer and resets the builder to empty.
    ///
    /// Returns `None` when the bitmap was never materialized, i.e. every
    /// appended slot was non-null.
    fn finish(&mut self) -> Option<Buffer> {
        let buffer = self
            .bitmap_builder
            .take()
            .map(|mut bitmap| bitmap.finish());
        self.len = 0;
        buffer
    }

    /// Materializes the bitmap unless it already exists.
    #[inline]
    fn materialize_if_needed(&mut self) {
        if self.bitmap_builder.is_some() {
            return;
        }
        self.materialize()
    }

    /// Allocates the bitmap and back-fills it with one set bit for every slot
    /// counted so far. Does nothing if the bitmap already exists.
    #[cold]
    fn materialize(&mut self) {
        if self.bitmap_builder.is_some() {
            return;
        }
        let bits = self.len.max(self.capacity);
        let mut bitmap = BooleanBufferBuilder::new(bits);
        bitmap.append_n(self.len, true);
        self.bitmap_builder = Some(bitmap);
    }
}
#[cfg(test)]
mod tests {
use arrow::array::{MutableListArray, MutablePrimitiveArray, TryExtend};
pub mod tests {
use arrow::array::{Int32Array, Int32Builder, ListBuilder};
use serde_json::json;
use super::*;
use crate::scalars::ScalarRef;
use crate::types::ListType;
use crate::vectors::Int32Vector;
pub fn new_list_vector(data: &[Option<Vec<Option<i32>>>]) -> ListVector {
let mut builder =
ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8);
for vec_opt in data {
if let Some(vec) = vec_opt {
let values = vec.iter().map(|v| Value::from(*v)).collect();
let values = Some(Box::new(values));
let list_value = ListValue::new(values, ConcreteDataType::int32_datatype());
builder.push(Some(ListValueRef::Ref { val: &list_value }));
} else {
builder.push(None);
}
}
builder.finish()
}
/// Builds an arrow `ListArray` of `i32` items from nested optional data,
/// using arrow's own `ListBuilder`.
///
/// Each outer `None` becomes a null list slot; each `Some(items)` becomes a
/// valid slot holding the (possibly null) `items`.
fn new_list_array(data: &[Option<Vec<Option<i32>>>]) -> ListArray {
    let mut builder = ListBuilder::new(Int32Builder::new());
    for row in data {
        // A `None` row contributes no child values, so this loop is skipped for it.
        for item in row.iter().flatten() {
            builder.values().append_option(*item);
        }
        // Close the current slot, marking it valid only when the row is present.
        builder.append(row.is_some());
    }
    builder.finish()
}
#[test]
fn test_list_vector() {
let data = vec![
Some(vec![Some(1i32), Some(2), Some(3)]),
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let list_vector = new_list_vector(&data);
let list_vector = ListVector {
array: arrow_array.clone(),
inner_datatype: ConcreteDataType::int32_datatype(),
};
assert_eq!(
ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())),
list_vector.data_type()
@@ -403,30 +523,34 @@ mod tests {
assert!(list_vector.is_null(1));
assert!(!list_vector.is_null(2));
let arrow_array = new_list_array(&data);
assert_eq!(
arrow_array,
list_vector
*list_vector
.to_arrow_array()
.as_any()
.downcast_ref::<ArrowListArray>()
.downcast_ref::<ListArray>()
.unwrap()
.clone()
);
assert_eq!(
Validity::Slots(arrow_array.validity().unwrap()),
list_vector.validity()
);
assert_eq!(
arrow_array.offsets().len() * std::mem::size_of::<i64>()
+ arrow_array.values().len() * std::mem::size_of::<Arc<dyn Array>>(),
list_vector.memory_size()
);
let validity = list_vector.validity();
assert!(!validity.is_all_null());
assert!(!validity.is_all_valid());
assert!(validity.is_set(0));
assert!(!validity.is_set(1));
assert!(validity.is_set(2));
assert_eq!(256, list_vector.memory_size());
let slice = list_vector.slice(0, 2);
let slice = list_vector.slice(0, 2).to_arrow_array();
let sliced_array = slice.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(
"ListArray[[1, 2, 3], None]",
format!("{:?}", slice.to_arrow_array())
Int32Array::from_iter_values([1, 2, 3]),
*sliced_array
.value(0)
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
);
assert!(sliced_array.is_null(1));
assert_eq!(
Value::List(ListValue::new(
@@ -467,52 +591,48 @@ mod tests {
#[test]
fn test_from_arrow_array() {
let data = vec![
Some(vec![Some(1u32), Some(2), Some(3)]),
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<u32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let arrow_array = new_list_array(&data);
let array_ref: ArrayRef = Arc::new(arrow_array);
let expect = new_list_vector(&data);
// Test try from ArrayRef
let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap();
assert_eq!(
"ListVector { array: ListArray[[1, 2, 3], None, [4, None, 6]], inner_datatype: UInt32(UInt32) }",
format!("{:?}", list_vector)
);
assert_eq!(expect, list_vector);
// Test from
let arrow_array = new_list_array(&data);
let list_vector = ListVector::from(arrow_array);
assert_eq!(expect, list_vector);
}
#[test]
fn test_iter_list_vector_values() {
let data = vec![
Some(vec![Some(1i64), Some(2), Some(3)]),
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let list_vector = new_list_vector(&data);
let list_vector = ListVector::from(arrow_array);
assert_eq!(
ConcreteDataType::List(ListType::new(ConcreteDataType::int64_datatype())),
ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())),
list_vector.data_type()
);
let mut iter = list_vector.values_iter();
assert_eq!(
"Int64[1, 2, 3]",
format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array())
Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef,
*iter.next().unwrap().unwrap().unwrap()
);
assert!(iter.next().unwrap().unwrap().is_none());
assert_eq!(
"Int64[]",
format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array())
);
assert_eq!(
"Int64[4, None, 6]",
format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array())
Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef,
*iter.next().unwrap().unwrap().unwrap(),
);
assert!(iter.next().is_none())
}
@@ -520,30 +640,18 @@ mod tests {
#[test]
fn test_serialize_to_json() {
let data = vec![
Some(vec![Some(1i64), Some(2), Some(3)]),
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let list_vector = ListVector::from(arrow_array);
let list_vector = new_list_vector(&data);
assert_eq!(
vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),],
list_vector.serialize_to_json().unwrap()
);
}
/// Builds a `ListVector` of `i32` items from nested optional data by filling a
/// `MutableListArray` and converting it into an `ArrowListArray`.
fn new_list_vector(data: Vec<Option<Vec<Option<i32>>>>) -> ListVector {
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
// Test helper: a failure to extend is treated as a bug, hence the unwrap.
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
ListVector::from(arrow_array)
}
#[test]
fn test_list_vector_builder() {
let mut builder =
@@ -567,14 +675,14 @@ mod tests {
None,
Some(vec![Some(7), Some(8), None]),
];
let input = new_list_vector(data);
let input = new_list_vector(&data);
builder.extend_slice_of(&input, 1, 2).unwrap();
assert!(builder
.extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
.is_err());
let vector = builder.to_vector();
let expect: VectorRef = Arc::new(new_list_vector(vec![
let expect: VectorRef = Arc::new(new_list_vector(&[
Some(vec![Some(4), None, Some(6)]),
None,
Some(vec![Some(7), Some(8), None]),
@@ -599,7 +707,7 @@ mod tests {
}));
let vector = builder.finish();
let expect = new_list_vector(vec![None, Some(vec![Some(4), None, Some(6)])]);
let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]);
assert_eq!(expect, vector);
assert!(vector.get_data(0).is_none());

View File

@@ -16,8 +16,7 @@ use std::any::Any;
use std::fmt;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, NullArray};
use arrow::datatypes::DataType as ArrowDataType;
use arrow::array::{Array, ArrayData, ArrayRef, NullArray};
use snafu::{ensure, OptionExt};
use crate::data_type::ConcreteDataType;
@@ -27,21 +26,28 @@ use crate::types::NullType;
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
/// A vector where all elements are nulls.
#[derive(PartialEq)]
pub struct NullVector {
array: NullArray,
}
// TODO(yingwen): Support null vector with other logical types.
impl NullVector {
/// Create a new `NullVector` with `n` elements.
pub fn new(n: usize) -> Self {
Self {
array: NullArray::new(ArrowDataType::Null, n),
array: NullArray::new(n),
}
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
/// Returns an owned copy of the `ArrayData` behind the inner arrow null array.
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
}
impl From<NullArray> for NullVector {
@@ -68,21 +74,28 @@ impl Vector for NullVector {
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
// TODO(yingwen): Replaced by clone after upgrading to arrow 28.0.
let data = self.to_array_data();
Arc::new(NullArray::from(data))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
Box::new(self.array.clone())
let data = self.to_array_data();
Box::new(NullArray::from(data))
}
fn validity(&self) -> Validity {
Validity::AllNull
Validity::all_null(self.array.len())
}
fn memory_size(&self) -> usize {
0
}
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, _row: usize) -> bool {
true
}
@@ -217,7 +230,7 @@ mod tests {
assert_eq!("NullVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllNull, v.validity());
assert!(v.validity().is_all_null());
assert!(v.only_null());
for i in 0..32 {
@@ -246,7 +259,7 @@ mod tests {
#[test]
fn test_null_vector_validity() {
let vector = NullVector::new(5);
assert_eq!(Validity::AllNull, vector.validity());
assert!(vector.validity().is_all_null());
assert_eq!(5, vector.null_count());
}

View File

@@ -19,10 +19,11 @@ mod replicate;
use common_base::BitVec;
use crate::error::Result;
use crate::types::PrimitiveElement;
use crate::types::LogicalPrimitiveType;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{
BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector,
NullVector, PrimitiveVector, StringVector, TimestampVector, Vector, VectorRef,
BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector,
VectorRef,
};
/// Vector compute operations.
@@ -59,10 +60,10 @@ pub trait VectorOp {
}
macro_rules! impl_scalar_vector_op {
($( { $VectorType: ident, $replicate: ident } ),+) => {$(
($($VectorType: ident),+) => {$(
impl VectorOp for $VectorType {
fn replicate(&self, offsets: &[usize]) -> VectorRef {
replicate::$replicate(self, offsets)
replicate::replicate_scalar(self, offsets)
}
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
@@ -77,28 +78,21 @@ macro_rules! impl_scalar_vector_op {
)+};
}
impl_scalar_vector_op!(
{ BinaryVector, replicate_scalar },
{ BooleanVector, replicate_scalar },
{ ListVector, replicate_scalar },
{ StringVector, replicate_scalar },
{ DateVector, replicate_date },
{ DateTimeVector, replicate_datetime },
{ TimestampVector, replicate_timestamp }
);
impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector);
impl VectorOp for ConstantVector {
impl<T: LogicalPrimitiveType> VectorOp for PrimitiveVector<T> {
fn replicate(&self, offsets: &[usize]) -> VectorRef {
replicate::replicate_constant(self, offsets)
std::sync::Arc::new(replicate::replicate_primitive(self, offsets))
}
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
find_unique::find_unique_constant(self, selected, prev_vector);
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
find_unique::find_unique_scalar(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
filter::filter_constant(self, filter)
filter::filter_non_constant!(self, PrimitiveVector<T>, filter)
}
}
@@ -117,21 +111,17 @@ impl VectorOp for NullVector {
}
}
impl<T> VectorOp for PrimitiveVector<T>
where
T: PrimitiveElement,
{
impl VectorOp for ConstantVector {
fn replicate(&self, offsets: &[usize]) -> VectorRef {
replicate::replicate_primitive(self, offsets)
self.replicate_vector(offsets)
}
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
find_unique::find_unique_scalar(self, selected, prev_vector);
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
find_unique::find_unique_constant(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
filter::filter_non_constant!(self, PrimitiveVector<T>, filter)
self.filter_vector(filter)
}
}

View File

@@ -12,16 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) use crate::vectors::constant::filter_constant;
macro_rules! filter_non_constant {
($vector: expr, $VectorType: ty, $filter: ident) => {{
use std::sync::Arc;
use arrow::compute;
use snafu::ResultExt;
let arrow_array = $vector.as_arrow();
let filtered = arrow::compute::filter::filter(arrow_array, $filter.as_boolean_array())
let filtered = compute::filter(arrow_array, $filter.as_boolean_array())
.context(crate::error::ArrowComputeSnafu)?;
Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?))
}};
@@ -33,9 +32,16 @@ pub(crate) use filter_non_constant;
mod tests {
use std::sync::Arc;
use common_time::{Date, DateTime};
use crate::scalars::ScalarVector;
use crate::timestamp::{
TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
};
use crate::types::WrapperType;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{
BooleanVector, ConstantVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef,
BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef,
};
fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) {
@@ -105,7 +111,6 @@ mod tests {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use std::sync::Arc;
use common_time::$ValueType;
use $crate::vectors::{$VectorType, VectorRef};
let v = $VectorType::from_iterator((0..5).map($ValueType::$method));
@@ -123,6 +128,18 @@ mod tests {
fn test_filter_date_like() {
impl_filter_date_like_test!(DateVector, Date, new);
impl_filter_date_like_test!(DateTimeVector, DateTime, new);
impl_filter_date_like_test!(TimestampVector, Timestamp, from_millis);
impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native);
impl_filter_date_like_test!(
TimestampMillisecondVector,
TimestampMillisecond,
from_native
);
impl_filter_date_like_test!(
TimestampMicrosecondVector,
TimestampMicrosecond,
from_native
);
impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native);
}
}

View File

@@ -15,7 +15,8 @@
use common_base::BitVec;
use crate::scalars::ScalarVector;
use crate::vectors::{ConstantVector, NullVector, Vector};
use crate::vectors::constant::ConstantVector;
use crate::vectors::{NullVector, Vector};
// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as
// selected when it is different from the previous one, and leaves the `selected` unchanged
@@ -70,7 +71,7 @@ pub(crate) fn find_unique_null(
return;
}
let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true);
let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true);
if is_first_not_duplicate {
selected.set(0, true);
}
@@ -104,8 +105,11 @@ pub(crate) fn find_unique_constant(
mod tests {
use std::sync::Arc;
use common_time::{Date, DateTime};
use super::*;
use crate::vectors::{Int32Vector, StringVector, VectorOp};
use crate::timestamp::*;
use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp};
fn check_bitmap(expect: &[bool], selected: &BitVec) {
let actual = selected.iter().collect::<Vec<_>>();
@@ -121,7 +125,7 @@ mod tests {
input: impl Iterator<Item = Option<i32>>,
prev: Option<&[i32]>,
) {
let input = Int32Vector::from_iter(input);
let input = Int32Vector::from(input.collect::<Vec<_>>());
let prev = prev.map(Int32Vector::from_slice);
let mut selected = BitVec::repeat(false, input.len());
@@ -341,7 +345,6 @@ mod tests {
macro_rules! impl_find_unique_date_like_test {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use common_time::$ValueType;
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
@@ -356,6 +359,9 @@ mod tests {
fn test_find_unique_date_like() {
impl_find_unique_date_like_test!(DateVector, Date, new);
impl_find_unique_date_like_test!(DateTimeVector, DateTime, new);
impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis);
impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from);
impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from);
impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from);
impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from);
}
}

View File

@@ -13,12 +13,8 @@
// limitations under the License.
use crate::prelude::*;
pub(crate) use crate::vectors::constant::replicate_constant;
pub(crate) use crate::vectors::date::replicate_date;
pub(crate) use crate::vectors::datetime::replicate_datetime;
pub(crate) use crate::vectors::null::replicate_null;
pub(crate) use crate::vectors::primitive::replicate_primitive;
pub(crate) use crate::vectors::timestamp::replicate_timestamp;
pub(crate) fn replicate_scalar<C: ScalarVector>(c: &C, offsets: &[usize]) -> VectorRef {
assert_eq!(offsets.len(), c.len());
@@ -43,8 +39,13 @@ pub(crate) fn replicate_scalar<C: ScalarVector>(c: &C, offsets: &[usize]) -> Vec
mod tests {
use std::sync::Arc;
use common_time::timestamp::TimeUnit;
use common_time::{Date, DateTime, Timestamp};
use paste::paste;
use super::*;
use crate::vectors::{ConstantVector, Int32Vector, NullVector, StringVector, VectorOp};
use crate::vectors::constant::ConstantVector;
use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp};
#[test]
fn test_replicate_primitive() {
@@ -120,7 +121,6 @@ mod tests {
macro_rules! impl_replicate_date_like_test {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use common_time::$ValueType;
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator((0..5).map($ValueType::$method));
@@ -138,10 +138,33 @@ mod tests {
}};
}
// Expands to a replicate check for the timestamp vector of the given unit
// (`Second`, `Millisecond`, `Microsecond` or `Nanosecond`): builds
// `Timestamp<unit>Vector` from the values 0..5, replicates it with the offsets
// below, and asserts that the result holds the 4 values 1..=4 in that unit.
macro_rules! impl_replicate_timestamp_test {
($unit: ident) => {{
paste!{
use $crate::vectors::[<Timestamp $unit Vector>];
use $crate::timestamp::[<Timestamp $unit>];
let v = [<Timestamp $unit Vector>]::from_iterator((0..5).map([<Timestamp $unit>]::from));
// NOTE(review): presumably these are cumulative end offsets, so the first
// element gets zero copies and the others one each -- that matches the
// asserted length 4 and values `i + 1`; confirm against `replicate`.
let offsets = [0, 1, 2, 3, 4];
let v = v.replicate(&offsets);
assert_eq!(4, v.len());
for i in 0..4 {
assert_eq!(
Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)),
v.get(i)
);
}
}
}};
}
#[test]
fn test_replicate_date_like() {
impl_replicate_date_like_test!(DateVector, Date, new);
impl_replicate_date_like_test!(DateTimeVector, DateTime, new);
impl_replicate_date_like_test!(TimestampVector, Timestamp, from_millis);
impl_replicate_timestamp_test!(Second);
impl_replicate_timestamp_test!(Millisecond);
impl_replicate_timestamp_test!(Microsecond);
impl_replicate_timestamp_test!(Nanosecond);
}
}

View File

@@ -13,75 +13,111 @@
// limitations under the License.
use std::any::Any;
use std::iter::FromIterator;
use std::slice::Iter;
use std::fmt;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, MutableArray, MutablePrimitiveArray, PrimitiveArray};
use arrow::bitmap::utils::ZipValidity;
use arrow::array::{
Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder,
};
use serde_json::Value as JsonValue;
use snafu::{OptionExt, ResultExt};
use snafu::OptionExt;
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{ConversionSnafu, Result, SerializeSnafu};
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::{Primitive, PrimitiveElement};
use crate::types::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
};
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
pub type UInt8Vector = PrimitiveVector<UInt8Type>;
pub type UInt16Vector = PrimitiveVector<UInt16Type>;
pub type UInt32Vector = PrimitiveVector<UInt32Type>;
pub type UInt64Vector = PrimitiveVector<UInt64Type>;
pub type Int8Vector = PrimitiveVector<Int8Type>;
pub type Int16Vector = PrimitiveVector<Int16Type>;
pub type Int32Vector = PrimitiveVector<Int32Type>;
pub type Int64Vector = PrimitiveVector<Int64Type>;
pub type Float32Vector = PrimitiveVector<Float32Type>;
pub type Float64Vector = PrimitiveVector<Float64Type>;
/// Vector for primitive data types.
#[derive(Debug, Clone, PartialEq)]
pub struct PrimitiveVector<T: Primitive> {
pub(crate) array: PrimitiveArray<T>,
pub struct PrimitiveVector<T: LogicalPrimitiveType> {
array: PrimitiveArray<T::ArrowPrimitive>,
}
impl<T: Primitive> PrimitiveVector<T> {
pub fn new(array: PrimitiveArray<T>) -> Self {
impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
pub fn new(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
Self { array }
}
pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
Ok(Self::new(
array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<T>>()
.with_context(|| ConversionSnafu {
from: format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
let data = array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<T::ArrowPrimitive>>()
.with_context(|| error::ConversionSnafu {
from: format!("{:?}", array.as_ref().data_type()),
})?
.data()
.clone();
let concrete_array = PrimitiveArray::<T::ArrowPrimitive>::from(data);
Ok(Self::new(concrete_array))
}
pub fn from_slice<P: AsRef<[T]>>(slice: P) -> Self {
pub fn from_slice<P: AsRef<[T::Native]>>(slice: P) -> Self {
let iter = slice.as_ref().iter().copied();
Self {
array: PrimitiveArray::from_slice(slice),
array: PrimitiveArray::from_iter_values(iter),
}
}
pub fn from_vec(array: Vec<T>) -> Self {
pub fn from_wrapper_slice<P: AsRef<[T::Wrapper]>>(slice: P) -> Self {
let iter = slice.as_ref().iter().copied().map(WrapperType::into_native);
Self {
array: PrimitiveArray::from_vec(array),
array: PrimitiveArray::from_iter_values(iter),
}
}
pub fn from_values<I: IntoIterator<Item = T>>(iter: I) -> Self {
pub fn from_vec(array: Vec<T::Native>) -> Self {
Self {
array: PrimitiveArray::from_values(iter),
array: PrimitiveArray::from_iter_values(array),
}
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
pub fn from_values<I: IntoIterator<Item = T::Native>>(iter: I) -> Self {
Self {
array: PrimitiveArray::from_iter_values(iter),
}
}
pub(crate) fn as_arrow(&self) -> &PrimitiveArray<T::ArrowPrimitive> {
&self.array
}
fn slice(&self, offset: usize, length: usize) -> Self {
Self::from(self.array.slice(offset, length))
/// Returns a clone of the [`ArrayData`] that backs the underlying arrow array.
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
/// Builds a vector by converting `data` into a `PrimitiveArray` via `PrimitiveArray::from`.
///
/// NOTE(review): presumably `data` must describe an array of `T::ArrowPrimitive`;
/// nothing here validates that, so confirm the conversion's behavior on a mismatch.
fn from_array_data(data: ArrayData) -> Self {
Self {
array: PrimitiveArray::from(data),
}
}
/// Returns a new vector of the same concrete type covering `length` elements
/// starting at `offset`, built from the sliced `ArrayData` of the underlying array.
///
/// Named `get_slice` to distinguish it from `Vector::slice()`, which returns a
/// type-erased `VectorRef` instead of `Self`.
fn get_slice(&self, offset: usize, length: usize) -> Self {
let data = self.array.data().slice(offset, length);
Self::from_array_data(data)
}
}
impl<T: PrimitiveElement> Vector for PrimitiveVector<T> {
impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
fn data_type(&self) -> ConcreteDataType {
T::build_data_type()
}
@@ -99,11 +135,13 @@ impl<T: PrimitiveElement> Vector for PrimitiveVector<T> {
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
let data = self.to_array_data();
Arc::new(PrimitiveArray::<T::ArrowPrimitive>::from(data))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
Box::new(self.array.clone())
let data = self.to_array_data();
Box::new(PrimitiveArray::<T::ArrowPrimitive>::from(data))
}
fn validity(&self) -> Validity {
@@ -111,7 +149,11 @@ impl<T: PrimitiveElement> Vector for PrimitiveVector<T> {
}
fn memory_size(&self) -> usize {
self.array.values().len() * std::mem::size_of::<T>()
self.array.get_buffer_memory_size()
}
/// Returns the number of null elements, as reported by the underlying arrow array.
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, row: usize) -> bool {
@@ -119,57 +161,80 @@ impl<T: PrimitiveElement> Vector for PrimitiveVector<T> {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(self.slice(offset, length))
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
}
fn get(&self, index: usize) -> Value {
vectors::impl_get_for_vector!(self.array, index)
if self.array.is_valid(index) {
// Safety: The index has been checked by `is_valid()`.
let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) };
wrapper.into()
} else {
Value::Null
}
}
fn get_ref(&self, index: usize) -> ValueRef {
if self.array.is_valid(index) {
// Safety: The index has been checked by `is_valid()`.
unsafe { self.array.value_unchecked(index).into_value_ref() }
let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) };
wrapper.into()
} else {
ValueRef::Null
}
}
}
impl<T: Primitive> From<PrimitiveArray<T>> for PrimitiveVector<T> {
fn from(array: PrimitiveArray<T>) -> Self {
// Hand-written `Debug` implementation: formats the vector as a struct named
// `PrimitiveVector` with a single `array` field holding the arrow array.
// NOTE(review): presumably written by hand so that `T` itself does not need to
// implement `Debug` — confirm.
impl<T: LogicalPrimitiveType> fmt::Debug for PrimitiveVector<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("PrimitiveVector")
.field("array", &self.array)
.finish()
}
}
impl<T: LogicalPrimitiveType> From<PrimitiveArray<T::ArrowPrimitive>> for PrimitiveVector<T> {
fn from(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
Self { array }
}
}
impl<T: Primitive> From<Vec<Option<T>>> for PrimitiveVector<T> {
fn from(v: Vec<Option<T>>) -> Self {
impl<T: LogicalPrimitiveType> From<Vec<Option<T::Native>>> for PrimitiveVector<T> {
fn from(v: Vec<Option<T::Native>>) -> Self {
Self {
array: PrimitiveArray::<T>::from(v),
array: PrimitiveArray::from_iter(v),
}
}
}
impl<T: Primitive, Ptr: std::borrow::Borrow<Option<T>>> FromIterator<Ptr> for PrimitiveVector<T> {
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
Self {
array: MutablePrimitiveArray::<T>::from_iter(iter).into(),
}
/// Iterator over the elements of a primitive vector.
///
/// Wraps an arrow `ArrayIter` that borrows the underlying
/// `PrimitiveArray<T::ArrowPrimitive>` for the lifetime `'a`.
pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> {
// Arrow iterator yielding optional native values.
iter: ArrayIter<&'a PrimitiveArray<T::ArrowPrimitive>>,
}
/// Yields each element as `Option<T::Wrapper>`.
///
/// Items coming from the inner arrow iterator are optional native values; a present
/// value is converted with `T::Wrapper::from_native`, an absent one stays `None`.
impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> {
type Item = Option<T::Wrapper>;
fn next(&mut self) -> Option<Option<T::Wrapper>> {
// Outer `Option`: end of iteration. Inner `Option`: whether the element is present.
self.iter
.next()
.map(|item| item.map(T::Wrapper::from_native))
}
// The mapping is one-to-one, so the inner iterator's size hint is forwarded unchanged.
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<T> ScalarVector for PrimitiveVector<T>
where
T: PrimitiveElement,
{
type OwnedItem = T;
type RefItem<'a> = T;
impl<T: LogicalPrimitiveType> ScalarVector for PrimitiveVector<T> {
type OwnedItem = T::Wrapper;
type RefItem<'a> = T::Wrapper;
type Iter<'a> = PrimitiveIter<'a, T>;
type Builder = PrimitiveVectorBuilder<T>;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
if self.array.is_valid(idx) {
Some(self.array.value(idx))
Some(T::Wrapper::from_native(self.array.value(idx)))
} else {
None
}
@@ -182,59 +247,47 @@ where
}
}
pub type UInt8Vector = PrimitiveVector<u8>;
pub type UInt16Vector = PrimitiveVector<u16>;
pub type UInt32Vector = PrimitiveVector<u32>;
pub type UInt64Vector = PrimitiveVector<u64>;
pub type Int8Vector = PrimitiveVector<i8>;
pub type Int16Vector = PrimitiveVector<i16>;
pub type Int32Vector = PrimitiveVector<i32>;
pub type Int64Vector = PrimitiveVector<i64>;
pub type Float32Vector = PrimitiveVector<f32>;
pub type Float64Vector = PrimitiveVector<f64>;
pub struct PrimitiveIter<'a, T> {
iter: ZipValidity<'a, &'a T, Iter<'a, T>>,
}
impl<'a, T: Copy> Iterator for PrimitiveIter<'a, T> {
type Item = Option<T>;
fn next(&mut self) -> Option<Option<T>> {
self.iter.next().map(|v| v.copied())
}
}
impl<T: PrimitiveElement> Serializable for PrimitiveVector<T> {
impl<T: LogicalPrimitiveType> Serializable for PrimitiveVector<T> {
fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
self.array
.iter()
.map(serde_json::to_value)
.collect::<serde_json::Result<_>>()
.context(SerializeSnafu)
let res = self
.iter_data()
.map(|v| match v {
None => serde_json::Value::Null,
// use WrapperType's Into<serde_json::Value> bound instead of
// serde_json::to_value to facilitate customized serialization
// for WrapperType
Some(v) => v.into(),
})
.collect::<Vec<_>>();
Ok(res)
}
}
pub struct PrimitiveVectorBuilder<T: PrimitiveElement> {
pub(crate) mutable_array: MutablePrimitiveArray<T>,
// Two primitive vectors of the same logical type are equal exactly when their
// underlying arrow arrays compare equal.
impl<T: LogicalPrimitiveType> PartialEq for PrimitiveVector<T> {
fn eq(&self, other: &PrimitiveVector<T>) -> bool {
self.array == other.array
}
}
pub type UInt8VectorBuilder = PrimitiveVectorBuilder<u8>;
pub type UInt16VectorBuilder = PrimitiveVectorBuilder<u16>;
pub type UInt32VectorBuilder = PrimitiveVectorBuilder<u32>;
pub type UInt64VectorBuilder = PrimitiveVectorBuilder<u64>;
pub type UInt8VectorBuilder = PrimitiveVectorBuilder<UInt8Type>;
pub type UInt16VectorBuilder = PrimitiveVectorBuilder<UInt16Type>;
pub type UInt32VectorBuilder = PrimitiveVectorBuilder<UInt32Type>;
pub type UInt64VectorBuilder = PrimitiveVectorBuilder<UInt64Type>;
pub type Int8VectorBuilder = PrimitiveVectorBuilder<i8>;
pub type Int16VectorBuilder = PrimitiveVectorBuilder<i16>;
pub type Int32VectorBuilder = PrimitiveVectorBuilder<i32>;
pub type Int64VectorBuilder = PrimitiveVectorBuilder<i64>;
pub type Int8VectorBuilder = PrimitiveVectorBuilder<Int8Type>;
pub type Int16VectorBuilder = PrimitiveVectorBuilder<Int16Type>;
pub type Int32VectorBuilder = PrimitiveVectorBuilder<Int32Type>;
pub type Int64VectorBuilder = PrimitiveVectorBuilder<Int64Type>;
pub type Float32VectorBuilder = PrimitiveVectorBuilder<f32>;
pub type Float64VectorBuilder = PrimitiveVectorBuilder<f64>;
pub type Float32VectorBuilder = PrimitiveVectorBuilder<Float32Type>;
pub type Float64VectorBuilder = PrimitiveVectorBuilder<Float64Type>;
impl<T: PrimitiveElement> MutableVector for PrimitiveVectorBuilder<T> {
/// Builder to build a primitive vector.
///
/// Values are accumulated in an arrow `PrimitiveBuilder` over `T::ArrowPrimitive`.
pub struct PrimitiveVectorBuilder<T: LogicalPrimitiveType> {
// Arrow builder holding the values and null markers pushed so far.
mutable_array: PrimitiveBuilder<T::ArrowPrimitive>,
}
impl<T: LogicalPrimitiveType> MutableVector for PrimitiveVectorBuilder<T> {
fn data_type(&self) -> ConcreteDataType {
T::build_data_type()
}
@@ -257,81 +310,62 @@ impl<T: PrimitiveElement> MutableVector for PrimitiveVectorBuilder<T> {
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
let primitive = T::cast_value_ref(value)?;
self.mutable_array.push(primitive);
match primitive {
Some(v) => self.mutable_array.append_value(v.into_native()),
None => self.mutable_array.append_null(),
}
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
let primitive = T::cast_vector(vector)?;
// Slice the underlying array to avoid creating a new Arc.
let slice = primitive.slice(offset, length);
self.mutable_array.extend_trusted_len(slice.iter());
let slice = primitive.get_slice(offset, length);
for v in slice.iter_data() {
self.push(v);
}
Ok(())
}
}
impl<T> ScalarVectorBuilder for PrimitiveVectorBuilder<T>
where
T: Scalar<VectorType = PrimitiveVector<T>> + PrimitiveElement,
for<'a> T: ScalarRef<'a, ScalarType = T, VectorType = PrimitiveVector<T>>,
for<'a> T: Scalar<RefType<'a> = T>,
T: LogicalPrimitiveType,
T::Wrapper: Scalar<VectorType = PrimitiveVector<T>>,
for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>,
for<'a> T::Wrapper: Scalar<RefType<'a> = T::Wrapper>,
{
type VectorType = PrimitiveVector<T>;
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutablePrimitiveArray::with_capacity(capacity),
mutable_array: PrimitiveBuilder::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.mutable_array.push(value);
self.mutable_array
.append_option(value.map(|v| v.into_native()));
}
fn finish(&mut self) -> Self::VectorType {
PrimitiveVector {
array: std::mem::take(&mut self.mutable_array).into(),
array: self.mutable_array.finish(),
}
}
}
impl<T: PrimitiveElement> PrimitiveVectorBuilder<T> {
fn with_type_capacity(data_type: ConcreteDataType, capacity: usize) -> Self {
Self {
mutable_array: MutablePrimitiveArray::with_capacity_from(
capacity,
data_type.as_arrow_type(),
),
}
}
}
pub(crate) fn replicate_primitive<T: PrimitiveElement>(
pub(crate) fn replicate_primitive<T: LogicalPrimitiveType>(
vector: &PrimitiveVector<T>,
offsets: &[usize],
) -> VectorRef {
Arc::new(replicate_primitive_with_type(
vector,
offsets,
T::build_data_type(),
))
}
pub(crate) fn replicate_primitive_with_type<T: PrimitiveElement>(
vector: &PrimitiveVector<T>,
offsets: &[usize],
data_type: ConcreteDataType,
) -> PrimitiveVector<T> {
assert_eq!(offsets.len(), vector.len());
if offsets.is_empty() {
return vector.slice(0, 0);
return vector.get_slice(0, 0);
}
let mut builder = PrimitiveVectorBuilder::<T>::with_type_capacity(
data_type,
*offsets.last().unwrap() as usize,
);
let mut builder = PrimitiveVectorBuilder::<T>::with_capacity(*offsets.last().unwrap() as usize);
let mut previous_offset = 0;
@@ -339,14 +373,15 @@ pub(crate) fn replicate_primitive_with_type<T: PrimitiveElement>(
let repeat_times = *offset - previous_offset;
match value {
Some(data) => {
builder.mutable_array.extend_trusted_len(
std::iter::repeat(*data)
.take(repeat_times)
.map(Option::Some),
);
unsafe {
// Safety: std::iter::Repeat and std::iter::Take implement TrustedLen.
builder
.mutable_array
.append_trusted_len_iter(std::iter::repeat(data).take(repeat_times));
}
}
None => {
builder.mutable_array.extend_constant(repeat_times, None);
builder.mutable_array.append_nulls(repeat_times);
}
}
previous_offset = *offset;
@@ -356,6 +391,7 @@ pub(crate) fn replicate_primitive_with_type<T: PrimitiveElement>(
#[cfg(test)]
mod tests {
use arrow::array::Int32Array;
use arrow::datatypes::DataType as ArrowDataType;
use serde_json;
@@ -364,11 +400,11 @@ mod tests {
use crate::serialize::Serializable;
use crate::types::Int64Type;
fn check_vec(v: PrimitiveVector<i32>) {
fn check_vec(v: Int32Vector) {
assert_eq!(4, v.len());
assert_eq!("Int32Vector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(v.validity().is_all_valid());
assert!(!v.only_null());
for i in 0..4 {
@@ -387,26 +423,26 @@ mod tests {
#[test]
fn test_from_values() {
let v = PrimitiveVector::<i32>::from_values(vec![1, 2, 3, 4]);
let v = Int32Vector::from_values(vec![1, 2, 3, 4]);
check_vec(v);
}
#[test]
fn test_from_vec() {
let v = PrimitiveVector::<i32>::from_vec(vec![1, 2, 3, 4]);
let v = Int32Vector::from_vec(vec![1, 2, 3, 4]);
check_vec(v);
}
#[test]
fn test_from_slice() {
let v = PrimitiveVector::<i32>::from_slice(vec![1, 2, 3, 4]);
let v = Int32Vector::from_slice(vec![1, 2, 3, 4]);
check_vec(v);
}
#[test]
fn test_serialize_primitive_vector_with_null_to_json() {
let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
let mut builder = Int32VectorBuilder::with_capacity(input.len());
for v in input {
builder.push(v);
}
@@ -421,15 +457,15 @@ mod tests {
#[test]
fn test_from_arrow_array() {
let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]);
let v = PrimitiveVector::from(arrow_array);
let arrow_array = Int32Array::from(vec![1, 2, 3, 4]);
let v = Int32Vector::from(arrow_array);
check_vec(v);
}
#[test]
fn test_primitive_vector_build_get() {
let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
let mut builder = Int32VectorBuilder::with_capacity(input.len());
for v in input {
builder.push(v);
}
@@ -448,29 +484,28 @@ mod tests {
#[test]
fn test_primitive_vector_validity() {
let input = [Some(1i32), Some(2i32), None, None];
let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
let mut builder = Int32VectorBuilder::with_capacity(input.len());
for v in input {
builder.push(v);
}
let vector = builder.finish();
assert_eq!(2, vector.null_count());
let validity = vector.validity();
let slots = validity.slots().unwrap();
assert_eq!(2, slots.null_count());
assert!(!slots.get_bit(2));
assert!(!slots.get_bit(3));
assert_eq!(2, validity.null_count());
assert!(!validity.is_set(2));
assert!(!validity.is_set(3));
let vector = PrimitiveVector::<i32>::from_slice(vec![1, 2, 3, 4]);
let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]);
assert_eq!(0, vector.null_count());
assert_eq!(Validity::AllValid, vector.validity());
assert!(vector.validity().is_all_valid());
}
#[test]
fn test_memory_size() {
let v = PrimitiveVector::<i32>::from_slice((0..5).collect::<Vec<i32>>());
assert_eq!(20, v.memory_size());
let v = PrimitiveVector::<i64>::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
assert_eq!(40, v.memory_size());
let v = Int32Vector::from_slice((0..5).collect::<Vec<i32>>());
assert_eq!(64, v.memory_size());
let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
assert_eq!(128, v.memory_size());
}
#[test]
@@ -489,4 +524,29 @@ mod tests {
let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9]));
assert_eq!(expect, vector);
}
// Checks, for every numeric vector alias, that `from_wrapper_slice` and `from_slice`
// produce equal vectors when given the type's MAX and MIN values.
#[test]
fn test_from_wrapper_slice() {
// `$vec` is the vector alias under test and `$ty` the matching Rust primitive.
// NOTE(review): `$ty::from_native` is called on the primitive itself, so this
// presumably relies on the primitive being its own wrapper type — confirm.
macro_rules! test_from_wrapper_slice {
($vec: ident, $ty: ident) => {
let from_wrapper_slice = $vec::from_wrapper_slice(&[
$ty::from_native($ty::MAX),
$ty::from_native($ty::MIN),
]);
let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]);
assert_eq!(from_wrapper_slice, from_slice);
};
}
test_from_wrapper_slice!(UInt8Vector, u8);
test_from_wrapper_slice!(Int8Vector, i8);
test_from_wrapper_slice!(UInt16Vector, u16);
test_from_wrapper_slice!(Int16Vector, i16);
test_from_wrapper_slice!(UInt32Vector, u32);
test_from_wrapper_slice!(Int32Vector, i32);
test_from_wrapper_slice!(UInt64Vector, u64);
test_from_wrapper_slice!(Int64Vector, i64);
test_from_wrapper_slice!(Float32Vector, f32);
test_from_wrapper_slice!(Float64Vector, f64);
}
}

View File

@@ -15,22 +15,19 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, MutableArray, Utf8ValuesIter};
use arrow::bitmap::utils::ZipValidity;
use serde_json::Value as JsonValue;
use snafu::{OptionExt, ResultExt};
use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef};
use snafu::ResultExt;
use crate::arrow_array::{MutableStringArray, StringArray};
use crate::data_type::ConcreteDataType;
use crate::error::{Result, SerializeSnafu};
use crate::error::{self, Result};
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::StringType;
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
/// String array wrapper
#[derive(Debug, Clone, PartialEq)]
/// Vector of strings.
#[derive(Debug, PartialEq)]
pub struct StringVector {
array: StringArray,
}
@@ -39,6 +36,16 @@ impl StringVector {
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
/// Returns a clone of the [`ArrayData`] that backs the underlying string array.
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
/// Builds a `StringVector` by converting `data` into a `StringArray` via `StringArray::from`.
///
/// NOTE(review): presumably `data` must describe a string array; nothing here
/// validates that, so confirm the conversion's behavior on a mismatch.
fn from_array_data(data: ArrayData) -> Self {
Self {
array: StringArray::from(data),
}
}
}
impl From<StringArray> for StringVector {
@@ -50,19 +57,7 @@ impl From<StringArray> for StringVector {
impl From<Vec<Option<String>>> for StringVector {
fn from(data: Vec<Option<String>>) -> Self {
Self {
array: StringArray::from(data),
}
}
}
impl From<Vec<String>> for StringVector {
fn from(data: Vec<String>) -> Self {
Self {
array: StringArray::from(
data.into_iter()
.map(Option::Some)
.collect::<Vec<Option<String>>>(),
),
array: StringArray::from_iter(data),
}
}
}
@@ -70,7 +65,31 @@ impl From<Vec<String>> for StringVector {
impl From<Vec<Option<&str>>> for StringVector {
fn from(data: Vec<Option<&str>>) -> Self {
Self {
array: StringArray::from(data),
array: StringArray::from_iter(data),
}
}
}
/// Builds a `StringVector` from a borrowed slice of optional owned strings by
/// collecting the slice with `StringArray::from_iter`.
/// NOTE(review): `None` entries presumably become nulls in the resulting array — confirm.
impl From<&[Option<String>]> for StringVector {
fn from(data: &[Option<String>]) -> Self {
Self {
array: StringArray::from_iter(data),
}
}
}
/// Builds a `StringVector` from a borrowed slice of optional string slices by
/// collecting the slice with `StringArray::from_iter`.
/// NOTE(review): `None` entries presumably become nulls in the resulting array — confirm.
impl From<&[Option<&str>]> for StringVector {
fn from(data: &[Option<&str>]) -> Self {
Self {
array: StringArray::from_iter(data),
}
}
}
/// Builds a `StringVector` from owned, non-optional strings.
impl From<Vec<String>> for StringVector {
fn from(data: Vec<String>) -> Self {
Self {
// Every element is wrapped in `Some`, so no `None` is ever passed to the array.
array: StringArray::from_iter(data.into_iter().map(Some)),
}
}
}
@@ -78,18 +97,14 @@ impl From<Vec<Option<&str>>> for StringVector {
impl From<Vec<&str>> for StringVector {
fn from(data: Vec<&str>) -> Self {
Self {
array: StringArray::from(
data.into_iter()
.map(Option::Some)
.collect::<Vec<Option<&str>>>(),
),
array: StringArray::from_iter(data.into_iter().map(Some)),
}
}
}
impl Vector for StringVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::String(StringType::default())
ConcreteDataType::string_datatype()
}
fn vector_type_name(&self) -> String {
@@ -105,11 +120,13 @@ impl Vector for StringVector {
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
let data = self.to_array_data();
Arc::new(StringArray::from(data))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
Box::new(self.array.clone())
let data = self.to_array_data();
Box::new(StringArray::from(data))
}
fn validity(&self) -> Validity {
@@ -117,7 +134,11 @@ impl Vector for StringVector {
}
fn memory_size(&self) -> usize {
self.len() * std::mem::size_of::<i64>() + self.array.values().len()
self.array.get_buffer_memory_size()
}
/// Returns the number of null elements, as reported by the underlying arrow array.
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, row: usize) -> bool {
@@ -125,7 +146,8 @@ impl Vector for StringVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(Self::from(self.array.slice(offset, length)))
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
}
fn get(&self, index: usize) -> Value {
@@ -140,7 +162,7 @@ impl Vector for StringVector {
impl ScalarVector for StringVector {
type OwnedItem = String;
type RefItem<'a> = &'a str;
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>;
type Iter<'a> = ArrayIter<&'a StringArray>;
type Builder = StringVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -157,7 +179,7 @@ impl ScalarVector for StringVector {
}
pub struct StringVectorBuilder {
buffer: MutableStringArray,
mutable_array: MutableStringArray,
}
impl MutableVector for StringVectorBuilder {
@@ -166,7 +188,7 @@ impl MutableVector for StringVectorBuilder {
}
fn len(&self) -> usize {
self.buffer.len()
self.mutable_array.len()
}
fn as_any(&self) -> &dyn Any {
@@ -182,12 +204,15 @@ impl MutableVector for StringVectorBuilder {
}
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
self.buffer.push(value.as_string()?);
match value.as_string()? {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
vectors::impl_extend_for_builder!(self.buffer, vector, StringVector, offset, length)
vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
}
}
@@ -196,30 +221,30 @@ impl ScalarVectorBuilder for StringVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
buffer: MutableStringArray::with_capacity(capacity),
mutable_array: MutableStringArray::with_capacity(capacity, 0),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.buffer.push(value)
match value {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
}
fn finish(&mut self) -> Self::VectorType {
Self::VectorType {
array: std::mem::take(&mut self.buffer).into(),
StringVector {
array: self.mutable_array.finish(),
}
}
}
impl Serializable for StringVector {
fn serialize_to_json(&self) -> crate::error::Result<Vec<JsonValue>> {
fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
self.iter_data()
.map(|v| match v {
None => Ok(serde_json::Value::Null),
Some(s) => serde_json::to_value(s),
})
.map(serde_json::to_value)
.collect::<serde_json::Result<_>>()
.context(SerializeSnafu)
.context(error::SerializeSnafu)
}
}
@@ -227,60 +252,9 @@ vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector);
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use serde_json;
use arrow::datatypes::DataType;
use super::*;
use crate::data_type::DataType;
#[test]
fn test_string_vector_misc() {
let strs = vec!["hello", "greptime", "rust"];
let v = StringVector::from(strs.clone());
assert_eq!(3, v.len());
assert_eq!("StringVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
assert_eq!(41, v.memory_size());
for (i, s) in strs.iter().enumerate() {
assert_eq!(Value::from(*s), v.get(i));
assert_eq!(ValueRef::from(*s), v.get_ref(i));
assert_eq!(Value::from(*s), v.try_get(i).unwrap());
}
let arrow_arr = v.to_arrow_array();
assert_eq!(3, arrow_arr.len());
assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type());
}
#[test]
fn test_serialize_string_vector() {
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("hello"));
builder.push(None);
builder.push(Some("world"));
let string_vector = builder.finish();
let serialized =
serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["hello",null,"world"]"#, serialized);
}
#[test]
fn test_from_arrow_array() {
let mut builder = MutableStringArray::new();
builder.push(Some("A"));
builder.push(Some("B"));
builder.push::<&str>(None);
builder.push(Some("D"));
let string_array: StringArray = builder.into();
let vector = StringVector::from(string_array);
assert_eq!(
r#"["A","B",null,"D"]"#,
serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
);
}
#[test]
fn test_string_vector_build_get() {
@@ -310,7 +284,7 @@ mod tests {
#[test]
fn test_string_vector_builder() {
let mut builder = StringType::default().create_mutable_vector(3);
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push_value_ref(ValueRef::String("hello")).unwrap();
assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
@@ -324,4 +298,73 @@ mod tests {
let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
assert_eq!(expect, vector);
}
// Exercises the basic `Vector` accessors on a `StringVector` built from three
// non-null `&str` values, then checks the arrow array it converts to.
#[test]
fn test_string_vector_misc() {
let strs = vec!["hello", "greptime", "rust"];
let v = StringVector::from(strs.clone());
assert_eq!(3, v.len());
assert_eq!("StringVector", v.vector_type_name());
assert!(!v.is_const());
assert!(v.validity().is_all_valid());
assert!(!v.only_null());
// NOTE(review): 128 presumably reflects how arrow accounts for its buffers
// (allocated capacity rather than raw payload bytes) — confirm.
assert_eq!(128, v.memory_size());
// `get`, `get_ref` and `try_get` must all agree with the source strings.
for (i, s) in strs.iter().enumerate() {
assert_eq!(Value::from(*s), v.get(i));
assert_eq!(ValueRef::from(*s), v.get_ref(i));
assert_eq!(Value::from(*s), v.try_get(i).unwrap());
}
let arrow_arr = v.to_arrow_array();
assert_eq!(3, arrow_arr.len());
assert_eq!(&DataType::Utf8, arrow_arr.data_type());
}
// Builds a vector with a null in the middle and checks that JSON serialization
// renders the null element as `null` between the two strings.
#[test]
fn test_serialize_string_vector() {
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("hello"));
builder.push(None);
builder.push(Some("world"));
let string_vector = builder.finish();
let serialized =
serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["hello",null,"world"]"#, serialized);
}
// Builds an arrow `StringArray` directly (with a null at index 2), wraps it in a
// `StringVector` via `From<StringArray>`, and checks the JSON serialization.
#[test]
fn test_from_arrow_array() {
let mut builder = MutableStringArray::new();
builder.append_option(Some("A"));
builder.append_option(Some("B"));
builder.append_null();
builder.append_option(Some("D"));
let string_array: StringArray = builder.finish();
let vector = StringVector::from(string_array);
assert_eq!(
r#"["A","B",null,"D"]"#,
serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
);
}
// Checks the non-optional `From<Vec<&str>>` and `From<Vec<String>>` conversions
// with multi-byte UTF-8 input and a string consisting of a single NUL character.
#[test]
fn test_from_non_option_string() {
// A one-character string containing U+0000; JSON must escape it as `\u0000`.
let nul = String::from_utf8(vec![0]).unwrap();
let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
let vector = StringVector::from(corpus);
let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);
// Same check through the owned-`String` conversion.
let corpus = vec![
"🀀🀀🀀".to_string(),
"🀁🀁🀁".to_string(),
"🀂🀂🀂".to_string(),
"🀃🀃🀃".to_string(),
"🀆🀆".to_string(),
];
let vector = StringVector::from(corpus);
let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
}
}

View File

@@ -12,308 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use common_time::timestamp::{TimeUnit, Timestamp};
use snafu::OptionExt;
use crate::data_type::{ConcreteDataType, DataType};
use crate::error;
use crate::error::Result;
use crate::prelude::{
MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef,
use crate::types::{
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType,
};
use crate::serialize::Serializable;
use crate::types::TimestampType;
use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
/// `TimestampVector` stores timestamp in millisecond since UNIX Epoch.
#[derive(Debug, Clone, PartialEq)]
pub struct TimestampVector {
array: PrimitiveVector<i64>,
}
pub type TimestampSecondVector = PrimitiveVector<TimestampSecondType>;
pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder<TimestampSecondType>;
impl TimestampVector {
pub fn new(array: PrimitiveArray<i64>) -> Self {
Self {
array: PrimitiveVector { array },
}
}
pub type TimestampMillisecondVector = PrimitiveVector<TimestampMillisecondType>;
pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder<TimestampMillisecondType>;
pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
Ok(Self::new(
array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<i64>>()
.with_context(|| error::ConversionSnafu {
from: format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
}
pub type TimestampMicrosecondVector = PrimitiveVector<TimestampMicrosecondType>;
pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder<TimestampMicrosecondType>;
pub fn from_values<I: IntoIterator<Item = i64>>(iter: I) -> Self {
Self {
array: PrimitiveVector {
array: PrimitiveArray::from_values(iter),
},
}
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
self.array.as_arrow()
}
}
impl Vector for TimestampVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::timestamp_millis_datatype()
}
fn vector_type_name(&self) -> String {
"TimestampVector".to_string()
}
fn as_any(&self) -> &dyn Any {
self
}
fn len(&self) -> usize {
self.array.len()
}
fn to_arrow_array(&self) -> ArrayRef {
let validity = self.array.array.validity().cloned();
let buffer = self.array.array.values().clone();
Arc::new(PrimitiveArray::new(
TimestampType::new(TimeUnit::Millisecond).as_arrow_type(),
buffer,
validity,
))
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let validity = self.array.array.validity().cloned();
let values = self.array.array.values().clone();
Box::new(PrimitiveArray::new(
arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None),
values,
validity,
))
}
fn validity(&self) -> Validity {
self.array.validity()
}
fn memory_size(&self) -> usize {
self.array.memory_size()
}
fn is_null(&self, row: usize) -> bool {
self.array.is_null(row)
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(Self {
array: PrimitiveVector {
array: self.array.array.slice(offset, length),
},
})
}
fn get(&self, index: usize) -> Value {
match self.array.get(index) {
Value::Null => Value::Null,
Value::Int64(v) => Value::Timestamp(Timestamp::from_millis(v)),
_ => {
unreachable!()
}
}
}
fn get_ref(&self, index: usize) -> ValueRef {
match self.array.get(index) {
Value::Int64(v) => ValueRef::Timestamp(Timestamp::from_millis(v)),
Value::Null => ValueRef::Null,
_ => unreachable!(),
}
}
}
impl Serializable for TimestampVector {
fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
Ok(self
.array
.iter_data()
.map(|v| match v {
None => serde_json::Value::Null,
Some(v) => v.into(),
})
.collect::<Vec<_>>())
}
}
impl ScalarVector for TimestampVector {
type OwnedItem = Timestamp;
type RefItem<'a> = Timestamp;
type Iter<'a> = TimestampDataIter<'a>;
type Builder = TimestampVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
self.array.get_data(idx).map(Timestamp::from_millis)
}
fn iter_data(&self) -> Self::Iter<'_> {
TimestampDataIter {
iter: self.array.iter_data(),
}
}
}
pub struct TimestampDataIter<'a> {
iter: PrimitiveIter<'a, i64>,
}
impl<'a> Iterator for TimestampDataIter<'a> {
type Item = Option<Timestamp>;
fn next(&mut self) -> Option<Self::Item> {
self.iter.next().map(|v| v.map(Timestamp::from_millis))
}
}
pub struct TimestampVectorBuilder {
buffer: PrimitiveVectorBuilder<i64>,
}
impl MutableVector for TimestampVectorBuilder {
    /// The builder always reports the millisecond timestamp data type.
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::timestamp_millis_datatype()
    }

    /// Number of values (including nulls) pushed so far.
    fn len(&self) -> usize {
        self.buffer.len()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }

    /// Finishes the builder and returns the result as a shared vector.
    fn to_vector(&mut self) -> VectorRef {
        Arc::new(self.finish())
    }

    /// Pushes a value reference, converting a present timestamp to milliseconds.
    ///
    /// # Errors
    ///
    /// Propagates the error from `ValueRef::as_timestamp` when `value` cannot
    /// be viewed as a timestamp.
    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
        // TODO(hl): vector and vector builder should also support customized time unit.
        self.buffer.push(
            value
                .as_timestamp()?
                .map(|t| t.convert_to(TimeUnit::Millisecond)),
        );
        Ok(())
    }

    /// Appends `length` elements of `vector`, starting at `offset`.
    ///
    /// # Errors
    ///
    /// Returns a cast-type error when `vector` is not a `TimestampVector`, and
    /// propagates any error from the underlying primitive builder.
    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
        let concrete_vector = vector
            .as_any()
            .downcast_ref::<TimestampVector>()
            .with_context(|| error::CastTypeSnafu {
                // Name the real target type; the message previously said `DateVector`.
                msg: format!(
                    "Failed to convert vector from {} to TimestampVector",
                    vector.vector_type_name()
                ),
            })?;
        self.buffer
            .extend_slice_of(&concrete_vector.array, offset, length)?;
        Ok(())
    }
}
impl ScalarVectorBuilder for TimestampVectorBuilder {
    type VectorType = TimestampVector;

    /// Creates a builder whose inner buffer is created with the given `capacity`.
    fn with_capacity(capacity: usize) -> Self {
        let buffer = PrimitiveVectorBuilder::with_capacity(capacity);
        Self { buffer }
    }

    /// Pushes a Timestamp value into vector builder. The timestamp must be with time unit
    /// `Second`/`Millisecond`/`Microsecond`; it is stored converted to milliseconds.
    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
        let millis = value.map(|ts| ts.convert_to(TimeUnit::Millisecond));
        self.buffer.push(millis);
    }

    /// Finishes the inner buffer and wraps the resulting array in a `TimestampVector`.
    fn finish(&mut self) -> Self::VectorType {
        let array = self.buffer.finish();
        Self::VectorType { array }
    }
}
/// Replicates the elements of `vector` according to `offsets`.
///
/// The work is delegated to `replicate_primitive_with_type` on the backing
/// array, passing the vector's own data type; the result is wrapped in a new
/// `TimestampVector`.
pub(crate) fn replicate_timestamp(vector: &TimestampVector, offsets: &[usize]) -> VectorRef {
    let data_type = vector.data_type();
    let replicated = crate::vectors::primitive::replicate_primitive_with_type(
        &vector.array,
        offsets,
        data_type,
    );
    Arc::new(TimestampVector { array: replicated })
}
#[cfg(test)]
mod tests {
use super::*;
// Builds a vector from mixed-unit inputs and checks that values come back
// normalized to milliseconds.
#[test]
pub fn test_build_timestamp_vector() {
let mut builder = TimestampVectorBuilder::with_capacity(3);
// 1 second is expected to be stored as 1000 milliseconds.
builder.push(Some(Timestamp::new(1, TimeUnit::Second)));
builder.push(None);
builder.push(Some(Timestamp::new(2, TimeUnit::Millisecond)));
let vector = builder.finish();
assert_eq!(
ConcreteDataType::timestamp_millis_datatype(),
vector.data_type()
);
assert_eq!(3, vector.len());
assert_eq!(
Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)),
vector.get(0)
);
// The `None` pushed above reads back as a null value.
assert_eq!(Value::Null, vector.get(1));
assert_eq!(
Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)),
vector.get(2)
);
// Iteration yields the same three entries, with the null as `None`.
assert_eq!(
vec![
Some(Timestamp::new(1000, TimeUnit::Millisecond)),
None,
Some(Timestamp::new(2, TimeUnit::Millisecond)),
],
vector.iter_data().collect::<Vec<_>>()
);
}
// Round-trips a vector through its arrow array representation.
#[test]
fn test_timestamp_from_arrow() {
let vector =
TimestampVector::from_slice(&[Timestamp::from_millis(1), Timestamp::from_millis(2)]);
// Slice the full range so the conversion input is an arrow array value.
let arrow = vector.as_arrow().slice(0, vector.len());
let vector2 = TimestampVector::try_from_arrow_array(&arrow).unwrap();
assert_eq!(vector, vector2);
}
}
/// Primitive vector specialised for `TimestampNanosecondType`.
pub type TimestampNanosecondVector = PrimitiveVector<TimestampNanosecondType>;
/// Builder counterpart of [`TimestampNanosecondVector`].
pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder<TimestampNanosecondType>;

View File

@@ -9,10 +9,11 @@ default = []
test = []
[dependencies]
arrow = "26.0.0"
common-base = { path = "../common/base" }
common-error = { path = "../common/error" }
common-time = { path = "../common/time" }
datafusion-common = "14.0"
datafusion-common = "14.0.0"
enum_dispatch = "0.3"
num = "0.4"
num-traits = "0.2"
@@ -21,4 +22,3 @@ paste = "1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
snafu = { version = "0.7", features = ["backtraces"] }
arrow = "26.0"

View File

@@ -12,18 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow::array::{
Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array,
UInt8Array,
};
use arrow::array::{self, Array, ListArray, PrimitiveArray};
use arrow::datatypes::DataType;
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use common_time::timestamp::Timestamp;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{ConversionSnafu, Result};
use crate::prelude::ConcreteDataType;
use crate::value::{ListValue, Value};
pub type BinaryArray = arrow::array::LargeBinaryArray;
@@ -41,7 +36,6 @@ macro_rules! cast_array {
};
}
// TODO(yingwen): Remove this function.
pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result<Value> {
if array.is_null(idx) {
return Ok(Value::Null);
@@ -49,46 +43,42 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result<Value> {
let result = match array.data_type() {
DataType::Null => Value::Null,
DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)),
DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()),
DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)),
DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)),
DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)),
DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)),
DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)),
DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)),
DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)),
DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)),
DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()),
DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()),
DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()),
DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()),
DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()),
DataType::Timestamp(t, _) => match t {
arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampSecondArray).value(idx),
TimeUnit::Second,
)),
arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx),
TimeUnit::Millisecond,
)),
arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx),
TimeUnit::Microsecond,
)),
arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new(
cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx),
TimeUnit::Nanosecond,
)),
},
DataType::Boolean => Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)),
DataType::Binary | DataType::LargeBinary => {
Value::Binary(cast_array!(array, BinaryArray).value(idx).into())
}
DataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::<i8>).value(idx)),
DataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::<i16>).value(idx)),
DataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::<i32>).value(idx)),
DataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::<i64>).value(idx)),
DataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::<u8>).value(idx)),
DataType::UInt16 => Value::UInt16(cast_array!(array, PrimitiveArray::<u16>).value(idx)),
DataType::UInt32 => Value::UInt32(cast_array!(array, PrimitiveArray::<u32>).value(idx)),
DataType::UInt64 => Value::UInt64(cast_array!(array, PrimitiveArray::<u64>).value(idx)),
DataType::Float32 => {
Value::Float32(cast_array!(array, PrimitiveArray::<f32>).value(idx).into())
}
DataType::Float64 => {
Value::Float64(cast_array!(array, PrimitiveArray::<f64>).value(idx).into())
}
DataType::Utf8 | DataType::LargeUtf8 => {
Value::String(cast_array!(array, StringArray).value(idx).into())
}
DataType::Timestamp(t, _) => {
let value = cast_array!(array, PrimitiveArray::<i64>).value(idx);
let unit = match ConcreteDataType::from_arrow_time_unit(t) {
ConcreteDataType::Timestamp(t) => t.unit,
_ => unreachable!(),
};
Value::Timestamp(Timestamp::new(value, unit))
}
DataType::List(_) => {
let array = cast_array!(array, ListArray).value(idx);
let item_type = ConcreteDataType::try_from(array.data_type())?;
let array = cast_array!(array, ListArray::<i32>).value(idx);
let inner_datatype = ConcreteDataType::try_from(array.data_type())?;
let values = (0..array.len())
.map(|i| arrow_array_get(&*array, i))
.collect::<Result<Vec<Value>>>()?;
Value::List(ListValue::new(Some(Box::new(values)), item_type))
Value::List(ListValue::new(Some(Box::new(values)), inner_datatype))
}
_ => unimplemented!("Arrow array datatype: {:?}", array.data_type()),
};
@@ -98,74 +88,45 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result<Value> {
#[cfg(test)]
mod test {
use std::sync::Arc;
use arrow::array::{
BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray,
TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array,
MutableListArray, MutablePrimitiveArray, TryExtend, UInt16Array, UInt32Array, UInt64Array,
UInt8Array,
};
use arrow::datatypes::Int32Type;
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit};
use common_time::timestamp::{TimeUnit, Timestamp};
use paste::paste;
use super::*;
use crate::data_type::ConcreteDataType;
use crate::types::TimestampType;
macro_rules! test_arrow_array_get_for_timestamps {
( $($unit: ident), *) => {
$(
paste! {
let mut builder = arrow::array::[<Timestamp $unit Array>]::builder(3);
builder.append_value(1);
builder.append_value(0);
builder.append_value(-1);
let ts_array = Arc::new(builder.finish()) as Arc<dyn Array>;
let v = arrow_array_get(&ts_array, 1).unwrap();
assert_eq!(
ConcreteDataType::Timestamp(TimestampType::$unit(
$crate::types::[<Timestamp $unit Type>]::default(),
)),
v.data_type()
);
}
)*
};
}
#[test]
fn test_timestamp_array() {
test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond];
}
use crate::prelude::Vector;
use crate::vectors::TimestampVector;
#[test]
fn test_arrow_array_access() {
let array1 = BooleanArray::from(vec![true, true, false, false]);
let array1 = BooleanArray::from_slice(vec![true, true, false, false]);
assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap());
let array1 = Int8Array::from(vec![1, 2, 3, 4]);
let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap());
let array1 = UInt8Array::from(vec![1, 2, 3, 4]);
let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap());
let array1 = Int16Array::from(vec![1, 2, 3, 4]);
let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap());
let array1 = UInt16Array::from(vec![1, 2, 3, 4]);
let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap());
let array1 = Int32Array::from(vec![1, 2, 3, 4]);
let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap());
let array1 = UInt32Array::from(vec![1, 2, 3, 4]);
let array1 = UInt32Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap());
let array = Int64Array::from(vec![1, 2, 3, 4]);
let array = Int64Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap());
let array1 = UInt64Array::from(vec![1, 2, 3, 4]);
let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]);
assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap());
let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]);
let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]);
assert_eq!(
Value::Float32(2f32.into()),
arrow_array_get(&array1, 1).unwrap()
);
let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]);
let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]);
assert_eq!(
Value::Float64(2f64.into()),
arrow_array_get(&array1, 1).unwrap()
@@ -178,42 +139,55 @@ mod test {
);
assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap());
let array3 = LargeBinaryArray::from(vec![
let array3 = super::BinaryArray::from(vec![
Some("hello".as_bytes()),
None,
Some("world".as_bytes()),
]);
assert_eq!(
Value::Binary("hello".as_bytes().into()),
arrow_array_get(&array3, 0).unwrap()
);
assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap());
let array = TimestampSecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second)));
let array = TimestampMillisecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4]));
let array = vector.to_boxed_arrow_array();
let value = arrow_array_get(&*array, 1).unwrap();
assert_eq!(
value,
Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond))
);
let array = TimestampMicrosecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(
value,
Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond))
let array4 = PrimitiveArray::<i64>::from_data(
DataType::Timestamp(ArrowTimeUnit::Millisecond, None),
Buffer::from_slice(&vec![1, 2, 3, 4]),
None,
);
let array = TimestampNanosecondArray::from(vec![1, 2, 3]);
let value = arrow_array_get(&array, 1).unwrap();
assert_eq!(
value,
Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond))
Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)),
arrow_array_get(&array4, 0).unwrap()
);
let array4 = PrimitiveArray::<i64>::from_data(
DataType::Timestamp(ArrowTimeUnit::Nanosecond, None),
Buffer::from_slice(&vec![1, 2, 3, 4]),
None,
);
assert_eq!(
Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)),
arrow_array_get(&array4, 0).unwrap()
);
// test list array
let data = vec![
Some(vec![Some(1), Some(2), Some(3)]),
Some(vec![Some(1i32), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let arrow_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ListArray<i32> = arrow_array.into();
let v0 = arrow_array_get(&arrow_array, 0).unwrap();
match v0 {

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit};
use arrow::datatypes::DataType as ArrowDataType;
use common_time::timestamp::TimeUnit;
use paste::paste;
use serde::{Deserialize, Serialize};
@@ -23,14 +23,13 @@ use crate::error::{self, Error, Result};
use crate::type_id::LogicalTypeId;
use crate::types::{
BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type,
Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampType, UInt16Type,
UInt32Type, UInt64Type, UInt8Type,
};
use crate::value::Value;
use crate::vectors::MutableVector;
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[enum_dispatch::enum_dispatch(DataType)]
pub enum ConcreteDataType {
Null(NullType),
@@ -48,21 +47,17 @@ pub enum ConcreteDataType {
Float32(Float32Type),
Float64(Float64Type),
// String types:
// String types
Binary(BinaryType),
String(StringType),
// Date types:
Date(DateType),
DateTime(DateTimeType),
Timestamp(TimestampType),
// Compound types:
List(ListType),
}
// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method
// returning all these properties to the `DataType` trait
impl ConcreteDataType {
pub fn is_float(&self) -> bool {
matches!(
@@ -75,7 +70,7 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Boolean(_))
}
pub fn is_stringifiable(&self) -> bool {
pub fn stringifiable(&self) -> bool {
matches!(
self,
ConcreteDataType::String(_)
@@ -108,6 +103,13 @@ impl ConcreteDataType {
)
}
/// Returns `true` when this type is a `Timestamp` type.
///
/// Note that `Int64` is accepted as well, so this check is broader than the
/// name suggests.
pub fn is_timestamp(&self) -> bool {
matches!(
self,
ConcreteDataType::Timestamp(_) | ConcreteDataType::Int64(_)
)
}
pub fn numerics() -> Vec<ConcreteDataType> {
vec![
ConcreteDataType::int8_datatype(),
@@ -159,7 +161,7 @@ impl TryFrom<&ArrowDataType> for ConcreteDataType {
ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(),
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(),
ArrowDataType::List(field) => Self::List(ListType::new(
ConcreteDataType::from_arrow_type(field.data_type()),
ConcreteDataType::from_arrow_type(&field.data_type),
)),
_ => {
return error::UnsupportedArrowTypeSnafu {
@@ -189,52 +191,38 @@ macro_rules! impl_new_concrete_type_functions {
impl_new_concrete_type_functions!(
Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64,
Binary, Date, DateTime, String
Binary, String, Date, DateTime
);
impl ConcreteDataType {
pub fn timestamp_second_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default()))
}
pub fn timestamp_millisecond_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::Millisecond(
TimestampMillisecondType::default(),
))
}
pub fn timestamp_microsecond_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::Microsecond(
TimestampMicrosecondType::default(),
))
}
pub fn timestamp_nanosecond_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default()))
pub fn list_datatype(inner_type: ConcreteDataType) -> ConcreteDataType {
ConcreteDataType::List(ListType::new(inner_type))
}
pub fn timestamp_datatype(unit: TimeUnit) -> Self {
match unit {
TimeUnit::Second => Self::timestamp_second_datatype(),
TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(),
TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(),
TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(),
}
ConcreteDataType::Timestamp(TimestampType::new(unit))
}
/// Creates a timestamp data type whose unit is `TimeUnit::Millisecond`.
pub fn timestamp_millis_datatype() -> Self {
ConcreteDataType::Timestamp(TimestampType::new(TimeUnit::Millisecond))
}
/// Converts from arrow timestamp unit to the matching timestamp [`ConcreteDataType`].
pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self {
// TODO(hl): maybe impl From<ArrowTimestamp> for our timestamp ?
pub fn from_arrow_time_unit(t: &arrow::datatypes::TimeUnit) -> Self {
match t {
ArrowTimeUnit::Second => Self::timestamp_second_datatype(),
ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(),
ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(),
ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(),
arrow::datatypes::TimeUnit::Second => Self::timestamp_datatype(TimeUnit::Second),
arrow::datatypes::TimeUnit::Millisecond => {
Self::timestamp_datatype(TimeUnit::Millisecond)
}
arrow::datatypes::TimeUnit::Microsecond => {
Self::timestamp_datatype(TimeUnit::Microsecond)
}
arrow::datatypes::TimeUnit::Nanosecond => {
Self::timestamp_datatype(TimeUnit::Nanosecond)
}
}
}
pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType {
ConcreteDataType::List(ListType::new(item_type))
}
}
/// Data type abstraction.
@@ -249,15 +237,11 @@ pub trait DataType: std::fmt::Debug + Send + Sync {
/// Returns the default value of this type.
fn default_value(&self) -> Value;
/// Convert this type as [arrow::datatypes::DataType].
/// Convert this type as [arrow2::datatypes::DataType].
fn as_arrow_type(&self) -> ArrowDataType;
/// Creates a mutable vector with given `capacity` of this type.
/// Create a mutable vector with given `capacity` of this type.
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector>;
/// Returns true if the data type is compatible with timestamp type so we can
/// use it as a timestamp.
fn is_timestamp_compatible(&self) -> bool;
}
pub type DataTypeRef = Arc<dyn DataType>;
@@ -340,6 +324,10 @@ mod tests {
ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8),
ConcreteDataType::String(_)
));
assert!(matches!(
ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8),
ConcreteDataType::String(_)
));
assert_eq!(
ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new(
"item",
@@ -357,48 +345,31 @@ mod tests {
#[test]
fn test_from_arrow_timestamp() {
assert_eq!(
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond)
ConcreteDataType::timestamp_millis_datatype(),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Millisecond)
);
assert_eq!(
ConcreteDataType::timestamp_microsecond_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond)
ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Microsecond)
);
assert_eq!(
ConcreteDataType::timestamp_nanosecond_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond)
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Nanosecond)
);
assert_eq!(
ConcreteDataType::timestamp_second_datatype(),
ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second)
ConcreteDataType::timestamp_datatype(TimeUnit::Second),
ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Second)
);
}
#[test]
fn test_is_timestamp_compatible() {
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible());
assert!(
ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible()
);
assert!(
ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible()
);
assert!(
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible()
);
assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible());
assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible());
assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible());
fn test_is_timestamp() {
assert!(ConcreteDataType::timestamp_millis_datatype().is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp());
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp());
assert!(ConcreteDataType::int64_datatype().is_timestamp());
}
#[test]
@@ -406,81 +377,4 @@ mod tests {
assert!(ConcreteDataType::null_datatype().is_null());
assert!(!ConcreteDataType::int32_datatype().is_null());
}
#[test]
fn test_is_float() {
assert!(!ConcreteDataType::int32_datatype().is_float());
assert!(ConcreteDataType::float32_datatype().is_float());
assert!(ConcreteDataType::float64_datatype().is_float());
}
#[test]
fn test_is_boolean() {
assert!(!ConcreteDataType::int32_datatype().is_boolean());
assert!(!ConcreteDataType::float32_datatype().is_boolean());
assert!(ConcreteDataType::boolean_datatype().is_boolean());
}
#[test]
fn test_is_stringifiable() {
assert!(!ConcreteDataType::int32_datatype().is_stringifiable());
assert!(!ConcreteDataType::float32_datatype().is_stringifiable());
assert!(ConcreteDataType::string_datatype().is_stringifiable());
assert!(ConcreteDataType::date_datatype().is_stringifiable());
assert!(ConcreteDataType::datetime_datatype().is_stringifiable());
assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable());
assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable());
assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable());
assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable());
}
#[test]
fn test_is_signed() {
assert!(ConcreteDataType::int8_datatype().is_signed());
assert!(ConcreteDataType::int16_datatype().is_signed());
assert!(ConcreteDataType::int32_datatype().is_signed());
assert!(ConcreteDataType::int64_datatype().is_signed());
assert!(ConcreteDataType::date_datatype().is_signed());
assert!(ConcreteDataType::datetime_datatype().is_signed());
assert!(ConcreteDataType::timestamp_second_datatype().is_signed());
assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed());
assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed());
assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed());
assert!(!ConcreteDataType::uint8_datatype().is_signed());
assert!(!ConcreteDataType::uint16_datatype().is_signed());
assert!(!ConcreteDataType::uint32_datatype().is_signed());
assert!(!ConcreteDataType::uint64_datatype().is_signed());
assert!(!ConcreteDataType::float32_datatype().is_signed());
assert!(!ConcreteDataType::float64_datatype().is_signed());
}
#[test]
fn test_is_unsigned() {
assert!(!ConcreteDataType::int8_datatype().is_unsigned());
assert!(!ConcreteDataType::int16_datatype().is_unsigned());
assert!(!ConcreteDataType::int32_datatype().is_unsigned());
assert!(!ConcreteDataType::int64_datatype().is_unsigned());
assert!(!ConcreteDataType::date_datatype().is_unsigned());
assert!(!ConcreteDataType::datetime_datatype().is_unsigned());
assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned());
assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned());
assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned());
assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned());
assert!(ConcreteDataType::uint8_datatype().is_unsigned());
assert!(ConcreteDataType::uint16_datatype().is_unsigned());
assert!(ConcreteDataType::uint32_datatype().is_unsigned());
assert!(ConcreteDataType::uint64_datatype().is_unsigned());
assert!(!ConcreteDataType::float32_datatype().is_unsigned());
assert!(!ConcreteDataType::float64_datatype().is_unsigned());
}
#[test]
fn test_numerics() {
let nums = ConcreteDataType::numerics();
assert_eq!(10, nums.len());
}
}

View File

@@ -23,7 +23,6 @@ pub mod prelude;
mod scalars;
pub mod schema;
pub mod serialize;
mod timestamp;
pub mod type_id;
pub mod types;
pub mod value;

View File

@@ -12,9 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Some helper macros for datatypes, copied from databend.
//! Some helper macros for datatypes, copied from databend.
/// Invokes `$macro!` once, passing the extra arguments as a bracketed list
/// followed by one `{ type }` entry for each scalar type listed below
/// (signed and unsigned integers, floats and `bool`).
#[macro_export]
macro_rules! for_all_scalar_types {
($macro:tt $(, $x:tt)*) => {
$macro! {
[$($x),*],
{ i8 },
{ i16 },
{ i32 },
{ i64 },
{ u8 },
{ u16 },
{ u32 },
{ u64 },
{ f32 },
{ f64 },
{ bool },
}
};
}
/// Apply the macro rules to all primitive types.
#[macro_export]
macro_rules! for_all_primitive_types {
($macro:tt $(, $x:tt)*) => {
@@ -34,8 +52,6 @@ macro_rules! for_all_primitive_types {
};
}
/// Match the logical type and apply `$body` to all primitive types and
/// `nbody` to other types.
#[macro_export]
macro_rules! with_match_primitive_type_id {
($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{
@@ -46,21 +62,17 @@ macro_rules! with_match_primitive_type_id {
}
use $crate::type_id::LogicalTypeId;
use $crate::types::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type,
UInt32Type, UInt64Type, UInt8Type,
};
match $key_type {
LogicalTypeId::Int8 => __with_ty__! { Int8Type },
LogicalTypeId::Int16 => __with_ty__! { Int16Type },
LogicalTypeId::Int32 => __with_ty__! { Int32Type },
LogicalTypeId::Int64 => __with_ty__! { Int64Type },
LogicalTypeId::UInt8 => __with_ty__! { UInt8Type },
LogicalTypeId::UInt16 => __with_ty__! { UInt16Type },
LogicalTypeId::UInt32 => __with_ty__! { UInt32Type },
LogicalTypeId::UInt64 => __with_ty__! { UInt64Type },
LogicalTypeId::Float32 => __with_ty__! { Float32Type },
LogicalTypeId::Float64 => __with_ty__! { Float64Type },
LogicalTypeId::Int8 => __with_ty__! { i8 },
LogicalTypeId::Int16 => __with_ty__! { i16 },
LogicalTypeId::Int32 => __with_ty__! { i32 },
LogicalTypeId::Int64 => __with_ty__! { i64 },
LogicalTypeId::UInt8 => __with_ty__! { u8 },
LogicalTypeId::UInt16 => __with_ty__! { u16 },
LogicalTypeId::UInt32 => __with_ty__! { u32 },
LogicalTypeId::UInt64 => __with_ty__! { u64 },
LogicalTypeId::Float32 => __with_ty__! { f32 },
LogicalTypeId::Float64 => __with_ty__! { f64 },
_ => $nbody,
}

View File

@@ -16,5 +16,8 @@ pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef};
pub use crate::macros::*;
pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
pub use crate::type_id::LogicalTypeId;
pub use crate::types::Primitive;
pub use crate::value::{Value, ValueRef};
pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef};
pub use crate::vectors::{
Helper as VectorHelper, MutableVector, Validity, Vector, VectorBuilder, VectorRef,
};

View File

@@ -14,17 +14,11 @@
use std::any::Any;
use common_time::{Date, DateTime};
use common_time::{Date, DateTime, Timestamp};
use crate::types::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type,
UInt64Type, UInt8Type,
};
use crate::value::{ListValue, ListValueRef, Value};
use crate::vectors::{
BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector,
PrimitiveVector, StringVector, Vector,
};
use crate::prelude::*;
use crate::value::{ListValue, ListValueRef};
use crate::vectors::*;
fn get_iter_capacity<T, I: Iterator<Item = T>>(iter: &I) -> usize {
match iter.size_hint() {
@@ -41,7 +35,7 @@ where
for<'a> Self::VectorType: ScalarVector<RefItem<'a> = Self::RefType<'a>>,
{
type VectorType: ScalarVector<OwnedItem = Self>;
type RefType<'a>: ScalarRef<'a, ScalarType = Self>
type RefType<'a>: ScalarRef<'a, ScalarType = Self, VectorType = Self::VectorType>
where
Self: 'a;
/// Get a reference of the current value.
@@ -52,6 +46,7 @@ where
}
pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a {
type VectorType: ScalarVector<RefItem<'a> = Self>;
/// The corresponding [`Scalar`] type.
type ScalarType: Scalar<RefType<'a> = Self>;
@@ -68,7 +63,7 @@ where
{
type OwnedItem: Scalar<VectorType = Self>;
/// The reference item of this vector.
type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem>
type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem, VectorType = Self>
where
Self: 'a;
@@ -142,46 +137,47 @@ pub trait ScalarVectorBuilder: MutableVector {
fn finish(&mut self) -> Self::VectorType;
}
macro_rules! impl_scalar_for_native {
($Native: ident, $DataType: ident) => {
impl Scalar for $Native {
type VectorType = PrimitiveVector<$DataType>;
type RefType<'a> = $Native;
macro_rules! impl_primitive_scalar_type {
($native:ident) => {
impl Scalar for $native {
type VectorType = PrimitiveVector<$native>;
type RefType<'a> = $native;
#[inline]
fn as_scalar_ref(&self) -> $Native {
fn as_scalar_ref(&self) -> $native {
*self
}
#[allow(clippy::needless_lifetimes)]
#[inline]
fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native {
fn upcast_gat<'short, 'long: 'short>(long: $native) -> $native {
long
}
}
/// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`].
impl<'a> ScalarRef<'a> for $Native {
type ScalarType = $Native;
impl<'a> ScalarRef<'a> for $native {
type VectorType = PrimitiveVector<$native>;
type ScalarType = $native;
#[inline]
fn to_owned_scalar(&self) -> $Native {
fn to_owned_scalar(&self) -> $native {
*self
}
}
};
}
impl_scalar_for_native!(u8, UInt8Type);
impl_scalar_for_native!(u16, UInt16Type);
impl_scalar_for_native!(u32, UInt32Type);
impl_scalar_for_native!(u64, UInt64Type);
impl_scalar_for_native!(i8, Int8Type);
impl_scalar_for_native!(i16, Int16Type);
impl_scalar_for_native!(i32, Int32Type);
impl_scalar_for_native!(i64, Int64Type);
impl_scalar_for_native!(f32, Float32Type);
impl_scalar_for_native!(f64, Float64Type);
impl_primitive_scalar_type!(u8);
impl_primitive_scalar_type!(u16);
impl_primitive_scalar_type!(u32);
impl_primitive_scalar_type!(u64);
impl_primitive_scalar_type!(i8);
impl_primitive_scalar_type!(i16);
impl_primitive_scalar_type!(i32);
impl_primitive_scalar_type!(i64);
impl_primitive_scalar_type!(f32);
impl_primitive_scalar_type!(f64);
impl Scalar for bool {
type VectorType = BooleanVector;
@@ -200,6 +196,7 @@ impl Scalar for bool {
}
impl<'a> ScalarRef<'a> for bool {
type VectorType = BooleanVector;
type ScalarType = bool;
#[inline]
@@ -224,6 +221,7 @@ impl Scalar for String {
}
impl<'a> ScalarRef<'a> for &'a str {
type VectorType = StringVector;
type ScalarType = String;
#[inline]
@@ -248,6 +246,7 @@ impl Scalar for Vec<u8> {
}
impl<'a> ScalarRef<'a> for &'a [u8] {
type VectorType = BinaryVector;
type ScalarType = Vec<u8>;
#[inline]
@@ -270,6 +269,7 @@ impl Scalar for Date {
}
impl<'a> ScalarRef<'a> for Date {
type VectorType = DateVector;
type ScalarType = Date;
fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -291,6 +291,7 @@ impl Scalar for DateTime {
}
impl<'a> ScalarRef<'a> for DateTime {
type VectorType = DateTimeVector;
type ScalarType = DateTime;
fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -298,7 +299,27 @@ impl<'a> ScalarRef<'a> for DateTime {
}
}
// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`.
/// Implements [`Scalar`] for [`Timestamp`], with [`TimestampVector`] as its vector type.
///
/// The reference type is `Timestamp` itself, so taking a scalar reference copies the value.
impl Scalar for Timestamp {
type VectorType = TimestampVector;
type RefType<'a> = Timestamp;
/// Returns the reference form of this scalar, which is a copy of the value.
fn as_scalar_ref(&self) -> Self::RefType<'_> {
*self
}
/// Shortens the lifetime of a reference value.
///
/// `RefType` carries no borrow here, so the input is returned unchanged.
fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> {
long
}
}
/// Implements [`ScalarRef`] for [`Timestamp`].
///
/// `Timestamp` serves as both the owned scalar type and its own reference type.
impl<'a> ScalarRef<'a> for Timestamp {
type VectorType = TimestampVector;
type ScalarType = Timestamp;
/// Returns the owned scalar, which is a copy of this value.
fn to_owned_scalar(&self) -> Self::ScalarType {
*self
}
}
impl Scalar for ListValue {
type VectorType = ListVector;
@@ -314,6 +335,7 @@ impl Scalar for ListValue {
}
impl<'a> ScalarRef<'a> for ListValueRef<'a> {
type VectorType = ListVector;
type ScalarType = ListValue;
fn to_owned_scalar(&self) -> Self::ScalarType {
@@ -335,9 +357,8 @@ impl<'a> ScalarRef<'a> for ListValueRef<'a> {
#[cfg(test)]
mod tests {
use super::*;
use crate::data_type::ConcreteDataType;
use crate::timestamp::TimestampSecond;
use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector};
use crate::vectors::binary::BinaryVector;
use crate::vectors::primitive::Int32Vector;
fn build_vector_from_slice<T: ScalarVector>(items: &[Option<T::RefItem<'_>>]) -> T {
let mut builder = T::Builder::with_capacity(items.len());
@@ -433,11 +454,11 @@ mod tests {
#[test]
fn test_build_timestamp_vector() {
let expect: Vec<Option<TimestampSecond>> = vec![Some(10.into()), None, Some(42.into())];
let vector: TimestampSecondVector = build_vector_from_slice(&expect);
let expect: Vec<Option<Timestamp>> = vec![Some(10.into()), None, Some(42.into())];
let vector: TimestampVector = build_vector_from_slice(&expect);
assert_vector_eq(&expect, &vector);
let val = vector.get_data(0).unwrap();
assert_eq!(val, val.as_scalar_ref());
assert_eq!(TimestampSecond::from(10), val.to_owned_scalar());
assert_eq!(10, val.to_owned_scalar().value());
}
}

View File

@@ -12,27 +12,128 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod column_schema;
mod constraint;
mod raw;
use std::collections::HashMap;
use std::sync::Arc;
pub use arrow::datatypes::Metadata;
use arrow::datatypes::{Field, Schema as ArrowSchema};
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use crate::data_type::DataType;
use crate::error::{self, Error, Result};
pub use crate::schema::column_schema::{ColumnSchema, Metadata};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, DeserializeSnafu, Error, Result, SerializeSnafu};
pub use crate::schema::constraint::ColumnDefaultConstraint;
pub use crate::schema::raw::RawSchema;
use crate::vectors::VectorRef;
/// Key used to store whether the column is time index in arrow field's metadata.
const TIME_INDEX_KEY: &str = "greptime:time_index";
/// Key used to store version number of the schema in metadata.
const VERSION_KEY: &str = "greptime:version";
/// Key used to store default constraint in arrow field's metadata.
const ARROW_FIELD_DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint";
/// Schema of a column, used as an immutable struct.
///
/// `name` and `data_type` are public. The remaining fields are private: they are read
/// through accessor methods and changed through the consuming `with_*` builder methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ColumnSchema {
/// Name of the column.
pub name: String,
/// Data type of the column.
pub data_type: ConcreteDataType,
/// Whether the column is nullable.
is_nullable: bool,
/// Whether the column is the time index.
is_time_index: bool,
/// Optional default constraint of the column.
default_constraint: Option<ColumnDefaultConstraint>,
/// Additional key-value metadata attached to the column.
metadata: Metadata,
}
impl ColumnSchema {
    /// Creates a column schema with the given name, data type and nullability.
    ///
    /// The new schema is not a time index, has no default constraint and starts
    /// with empty metadata.
    pub fn new<T: Into<String>>(
        name: T,
        data_type: ConcreteDataType,
        is_nullable: bool,
    ) -> ColumnSchema {
        ColumnSchema {
            name: name.into(),
            data_type,
            is_nullable,
            is_time_index: false,
            default_constraint: None,
            metadata: Metadata::new(),
        }
    }

    /// Returns whether this column is the time index.
    #[inline]
    pub fn is_time_index(&self) -> bool {
        self.is_time_index
    }

    /// Returns whether this column is nullable.
    #[inline]
    pub fn is_nullable(&self) -> bool {
        self.is_nullable
    }

    /// Returns the default constraint of this column, if one is set.
    #[inline]
    pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> {
        self.default_constraint.as_ref()
    }

    /// Returns the metadata of this column.
    #[inline]
    pub fn metadata(&self) -> &Metadata {
        &self.metadata
    }

    /// Marks or unmarks this column as the time index.
    ///
    /// The flag is mirrored in the metadata: `TIME_INDEX_KEY` is inserted when
    /// `is_time_index` is true and removed otherwise.
    pub fn with_time_index(mut self, is_time_index: bool) -> Self {
        self.is_time_index = is_time_index;
        if is_time_index {
            self.metadata
                .insert(TIME_INDEX_KEY.to_string(), "true".to_string());
        } else {
            self.metadata.remove(TIME_INDEX_KEY);
        }
        self
    }

    /// Sets the default constraint of this column.
    ///
    /// # Errors
    ///
    /// Returns the validation error if the constraint is not valid for the
    /// column's data type and nullability. `None` is always accepted.
    pub fn with_default_constraint(
        mut self,
        default_constraint: Option<ColumnDefaultConstraint>,
    ) -> Result<Self> {
        if let Some(constraint) = &default_constraint {
            constraint.validate(&self.data_type, self.is_nullable)?;
        }

        self.default_constraint = default_constraint;
        Ok(self)
    }

    /// Creates a new [`ColumnSchema`] with given metadata.
    ///
    /// The given metadata replaces the existing metadata. If this column is
    /// already marked as the time index, the `TIME_INDEX_KEY` marker is kept in
    /// the new metadata so the flag and the metadata stay consistent.
    pub fn with_metadata(mut self, metadata: Metadata) -> Self {
        self.metadata = metadata;
        if self.is_time_index {
            // Converting from an arrow field derives `is_time_index` from this
            // key, so dropping it here would lose the flag on a round trip.
            self.metadata
                .insert(TIME_INDEX_KEY.to_string(), "true".to_string());
        }
        self
    }

    /// Creates a vector of `num_rows` default values for this column.
    ///
    /// Uses the default constraint when one is set. Without a constraint, a
    /// nullable column defaults to null, and a non-nullable column has no
    /// default, so `Ok(None)` is returned.
    pub fn create_default_vector(&self, num_rows: usize) -> Result<Option<VectorRef>> {
        match (&self.default_constraint, self.is_nullable) {
            (Some(constraint), _) => constraint
                .create_default_vector(&self.data_type, self.is_nullable, num_rows)
                .map(Some),
            (None, true) => {
                // No default constraint, use null as default value.
                // TODO(yingwen): Use NullVector once it supports setting logical type.
                ColumnDefaultConstraint::null_value()
                    .create_default_vector(&self.data_type, self.is_nullable, num_rows)
                    .map(Some)
            }
            (None, false) => Ok(None),
        }
    }
}
/// A common schema, should be immutable.
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq)]
pub struct Schema {
column_schemas: Vec<ColumnSchema>,
name_to_index: HashMap<String, usize>,
@@ -130,7 +231,7 @@ impl Schema {
}
#[inline]
pub fn metadata(&self) -> &HashMap<String, String> {
pub fn metadata(&self) -> &Metadata {
&self.arrow_schema.metadata
}
}
@@ -142,7 +243,7 @@ pub struct SchemaBuilder {
fields: Vec<Field>,
timestamp_index: Option<usize>,
version: u32,
metadata: HashMap<String, String>,
metadata: Metadata,
}
impl TryFrom<Vec<ColumnSchema>> for SchemaBuilder {
@@ -191,7 +292,7 @@ impl SchemaBuilder {
self.metadata
.insert(VERSION_KEY.to_string(), self.version.to_string());
let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata);
let arrow_schema = ArrowSchema::from(self.fields).with_metadata(self.metadata);
Ok(Schema {
column_schemas: self.column_schemas,
@@ -246,7 +347,7 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us
let column_schema = &column_schemas[timestamp_index];
ensure!(
column_schema.data_type.is_timestamp_compatible(),
column_schema.data_type.is_timestamp(),
error::InvalidTimestampIndexSnafu {
index: timestamp_index,
}
@@ -263,6 +364,58 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us
pub type SchemaRef = Arc<Schema>;
/// Builds a [`ColumnSchema`] from an arrow [`Field`].
///
/// The default constraint, if present, is decoded from the JSON stored under
/// `ARROW_FIELD_DEFAULT_CONSTRAINT_KEY` in the field metadata, and that entry is
/// removed from the metadata kept on the schema. The column is treated as the
/// time index when the metadata contains `TIME_INDEX_KEY`.
impl TryFrom<&Field> for ColumnSchema {
    type Error = Error;

    fn try_from(field: &Field) -> Result<ColumnSchema> {
        let data_type = ConcreteDataType::try_from(&field.data_type)?;

        // Work on a copy so the constraint entry can be taken out of the metadata.
        let mut metadata = field.metadata.clone();
        let default_constraint =
            if let Some(json) = metadata.remove(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) {
                let constraint = serde_json::from_str(&json).context(DeserializeSnafu { json })?;
                Some(constraint)
            } else {
                None
            };
        let is_time_index = metadata.contains_key(TIME_INDEX_KEY);

        let column_schema = ColumnSchema {
            name: field.name.clone(),
            data_type,
            is_nullable: field.is_nullable,
            is_time_index,
            default_constraint,
            metadata,
        };
        Ok(column_schema)
    }
}
/// Converts a [`ColumnSchema`] into an arrow [`Field`].
///
/// The column metadata is copied to the field. A default constraint, if set, is
/// serialized to JSON and stored under `ARROW_FIELD_DEFAULT_CONSTRAINT_KEY`.
/// The conversion fails if the column metadata already uses that key.
impl TryFrom<&ColumnSchema> for Field {
    type Error = Error;

    fn try_from(column_schema: &ColumnSchema) -> Result<Field> {
        let mut metadata = column_schema.metadata.clone();

        if let Some(constraint) = column_schema.default_constraint.as_ref() {
            // Adds an additional metadata to store the default constraint.
            let json = serde_json::to_string(constraint).context(SerializeSnafu)?;
            let previous = metadata.insert(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), json);

            ensure!(
                previous.is_none(),
                error::DuplicateMetaSnafu {
                    key: ARROW_FIELD_DEFAULT_CONSTRAINT_KEY,
                }
            );
        }

        let field = Field::new(
            column_schema.name.clone(),
            column_schema.data_type.as_arrow_type(),
            column_schema.is_nullable(),
        );
        Ok(field.with_metadata(metadata))
    }
}
impl TryFrom<Arc<ArrowSchema>> for Schema {
type Error = Error;
@@ -271,7 +424,7 @@ impl TryFrom<Arc<ArrowSchema>> for Schema {
let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len());
for field in &arrow_schema.fields {
let column_schema = ColumnSchema::try_from(field)?;
name_to_index.insert(field.name().to_string(), column_schemas.len());
name_to_index.insert(field.name.clone(), column_schemas.len());
column_schemas.push(column_schema);
}
@@ -312,7 +465,7 @@ impl TryFrom<ArrowSchema> for Schema {
}
}
fn try_parse_version(metadata: &HashMap<String, String>, key: &str) -> Result<u32> {
fn try_parse_version(metadata: &Metadata, key: &str) -> Result<u32> {
if let Some(value) = metadata.get(key) {
let version = value
.parse()
@@ -326,8 +479,127 @@ fn try_parse_version(metadata: &HashMap<String, String>, key: &str) -> Result<u3
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use super::*;
use crate::data_type::ConcreteDataType;
use crate::value::Value;
#[test]
fn test_column_schema() {
    // Round trip: ColumnSchema -> arrow Field -> ColumnSchema.
    let original = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true);

    let arrow_field = Field::try_from(&original).unwrap();
    assert_eq!("test", arrow_field.name);
    assert_eq!(ArrowDataType::Int32, arrow_field.data_type);
    assert!(arrow_field.is_nullable);

    let restored = ColumnSchema::try_from(&arrow_field).unwrap();
    assert_eq!(original, restored);
}
#[test]
fn test_column_schema_with_default_constraint() {
    let constraint = ColumnDefaultConstraint::Value(Value::from(99));
    let original = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
        .with_default_constraint(Some(constraint))
        .unwrap();
    // The constraint is not stored in the column's own metadata.
    assert!(original
        .metadata()
        .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
        .is_none());

    let arrow_field = Field::try_from(&original).unwrap();
    assert_eq!("test", arrow_field.name);
    assert_eq!(ArrowDataType::Int32, arrow_field.data_type);
    assert!(arrow_field.is_nullable);

    // The arrow field carries the constraint as JSON in its metadata.
    let serialized = arrow_field
        .metadata
        .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
        .unwrap();
    assert_eq!("{\"Value\":{\"Int32\":99}}", serialized);

    let restored = ColumnSchema::try_from(&arrow_field).unwrap();
    assert_eq!(original, restored);
}
#[test]
fn test_column_schema_with_metadata() {
    let mut user_metadata = Metadata::new();
    user_metadata.insert("k1".to_string(), "v1".to_string());

    let original = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
        .with_metadata(user_metadata)
        .with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
        .unwrap();
    // User metadata is kept; the constraint key is absent on the column itself.
    assert_eq!("v1", original.metadata().get("k1").unwrap());
    assert!(original
        .metadata()
        .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
        .is_none());

    // The arrow field holds both the user metadata and the constraint entry.
    let arrow_field = Field::try_from(&original).unwrap();
    assert_eq!("v1", arrow_field.metadata.get("k1").unwrap());
    assert!(arrow_field
        .metadata
        .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY)
        .is_some());

    let restored = ColumnSchema::try_from(&arrow_field).unwrap();
    assert_eq!(original, restored);
}
#[test]
fn test_column_schema_with_duplicate_metadata() {
    // User metadata that already occupies the key reserved for the default constraint.
    let mut clashing_metadata = Metadata::new();
    clashing_metadata.insert(
        ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(),
        "v1".to_string(),
    );

    let schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
        .with_metadata(clashing_metadata)
        .with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
        .unwrap();

    // Converting to an arrow field must fail because of the key clash.
    let conversion = Field::try_from(&schema);
    conversion.unwrap_err();
}
#[test]
fn test_column_schema_invalid_default_constraint() {
    // A null default on a non-nullable column must be rejected.
    let non_nullable = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false);
    let outcome =
        non_nullable.with_default_constraint(Some(ColumnDefaultConstraint::null_value()));
    outcome.unwrap_err();
}
#[test]
fn test_column_default_constraint_try_into_from() {
    // Round trip: constraint -> bytes -> constraint.
    let original = ColumnDefaultConstraint::Value(Value::from(42i64));

    let encoded: Vec<u8> = original.clone().try_into().unwrap();
    let decoded = ColumnDefaultConstraint::try_from(encoded.as_slice()).unwrap();

    assert_eq!(original, decoded);
}
#[test]
fn test_column_schema_create_default_null() {
    // Implicit default null.
    let implicit = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true);
    let implicit_vector = implicit.create_default_vector(5).unwrap().unwrap();
    assert_eq!(5, implicit_vector.len());
    assert!(implicit_vector.only_null());

    // Explicit default null.
    let explicit = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true)
        .with_default_constraint(Some(ColumnDefaultConstraint::null_value()))
        .unwrap();
    let explicit_vector = explicit.create_default_vector(5).unwrap().unwrap();
    assert_eq!(5, explicit_vector.len());
    assert!(explicit_vector.only_null());
}
#[test]
fn test_column_schema_no_default() {
    // A non-nullable column without a constraint has no default vector.
    let non_nullable = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false);
    let default_vector = non_nullable.create_default_vector(5).unwrap();
    assert!(default_vector.is_none());
}
#[test]
fn test_build_empty_schema() {
@@ -382,12 +654,8 @@ mod tests {
fn test_schema_with_timestamp() {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
.with_time_index(true),
];
let schema = SchemaBuilder::try_from(column_schemas.clone())
.unwrap()

View File

@@ -22,7 +22,7 @@ use snafu::{ensure, ResultExt};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Result};
use crate::value::Value;
use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef};
use crate::vectors::{Int64Vector, TimestampVector, VectorRef};
const CURRENT_TIMESTAMP: &str = "current_timestamp()";
@@ -81,7 +81,7 @@ impl ColumnDefaultConstraint {
error::UnsupportedDefaultExprSnafu { expr }
);
ensure!(
data_type.is_timestamp_compatible(),
data_type.is_timestamp(),
error::DefaultValueTypeSnafu {
reason: "return value of the function must has timestamp type",
}
@@ -162,10 +162,8 @@ fn create_current_timestamp_vector(
data_type: &ConcreteDataType,
num_rows: usize,
) -> Result<VectorRef> {
// FIXME(yingwen): We should implement cast in VectorOp so we can cast the millisecond vector
// to other data types and avoid this match.
match data_type {
ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values(
ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampVector::from_values(
std::iter::repeat(util::current_time_millis()).take(num_rows),
))),
ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values(
@@ -219,7 +217,7 @@ mod tests {
fn test_validate_function_constraint() {
let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string());
constraint
.validate(&ConcreteDataType::timestamp_millisecond_datatype(), false)
.validate(&ConcreteDataType::timestamp_millis_datatype(), false)
.unwrap();
constraint
.validate(&ConcreteDataType::boolean_datatype(), false)
@@ -227,7 +225,7 @@ mod tests {
let constraint = ColumnDefaultConstraint::Function("hello()".to_string());
constraint
.validate(&ConcreteDataType::timestamp_millisecond_datatype(), false)
.validate(&ConcreteDataType::timestamp_millis_datatype(), false)
.unwrap_err();
}
@@ -264,7 +262,7 @@ mod tests {
fn test_create_default_vector_by_func() {
let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string());
// Timestamp type.
let data_type = ConcreteDataType::timestamp_millisecond_datatype();
let data_type = ConcreteDataType::timestamp_millis_datatype();
let v = constraint
.create_default_vector(&data_type, false, 4)
.unwrap();
@@ -288,7 +286,7 @@ mod tests {
);
let constraint = ColumnDefaultConstraint::Function("no".to_string());
let data_type = ConcreteDataType::timestamp_millisecond_datatype();
let data_type = ConcreteDataType::timestamp_millis_datatype();
constraint
.create_default_vector(&data_type, false, 4)
.unwrap_err();

View File

@@ -20,7 +20,7 @@ use crate::schema::{ColumnSchema, Schema, SchemaBuilder};
/// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema).
///
/// This struct only contains necessary data to recover the Schema.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct RawSchema {
pub column_schemas: Vec<ColumnSchema>,
pub timestamp_index: Option<usize>,
@@ -56,12 +56,8 @@ mod tests {
fn test_raw_convert() {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false)
.with_time_index(true),
];
let schema = SchemaBuilder::try_from(column_schemas)
.unwrap()

View File

@@ -42,10 +42,7 @@ pub enum LogicalTypeId {
/// seconds/milliseconds/microseconds/nanoseconds, determined by precision.
DateTime,
TimestampSecond,
TimestampMillisecond,
TimestampMicrosecond,
TimestampNanosecond,
Timestamp,
List,
}
@@ -77,14 +74,7 @@ impl LogicalTypeId {
LogicalTypeId::Binary => ConcreteDataType::binary_datatype(),
LogicalTypeId::Date => ConcreteDataType::date_datatype(),
LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(),
LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(),
LogicalTypeId::TimestampMillisecond => {
ConcreteDataType::timestamp_millisecond_datatype()
}
LogicalTypeId::TimestampMicrosecond => {
ConcreteDataType::timestamp_microsecond_datatype()
}
LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(),
LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit
LogicalTypeId::List => {
ConcreteDataType::list_datatype(ConcreteDataType::null_datatype())
}

View File

@@ -14,24 +14,25 @@
mod binary_type;
mod boolean_type;
mod date_type;
mod datetime_type;
mod date;
mod datetime;
mod list_type;
mod null_type;
mod primitive_traits;
mod primitive_type;
mod string_type;
mod timestamp_type;
mod timestamp;
pub use binary_type::BinaryType;
pub use boolean_type::BooleanType;
pub use date_type::DateType;
pub use datetime_type::DateTimeType;
pub use date::DateType;
pub use datetime::DateTimeType;
pub use list_type::ListType;
pub use null_type::NullType;
pub use primitive_traits::{OrdPrimitive, Primitive};
pub use primitive_type::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, PrimitiveElement,
PrimitiveType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
pub use string_type::StringType;
pub use timestamp_type::*;
pub use timestamp::TimestampType;

View File

@@ -53,8 +53,4 @@ impl DataType for BinaryType {
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BinaryVectorBuilder::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -52,8 +52,4 @@ impl DataType for BooleanType {
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(BooleanVectorBuilder::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -15,17 +15,15 @@
use arrow::datatypes::{DataType as ArrowDataType, Field};
use serde::{Deserialize, Serialize};
use crate::data_type::{ConcreteDataType, DataType};
use crate::type_id::LogicalTypeId;
use crate::value::{ListValue, Value};
use crate::prelude::*;
use crate::value::ListValue;
use crate::vectors::{ListVectorBuilder, MutableVector};
/// Used to represent the List datatype.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ListType {
/// The type of List's item.
// Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType.
item_type: Box<ConcreteDataType>,
/// The type of List's inner data.
inner: Box<ConcreteDataType>,
}
impl Default for ListType {
@@ -35,10 +33,9 @@ impl Default for ListType {
}
impl ListType {
/// Create a new `ListType` whose item's data type is `item_type`.
pub fn new(item_type: ConcreteDataType) -> Self {
pub fn new(datatype: ConcreteDataType) -> Self {
ListType {
item_type: Box::new(item_type),
inner: Box::new(datatype),
}
}
}
@@ -53,24 +50,20 @@ impl DataType for ListType {
}
fn default_value(&self) -> Value {
Value::List(ListValue::new(None, *self.item_type.clone()))
Value::List(ListValue::new(None, *self.inner.clone()))
}
fn as_arrow_type(&self) -> ArrowDataType {
let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true));
let field = Box::new(Field::new("item", self.inner.as_arrow_type(), true));
ArrowDataType::List(field)
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(ListVectorBuilder::with_type_capacity(
*self.item_type.clone(),
*self.inner.clone(),
capacity,
))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}
#[cfg(test)]

View File

@@ -27,7 +27,7 @@ pub struct NullType;
impl NullType {
pub fn arc() -> DataTypeRef {
Arc::new(NullType)
Arc::new(Self)
}
}
@@ -51,8 +51,4 @@ impl DataType for NullType {
fn create_mutable_vector(&self, _capacity: usize) -> Box<dyn MutableVector> {
Box::new(NullVectorBuilder::default())
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::Ordering;
use std::any::TypeId;
use std::marker::PhantomData;
use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType};
use common_time::{Date, DateTime};
use num::NumCast;
use arrow::array::PrimitiveArray;
use arrow::datatypes::DataType as ArrowDataType;
use paste::paste;
use serde::{Deserialize, Serialize};
use snafu::OptionExt;
@@ -24,226 +25,92 @@ use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Result};
use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder};
use crate::type_id::LogicalTypeId;
use crate::types::{DateTimeType, DateType};
use crate::types::primitive_traits::Primitive;
use crate::value::{Value, ValueRef};
use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector};
/// Data types that can be used as arrow's native type.
pub trait NativeType: ArrowNativeType + NumCast {
/// Largest numeric type this primitive type can be cast to.
type LargestType: NativeType;
#[derive(Clone, Serialize, Deserialize)]
pub struct PrimitiveType<T: Primitive> {
#[serde(skip)]
_phantom: PhantomData<T>,
}
macro_rules! impl_native_type {
($Type: ident, $LargestType: ident) => {
impl NativeType for $Type {
type LargestType = $LargestType;
}
};
impl<T: Primitive, U: Primitive> PartialEq<PrimitiveType<U>> for PrimitiveType<T> {
fn eq(&self, _other: &PrimitiveType<U>) -> bool {
TypeId::of::<T>() == TypeId::of::<U>()
}
}
impl_native_type!(u8, u64);
impl_native_type!(u16, u64);
impl_native_type!(u32, u64);
impl_native_type!(u64, u64);
impl_native_type!(i8, i64);
impl_native_type!(i16, i64);
impl_native_type!(i32, i64);
impl_native_type!(i64, i64);
impl_native_type!(f32, f64);
impl_native_type!(f64, f64);
impl<T: Primitive> Eq for PrimitiveType<T> {}
/// Represents the wrapper type that wraps a native type using the `newtype pattern`,
/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native
/// type `i32`.
pub trait WrapperType:
Copy
+ Scalar
+ PartialEq
+ Into<Value>
+ Into<ValueRef<'static>>
+ Serialize
+ Into<serde_json::Value>
/// A trait that provides helper methods for a primitive type to implement the [PrimitiveVector].
pub trait PrimitiveElement
where
for<'a> Self: Primitive
+ Scalar<VectorType = PrimitiveVector<Self>>
+ ScalarRef<'a, ScalarType = Self, VectorType = PrimitiveVector<Self>>
+ Scalar<RefType<'a> = Self>,
{
/// Logical primitive type that this wrapper type belongs to.
type LogicalType: LogicalPrimitiveType<Wrapper = Self, Native = Self::Native>;
/// The underlying native type.
type Native: NativeType;
/// Convert native type into this wrapper type.
fn from_native(value: Self::Native) -> Self;
/// Convert this wrapper type into native type.
fn into_native(self) -> Self::Native;
}
/// Trait bridging the logical primitive type with [ArrowPrimitiveType].
pub trait LogicalPrimitiveType: 'static + Sized {
/// Arrow primitive type of this logical type.
type ArrowPrimitive: ArrowPrimitiveType<Native = Self::Native>;
/// Native (physical) type of this logical type.
type Native: NativeType;
/// Wrapper type that the vector returns.
type Wrapper: WrapperType<LogicalType = Self, Native = Self::Native>
+ for<'a> Scalar<VectorType = PrimitiveVector<Self>, RefType<'a> = Self::Wrapper>
+ for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>;
/// Construct the data type struct.
fn build_data_type() -> ConcreteDataType;
/// Return the name of the type.
fn type_name() -> &'static str;
/// Returns the name of the type id.
fn type_name() -> String;
/// Dynamic cast the vector to the concrete vector type.
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<Self>>;
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<Self>>;
/// Cast value ref to the primitive type.
fn cast_value_ref(value: ValueRef) -> Result<Option<Self::Wrapper>>;
fn cast_value_ref(value: ValueRef) -> Result<Option<Self>>;
}
/// A new type for [WrapperType], complement the `Ord` feature for it. Wrapping non ordered
/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that
/// require `Ord`. For example, in `Median` or `Percentile` UDAFs.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct OrdPrimitive<T: WrapperType>(pub T);
macro_rules! impl_primitive_element {
($Type:ident, $TypeId:ident) => {
paste::paste! {
impl PrimitiveElement for $Type {
fn build_data_type() -> ConcreteDataType {
ConcreteDataType::$TypeId(PrimitiveType::<$Type>::default())
}
impl<T: WrapperType> OrdPrimitive<T> {
pub fn as_primitive(&self) -> T {
self.0
}
}
fn type_name() -> String {
stringify!($TypeId).to_string()
}
impl<T: WrapperType> Eq for OrdPrimitive<T> {}
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<$Type>> {
let primitive_vector = vector
.as_any()
.downcast_ref::<PrimitiveVector<$Type>>()
.with_context(|| error::CastTypeSnafu {
msg: format!(
"Failed to cast {} to vector of primitive type {}",
vector.vector_type_name(),
stringify!($TypeId)
),
})?;
Ok(&primitive_vector.array)
}
impl<T: WrapperType> PartialOrd for OrdPrimitive<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: WrapperType> Ord for OrdPrimitive<T> {
fn cmp(&self, other: &Self) -> Ordering {
Into::<Value>::into(self.0).cmp(&Into::<Value>::into(other.0))
}
}
impl<T: WrapperType> From<OrdPrimitive<T>> for Value {
fn from(p: OrdPrimitive<T>) -> Self {
p.0.into()
}
}
macro_rules! impl_wrapper {
($Type: ident, $LogicalType: ident) => {
impl WrapperType for $Type {
type LogicalType = $LogicalType;
type Native = $Type;
fn from_native(value: Self::Native) -> Self {
value
}
fn into_native(self) -> Self::Native {
self
}
}
};
}
impl_wrapper!(u8, UInt8Type);
impl_wrapper!(u16, UInt16Type);
impl_wrapper!(u32, UInt32Type);
impl_wrapper!(u64, UInt64Type);
impl_wrapper!(i8, Int8Type);
impl_wrapper!(i16, Int16Type);
impl_wrapper!(i32, Int32Type);
impl_wrapper!(i64, Int64Type);
impl_wrapper!(f32, Float32Type);
impl_wrapper!(f64, Float64Type);
impl WrapperType for Date {
type LogicalType = DateType;
type Native = i32;
fn from_native(value: i32) -> Self {
Date::new(value)
}
fn into_native(self) -> i32 {
self.val()
}
}
impl WrapperType for DateTime {
type LogicalType = DateTimeType;
type Native = i64;
fn from_native(value: Self::Native) -> Self {
DateTime::new(value)
}
fn into_native(self) -> Self::Native {
self.val()
}
}
macro_rules! define_logical_primitive_type {
($Native: ident, $TypeId: ident, $DataType: ident) => {
// We need to define it as an empty struct `struct DataType {}` instead of a struct-unit
// `struct DataType;` to ensure the serialized JSON string is compatible with previous
// implementation.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct $DataType {}
impl LogicalPrimitiveType for $DataType {
type ArrowPrimitive = arrow::datatypes::$DataType;
type Native = $Native;
type Wrapper = $Native;
fn build_data_type() -> ConcreteDataType {
ConcreteDataType::$TypeId($DataType::default())
}
fn type_name() -> &'static str {
stringify!($TypeId)
}
fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> {
vector
.as_any()
.downcast_ref::<PrimitiveVector<$DataType>>()
.with_context(|| error::CastTypeSnafu {
msg: format!(
"Failed to cast {} to vector of primitive type {}",
vector.vector_type_name(),
stringify!($TypeId)
),
})
}
fn cast_value_ref(value: ValueRef) -> Result<Option<$Native>> {
match value {
ValueRef::Null => Ok(None),
ValueRef::$TypeId(v) => Ok(Some(v.into())),
other => error::CastTypeSnafu {
msg: format!(
"Failed to cast value {:?} to primitive type {}",
other,
stringify!($TypeId),
),
fn cast_value_ref(value: ValueRef) -> Result<Option<Self>> {
match value {
ValueRef::Null => Ok(None),
ValueRef::$TypeId(v) => Ok(Some(v.into())),
other => error::CastTypeSnafu {
msg: format!(
"Failed to cast value {:?} to primitive type {}",
other,
stringify!($TypeId),
),
}.fail(),
}
.fail(),
}
}
}
};
}
macro_rules! define_non_timestamp_primitive {
($Native: ident, $TypeId: ident, $DataType: ident) => {
define_logical_primitive_type!($Native, $TypeId, $DataType);
impl DataType for $DataType {
macro_rules! impl_numeric {
($Type:ident, $TypeId:ident) => {
impl DataType for PrimitiveType<$Type> {
fn name(&self) -> &str {
stringify!($TypeId)
}
@@ -253,7 +120,7 @@ macro_rules! define_non_timestamp_primitive {
}
fn default_value(&self) -> Value {
$Native::default().into()
$Type::default().into()
}
fn as_arrow_type(&self) -> ArrowDataType {
@@ -261,98 +128,61 @@ macro_rules! define_non_timestamp_primitive {
}
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity))
Box::new(PrimitiveVectorBuilder::<$Type>::with_capacity(capacity))
}
}
fn is_timestamp_compatible(&self) -> bool {
false
impl std::fmt::Debug for PrimitiveType<$Type> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.name())
}
}
impl Default for PrimitiveType<$Type> {
fn default() -> Self {
Self {
_phantom: PhantomData,
}
}
}
impl_primitive_element!($Type, $TypeId);
paste! {
pub type [<$TypeId Type>]=PrimitiveType<$Type>;
}
};
}
define_non_timestamp_primitive!(u8, UInt8, UInt8Type);
define_non_timestamp_primitive!(u16, UInt16, UInt16Type);
define_non_timestamp_primitive!(u32, UInt32, UInt32Type);
define_non_timestamp_primitive!(u64, UInt64, UInt64Type);
define_non_timestamp_primitive!(i8, Int8, Int8Type);
define_non_timestamp_primitive!(i16, Int16, Int16Type);
define_non_timestamp_primitive!(i32, Int32, Int32Type);
define_non_timestamp_primitive!(f32, Float32, Float32Type);
define_non_timestamp_primitive!(f64, Float64, Float64Type);
// Timestamp primitive:
define_logical_primitive_type!(i64, Int64, Int64Type);
/// [`DataType`] implementation for the 64-bit signed integer logical type.
impl DataType for Int64Type {
/// Returns the display name of this type, `"Int64"`.
fn name(&self) -> &str {
"Int64"
}
/// Returns the logical type id, [`LogicalTypeId::Int64`].
fn logical_type_id(&self) -> LogicalTypeId {
LogicalTypeId::Int64
}
/// Returns the default value of this type, `Value::Int64(0)`.
fn default_value(&self) -> Value {
Value::Int64(0)
}
/// Returns the matching arrow type, `ArrowDataType::Int64`.
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Int64
}
/// Creates a boxed primitive vector builder for `Int64Type` with the given capacity.
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(PrimitiveVectorBuilder::<Int64Type>::with_capacity(capacity))
}
/// Always returns `true`: Int64 reports itself as timestamp compatible.
fn is_timestamp_compatible(&self) -> bool {
true
}
}
impl_numeric!(u8, UInt8);
impl_numeric!(u16, UInt16);
impl_numeric!(u32, UInt32);
impl_numeric!(u64, UInt64);
impl_numeric!(i8, Int8);
impl_numeric!(i16, Int16);
impl_numeric!(i32, Int32);
impl_numeric!(i64, Int64);
impl_numeric!(f32, Float32);
impl_numeric!(f64, Float64);
#[cfg(test)]
mod tests {
use std::collections::BinaryHeap;
use super::*;
#[test]
fn test_ord_primitive() {
struct Foo<T>
where
T: WrapperType,
{
heap: BinaryHeap<OrdPrimitive<T>>,
}
fn test_eq() {
assert_eq!(UInt8Type::default(), UInt8Type::default());
assert_eq!(UInt16Type::default(), UInt16Type::default());
assert_eq!(UInt32Type::default(), UInt32Type::default());
assert_eq!(UInt64Type::default(), UInt64Type::default());
assert_eq!(Int8Type::default(), Int8Type::default());
assert_eq!(Int16Type::default(), Int16Type::default());
assert_eq!(Int32Type::default(), Int32Type::default());
assert_eq!(Int64Type::default(), Int64Type::default());
assert_eq!(Float32Type::default(), Float32Type::default());
assert_eq!(Float64Type::default(), Float64Type::default());
impl<T> Foo<T>
where
T: WrapperType,
{
fn push(&mut self, value: T) {
let value = OrdPrimitive::<T>(value);
self.heap.push(value);
}
}
macro_rules! test {
($Type:ident) => {
let mut foo = Foo::<$Type> {
heap: BinaryHeap::new(),
};
foo.push($Type::default());
};
}
test!(u8);
test!(u16);
test!(u32);
test!(u64);
test!(i8);
test!(i16);
test!(i32);
test!(i64);
test!(f32);
test!(f64);
assert_ne!(Float32Type::default(), Float64Type::default());
assert_ne!(Float32Type::default(), Int32Type::default());
}
}

View File

@@ -18,10 +18,9 @@ use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::StringBytes;
use serde::{Deserialize, Serialize};
use crate::data_type::{DataType, DataTypeRef};
use crate::prelude::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId;
use crate::value::Value;
use crate::data_type::DataType;
use crate::prelude::{DataTypeRef, LogicalTypeId, Value};
use crate::scalars::ScalarVectorBuilder;
use crate::vectors::{MutableVector, StringVectorBuilder};
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -53,8 +52,4 @@ impl DataType for StringType {
fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
Box::new(StringVectorBuilder::with_capacity(capacity))
}
fn is_timestamp_compatible(&self) -> bool {
false
}
}

View File

@@ -110,7 +110,6 @@ impl Value {
/// # Panics
/// Panics if the data type is not supported.
pub fn data_type(&self) -> ConcreteDataType {
// TODO(yingwen): Implement this once all data types are implemented.
match self {
Value::Null => ConcreteDataType::null_datatype(),
Value::Boolean(_) => ConcreteDataType::boolean_datatype(),
@@ -126,10 +125,10 @@ impl Value {
Value::Float64(_) => ConcreteDataType::float64_datatype(),
Value::String(_) => ConcreteDataType::string_datatype(),
Value::Binary(_) => ConcreteDataType::binary_datatype(),
Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()),
Value::Date(_) => ConcreteDataType::date_datatype(),
Value::DateTime(_) => ConcreteDataType::datetime_datatype(),
Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()),
Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()),
}
}
@@ -194,12 +193,7 @@ impl Value {
Value::List(_) => LogicalTypeId::List,
Value::Date(_) => LogicalTypeId::Date,
Value::DateTime(_) => LogicalTypeId::DateTime,
Value::Timestamp(t) => match t.unit() {
TimeUnit::Second => LogicalTypeId::TimestampSecond,
TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond,
TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond,
TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond,
},
Value::Timestamp(_) => LogicalTypeId::Timestamp,
}
}
}
@@ -283,9 +277,6 @@ impl_value_from!(Float32, f32);
impl_value_from!(Float64, f64);
impl_value_from!(String, StringBytes);
impl_value_from!(Binary, Bytes);
impl_value_from!(Date, Date);
impl_value_from!(DateTime, DateTime);
impl_value_from!(Timestamp, Timestamp);
impl From<String> for Value {
fn from(string: String) -> Value {
@@ -305,6 +296,12 @@ impl From<Vec<u8>> for Value {
}
}
impl From<Timestamp> for Value {
fn from(v: Timestamp) -> Self {
Value::Timestamp(v)
}
}
impl From<&[u8]> for Value {
fn from(bytes: &[u8]) -> Value {
Value::Binary(bytes.into())
@@ -340,7 +337,6 @@ impl TryFrom<Value> for serde_json::Value {
}
}
// TODO(yingwen): Consider removing the `datatype` field from `ListValue`.
/// List value.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ListValue {
@@ -395,7 +391,6 @@ impl TryFrom<ScalarValue> for Value {
fn try_from(v: ScalarValue) -> Result<Self> {
let v = match v {
ScalarValue::Null => Value::Null,
ScalarValue::Boolean(b) => Value::from(b),
ScalarValue::Float32(f) => Value::from(f),
ScalarValue::Float64(f) => Value::from(f),
@@ -410,10 +405,8 @@ impl TryFrom<ScalarValue> for Value {
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => {
Value::from(s.map(StringBytes::from))
}
ScalarValue::Binary(b)
| ScalarValue::LargeBinary(b)
| ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)),
ScalarValue::List(vs, field) => {
ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)),
ScalarValue::List(vs, t) => {
let items = if let Some(vs) = vs {
let vs = vs
.into_iter()
@@ -423,7 +416,7 @@ impl TryFrom<ScalarValue> for Value {
} else {
None
};
let datatype = ConcreteDataType::try_from(field.data_type())?;
let datatype = t.as_ref().try_into()?;
Value::List(ListValue::new(items, datatype))
}
ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null),
@@ -442,13 +435,7 @@ impl TryFrom<ScalarValue> for Value {
ScalarValue::TimestampNanosecond(t, _) => t
.map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond)))
.unwrap_or(Value::Null),
ScalarValue::Decimal128(_, _, _)
| ScalarValue::Time64(_)
| ScalarValue::IntervalYearMonth(_)
| ScalarValue::IntervalDayTime(_)
| ScalarValue::IntervalMonthDayNano(_)
| ScalarValue::Struct(_, _)
| ScalarValue::Dictionary(_, _) => {
_ => {
return error::UnsupportedArrowTypeSnafu {
arrow_type: v.get_datatype(),
}
@@ -558,6 +545,15 @@ impl<'a> Ord for ValueRef<'a> {
}
}
/// A helper trait to convert copyable types to `ValueRef`.
///
/// It can be used in place of `Into<ValueRef<'a>>`, thus avoiding confusion between
/// `Into<Value>` and `Into<ValueRef<'a>>` in generic code. One typical usage is the
/// [`Primitive`](crate::primitive_traits::Primitive) trait.
pub trait IntoValueRef<'a> {
/// Converts `self` into a [ValueRef], consuming it.
fn into_value_ref(self) -> ValueRef<'a>;
}
macro_rules! impl_value_ref_from {
($Variant:ident, $Type:ident) => {
impl From<$Type> for ValueRef<'_> {
@@ -566,6 +562,12 @@ macro_rules! impl_value_ref_from {
}
}
impl<'a> IntoValueRef<'a> for $Type {
fn into_value_ref(self) -> ValueRef<'a> {
ValueRef::$Variant(self.into())
}
}
impl From<Option<$Type>> for ValueRef<'_> {
fn from(value: Option<$Type>) -> Self {
match value {
@@ -574,6 +576,15 @@ macro_rules! impl_value_ref_from {
}
}
}
impl<'a> IntoValueRef<'a> for Option<$Type> {
fn into_value_ref(self) -> ValueRef<'a> {
match self {
Some(v) => ValueRef::$Variant(v.into()),
None => ValueRef::Null,
}
}
}
};
}
@@ -588,9 +599,6 @@ impl_value_ref_from!(Int32, i32);
impl_value_ref_from!(Int64, i64);
impl_value_ref_from!(Float32, f32);
impl_value_ref_from!(Float64, f64);
impl_value_ref_from!(Date, Date);
impl_value_ref_from!(DateTime, DateTime);
impl_value_ref_from!(Timestamp, Timestamp);
impl<'a> From<&'a str> for ValueRef<'a> {
fn from(string: &'a str) -> ValueRef<'a> {
@@ -620,7 +628,6 @@ impl<'a> From<Option<ListValueRef<'a>>> for ValueRef<'a> {
/// if it becomes bottleneck.
#[derive(Debug, Clone, Copy)]
pub enum ListValueRef<'a> {
// TODO(yingwen): Consider replace this by VectorRef.
Indexed { vector: &'a ListVector, idx: usize },
Ref { val: &'a ListValue },
}
@@ -778,16 +785,19 @@ mod tests {
Some(Box::new(vec![Value::Int32(1), Value::Null])),
ConcreteDataType::int32_datatype()
)),
ScalarValue::new_list(
Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]),
ArrowDataType::Int32,
ScalarValue::List(
Some(Box::new(vec![
ScalarValue::Int32(Some(1)),
ScalarValue::Int32(None)
])),
Box::new(ArrowDataType::Int32)
)
.try_into()
.unwrap()
);
assert_eq!(
Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())),
ScalarValue::new_list(None, ArrowDataType::UInt32)
ScalarValue::List(None, Box::new(ArrowDataType::UInt32))
.try_into()
.unwrap()
);
@@ -970,10 +980,6 @@ mod tests {
ConcreteDataType::int32_datatype(),
)),
);
check_type_and_value(
&ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()),
&Value::List(ListValue::default()),
);
check_type_and_value(
&ConcreteDataType::date_datatype(),
&Value::Date(Date::new(1)),
@@ -983,7 +989,7 @@ mod tests {
&Value::DateTime(DateTime::new(1)),
);
check_type_and_value(
&ConcreteDataType::timestamp_millisecond_datatype(),
&ConcreteDataType::timestamp_millis_datatype(),
&Value::Timestamp(Timestamp::from_millis(1)),
);
}
@@ -1202,6 +1208,59 @@ mod tests {
assert!(wrong_value.as_list().is_err());
}
// Checks that `IntoValueRef::into_value_ref` and `ValueRef::from` agree for plain and
// `Option`-wrapped values, and that `None` maps to `ValueRef::Null`.
#[test]
fn test_into_value_ref() {
// `$Wrapper` is the type stored inside the `ValueRef` variant; it differs from
// `$PrimitiveType` for floats (`OrderedF32` / `OrderedF64`).
macro_rules! check_into_value_ref {
($Variant: ident, $data: expr, $PrimitiveType: ident, $Wrapper: ident) => {
let data: $PrimitiveType = $data;
// Plain value through the helper trait.
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
data.into_value_ref()
);
// Plain value through `From`.
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
ValueRef::from(data)
);
// `Some(value)` through the helper trait.
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
Some(data).into_value_ref()
);
// `Some(value)` through `From`.
assert_eq!(
ValueRef::$Variant($Wrapper::from(data)),
ValueRef::from(Some(data))
);
// `None` must become `ValueRef::Null` through both conversions.
let x: Option<$PrimitiveType> = None;
assert_eq!(ValueRef::Null, x.into_value_ref());
assert_eq!(ValueRef::Null, x.into());
};
}
// Shorthand for types whose variant payload is the primitive type itself.
macro_rules! check_primitive_into_value_ref {
($Variant: ident, $data: expr, $PrimitiveType: ident) => {
check_into_value_ref!($Variant, $data, $PrimitiveType, $PrimitiveType)
};
}
check_primitive_into_value_ref!(Boolean, true, bool);
check_primitive_into_value_ref!(UInt8, 10, u8);
check_primitive_into_value_ref!(UInt16, 20, u16);
check_primitive_into_value_ref!(UInt32, 30, u32);
check_primitive_into_value_ref!(UInt64, 40, u64);
check_primitive_into_value_ref!(Int8, -10, i8);
check_primitive_into_value_ref!(Int16, -20, i16);
check_primitive_into_value_ref!(Int32, -30, i32);
check_primitive_into_value_ref!(Int64, -40, i64);
check_into_value_ref!(Float32, 10.0, f32, OrderedF32);
check_into_value_ref!(Float64, 10.0, f64, OrderedF64);
// Borrowed byte slices and string slices convert via `From` only.
let hello = "hello";
assert_eq!(
ValueRef::Binary(hello.as_bytes()),
ValueRef::from(hello.as_bytes())
);
assert_eq!(ValueRef::String(hello), ValueRef::from(hello));
}
#[test]
fn test_display() {
assert_eq!(Value::Null.to_string(), "Null");
@@ -1242,34 +1301,10 @@ mod tests {
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_second_datatype(),
ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond),
))
.to_string(),
"TimestampSecondType[]"
);
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_millisecond_datatype(),
))
.to_string(),
"TimestampMillisecondType[]"
);
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_microsecond_datatype(),
))
.to_string(),
"TimestampMicrosecondType[]"
);
assert_eq!(
Value::List(ListValue::new(
Some(Box::new(vec![])),
ConcreteDataType::timestamp_nanosecond_datatype(),
))
.to_string(),
"TimestampNanosecondType[]"
"Timestamp[]"
);
}
}

View File

@@ -12,59 +12,68 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod binary;
pub mod boolean;
mod builder;
pub mod constant;
pub mod date;
pub mod datetime;
mod eq;
mod helper;
mod list;
pub mod mutable;
pub mod null;
mod operations;
pub mod primitive;
mod string;
mod timestamp;
use std::any::Any;
use std::fmt::Debug;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef};
use arrow::bitmap::Bitmap;
pub use binary::*;
pub use boolean::*;
pub use builder::VectorBuilder;
pub use constant::*;
pub use date::*;
pub use datetime::*;
pub use helper::Helper;
pub use list::*;
pub use mutable::MutableVector;
pub use null::*;
pub use operations::VectorOp;
pub use primitive::*;
use snafu::ensure;
pub use string::*;
pub use timestamp::*;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::serialize::Serializable;
use crate::value::{Value, ValueRef};
use crate::vectors::operations::VectorOp;
mod binary;
mod boolean;
mod constant;
mod date;
mod datetime;
mod eq;
mod helper;
mod list;
mod null;
mod operations;
mod primitive;
mod string;
mod timestamp;
mod validity;
/// Describes which slots of a vector are valid (non-null).
///
/// Either borrows a per-slot bitmap or records that every slot shares the same state.
#[derive(Debug, PartialEq)]
pub enum Validity<'a> {
/// Whether the array slot is valid or not (null).
Slots(&'a Bitmap),
/// All slots are valid.
AllValid,
/// All slots are null.
AllNull,
}
pub use binary::{BinaryVector, BinaryVectorBuilder};
pub use boolean::{BooleanVector, BooleanVectorBuilder};
pub use constant::ConstantVector;
pub use date::{DateVector, DateVectorBuilder};
pub use datetime::{DateTimeVector, DateTimeVectorBuilder};
pub use helper::Helper;
pub use list::{ListIter, ListVector, ListVectorBuilder};
pub use null::{NullVector, NullVectorBuilder};
pub use primitive::{
Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector,
Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder,
Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder,
UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector,
UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder,
};
pub use string::{StringVector, StringVectorBuilder};
pub use timestamp::{
TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector,
TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder,
TimestampSecondVector, TimestampSecondVectorBuilder,
};
pub use validity::Validity;
impl<'a> Validity<'a> {
    /// Returns the borrowed validity bitmap, or `None` when the validity is
    /// `AllValid` or `AllNull` and therefore carries no bitmap.
    ///
    /// The returned reference has the bitmap's own lifetime `'a` rather than the
    /// lifetime of `&self`, so it stays usable after this `Validity` value is dropped
    /// (e.g. `vector.validity().slots()` on a temporary).
    pub fn slots(&self) -> Option<&'a Bitmap> {
        match self {
            // `bitmap` is `&&'a Bitmap` here; copy out the inner `&'a Bitmap`.
            Validity::Slots(bitmap) => Some(*bitmap),
            Validity::AllValid | Validity::AllNull => None,
        }
    }
}
// TODO(yingwen): arrow 28.0 implements Clone for all arrays; we could upgrade to it and simplify
// the code in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`.
/// Vector of data values.
pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
/// Returns the data type of the vector.
@@ -101,7 +110,13 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
/// The number of null slots on this [`Vector`].
/// # Implementation
/// This is `O(1)`.
fn null_count(&self) -> usize;
// Default implementation derived from `validity()`.
fn null_count(&self) -> usize {
match self.validity() {
// Per-slot bitmap: delegate to the bitmap's own null count.
Validity::Slots(bitmap) => bitmap.null_count(),
Validity::AllValid => 0,
// Every slot is null, so the count equals the vector length.
Validity::AllNull => self.len(),
}
}
/// Returns true when it's a ConstantColumn
fn is_const(&self) -> bool {
@@ -150,42 +165,6 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp {
pub type VectorRef = Arc<dyn Vector>;
/// Mutable vector that can be used to build an immutable vector.
pub trait MutableVector: Send + Sync {
/// Returns the data type of the vector.
fn data_type(&self) -> ConcreteDataType;
/// Returns the length of the vector.
fn len(&self) -> usize;
/// Returns whether the vector is empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
/// Convert to Any, to enable dynamic casting.
fn as_any(&self) -> &dyn Any;
/// Convert to mutable Any, to enable dynamic casting.
fn as_mut_any(&mut self) -> &mut dyn Any;
/// Convert `self` to an (immutable) [VectorRef] and reset `self`.
fn to_vector(&mut self) -> VectorRef;
/// Pushes a value ref to this mutable vector.
///
/// # Errors
/// Returns an error if the data type of `value` does not match.
fn push_value_ref(&mut self, value: ValueRef) -> Result<()>;
/// Extends this mutable vector with a slice of `vector`.
///
/// # Errors
/// Returns an error if the data type of `vector` does not match.
///
/// # Panics
/// Panics if `offset + length > vector.len()`.
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>;
}
/// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function.
macro_rules! impl_try_from_arrow_array_for_vector {
($Array: ident, $Vector: ident) => {
@@ -193,20 +172,16 @@ macro_rules! impl_try_from_arrow_array_for_vector {
pub fn try_from_arrow_array(
array: impl AsRef<dyn arrow::array::Array>,
) -> crate::error::Result<$Vector> {
use snafu::OptionExt;
let data = array
.as_ref()
.as_any()
.downcast_ref::<$Array>()
.with_context(|| crate::error::ConversionSnafu {
from: std::format!("{:?}", array.as_ref().data_type()),
})?
.data()
.clone();
let concrete_array = $Array::from(data);
Ok($Vector::from(concrete_array))
Ok($Vector::from(
array
.as_ref()
.as_any()
.downcast_ref::<$Array>()
.with_context(|| crate::error::ConversionSnafu {
from: std::format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
}
}
};
@@ -214,7 +189,10 @@ macro_rules! impl_try_from_arrow_array_for_vector {
macro_rules! impl_validity_for_vector {
($array: expr) => {
Validity::from_array_data($array.data())
match $array.validity() {
Some(bitmap) => Validity::Slots(bitmap),
None => Validity::AllValid,
}
};
}
@@ -241,11 +219,10 @@ macro_rules! impl_get_ref_for_vector {
}
macro_rules! impl_extend_for_builder {
($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
($mutable_array: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{
use snafu::OptionExt;
let sliced_vector = $vector.slice($offset, $length);
let concrete_vector = sliced_vector
let concrete_vector = $vector
.as_any()
.downcast_ref::<$VectorType>()
.with_context(|| crate::error::CastTypeSnafu {
@@ -255,9 +232,8 @@ macro_rules! impl_extend_for_builder {
stringify!($VectorType)
),
})?;
for value in concrete_vector.iter_data() {
$mutable_vector.push(value);
}
let slice = concrete_vector.array.slice($offset, $length);
$mutable_array.extend_trusted_len(slice.iter());
Ok(())
}};
}
@@ -269,27 +245,27 @@ pub(crate) use {
#[cfg(test)]
pub mod tests {
use arrow::array::{Array, Int32Array, UInt8Array};
use arrow::array::{Array, PrimitiveArray};
use serde_json;
use super::helper::Helper;
use super::*;
use crate::data_type::DataType;
use crate::types::{Int32Type, LogicalPrimitiveType};
use crate::vectors::helper::Helper;
use crate::types::PrimitiveElement;
#[test]
fn test_df_columns_to_vector() {
let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
let df_column: Arc<dyn Array> = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3]));
let vector = Helper::try_into_vector(df_column).unwrap();
assert_eq!(
Int32Type::build_data_type().as_arrow_type(),
i32::build_data_type().as_arrow_type(),
vector.data_type().as_arrow_type()
);
}
#[test]
fn test_serialize_i32_vector() {
let df_column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![1, 2, 3]));
let df_column: Arc<dyn Array> = Arc::new(PrimitiveArray::<i32>::from_slice(vec![1, 2, 3]));
let json_value = Helper::try_into_vector(df_column)
.unwrap()
.serialize_to_json()
@@ -299,7 +275,7 @@ pub mod tests {
#[test]
fn test_serialize_i8_vector() {
let df_column: Arc<dyn Array> = Arc::new(UInt8Array::from(vec![1, 2, 3]));
let df_column: Arc<dyn Array> = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8]));
let json_value = Helper::try_into_vector(df_column)
.unwrap()
.serialize_to_json()

View File

@@ -15,8 +15,9 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef};
use snafu::ResultExt;
use arrow::array::{Array, ArrayRef};
use arrow::array::{ArrayIter, GenericByteArray};
use snafu::{OptionExt, ResultExt};
use crate::arrow_array::{BinaryArray, MutableBinaryArray};
use crate::data_type::ConcreteDataType;
@@ -36,16 +37,6 @@ impl BinaryVector {
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
/// Returns a clone of the underlying array's [`ArrayData`].
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
/// Builds a [`BinaryVector`] by converting `data` into a [`BinaryArray`].
fn from_array_data(data: ArrayData) -> BinaryVector {
BinaryVector {
array: BinaryArray::from(data),
}
}
}
impl From<BinaryArray> for BinaryVector {
@@ -57,7 +48,7 @@ impl From<BinaryArray> for BinaryVector {
impl From<Vec<Option<Vec<u8>>>> for BinaryVector {
fn from(data: Vec<Option<Vec<u8>>>) -> Self {
Self {
array: BinaryArray::from_iter(data),
array: BinaryArray::from(data),
}
}
}
@@ -80,13 +71,11 @@ impl Vector for BinaryVector {
}
fn to_arrow_array(&self) -> ArrayRef {
let data = self.to_array_data();
Arc::new(BinaryArray::from(data))
Arc::new(self.array.clone())
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let data = self.to_array_data();
Box::new(BinaryArray::from(data))
Box::new(self.array.clone())
}
fn validity(&self) -> Validity {
@@ -94,11 +83,7 @@ impl Vector for BinaryVector {
}
fn memory_size(&self) -> usize {
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
self.array.values().len() + self.array.offsets().len() * std::mem::size_of::<i64>()
}
fn is_null(&self, row: usize) -> bool {
@@ -106,8 +91,7 @@ impl Vector for BinaryVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
Arc::new(Self::from(self.array.slice(offset, length)))
}
fn get(&self, index: usize) -> Value {
@@ -164,15 +148,12 @@ impl MutableVector for BinaryVectorBuilder {
}
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
match value.as_binary()? {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
self.mutable_array.push(value.as_binary()?);
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length)
vectors::impl_extend_for_builder!(self.mutable_array, vector, BinaryVector, offset, length)
}
}
@@ -181,20 +162,17 @@ impl ScalarVectorBuilder for BinaryVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutableBinaryArray::with_capacity(capacity, 0),
mutable_array: MutableBinaryArray::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
match value {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
self.mutable_array.push(value);
}
fn finish(&mut self) -> Self::VectorType {
BinaryVector {
array: self.mutable_array.finish(),
array: std::mem::take(&mut self.mutable_array).into(),
}
}
}
@@ -227,17 +205,14 @@ mod tests {
#[test]
fn test_binary_vector_misc() {
let v = BinaryVector::from(BinaryArray::from_iter_values(&[
vec![1, 2, 3],
vec![1, 2, 3],
]));
let v = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]));
assert_eq!(2, v.len());
assert_eq!("BinaryVector", v.vector_type_name());
assert!(!v.is_const());
assert!(v.validity().is_all_valid());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
assert_eq!(128, v.memory_size());
assert_eq!(30, v.memory_size());
for i in 0..2 {
assert!(!v.is_null(i));
@@ -252,10 +227,7 @@ mod tests {
#[test]
fn test_serialize_binary_vector_to_json() {
let vector = BinaryVector::from(BinaryArray::from_iter_values(&[
vec![1, 2, 3],
vec![1, 2, 3],
]));
let vector = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]));
let json_value = vector.serialize_to_json().unwrap();
assert_eq!(
@@ -281,8 +253,8 @@ mod tests {
#[test]
fn test_from_arrow_array() {
let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]);
let original = BinaryArray::from(arrow_array.data().clone());
let arrow_array = BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]);
let original = arrow_array.clone();
let vector = BinaryVector::from(arrow_array);
assert_eq!(original, vector.array);
}
@@ -317,7 +289,7 @@ mod tests {
builder.push(Some(b"world"));
let vector = builder.finish();
assert_eq!(0, vector.null_count());
assert!(vector.validity().is_all_valid());
assert_eq!(Validity::AllValid, vector.validity());
let mut builder = BinaryVectorBuilder::with_capacity(3);
builder.push(Some(b"hello"));
@@ -326,10 +298,9 @@ mod tests {
let vector = builder.finish();
assert_eq!(1, vector.null_count());
let validity = vector.validity();
assert!(!validity.is_set(1));
assert_eq!(1, validity.null_count());
assert!(!validity.is_set(1));
let slots = validity.slots().unwrap();
assert_eq!(1, slots.null_count());
assert!(!slots.get_bit(1));
}
#[test]

View File

@@ -16,10 +16,9 @@ use std::any::Any;
use std::borrow::Borrow;
use std::sync::Arc;
use arrow::array::{
Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder,
};
use snafu::ResultExt;
use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray};
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
use snafu::{OptionExt, ResultExt};
use crate::data_type::ConcreteDataType;
use crate::error::Result;
@@ -42,26 +41,12 @@ impl BooleanVector {
pub(crate) fn as_boolean_array(&self) -> &BooleanArray {
&self.array
}
/// Returns a clone of the underlying array's [`ArrayData`].
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
/// Builds a [`BooleanVector`] by converting `data` into a [`BooleanArray`].
fn from_array_data(data: ArrayData) -> BooleanVector {
BooleanVector {
array: BooleanArray::from(data),
}
}
/// Forwards to `false_count()` of the underlying [`BooleanArray`]
/// (presumably the number of `false` values; how nulls are counted is not visible here).
pub(crate) fn false_count(&self) -> usize {
self.array.false_count()
}
}
impl From<Vec<bool>> for BooleanVector {
fn from(data: Vec<bool>) -> Self {
BooleanVector {
array: BooleanArray::from(data),
array: BooleanArray::from_slice(&data),
}
}
}
@@ -106,13 +91,11 @@ impl Vector for BooleanVector {
}
fn to_arrow_array(&self) -> ArrayRef {
let data = self.to_array_data();
Arc::new(BooleanArray::from(data))
Arc::new(self.array.clone())
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let data = self.to_array_data();
Box::new(BooleanArray::from(data))
Box::new(self.array.clone())
}
fn validity(&self) -> Validity {
@@ -120,11 +103,7 @@ impl Vector for BooleanVector {
}
fn memory_size(&self) -> usize {
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
self.array.values().as_slice().0.len()
}
fn is_null(&self, row: usize) -> bool {
@@ -132,8 +111,7 @@ impl Vector for BooleanVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
Arc::new(Self::from(self.array.slice(offset, length)))
}
fn get(&self, index: usize) -> Value {
@@ -148,7 +126,7 @@ impl Vector for BooleanVector {
impl ScalarVector for BooleanVector {
type OwnedItem = bool;
type RefItem<'a> = bool;
type Iter<'a> = ArrayIter<&'a BooleanArray>;
type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>;
type Builder = BooleanVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -165,7 +143,7 @@ impl ScalarVector for BooleanVector {
}
pub struct BooleanVectorBuilder {
mutable_array: BooleanBuilder,
mutable_array: MutableBooleanArray,
}
impl MutableVector for BooleanVectorBuilder {
@@ -190,15 +168,12 @@ impl MutableVector for BooleanVectorBuilder {
}
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
match value.as_boolean()? {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
self.mutable_array.push(value.as_boolean()?);
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length)
vectors::impl_extend_for_builder!(self.mutable_array, vector, BooleanVector, offset, length)
}
}
@@ -207,20 +182,17 @@ impl ScalarVectorBuilder for BooleanVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: BooleanBuilder::with_capacity(capacity),
mutable_array: MutableBooleanArray::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
match value {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
self.mutable_array.push(value);
}
fn finish(&mut self) -> Self::VectorType {
BooleanVector {
array: self.mutable_array.finish(),
array: std::mem::take(&mut self.mutable_array).into(),
}
}
}
@@ -253,9 +225,9 @@ mod tests {
assert_eq!(9, v.len());
assert_eq!("BooleanVector", v.vector_type_name());
assert!(!v.is_const());
assert!(v.validity().is_all_valid());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
assert_eq!(64, v.memory_size());
assert_eq!(2, v.memory_size());
for (i, b) in bools.iter().enumerate() {
assert!(!v.is_null(i));
@@ -344,12 +316,13 @@ mod tests {
let vector = BooleanVector::from(vec![Some(true), None, Some(false)]);
assert_eq!(1, vector.null_count());
let validity = vector.validity();
assert_eq!(1, validity.null_count());
assert!(!validity.is_set(1));
let slots = validity.slots().unwrap();
assert_eq!(1, slots.null_count());
assert!(!slots.get_bit(1));
let vector = BooleanVector::from(vec![true, false, false]);
assert_eq!(0, vector.null_count());
assert!(vector.validity().is_all_valid());
assert_eq!(Validity::AllValid, vector.validity());
}
#[test]

View File

@@ -55,27 +55,6 @@ impl ConstantVector {
pub fn get_constant_ref(&self) -> ValueRef {
self.vector.get_ref(0)
}
/// Replicates this constant vector according to `offsets`.
///
/// The result is a constant vector over the same inner vector whose length is the
/// last offset; empty `offsets` yield an empty slice of `self`.
///
/// # Panics
/// Panics if `offsets.len() != self.len()`.
pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef {
    assert_eq!(offsets.len(), self.len());

    // The last offset is the total length after replication.
    let total_len = match offsets.last() {
        Some(&last) => last,
        None => return self.slice(0, 0),
    };

    Arc::new(ConstantVector::new(self.vector.clone(), total_len))
}
/// Filters this constant vector with `filter`.
///
/// The new length is `self.len()` minus `filter.false_count()`. When nothing is
/// filtered out, a clone of `self` is returned; otherwise a new constant vector
/// over the same inner vector with the reduced length.
pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result<VectorRef> {
    let kept = self.len() - filter.false_count();

    let filtered = if kept == self.len() {
        self.clone()
    } else {
        ConstantVector::new(self.inner().clone(), kept)
    };

    Ok(Arc::new(filtered))
}
}
impl Vector for ConstantVector {
@@ -111,9 +90,9 @@ impl Vector for ConstantVector {
fn validity(&self) -> Validity {
if self.vector.is_null(0) {
Validity::all_null(self.length)
Validity::AllNull
} else {
Validity::all_valid(self.length)
Validity::AllValid
}
}
@@ -143,14 +122,6 @@ impl Vector for ConstantVector {
fn get_ref(&self, _index: usize) -> ValueRef {
self.vector.get_ref(0)
}
/// Returns `self.len()` when `only_null()` reports true, and `0` otherwise.
fn null_count(&self) -> usize {
if self.only_null() {
self.len()
} else {
0
}
}
}
impl fmt::Debug for ConstantVector {
@@ -169,6 +140,33 @@ impl Serializable for ConstantVector {
}
}
/// Replicates the constant `vector` according to `offsets`.
///
/// The result is a constant vector over the same inner vector whose length is the
/// last offset; empty `offsets` yield an empty slice of `vector`.
///
/// # Panics
/// Panics if `offsets.len() != vector.len()`.
pub(crate) fn replicate_constant(vector: &ConstantVector, offsets: &[usize]) -> VectorRef {
    assert_eq!(offsets.len(), vector.len());

    // The last offset is the total length after replication.
    let total_len = match offsets.last() {
        Some(&last) => last,
        None => return vector.slice(0, 0),
    };

    Arc::new(ConstantVector::new(vector.vector.clone(), total_len))
}
/// Filters the constant `vector` with `filter`.
///
/// The new length is `filter.len()` minus the `null_count()` of the filter's values
/// bitmap. When that equals `vector.len()`, a clone of `vector` is returned;
/// otherwise a new constant vector over the same inner vector with the new length.
pub(crate) fn filter_constant(
    vector: &ConstantVector,
    filter: &BooleanVector,
) -> Result<VectorRef> {
    let kept = filter.len() - filter.as_boolean_array().values().null_count();

    let filtered = if kept == vector.len() {
        vector.clone()
    } else {
        ConstantVector::new(vector.inner().clone(), kept)
    };

    Ok(Arc::new(filtered))
}
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
@@ -184,9 +182,9 @@ mod tests {
assert_eq!("ConstantVector", c.vector_type_name());
assert!(c.is_const());
assert_eq!(10, c.len());
assert!(c.validity().is_all_valid());
assert_eq!(Validity::AllValid, c.validity());
assert!(!c.only_null());
assert_eq!(64, c.memory_size());
assert_eq!(4, c.memory_size());
for i in 0..10 {
assert!(!c.is_null(i));

View File

@@ -12,28 +12,258 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::types::DateType;
use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
use std::any::Any;
use std::sync::Arc;
// Vector for [`Date`](common_time::Date).
pub type DateVector = PrimitiveVector<DateType>;
// Builder to build DateVector.
pub type DateVectorBuilder = PrimitiveVectorBuilder<DateType>;
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use common_time::date::Date;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::prelude::*;
use crate::scalars::ScalarVector;
use crate::serialize::Serializable;
use crate::vectors::{MutableVector, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
/// Vector of `Date` values, stored in an inner `PrimitiveVector<i32>`.
#[derive(Debug, Clone, PartialEq)]
pub struct DateVector {
// Raw `i32` values; they are wrapped with `Date::new` when read back.
array: PrimitiveVector<i32>,
}
impl DateVector {
    /// Creates a `DateVector` that wraps the given primitive `i32` array.
    pub fn new(array: PrimitiveArray<i32>) -> Self {
        let array = PrimitiveVector { array };
        Self { array }
    }

    /// Builds a `DateVector` from an arrow array.
    ///
    /// # Errors
    /// Returns a conversion error carrying the array's data type when the array
    /// cannot be downcast to `PrimitiveArray<i32>`.
    pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
        let arrow_array = array.as_ref();
        let primitive = arrow_array
            .as_any()
            .downcast_ref::<PrimitiveArray<i32>>()
            .with_context(|| error::ConversionSnafu {
                from: format!("{:?}", arrow_array.data_type()),
            })?;
        Ok(Self::new(primitive.clone()))
    }

    /// Returns the underlying arrow array of the inner primitive vector.
    pub(crate) fn as_arrow(&self) -> &dyn Array {
        self.array.as_arrow()
    }
}
impl Vector for DateVector {
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::date_datatype()
    }

    fn vector_type_name(&self) -> String {
        String::from("DateVector")
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn len(&self) -> usize {
        self.array.len()
    }

    /// Rebuilds the inner values and validity as an arrow array typed `Date32`.
    fn to_arrow_array(&self) -> ArrayRef {
        let inner = &self.array.array;
        let validity = inner.validity().cloned();
        let values = inner.values().clone();
        Arc::new(PrimitiveArray::new(
            arrow::datatypes::DataType::Date32,
            values,
            validity,
        ))
    }

    /// Same as `to_arrow_array`, but returns the array in a `Box`.
    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
        let inner = &self.array.array;
        let validity = inner.validity().cloned();
        let values = inner.values().clone();
        Box::new(PrimitiveArray::new(
            arrow::datatypes::DataType::Date32,
            values,
            validity,
        ))
    }

    fn validity(&self) -> Validity {
        self.array.validity()
    }

    fn memory_size(&self) -> usize {
        self.array.memory_size()
    }

    fn is_null(&self, row: usize) -> bool {
        self.array.is_null(row)
    }

    fn slice(&self, offset: usize, length: usize) -> VectorRef {
        let sliced = self.array.array.slice(offset, length);
        Arc::new(Self {
            array: PrimitiveVector::new(sliced),
        })
    }

    /// Reads the raw `i32` at `index` and wraps it as a `Date` value.
    fn get(&self, index: usize) -> Value {
        match self.array.get(index) {
            Value::Null => Value::Null,
            Value::Int32(days) => Value::Date(Date::new(days)),
            // The inner vector holds `i32`, so no other variant is expected.
            _ => unreachable!(),
        }
    }

    /// Reads the raw `i32` at `index` and wraps it as a `Date` value reference.
    fn get_ref(&self, index: usize) -> ValueRef {
        match self.array.get(index) {
            Value::Null => ValueRef::Null,
            Value::Int32(days) => ValueRef::Date(Date::new(days)),
            // The inner vector holds `i32`, so no other variant is expected.
            _ => unreachable!(),
        }
    }
}
impl From<Vec<Option<i32>>> for DateVector {
    /// Builds a `DateVector` from optional raw `i32` values.
    fn from(data: Vec<Option<i32>>) -> Self {
        let array = PrimitiveVector::<i32>::from(data);
        Self { array }
    }
}
/// Iterator adapter over a `PrimitiveIter<i32>`; its `Iterator` impl yields `Option<Date>`.
pub struct DateIter<'a> {
iter: PrimitiveIter<'a, i32>,
}
impl<'a> Iterator for DateIter<'a> {
    type Item = Option<Date>;

    /// Advances the inner iterator and wraps a present value with `Date::new`.
    fn next(&mut self) -> Option<Self::Item> {
        let raw = self.iter.next()?;
        Some(raw.map(Date::new))
    }
}
impl ScalarVector for DateVector {
    type OwnedItem = Date;
    type RefItem<'a> = Date;
    type Iter<'a> = DateIter<'a>;
    type Builder = DateVectorBuilder;

    /// Returns the date at `idx`, or `None` when the inner vector has no data there.
    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
        let days = self.array.get_data(idx)?;
        Some(Date::new(days))
    }

    /// Iterates over the inner vector, yielding each element as `Option<Date>`.
    fn iter_data(&self) -> Self::Iter<'_> {
        let iter = self.array.iter_data();
        DateIter { iter }
    }
}
impl Serializable for DateVector {
    /// Serializes every element to JSON: missing values become JSON `null`,
    /// present values are converted from `Date` via `Into<serde_json::Value>`.
    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
        let json_values = self
            .array
            .iter_data()
            .map(|days| days.map_or(serde_json::Value::Null, |d| Date::new(d).into()))
            .collect::<Vec<_>>();
        Ok(json_values)
    }
}
/// Builder for `DateVector`; values are accumulated in a `PrimitiveVectorBuilder<i32>`.
pub struct DateVectorBuilder {
buffer: PrimitiveVectorBuilder<i32>,
}
impl MutableVector for DateVectorBuilder {
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::date_datatype()
    }

    fn len(&self) -> usize {
        self.buffer.len()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }

    fn to_vector(&mut self) -> VectorRef {
        Arc::new(self.finish())
    }

    /// Pushes `value` after converting it with `as_date()`; a conversion failure
    /// is returned as an error and nothing is pushed.
    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
        let days = value.as_date()?.map(|date| date.val());
        self.buffer.push(days);
        Ok(())
    }

    /// Appends `length` elements of `vector`, starting at `offset`.
    ///
    /// # Errors
    /// Returns a cast-type error when `vector` is not a `DateVector`.
    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
        let dates = vector
            .as_any()
            .downcast_ref::<DateVector>()
            .with_context(|| error::CastTypeSnafu {
                msg: format!(
                    "Failed to convert vector from {} to DateVector",
                    vector.vector_type_name()
                ),
            })?;

        self.buffer.extend_slice_of(&dates.array, offset, length)?;
        Ok(())
    }
}
impl ScalarVectorBuilder for DateVectorBuilder {
type VectorType = DateVector;
fn with_capacity(capacity: usize) -> Self {
Self {
buffer: PrimitiveVectorBuilder::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.buffer.push(value.map(|d| d.val()))
}
fn finish(&mut self) -> Self::VectorType {
Self::VectorType {
array: self.buffer.finish(),
}
}
}
/// Replicates `vector` according to `offsets` by delegating to
/// `replicate_primitive_with_type` with the date data type, then wraps the
/// replicated primitive vector in a new `DateVector`.
pub(crate) fn replicate_date(vector: &DateVector, offsets: &[usize]) -> VectorRef {
    let data_type = vector.data_type();
    let replicated = crate::vectors::primitive::replicate_primitive_with_type(
        &vector.array,
        offsets,
        data_type,
    );
    Arc::new(DateVector { array: replicated })
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::Array;
use common_time::date::Date;
use super::*;
use crate::data_type::DataType;
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::DateType;
use crate::value::{Value, ValueRef};
use crate::vectors::{Vector, VectorRef};
#[test]
fn test_build_date_vector() {
@@ -58,7 +288,7 @@ mod tests {
#[test]
fn test_date_scalar() {
let vector = DateVector::from_slice(&[1, 2]);
let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]);
assert_eq!(2, vector.len());
assert_eq!(Some(Date::new(1)), vector.get_data(0));
assert_eq!(Some(Date::new(2)), vector.get_data(1));
@@ -66,7 +296,7 @@ mod tests {
#[test]
fn test_date_vector_builder() {
let input = DateVector::from_slice(&[1, 2, 3]);
let input = DateVector::from_slice(&[Date::new(1), Date::new(2), Date::new(3)]);
let mut builder = DateType::default().create_mutable_vector(3);
builder
@@ -79,25 +309,19 @@ mod tests {
.is_err());
let vector = builder.to_vector();
let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3]));
let expect: VectorRef = Arc::new(DateVector::from_slice(&[
Date::new(5),
Date::new(2),
Date::new(3),
]));
assert_eq!(expect, vector);
}
#[test]
fn test_date_from_arrow() {
let vector = DateVector::from_slice(&[1, 2]);
let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]);
let arrow = vector.as_arrow().slice(0, vector.len());
let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap();
assert_eq!(vector, vector2);
}
#[test]
fn test_serialize_date_vector() {
let vector = DateVector::from_slice(&[-1, 0, 1]);
let serialized_json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(
r#"["1969-12-31","1970-01-01","1970-01-02"]"#,
serialized_json
);
}
}

View File

@@ -12,32 +12,264 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::types::DateTimeType;
use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
use std::any::Any;
use std::sync::Arc;
/// Vector of [`DateTime`](common_time::DateTime)
pub type DateTimeVector = PrimitiveVector<DateTimeType>;
/// Builder for [`DateTimeVector`].
pub type DateTimeVectorBuilder = PrimitiveVectorBuilder<DateTimeType>;
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use common_time::datetime::DateTime;
use snafu::OptionExt;
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::prelude::{
MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef,
};
use crate::serialize::Serializable;
use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
/// Vector of `DateTime` values, stored in an inner `PrimitiveVector<i64>`.
#[derive(Debug, Clone, PartialEq)]
pub struct DateTimeVector {
// Raw `i64` values; they are wrapped with `DateTime::new` when read back.
array: PrimitiveVector<i64>,
}
impl DateTimeVector {
    /// Creates a `DateTimeVector` that wraps the given primitive `i64` array.
    pub fn new(array: PrimitiveArray<i64>) -> Self {
        let array = PrimitiveVector { array };
        Self { array }
    }

    /// Builds a `DateTimeVector` from an arrow array.
    ///
    /// # Errors
    /// Returns a conversion error carrying the array's data type when the array
    /// cannot be downcast to `PrimitiveArray<i64>`.
    pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
        let arrow_array = array.as_ref();
        let primitive = arrow_array
            .as_any()
            .downcast_ref::<PrimitiveArray<i64>>()
            .with_context(|| error::ConversionSnafu {
                from: format!("{:?}", arrow_array.data_type()),
            })?;
        Ok(Self::new(primitive.clone()))
    }

    /// Returns the underlying arrow array of the inner primitive vector.
    pub(crate) fn as_arrow(&self) -> &dyn Array {
        self.array.as_arrow()
    }
}
impl Vector for DateTimeVector {
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::datetime_datatype()
    }

    fn vector_type_name(&self) -> String {
        String::from("DateTimeVector")
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn len(&self) -> usize {
        self.array.len()
    }

    /// Rebuilds the inner values and validity as an arrow array typed `Date64`.
    fn to_arrow_array(&self) -> ArrayRef {
        let inner = &self.array.array;
        let validity = inner.validity().cloned();
        let values = inner.values().clone();
        Arc::new(PrimitiveArray::new(
            arrow::datatypes::DataType::Date64,
            values,
            validity,
        ))
    }

    /// Same as `to_arrow_array`, but returns the array in a `Box`.
    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
        let inner = &self.array.array;
        let validity = inner.validity().cloned();
        let values = inner.values().clone();
        Box::new(PrimitiveArray::new(
            arrow::datatypes::DataType::Date64,
            values,
            validity,
        ))
    }

    fn validity(&self) -> Validity {
        self.array.validity()
    }

    fn memory_size(&self) -> usize {
        self.array.memory_size()
    }

    fn is_null(&self, row: usize) -> bool {
        self.array.is_null(row)
    }

    fn slice(&self, offset: usize, length: usize) -> VectorRef {
        let sliced = self.array.array.slice(offset, length);
        Arc::new(Self {
            array: PrimitiveVector::new(sliced),
        })
    }

    /// Reads the raw `i64` at `index` and wraps it as a `DateTime` value.
    fn get(&self, index: usize) -> Value {
        match self.array.get(index) {
            Value::Null => Value::Null,
            Value::Int64(raw) => Value::DateTime(DateTime::new(raw)),
            // The inner vector holds `i64`, so no other variant is expected.
            _ => unreachable!(),
        }
    }

    /// Reads the raw `i64` at `index` and wraps it as a `DateTime` value reference.
    fn get_ref(&self, index: usize) -> ValueRef {
        match self.array.get(index) {
            Value::Null => ValueRef::Null,
            Value::Int64(raw) => ValueRef::DateTime(DateTime::new(raw)),
            // The inner vector holds `i64`, so no other variant is expected.
            _ => unreachable!(),
        }
    }
}
impl Serializable for DateTimeVector {
    /// Serializes every element to JSON: missing values become JSON `null`,
    /// present values are converted from `DateTime` via `Into<serde_json::Value>`.
    fn serialize_to_json(&self) -> crate::Result<Vec<serde_json::Value>> {
        let json_values = self
            .array
            .iter_data()
            .map(|raw| raw.map_or(serde_json::Value::Null, |v| DateTime::new(v).into()))
            .collect::<Vec<_>>();
        Ok(json_values)
    }
}
impl From<Vec<Option<i64>>> for DateTimeVector {
    /// Builds a `DateTimeVector` from optional raw `i64` values.
    fn from(data: Vec<Option<i64>>) -> Self {
        let array = PrimitiveVector::<i64>::from(data);
        Self { array }
    }
}
/// Builder for `DateTimeVector`; values are accumulated in a `PrimitiveVectorBuilder<i64>`.
pub struct DateTimeVectorBuilder {
buffer: PrimitiveVectorBuilder<i64>,
}
impl ScalarVectorBuilder for DateTimeVectorBuilder {
type VectorType = DateTimeVector;
fn with_capacity(capacity: usize) -> Self {
Self {
buffer: PrimitiveVectorBuilder::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.buffer.push(value.map(|d| d.val()))
}
fn finish(&mut self) -> Self::VectorType {
Self::VectorType {
array: self.buffer.finish(),
}
}
}
impl MutableVector for DateTimeVectorBuilder {
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::datetime_datatype()
    }

    fn len(&self) -> usize {
        self.buffer.len()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }

    fn to_vector(&mut self) -> VectorRef {
        Arc::new(self.finish())
    }

    /// Pushes `value` after converting it with `as_datetime()`; a conversion
    /// failure is returned as an error and nothing is pushed.
    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
        self.buffer.push(value.as_datetime()?.map(|d| d.val()));
        Ok(())
    }

    /// Appends `length` elements of `vector`, starting at `offset`.
    ///
    /// # Errors
    /// Returns a cast-type error when `vector` is not a `DateTimeVector`.
    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
        let concrete_vector = vector
            .as_any()
            .downcast_ref::<DateTimeVector>()
            .with_context(|| error::CastTypeSnafu {
                // Name the actual target type: the downcast is to `DateTimeVector`.
                msg: format!(
                    "Failed to convert vector from {} to DateTimeVector",
                    vector.vector_type_name()
                ),
            })?;
        self.buffer
            .extend_slice_of(&concrete_vector.array, offset, length)?;
        Ok(())
    }
}
/// Iterator adapter over a `PrimitiveIter<i64>`; its `Iterator` impl yields `Option<DateTime>`.
pub struct DateTimeIter<'a> {
iter: PrimitiveIter<'a, i64>,
}
impl<'a> Iterator for DateTimeIter<'a> {
    type Item = Option<DateTime>;

    /// Advances the inner iterator and wraps a present value with `DateTime::new`.
    fn next(&mut self) -> Option<Self::Item> {
        let raw = self.iter.next()?;
        Some(raw.map(DateTime::new))
    }
}
impl ScalarVector for DateTimeVector {
    type OwnedItem = DateTime;
    type RefItem<'a> = DateTime;
    type Iter<'a> = DateTimeIter<'a>;
    type Builder = DateTimeVectorBuilder;

    /// Returns the datetime at `idx`, or `None` when the inner vector has no data there.
    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
        let raw = self.array.get_data(idx)?;
        Some(DateTime::new(raw))
    }

    /// Iterates over the inner vector, yielding each element as `Option<DateTime>`.
    fn iter_data(&self) -> Self::Iter<'_> {
        let iter = self.array.iter_data();
        DateTimeIter { iter }
    }
}
/// Replicates `vector` according to `offsets` by delegating to
/// `replicate_primitive_with_type` with the datetime data type, then wraps the
/// replicated primitive vector in a new `DateTimeVector`.
pub(crate) fn replicate_datetime(vector: &DateTimeVector, offsets: &[usize]) -> VectorRef {
    let data_type = vector.data_type();
    let replicated = crate::vectors::primitive::replicate_primitive_with_type(
        &vector.array,
        offsets,
        data_type,
    );
    Arc::new(DateTimeVector { array: replicated })
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::{Array, PrimitiveArray};
use common_time::DateTime;
use datafusion_common::from_slice::FromSlice;
use std::assert_matches::assert_matches;
use super::*;
use crate::data_type::DataType;
use crate::prelude::{
ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef,
};
use crate::serialize::Serializable;
use crate::types::DateTimeType;
#[test]
fn test_datetime_vector() {
let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3]));
let v = DateTimeVector::new(PrimitiveArray::from_vec(vec![1, 2, 3]));
assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type());
assert_eq!(3, v.len());
assert_eq!("DateTimeVector", v.vector_type_name());
@@ -55,8 +287,9 @@ mod tests {
assert_eq!(Some(DateTime::new(2)), iter.next().unwrap());
assert_eq!(Some(DateTime::new(3)), iter.next().unwrap());
assert!(!v.is_null(0));
assert_eq!(64, v.memory_size());
assert_eq!(24, v.memory_size()); // size of i64 * 3
assert_matches!(v.validity(), Validity::AllValid);
if let Value::DateTime(d) = v.get(0) {
assert_eq!(1, d.val());
} else {
@@ -81,11 +314,8 @@ mod tests {
assert_eq!(Value::Null, v.get(1));
assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2));
let input = DateTimeVector::from_wrapper_slice(&[
DateTime::new(1),
DateTime::new(2),
DateTime::new(3),
]);
let input =
DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2), DateTime::new(3)]);
let mut builder = DateTimeType::default().create_mutable_vector(3);
builder
@@ -98,7 +328,7 @@ mod tests {
.is_err());
let vector = builder.to_vector();
let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[
let expect: VectorRef = Arc::new(DateTimeVector::from_slice(&[
DateTime::new(5),
DateTime::new(2),
DateTime::new(3),
@@ -108,7 +338,7 @@ mod tests {
#[test]
fn test_datetime_from_arrow() {
let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]);
let vector = DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2)]);
let arrow = vector.as_arrow().slice(0, vector.len());
let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap();
assert_eq!(vector, vector2);

View File

@@ -15,12 +15,9 @@
use std::sync::Arc;
use crate::data_type::DataType;
use crate::types::TimestampType;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{
BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector,
StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
TimestampNanosecondVector, TimestampSecondVector, Vector,
BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector,
PrimitiveVector, StringVector, TimestampVector, Vector,
};
use crate::with_match_primitive_type_id;
@@ -79,20 +76,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
String(_) => is_vector_eq!(StringVector, lhs, rhs),
Date(_) => is_vector_eq!(DateVector, lhs, rhs),
DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs),
Timestamp(t) => match t {
TimestampType::Second(_) => {
is_vector_eq!(TimestampSecondVector, lhs, rhs)
}
TimestampType::Millisecond(_) => {
is_vector_eq!(TimestampMillisecondVector, lhs, rhs)
}
TimestampType::Microsecond(_) => {
is_vector_eq!(TimestampMicrosecondVector, lhs, rhs)
}
TimestampType::Nanosecond(_) => {
is_vector_eq!(TimestampNanosecondVector, lhs, rhs)
}
},
Timestamp(_) => is_vector_eq!(TimestampVector, lhs, rhs),
List(_) => is_vector_eq!(ListVector, lhs, rhs),
UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_)
| Float32(_) | Float64(_) => {
@@ -111,10 +95,13 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
#[cfg(test)]
mod tests {
use arrow::array::{ListArray, MutableListArray, MutablePrimitiveArray, TryExtend};
use super::*;
use crate::vectors::{
list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector,
NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef,
Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector,
NullVector, TimestampVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector,
VectorRef,
};
fn assert_vector_ref_eq(vector: VectorRef) {
@@ -145,21 +132,14 @@ mod tests {
assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false])));
assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)])));
assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)])));
assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120])));
assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([
100, 120,
])));
assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([
100, 120,
])));
assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120])));
assert_vector_ref_eq(Arc::new(TimestampVector::from_values([100, 120])));
let list_vector = list::tests::new_list_vector(&[
Some(vec![Some(1), Some(2)]),
None,
Some(vec![Some(3), Some(4)]),
]);
assert_vector_ref_eq(Arc::new(list_vector));
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new();
arrow_array
.try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])])
.unwrap();
let arrow_array: ListArray<i32> = arrow_array.into();
assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array)));
assert_vector_ref_eq(Arc::new(NullVector::new(4)));
assert_vector_ref_eq(Arc::new(StringVector::from(vec![

View File

@@ -17,26 +17,19 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::array::Array;
use arrow::compute;
use arrow::compute::kernels::comparison;
use arrow::datatypes::{DataType as ArrowDataType, TimeUnit};
use arrow::datatypes::DataType as ArrowDataType;
use datafusion_common::ScalarValue;
use snafu::{OptionExt, ResultExt};
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::scalars::{Scalar, ScalarVectorBuilder};
use crate::value::{ListValue, ListValueRef};
use crate::vectors::{
BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector,
Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector,
ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector,
TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector,
UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef,
};
use crate::arrow_array::StringArray;
use crate::error::{ConversionSnafu, Result, UnknownVectorSnafu};
use crate::scalars::*;
use crate::vectors::date::DateVector;
use crate::vectors::datetime::DateTimeVector;
use crate::vectors::*;
/// Helper functions for `Vector`.
pub struct Helper;
impl Helper {
@@ -54,7 +47,7 @@ impl Helper {
let arr = vector
.as_any()
.downcast_ref::<<T as Scalar>::VectorType>()
.with_context(|| error::UnknownVectorSnafu {
.with_context(|| UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
vector.vector_type_name(),
@@ -68,7 +61,7 @@ impl Helper {
let arr = vector
.as_any()
.downcast_ref::<T>()
.with_context(|| error::UnknownVectorSnafu {
.with_context(|| UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
vector.vector_type_name(),
@@ -85,7 +78,7 @@ impl Helper {
let arr = vector
.as_mut_any()
.downcast_mut()
.with_context(|| error::UnknownVectorSnafu {
.with_context(|| UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
ty,
@@ -101,7 +94,7 @@ impl Helper {
let arr = vector
.as_any()
.downcast_ref::<<T as Scalar>::VectorType>()
.with_context(|| error::UnknownVectorSnafu {
.with_context(|| UnknownVectorSnafu {
msg: format!(
"downcast vector error, vector type: {:?}, expected vector: {:?}",
vector.vector_type_name(),
@@ -112,9 +105,11 @@ impl Helper {
}
/// Try to cast an arrow scalar value into vector
///
/// # Panics
/// Panic if given scalar value is not supported.
pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result<VectorRef> {
let vector = match value {
ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length),
ScalarValue::Boolean(v) => {
ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length)
}
@@ -148,29 +143,17 @@ impl Helper {
ScalarValue::UInt64(v) => {
ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length)
}
ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => {
ScalarValue::Utf8(v) => {
ConstantVector::new(Arc::new(StringVector::from(vec![v])), length)
}
ScalarValue::Binary(v)
| ScalarValue::LargeBinary(v)
| ScalarValue::FixedSizeBinary(_, v) => {
ScalarValue::LargeUtf8(v) => {
ConstantVector::new(Arc::new(StringVector::from(vec![v])), length)
}
ScalarValue::Binary(v) => {
ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length)
}
ScalarValue::List(v, field) => {
let item_type = ConcreteDataType::try_from(field.data_type())?;
let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1);
if let Some(values) = v {
let values = values
.into_iter()
.map(ScalarValue::try_into)
.collect::<Result<_>>()?;
let list_value = ListValue::new(Some(Box::new(values)), item_type);
builder.push(Some(ListValueRef::Ref { val: &list_value }));
} else {
builder.push(None);
}
let list_vector = builder.to_vector();
ConstantVector::new(list_vector, length)
ScalarValue::LargeBinary(v) => {
ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length)
}
ScalarValue::Date32(v) => {
ConstantVector::new(Arc::new(DateVector::from(vec![v])), length)
@@ -178,30 +161,8 @@ impl Helper {
ScalarValue::Date64(v) => {
ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length)
}
ScalarValue::TimestampSecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length)
}
ScalarValue::TimestampMillisecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length)
}
ScalarValue::TimestampMicrosecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length)
}
ScalarValue::TimestampNanosecond(v, _) => {
// Timezone is unimplemented now.
ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length)
}
ScalarValue::Decimal128(_, _, _)
| ScalarValue::Time64(_)
| ScalarValue::IntervalYearMonth(_)
| ScalarValue::IntervalDayTime(_)
| ScalarValue::IntervalMonthDayNano(_)
| ScalarValue::Struct(_, _)
| ScalarValue::Dictionary(_, _) => {
return error::ConversionSnafu {
_ => {
return ConversionSnafu {
from: format!("Unsupported scalar value: {}", value),
}
.fail()
@@ -219,7 +180,9 @@ impl Helper {
Ok(match array.as_ref().data_type() {
ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?),
ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?),
ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?),
ArrowDataType::Binary | ArrowDataType::LargeBinary => {
Arc::new(BinaryVector::try_from_arrow_array(array)?)
}
ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?),
ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?),
ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?),
@@ -230,80 +193,48 @@ impl Helper {
ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?),
ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?),
ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?),
ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?),
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => {
Arc::new(StringVector::try_from_arrow_array(array)?)
}
ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?),
ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?),
ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?),
ArrowDataType::Timestamp(unit, _) => match unit {
TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?),
TimeUnit::Millisecond => {
Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?)
}
TimeUnit::Microsecond => {
Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?)
}
TimeUnit::Nanosecond => {
Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?)
}
},
ArrowDataType::Float16
| ArrowDataType::Time32(_)
| ArrowDataType::Time64(_)
| ArrowDataType::Duration(_)
| ArrowDataType::Interval(_)
| ArrowDataType::Binary
| ArrowDataType::FixedSizeBinary(_)
| ArrowDataType::LargeUtf8
| ArrowDataType::LargeList(_)
| ArrowDataType::FixedSizeList(_, _)
| ArrowDataType::Struct(_)
| ArrowDataType::Union(_, _, _)
| ArrowDataType::Dictionary(_, _)
| ArrowDataType::Decimal128(_, _)
| ArrowDataType::Decimal256(_, _)
| ArrowDataType::Map(_, _) => {
unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type())
ArrowDataType::Timestamp(_, _) => {
Arc::new(TimestampVector::try_from_arrow_array(array)?)
}
_ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()),
})
}
/// Try to cast slice of `arrays` to vectors.
pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result<Vec<VectorRef>> {
arrays.iter().map(Self::try_into_vector).collect()
}
/// Perform SQL like operation on `names` and a scalar `s`.
pub fn like_utf8(names: Vec<String>, s: &str) -> Result<VectorRef> {
let array = StringArray::from(names);
let array = StringArray::from_slice(&names);
let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
let filter =
compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?;
let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?;
Helper::try_into_vector(result)
}
}
#[cfg(test)]
mod tests {
use arrow::array::{
ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array,
Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow::datatypes::{Field, Int32Type};
use common_time::{Date, DateTime};
use arrow::array::Int32Array;
use common_time::date::Date;
use common_time::datetime::DateTime;
use super::*;
use crate::value::Value;
use crate::vectors::ConcreteDataType;
#[test]
fn test_try_into_vectors() {
let arrays: Vec<ArrayRef> = vec![
Arc::new(Int32Array::from(vec![1])),
Arc::new(Int32Array::from(vec![2])),
Arc::new(Int32Array::from(vec![3])),
Arc::new(Int32Array::from_vec(vec![1])),
Arc::new(Int32Array::from_vec(vec![2])),
Arc::new(Int32Array::from_vec(vec![3])),
];
let vectors = Helper::try_into_vectors(&arrays);
assert!(vectors.is_ok());
@@ -315,10 +246,10 @@ mod tests {
}
#[test]
fn test_try_into_date_vector() {
pub fn test_try_into_date_vector() {
let vector = DateVector::from(vec![Some(1), Some(2), None]);
let arrow_array = vector.to_arrow_array();
assert_eq!(&ArrowDataType::Date32, arrow_array.data_type());
assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type());
let vector_converted = Helper::try_into_vector(arrow_array).unwrap();
assert_eq!(vector.len(), vector_converted.len());
for i in 0..vector_converted.len() {
@@ -327,7 +258,7 @@ mod tests {
}
#[test]
fn test_try_from_scalar_date_value() {
pub fn test_try_from_scalar_date_value() {
let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap();
assert_eq!(ConcreteDataType::date_datatype(), vector.data_type());
assert_eq!(3, vector.len());
@@ -337,7 +268,7 @@ mod tests {
}
#[test]
fn test_try_from_scalar_datetime_value() {
pub fn test_try_from_scalar_datetime_value() {
let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap();
assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type());
assert_eq!(3, vector.len());
@@ -346,28 +277,6 @@ mod tests {
}
}
#[test]
fn test_try_from_list_value() {
let value = ScalarValue::List(
Some(vec![
ScalarValue::Int32(Some(1)),
ScalarValue::Int32(Some(2)),
]),
Box::new(Field::new("item", ArrowDataType::Int32, true)),
);
let vector = Helper::try_from_scalar_value(value, 3).unwrap();
assert_eq!(
ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()),
vector.data_type()
);
assert_eq!(3, vector.len());
for i in 0..vector.len() {
let v = vector.get(i);
let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap();
assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items);
}
}
#[test]
fn test_like_utf8() {
fn assert_vector(expected: Vec<&str>, actual: &VectorRef) {
@@ -392,40 +301,4 @@ mod tests {
let ret = Helper::like_utf8(names, "%").unwrap();
assert_vector(vec!["greptime", "hello", "public", "world"], &ret);
}
fn check_try_into_vector(array: impl Array + 'static) {
let array: ArrayRef = Arc::new(array);
let vector = Helper::try_into_vector(array.clone()).unwrap();
assert_eq!(&array, &vector.to_arrow_array());
}
#[test]
fn test_try_into_vector() {
check_try_into_vector(NullArray::new(2));
check_try_into_vector(BooleanArray::from(vec![true, false]));
check_try_into_vector(LargeBinaryArray::from(vec![
"hello".as_bytes(),
"world".as_bytes(),
]));
check_try_into_vector(Int8Array::from(vec![1, 2, 3]));
check_try_into_vector(Int16Array::from(vec![1, 2, 3]));
check_try_into_vector(Int32Array::from(vec![1, 2, 3]));
check_try_into_vector(Int64Array::from(vec![1, 2, 3]));
check_try_into_vector(UInt8Array::from(vec![1, 2, 3]));
check_try_into_vector(UInt16Array::from(vec![1, 2, 3]));
check_try_into_vector(UInt32Array::from(vec![1, 2, 3]));
check_try_into_vector(UInt64Array::from(vec![1, 2, 3]));
check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0]));
check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0]));
check_try_into_vector(StringArray::from(vec!["hello", "world"]));
check_try_into_vector(Date32Array::from(vec![1, 2, 3]));
check_try_into_vector(Date64Array::from(vec![1, 2, 3]));
let data = vec![None, Some(vec![Some(6), Some(7)])];
let list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
check_try_into_vector(list_array);
check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3]));
check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3]));
check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3]));
check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3]));
}
}

View File

@@ -13,48 +13,39 @@
// limitations under the License.
use std::any::Any;
use std::ops::Range;
use std::sync::Arc;
use arrow::array::{
Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray,
};
use arrow::buffer::Buffer;
use arrow::array::{Array, ArrayRef, ListArray};
use arrow::bitmap::utils::ZipValidity;
use arrow::bitmap::MutableBitmap;
use arrow::datatypes::DataType as ArrowDataType;
use serde_json::Value as JsonValue;
use snafu::prelude::*;
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::Result;
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::prelude::*;
use crate::serialize::Serializable;
use crate::types::ListType;
use crate::value::{ListValue, ListValueRef, Value, ValueRef};
use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef};
use crate::value::{ListValue, ListValueRef};
use crate::vectors::{impl_try_from_arrow_array_for_vector, impl_validity_for_vector};
type ArrowListArray = ListArray<i32>;
/// Vector of Lists, basically backed by Arrow's `ListArray`.
#[derive(Debug, PartialEq)]
#[derive(Debug, Clone, PartialEq)]
pub struct ListVector {
array: ListArray,
/// The datatype of the items in the list.
item_type: ConcreteDataType,
array: ArrowListArray,
inner_datatype: ConcreteDataType,
}
impl ListVector {
/// Iterate elements as [VectorRef].
pub fn values_iter(&self) -> impl Iterator<Item = Result<Option<VectorRef>>> + '_ {
self.array
.iter()
.map(|value_opt| value_opt.map(Helper::try_into_vector).transpose())
}
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self {
Self {
array: ListArray::from(data),
item_type,
}
/// Only iterate values in the [ListVector].
///
/// Be careful when using this method: it ignores validity and yields an empty
/// vector in place of each null entry.
pub fn values_iter(&self) -> Box<dyn Iterator<Item = Result<VectorRef>> + '_> {
Box::new(self.array.values_iter().map(VectorHelper::try_into_vector))
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
@@ -64,7 +55,7 @@ impl ListVector {
impl Vector for ListVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::List(ListType::new(self.item_type.clone()))
ConcreteDataType::List(ListType::new(self.inner_datatype.clone()))
}
fn vector_type_name(&self) -> String {
@@ -80,25 +71,21 @@ impl Vector for ListVector {
}
fn to_arrow_array(&self) -> ArrayRef {
let data = self.to_array_data();
Arc::new(ListArray::from(data))
Arc::new(self.array.clone())
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let data = self.to_array_data();
Box::new(ListArray::from(data))
Box::new(self.array.clone())
}
fn validity(&self) -> Validity {
vectors::impl_validity_for_vector!(self.array)
impl_validity_for_vector!(self.array)
}
fn memory_size(&self) -> usize {
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
let offsets_bytes = self.array.offsets().len() * std::mem::size_of::<i64>();
let value_refs_bytes = self.array.values().len() * std::mem::size_of::<Arc<dyn Array>>();
offsets_bytes + value_refs_bytes
}
fn is_null(&self, row: usize) -> bool {
@@ -106,8 +93,7 @@ impl Vector for ListVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data_and_type(data, self.item_type.clone()))
Arc::new(ListVector::from(self.array.slice(offset, length)))
}
fn get(&self, index: usize) -> Value {
@@ -116,7 +102,7 @@ impl Vector for ListVector {
}
let array = &self.array.value(index);
let vector = Helper::try_into_vector(array).unwrap_or_else(|_| {
let vector = VectorHelper::try_into_vector(array).unwrap_or_else(|_| {
panic!(
"arrow array with datatype {:?} cannot converted to our vector",
array.data_type()
@@ -127,7 +113,7 @@ impl Vector for ListVector {
.collect::<Vec<Value>>();
Value::List(ListValue::new(
Some(Box::new(values)),
self.item_type.clone(),
self.inner_datatype.clone(),
))
}
@@ -145,7 +131,7 @@ impl Serializable for ListVector {
.iter()
.map(|v| match v {
None => Ok(JsonValue::Null),
Some(v) => Helper::try_into_vector(v)
Some(v) => VectorHelper::try_into_vector(v)
.and_then(|v| v.serialize_to_json())
.map(JsonValue::Array),
})
@@ -153,64 +139,70 @@ impl Serializable for ListVector {
}
}
impl From<ListArray> for ListVector {
fn from(array: ListArray) -> Self {
let item_type = ConcreteDataType::from_arrow_type(match array.data_type() {
ArrowDataType::List(field) => field.data_type(),
other => panic!(
"Try to create ListVector from an arrow array with type {:?}",
other
),
impl From<ArrowListArray> for ListVector {
fn from(array: ArrowListArray) -> Self {
let inner_datatype = ConcreteDataType::from_arrow_type(match array.data_type() {
ArrowDataType::List(field) => &field.data_type,
_ => unreachable!(),
});
Self { array, item_type }
Self {
array,
inner_datatype,
}
}
}
vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector);
impl_try_from_arrow_array_for_vector!(ArrowListArray, ListVector);
pub struct ListIter<'a> {
pub struct ListVectorIter<'a> {
vector: &'a ListVector,
idx: usize,
iter: ZipValidity<'a, usize, Range<usize>>,
}
impl<'a> ListIter<'a> {
fn new(vector: &'a ListVector) -> ListIter {
ListIter { vector, idx: 0 }
impl<'a> ListVectorIter<'a> {
pub fn new(vector: &'a ListVector) -> ListVectorIter<'a> {
let iter = ZipValidity::new(
0..vector.len(),
vector.array.validity().as_ref().map(|x| x.iter()),
);
Self { vector, iter }
}
}
impl<'a> Iterator for ListIter<'a> {
impl<'a> Iterator for ListVectorIter<'a> {
type Item = Option<ListValueRef<'a>>;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
if self.idx >= self.vector.len() {
return None;
}
let idx = self.idx;
self.idx += 1;
if self.vector.is_null(idx) {
return Some(None);
}
Some(Some(ListValueRef::Indexed {
vector: self.vector,
idx,
}))
self.iter.next().map(|idx_opt| {
idx_opt.map(|idx| ListValueRef::Indexed {
vector: self.vector,
idx,
})
})
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.vector.len(), Some(self.vector.len()))
self.iter.size_hint()
}
#[inline]
fn nth(&mut self, n: usize) -> Option<Self::Item> {
self.iter.nth(n).map(|idx_opt| {
idx_opt.map(|idx| ListValueRef::Indexed {
vector: self.vector,
idx,
})
})
}
}
impl ScalarVector for ListVector {
type OwnedItem = ListValue;
type RefItem<'a> = ListValueRef<'a>;
type Iter<'a> = ListIter<'a>;
type Iter<'a> = ListVectorIter<'a>;
type Builder = ListVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -222,68 +214,86 @@ impl ScalarVector for ListVector {
}
fn iter_data(&self) -> Self::Iter<'_> {
ListIter::new(self)
ListVectorIter::new(self)
}
}
// Ports from arrow's GenericListBuilder.
// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs
/// [ListVector] builder.
// Some of this code is ported from arrow2's MutableListArray.
pub struct ListVectorBuilder {
item_type: ConcreteDataType,
offsets_builder: Int32BufferBuilder,
null_buffer_builder: NullBufferBuilder,
values_builder: Box<dyn MutableVector>,
inner_type: ConcreteDataType,
offsets: Vec<i32>,
values: Box<dyn MutableVector>,
validity: Option<MutableBitmap>,
}
impl ListVectorBuilder {
/// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity`
/// is the number of items to pre-allocate space for in this builder.
pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
let mut offsets_builder = Int32BufferBuilder::new(capacity + 1);
offsets_builder.append(0);
// The actual required capacity might be greater than the capacity of the `ListVector`
// if the child vector has more than one element.
let values_builder = item_type.create_mutable_vector(capacity);
pub fn with_type_capacity(inner_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder {
let mut offsets = Vec::with_capacity(capacity + 1);
offsets.push(0);
// The actual required capacity might be greater than the capacity of the `ListVector`
// if any child vector has more than one element.
let values = inner_type.create_mutable_vector(capacity);
ListVectorBuilder {
item_type,
offsets_builder,
null_buffer_builder: NullBufferBuilder::new(capacity),
values_builder,
inner_type,
offsets,
values,
validity: None,
}
}
/// Finish the current variable-length list vector slot.
fn finish_list(&mut self, is_valid: bool) {
self.offsets_builder
.append(i32::try_from(self.values_builder.len()).unwrap());
self.null_buffer_builder.append(is_valid);
/// Returns the most recently recorded offset.
///
/// # Panics
///
/// Panics if `offsets` is empty.
// NOTE(review): `finish` moves `offsets` out with `std::mem::take`, which leaves it
// empty; confirm the builder is never reused after `finish`, otherwise this panics.
#[inline]
fn last_offset(&self) -> i32 {
    self.offsets.last().copied().unwrap()
}
fn push_null(&mut self) {
self.finish_list(false);
self.offsets.push(self.last_offset());
match &mut self.validity {
Some(validity) => validity.push(false),
None => self.init_validity(),
}
}
/// Creates the validity bitmap on demand, marking every slot pushed so far as valid
/// except the last one, which is marked null.
///
/// Expects the offset of the null slot to have been pushed already, so that
/// `offsets.len() - 1` counts that slot too.
fn init_validity(&mut self) {
    let slot_count = self.offsets.len() - 1;
    let mut bitmap = MutableBitmap::with_capacity(self.offsets.capacity());
    // All earlier slots were valid, otherwise the bitmap would already exist.
    bitmap.extend_constant(slot_count, true);
    bitmap.set(slot_count - 1, false);
    self.validity = Some(bitmap);
}
fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> {
if let Some(items) = list_value.items() {
for item in &**items {
self.values_builder.push_value_ref(item.as_value_ref())?;
self.values.push_value_ref(item.as_value_ref())?;
}
}
self.finish_list(true);
self.push_valid();
Ok(())
}
/// Seals the current list slot as a valid (non-null) entry.
///
/// Needs to be called after the items of a valid list value have been appended to
/// `values`: it records the new end offset and, if a validity bitmap has already
/// been materialized, marks the slot as valid.
///
/// # Panics
///
/// Panics if the number of child values no longer fits in an `i32` offset, or if
/// the new offset would be smaller than the previous one.
fn push_valid(&mut self) {
    let end = i32::try_from(self.values.len())
        .expect("number of child values must fit in an i32 list offset");
    assert!(
        end >= self.last_offset(),
        "list offsets must be monotonically non-decreasing"
    );
    self.offsets.push(end);
    // While no null has been pushed the bitmap stays `None`, meaning "all valid".
    if let Some(validity) = &mut self.validity {
        validity.push(true)
    }
}
}
impl MutableVector for ListVectorBuilder {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::list_datatype(self.item_type.clone())
ConcreteDataType::list_datatype(self.inner_type.clone())
}
fn len(&self) -> usize {
self.null_buffer_builder.len()
self.offsets.len() - 1
}
fn as_any(&self) -> &dyn Any {
@@ -338,181 +348,51 @@ impl ScalarVectorBuilder for ListVectorBuilder {
self.push_value_ref(value.into()).unwrap_or_else(|e| {
panic!(
"Failed to push value, expect value type {:?}, err:{}",
self.item_type, e
self.inner_type, e
);
});
}
fn finish(&mut self) -> Self::VectorType {
let len = self.len();
let values_vector = self.values_builder.to_vector();
let values_arr = values_vector.to_arrow_array();
let values_data = values_arr.data();
let offset_buffer = self.offsets_builder.finish();
let null_bit_buffer = self.null_buffer_builder.finish();
// Re-initialize the offsets_builder.
self.offsets_builder.append(0);
let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type();
let array_data_builder = ArrayData::builder(data_type)
.len(len)
.add_buffer(offset_buffer)
.add_child_data(values_data.clone())
.null_bit_buffer(null_bit_buffer);
let array_data = unsafe { array_data_builder.build_unchecked() };
let array = ListArray::from(array_data);
let array = ArrowListArray::try_new(
ConcreteDataType::list_datatype(self.inner_type.clone()).as_arrow_type(),
std::mem::take(&mut self.offsets).into(),
self.values.to_vector().to_arrow_array(),
std::mem::take(&mut self.validity).map(|x| x.into()),
)
.unwrap(); // The `ListVectorBuilder` itself should ensure it always builds a valid array.
ListVector {
array,
item_type: self.item_type.clone(),
}
}
}
// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs
/// Builder for creating the null bit buffer.
/// This builder only materializes the buffer when we append `false`.
/// If you only append `true`s to the builder, what you get will be
/// `None` when calling [`finish`](#method.finish).
/// This optimization is **very** important for the performance.
#[derive(Debug)]
struct NullBufferBuilder {
/// The materialized bitmap; stays `None` until the first null is appended.
bitmap_builder: Option<BooleanBufferBuilder>,
/// Store the length of the buffer before materializing.
len: usize,
/// Number of bits requested at construction; used as the minimum size of the
/// bitmap when it gets materialized.
capacity: usize,
}
impl NullBufferBuilder {
/// Creates a new empty builder.
/// `capacity` is the number of bits in the null buffer.
fn new(capacity: usize) -> Self {
Self {
bitmap_builder: None,
len: 0,
capacity,
}
}
/// Returns the number of bits appended so far.
///
/// Once the bitmap is materialized its own length is authoritative; before that
/// the count is tracked in `self.len`.
fn len(&self) -> usize {
    match &self.bitmap_builder {
        Some(bitmap) => bitmap.len(),
        None => self.len,
    }
}
/// Records one non-null item.
///
/// Appends a `true` bit if the bitmap is already materialized; otherwise only
/// bumps the pending length.
#[inline]
fn append_non_null(&mut self) {
    match self.bitmap_builder.as_mut() {
        Some(bitmap) => bitmap.append(true),
        None => self.len += 1,
    }
}
/// Records one null item by appending a `false` bit, materializing the bitmap
/// first if it does not exist yet.
#[inline]
fn append_null(&mut self) {
    self.materialize_if_needed();
    let bitmap = self.bitmap_builder.as_mut().unwrap();
    bitmap.append(false);
}
/// Appends one validity flag: `true` for a non-null item, `false` for a null one.
#[inline]
fn append(&mut self, not_null: bool) {
    if !not_null {
        self.append_null();
        return;
    }
    self.append_non_null();
}
/// Produces the null buffer and resets the builder to its empty state.
///
/// Returns `None` when no bitmap was materialized, i.e. only `true`s were appended.
fn finish(&mut self) -> Option<Buffer> {
    self.len = 0;
    // `take` leaves `bitmap_builder` as `None`, which is the reset state.
    self.bitmap_builder
        .take()
        .map(|mut bitmap| bitmap.finish())
}
/// Materializes the bitmap unless it already exists.
#[inline]
fn materialize_if_needed(&mut self) {
    if self.bitmap_builder.is_some() {
        return;
    }
    self.materialize()
}
#[cold]
fn materialize(&mut self) {
if self.bitmap_builder.is_none() {
let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity));
b.append_n(self.len, true);
self.bitmap_builder = Some(b);
inner_datatype: self.inner_type.clone(),
}
}
}
#[cfg(test)]
pub mod tests {
use arrow::array::{Int32Array, Int32Builder, ListBuilder};
mod tests {
use arrow::array::{MutableListArray, MutablePrimitiveArray, TryExtend};
use serde_json::json;
use super::*;
use crate::scalars::ScalarRef;
use crate::types::ListType;
use crate::vectors::Int32Vector;
pub fn new_list_vector(data: &[Option<Vec<Option<i32>>>]) -> ListVector {
let mut builder =
ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8);
for vec_opt in data {
if let Some(vec) = vec_opt {
let values = vec.iter().map(|v| Value::from(*v)).collect();
let values = Some(Box::new(values));
let list_value = ListValue::new(values, ConcreteDataType::int32_datatype());
builder.push(Some(ListValueRef::Ref { val: &list_value }));
} else {
builder.push(None);
}
}
builder.finish()
}
/// Builds an arrow `ListArray` of int32 items from `data` using arrow's `ListBuilder`.
///
/// A `None` entry is appended as a null slot; a `Some` entry appends its items to
/// the child builder and then closes the slot as valid.
fn new_list_array(data: &[Option<Vec<Option<i32>>>]) -> ListArray {
    let mut list_builder = ListBuilder::new(Int32Builder::new());
    for row in data {
        if let Some(items) = row {
            let child = list_builder.values();
            for item in items {
                child.append_option(*item);
            }
        }
        // The slot is valid exactly when the row is present.
        list_builder.append(row.is_some());
    }
    list_builder.finish()
}
#[test]
fn test_list_vector() {
let data = vec![
Some(vec![Some(1), Some(2), Some(3)]),
Some(vec![Some(1i32), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let list_vector = new_list_vector(&data);
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let list_vector = ListVector {
array: arrow_array.clone(),
inner_datatype: ConcreteDataType::int32_datatype(),
};
assert_eq!(
ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())),
list_vector.data_type()
@@ -523,34 +403,30 @@ pub mod tests {
assert!(list_vector.is_null(1));
assert!(!list_vector.is_null(2));
let arrow_array = new_list_array(&data);
assert_eq!(
arrow_array,
*list_vector
list_vector
.to_arrow_array()
.as_any()
.downcast_ref::<ListArray>()
.downcast_ref::<ArrowListArray>()
.unwrap()
.clone()
);
let validity = list_vector.validity();
assert!(!validity.is_all_null());
assert!(!validity.is_all_valid());
assert!(validity.is_set(0));
assert!(!validity.is_set(1));
assert!(validity.is_set(2));
assert_eq!(256, list_vector.memory_size());
let slice = list_vector.slice(0, 2).to_arrow_array();
let sliced_array = slice.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(
Int32Array::from_iter_values([1, 2, 3]),
*sliced_array
.value(0)
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
Validity::Slots(arrow_array.validity().unwrap()),
list_vector.validity()
);
assert_eq!(
arrow_array.offsets().len() * std::mem::size_of::<i64>()
+ arrow_array.values().len() * std::mem::size_of::<Arc<dyn Array>>(),
list_vector.memory_size()
);
let slice = list_vector.slice(0, 2);
assert_eq!(
"ListArray[[1, 2, 3], None]",
format!("{:?}", slice.to_arrow_array())
);
assert!(sliced_array.is_null(1));
assert_eq!(
Value::List(ListValue::new(
@@ -591,48 +467,52 @@ pub mod tests {
#[test]
fn test_from_arrow_array() {
let data = vec![
Some(vec![Some(1), Some(2), Some(3)]),
Some(vec![Some(1u32), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let arrow_array = new_list_array(&data);
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<u32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let array_ref: ArrayRef = Arc::new(arrow_array);
let expect = new_list_vector(&data);
// Test try from ArrayRef
let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap();
assert_eq!(expect, list_vector);
// Test from
let arrow_array = new_list_array(&data);
let list_vector = ListVector::from(arrow_array);
assert_eq!(expect, list_vector);
assert_eq!(
"ListVector { array: ListArray[[1, 2, 3], None, [4, None, 6]], inner_datatype: UInt32(UInt32) }",
format!("{:?}", list_vector)
);
}
#[test]
fn test_iter_list_vector_values() {
let data = vec![
Some(vec![Some(1), Some(2), Some(3)]),
Some(vec![Some(1i64), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let list_vector = new_list_vector(&data);
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let list_vector = ListVector::from(arrow_array);
assert_eq!(
ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())),
ConcreteDataType::List(ListType::new(ConcreteDataType::int64_datatype())),
list_vector.data_type()
);
let mut iter = list_vector.values_iter();
assert_eq!(
Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef,
*iter.next().unwrap().unwrap().unwrap()
"Int64[1, 2, 3]",
format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array())
);
assert!(iter.next().unwrap().unwrap().is_none());
assert_eq!(
Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef,
*iter.next().unwrap().unwrap().unwrap(),
"Int64[]",
format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array())
);
assert_eq!(
"Int64[4, None, 6]",
format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array())
);
assert!(iter.next().is_none())
}
@@ -640,18 +520,30 @@ pub mod tests {
#[test]
fn test_serialize_to_json() {
let data = vec![
Some(vec![Some(1), Some(2), Some(3)]),
Some(vec![Some(1i64), Some(2), Some(3)]),
None,
Some(vec![Some(4), None, Some(6)]),
];
let list_vector = new_list_vector(&data);
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
let list_vector = ListVector::from(arrow_array);
assert_eq!(
vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),],
list_vector.serialize_to_json().unwrap()
);
}
fn new_list_vector(data: Vec<Option<Vec<Option<i32>>>>) -> ListVector {
let mut arrow_array = MutableListArray::<i32, MutablePrimitiveArray<i32>>::new();
arrow_array.try_extend(data).unwrap();
let arrow_array: ArrowListArray = arrow_array.into();
ListVector::from(arrow_array)
}
#[test]
fn test_list_vector_builder() {
let mut builder =
@@ -675,14 +567,14 @@ pub mod tests {
None,
Some(vec![Some(7), Some(8), None]),
];
let input = new_list_vector(&data);
let input = new_list_vector(data);
builder.extend_slice_of(&input, 1, 2).unwrap();
assert!(builder
.extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1)
.is_err());
let vector = builder.to_vector();
let expect: VectorRef = Arc::new(new_list_vector(&[
let expect: VectorRef = Arc::new(new_list_vector(vec![
Some(vec![Some(4), None, Some(6)]),
None,
Some(vec![Some(7), Some(8), None]),
@@ -707,7 +599,7 @@ pub mod tests {
}));
let vector = builder.finish();
let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]);
let expect = new_list_vector(vec![None, Some(vec![Some(4), None, Some(6)])]);
assert_eq!(expect, vector);
assert!(vector.get_data(0).is_none());

View File

@@ -16,7 +16,8 @@ use std::any::Any;
use std::fmt;
use std::sync::Arc;
use arrow::array::{Array, ArrayData, ArrayRef, NullArray};
use arrow::array::{Array, ArrayRef, NullArray};
use arrow::datatypes::DataType as ArrowDataType;
use snafu::{ensure, OptionExt};
use crate::data_type::ConcreteDataType;
@@ -26,28 +27,21 @@ use crate::types::NullType;
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
/// A vector where all elements are nulls.
#[derive(PartialEq)]
pub struct NullVector {
array: NullArray,
}
// TODO(yingwen): Support null vector with other logical types.
impl NullVector {
/// Create a new `NullVector` with `n` elements.
pub fn new(n: usize) -> Self {
Self {
array: NullArray::new(n),
array: NullArray::new(ArrowDataType::Null, n),
}
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
}
impl From<NullArray> for NullVector {
@@ -74,28 +68,21 @@ impl Vector for NullVector {
}
fn to_arrow_array(&self) -> ArrayRef {
// TODO(yingwen): Replaced by clone after upgrading to arrow 28.0.
let data = self.to_array_data();
Arc::new(NullArray::from(data))
Arc::new(self.array.clone())
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let data = self.to_array_data();
Box::new(NullArray::from(data))
Box::new(self.array.clone())
}
fn validity(&self) -> Validity {
Validity::all_null(self.array.len())
Validity::AllNull
}
fn memory_size(&self) -> usize {
0
}
fn null_count(&self) -> usize {
self.array.null_count()
}
fn is_null(&self, _row: usize) -> bool {
true
}
@@ -230,7 +217,7 @@ mod tests {
assert_eq!("NullVector", v.vector_type_name());
assert!(!v.is_const());
assert!(v.validity().is_all_null());
assert_eq!(Validity::AllNull, v.validity());
assert!(v.only_null());
for i in 0..32 {
@@ -259,7 +246,7 @@ mod tests {
#[test]
fn test_null_vector_validity() {
let vector = NullVector::new(5);
assert!(vector.validity().is_all_null());
assert_eq!(Validity::AllNull, vector.validity());
assert_eq!(5, vector.null_count());
}

View File

@@ -19,11 +19,10 @@ mod replicate;
use common_base::BitVec;
use crate::error::Result;
use crate::types::LogicalPrimitiveType;
use crate::vectors::constant::ConstantVector;
use crate::types::PrimitiveElement;
use crate::vectors::{
BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector,
VectorRef,
BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector,
NullVector, PrimitiveVector, StringVector, TimestampVector, Vector, VectorRef,
};
/// Vector compute operations.
@@ -60,10 +59,10 @@ pub trait VectorOp {
}
macro_rules! impl_scalar_vector_op {
($($VectorType: ident),+) => {$(
($( { $VectorType: ident, $replicate: ident } ),+) => {$(
impl VectorOp for $VectorType {
fn replicate(&self, offsets: &[usize]) -> VectorRef {
replicate::replicate_scalar(self, offsets)
replicate::$replicate(self, offsets)
}
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
@@ -78,21 +77,28 @@ macro_rules! impl_scalar_vector_op {
)+};
}
impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector);
impl_scalar_vector_op!(
{ BinaryVector, replicate_scalar },
{ BooleanVector, replicate_scalar },
{ ListVector, replicate_scalar },
{ StringVector, replicate_scalar },
{ DateVector, replicate_date },
{ DateTimeVector, replicate_datetime },
{ TimestampVector, replicate_timestamp }
);
impl<T: LogicalPrimitiveType> VectorOp for PrimitiveVector<T> {
impl VectorOp for ConstantVector {
fn replicate(&self, offsets: &[usize]) -> VectorRef {
std::sync::Arc::new(replicate::replicate_primitive(self, offsets))
replicate::replicate_constant(self, offsets)
}
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
find_unique::find_unique_scalar(self, selected, prev_vector);
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
find_unique::find_unique_constant(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
filter::filter_non_constant!(self, PrimitiveVector<T>, filter)
filter::filter_constant(self, filter)
}
}
@@ -111,17 +117,21 @@ impl VectorOp for NullVector {
}
}
impl VectorOp for ConstantVector {
impl<T> VectorOp for PrimitiveVector<T>
where
T: PrimitiveElement,
{
fn replicate(&self, offsets: &[usize]) -> VectorRef {
self.replicate_vector(offsets)
replicate::replicate_primitive(self, offsets)
}
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
find_unique::find_unique_constant(self, selected, prev_vector);
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
find_unique::find_unique_scalar(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
self.filter_vector(filter)
filter::filter_non_constant!(self, PrimitiveVector<T>, filter)
}
}

View File

@@ -12,15 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub(crate) use crate::vectors::constant::filter_constant;
macro_rules! filter_non_constant {
($vector: expr, $VectorType: ty, $filter: ident) => {{
use std::sync::Arc;
use arrow::compute;
use snafu::ResultExt;
let arrow_array = $vector.as_arrow();
let filtered = compute::filter(arrow_array, $filter.as_boolean_array())
let filtered = arrow::compute::filter::filter(arrow_array, $filter.as_boolean_array())
.context(crate::error::ArrowComputeSnafu)?;
Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?))
}};
@@ -32,16 +33,9 @@ pub(crate) use filter_non_constant;
mod tests {
use std::sync::Arc;
use common_time::{Date, DateTime};
use crate::scalars::ScalarVector;
use crate::timestamp::{
TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
};
use crate::types::WrapperType;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{
BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef,
BooleanVector, ConstantVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef,
};
fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) {
@@ -111,6 +105,7 @@ mod tests {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use std::sync::Arc;
use common_time::$ValueType;
use $crate::vectors::{$VectorType, VectorRef};
let v = $VectorType::from_iterator((0..5).map($ValueType::$method));
@@ -128,18 +123,6 @@ mod tests {
fn test_filter_date_like() {
impl_filter_date_like_test!(DateVector, Date, new);
impl_filter_date_like_test!(DateTimeVector, DateTime, new);
impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native);
impl_filter_date_like_test!(
TimestampMillisecondVector,
TimestampMillisecond,
from_native
);
impl_filter_date_like_test!(
TimestampMicrosecondVector,
TimestampMicrosecond,
from_native
);
impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native);
impl_filter_date_like_test!(TimestampVector, Timestamp, from_millis);
}
}

View File

@@ -15,8 +15,7 @@
use common_base::BitVec;
use crate::scalars::ScalarVector;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{NullVector, Vector};
use crate::vectors::{ConstantVector, NullVector, Vector};
// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as
// selected when it is different from the previous one, and leaves the `selected` unchanged
@@ -71,7 +70,7 @@ pub(crate) fn find_unique_null(
return;
}
let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true);
let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true);
if is_first_not_duplicate {
selected.set(0, true);
}
@@ -105,11 +104,8 @@ pub(crate) fn find_unique_constant(
mod tests {
use std::sync::Arc;
use common_time::{Date, DateTime};
use super::*;
use crate::timestamp::*;
use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp};
use crate::vectors::{Int32Vector, StringVector, VectorOp};
fn check_bitmap(expect: &[bool], selected: &BitVec) {
let actual = selected.iter().collect::<Vec<_>>();
@@ -125,7 +121,7 @@ mod tests {
input: impl Iterator<Item = Option<i32>>,
prev: Option<&[i32]>,
) {
let input = Int32Vector::from(input.collect::<Vec<_>>());
let input = Int32Vector::from_iter(input);
let prev = prev.map(Int32Vector::from_slice);
let mut selected = BitVec::repeat(false, input.len());
@@ -345,6 +341,7 @@ mod tests {
macro_rules! impl_find_unique_date_like_test {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use common_time::$ValueType;
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
@@ -359,9 +356,6 @@ mod tests {
fn test_find_unique_date_like() {
impl_find_unique_date_like_test!(DateVector, Date, new);
impl_find_unique_date_like_test!(DateTimeVector, DateTime, new);
impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from);
impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from);
impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from);
impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from);
impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis);
}
}

View File

@@ -13,8 +13,12 @@
// limitations under the License.
use crate::prelude::*;
pub(crate) use crate::vectors::constant::replicate_constant;
pub(crate) use crate::vectors::date::replicate_date;
pub(crate) use crate::vectors::datetime::replicate_datetime;
pub(crate) use crate::vectors::null::replicate_null;
pub(crate) use crate::vectors::primitive::replicate_primitive;
pub(crate) use crate::vectors::timestamp::replicate_timestamp;
pub(crate) fn replicate_scalar<C: ScalarVector>(c: &C, offsets: &[usize]) -> VectorRef {
assert_eq!(offsets.len(), c.len());
@@ -39,13 +43,8 @@ pub(crate) fn replicate_scalar<C: ScalarVector>(c: &C, offsets: &[usize]) -> Vec
mod tests {
use std::sync::Arc;
use common_time::timestamp::TimeUnit;
use common_time::{Date, DateTime, Timestamp};
use paste::paste;
use super::*;
use crate::vectors::constant::ConstantVector;
use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp};
use crate::vectors::{ConstantVector, Int32Vector, NullVector, StringVector, VectorOp};
#[test]
fn test_replicate_primitive() {
@@ -121,6 +120,7 @@ mod tests {
macro_rules! impl_replicate_date_like_test {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use common_time::$ValueType;
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator((0..5).map($ValueType::$method));
@@ -138,33 +138,10 @@ mod tests {
}};
}
macro_rules! impl_replicate_timestamp_test {
($unit: ident) => {{
paste!{
use $crate::vectors::[<Timestamp $unit Vector>];
use $crate::timestamp::[<Timestamp $unit>];
let v = [<Timestamp $unit Vector>]::from_iterator((0..5).map([<Timestamp $unit>]::from));
let offsets = [0, 1, 2, 3, 4];
let v = v.replicate(&offsets);
assert_eq!(4, v.len());
for i in 0..4 {
assert_eq!(
Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)),
v.get(i)
);
}
}
}};
}
#[test]
fn test_replicate_date_like() {
impl_replicate_date_like_test!(DateVector, Date, new);
impl_replicate_date_like_test!(DateTimeVector, DateTime, new);
impl_replicate_timestamp_test!(Second);
impl_replicate_timestamp_test!(Millisecond);
impl_replicate_timestamp_test!(Microsecond);
impl_replicate_timestamp_test!(Nanosecond);
impl_replicate_date_like_test!(TimestampVector, Timestamp, from_millis);
}
}

View File

@@ -13,111 +13,75 @@
// limitations under the License.
use std::any::Any;
use std::fmt;
use std::iter::FromIterator;
use std::slice::Iter;
use std::sync::Arc;
use arrow::array::{
Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder,
};
use arrow::array::{Array, ArrayRef, MutableArray, MutablePrimitiveArray, PrimitiveArray};
use arrow::bitmap::utils::ZipValidity;
use serde_json::Value as JsonValue;
use snafu::OptionExt;
use snafu::{OptionExt, ResultExt};
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{ConversionSnafu, Result, SerializeSnafu};
use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::{
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType,
};
use crate::types::{Primitive, PrimitiveElement};
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
pub type UInt8Vector = PrimitiveVector<UInt8Type>;
pub type UInt16Vector = PrimitiveVector<UInt16Type>;
pub type UInt32Vector = PrimitiveVector<UInt32Type>;
pub type UInt64Vector = PrimitiveVector<UInt64Type>;
pub type Int8Vector = PrimitiveVector<Int8Type>;
pub type Int16Vector = PrimitiveVector<Int16Type>;
pub type Int32Vector = PrimitiveVector<Int32Type>;
pub type Int64Vector = PrimitiveVector<Int64Type>;
pub type Float32Vector = PrimitiveVector<Float32Type>;
pub type Float64Vector = PrimitiveVector<Float64Type>;
/// Vector for primitive data types.
pub struct PrimitiveVector<T: LogicalPrimitiveType> {
array: PrimitiveArray<T::ArrowPrimitive>,
#[derive(Debug, Clone, PartialEq)]
pub struct PrimitiveVector<T: Primitive> {
pub(crate) array: PrimitiveArray<T>,
}
impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
pub fn new(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
impl<T: Primitive> PrimitiveVector<T> {
pub fn new(array: PrimitiveArray<T>) -> Self {
Self { array }
}
pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
let data = array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<T::ArrowPrimitive>>()
.with_context(|| error::ConversionSnafu {
from: format!("{:?}", array.as_ref().data_type()),
})?
.data()
.clone();
let concrete_array = PrimitiveArray::<T::ArrowPrimitive>::from(data);
Ok(Self::new(concrete_array))
Ok(Self::new(
array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<T>>()
.with_context(|| ConversionSnafu {
from: format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
}
pub fn from_slice<P: AsRef<[T::Native]>>(slice: P) -> Self {
let iter = slice.as_ref().iter().copied();
pub fn from_slice<P: AsRef<[T]>>(slice: P) -> Self {
Self {
array: PrimitiveArray::from_iter_values(iter),
array: PrimitiveArray::from_slice(slice),
}
}
pub fn from_wrapper_slice<P: AsRef<[T::Wrapper]>>(slice: P) -> Self {
let iter = slice.as_ref().iter().copied().map(WrapperType::into_native);
pub fn from_vec(array: Vec<T>) -> Self {
Self {
array: PrimitiveArray::from_iter_values(iter),
array: PrimitiveArray::from_vec(array),
}
}
pub fn from_vec(array: Vec<T::Native>) -> Self {
pub fn from_values<I: IntoIterator<Item = T>>(iter: I) -> Self {
Self {
array: PrimitiveArray::from_iter_values(array),
array: PrimitiveArray::from_values(iter),
}
}
pub fn from_values<I: IntoIterator<Item = T::Native>>(iter: I) -> Self {
Self {
array: PrimitiveArray::from_iter_values(iter),
}
}
pub(crate) fn as_arrow(&self) -> &PrimitiveArray<T::ArrowPrimitive> {
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
fn from_array_data(data: ArrayData) -> Self {
Self {
array: PrimitiveArray::from(data),
}
}
// To distinguish with `Vector::slice()`.
fn get_slice(&self, offset: usize, length: usize) -> Self {
let data = self.array.data().slice(offset, length);
Self::from_array_data(data)
fn slice(&self, offset: usize, length: usize) -> Self {
Self::from(self.array.slice(offset, length))
}
}
impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
impl<T: PrimitiveElement> Vector for PrimitiveVector<T> {
fn data_type(&self) -> ConcreteDataType {
T::build_data_type()
}
@@ -135,13 +99,11 @@ impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
}
fn to_arrow_array(&self) -> ArrayRef {
let data = self.to_array_data();
Arc::new(PrimitiveArray::<T::ArrowPrimitive>::from(data))
Arc::new(self.array.clone())
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let data = self.to_array_data();
Box::new(PrimitiveArray::<T::ArrowPrimitive>::from(data))
Box::new(self.array.clone())
}
fn validity(&self) -> Validity {
@@ -149,11 +111,7 @@ impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
}
fn memory_size(&self) -> usize {
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
self.array.values().len() * std::mem::size_of::<T>()
}
fn is_null(&self, row: usize) -> bool {
@@ -161,80 +119,57 @@ impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
Arc::new(self.slice(offset, length))
}
fn get(&self, index: usize) -> Value {
if self.array.is_valid(index) {
// Safety: The index have been checked by `is_valid()`.
let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) };
wrapper.into()
} else {
Value::Null
}
vectors::impl_get_for_vector!(self.array, index)
}
fn get_ref(&self, index: usize) -> ValueRef {
if self.array.is_valid(index) {
// Safety: The index have been checked by `is_valid()`.
let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) };
wrapper.into()
unsafe { self.array.value_unchecked(index).into_value_ref() }
} else {
ValueRef::Null
}
}
}
impl<T: LogicalPrimitiveType> fmt::Debug for PrimitiveVector<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("PrimitiveVector")
.field("array", &self.array)
.finish()
}
}
impl<T: LogicalPrimitiveType> From<PrimitiveArray<T::ArrowPrimitive>> for PrimitiveVector<T> {
fn from(array: PrimitiveArray<T::ArrowPrimitive>) -> Self {
/// Wraps an owned arrow `PrimitiveArray<T>` as a `PrimitiveVector<T>` by
/// moving it into the `array` field.
impl<T: Primitive> From<PrimitiveArray<T>> for PrimitiveVector<T> {
fn from(array: PrimitiveArray<T>) -> Self {
Self { array }
}
}
impl<T: LogicalPrimitiveType> From<Vec<Option<T::Native>>> for PrimitiveVector<T> {
fn from(v: Vec<Option<T::Native>>) -> Self {
impl<T: Primitive> From<Vec<Option<T>>> for PrimitiveVector<T> {
fn from(v: Vec<Option<T>>) -> Self {
Self {
array: PrimitiveArray::from_iter(v),
array: PrimitiveArray::<T>::from(v),
}
}
}
pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> {
iter: ArrayIter<&'a PrimitiveArray<T::ArrowPrimitive>>,
}
impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> {
type Item = Option<T::Wrapper>;
fn next(&mut self) -> Option<Option<T::Wrapper>> {
self.iter
.next()
.map(|item| item.map(T::Wrapper::from_native))
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
/// Collects an iterator of items that borrow as `Option<T>` into a
/// `PrimitiveVector<T>`.
impl<T: Primitive, Ptr: std::borrow::Borrow<Option<T>>> FromIterator<Ptr> for PrimitiveVector<T> {
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
Self {
// Build through the mutable array first, then convert it into the array stored in the vector.
array: MutablePrimitiveArray::<T>::from_iter(iter).into(),
}
}
}
impl<T: LogicalPrimitiveType> ScalarVector for PrimitiveVector<T> {
type OwnedItem = T::Wrapper;
type RefItem<'a> = T::Wrapper;
impl<T> ScalarVector for PrimitiveVector<T>
where
T: PrimitiveElement,
{
type OwnedItem = T;
type RefItem<'a> = T;
type Iter<'a> = PrimitiveIter<'a, T>;
type Builder = PrimitiveVectorBuilder<T>;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
if self.array.is_valid(idx) {
Some(T::Wrapper::from_native(self.array.value(idx)))
Some(self.array.value(idx))
} else {
None
}
@@ -247,47 +182,59 @@ impl<T: LogicalPrimitiveType> ScalarVector for PrimitiveVector<T> {
}
}
impl<T: LogicalPrimitiveType> Serializable for PrimitiveVector<T> {
pub type UInt8Vector = PrimitiveVector<u8>;
pub type UInt16Vector = PrimitiveVector<u16>;
pub type UInt32Vector = PrimitiveVector<u32>;
pub type UInt64Vector = PrimitiveVector<u64>;
pub type Int8Vector = PrimitiveVector<i8>;
pub type Int16Vector = PrimitiveVector<i16>;
pub type Int32Vector = PrimitiveVector<i32>;
pub type Int64Vector = PrimitiveVector<i64>;
pub type Float32Vector = PrimitiveVector<f32>;
pub type Float64Vector = PrimitiveVector<f64>;
/// Iterator type used as `ScalarVector::Iter` for `PrimitiveVector`.
///
/// It wraps the arrow `ZipValidity` iterator over the value slice and copies
/// each `Option<&T>` it produces into an owned `Option<T>`.
pub struct PrimitiveIter<'a, T> {
    iter: ZipValidity<'a, &'a T, Iter<'a, T>>,
}

impl<'a, T: Copy> Iterator for PrimitiveIter<'a, T> {
    type Item = Option<T>;

    fn next(&mut self) -> Option<Option<T>> {
        self.iter.next().map(|v| v.copied())
    }

    /// Forwards the inner iterator's hint. `next()` maps items one-to-one, so
    /// the bounds are unchanged and `collect`/`extend` can pre-allocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
impl<T: PrimitiveElement> Serializable for PrimitiveVector<T> {
fn serialize_to_json(&self) -> Result<Vec<JsonValue>> {
let res = self
.iter_data()
.map(|v| match v {
None => serde_json::Value::Null,
// use WrapperType's Into<serde_json::Value> bound instead of
// serde_json::to_value to facilitate customized serialization
// for WrapperType
Some(v) => v.into(),
})
.collect::<Vec<_>>();
Ok(res)
self.array
.iter()
.map(serde_json::to_value)
.collect::<serde_json::Result<_>>()
.context(SerializeSnafu)
}
}
impl<T: LogicalPrimitiveType> PartialEq for PrimitiveVector<T> {
fn eq(&self, other: &PrimitiveVector<T>) -> bool {
self.array == other.array
}
/// Builder that accumulates values in a `MutablePrimitiveArray<T>`; its
/// `finish()` converts the accumulated array into a `PrimitiveVector<T>`.
pub struct PrimitiveVectorBuilder<T: PrimitiveElement> {
// Crate-visible so helpers in this crate (e.g. the replicate functions) can extend it directly.
pub(crate) mutable_array: MutablePrimitiveArray<T>,
}
pub type UInt8VectorBuilder = PrimitiveVectorBuilder<UInt8Type>;
pub type UInt16VectorBuilder = PrimitiveVectorBuilder<UInt16Type>;
pub type UInt32VectorBuilder = PrimitiveVectorBuilder<UInt32Type>;
pub type UInt64VectorBuilder = PrimitiveVectorBuilder<UInt64Type>;
pub type UInt8VectorBuilder = PrimitiveVectorBuilder<u8>;
pub type UInt16VectorBuilder = PrimitiveVectorBuilder<u16>;
pub type UInt32VectorBuilder = PrimitiveVectorBuilder<u32>;
pub type UInt64VectorBuilder = PrimitiveVectorBuilder<u64>;
pub type Int8VectorBuilder = PrimitiveVectorBuilder<Int8Type>;
pub type Int16VectorBuilder = PrimitiveVectorBuilder<Int16Type>;
pub type Int32VectorBuilder = PrimitiveVectorBuilder<Int32Type>;
pub type Int64VectorBuilder = PrimitiveVectorBuilder<Int64Type>;
pub type Int8VectorBuilder = PrimitiveVectorBuilder<i8>;
pub type Int16VectorBuilder = PrimitiveVectorBuilder<i16>;
pub type Int32VectorBuilder = PrimitiveVectorBuilder<i32>;
pub type Int64VectorBuilder = PrimitiveVectorBuilder<i64>;
pub type Float32VectorBuilder = PrimitiveVectorBuilder<Float32Type>;
pub type Float64VectorBuilder = PrimitiveVectorBuilder<Float64Type>;
pub type Float32VectorBuilder = PrimitiveVectorBuilder<f32>;
pub type Float64VectorBuilder = PrimitiveVectorBuilder<f64>;
/// Builder to build a primitive vector.
pub struct PrimitiveVectorBuilder<T: LogicalPrimitiveType> {
mutable_array: PrimitiveBuilder<T::ArrowPrimitive>,
}
impl<T: LogicalPrimitiveType> MutableVector for PrimitiveVectorBuilder<T> {
impl<T: PrimitiveElement> MutableVector for PrimitiveVectorBuilder<T> {
fn data_type(&self) -> ConcreteDataType {
T::build_data_type()
}
@@ -310,62 +257,81 @@ impl<T: LogicalPrimitiveType> MutableVector for PrimitiveVectorBuilder<T> {
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
let primitive = T::cast_value_ref(value)?;
match primitive {
Some(v) => self.mutable_array.append_value(v.into_native()),
None => self.mutable_array.append_null(),
}
self.mutable_array.push(primitive);
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
let primitive = T::cast_vector(vector)?;
// Slice the underlying array to avoid creating a new Arc.
let slice = primitive.get_slice(offset, length);
for v in slice.iter_data() {
self.push(v);
}
let slice = primitive.slice(offset, length);
self.mutable_array.extend_trusted_len(slice.iter());
Ok(())
}
}
impl<T> ScalarVectorBuilder for PrimitiveVectorBuilder<T>
where
T: LogicalPrimitiveType,
T::Wrapper: Scalar<VectorType = PrimitiveVector<T>>,
for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>,
for<'a> T::Wrapper: Scalar<RefType<'a> = T::Wrapper>,
T: Scalar<VectorType = PrimitiveVector<T>> + PrimitiveElement,
for<'a> T: ScalarRef<'a, ScalarType = T, VectorType = PrimitiveVector<T>>,
for<'a> T: Scalar<RefType<'a> = T>,
{
type VectorType = PrimitiveVector<T>;
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: PrimitiveBuilder::with_capacity(capacity),
mutable_array: MutablePrimitiveArray::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.mutable_array
.append_option(value.map(|v| v.into_native()));
self.mutable_array.push(value);
}
fn finish(&mut self) -> Self::VectorType {
PrimitiveVector {
array: self.mutable_array.finish(),
array: std::mem::take(&mut self.mutable_array).into(),
}
}
}
pub(crate) fn replicate_primitive<T: LogicalPrimitiveType>(
impl<T: PrimitiveElement> PrimitiveVectorBuilder<T> {
/// Creates a builder with the given `capacity` whose underlying mutable
/// array is constructed with the arrow type of `data_type`
/// (`data_type.as_arrow_type()`), instead of the default arrow type of `T`.
fn with_type_capacity(data_type: ConcreteDataType, capacity: usize) -> Self {
Self {
mutable_array: MutablePrimitiveArray::with_capacity_from(
capacity,
data_type.as_arrow_type(),
),
}
}
}
/// Replicates `vector` according to `offsets` and returns the result as a
/// `VectorRef`.
///
/// This is a thin wrapper over `replicate_primitive_with_type` that passes
/// `T::build_data_type()` as the data type of the produced vector.
pub(crate) fn replicate_primitive<T: PrimitiveElement>(
vector: &PrimitiveVector<T>,
offsets: &[usize],
) -> VectorRef {
Arc::new(replicate_primitive_with_type(
vector,
offsets,
T::build_data_type(),
))
}
pub(crate) fn replicate_primitive_with_type<T: PrimitiveElement>(
vector: &PrimitiveVector<T>,
offsets: &[usize],
data_type: ConcreteDataType,
) -> PrimitiveVector<T> {
assert_eq!(offsets.len(), vector.len());
if offsets.is_empty() {
return vector.get_slice(0, 0);
return vector.slice(0, 0);
}
let mut builder = PrimitiveVectorBuilder::<T>::with_capacity(*offsets.last().unwrap() as usize);
let mut builder = PrimitiveVectorBuilder::<T>::with_type_capacity(
data_type,
*offsets.last().unwrap() as usize,
);
let mut previous_offset = 0;
@@ -373,15 +339,14 @@ pub(crate) fn replicate_primitive<T: LogicalPrimitiveType>(
let repeat_times = *offset - previous_offset;
match value {
Some(data) => {
unsafe {
// Safety: std::iter::Repeat and std::iter::Take implement TrustedLen.
builder
.mutable_array
.append_trusted_len_iter(std::iter::repeat(data).take(repeat_times));
}
builder.mutable_array.extend_trusted_len(
std::iter::repeat(*data)
.take(repeat_times)
.map(Option::Some),
);
}
None => {
builder.mutable_array.append_nulls(repeat_times);
builder.mutable_array.extend_constant(repeat_times, None);
}
}
previous_offset = *offset;
@@ -391,7 +356,6 @@ pub(crate) fn replicate_primitive<T: LogicalPrimitiveType>(
#[cfg(test)]
mod tests {
use arrow::array::Int32Array;
use arrow::datatypes::DataType as ArrowDataType;
use serde_json;
@@ -400,11 +364,11 @@ mod tests {
use crate::serialize::Serializable;
use crate::types::Int64Type;
fn check_vec(v: Int32Vector) {
fn check_vec(v: PrimitiveVector<i32>) {
assert_eq!(4, v.len());
assert_eq!("Int32Vector", v.vector_type_name());
assert!(!v.is_const());
assert!(v.validity().is_all_valid());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
for i in 0..4 {
@@ -423,26 +387,26 @@ mod tests {
#[test]
fn test_from_values() {
let v = Int32Vector::from_values(vec![1, 2, 3, 4]);
let v = PrimitiveVector::<i32>::from_values(vec![1, 2, 3, 4]);
check_vec(v);
}
#[test]
fn test_from_vec() {
let v = Int32Vector::from_vec(vec![1, 2, 3, 4]);
let v = PrimitiveVector::<i32>::from_vec(vec![1, 2, 3, 4]);
check_vec(v);
}
#[test]
fn test_from_slice() {
let v = Int32Vector::from_slice(vec![1, 2, 3, 4]);
let v = PrimitiveVector::<i32>::from_slice(vec![1, 2, 3, 4]);
check_vec(v);
}
#[test]
fn test_serialize_primitive_vector_with_null_to_json() {
let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
let mut builder = Int32VectorBuilder::with_capacity(input.len());
let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
for v in input {
builder.push(v);
}
@@ -457,15 +421,15 @@ mod tests {
#[test]
fn test_from_arrow_array() {
let arrow_array = Int32Array::from(vec![1, 2, 3, 4]);
let v = Int32Vector::from(arrow_array);
let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]);
let v = PrimitiveVector::from(arrow_array);
check_vec(v);
}
#[test]
fn test_primitive_vector_build_get() {
let input = [Some(1i32), Some(2i32), None, Some(4i32), None];
let mut builder = Int32VectorBuilder::with_capacity(input.len());
let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
for v in input {
builder.push(v);
}
@@ -484,28 +448,29 @@ mod tests {
#[test]
fn test_primitive_vector_validity() {
let input = [Some(1i32), Some(2i32), None, None];
let mut builder = Int32VectorBuilder::with_capacity(input.len());
let mut builder = PrimitiveVectorBuilder::with_capacity(input.len());
for v in input {
builder.push(v);
}
let vector = builder.finish();
assert_eq!(2, vector.null_count());
let validity = vector.validity();
assert_eq!(2, validity.null_count());
assert!(!validity.is_set(2));
assert!(!validity.is_set(3));
let slots = validity.slots().unwrap();
assert_eq!(2, slots.null_count());
assert!(!slots.get_bit(2));
assert!(!slots.get_bit(3));
let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]);
let vector = PrimitiveVector::<i32>::from_slice(vec![1, 2, 3, 4]);
assert_eq!(0, vector.null_count());
assert!(vector.validity().is_all_valid());
assert_eq!(Validity::AllValid, vector.validity());
}
#[test]
fn test_memory_size() {
let v = Int32Vector::from_slice((0..5).collect::<Vec<i32>>());
assert_eq!(64, v.memory_size());
let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
assert_eq!(128, v.memory_size());
let v = PrimitiveVector::<i32>::from_slice((0..5).collect::<Vec<i32>>());
assert_eq!(20, v.memory_size());
let v = PrimitiveVector::<i64>::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
assert_eq!(40, v.memory_size());
}
#[test]
@@ -524,29 +489,4 @@ mod tests {
let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9]));
assert_eq!(expect, vector);
}
#[test]
fn test_from_wrapper_slice() {
macro_rules! test_from_wrapper_slice {
($vec: ident, $ty: ident) => {
let from_wrapper_slice = $vec::from_wrapper_slice(&[
$ty::from_native($ty::MAX),
$ty::from_native($ty::MIN),
]);
let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]);
assert_eq!(from_wrapper_slice, from_slice);
};
}
test_from_wrapper_slice!(UInt8Vector, u8);
test_from_wrapper_slice!(Int8Vector, i8);
test_from_wrapper_slice!(UInt16Vector, u16);
test_from_wrapper_slice!(Int16Vector, i16);
test_from_wrapper_slice!(UInt32Vector, u32);
test_from_wrapper_slice!(Int32Vector, i32);
test_from_wrapper_slice!(UInt64Vector, u64);
test_from_wrapper_slice!(Int64Vector, i64);
test_from_wrapper_slice!(Float32Vector, f32);
test_from_wrapper_slice!(Float64Vector, f64);
}
}

View File

@@ -15,19 +15,22 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef};
use snafu::ResultExt;
use arrow::array::{Array, ArrayRef, MutableArray, Utf8ValuesIter};
use arrow::bitmap::utils::ZipValidity;
use serde_json::Value as JsonValue;
use snafu::{OptionExt, ResultExt};
use crate::arrow_array::{MutableStringArray, StringArray};
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::error::{Result, SerializeSnafu};
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::StringType;
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
/// Vector of strings.
#[derive(Debug, PartialEq)]
/// String array wrapper
#[derive(Debug, Clone, PartialEq)]
pub struct StringVector {
array: StringArray,
}
@@ -36,16 +39,6 @@ impl StringVector {
pub(crate) fn as_arrow(&self) -> &dyn Array {
&self.array
}
fn to_array_data(&self) -> ArrayData {
self.array.data().clone()
}
fn from_array_data(data: ArrayData) -> Self {
Self {
array: StringArray::from(data),
}
}
}
impl From<StringArray> for StringVector {
@@ -57,31 +50,7 @@ impl From<StringArray> for StringVector {
impl From<Vec<Option<String>>> for StringVector {
fn from(data: Vec<Option<String>>) -> Self {
Self {
array: StringArray::from_iter(data),
}
}
}
impl From<Vec<Option<&str>>> for StringVector {
fn from(data: Vec<Option<&str>>) -> Self {
Self {
array: StringArray::from_iter(data),
}
}
}
impl From<&[Option<String>]> for StringVector {
fn from(data: &[Option<String>]) -> Self {
Self {
array: StringArray::from_iter(data),
}
}
}
impl From<&[Option<&str>]> for StringVector {
fn from(data: &[Option<&str>]) -> Self {
Self {
array: StringArray::from_iter(data),
array: StringArray::from(data),
}
}
}
@@ -89,7 +58,19 @@ impl From<&[Option<&str>]> for StringVector {
impl From<Vec<String>> for StringVector {
fn from(data: Vec<String>) -> Self {
Self {
array: StringArray::from_iter(data.into_iter().map(Some)),
array: StringArray::from(
data.into_iter()
.map(Option::Some)
.collect::<Vec<Option<String>>>(),
),
}
}
}
/// Builds a `StringVector` from optional string slices by converting the
/// whole `Vec` into a `StringArray`.
impl From<Vec<Option<&str>>> for StringVector {
fn from(data: Vec<Option<&str>>) -> Self {
Self {
array: StringArray::from(data),
}
}
}
@@ -97,14 +78,18 @@ impl From<Vec<String>> for StringVector {
impl From<Vec<&str>> for StringVector {
fn from(data: Vec<&str>) -> Self {
Self {
array: StringArray::from_iter(data.into_iter().map(Some)),
array: StringArray::from(
data.into_iter()
.map(Option::Some)
.collect::<Vec<Option<&str>>>(),
),
}
}
}
impl Vector for StringVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::string_datatype()
ConcreteDataType::String(StringType::default())
}
fn vector_type_name(&self) -> String {
@@ -120,13 +105,11 @@ impl Vector for StringVector {
}
fn to_arrow_array(&self) -> ArrayRef {
let data = self.to_array_data();
Arc::new(StringArray::from(data))
Arc::new(self.array.clone())
}
fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
let data = self.to_array_data();
Box::new(StringArray::from(data))
Box::new(self.array.clone())
}
fn validity(&self) -> Validity {
@@ -134,11 +117,7 @@ impl Vector for StringVector {
}
fn memory_size(&self) -> usize {
self.array.get_buffer_memory_size()
}
fn null_count(&self) -> usize {
self.array.null_count()
self.len() * std::mem::size_of::<i64>() + self.array.values().len()
}
fn is_null(&self, row: usize) -> bool {
@@ -146,8 +125,7 @@ impl Vector for StringVector {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
let data = self.array.data().slice(offset, length);
Arc::new(Self::from_array_data(data))
Arc::new(Self::from(self.array.slice(offset, length)))
}
fn get(&self, index: usize) -> Value {
@@ -162,7 +140,7 @@ impl Vector for StringVector {
impl ScalarVector for StringVector {
type OwnedItem = String;
type RefItem<'a> = &'a str;
type Iter<'a> = ArrayIter<&'a StringArray>;
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>;
type Builder = StringVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -179,7 +157,7 @@ impl ScalarVector for StringVector {
}
pub struct StringVectorBuilder {
mutable_array: MutableStringArray,
buffer: MutableStringArray,
}
impl MutableVector for StringVectorBuilder {
@@ -188,7 +166,7 @@ impl MutableVector for StringVectorBuilder {
}
fn len(&self) -> usize {
self.mutable_array.len()
self.buffer.len()
}
fn as_any(&self) -> &dyn Any {
@@ -204,15 +182,12 @@ impl MutableVector for StringVectorBuilder {
}
fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
match value.as_string()? {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
self.buffer.push(value.as_string()?);
Ok(())
}
fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length)
vectors::impl_extend_for_builder!(self.buffer, vector, StringVector, offset, length)
}
}
@@ -221,30 +196,30 @@ impl ScalarVectorBuilder for StringVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutableStringArray::with_capacity(capacity, 0),
buffer: MutableStringArray::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
match value {
Some(v) => self.mutable_array.append_value(v),
None => self.mutable_array.append_null(),
}
self.buffer.push(value)
}
fn finish(&mut self) -> Self::VectorType {
StringVector {
array: self.mutable_array.finish(),
Self::VectorType {
array: std::mem::take(&mut self.buffer).into(),
}
}
}
impl Serializable for StringVector {
fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
fn serialize_to_json(&self) -> crate::error::Result<Vec<JsonValue>> {
self.iter_data()
.map(serde_json::to_value)
.map(|v| match v {
None => Ok(serde_json::Value::Null),
Some(s) => serde_json::to_value(s),
})
.collect::<serde_json::Result<_>>()
.context(error::SerializeSnafu)
.context(SerializeSnafu)
}
}
@@ -252,9 +227,60 @@ vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector);
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType;
use arrow::datatypes::DataType as ArrowDataType;
use serde_json;
use super::*;
use crate::data_type::DataType;
// Checks the basic `Vector` accessors of a `StringVector` built from non-null strings.
#[test]
fn test_string_vector_misc() {
let strs = vec!["hello", "greptime", "rust"];
let v = StringVector::from(strs.clone());
assert_eq!(3, v.len());
assert_eq!("StringVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
// memory_size() is len * size_of::<i64>() + value bytes: 3 * 8 + (5 + 8 + 4) = 41.
assert_eq!(41, v.memory_size());
// Every access path must return the same string at each index.
for (i, s) in strs.iter().enumerate() {
assert_eq!(Value::from(*s), v.get(i));
assert_eq!(ValueRef::from(*s), v.get_ref(i));
assert_eq!(Value::from(*s), v.try_get(i).unwrap());
}
// The arrow representation keeps the length and is a Utf8 array.
let arrow_arr = v.to_arrow_array();
assert_eq!(3, arrow_arr.len());
assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type());
}
// A vector built through `StringVectorBuilder` serializes nulls as JSON `null`.
#[test]
fn test_serialize_string_vector() {
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("hello"));
builder.push(None);
builder.push(Some("world"));
let string_vector = builder.finish();
let serialized =
serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["hello",null,"world"]"#, serialized);
}
// A `StringVector` created from an arrow `StringArray` keeps values and nulls in order.
#[test]
fn test_from_arrow_array() {
let mut builder = MutableStringArray::new();
builder.push(Some("A"));
builder.push(Some("B"));
// The turbofish is needed because a bare `None` gives no hint for the pushed value type.
builder.push::<&str>(None);
builder.push(Some("D"));
let string_array: StringArray = builder.into();
let vector = StringVector::from(string_array);
assert_eq!(
r#"["A","B",null,"D"]"#,
serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
);
}
#[test]
fn test_string_vector_build_get() {
@@ -284,7 +310,7 @@ mod tests {
#[test]
fn test_string_vector_builder() {
let mut builder = StringVectorBuilder::with_capacity(3);
let mut builder = StringType::default().create_mutable_vector(3);
builder.push_value_ref(ValueRef::String("hello")).unwrap();
assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err());
@@ -298,73 +324,4 @@ mod tests {
let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"]));
assert_eq!(expect, vector);
}
#[test]
fn test_string_vector_misc() {
let strs = vec!["hello", "greptime", "rust"];
let v = StringVector::from(strs.clone());
assert_eq!(3, v.len());
assert_eq!("StringVector", v.vector_type_name());
assert!(!v.is_const());
assert!(v.validity().is_all_valid());
assert!(!v.only_null());
assert_eq!(128, v.memory_size());
for (i, s) in strs.iter().enumerate() {
assert_eq!(Value::from(*s), v.get(i));
assert_eq!(ValueRef::from(*s), v.get_ref(i));
assert_eq!(Value::from(*s), v.try_get(i).unwrap());
}
let arrow_arr = v.to_arrow_array();
assert_eq!(3, arrow_arr.len());
assert_eq!(&DataType::Utf8, arrow_arr.data_type());
}
#[test]
fn test_serialize_string_vector() {
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("hello"));
builder.push(None);
builder.push(Some("world"));
let string_vector = builder.finish();
let serialized =
serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["hello",null,"world"]"#, serialized);
}
#[test]
fn test_from_arrow_array() {
let mut builder = MutableStringArray::new();
builder.append_option(Some("A"));
builder.append_option(Some("B"));
builder.append_null();
builder.append_option(Some("D"));
let string_array: StringArray = builder.finish();
let vector = StringVector::from(string_array);
assert_eq!(
r#"["A","B",null,"D"]"#,
serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(),
);
}
#[test]
fn test_from_non_option_string() {
let nul = String::from_utf8(vec![0]).unwrap();
let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()];
let vector = StringVector::from(corpus);
let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized);
let corpus = vec![
"🀀🀀🀀".to_string(),
"🀁🀁🀁".to_string(),
"🀂🀂🀂".to_string(),
"🀃🀃🀃".to_string(),
"🀆🀆".to_string(),
];
let vector = StringVector::from(corpus);
let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap();
assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized);
}
}

View File

@@ -12,20 +12,308 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::types::{
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType,
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use common_time::timestamp::{TimeUnit, Timestamp};
use snafu::OptionExt;
use crate::data_type::{ConcreteDataType, DataType};
use crate::error;
use crate::error::Result;
use crate::prelude::{
MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef,
};
use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder};
use crate::serialize::Serializable;
use crate::types::TimestampType;
use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder};
pub type TimestampSecondVector = PrimitiveVector<TimestampSecondType>;
pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder<TimestampSecondType>;
/// `TimestampVector` stores timestamps as milliseconds since the UNIX epoch.
#[derive(Debug, Clone, PartialEq)]
pub struct TimestampVector {
// Raw `i64` values; accessors convert them with `Timestamp::from_millis`.
array: PrimitiveVector<i64>,
}
pub type TimestampMillisecondVector = PrimitiveVector<TimestampMillisecondType>;
pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder<TimestampMillisecondType>;
impl TimestampVector {
pub fn new(array: PrimitiveArray<i64>) -> Self {
Self {
array: PrimitiveVector { array },
}
}
pub type TimestampMicrosecondVector = PrimitiveVector<TimestampMicrosecondType>;
pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder<TimestampMicrosecondType>;
pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
Ok(Self::new(
array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<i64>>()
.with_context(|| error::ConversionSnafu {
from: format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
}
pub type TimestampNanosecondVector = PrimitiveVector<TimestampNanosecondType>;
pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder<TimestampNanosecondType>;
pub fn from_values<I: IntoIterator<Item = i64>>(iter: I) -> Self {
Self {
array: PrimitiveVector {
array: PrimitiveArray::from_values(iter),
},
}
}
pub(crate) fn as_arrow(&self) -> &dyn Array {
self.array.as_arrow()
}
}
impl Vector for TimestampVector {
    /// Always reports the millisecond timestamp data type.
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::timestamp_millis_datatype()
    }

    fn vector_type_name(&self) -> String {
        "TimestampVector".to_string()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn len(&self) -> usize {
        self.array.len()
    }

    /// Rebuilds the inner `i64` array as an arrow array tagged with the
    /// millisecond timestamp arrow type, reusing the values and validity.
    fn to_arrow_array(&self) -> ArrayRef {
        let validity = self.array.array.validity().cloned();
        let values = self.array.array.values().clone();
        Arc::new(PrimitiveArray::new(
            TimestampType::new(TimeUnit::Millisecond).as_arrow_type(),
            values,
            validity,
        ))
    }

    /// Boxed variant of [`Self::to_arrow_array`].
    ///
    /// The arrow type comes from the same `TimestampType` mapping as
    /// `to_arrow_array`, so the two conversions cannot drift apart.
    // NOTE(review): assumes `TimestampType::new(TimeUnit::Millisecond).as_arrow_type()`
    // yields `Timestamp(Millisecond, None)`, the literal used here before — confirm.
    fn to_boxed_arrow_array(&self) -> Box<dyn Array> {
        let validity = self.array.array.validity().cloned();
        let values = self.array.array.values().clone();
        Box::new(PrimitiveArray::new(
            TimestampType::new(TimeUnit::Millisecond).as_arrow_type(),
            values,
            validity,
        ))
    }

    fn validity(&self) -> Validity {
        self.array.validity()
    }

    fn memory_size(&self) -> usize {
        self.array.memory_size()
    }

    fn is_null(&self, row: usize) -> bool {
        self.array.is_null(row)
    }

    /// Returns a new `TimestampVector` over the sliced inner array.
    fn slice(&self, offset: usize, length: usize) -> VectorRef {
        Arc::new(Self {
            array: PrimitiveVector {
                array: self.array.array.slice(offset, length),
            },
        })
    }

    /// Reads the raw value from the inner `i64` vector and converts it to a
    /// millisecond `Timestamp`; nulls stay `Value::Null`.
    fn get(&self, index: usize) -> Value {
        match self.array.get(index) {
            Value::Null => Value::Null,
            Value::Int64(v) => Value::Timestamp(Timestamp::from_millis(v)),
            // The inner vector is `PrimitiveVector<i64>`, so no other variant is expected.
            _ => {
                unreachable!()
            }
        }
    }

    /// Same conversion as [`Self::get`], producing a `ValueRef`.
    fn get_ref(&self, index: usize) -> ValueRef {
        match self.array.get(index) {
            Value::Int64(v) => ValueRef::Timestamp(Timestamp::from_millis(v)),
            Value::Null => ValueRef::Null,
            // The inner vector is `PrimitiveVector<i64>`, so no other variant is expected.
            _ => unreachable!(),
        }
    }
}
impl Serializable for TimestampVector {
    /// Serializes every element to a JSON value.
    ///
    /// Missing elements become JSON `null`; present elements are emitted as the raw `i64`
    /// stored in the underlying vector.
    fn serialize_to_json(&self) -> Result<Vec<serde_json::Value>> {
        let values: Vec<serde_json::Value> = self
            .array
            .iter_data()
            .map(|item| match item {
                Some(raw) => raw.into(),
                None => serde_json::Value::Null,
            })
            .collect();
        Ok(values)
    }
}
/// [`ScalarVector`] implementation whose items are [`Timestamp`]s created with
/// `Timestamp::from_millis` from the stored `i64` values.
impl ScalarVector for TimestampVector {
    type OwnedItem = Timestamp;
    type RefItem<'a> = Timestamp;
    type Iter<'a> = TimestampDataIter<'a>;
    type Builder = TimestampVectorBuilder;

    /// Returns the timestamp at `idx`, or `None` when the wrapped vector yields no value there.
    fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
        let millis = self.array.get_data(idx)?;
        Some(Timestamp::from_millis(millis))
    }

    /// Returns an iterator over all elements as `Option<Timestamp>`.
    fn iter_data(&self) -> Self::Iter<'_> {
        let iter = self.array.iter_data();
        TimestampDataIter { iter }
    }
}
/// Iterator over the elements of a [`TimestampVector`].
///
/// Wraps the iterator of the underlying `i64` primitive vector; its `Iterator` implementation
/// maps each present value through `Timestamp::from_millis`.
pub struct TimestampDataIter<'a> {
// Iterator over the raw `i64` values of the wrapped primitive vector.
iter: PrimitiveIter<'a, i64>,
}
impl<'a> Iterator for TimestampDataIter<'a> {
    type Item = Option<Timestamp>;

    /// Advances the wrapped iterator and converts a present `i64` into a [`Timestamp`] via
    /// `Timestamp::from_millis`; missing elements stay `None`.
    fn next(&mut self) -> Option<Self::Item> {
        let raw = self.iter.next()?;
        Some(raw.map(Timestamp::from_millis))
    }
}
/// Builder for [`TimestampVector`].
///
/// Values are accumulated as `i64`s in a [`PrimitiveVectorBuilder`]; the push methods in this
/// file convert each timestamp to `TimeUnit::Millisecond` before buffering it.
pub struct TimestampVectorBuilder {
// Builder of the underlying `i64` primitive vector.
buffer: PrimitiveVectorBuilder<i64>,
}
/// [`MutableVector`] implementation for [`TimestampVectorBuilder`].
impl MutableVector for TimestampVectorBuilder {
    /// Always reports the millisecond timestamp data type.
    fn data_type(&self) -> ConcreteDataType {
        ConcreteDataType::timestamp_millis_datatype()
    }

    /// Number of values currently held by the underlying builder.
    fn len(&self) -> usize {
        self.buffer.len()
    }

    fn as_any(&self) -> &dyn Any {
        self
    }

    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }

    /// Finishes the builder and returns the result as a shared vector.
    fn to_vector(&mut self) -> VectorRef {
        Arc::new(self.finish())
    }

    /// Pushes a value reference, converting a present timestamp to milliseconds first.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `ValueRef::as_timestamp`.
    fn push_value_ref(&mut self, value: ValueRef) -> Result<()> {
        // TODO(hl): vector and vector builder should also support customized time unit.
        self.buffer.push(
            value
                .as_timestamp()?
                .map(|t| t.convert_to(TimeUnit::Millisecond)),
        );
        Ok(())
    }

    /// Appends `length` elements of `vector`, starting at `offset`, to this builder.
    ///
    /// # Errors
    ///
    /// Returns a cast error when `vector` is not a [`TimestampVector`], and propagates any
    /// error from the underlying builder.
    fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> {
        let concrete_vector = vector
            .as_any()
            .downcast_ref::<TimestampVector>()
            .with_context(|| error::CastTypeSnafu {
                // Name the real target type; the message previously said "DateVector".
                msg: format!(
                    "Failed to convert vector from {} to TimestampVector",
                    vector.vector_type_name()
                ),
            })?;
        self.buffer
            .extend_slice_of(&concrete_vector.array, offset, length)?;
        Ok(())
    }
}
/// [`ScalarVectorBuilder`] implementation that produces a [`TimestampVector`].
impl ScalarVectorBuilder for TimestampVectorBuilder {
    type VectorType = TimestampVector;

    /// Creates a builder whose underlying buffer is created with the given capacity.
    fn with_capacity(capacity: usize) -> Self {
        let buffer = PrimitiveVectorBuilder::with_capacity(capacity);
        Self { buffer }
    }

    /// Pushes a Timestamp value into vector builder. The timestamp must be with time unit
    /// `Second`/`MilliSecond`/`Microsecond`.
    ///
    /// A present value is converted to `TimeUnit::Millisecond` before it is buffered.
    fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
        let millis = value.map(|ts| ts.convert_to(TimeUnit::Millisecond));
        self.buffer.push(millis);
    }

    /// Finishes the underlying builder and wraps its result in a [`TimestampVector`].
    fn finish(&mut self) -> Self::VectorType {
        let array = self.buffer.finish();
        Self::VectorType { array }
    }
}
/// Replicates the elements of `vector` according to `offsets`.
///
/// The work is delegated to `replicate_primitive_with_type` on the underlying primitive
/// vector, passing along this vector's data type; the result is wrapped in a new
/// [`TimestampVector`].
pub(crate) fn replicate_timestamp(vector: &TimestampVector, offsets: &[usize]) -> VectorRef {
    let source = &vector.array;
    let replicated = crate::vectors::primitive::replicate_primitive_with_type(
        source,
        offsets,
        vector.data_type(),
    );
    Arc::new(TimestampVector { array: replicated })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pushes a second-unit timestamp, a null and a millisecond-unit timestamp, then checks
    /// that the finished vector reports everything in milliseconds.
    #[test]
    pub fn test_build_timestamp_vector() {
        // Helper to spell out expected millisecond timestamps.
        let millis = |v: i64| Timestamp::new(v, TimeUnit::Millisecond);

        let mut builder = TimestampVectorBuilder::with_capacity(3);
        builder.push(Some(Timestamp::new(1, TimeUnit::Second)));
        builder.push(None);
        builder.push(Some(Timestamp::new(2, TimeUnit::Millisecond)));
        let built = builder.finish();

        assert_eq!(
            ConcreteDataType::timestamp_millis_datatype(),
            built.data_type()
        );
        assert_eq!(3, built.len());

        // One second is stored as 1000 milliseconds.
        assert_eq!(Value::Timestamp(millis(1000)), built.get(0));
        assert_eq!(Value::Null, built.get(1));
        assert_eq!(Value::Timestamp(millis(2)), built.get(2));

        let collected = built.iter_data().collect::<Vec<_>>();
        assert_eq!(vec![Some(millis(1000)), None, Some(millis(2))], collected);
    }

    /// Round-trips a vector through its arrow representation.
    #[test]
    fn test_timestamp_from_arrow() {
        let original =
            TimestampVector::from_slice(&[Timestamp::from_millis(1), Timestamp::from_millis(2)]);
        let sliced = original.as_arrow().slice(0, original.len());
        let rebuilt = TimestampVector::try_from_arrow_array(&sliced).unwrap();
        assert_eq!(original, rebuilt);
    }
}