diff --git a/Cargo.lock b/Cargo.lock index 89ba76e6d0..784aa0f52c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2043,25 +2043,6 @@ dependencies = [ "snafu", ] -[[package]] -name = "datatypes2" -version = "0.1.0" -dependencies = [ - "arrow", - "common-base", - "common-error", - "common-time", - "datafusion-common", - "enum_dispatch", - "num", - "num-traits", - "ordered-float 3.4.0", - "paste", - "serde", - "serde_json", - "snafu", -] - [[package]] name = "derive-new" version = "0.5.9" diff --git a/Cargo.toml b/Cargo.toml index 77d94f0f37..e2b097f617 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,6 @@ members = [ "src/common/time", "src/datanode", "src/datatypes", - "src/datatypes2", "src/frontend", "src/log-store", "src/meta-client", diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index e49f173ab4..0ca8bf378c 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -9,11 +9,10 @@ default = [] test = [] [dependencies] -arrow = "26.0.0" common-base = { path = "../common/base" } common-error = { path = "../common/error" } common-time = { path = "../common/time" } -datafusion-common = "14.0.0" +datafusion-common = "14.0" enum_dispatch = "0.3" num = "0.4" num-traits = "0.2" @@ -22,3 +21,4 @@ paste = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" snafu = { version = "0.7", features = ["backtraces"] } +arrow = "26.0" diff --git a/src/datatypes/src/arrow_array.rs b/src/datatypes/src/arrow_array.rs index 3444598ede..7405c8a665 100644 --- a/src/datatypes/src/arrow_array.rs +++ b/src/datatypes/src/arrow_array.rs @@ -12,13 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use arrow::array::{self, Array, ListArray, PrimitiveArray}; +use arrow::array::{ + Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, +}; use arrow::datatypes::DataType; -use common_time::timestamp::Timestamp; +use common_time::timestamp::TimeUnit; +use common_time::Timestamp; use snafu::OptionExt; +use crate::data_type::ConcreteDataType; use crate::error::{ConversionSnafu, Result}; -use crate::prelude::ConcreteDataType; use crate::value::{ListValue, Value}; pub type BinaryArray = arrow::array::LargeBinaryArray; @@ -36,6 +41,7 @@ macro_rules! cast_array { }; } +// TODO(yingwen): Remove this function. pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { if array.is_null(idx) { return Ok(Value::Null); @@ -43,42 +49,46 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { let result = match array.data_type() { DataType::Null => Value::Null, - DataType::Boolean => Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)), - DataType::Binary | DataType::LargeBinary => { - Value::Binary(cast_array!(array, BinaryArray).value(idx).into()) - } - DataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::UInt16 => Value::UInt16(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::UInt32 => Value::UInt32(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::UInt64 => Value::UInt64(cast_array!(array, PrimitiveArray::).value(idx)), - DataType::Float32 => { - Value::Float32(cast_array!(array, PrimitiveArray::).value(idx).into()) 
- } - DataType::Float64 => { - Value::Float64(cast_array!(array, PrimitiveArray::).value(idx).into()) - } - DataType::Utf8 | DataType::LargeUtf8 => { - Value::String(cast_array!(array, StringArray).value(idx).into()) - } - DataType::Timestamp(t, _) => { - let value = cast_array!(array, PrimitiveArray::).value(idx); - let unit = match ConcreteDataType::from_arrow_time_unit(t) { - ConcreteDataType::Timestamp(t) => t.unit, - _ => unreachable!(), - }; - Value::Timestamp(Timestamp::new(value, unit)) - } + DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)), + DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()), + DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)), + DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)), + DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)), + DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)), + DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)), + DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)), + DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)), + DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)), + DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()), + DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()), + DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()), + DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()), + DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()), + DataType::Timestamp(t, _) => match t { + arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampSecondArray).value(idx), + TimeUnit::Second, + )), + 
arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx), + TimeUnit::Millisecond, + )), + arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx), + TimeUnit::Microsecond, + )), + arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx), + TimeUnit::Nanosecond, + )), + }, DataType::List(_) => { - let array = cast_array!(array, ListArray::).value(idx); - let inner_datatype = ConcreteDataType::try_from(array.data_type())?; + let array = cast_array!(array, ListArray).value(idx); + let item_type = ConcreteDataType::try_from(array.data_type())?; let values = (0..array.len()) .map(|i| arrow_array_get(&*array, i)) .collect::>>()?; - Value::List(ListValue::new(Some(Box::new(values)), inner_datatype)) + Value::List(ListValue::new(Some(Box::new(values)), item_type)) } _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), }; @@ -88,45 +98,74 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { #[cfg(test)] mod test { + use std::sync::Arc; + use arrow::array::{ BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - MutableListArray, MutablePrimitiveArray, TryExtend, UInt16Array, UInt32Array, UInt64Array, + LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; - use arrow::buffer::Buffer; - use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit}; + use arrow::datatypes::Int32Type; use common_time::timestamp::{TimeUnit, Timestamp}; + use paste::paste; use super::*; - use crate::prelude::Vector; - use crate::vectors::TimestampVector; + use crate::data_type::ConcreteDataType; + use crate::types::TimestampType; + + 
macro_rules! test_arrow_array_get_for_timestamps { + ( $($unit: ident), *) => { + $( + paste! { + let mut builder = arrow::array::[]::builder(3); + builder.append_value(1); + builder.append_value(0); + builder.append_value(-1); + let ts_array = Arc::new(builder.finish()) as Arc; + let v = arrow_array_get(&ts_array, 1).unwrap(); + assert_eq!( + ConcreteDataType::Timestamp(TimestampType::$unit( + $crate::types::[]::default(), + )), + v.data_type() + ); + } + )* + }; + } + + #[test] + fn test_timestamp_array() { + test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond]; + } #[test] fn test_arrow_array_access() { - let array1 = BooleanArray::from_slice(vec![true, true, false, false]); + let array1 = BooleanArray::from(vec![true, true, false, false]); assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]); + let array1 = Int8Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]); + let array1 = UInt8Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]); + let array1 = Int16Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]); + let array1 = UInt16Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]); + let array1 = Int32Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt32Array::from_vec(vec![1, 2, 3, 4]); + let array1 = UInt32Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); - let array = Int64Array::from_vec(vec![1, 2, 3, 4]); + let array = Int64Array::from(vec![1, 2, 3, 
4]); assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); - let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]); + let array1 = UInt64Array::from(vec![1, 2, 3, 4]); assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]); + let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]); assert_eq!( Value::Float32(2f32.into()), arrow_array_get(&array1, 1).unwrap() ); - let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]); + let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]); assert_eq!( Value::Float64(2f64.into()), arrow_array_get(&array1, 1).unwrap() @@ -139,55 +178,42 @@ mod test { ); assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - let array3 = super::BinaryArray::from(vec![ + let array3 = LargeBinaryArray::from(vec![ Some("hello".as_bytes()), None, Some("world".as_bytes()), ]); - assert_eq!( - Value::Binary("hello".as_bytes().into()), - arrow_array_get(&array3, 0).unwrap() - ); assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4])); - let array = vector.to_boxed_arrow_array(); - let value = arrow_array_get(&*array, 1).unwrap(); + let array = TimestampSecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second))); + let array = TimestampMillisecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); assert_eq!( value, Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) ); - - let array4 = PrimitiveArray::::from_data( - DataType::Timestamp(ArrowTimeUnit::Millisecond, None), - Buffer::from_slice(&vec![1, 2, 3, 4]), - None, - ); + let array = TimestampMicrosecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), - 
arrow_array_get(&array4, 0).unwrap() - ); - - let array4 = PrimitiveArray::::from_data( - DataType::Timestamp(ArrowTimeUnit::Nanosecond, None), - Buffer::from_slice(&vec![1, 2, 3, 4]), - None, + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond)) ); + let array = TimestampNanosecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); assert_eq!( - Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), - arrow_array_get(&array4, 0).unwrap() + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond)) ); // test list array let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ListArray = arrow_array.into(); + let arrow_array = ListArray::from_iter_primitive::(data); let v0 = arrow_array_get(&arrow_array, 0).unwrap(); match v0 { diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs index e14a3d8e84..0d06d566b6 100644 --- a/src/datatypes/src/data_type.rs +++ b/src/datatypes/src/data_type.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; use common_time::timestamp::TimeUnit; use paste::paste; use serde::{Deserialize, Serialize}; @@ -23,13 +23,14 @@ use crate::error::{self, Error, Result}; use crate::type_id::LogicalTypeId; use crate::types::{ BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampType, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, + UInt16Type, UInt32Type, UInt64Type, 
UInt8Type, }; use crate::value::Value; use crate::vectors::MutableVector; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[enum_dispatch::enum_dispatch(DataType)] pub enum ConcreteDataType { Null(NullType), @@ -47,17 +48,21 @@ pub enum ConcreteDataType { Float32(Float32Type), Float64(Float64Type), - // String types + // String types: Binary(BinaryType), String(StringType), + // Date types: Date(DateType), DateTime(DateTimeType), Timestamp(TimestampType), + // Compound types: List(ListType), } +// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method +// returning all these properties to the `DataType` trait impl ConcreteDataType { pub fn is_float(&self) -> bool { matches!( @@ -70,7 +75,7 @@ impl ConcreteDataType { matches!(self, ConcreteDataType::Boolean(_)) } - pub fn stringifiable(&self) -> bool { + pub fn is_stringifiable(&self) -> bool { matches!( self, ConcreteDataType::String(_) @@ -103,13 +108,6 @@ impl ConcreteDataType { ) } - pub fn is_timestamp(&self) -> bool { - matches!( - self, - ConcreteDataType::Timestamp(_) | ConcreteDataType::Int64(_) - ) - } - pub fn numerics() -> Vec { vec![ ConcreteDataType::int8_datatype(), @@ -161,7 +159,7 @@ impl TryFrom<&ArrowDataType> for ConcreteDataType { ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), ArrowDataType::List(field) => Self::List(ListType::new( - ConcreteDataType::from_arrow_type(&field.data_type), + ConcreteDataType::from_arrow_type(field.data_type()), )), _ => { return error::UnsupportedArrowTypeSnafu { @@ -191,38 +189,52 @@ macro_rules! 
impl_new_concrete_type_functions { impl_new_concrete_type_functions!( Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, - Binary, String, Date, DateTime + Binary, Date, DateTime, String ); impl ConcreteDataType { - pub fn list_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(inner_type)) + pub fn timestamp_second_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) + } + + pub fn timestamp_millisecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Millisecond( + TimestampMillisecondType::default(), + )) + } + + pub fn timestamp_microsecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Microsecond( + TimestampMicrosecondType::default(), + )) + } + + pub fn timestamp_nanosecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) } pub fn timestamp_datatype(unit: TimeUnit) -> Self { - ConcreteDataType::Timestamp(TimestampType::new(unit)) - } - - pub fn timestamp_millis_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::new(TimeUnit::Millisecond)) + match unit { + TimeUnit::Second => Self::timestamp_second_datatype(), + TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), + } } /// Converts from arrow timestamp unit to - // TODO(hl): maybe impl From for our timestamp ? 
- pub fn from_arrow_time_unit(t: &arrow::datatypes::TimeUnit) -> Self { + pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { match t { - arrow::datatypes::TimeUnit::Second => Self::timestamp_datatype(TimeUnit::Second), - arrow::datatypes::TimeUnit::Millisecond => { - Self::timestamp_datatype(TimeUnit::Millisecond) - } - arrow::datatypes::TimeUnit::Microsecond => { - Self::timestamp_datatype(TimeUnit::Microsecond) - } - arrow::datatypes::TimeUnit::Nanosecond => { - Self::timestamp_datatype(TimeUnit::Nanosecond) - } + ArrowTimeUnit::Second => Self::timestamp_second_datatype(), + ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), } } + + pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { + ConcreteDataType::List(ListType::new(item_type)) + } } /// Data type abstraction. @@ -237,11 +249,15 @@ pub trait DataType: std::fmt::Debug + Send + Sync { /// Returns the default value of this type. fn default_value(&self) -> Value; - /// Convert this type as [arrow2::datatypes::DataType]. + /// Convert this type as [arrow::datatypes::DataType]. fn as_arrow_type(&self) -> ArrowDataType; - /// Create a mutable vector with given `capacity` of this type. + /// Creates a mutable vector with given `capacity` of this type. fn create_mutable_vector(&self, capacity: usize) -> Box; + + /// Returns true if the data type is compatible with timestamp type so we can + /// use it as a timestamp. 
+ fn is_timestamp_compatible(&self) -> bool; } pub type DataTypeRef = Arc; @@ -324,10 +340,6 @@ mod tests { ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), ConcreteDataType::String(_) )); - assert!(matches!( - ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), - ConcreteDataType::String(_) - )); assert_eq!( ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( "item", @@ -345,31 +357,48 @@ mod tests { #[test] fn test_from_arrow_timestamp() { assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Millisecond) + ConcreteDataType::timestamp_millisecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Microsecond) + ConcreteDataType::timestamp_microsecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Nanosecond) + ConcreteDataType::timestamp_nanosecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) ); assert_eq!( - ConcreteDataType::timestamp_datatype(TimeUnit::Second), - ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Second) + ConcreteDataType::timestamp_second_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) ); } #[test] - fn test_is_timestamp() { - assert!(ConcreteDataType::timestamp_millis_datatype().is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp()); - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp()); - 
assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp()); - assert!(ConcreteDataType::int64_datatype().is_timestamp()); + fn test_is_timestamp_compatible() { + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() + ); + assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); } #[test] @@ -377,4 +406,81 @@ mod tests { assert!(ConcreteDataType::null_datatype().is_null()); assert!(!ConcreteDataType::int32_datatype().is_null()); } + + #[test] + fn test_is_float() { + assert!(!ConcreteDataType::int32_datatype().is_float()); + assert!(ConcreteDataType::float32_datatype().is_float()); + assert!(ConcreteDataType::float64_datatype().is_float()); + } + + #[test] + fn 
test_is_boolean() { + assert!(!ConcreteDataType::int32_datatype().is_boolean()); + assert!(!ConcreteDataType::float32_datatype().is_boolean()); + assert!(ConcreteDataType::boolean_datatype().is_boolean()); + } + + #[test] + fn test_is_stringifiable() { + assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); + assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); + assert!(ConcreteDataType::string_datatype().is_stringifiable()); + assert!(ConcreteDataType::date_datatype().is_stringifiable()); + assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); + } + + #[test] + fn test_is_signed() { + assert!(ConcreteDataType::int8_datatype().is_signed()); + assert!(ConcreteDataType::int16_datatype().is_signed()); + assert!(ConcreteDataType::int32_datatype().is_signed()); + assert!(ConcreteDataType::int64_datatype().is_signed()); + assert!(ConcreteDataType::date_datatype().is_signed()); + assert!(ConcreteDataType::datetime_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); + + assert!(!ConcreteDataType::uint8_datatype().is_signed()); + assert!(!ConcreteDataType::uint16_datatype().is_signed()); + assert!(!ConcreteDataType::uint32_datatype().is_signed()); + assert!(!ConcreteDataType::uint64_datatype().is_signed()); + + assert!(!ConcreteDataType::float32_datatype().is_signed()); + assert!(!ConcreteDataType::float64_datatype().is_signed()); + } + + #[test] + fn 
test_is_unsigned() { + assert!(!ConcreteDataType::int8_datatype().is_unsigned()); + assert!(!ConcreteDataType::int16_datatype().is_unsigned()); + assert!(!ConcreteDataType::int32_datatype().is_unsigned()); + assert!(!ConcreteDataType::int64_datatype().is_unsigned()); + assert!(!ConcreteDataType::date_datatype().is_unsigned()); + assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); + + assert!(ConcreteDataType::uint8_datatype().is_unsigned()); + assert!(ConcreteDataType::uint16_datatype().is_unsigned()); + assert!(ConcreteDataType::uint32_datatype().is_unsigned()); + assert!(ConcreteDataType::uint64_datatype().is_unsigned()); + + assert!(!ConcreteDataType::float32_datatype().is_unsigned()); + assert!(!ConcreteDataType::float64_datatype().is_unsigned()); + } + + #[test] + fn test_numerics() { + let nums = ConcreteDataType::numerics(); + assert_eq!(10, nums.len()); + } } diff --git a/src/datatypes/src/lib.rs b/src/datatypes/src/lib.rs index f6f6db112a..256d347eac 100644 --- a/src/datatypes/src/lib.rs +++ b/src/datatypes/src/lib.rs @@ -23,6 +23,7 @@ pub mod prelude; mod scalars; pub mod schema; pub mod serialize; +mod timestamp; pub mod type_id; pub mod types; pub mod value; diff --git a/src/datatypes/src/macros.rs b/src/datatypes/src/macros.rs index 18be9fa375..37c0a42e3f 100644 --- a/src/datatypes/src/macros.rs +++ b/src/datatypes/src/macros.rs @@ -12,27 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -///! Some helper macros for datatypes, copied from databend. -#[macro_export] -macro_rules! for_all_scalar_types { - ($macro:tt $(, $x:tt)*) => { - $macro! 
{ - [$($x),*], - { i8 }, - { i16 }, - { i32 }, - { i64 }, - { u8 }, - { u16 }, - { u32 }, - { u64 }, - { f32 }, - { f64 }, - { bool }, - } - }; -} +//! Some helper macros for datatypes, copied from databend. +/// Apply the macro rules to all primitive types. #[macro_export] macro_rules! for_all_primitive_types { ($macro:tt $(, $x:tt)*) => { @@ -52,6 +34,8 @@ macro_rules! for_all_primitive_types { }; } +/// Match the logical type and apply `$body` to all primitive types and +/// `nbody` to other types. #[macro_export] macro_rules! with_match_primitive_type_id { ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ @@ -62,17 +46,21 @@ macro_rules! with_match_primitive_type_id { } use $crate::type_id::LogicalTypeId; + use $crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, + }; match $key_type { - LogicalTypeId::Int8 => __with_ty__! { i8 }, - LogicalTypeId::Int16 => __with_ty__! { i16 }, - LogicalTypeId::Int32 => __with_ty__! { i32 }, - LogicalTypeId::Int64 => __with_ty__! { i64 }, - LogicalTypeId::UInt8 => __with_ty__! { u8 }, - LogicalTypeId::UInt16 => __with_ty__! { u16 }, - LogicalTypeId::UInt32 => __with_ty__! { u32 }, - LogicalTypeId::UInt64 => __with_ty__! { u64 }, - LogicalTypeId::Float32 => __with_ty__! { f32 }, - LogicalTypeId::Float64 => __with_ty__! { f64 }, + LogicalTypeId::Int8 => __with_ty__! { Int8Type }, + LogicalTypeId::Int16 => __with_ty__! { Int16Type }, + LogicalTypeId::Int32 => __with_ty__! { Int32Type }, + LogicalTypeId::Int64 => __with_ty__! { Int64Type }, + LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, + LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, + LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, + LogicalTypeId::UInt64 => __with_ty__! { UInt64Type }, + LogicalTypeId::Float32 => __with_ty__! { Float32Type }, + LogicalTypeId::Float64 => __with_ty__! 
{ Float64Type }, _ => $nbody, } diff --git a/src/datatypes/src/prelude.rs b/src/datatypes/src/prelude.rs index 014a40efaf..f6bd298316 100644 --- a/src/datatypes/src/prelude.rs +++ b/src/datatypes/src/prelude.rs @@ -16,8 +16,5 @@ pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; pub use crate::macros::*; pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; pub use crate::type_id::LogicalTypeId; -pub use crate::types::Primitive; pub use crate::value::{Value, ValueRef}; -pub use crate::vectors::{ - Helper as VectorHelper, MutableVector, Validity, Vector, VectorBuilder, VectorRef, -}; +pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef}; diff --git a/src/datatypes/src/scalars.rs b/src/datatypes/src/scalars.rs index ddb8eff007..327ebaa629 100644 --- a/src/datatypes/src/scalars.rs +++ b/src/datatypes/src/scalars.rs @@ -14,11 +14,17 @@ use std::any::Any; -use common_time::{Date, DateTime, Timestamp}; +use common_time::{Date, DateTime}; -use crate::prelude::*; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::*; +use crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, + UInt64Type, UInt8Type, +}; +use crate::value::{ListValue, ListValueRef, Value}; +use crate::vectors::{ + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector, + PrimitiveVector, StringVector, Vector, +}; fn get_iter_capacity>(iter: &I) -> usize { match iter.size_hint() { @@ -35,7 +41,7 @@ where for<'a> Self::VectorType: ScalarVector = Self::RefType<'a>>, { type VectorType: ScalarVector; - type RefType<'a>: ScalarRef<'a, ScalarType = Self, VectorType = Self::VectorType> + type RefType<'a>: ScalarRef<'a, ScalarType = Self> where Self: 'a; /// Get a reference of the current value. 
@@ -46,7 +52,6 @@ where } pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a { - type VectorType: ScalarVector = Self>; /// The corresponding [`Scalar`] type. type ScalarType: Scalar = Self>; @@ -63,7 +68,7 @@ where { type OwnedItem: Scalar; /// The reference item of this vector. - type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem, VectorType = Self> + type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem> where Self: 'a; @@ -137,47 +142,46 @@ pub trait ScalarVectorBuilder: MutableVector { fn finish(&mut self) -> Self::VectorType; } -macro_rules! impl_primitive_scalar_type { - ($native:ident) => { - impl Scalar for $native { - type VectorType = PrimitiveVector<$native>; - type RefType<'a> = $native; +macro_rules! impl_scalar_for_native { + ($Native: ident, $DataType: ident) => { + impl Scalar for $Native { + type VectorType = PrimitiveVector<$DataType>; + type RefType<'a> = $Native; #[inline] - fn as_scalar_ref(&self) -> $native { + fn as_scalar_ref(&self) -> $Native { *self } #[allow(clippy::needless_lifetimes)] #[inline] - fn upcast_gat<'short, 'long: 'short>(long: $native) -> $native { + fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native { long } } /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`]. 
- impl<'a> ScalarRef<'a> for $native { - type VectorType = PrimitiveVector<$native>; - type ScalarType = $native; + impl<'a> ScalarRef<'a> for $Native { + type ScalarType = $Native; #[inline] - fn to_owned_scalar(&self) -> $native { + fn to_owned_scalar(&self) -> $Native { *self } } }; } -impl_primitive_scalar_type!(u8); -impl_primitive_scalar_type!(u16); -impl_primitive_scalar_type!(u32); -impl_primitive_scalar_type!(u64); -impl_primitive_scalar_type!(i8); -impl_primitive_scalar_type!(i16); -impl_primitive_scalar_type!(i32); -impl_primitive_scalar_type!(i64); -impl_primitive_scalar_type!(f32); -impl_primitive_scalar_type!(f64); +impl_scalar_for_native!(u8, UInt8Type); +impl_scalar_for_native!(u16, UInt16Type); +impl_scalar_for_native!(u32, UInt32Type); +impl_scalar_for_native!(u64, UInt64Type); +impl_scalar_for_native!(i8, Int8Type); +impl_scalar_for_native!(i16, Int16Type); +impl_scalar_for_native!(i32, Int32Type); +impl_scalar_for_native!(i64, Int64Type); +impl_scalar_for_native!(f32, Float32Type); +impl_scalar_for_native!(f64, Float64Type); impl Scalar for bool { type VectorType = BooleanVector; @@ -196,7 +200,6 @@ impl Scalar for bool { } impl<'a> ScalarRef<'a> for bool { - type VectorType = BooleanVector; type ScalarType = bool; #[inline] @@ -221,7 +224,6 @@ impl Scalar for String { } impl<'a> ScalarRef<'a> for &'a str { - type VectorType = StringVector; type ScalarType = String; #[inline] @@ -246,7 +248,6 @@ impl Scalar for Vec { } impl<'a> ScalarRef<'a> for &'a [u8] { - type VectorType = BinaryVector; type ScalarType = Vec; #[inline] @@ -269,7 +270,6 @@ impl Scalar for Date { } impl<'a> ScalarRef<'a> for Date { - type VectorType = DateVector; type ScalarType = Date; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -291,7 +291,6 @@ impl Scalar for DateTime { } impl<'a> ScalarRef<'a> for DateTime { - type VectorType = DateTimeVector; type ScalarType = DateTime; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -299,27 +298,7 @@ impl<'a> ScalarRef<'a> for 
DateTime { } } -impl Scalar for Timestamp { - type VectorType = TimestampVector; - type RefType<'a> = Timestamp; - - fn as_scalar_ref(&self) -> Self::RefType<'_> { - *self - } - - fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { - long - } -} - -impl<'a> ScalarRef<'a> for Timestamp { - type VectorType = TimestampVector; - type ScalarType = Timestamp; - - fn to_owned_scalar(&self) -> Self::ScalarType { - *self - } -} +// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. impl Scalar for ListValue { type VectorType = ListVector; @@ -335,7 +314,6 @@ impl Scalar for ListValue { } impl<'a> ScalarRef<'a> for ListValueRef<'a> { - type VectorType = ListVector; type ScalarType = ListValue; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -357,8 +335,9 @@ impl<'a> ScalarRef<'a> for ListValueRef<'a> { #[cfg(test)] mod tests { use super::*; - use crate::vectors::binary::BinaryVector; - use crate::vectors::primitive::Int32Vector; + use crate::data_type::ConcreteDataType; + use crate::timestamp::TimestampSecond; + use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector}; fn build_vector_from_slice(items: &[Option>]) -> T { let mut builder = T::Builder::with_capacity(items.len()); @@ -454,11 +433,11 @@ mod tests { #[test] fn test_build_timestamp_vector() { - let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; - let vector: TimestampVector = build_vector_from_slice(&expect); + let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; + let vector: TimestampSecondVector = build_vector_from_slice(&expect); assert_vector_eq(&expect, &vector); let val = vector.get_data(0).unwrap(); assert_eq!(val, val.as_scalar_ref()); - assert_eq!(10, val.to_owned_scalar().value()); + assert_eq!(TimestampSecond::from(10), val.to_owned_scalar()); } } diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index a1792fd665..328fe0de24 100644 --- 
a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -12,128 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod column_schema; mod constraint; mod raw; use std::collections::HashMap; use std::sync::Arc; -pub use arrow::datatypes::Metadata; use arrow::datatypes::{Field, Schema as ArrowSchema}; -use serde::{Deserialize, Serialize}; use snafu::{ensure, ResultExt}; -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{self, DeserializeSnafu, Error, Result, SerializeSnafu}; +use crate::data_type::DataType; +use crate::error::{self, Error, Result}; +pub use crate::schema::column_schema::{ColumnSchema, Metadata}; pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::raw::RawSchema; -use crate::vectors::VectorRef; -/// Key used to store whether the column is time index in arrow field's metadata. -const TIME_INDEX_KEY: &str = "greptime:time_index"; /// Key used to store version number of the schema in metadata. const VERSION_KEY: &str = "greptime:version"; -/// Key used to store default constraint in arrow field's metadata. -const ARROW_FIELD_DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint"; - -/// Schema of a column, used as an immutable struct. 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ColumnSchema { - pub name: String, - pub data_type: ConcreteDataType, - is_nullable: bool, - is_time_index: bool, - default_constraint: Option, - metadata: Metadata, -} - -impl ColumnSchema { - pub fn new>( - name: T, - data_type: ConcreteDataType, - is_nullable: bool, - ) -> ColumnSchema { - ColumnSchema { - name: name.into(), - data_type, - is_nullable, - is_time_index: false, - default_constraint: None, - metadata: Metadata::new(), - } - } - - #[inline] - pub fn is_time_index(&self) -> bool { - self.is_time_index - } - - #[inline] - pub fn is_nullable(&self) -> bool { - self.is_nullable - } - - #[inline] - pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> { - self.default_constraint.as_ref() - } - - #[inline] - pub fn metadata(&self) -> &Metadata { - &self.metadata - } - - pub fn with_time_index(mut self, is_time_index: bool) -> Self { - self.is_time_index = is_time_index; - if is_time_index { - self.metadata - .insert(TIME_INDEX_KEY.to_string(), "true".to_string()); - } else { - self.metadata.remove(TIME_INDEX_KEY); - } - self - } - - pub fn with_default_constraint( - mut self, - default_constraint: Option, - ) -> Result { - if let Some(constraint) = &default_constraint { - constraint.validate(&self.data_type, self.is_nullable)?; - } - - self.default_constraint = default_constraint; - Ok(self) - } - - /// Creates a new [`ColumnSchema`] with given metadata. - pub fn with_metadata(mut self, metadata: Metadata) -> Self { - self.metadata = metadata; - self - } - - pub fn create_default_vector(&self, num_rows: usize) -> Result> { - match &self.default_constraint { - Some(c) => c - .create_default_vector(&self.data_type, self.is_nullable, num_rows) - .map(Some), - None => { - if self.is_nullable { - // No default constraint, use null as default value. - // TODO(yingwen): Use NullVector once it supports setting logical type. 
- ColumnDefaultConstraint::null_value() - .create_default_vector(&self.data_type, self.is_nullable, num_rows) - .map(Some) - } else { - Ok(None) - } - } - } - } -} /// A common schema, should be immutable. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct Schema { column_schemas: Vec, name_to_index: HashMap, @@ -231,7 +130,7 @@ impl Schema { } #[inline] - pub fn metadata(&self) -> &Metadata { + pub fn metadata(&self) -> &HashMap { &self.arrow_schema.metadata } } @@ -243,7 +142,7 @@ pub struct SchemaBuilder { fields: Vec, timestamp_index: Option, version: u32, - metadata: Metadata, + metadata: HashMap, } impl TryFrom> for SchemaBuilder { @@ -292,7 +191,7 @@ impl SchemaBuilder { self.metadata .insert(VERSION_KEY.to_string(), self.version.to_string()); - let arrow_schema = ArrowSchema::from(self.fields).with_metadata(self.metadata); + let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata); Ok(Schema { column_schemas: self.column_schemas, @@ -347,7 +246,7 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us let column_schema = &column_schemas[timestamp_index]; ensure!( - column_schema.data_type.is_timestamp(), + column_schema.data_type.is_timestamp_compatible(), error::InvalidTimestampIndexSnafu { index: timestamp_index, } @@ -364,58 +263,6 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us pub type SchemaRef = Arc; -impl TryFrom<&Field> for ColumnSchema { - type Error = Error; - - fn try_from(field: &Field) -> Result { - let data_type = ConcreteDataType::try_from(&field.data_type)?; - let mut metadata = field.metadata.clone(); - let default_constraint = match metadata.remove(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) { - Some(json) => Some(serde_json::from_str(&json).context(DeserializeSnafu { json })?), - None => None, - }; - let is_time_index = metadata.contains_key(TIME_INDEX_KEY); - - Ok(ColumnSchema { - name: field.name.clone(), - data_type, - 
is_nullable: field.is_nullable, - is_time_index, - default_constraint, - metadata, - }) - } -} - -impl TryFrom<&ColumnSchema> for Field { - type Error = Error; - - fn try_from(column_schema: &ColumnSchema) -> Result { - let mut metadata = column_schema.metadata.clone(); - if let Some(value) = &column_schema.default_constraint { - // Adds an additional metadata to store the default constraint. - let old = metadata.insert( - ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), - serde_json::to_string(&value).context(SerializeSnafu)?, - ); - - ensure!( - old.is_none(), - error::DuplicateMetaSnafu { - key: ARROW_FIELD_DEFAULT_CONSTRAINT_KEY, - } - ); - } - - Ok(Field::new( - column_schema.name.clone(), - column_schema.data_type.as_arrow_type(), - column_schema.is_nullable(), - ) - .with_metadata(metadata)) - } -} - impl TryFrom> for Schema { type Error = Error; @@ -424,7 +271,7 @@ impl TryFrom> for Schema { let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len()); for field in &arrow_schema.fields { let column_schema = ColumnSchema::try_from(field)?; - name_to_index.insert(field.name.clone(), column_schemas.len()); + name_to_index.insert(field.name().to_string(), column_schemas.len()); column_schemas.push(column_schema); } @@ -465,7 +312,7 @@ impl TryFrom for Schema { } } -fn try_parse_version(metadata: &Metadata, key: &str) -> Result { +fn try_parse_version(metadata: &HashMap, key: &str) -> Result { if let Some(value) = metadata.get(key) { let version = value .parse() @@ -479,127 +326,8 @@ fn try_parse_version(metadata: &Metadata, key: &str) -> Result { #[cfg(test)] mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use super::*; - use crate::value::Value; - - #[test] - fn test_column_schema() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("test", field.name); - assert_eq!(ArrowDataType::Int32, field.data_type); - 
assert!(field.is_nullable); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_default_constraint() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::Value(Value::from(99)))) - .unwrap(); - assert!(column_schema - .metadata() - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_none()); - - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("test", field.name); - assert_eq!(ArrowDataType::Int32, field.data_type); - assert!(field.is_nullable); - assert_eq!( - "{\"Value\":{\"Int32\":99}}", - field - .metadata - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .unwrap() - ); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_metadata() { - let mut metadata = Metadata::new(); - metadata.insert("k1".to_string(), "v1".to_string()); - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_metadata(metadata) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - assert_eq!("v1", column_schema.metadata().get("k1").unwrap()); - assert!(column_schema - .metadata() - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_none()); - - let field = Field::try_from(&column_schema).unwrap(); - assert_eq!("v1", field.metadata.get("k1").unwrap()); - assert!(field - .metadata - .get(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) - .is_some()); - - let new_column_schema = ColumnSchema::try_from(&field).unwrap(); - assert_eq!(column_schema, new_column_schema); - } - - #[test] - fn test_column_schema_with_duplicate_metadata() { - let mut metadata = Metadata::new(); - metadata.insert( - ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), - "v1".to_string(), - ); - let column_schema = ColumnSchema::new("test", 
ConcreteDataType::int32_datatype(), true) - .with_metadata(metadata) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - Field::try_from(&column_schema).unwrap_err(); - } - - #[test] - fn test_column_schema_invalid_default_constraint() { - ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap_err(); - } - - #[test] - fn test_column_default_constraint_try_into_from() { - let default_constraint = ColumnDefaultConstraint::Value(Value::from(42i64)); - - let bytes: Vec = default_constraint.clone().try_into().unwrap(); - let from_value = ColumnDefaultConstraint::try_from(&bytes[..]).unwrap(); - - assert_eq!(default_constraint, from_value); - } - - #[test] - fn test_column_schema_create_default_null() { - // Implicit default null. - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); - let v = column_schema.create_default_vector(5).unwrap().unwrap(); - assert_eq!(5, v.len()); - assert!(v.only_null()); - - // Explicit default null. 
- let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) - .unwrap(); - let v = column_schema.create_default_vector(5).unwrap().unwrap(); - assert_eq!(5, v.len()); - assert!(v.only_null()); - } - - #[test] - fn test_column_schema_no_default() { - let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false); - assert!(column_schema.create_default_vector(5).unwrap().is_none()); - } + use crate::data_type::ConcreteDataType; #[test] fn test_build_empty_schema() { @@ -654,8 +382,12 @@ mod tests { fn test_schema_with_timestamp() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas.clone()) .unwrap() diff --git a/src/datatypes2/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs similarity index 100% rename from src/datatypes2/src/schema/column_schema.rs rename to src/datatypes/src/schema/column_schema.rs diff --git a/src/datatypes/src/schema/constraint.rs b/src/datatypes/src/schema/constraint.rs index 3750fcebcf..4dd3ecc14b 100644 --- a/src/datatypes/src/schema/constraint.rs +++ b/src/datatypes/src/schema/constraint.rs @@ -22,7 +22,7 @@ use snafu::{ensure, ResultExt}; use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::value::Value; -use crate::vectors::{Int64Vector, TimestampVector, VectorRef}; +use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; const CURRENT_TIMESTAMP: &str = "current_timestamp()"; @@ -81,7 +81,7 @@ impl ColumnDefaultConstraint { error::UnsupportedDefaultExprSnafu { expr } ); ensure!( - data_type.is_timestamp(), + 
data_type.is_timestamp_compatible(), error::DefaultValueTypeSnafu { reason: "return value of the function must has timestamp type", } @@ -162,8 +162,10 @@ fn create_current_timestamp_vector( data_type: &ConcreteDataType, num_rows: usize, ) -> Result { + // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector + // to other data type and avoid this match. match data_type { - ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampVector::from_values( + ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( std::iter::repeat(util::current_time_millis()).take(num_rows), ))), ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( @@ -217,7 +219,7 @@ mod tests { fn test_validate_function_constraint() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); constraint - .validate(&ConcreteDataType::timestamp_millis_datatype(), false) + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) .unwrap(); constraint .validate(&ConcreteDataType::boolean_datatype(), false) @@ -225,7 +227,7 @@ mod tests { let constraint = ColumnDefaultConstraint::Function("hello()".to_string()); constraint - .validate(&ConcreteDataType::timestamp_millis_datatype(), false) + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) .unwrap_err(); } @@ -262,7 +264,7 @@ mod tests { fn test_create_default_vector_by_func() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); // Timestamp type. 
- let data_type = ConcreteDataType::timestamp_millis_datatype(); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); let v = constraint .create_default_vector(&data_type, false, 4) .unwrap(); @@ -286,7 +288,7 @@ mod tests { ); let constraint = ColumnDefaultConstraint::Function("no".to_string()); - let data_type = ConcreteDataType::timestamp_millis_datatype(); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); constraint .create_default_vector(&data_type, false, 4) .unwrap_err(); diff --git a/src/datatypes/src/schema/raw.rs b/src/datatypes/src/schema/raw.rs index f415a1ab85..75f0853b4b 100644 --- a/src/datatypes/src/schema/raw.rs +++ b/src/datatypes/src/schema/raw.rs @@ -20,7 +20,7 @@ use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; /// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). /// /// This struct only contains necessary data to recover the Schema. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct RawSchema { pub column_schemas: Vec, pub timestamp_index: Option, @@ -56,8 +56,12 @@ mod tests { fn test_raw_convert() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) - .with_time_index(true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas) .unwrap() diff --git a/src/datatypes2/src/timestamp.rs b/src/datatypes/src/timestamp.rs similarity index 100% rename from src/datatypes2/src/timestamp.rs rename to src/datatypes/src/timestamp.rs diff --git a/src/datatypes/src/type_id.rs b/src/datatypes/src/type_id.rs index fa11430dec..bcb7ea52b1 100644 --- a/src/datatypes/src/type_id.rs +++ b/src/datatypes/src/type_id.rs @@ -42,7 +42,10 @@ pub enum LogicalTypeId { /// 
seconds/milliseconds/microseconds/nanoseconds, determined by precision. DateTime, - Timestamp, + TimestampSecond, + TimestampMillisecond, + TimestampMicrosecond, + TimestampNanosecond, List, } @@ -74,7 +77,14 @@ impl LogicalTypeId { LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), LogicalTypeId::Date => ConcreteDataType::date_datatype(), LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), - LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit + LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + LogicalTypeId::TimestampMillisecond => { + ConcreteDataType::timestamp_millisecond_datatype() + } + LogicalTypeId::TimestampMicrosecond => { + ConcreteDataType::timestamp_microsecond_datatype() + } + LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), LogicalTypeId::List => { ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) } diff --git a/src/datatypes/src/types.rs b/src/datatypes/src/types.rs index aabeb59db3..186704fdfd 100644 --- a/src/datatypes/src/types.rs +++ b/src/datatypes/src/types.rs @@ -14,25 +14,24 @@ mod binary_type; mod boolean_type; -mod date; -mod datetime; +mod date_type; +mod datetime_type; mod list_type; mod null_type; -mod primitive_traits; mod primitive_type; mod string_type; -mod timestamp; + +mod timestamp_type; pub use binary_type::BinaryType; pub use boolean_type::BooleanType; -pub use date::DateType; -pub use datetime::DateTimeType; +pub use date_type::DateType; +pub use datetime_type::DateTimeType; pub use list_type::ListType; pub use null_type::NullType; -pub use primitive_traits::{OrdPrimitive, Primitive}; pub use primitive_type::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, PrimitiveElement, - PrimitiveType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, 
LogicalPrimitiveType, + NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, }; pub use string_type::StringType; -pub use timestamp::TimestampType; +pub use timestamp_type::*; diff --git a/src/datatypes/src/types/binary_type.rs b/src/datatypes/src/types/binary_type.rs index 13922ff063..0d06724fff 100644 --- a/src/datatypes/src/types/binary_type.rs +++ b/src/datatypes/src/types/binary_type.rs @@ -53,4 +53,8 @@ impl DataType for BinaryType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BinaryVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/boolean_type.rs b/src/datatypes/src/types/boolean_type.rs index 4566f1d826..36d92169eb 100644 --- a/src/datatypes/src/types/boolean_type.rs +++ b/src/datatypes/src/types/boolean_type.rs @@ -52,4 +52,8 @@ impl DataType for BooleanType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BooleanVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes2/src/types/date_type.rs b/src/datatypes/src/types/date_type.rs similarity index 100% rename from src/datatypes2/src/types/date_type.rs rename to src/datatypes/src/types/date_type.rs diff --git a/src/datatypes2/src/types/datetime_type.rs b/src/datatypes/src/types/datetime_type.rs similarity index 100% rename from src/datatypes2/src/types/datetime_type.rs rename to src/datatypes/src/types/datetime_type.rs diff --git a/src/datatypes/src/types/list_type.rs b/src/datatypes/src/types/list_type.rs index 1ada109011..b9875ca362 100644 --- a/src/datatypes/src/types/list_type.rs +++ b/src/datatypes/src/types/list_type.rs @@ -15,15 +15,17 @@ use arrow::datatypes::{DataType as ArrowDataType, Field}; use serde::{Deserialize, Serialize}; -use crate::prelude::*; -use crate::value::ListValue; +use crate::data_type::{ConcreteDataType, DataType}; +use crate::type_id::LogicalTypeId; +use 
crate::value::{ListValue, Value}; use crate::vectors::{ListVectorBuilder, MutableVector}; /// Used to represent the List datatype. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ListType { - /// The type of List's inner data. - inner: Box, + /// The type of List's item. + // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. + item_type: Box, } impl Default for ListType { @@ -33,9 +35,10 @@ impl Default for ListType { } impl ListType { - pub fn new(datatype: ConcreteDataType) -> Self { + /// Create a new `ListType` whose item's data type is `item_type`. + pub fn new(item_type: ConcreteDataType) -> Self { ListType { - inner: Box::new(datatype), + item_type: Box::new(item_type), } } } @@ -50,20 +53,24 @@ impl DataType for ListType { } fn default_value(&self) -> Value { - Value::List(ListValue::new(None, *self.inner.clone())) + Value::List(ListValue::new(None, *self.item_type.clone())) } fn as_arrow_type(&self) -> ArrowDataType { - let field = Box::new(Field::new("item", self.inner.as_arrow_type(), true)); + let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); ArrowDataType::List(field) } fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(ListVectorBuilder::with_type_capacity( - *self.inner.clone(), + *self.item_type.clone(), capacity, )) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } #[cfg(test)] diff --git a/src/datatypes/src/types/null_type.rs b/src/datatypes/src/types/null_type.rs index a0b027dd14..b9bb2dc752 100644 --- a/src/datatypes/src/types/null_type.rs +++ b/src/datatypes/src/types/null_type.rs @@ -27,7 +27,7 @@ pub struct NullType; impl NullType { pub fn arc() -> DataTypeRef { - Arc::new(Self) + Arc::new(NullType) } } @@ -51,4 +51,8 @@ impl DataType for NullType { fn create_mutable_vector(&self, _capacity: usize) -> Box { Box::new(NullVectorBuilder::default()) } + + fn 
is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes/src/types/primitive_type.rs b/src/datatypes/src/types/primitive_type.rs index b9f07ce82c..e389ca13bf 100644 --- a/src/datatypes/src/types/primitive_type.rs +++ b/src/datatypes/src/types/primitive_type.rs @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::TypeId; -use std::marker::PhantomData; +use std::cmp::Ordering; -use arrow::array::PrimitiveArray; -use arrow::datatypes::DataType as ArrowDataType; -use paste::paste; +use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType}; +use common_time::{Date, DateTime}; +use num::NumCast; use serde::{Deserialize, Serialize}; use snafu::OptionExt; @@ -25,92 +24,226 @@ use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder}; use crate::type_id::LogicalTypeId; -use crate::types::primitive_traits::Primitive; +use crate::types::{DateTimeType, DateType}; use crate::value::{Value, ValueRef}; use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector}; -#[derive(Clone, Serialize, Deserialize)] -pub struct PrimitiveType { - #[serde(skip)] - _phantom: PhantomData, +/// Data types that can be used as arrow's native type. +pub trait NativeType: ArrowNativeType + NumCast { + /// Largest numeric type this primitive type can be cast to. + type LargestType: NativeType; } -impl PartialEq> for PrimitiveType { - fn eq(&self, _other: &PrimitiveType) -> bool { - TypeId::of::() == TypeId::of::() - } +macro_rules! 
impl_native_type { + ($Type: ident, $LargestType: ident) => { + impl NativeType for $Type { + type LargestType = $LargestType; + } + }; } -impl Eq for PrimitiveType {} +impl_native_type!(u8, u64); +impl_native_type!(u16, u64); +impl_native_type!(u32, u64); +impl_native_type!(u64, u64); +impl_native_type!(i8, i64); +impl_native_type!(i16, i64); +impl_native_type!(i32, i64); +impl_native_type!(i64, i64); +impl_native_type!(f32, f64); +impl_native_type!(f64, f64); -/// A trait that provide helper methods for a primitive type to implementing the [PrimitiveVector]. -pub trait PrimitiveElement -where - for<'a> Self: Primitive - + Scalar> - + ScalarRef<'a, ScalarType = Self, VectorType = PrimitiveVector> - + Scalar = Self>, +/// Represents the wrapper type that wraps a native type using the `newtype pattern`, +/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native +/// type `i32`. +pub trait WrapperType: + Copy + + Scalar + + PartialEq + + Into + + Into> + + Serialize + + Into { + /// Logical primitive type that this wrapper type belongs to. + type LogicalType: LogicalPrimitiveType; + /// The underlying native type. + type Native: NativeType; + + /// Convert native type into this wrapper type. + fn from_native(value: Self::Native) -> Self; + + /// Convert this wrapper type into native type. + fn into_native(self) -> Self::Native; +} + +/// Trait bridging the logical primitive type with [ArrowPrimitiveType]. +pub trait LogicalPrimitiveType: 'static + Sized { + /// Arrow primitive type of this logical type. + type ArrowPrimitive: ArrowPrimitiveType; + /// Native (physical) type of this logical type. + type Native: NativeType; + /// Wrapper type that the vector returns. + type Wrapper: WrapperType + + for<'a> Scalar, RefType<'a> = Self::Wrapper> + + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>; + /// Construct the data type struct. fn build_data_type() -> ConcreteDataType; - /// Returns the name of the type id. 
- fn type_name() -> String; + /// Return the name of the type. + fn type_name() -> &'static str; /// Dynamic cast the vector to the concrete vector type. - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray>; + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector>; /// Cast value ref to the primitive type. - fn cast_value_ref(value: ValueRef) -> Result>; + fn cast_value_ref(value: ValueRef) -> Result>; } -macro_rules! impl_primitive_element { - ($Type:ident, $TypeId:ident) => { - paste::paste! { - impl PrimitiveElement for $Type { - fn build_data_type() -> ConcreteDataType { - ConcreteDataType::$TypeId(PrimitiveType::<$Type>::default()) - } +/// A new type for [WrapperType], complement the `Ord` feature for it. Wrapping non ordered +/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that +/// require `Ord`. For example, in `Median` or `Percentile` UDAFs. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct OrdPrimitive(pub T); - fn type_name() -> String { - stringify!($TypeId).to_string() - } +impl OrdPrimitive { + pub fn as_primitive(&self) -> T { + self.0 + } +} - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<$Type>> { - let primitive_vector = vector - .as_any() - .downcast_ref::>() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to cast {} to vector of primitive type {}", - vector.vector_type_name(), - stringify!($TypeId) - ), - })?; - Ok(&primitive_vector.array) - } +impl Eq for OrdPrimitive {} - fn cast_value_ref(value: ValueRef) -> Result> { - match value { - ValueRef::Null => Ok(None), - ValueRef::$TypeId(v) => Ok(Some(v.into())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value {:?} to primitive type {}", - other, - stringify!($TypeId), - ), - }.fail(), +impl PartialOrd for OrdPrimitive { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for OrdPrimitive { + fn cmp(&self, other: &Self) -> 
Ordering { + Into::::into(self.0).cmp(&Into::::into(other.0)) + } +} + +impl From> for Value { + fn from(p: OrdPrimitive) -> Self { + p.0.into() + } +} + +macro_rules! impl_wrapper { + ($Type: ident, $LogicalType: ident) => { + impl WrapperType for $Type { + type LogicalType = $LogicalType; + type Native = $Type; + + fn from_native(value: Self::Native) -> Self { + value + } + + fn into_native(self) -> Self::Native { + self + } + } + }; +} + +impl_wrapper!(u8, UInt8Type); +impl_wrapper!(u16, UInt16Type); +impl_wrapper!(u32, UInt32Type); +impl_wrapper!(u64, UInt64Type); +impl_wrapper!(i8, Int8Type); +impl_wrapper!(i16, Int16Type); +impl_wrapper!(i32, Int32Type); +impl_wrapper!(i64, Int64Type); +impl_wrapper!(f32, Float32Type); +impl_wrapper!(f64, Float64Type); + +impl WrapperType for Date { + type LogicalType = DateType; + type Native = i32; + + fn from_native(value: i32) -> Self { + Date::new(value) + } + + fn into_native(self) -> i32 { + self.val() + } +} + +impl WrapperType for DateTime { + type LogicalType = DateTimeType; + type Native = i64; + + fn from_native(value: Self::Native) -> Self { + DateTime::new(value) + } + + fn into_native(self) -> Self::Native { + self.val() + } +} + +macro_rules! define_logical_primitive_type { + ($Native: ident, $TypeId: ident, $DataType: ident) => { + // We need to define it as an empty struct `struct DataType {}` instead of a struct-unit + // `struct DataType;` to ensure the serialized JSON string is compatible with previous + // implementation. 
+ #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] + pub struct $DataType {} + + impl LogicalPrimitiveType for $DataType { + type ArrowPrimitive = arrow::datatypes::$DataType; + type Native = $Native; + type Wrapper = $Native; + + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::$TypeId($DataType::default()) + } + + fn type_name() -> &'static str { + stringify!($TypeId) + } + + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> { + vector + .as_any() + .downcast_ref::>() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to cast {} to vector of primitive type {}", + vector.vector_type_name(), + stringify!($TypeId) + ), + }) + } + + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::$TypeId(v) => Ok(Some(v.into())), + other => error::CastTypeSnafu { + msg: format!( + "Failed to cast value {:?} to primitive type {}", + other, + stringify!($TypeId), + ), } + .fail(), } } } }; } -macro_rules! impl_numeric { - ($Type:ident, $TypeId:ident) => { - impl DataType for PrimitiveType<$Type> { +macro_rules! define_non_timestamp_primitive { + ($Native: ident, $TypeId: ident, $DataType: ident) => { + define_logical_primitive_type!($Native, $TypeId, $DataType); + + impl DataType for $DataType { fn name(&self) -> &str { stringify!($TypeId) } @@ -120,7 +253,7 @@ macro_rules! impl_numeric { } fn default_value(&self) -> Value { - $Type::default().into() + $Native::default().into() } fn as_arrow_type(&self) -> ArrowDataType { @@ -128,61 +261,98 @@ macro_rules! 
impl_numeric { } fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::<$Type>::with_capacity(capacity)) + Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) } - } - impl std::fmt::Debug for PrimitiveType<$Type> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name()) + fn is_timestamp_compatible(&self) -> bool { + false } } - - impl Default for PrimitiveType<$Type> { - fn default() -> Self { - Self { - _phantom: PhantomData, - } - } - } - - impl_primitive_element!($Type, $TypeId); - - paste! { - pub type [<$TypeId Type>]=PrimitiveType<$Type>; - } }; } -impl_numeric!(u8, UInt8); -impl_numeric!(u16, UInt16); -impl_numeric!(u32, UInt32); -impl_numeric!(u64, UInt64); -impl_numeric!(i8, Int8); -impl_numeric!(i16, Int16); -impl_numeric!(i32, Int32); -impl_numeric!(i64, Int64); -impl_numeric!(f32, Float32); -impl_numeric!(f64, Float64); +define_non_timestamp_primitive!(u8, UInt8, UInt8Type); +define_non_timestamp_primitive!(u16, UInt16, UInt16Type); +define_non_timestamp_primitive!(u32, UInt32, UInt32Type); +define_non_timestamp_primitive!(u64, UInt64, UInt64Type); +define_non_timestamp_primitive!(i8, Int8, Int8Type); +define_non_timestamp_primitive!(i16, Int16, Int16Type); +define_non_timestamp_primitive!(i32, Int32, Int32Type); +define_non_timestamp_primitive!(f32, Float32, Float32Type); +define_non_timestamp_primitive!(f64, Float64, Float64Type); + +// Timestamp primitive: +define_logical_primitive_type!(i64, Int64, Int64Type); + +impl DataType for Int64Type { + fn name(&self) -> &str { + "Int64" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Int64 + } + + fn default_value(&self) -> Value { + Value::Int64(0) + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Int64 + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(PrimitiveVectorBuilder::::with_capacity(capacity)) + } + + fn 
is_timestamp_compatible(&self) -> bool { + true + } +} #[cfg(test)] mod tests { + use std::collections::BinaryHeap; + use super::*; #[test] - fn test_eq() { - assert_eq!(UInt8Type::default(), UInt8Type::default()); - assert_eq!(UInt16Type::default(), UInt16Type::default()); - assert_eq!(UInt32Type::default(), UInt32Type::default()); - assert_eq!(UInt64Type::default(), UInt64Type::default()); - assert_eq!(Int8Type::default(), Int8Type::default()); - assert_eq!(Int16Type::default(), Int16Type::default()); - assert_eq!(Int32Type::default(), Int32Type::default()); - assert_eq!(Int64Type::default(), Int64Type::default()); - assert_eq!(Float32Type::default(), Float32Type::default()); - assert_eq!(Float64Type::default(), Float64Type::default()); + fn test_ord_primitive() { + struct Foo + where + T: WrapperType, + { + heap: BinaryHeap>, + } - assert_ne!(Float32Type::default(), Float64Type::default()); - assert_ne!(Float32Type::default(), Int32Type::default()); + impl Foo + where + T: WrapperType, + { + fn push(&mut self, value: T) { + let value = OrdPrimitive::(value); + self.heap.push(value); + } + } + + macro_rules! 
test { + ($Type:ident) => { + let mut foo = Foo::<$Type> { + heap: BinaryHeap::new(), + }; + foo.push($Type::default()); + }; + } + + test!(u8); + test!(u16); + test!(u32); + test!(u64); + test!(i8); + test!(i16); + test!(i32); + test!(i64); + test!(f32); + test!(f64); } } diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs index 736a3faac9..799cbbbdd3 100644 --- a/src/datatypes/src/types/string_type.rs +++ b/src/datatypes/src/types/string_type.rs @@ -18,9 +18,10 @@ use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::StringBytes; use serde::{Deserialize, Serialize}; -use crate::data_type::DataType; -use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; -use crate::scalars::ScalarVectorBuilder; +use crate::data_type::{DataType, DataTypeRef}; +use crate::prelude::ScalarVectorBuilder; +use crate::type_id::LogicalTypeId; +use crate::value::Value; use crate::vectors::{MutableVector, StringVectorBuilder}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -52,4 +53,8 @@ impl DataType for StringType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(StringVectorBuilder::with_capacity(capacity)) } + + fn is_timestamp_compatible(&self) -> bool { + false + } } diff --git a/src/datatypes2/src/types/timestamp_type.rs b/src/datatypes/src/types/timestamp_type.rs similarity index 100% rename from src/datatypes2/src/types/timestamp_type.rs rename to src/datatypes/src/types/timestamp_type.rs diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index d5e0ae3e9f..bade88d419 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -110,6 +110,7 @@ impl Value { /// # Panics /// Panics if the data type is not supported. pub fn data_type(&self) -> ConcreteDataType { + // TODO(yingwen): Implement this once all data types are implemented. 
match self { Value::Null => ConcreteDataType::null_datatype(), Value::Boolean(_) => ConcreteDataType::boolean_datatype(), @@ -125,10 +126,10 @@ impl Value { Value::Float64(_) => ConcreteDataType::float64_datatype(), Value::String(_) => ConcreteDataType::string_datatype(), Value::Binary(_) => ConcreteDataType::binary_datatype(), - Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), Value::Date(_) => ConcreteDataType::date_datatype(), Value::DateTime(_) => ConcreteDataType::datetime_datatype(), Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), + Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), } } @@ -193,7 +194,12 @@ impl Value { Value::List(_) => LogicalTypeId::List, Value::Date(_) => LogicalTypeId::Date, Value::DateTime(_) => LogicalTypeId::DateTime, - Value::Timestamp(_) => LogicalTypeId::Timestamp, + Value::Timestamp(t) => match t.unit() { + TimeUnit::Second => LogicalTypeId::TimestampSecond, + TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, + TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, + TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond, + }, } } } @@ -277,6 +283,9 @@ impl_value_from!(Float32, f32); impl_value_from!(Float64, f64); impl_value_from!(String, StringBytes); impl_value_from!(Binary, Bytes); +impl_value_from!(Date, Date); +impl_value_from!(DateTime, DateTime); +impl_value_from!(Timestamp, Timestamp); impl From for Value { fn from(string: String) -> Value { @@ -296,12 +305,6 @@ impl From> for Value { } } -impl From for Value { - fn from(v: Timestamp) -> Self { - Value::Timestamp(v) - } -} - impl From<&[u8]> for Value { fn from(bytes: &[u8]) -> Value { Value::Binary(bytes.into()) @@ -337,6 +340,7 @@ impl TryFrom for serde_json::Value { } } +// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. /// List value. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ListValue { @@ -391,6 +395,7 @@ impl TryFrom for Value { fn try_from(v: ScalarValue) -> Result { let v = match v { + ScalarValue::Null => Value::Null, ScalarValue::Boolean(b) => Value::from(b), ScalarValue::Float32(f) => Value::from(f), ScalarValue::Float64(f) => Value::from(f), @@ -405,8 +410,10 @@ impl TryFrom for Value { ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { Value::from(s.map(StringBytes::from)) } - ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)), - ScalarValue::List(vs, t) => { + ScalarValue::Binary(b) + | ScalarValue::LargeBinary(b) + | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), + ScalarValue::List(vs, field) => { let items = if let Some(vs) = vs { let vs = vs .into_iter() @@ -416,7 +423,7 @@ impl TryFrom for Value { } else { None }; - let datatype = t.as_ref().try_into()?; + let datatype = ConcreteDataType::try_from(field.data_type())?; Value::List(ListValue::new(items, datatype)) } ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), @@ -435,7 +442,13 @@ impl TryFrom for Value { ScalarValue::TimestampNanosecond(t, _) => t .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) .unwrap_or(Value::Null), - _ => { + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { return error::UnsupportedArrowTypeSnafu { arrow_type: v.get_datatype(), } @@ -545,15 +558,6 @@ impl<'a> Ord for ValueRef<'a> { } } -/// A helper trait to convert copyable types to `ValueRef`. -/// -/// It could replace the usage of `Into>`, thus avoid confusion between `Into` -/// and `Into>` in generic codes. One typical usage is the [`Primitive`](crate::primitive_traits::Primitive) trait. 
-pub trait IntoValueRef<'a> { - /// Convert itself to [ValueRef]. - fn into_value_ref(self) -> ValueRef<'a>; -} - macro_rules! impl_value_ref_from { ($Variant:ident, $Type:ident) => { impl From<$Type> for ValueRef<'_> { @@ -562,12 +566,6 @@ macro_rules! impl_value_ref_from { } } - impl<'a> IntoValueRef<'a> for $Type { - fn into_value_ref(self) -> ValueRef<'a> { - ValueRef::$Variant(self.into()) - } - } - impl From> for ValueRef<'_> { fn from(value: Option<$Type>) -> Self { match value { @@ -576,15 +574,6 @@ macro_rules! impl_value_ref_from { } } } - - impl<'a> IntoValueRef<'a> for Option<$Type> { - fn into_value_ref(self) -> ValueRef<'a> { - match self { - Some(v) => ValueRef::$Variant(v.into()), - None => ValueRef::Null, - } - } - } }; } @@ -599,6 +588,9 @@ impl_value_ref_from!(Int32, i32); impl_value_ref_from!(Int64, i64); impl_value_ref_from!(Float32, f32); impl_value_ref_from!(Float64, f64); +impl_value_ref_from!(Date, Date); +impl_value_ref_from!(DateTime, DateTime); +impl_value_ref_from!(Timestamp, Timestamp); impl<'a> From<&'a str> for ValueRef<'a> { fn from(string: &'a str) -> ValueRef<'a> { @@ -628,6 +620,7 @@ impl<'a> From>> for ValueRef<'a> { /// if it becomes bottleneck. #[derive(Debug, Clone, Copy)] pub enum ListValueRef<'a> { + // TODO(yingwen): Consider replace this by VectorRef. 
Indexed { vector: &'a ListVector, idx: usize }, Ref { val: &'a ListValue }, } @@ -785,19 +778,16 @@ mod tests { Some(Box::new(vec![Value::Int32(1), Value::Null])), ConcreteDataType::int32_datatype() )), - ScalarValue::List( - Some(Box::new(vec![ - ScalarValue::Int32(Some(1)), - ScalarValue::Int32(None) - ])), - Box::new(ArrowDataType::Int32) + ScalarValue::new_list( + Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), + ArrowDataType::Int32, ) .try_into() .unwrap() ); assert_eq!( Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), - ScalarValue::List(None, Box::new(ArrowDataType::UInt32)) + ScalarValue::new_list(None, ArrowDataType::UInt32) .try_into() .unwrap() ); @@ -980,6 +970,10 @@ mod tests { ConcreteDataType::int32_datatype(), )), ); + check_type_and_value( + &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), + &Value::List(ListValue::default()), + ); check_type_and_value( &ConcreteDataType::date_datatype(), &Value::Date(Date::new(1)), @@ -989,7 +983,7 @@ mod tests { &Value::DateTime(DateTime::new(1)), ); check_type_and_value( - &ConcreteDataType::timestamp_millis_datatype(), + &ConcreteDataType::timestamp_millisecond_datatype(), &Value::Timestamp(Timestamp::from_millis(1)), ); } @@ -1208,59 +1202,6 @@ mod tests { assert!(wrong_value.as_list().is_err()); } - #[test] - fn test_into_value_ref() { - macro_rules! 
check_into_value_ref { - ($Variant: ident, $data: expr, $PrimitiveType: ident, $Wrapper: ident) => { - let data: $PrimitiveType = $data; - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - data.into_value_ref() - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - ValueRef::from(data) - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - Some(data).into_value_ref() - ); - assert_eq!( - ValueRef::$Variant($Wrapper::from(data)), - ValueRef::from(Some(data)) - ); - let x: Option<$PrimitiveType> = None; - assert_eq!(ValueRef::Null, x.into_value_ref()); - assert_eq!(ValueRef::Null, x.into()); - }; - } - - macro_rules! check_primitive_into_value_ref { - ($Variant: ident, $data: expr, $PrimitiveType: ident) => { - check_into_value_ref!($Variant, $data, $PrimitiveType, $PrimitiveType) - }; - } - - check_primitive_into_value_ref!(Boolean, true, bool); - check_primitive_into_value_ref!(UInt8, 10, u8); - check_primitive_into_value_ref!(UInt16, 20, u16); - check_primitive_into_value_ref!(UInt32, 30, u32); - check_primitive_into_value_ref!(UInt64, 40, u64); - check_primitive_into_value_ref!(Int8, -10, i8); - check_primitive_into_value_ref!(Int16, -20, i16); - check_primitive_into_value_ref!(Int32, -30, i32); - check_primitive_into_value_ref!(Int64, -40, i64); - check_into_value_ref!(Float32, 10.0, f32, OrderedF32); - check_into_value_ref!(Float64, 10.0, f64, OrderedF64); - - let hello = "hello"; - assert_eq!( - ValueRef::Binary(hello.as_bytes()), - ValueRef::from(hello.as_bytes()) - ); - assert_eq!(ValueRef::String(hello), ValueRef::from(hello)); - } - #[test] fn test_display() { assert_eq!(Value::Null.to_string(), "Null"); @@ -1301,10 +1242,34 @@ mod tests { assert_eq!( Value::List(ListValue::new( Some(Box::new(vec![])), - ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond), + ConcreteDataType::timestamp_second_datatype(), )) .to_string(), - "Timestamp[]" + "TimestampSecondType[]" + ); + assert_eq!( + Value::List(ListValue::new( + 
Some(Box::new(vec![])), + ConcreteDataType::timestamp_millisecond_datatype(), + )) + .to_string(), + "TimestampMillisecondType[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_microsecond_datatype(), + )) + .to_string(), + "TimestampMicrosecondType[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_nanosecond_datatype(), + )) + .to_string(), + "TimestampNanosecondType[]" ); } } diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index 6c9402849f..38fa762d4b 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -12,68 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod binary; -pub mod boolean; -mod builder; -pub mod constant; -pub mod date; -pub mod datetime; -mod eq; -mod helper; -mod list; -pub mod mutable; -pub mod null; -mod operations; -pub mod primitive; -mod string; -mod timestamp; - use std::any::Any; use std::fmt::Debug; use std::sync::Arc; use arrow::array::{Array, ArrayRef}; -use arrow::bitmap::Bitmap; -pub use binary::*; -pub use boolean::*; -pub use builder::VectorBuilder; -pub use constant::*; -pub use date::*; -pub use datetime::*; -pub use helper::Helper; -pub use list::*; -pub use mutable::MutableVector; -pub use null::*; -pub use operations::VectorOp; -pub use primitive::*; use snafu::ensure; -pub use string::*; -pub use timestamp::*; use crate::data_type::ConcreteDataType; use crate::error::{self, Result}; use crate::serialize::Serializable; use crate::value::{Value, ValueRef}; +use crate::vectors::operations::VectorOp; -#[derive(Debug, PartialEq)] -pub enum Validity<'a> { - /// Whether the array slot is valid or not (null). - Slots(&'a Bitmap), - /// All slots are valid. - AllValid, - /// All slots are null. 
- AllNull, -} +mod binary; +mod boolean; +mod constant; +mod date; +mod datetime; +mod eq; +mod helper; +mod list; +mod null; +mod operations; +mod primitive; +mod string; +mod timestamp; +mod validity; -impl<'a> Validity<'a> { - pub fn slots(&self) -> Option<&Bitmap> { - match self { - Validity::Slots(bitmap) => Some(bitmap), - _ => None, - } - } -} +pub use binary::{BinaryVector, BinaryVectorBuilder}; +pub use boolean::{BooleanVector, BooleanVectorBuilder}; +pub use constant::ConstantVector; +pub use date::{DateVector, DateVectorBuilder}; +pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; +pub use helper::Helper; +pub use list::{ListIter, ListVector, ListVectorBuilder}; +pub use null::{NullVector, NullVectorBuilder}; +pub use primitive::{ + Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, + Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, + Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, + UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, +}; +pub use string::{StringVector, StringVectorBuilder}; +pub use timestamp::{ + TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, + TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, + TimestampSecondVector, TimestampSecondVectorBuilder, +}; +pub use validity::Validity; +// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify +// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. /// Vector of data values. pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// Returns the data type of the vector. @@ -110,13 +101,7 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// The number of null slots on this [`Vector`]. 
/// # Implementation /// This is `O(1)`. - fn null_count(&self) -> usize { - match self.validity() { - Validity::Slots(bitmap) => bitmap.null_count(), - Validity::AllValid => 0, - Validity::AllNull => self.len(), - } - } + fn null_count(&self) -> usize; /// Returns true when it's a ConstantColumn fn is_const(&self) -> bool { @@ -165,6 +150,42 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { pub type VectorRef = Arc; +/// Mutable vector that could be used to build an immutable vector. +pub trait MutableVector: Send + Sync { + /// Returns the data type of the vector. + fn data_type(&self) -> ConcreteDataType; + + /// Returns the length of the vector. + fn len(&self) -> usize; + + /// Returns whether the vector is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Convert to Any, to enable dynamic casting. + fn as_any(&self) -> &dyn Any; + + /// Convert to mutable Any, to enable dynamic casting. + fn as_mut_any(&mut self) -> &mut dyn Any; + + /// Convert `self` to an (immutable) [VectorRef] and reset `self`. + fn to_vector(&mut self) -> VectorRef; + + /// Push value ref to this mutable vector. + /// + /// Returns error if data type unmatch. + fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; + + /// Extend this mutable vector by slice of `vector`. + /// + /// Returns error if data type unmatch. + /// + /// # Panics + /// Panics if `offset + length > vector.len()`. + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; +} + /// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. macro_rules! impl_try_from_arrow_array_for_vector { ($Array: ident, $Vector: ident) => { @@ -172,16 +193,20 @@ macro_rules! 
impl_try_from_arrow_array_for_vector { pub fn try_from_arrow_array( array: impl AsRef, ) -> crate::error::Result<$Vector> { - Ok($Vector::from( - array - .as_ref() - .as_any() - .downcast_ref::<$Array>() - .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) + use snafu::OptionExt; + + let data = array + .as_ref() + .as_any() + .downcast_ref::<$Array>() + .with_context(|| crate::error::ConversionSnafu { + from: std::format!("{:?}", array.as_ref().data_type()), + })? + .data() + .clone(); + + let concrete_array = $Array::from(data); + Ok($Vector::from(concrete_array)) } } }; @@ -189,10 +214,7 @@ macro_rules! impl_try_from_arrow_array_for_vector { macro_rules! impl_validity_for_vector { ($array: expr) => { - match $array.validity() { - Some(bitmap) => Validity::Slots(bitmap), - None => Validity::AllValid, - } + Validity::from_array_data($array.data()) }; } @@ -219,10 +241,11 @@ macro_rules! impl_get_ref_for_vector { } macro_rules! impl_extend_for_builder { - ($mutable_array: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ + ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ use snafu::OptionExt; - let concrete_vector = $vector + let sliced_vector = $vector.slice($offset, $length); + let concrete_vector = sliced_vector .as_any() .downcast_ref::<$VectorType>() .with_context(|| crate::error::CastTypeSnafu { @@ -232,8 +255,9 @@ macro_rules! 
impl_extend_for_builder { stringify!($VectorType) ), })?; - let slice = concrete_vector.array.slice($offset, $length); - $mutable_array.extend_trusted_len(slice.iter()); + for value in concrete_vector.iter_data() { + $mutable_vector.push(value); + } Ok(()) }}; } @@ -245,27 +269,27 @@ pub(crate) use { #[cfg(test)] pub mod tests { - use arrow::array::{Array, PrimitiveArray}; + use arrow::array::{Array, Int32Array, UInt8Array}; use serde_json; - use super::helper::Helper; use super::*; use crate::data_type::DataType; - use crate::types::PrimitiveElement; + use crate::types::{Int32Type, LogicalPrimitiveType}; + use crate::vectors::helper::Helper; #[test] fn test_df_columns_to_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3])); + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); let vector = Helper::try_into_vector(df_column).unwrap(); assert_eq!( - i32::build_data_type().as_arrow_type(), + Int32Type::build_data_type().as_arrow_type(), vector.data_type().as_arrow_type() ); } #[test] fn test_serialize_i32_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::::from_slice(vec![1, 2, 3])); + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() @@ -275,7 +299,7 @@ pub mod tests { #[test] fn test_serialize_i8_vector() { - let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8])); + let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() diff --git a/src/datatypes/src/vectors/binary.rs b/src/datatypes/src/vectors/binary.rs index 7be3dc6a8e..3b5defc8ec 100644 --- a/src/datatypes/src/vectors/binary.rs +++ b/src/datatypes/src/vectors/binary.rs @@ -15,9 +15,8 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef}; -use arrow::array::{ArrayIter, GenericByteArray}; -use snafu::{OptionExt, ResultExt}; 
+use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; use crate::arrow_array::{BinaryArray, MutableBinaryArray}; use crate::data_type::ConcreteDataType; @@ -37,6 +36,16 @@ impl BinaryVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BinaryVector { + BinaryVector { + array: BinaryArray::from(data), + } + } } impl From for BinaryVector { @@ -48,7 +57,7 @@ impl From for BinaryVector { impl From>>> for BinaryVector { fn from(data: Vec>>) -> Self { Self { - array: BinaryArray::from(data), + array: BinaryArray::from_iter(data), } } } @@ -71,11 +80,13 @@ impl Vector for BinaryVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(BinaryArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(BinaryArray::from(data)) } fn validity(&self) -> Validity { @@ -83,7 +94,11 @@ impl Vector for BinaryVector { } fn memory_size(&self) -> usize { - self.array.values().len() + self.array.offsets().len() * std::mem::size_of::() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -91,7 +106,8 @@ impl Vector for BinaryVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -148,12 +164,15 @@ impl MutableVector for BinaryVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.mutable_array.push(value.as_binary()?); + match value.as_binary()? 
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.mutable_array, vector, BinaryVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) } } @@ -162,17 +181,20 @@ impl ScalarVectorBuilder for BinaryVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBinaryArray::with_capacity(capacity), + mutable_array: MutableBinaryArray::with_capacity(capacity, 0), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { BinaryVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } @@ -205,14 +227,17 @@ mod tests { #[test] fn test_binary_vector_misc() { - let v = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); + let v = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); assert_eq!(2, v.len()); assert_eq!("BinaryVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); - assert_eq!(30, v.memory_size()); + assert_eq!(128, v.memory_size()); for i in 0..2 { assert!(!v.is_null(i)); @@ -227,7 +252,10 @@ mod tests { #[test] fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); + let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); let json_value = vector.serialize_to_json().unwrap(); assert_eq!( @@ -253,8 +281,8 @@ mod tests { #[test] fn 
test_from_arrow_array() { - let arrow_array = BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]); - let original = arrow_array.clone(); + let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); + let original = BinaryArray::from(arrow_array.data().clone()); let vector = BinaryVector::from(arrow_array); assert_eq!(original, vector.array); } @@ -289,7 +317,7 @@ mod tests { builder.push(Some(b"world")); let vector = builder.finish(); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); let mut builder = BinaryVectorBuilder::with_capacity(3); builder.push(Some(b"hello")); @@ -298,9 +326,10 @@ mod tests { let vector = builder.finish(); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - let slots = validity.slots().unwrap(); - assert_eq!(1, slots.null_count()); - assert!(!slots.get_bit(1)); + assert!(!validity.is_set(1)); + + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); } #[test] diff --git a/src/datatypes/src/vectors/boolean.rs b/src/datatypes/src/vectors/boolean.rs index 11c40bd661..2b4e5b8e10 100644 --- a/src/datatypes/src/vectors/boolean.rs +++ b/src/datatypes/src/vectors/boolean.rs @@ -16,9 +16,10 @@ use std::any::Any; use std::borrow::Borrow; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray}; -use arrow::bitmap::utils::{BitmapIter, ZipValidity}; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, +}; +use snafu::ResultExt; use crate::data_type::ConcreteDataType; use crate::error::Result; @@ -41,12 +42,26 @@ impl BooleanVector { pub(crate) fn as_boolean_array(&self) -> &BooleanArray { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BooleanVector { + BooleanVector { + array: 
BooleanArray::from(data), + } + } + + pub(crate) fn false_count(&self) -> usize { + self.array.false_count() + } } impl From> for BooleanVector { fn from(data: Vec) -> Self { BooleanVector { - array: BooleanArray::from_slice(&data), + array: BooleanArray::from(data), } } } @@ -91,11 +106,13 @@ impl Vector for BooleanVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(BooleanArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(BooleanArray::from(data)) } fn validity(&self) -> Validity { @@ -103,7 +120,11 @@ impl Vector for BooleanVector { } fn memory_size(&self) -> usize { - self.array.values().as_slice().0.len() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -111,7 +132,8 @@ impl Vector for BooleanVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -126,7 +148,7 @@ impl Vector for BooleanVector { impl ScalarVector for BooleanVector { type OwnedItem = bool; type RefItem<'a> = bool; - type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>; + type Iter<'a> = ArrayIter<&'a BooleanArray>; type Builder = BooleanVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -143,7 +165,7 @@ impl ScalarVector for BooleanVector { } pub struct BooleanVectorBuilder { - mutable_array: MutableBooleanArray, + mutable_array: BooleanBuilder, } impl MutableVector for BooleanVectorBuilder { @@ -168,12 +190,15 @@ impl MutableVector for BooleanVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.mutable_array.push(value.as_boolean()?); + match value.as_boolean()? 
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.mutable_array, vector, BooleanVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length) } } @@ -182,17 +207,20 @@ impl ScalarVectorBuilder for BooleanVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBooleanArray::with_capacity(capacity), + mutable_array: BooleanBuilder::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { BooleanVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } @@ -225,9 +253,9 @@ mod tests { assert_eq!(9, v.len()); assert_eq!("BooleanVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); - assert_eq!(2, v.memory_size()); + assert_eq!(64, v.memory_size()); for (i, b) in bools.iter().enumerate() { assert!(!v.is_null(i)); @@ -316,13 +344,12 @@ mod tests { let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - let slots = validity.slots().unwrap(); - assert_eq!(1, slots.null_count()); - assert!(!slots.get_bit(1)); + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); let vector = BooleanVector::from(vec![true, false, false]); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); } #[test] diff --git a/src/datatypes/src/vectors/constant.rs 
b/src/datatypes/src/vectors/constant.rs index d5522007a1..87739e9131 100644 --- a/src/datatypes/src/vectors/constant.rs +++ b/src/datatypes/src/vectors/constant.rs @@ -55,6 +55,27 @@ impl ConstantVector { pub fn get_constant_ref(&self) -> ValueRef { self.vector.get_ref(0) } + + pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), self.len()); + + if offsets.is_empty() { + return self.slice(0, 0); + } + + Arc::new(ConstantVector::new( + self.vector.clone(), + *offsets.last().unwrap(), + )) + } + + pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result { + let length = self.len() - filter.false_count(); + if length == self.len() { + return Ok(Arc::new(self.clone())); + } + Ok(Arc::new(ConstantVector::new(self.inner().clone(), length))) + } } impl Vector for ConstantVector { @@ -90,9 +111,9 @@ impl Vector for ConstantVector { fn validity(&self) -> Validity { if self.vector.is_null(0) { - Validity::AllNull + Validity::all_null(self.length) } else { - Validity::AllValid + Validity::all_valid(self.length) } } @@ -122,6 +143,14 @@ impl Vector for ConstantVector { fn get_ref(&self, _index: usize) -> ValueRef { self.vector.get_ref(0) } + + fn null_count(&self) -> usize { + if self.only_null() { + self.len() + } else { + 0 + } + } } impl fmt::Debug for ConstantVector { @@ -140,33 +169,6 @@ impl Serializable for ConstantVector { } } -pub(crate) fn replicate_constant(vector: &ConstantVector, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), vector.len()); - - if offsets.is_empty() { - return vector.slice(0, 0); - } - - Arc::new(ConstantVector::new( - vector.vector.clone(), - *offsets.last().unwrap(), - )) -} - -pub(crate) fn filter_constant( - vector: &ConstantVector, - filter: &BooleanVector, -) -> Result { - let length = filter.len() - filter.as_boolean_array().values().null_count(); - if length == vector.len() { - return Ok(Arc::new(vector.clone())); - } - Ok(Arc::new(ConstantVector::new( - 
vector.inner().clone(), - length, - ))) -} - #[cfg(test)] mod tests { use arrow::datatypes::DataType as ArrowDataType; @@ -182,9 +184,9 @@ mod tests { assert_eq!("ConstantVector", c.vector_type_name()); assert!(c.is_const()); assert_eq!(10, c.len()); - assert_eq!(Validity::AllValid, c.validity()); + assert!(c.validity().is_all_valid()); assert!(!c.only_null()); - assert_eq!(4, c.memory_size()); + assert_eq!(64, c.memory_size()); for i in 0..10 { assert!(!c.is_null(i)); diff --git a/src/datatypes/src/vectors/date.rs b/src/datatypes/src/vectors/date.rs index 0198b3622f..d0a66b80fb 100644 --- a/src/datatypes/src/vectors/date.rs +++ b/src/datatypes/src/vectors/date.rs @@ -12,258 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; -use std::sync::Arc; +use crate::types::DateType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::date::Date; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::*; -use crate::scalars::ScalarVector; -use crate::serialize::Serializable; -use crate::vectors::{MutableVector, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; - -#[derive(Debug, Clone, PartialEq)] -pub struct DateVector { - array: PrimitiveVector, -} - -impl DateVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? 
- .clone(), - )) - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for DateVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::date_datatype() - } - - fn vector_type_name(&self) -> String { - "DateVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date32, - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date32, - buffer, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector::new(self.array.array.slice(offset, length)), - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Int32(v) => Value::Date(Date::new(v)), - Value::Null => Value::Null, - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int32(v) => ValueRef::Date(Date::new(v)), - Value::Null => ValueRef::Null, - _ => { - unreachable!() - } - } - } -} - -impl From>> for DateVector { - fn from(data: Vec>) -> Self { - Self { - array: PrimitiveVector::::from(data), - } - } -} - -pub struct DateIter<'a> { - iter: PrimitiveIter<'a, i32>, -} - -impl<'a> Iterator for DateIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(Date::new)) - } -} - 
-impl ScalarVector for DateVector { - type OwnedItem = Date; - type RefItem<'a> = Date; - type Iter<'a> = DateIter<'a>; - - type Builder = DateVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(Date::new) - } - - fn iter_data(&self) -> Self::Iter<'_> { - DateIter { - iter: self.array.iter_data(), - } - } -} - -impl Serializable for DateVector { - fn serialize_to_json(&self) -> Result> { - Ok(self - .array - .iter_data() - .map(|v| v.map(Date::new)) - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } -} - -pub struct DateVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl MutableVector for DateVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::date_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_date()?.map(|d| d.val())); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -impl ScalarVectorBuilder for DateVectorBuilder { - type VectorType = DateVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value.map(|d| d.val())) - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - 
-pub(crate) fn replicate_date(vector: &DateVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(DateVector { array }) -} +/// Vector for [`Date`](common_time::Date). +pub type DateVector = PrimitiveVector; +/// Builder to build [`DateVector`]. +pub type DateVectorBuilder = PrimitiveVectorBuilder; #[cfg(test)] mod tests { + use std::sync::Arc; + + use arrow::array::Array; + use common_time::date::Date; + use super::*; use crate::data_type::DataType; + use crate::scalars::{ScalarVector, ScalarVectorBuilder}; + use crate::serialize::Serializable; use crate::types::DateType; + use crate::value::{Value, ValueRef}; + use crate::vectors::{Vector, VectorRef}; #[test] fn test_build_date_vector() { @@ -288,7 +58,7 @@ mod tests { #[test] fn test_date_scalar() { - let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]); + let vector = DateVector::from_slice(&[1, 2]); assert_eq!(2, vector.len()); assert_eq!(Some(Date::new(1)), vector.get_data(0)); assert_eq!(Some(Date::new(2)), vector.get_data(1)); @@ -296,7 +66,7 @@ mod tests { #[test] fn test_date_vector_builder() { - let input = DateVector::from_slice(&[Date::new(1), Date::new(2), Date::new(3)]); + let input = DateVector::from_slice(&[1, 2, 3]); let mut builder = DateType::default().create_mutable_vector(3); builder @@ -309,19 +79,25 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateVector::from_slice(&[ - Date::new(5), - Date::new(2), - Date::new(3), - ])); + let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3])); assert_eq!(expect, vector); } #[test] fn test_date_from_arrow() { - let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]); + let vector = DateVector::from_slice(&[1, 2]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, 
vector2); } + + #[test] + fn test_serialize_date_vector() { + let vector = DateVector::from_slice(&[-1, 0, 1]); + let serialized_json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!( + r#"["1969-12-31","1970-01-01","1970-01-02"]"#, + serialized_json + ); + } } diff --git a/src/datatypes/src/vectors/datetime.rs b/src/datatypes/src/vectors/datetime.rs index 732e56004c..a40a3e54d3 100644 --- a/src/datatypes/src/vectors/datetime.rs +++ b/src/datatypes/src/vectors/datetime.rs @@ -12,264 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; -use std::sync::Arc; +use crate::types::DateTimeType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::datetime::DateTime; -use snafu::OptionExt; - -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::prelude::{ - MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, -}; -use crate::serialize::Serializable; -use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; - -#[derive(Debug, Clone, PartialEq)] -pub struct DateTimeVector { - array: PrimitiveVector, -} - -impl DateTimeVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } - - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? 
- .clone(), - )) - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for DateTimeVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::datetime_datatype() - } - - fn vector_type_name(&self) -> String { - "DateTimeVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date64, - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Date64, - buffer, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector::new(self.array.array.slice(offset, length)), - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Int64(v) => Value::DateTime(DateTime::new(v)), - Value::Null => Value::Null, - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int64(v) => ValueRef::DateTime(DateTime::new(v)), - Value::Null => ValueRef::Null, - _ => { - unreachable!() - } - } - } -} - -impl Serializable for DateTimeVector { - fn serialize_to_json(&self) -> crate::Result> { - Ok(self - .array - .iter_data() - .map(|v| v.map(DateTime::new)) - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } -} - -impl From>> for DateTimeVector { - fn 
from(data: Vec>) -> Self { - Self { - array: PrimitiveVector::::from(data), - } - } -} - -pub struct DateTimeVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl ScalarVectorBuilder for DateTimeVectorBuilder { - type VectorType = DateTimeVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value.map(|d| d.val())) - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - -impl MutableVector for DateTimeVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::datetime_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_datetime()?.map(|d| d.val())); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -pub struct DateTimeIter<'a> { - iter: PrimitiveIter<'a, i64>, -} - -impl<'a> Iterator for DateTimeIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(DateTime::new)) - } -} - -impl ScalarVector for DateTimeVector { - type OwnedItem = DateTime; - type RefItem<'a> = DateTime; - type Iter<'a> = DateTimeIter<'a>; - type Builder = DateTimeVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(DateTime::new) - 
} - - fn iter_data(&self) -> Self::Iter<'_> { - DateTimeIter { - iter: self.array.iter_data(), - } - } -} - -pub(crate) fn replicate_datetime(vector: &DateTimeVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(DateTimeVector { array }) -} +/// Vector of [`DateTime`](common_time::DateTime). +pub type DateTimeVector = PrimitiveVector; +/// Builder for [`DateTimeVector`]. +pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; + use std::sync::Arc; + + use arrow::array::{Array, PrimitiveArray}; + use common_time::DateTime; + use datafusion_common::from_slice::FromSlice; use super::*; use crate::data_type::DataType; - use crate::types::DateTimeType; + use crate::prelude::{ + ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, + }; + use crate::serialize::Serializable; #[test] fn test_datetime_vector() { - let v = DateTimeVector::new(PrimitiveArray::from_vec(vec![1, 2, 3])); + let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); assert_eq!(3, v.len()); assert_eq!("DateTimeVector", v.vector_type_name()); @@ -287,9 +55,8 @@ mod tests { assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); assert!(!v.is_null(0)); - assert_eq!(24, v.memory_size()); // size of i64 * 3 + assert_eq!(64, v.memory_size()); - assert_matches!(v.validity(), Validity::AllValid); if let Value::DateTime(d) = v.get(0) { assert_eq!(1, d.val()); } else { @@ -314,8 +81,11 @@ mod tests { assert_eq!(Value::Null, v.get(1)); assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); - let input = - DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2), DateTime::new(3)]); + let input = DateTimeVector::from_wrapper_slice(&[ + 
DateTime::new(1), + DateTime::new(2), + DateTime::new(3), + ]); let mut builder = DateTimeType::default().create_mutable_vector(3); builder @@ -328,7 +98,7 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateTimeVector::from_slice(&[ + let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[ DateTime::new(5), DateTime::new(2), DateTime::new(3), @@ -338,7 +108,7 @@ mod tests { #[test] fn test_datetime_from_arrow() { - let vector = DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2)]); + let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, vector2); diff --git a/src/datatypes/src/vectors/eq.rs b/src/datatypes/src/vectors/eq.rs index d47167c3f9..55359026d4 100644 --- a/src/datatypes/src/vectors/eq.rs +++ b/src/datatypes/src/vectors/eq.rs @@ -15,9 +15,12 @@ use std::sync::Arc; use crate::data_type::DataType; +use crate::types::TimestampType; +use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, - PrimitiveVector, StringVector, TimestampVector, Vector, + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector, + StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, + TimestampNanosecondVector, TimestampSecondVector, Vector, }; use crate::with_match_primitive_type_id; @@ -76,7 +79,20 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { String(_) => is_vector_eq!(StringVector, lhs, rhs), Date(_) => is_vector_eq!(DateVector, lhs, rhs), DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs), - Timestamp(_) => is_vector_eq!(TimestampVector, lhs, rhs), + Timestamp(t) => match t { + TimestampType::Second(_) => { + is_vector_eq!(TimestampSecondVector, lhs, rhs) + } + 
TimestampType::Millisecond(_) => { + is_vector_eq!(TimestampMillisecondVector, lhs, rhs) + } + TimestampType::Microsecond(_) => { + is_vector_eq!(TimestampMicrosecondVector, lhs, rhs) + } + TimestampType::Nanosecond(_) => { + is_vector_eq!(TimestampNanosecondVector, lhs, rhs) + } + }, List(_) => is_vector_eq!(ListVector, lhs, rhs), UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_) | Float32(_) | Float64(_) => { @@ -95,13 +111,10 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { #[cfg(test)] mod tests { - use arrow::array::{ListArray, MutableListArray, MutablePrimitiveArray, TryExtend}; - use super::*; use crate::vectors::{ - Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, - NullVector, TimestampVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, - VectorRef, + list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, + NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, }; fn assert_vector_ref_eq(vector: VectorRef) { @@ -132,14 +145,21 @@ mod tests { assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)]))); assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)]))); - assert_vector_ref_eq(Arc::new(TimestampVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); - let mut arrow_array = MutableListArray::>::new(); - arrow_array - .try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])]) - .unwrap(); - let arrow_array: ListArray = arrow_array.into(); - 
assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array))); + let list_vector = list::tests::new_list_vector(&[ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), Some(4)]), + ]); + assert_vector_ref_eq(Arc::new(list_vector)); assert_vector_ref_eq(Arc::new(NullVector::new(4))); assert_vector_ref_eq(Arc::new(StringVector::from(vec![ diff --git a/src/datatypes/src/vectors/helper.rs b/src/datatypes/src/vectors/helper.rs index 60a9f8511f..f3236ca0ec 100644 --- a/src/datatypes/src/vectors/helper.rs +++ b/src/datatypes/src/vectors/helper.rs @@ -17,19 +17,26 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::Array; +use arrow::array::{Array, ArrayRef, StringArray}; use arrow::compute; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::compute::kernels::comparison; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; use datafusion_common::ScalarValue; use snafu::{OptionExt, ResultExt}; -use crate::arrow_array::StringArray; -use crate::error::{ConversionSnafu, Result, UnknownVectorSnafu}; -use crate::scalars::*; -use crate::vectors::date::DateVector; -use crate::vectors::datetime::DateTimeVector; -use crate::vectors::*; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{Scalar, ScalarVectorBuilder}; +use crate::value::{ListValue, ListValueRef}; +use crate::vectors::{ + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector, + Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, + ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, +}; +/// Helper functions for `Vector`. 
pub struct Helper; impl Helper { @@ -47,7 +54,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -61,7 +68,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -78,7 +85,7 @@ impl Helper { let arr = vector .as_mut_any() .downcast_mut() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", ty, @@ -94,7 +101,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| UnknownVectorSnafu { + .with_context(|| error::UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -105,11 +112,9 @@ impl Helper { } /// Try to cast an arrow scalar value into vector - /// - /// # Panics - /// Panic if given scalar value is not supported. 
pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { let vector = match value { + ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), ScalarValue::Boolean(v) => { ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) } @@ -143,17 +148,29 @@ impl Helper { ScalarValue::UInt64(v) => { ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) } - ScalarValue::Utf8(v) => { + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) } - ScalarValue::LargeUtf8(v) => { - ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) - } - ScalarValue::Binary(v) => { + ScalarValue::Binary(v) + | ScalarValue::LargeBinary(v) + | ScalarValue::FixedSizeBinary(_, v) => { ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) } - ScalarValue::LargeBinary(v) => { - ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) + ScalarValue::List(v, field) => { + let item_type = ConcreteDataType::try_from(field.data_type())?; + let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); + if let Some(values) = v { + let values = values + .into_iter() + .map(ScalarValue::try_into) + .collect::>()?; + let list_value = ListValue::new(Some(Box::new(values)), item_type); + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + let list_vector = builder.to_vector(); + ConstantVector::new(list_vector, length) } ScalarValue::Date32(v) => { ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) @@ -161,8 +178,30 @@ impl Helper { ScalarValue::Date64(v) => { ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) } - _ => { - return ConversionSnafu { + ScalarValue::TimestampSecond(v, _) => { + // Timezone is unimplemented now. 
+ ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMillisecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMicrosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) + } + ScalarValue::TimestampNanosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) + } + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { + return error::ConversionSnafu { from: format!("Unsupported scalar value: {}", value), } .fail() @@ -180,9 +219,7 @@ impl Helper { Ok(match array.as_ref().data_type() { ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), - ArrowDataType::Binary | ArrowDataType::LargeBinary => { - Arc::new(BinaryVector::try_from_arrow_array(array)?) - } + ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), @@ -193,48 +230,80 @@ impl Helper { ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), - ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { - Arc::new(StringVector::try_from_arrow_array(array)?) 
- } + ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), - ArrowDataType::Timestamp(_, _) => { - Arc::new(TimestampVector::try_from_arrow_array(array)?) + ArrowDataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?), + TimeUnit::Millisecond => { + Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Microsecond => { + Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Nanosecond => { + Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) + } + }, + ArrowDataType::Float16 + | ArrowDataType::Time32(_) + | ArrowDataType::Time64(_) + | ArrowDataType::Duration(_) + | ArrowDataType::Interval(_) + | ArrowDataType::Binary + | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::LargeUtf8 + | ArrowDataType::LargeList(_) + | ArrowDataType::FixedSizeList(_, _) + | ArrowDataType::Struct(_) + | ArrowDataType::Union(_, _, _) + | ArrowDataType::Dictionary(_, _) + | ArrowDataType::Decimal128(_, _) + | ArrowDataType::Decimal256(_, _) + | ArrowDataType::Map(_, _) => { + unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) } - _ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()), }) } + /// Try to cast slice of `arrays` to vectors. pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result> { arrays.iter().map(Self::try_into_vector).collect() } + /// Perform SQL like operation on `names` and a scalar `s`. 
pub fn like_utf8(names: Vec, s: &str) -> Result { - let array = StringArray::from_slice(&names); + let array = StringArray::from(names); - let filter = - compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; + let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; - let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?; + let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?; Helper::try_into_vector(result) } } #[cfg(test)] mod tests { - use arrow::array::Int32Array; - use common_time::date::Date; - use common_time::datetime::DateTime; + use arrow::array::{ + ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + }; + use arrow::datatypes::{Field, Int32Type}; + use common_time::{Date, DateTime}; use super::*; + use crate::value::Value; + use crate::vectors::ConcreteDataType; #[test] fn test_try_into_vectors() { let arrays: Vec = vec![ - Arc::new(Int32Array::from_vec(vec![1])), - Arc::new(Int32Array::from_vec(vec![2])), - Arc::new(Int32Array::from_vec(vec![3])), + Arc::new(Int32Array::from(vec![1])), + Arc::new(Int32Array::from(vec![2])), + Arc::new(Int32Array::from(vec![3])), ]; let vectors = Helper::try_into_vectors(&arrays); assert!(vectors.is_ok()); @@ -246,10 +315,10 @@ mod tests { } #[test] - pub fn test_try_into_date_vector() { + fn test_try_into_date_vector() { let vector = DateVector::from(vec![Some(1), Some(2), None]); let arrow_array = vector.to_arrow_array(); - assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type()); + assert_eq!(&ArrowDataType::Date32, arrow_array.data_type()); let vector_converted = Helper::try_into_vector(arrow_array).unwrap(); 
assert_eq!(vector.len(), vector_converted.len()); for i in 0..vector_converted.len() { @@ -258,7 +327,7 @@ mod tests { } #[test] - pub fn test_try_from_scalar_date_value() { + fn test_try_from_scalar_date_value() { let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap(); assert_eq!(ConcreteDataType::date_datatype(), vector.data_type()); assert_eq!(3, vector.len()); @@ -268,7 +337,7 @@ mod tests { } #[test] - pub fn test_try_from_scalar_datetime_value() { + fn test_try_from_scalar_datetime_value() { let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap(); assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type()); assert_eq!(3, vector.len()); @@ -277,6 +346,28 @@ mod tests { } } + #[test] + fn test_try_from_list_value() { + let value = ScalarValue::List( + Some(vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Int32(Some(2)), + ]), + Box::new(Field::new("item", ArrowDataType::Int32, true)), + ); + let vector = Helper::try_from_scalar_value(value, 3).unwrap(); + assert_eq!( + ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), + vector.data_type() + ); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + let v = vector.get(i); + let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap(); + assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items); + } + } + #[test] fn test_like_utf8() { fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { @@ -301,4 +392,40 @@ mod tests { let ret = Helper::like_utf8(names, "%").unwrap(); assert_vector(vec!["greptime", "hello", "public", "world"], &ret); } + + fn check_try_into_vector(array: impl Array + 'static) { + let array: ArrayRef = Arc::new(array); + let vector = Helper::try_into_vector(array.clone()).unwrap(); + assert_eq!(&array, &vector.to_arrow_array()); + } + + #[test] + fn test_try_into_vector() { + check_try_into_vector(NullArray::new(2)); + check_try_into_vector(BooleanArray::from(vec![true, false])); + 
check_try_into_vector(LargeBinaryArray::from(vec![ + "hello".as_bytes(), + "world".as_bytes(), + ])); + check_try_into_vector(Int8Array::from(vec![1, 2, 3])); + check_try_into_vector(Int16Array::from(vec![1, 2, 3])); + check_try_into_vector(Int32Array::from(vec![1, 2, 3])); + check_try_into_vector(Int64Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); + check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(StringArray::from(vec!["hello", "world"])); + check_try_into_vector(Date32Array::from(vec![1, 2, 3])); + check_try_into_vector(Date64Array::from(vec![1, 2, 3])); + let data = vec![None, Some(vec![Some(6), Some(7)])]; + let list_array = ListArray::from_iter_primitive::(data); + check_try_into_vector(list_array); + check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); + } } diff --git a/src/datatypes/src/vectors/list.rs b/src/datatypes/src/vectors/list.rs index 76d9dd8717..747e03557b 100644 --- a/src/datatypes/src/vectors/list.rs +++ b/src/datatypes/src/vectors/list.rs @@ -13,39 +13,48 @@ // limitations under the License. 
use std::any::Any; -use std::ops::Range; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, ListArray}; -use arrow::bitmap::utils::ZipValidity; -use arrow::bitmap::MutableBitmap; +use arrow::array::{ + Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray, +}; +use arrow::buffer::Buffer; use arrow::datatypes::DataType as ArrowDataType; use serde_json::Value as JsonValue; -use snafu::prelude::*; +use crate::data_type::{ConcreteDataType, DataType}; use crate::error::Result; -use crate::prelude::*; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; use crate::types::ListType; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::{impl_try_from_arrow_array_for_vector, impl_validity_for_vector}; - -type ArrowListArray = ListArray; +use crate::value::{ListValue, ListValueRef, Value, ValueRef}; +use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef}; /// Vector of Lists, basically backed by Arrow's `ListArray`. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, PartialEq)] pub struct ListVector { - array: ArrowListArray, - inner_datatype: ConcreteDataType, + array: ListArray, + /// The datatype of the items in the list. + item_type: ConcreteDataType, } impl ListVector { - /// Only iterate values in the [ListVector]. - /// - /// Be careful to use this method as it would ignore validity and replace null - /// by empty vector. - pub fn values_iter(&self) -> Box> + '_> { - Box::new(self.array.values_iter().map(VectorHelper::try_into_vector)) + /// Iterate elements as [VectorRef]. 
+ pub fn values_iter(&self) -> impl Iterator>> + '_ { + self.array + .iter() + .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose()) + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self { + Self { + array: ListArray::from(data), + item_type, + } } pub(crate) fn as_arrow(&self) -> &dyn Array { @@ -55,7 +64,7 @@ impl ListVector { impl Vector for ListVector { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(self.inner_datatype.clone())) + ConcreteDataType::List(ListType::new(self.item_type.clone())) } fn vector_type_name(&self) -> String { @@ -71,21 +80,25 @@ impl Vector for ListVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(ListArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(ListArray::from(data)) } fn validity(&self) -> Validity { - impl_validity_for_vector!(self.array) + vectors::impl_validity_for_vector!(self.array) } fn memory_size(&self) -> usize { - let offsets_bytes = self.array.offsets().len() * std::mem::size_of::(); - let value_refs_bytes = self.array.values().len() * std::mem::size_of::>(); - offsets_bytes + value_refs_bytes + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -93,7 +106,8 @@ impl Vector for ListVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(ListVector::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data_and_type(data, self.item_type.clone())) } fn get(&self, index: usize) -> Value { @@ -102,7 +116,7 @@ impl Vector for ListVector { } let array = &self.array.value(index); - let vector = 
VectorHelper::try_into_vector(array).unwrap_or_else(|_| { + let vector = Helper::try_into_vector(array).unwrap_or_else(|_| { panic!( "arrow array with datatype {:?} cannot converted to our vector", array.data_type() @@ -113,7 +127,7 @@ impl Vector for ListVector { .collect::>(); Value::List(ListValue::new( Some(Box::new(values)), - self.inner_datatype.clone(), + self.item_type.clone(), )) } @@ -131,7 +145,7 @@ impl Serializable for ListVector { .iter() .map(|v| match v { None => Ok(JsonValue::Null), - Some(v) => VectorHelper::try_into_vector(v) + Some(v) => Helper::try_into_vector(v) .and_then(|v| v.serialize_to_json()) .map(JsonValue::Array), }) @@ -139,70 +153,64 @@ impl Serializable for ListVector { } } -impl From for ListVector { - fn from(array: ArrowListArray) -> Self { - let inner_datatype = ConcreteDataType::from_arrow_type(match array.data_type() { - ArrowDataType::List(field) => &field.data_type, - _ => unreachable!(), +impl From for ListVector { + fn from(array: ListArray) -> Self { + let item_type = ConcreteDataType::from_arrow_type(match array.data_type() { + ArrowDataType::List(field) => field.data_type(), + other => panic!( + "Try to create ListVector from an arrow array with type {:?}", + other + ), }); - Self { - array, - inner_datatype, - } + Self { array, item_type } } } -impl_try_from_arrow_array_for_vector!(ArrowListArray, ListVector); +vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector); -pub struct ListVectorIter<'a> { +pub struct ListIter<'a> { vector: &'a ListVector, - iter: ZipValidity<'a, usize, Range>, + idx: usize, } -impl<'a> ListVectorIter<'a> { - pub fn new(vector: &'a ListVector) -> ListVectorIter<'a> { - let iter = ZipValidity::new( - 0..vector.len(), - vector.array.validity().as_ref().map(|x| x.iter()), - ); - - Self { vector, iter } +impl<'a> ListIter<'a> { + fn new(vector: &'a ListVector) -> ListIter { + ListIter { vector, idx: 0 } } } -impl<'a> Iterator for ListVectorIter<'a> { +impl<'a> Iterator for 
ListIter<'a> { type Item = Option>; #[inline] fn next(&mut self) -> Option { - self.iter.next().map(|idx_opt| { - idx_opt.map(|idx| ListValueRef::Indexed { - vector: self.vector, - idx, - }) - }) + if self.idx >= self.vector.len() { + return None; + } + + let idx = self.idx; + self.idx += 1; + + if self.vector.is_null(idx) { + return Some(None); + } + + Some(Some(ListValueRef::Indexed { + vector: self.vector, + idx, + })) } #[inline] fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } - - #[inline] - fn nth(&mut self, n: usize) -> Option { - self.iter.nth(n).map(|idx_opt| { - idx_opt.map(|idx| ListValueRef::Indexed { - vector: self.vector, - idx, - }) - }) + (self.vector.len(), Some(self.vector.len())) } } impl ScalarVector for ListVector { type OwnedItem = ListValue; type RefItem<'a> = ListValueRef<'a>; - type Iter<'a> = ListVectorIter<'a>; + type Iter<'a> = ListIter<'a>; type Builder = ListVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -214,86 +222,68 @@ impl ScalarVector for ListVector { } fn iter_data(&self) -> Self::Iter<'_> { - ListVectorIter::new(self) + ListIter::new(self) } } -// Some codes are ported from arrow2's MutableListArray. +// Ports from arrow's GenericListBuilder. +// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs +/// [ListVector] builder. pub struct ListVectorBuilder { - inner_type: ConcreteDataType, - offsets: Vec, - values: Box, - validity: Option, + item_type: ConcreteDataType, + offsets_builder: Int32BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: Box, } impl ListVectorBuilder { - pub fn with_type_capacity(inner_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { - let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(0); - // The actual required capacity might greater than the capacity of the `ListVector` - // if there exists child vector that has more than one element. 
- let values = inner_type.create_mutable_vector(capacity); + /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity` + /// is the number of items to pre-allocate space for in this builder. + pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { + let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); + offsets_builder.append(0); + // The actual required capacity might be greater than the capacity of the `ListVector` + // if the child vector has more than one element. + let values_builder = item_type.create_mutable_vector(capacity); ListVectorBuilder { - inner_type, - offsets, - values, - validity: None, + item_type, + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, } } - #[inline] - fn last_offset(&self) -> i32 { - *self.offsets.last().unwrap() + /// Finish the current variable-length list vector slot. + fn finish_list(&mut self, is_valid: bool) { + self.offsets_builder + .append(i32::try_from(self.values_builder.len()).unwrap()); + self.null_buffer_builder.append(is_valid); } fn push_null(&mut self) { - self.offsets.push(self.last_offset()); - match &mut self.validity { - Some(validity) => validity.push(false), - None => self.init_validity(), - } - } - - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; - - let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); - validity.extend_constant(len, true); - validity.set(len - 1, false); - self.validity = Some(validity) + self.finish_list(false); } fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> { if let Some(items) = list_value.items() { for item in &**items { - self.values.push_value_ref(item.as_value_ref())?; + self.values_builder.push_value_ref(item.as_value_ref())?; } } - self.push_valid(); + + self.finish_list(true); Ok(()) } - - /// Needs to be called when a valid value was extended to this builder. 
- fn push_valid(&mut self) { - let size = self.values.len(); - let size = i32::try_from(size).unwrap(); - assert!(size >= *self.offsets.last().unwrap()); - - self.offsets.push(size); - if let Some(validity) = &mut self.validity { - validity.push(true) - } - } } impl MutableVector for ListVectorBuilder { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::list_datatype(self.inner_type.clone()) + ConcreteDataType::list_datatype(self.item_type.clone()) } fn len(&self) -> usize { - self.offsets.len() - 1 + self.null_buffer_builder.len() } fn as_any(&self) -> &dyn Any { @@ -348,51 +338,181 @@ impl ScalarVectorBuilder for ListVectorBuilder { self.push_value_ref(value.into()).unwrap_or_else(|e| { panic!( "Failed to push value, expect value type {:?}, err:{}", - self.inner_type, e + self.item_type, e ); }); } fn finish(&mut self) -> Self::VectorType { - let array = ArrowListArray::try_new( - ConcreteDataType::list_datatype(self.inner_type.clone()).as_arrow_type(), - std::mem::take(&mut self.offsets).into(), - self.values.to_vector().to_arrow_array(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - .unwrap(); // The `ListVectorBuilder` itself should ensure it always builds a valid array. + let len = self.len(); + let values_vector = self.values_builder.to_vector(); + let values_arr = values_vector.to_arrow_array(); + let values_data = values_arr.data(); + + let offset_buffer = self.offsets_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); + // Re-initialize the offsets_builder. 
+ self.offsets_builder.append(0); + let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type(); + let array_data_builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data.clone()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data_builder.build_unchecked() }; + let array = ListArray::from(array_data); ListVector { array, - inner_datatype: self.inner_type.clone(), + item_type: self.item_type.clone(), + } + } +} + +// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs +/// Builder for creating the null bit buffer. +/// This builder only materializes the buffer when we append `false`. +/// If you only append `true`s to the builder, what you get will be +/// `None` when calling [`finish`](#method.finish). +/// This optimization is **very** important for the performance. +#[derive(Debug)] +struct NullBufferBuilder { + bitmap_builder: Option, + /// Store the length of the buffer before materializing. + len: usize, + capacity: usize, +} + +impl NullBufferBuilder { + /// Creates a new empty builder. + /// `capacity` is the number of bits in the null buffer. + fn new(capacity: usize) -> Self { + Self { + bitmap_builder: None, + len: 0, + capacity, + } + } + + fn len(&self) -> usize { + if let Some(b) = &self.bitmap_builder { + b.len() + } else { + self.len + } + } + + /// Appends a `true` into the builder + /// to indicate that this item is not null. + #[inline] + fn append_non_null(&mut self) { + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append(true) + } else { + self.len += 1; + } + } + + /// Appends a `false` into the builder + /// to indicate that this item is null. + #[inline] + fn append_null(&mut self) { + self.materialize_if_needed(); + self.bitmap_builder.as_mut().unwrap().append(false); + } + + /// Appends a boolean value into the builder. 
+ #[inline] + fn append(&mut self, not_null: bool) { + if not_null { + self.append_non_null() + } else { + self.append_null() + } + } + + /// Builds the null buffer and resets the builder. + /// Returns `None` if the builder only contains `true`s. + fn finish(&mut self) -> Option { + let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); + self.bitmap_builder = None; + self.len = 0; + buf + } + + #[inline] + fn materialize_if_needed(&mut self) { + if self.bitmap_builder.is_none() { + self.materialize() + } + } + + #[cold] + fn materialize(&mut self) { + if self.bitmap_builder.is_none() { + let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); + b.append_n(self.len, true); + self.bitmap_builder = Some(b); } } } #[cfg(test)] -mod tests { - use arrow::array::{MutableListArray, MutablePrimitiveArray, TryExtend}; +pub mod tests { + use arrow::array::{Int32Array, Int32Builder, ListBuilder}; use serde_json::json; use super::*; + use crate::scalars::ScalarRef; use crate::types::ListType; + use crate::vectors::Int32Vector; + + pub fn new_list_vector(data: &[Option>>]) -> ListVector { + let mut builder = + ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); + for vec_opt in data { + if let Some(vec) = vec_opt { + let values = vec.iter().map(|v| Value::from(*v)).collect(); + let values = Some(Box::new(values)); + let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); + + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + } + + builder.finish() + } + + fn new_list_array(data: &[Option>>]) -> ListArray { + let mut builder = ListBuilder::new(Int32Builder::new()); + for vec_opt in data { + if let Some(vec) = vec_opt { + for value_opt in vec { + builder.values().append_option(*value_opt); + } + + builder.append(true); + } else { + builder.append(false); + } + } + + builder.finish() + } #[test] fn test_list_vector() { let data = vec![ - Some(vec![Some(1i32), Some(2), 
Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = new_list_vector(&data); - let list_vector = ListVector { - array: arrow_array.clone(), - inner_datatype: ConcreteDataType::int32_datatype(), - }; assert_eq!( ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() @@ -403,30 +523,34 @@ mod tests { assert!(list_vector.is_null(1)); assert!(!list_vector.is_null(2)); + let arrow_array = new_list_array(&data); assert_eq!( arrow_array, - list_vector + *list_vector .to_arrow_array() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap() - .clone() - ); - assert_eq!( - Validity::Slots(arrow_array.validity().unwrap()), - list_vector.validity() - ); - assert_eq!( - arrow_array.offsets().len() * std::mem::size_of::() - + arrow_array.values().len() * std::mem::size_of::>(), - list_vector.memory_size() ); + let validity = list_vector.validity(); + assert!(!validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert!(validity.is_set(0)); + assert!(!validity.is_set(1)); + assert!(validity.is_set(2)); + assert_eq!(256, list_vector.memory_size()); - let slice = list_vector.slice(0, 2); + let slice = list_vector.slice(0, 2).to_arrow_array(); + let sliced_array = slice.as_any().downcast_ref::().unwrap(); assert_eq!( - "ListArray[[1, 2, 3], None]", - format!("{:?}", slice.to_arrow_array()) + Int32Array::from_iter_values([1, 2, 3]), + *sliced_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() ); + assert!(sliced_array.is_null(1)); assert_eq!( Value::List(ListValue::new( @@ -467,52 +591,48 @@ mod tests { #[test] fn test_from_arrow_array() { let data = vec![ - Some(vec![Some(1u32), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array 
= MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let arrow_array = new_list_array(&data); let array_ref: ArrayRef = Arc::new(arrow_array); + let expect = new_list_vector(&data); + // Test try from ArrayRef let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); - assert_eq!( - "ListVector { array: ListArray[[1, 2, 3], None, [4, None, 6]], inner_datatype: UInt32(UInt32) }", - format!("{:?}", list_vector) - ); + assert_eq!(expect, list_vector); + + // Test from + let arrow_array = new_list_array(&data); + let list_vector = ListVector::from(arrow_array); + assert_eq!(expect, list_vector); } #[test] fn test_iter_list_vector_values() { let data = vec![ - Some(vec![Some(1i64), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = new_list_vector(&data); - let list_vector = ListVector::from(arrow_array); assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int64_datatype())), + ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() ); let mut iter = list_vector.values_iter(); assert_eq!( - "Int64[1, 2, 3]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap() ); + assert!(iter.next().unwrap().unwrap().is_none()); assert_eq!( - "Int64[]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) - ); - assert_eq!( - "Int64[4, None, 6]", - format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap(), ); assert!(iter.next().is_none()) } @@ -520,30 +640,18 @@ mod 
tests { #[test] fn test_serialize_to_json() { let data = vec![ - Some(vec![Some(1i64), Some(2), Some(3)]), + Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); - - let list_vector = ListVector::from(arrow_array); + let list_vector = new_list_vector(&data); assert_eq!( vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], list_vector.serialize_to_json().unwrap() ); } - fn new_list_vector(data: Vec>>>) -> ListVector { - let mut arrow_array = MutableListArray::>::new(); - arrow_array.try_extend(data).unwrap(); - let arrow_array: ArrowListArray = arrow_array.into(); - - ListVector::from(arrow_array) - } - #[test] fn test_list_vector_builder() { let mut builder = @@ -567,14 +675,14 @@ mod tests { None, Some(vec![Some(7), Some(8), None]), ]; - let input = new_list_vector(data); + let input = new_list_vector(&data); builder.extend_slice_of(&input, 1, 2).unwrap(); assert!(builder .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(new_list_vector(vec![ + let expect: VectorRef = Arc::new(new_list_vector(&[ Some(vec![Some(4), None, Some(6)]), None, Some(vec![Some(7), Some(8), None]), @@ -599,7 +707,7 @@ mod tests { })); let vector = builder.finish(); - let expect = new_list_vector(vec![None, Some(vec![Some(4), None, Some(6)])]); + let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); assert_eq!(expect, vector); assert!(vector.get_data(0).is_none()); diff --git a/src/datatypes/src/vectors/null.rs b/src/datatypes/src/vectors/null.rs index 64974d99b0..bb66e09b39 100644 --- a/src/datatypes/src/vectors/null.rs +++ b/src/datatypes/src/vectors/null.rs @@ -16,8 +16,7 @@ use std::any::Any; use std::fmt; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, NullArray}; -use 
arrow::datatypes::DataType as ArrowDataType; +use arrow::array::{Array, ArrayData, ArrayRef, NullArray}; use snafu::{ensure, OptionExt}; use crate::data_type::ConcreteDataType; @@ -27,21 +26,28 @@ use crate::types::NullType; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; +/// A vector where all elements are nulls. #[derive(PartialEq)] pub struct NullVector { array: NullArray, } +// TODO(yingwen): Support null vector with other logical types. impl NullVector { + /// Create a new `NullVector` with `n` elements. pub fn new(n: usize) -> Self { Self { - array: NullArray::new(ArrowDataType::Null, n), + array: NullArray::new(n), } } pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } } impl From for NullVector { @@ -68,21 +74,28 @@ impl Vector for NullVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. 
+ let data = self.to_array_data(); + Arc::new(NullArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(NullArray::from(data)) } fn validity(&self) -> Validity { - Validity::AllNull + Validity::all_null(self.array.len()) } fn memory_size(&self) -> usize { 0 } + fn null_count(&self) -> usize { + self.array.null_count() + } + fn is_null(&self, _row: usize) -> bool { true } @@ -217,7 +230,7 @@ mod tests { assert_eq!("NullVector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllNull, v.validity()); + assert!(v.validity().is_all_null()); assert!(v.only_null()); for i in 0..32 { @@ -246,7 +259,7 @@ mod tests { #[test] fn test_null_vector_validity() { let vector = NullVector::new(5); - assert_eq!(Validity::AllNull, vector.validity()); + assert!(vector.validity().is_all_null()); assert_eq!(5, vector.null_count()); } diff --git a/src/datatypes/src/vectors/operations.rs b/src/datatypes/src/vectors/operations.rs index e63f338a05..70ddb4a031 100644 --- a/src/datatypes/src/vectors/operations.rs +++ b/src/datatypes/src/vectors/operations.rs @@ -19,10 +19,11 @@ mod replicate; use common_base::BitVec; use crate::error::Result; -use crate::types::PrimitiveElement; +use crate::types::LogicalPrimitiveType; +use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, - NullVector, PrimitiveVector, StringVector, TimestampVector, Vector, VectorRef, + BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, + VectorRef, }; /// Vector compute operations. @@ -59,10 +60,10 @@ pub trait VectorOp { } macro_rules! 
impl_scalar_vector_op { - ($( { $VectorType: ident, $replicate: ident } ),+) => {$( + ($($VectorType: ident),+) => {$( impl VectorOp for $VectorType { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::$replicate(self, offsets) + replicate::replicate_scalar(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { @@ -77,28 +78,21 @@ macro_rules! impl_scalar_vector_op { )+}; } -impl_scalar_vector_op!( - { BinaryVector, replicate_scalar }, - { BooleanVector, replicate_scalar }, - { ListVector, replicate_scalar }, - { StringVector, replicate_scalar }, - { DateVector, replicate_date }, - { DateTimeVector, replicate_datetime }, - { TimestampVector, replicate_timestamp } -); +impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); -impl VectorOp for ConstantVector { +impl VectorOp for PrimitiveVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_constant(self, offsets) + std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_constant(self, selected, prev_vector); + let prev_vector = + prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); + find_unique::find_unique_scalar(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_constant(self, filter) + filter::filter_non_constant!(self, PrimitiveVector, filter) } } @@ -117,21 +111,17 @@ impl VectorOp for NullVector { } } -impl VectorOp for PrimitiveVector -where - T: PrimitiveElement, -{ +impl VectorOp for ConstantVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_primitive(self, offsets) + self.replicate_vector(offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = - 
prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); - find_unique::find_unique_scalar(self, selected, prev_vector); + let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); + find_unique::find_unique_constant(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, PrimitiveVector, filter) + self.filter_vector(filter) } } diff --git a/src/datatypes/src/vectors/operations/filter.rs b/src/datatypes/src/vectors/operations/filter.rs index 7a9f514a16..8368a6afb4 100644 --- a/src/datatypes/src/vectors/operations/filter.rs +++ b/src/datatypes/src/vectors/operations/filter.rs @@ -12,16 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub(crate) use crate::vectors::constant::filter_constant; - macro_rules! filter_non_constant { ($vector: expr, $VectorType: ty, $filter: ident) => {{ use std::sync::Arc; + use arrow::compute; use snafu::ResultExt; let arrow_array = $vector.as_arrow(); - let filtered = arrow::compute::filter::filter(arrow_array, $filter.as_boolean_array()) + let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) .context(crate::error::ArrowComputeSnafu)?; Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) }}; @@ -33,9 +32,16 @@ pub(crate) use filter_non_constant; mod tests { use std::sync::Arc; + use common_time::{Date, DateTime}; + use crate::scalars::ScalarVector; + use crate::timestamp::{ + TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, + }; + use crate::types::WrapperType; + use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BooleanVector, ConstantVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, + BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, }; fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { @@ -105,7 +111,6 @@ mod tests { ($VectorType: ident, 
$ValueType: ident, $method: ident) => {{ use std::sync::Arc; - use common_time::$ValueType; use $crate::vectors::{$VectorType, VectorRef}; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -123,6 +128,18 @@ mod tests { fn test_filter_date_like() { impl_filter_date_like_test!(DateVector, Date, new); impl_filter_date_like_test!(DateTimeVector, DateTime, new); - impl_filter_date_like_test!(TimestampVector, Timestamp, from_millis); + + impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); + impl_filter_date_like_test!( + TimestampMillisecondVector, + TimestampMillisecond, + from_native + ); + impl_filter_date_like_test!( + TimestampMicrosecondVector, + TimestampMicrosecond, + from_native + ); + impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); } } diff --git a/src/datatypes/src/vectors/operations/find_unique.rs b/src/datatypes/src/vectors/operations/find_unique.rs index d63a3c66b9..7116a9e90d 100644 --- a/src/datatypes/src/vectors/operations/find_unique.rs +++ b/src/datatypes/src/vectors/operations/find_unique.rs @@ -15,7 +15,8 @@ use common_base::BitVec; use crate::scalars::ScalarVector; -use crate::vectors::{ConstantVector, NullVector, Vector}; +use crate::vectors::constant::ConstantVector; +use crate::vectors::{NullVector, Vector}; // To implement `find_unique()` correctly, we need to keep in mind that always marks an element as // selected when it is different from the previous one, and leaves the `selected` unchanged @@ -70,7 +71,7 @@ pub(crate) fn find_unique_null( return; } - let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true); + let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); if is_first_not_duplicate { selected.set(0, true); } @@ -104,8 +105,11 @@ pub(crate) fn find_unique_constant( mod tests { use std::sync::Arc; + use common_time::{Date, DateTime}; + use super::*; - use crate::vectors::{Int32Vector, 
StringVector, VectorOp}; + use crate::timestamp::*; + use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; fn check_bitmap(expect: &[bool], selected: &BitVec) { let actual = selected.iter().collect::>(); @@ -121,7 +125,7 @@ mod tests { input: impl Iterator>, prev: Option<&[i32]>, ) { - let input = Int32Vector::from_iter(input); + let input = Int32Vector::from(input.collect::>()); let prev = prev.map(Int32Vector::from_slice); let mut selected = BitVec::repeat(false, input.len()); @@ -341,7 +345,6 @@ mod tests { macro_rules! impl_find_unique_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); @@ -356,6 +359,9 @@ mod tests { fn test_find_unique_date_like() { impl_find_unique_date_like_test!(DateVector, Date, new); impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); - impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis); + impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); + impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); + impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); + impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); } } diff --git a/src/datatypes/src/vectors/operations/replicate.rs b/src/datatypes/src/vectors/operations/replicate.rs index 7fb93134ed..8216517fc6 100644 --- a/src/datatypes/src/vectors/operations/replicate.rs +++ b/src/datatypes/src/vectors/operations/replicate.rs @@ -13,12 +13,8 @@ // limitations under the License. 
use crate::prelude::*; -pub(crate) use crate::vectors::constant::replicate_constant; -pub(crate) use crate::vectors::date::replicate_date; -pub(crate) use crate::vectors::datetime::replicate_datetime; pub(crate) use crate::vectors::null::replicate_null; pub(crate) use crate::vectors::primitive::replicate_primitive; -pub(crate) use crate::vectors::timestamp::replicate_timestamp; pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { assert_eq!(offsets.len(), c.len()); @@ -43,8 +39,13 @@ pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> Vec mod tests { use std::sync::Arc; + use common_time::timestamp::TimeUnit; + use common_time::{Date, DateTime, Timestamp}; + use paste::paste; + use super::*; - use crate::vectors::{ConstantVector, Int32Vector, NullVector, StringVector, VectorOp}; + use crate::vectors::constant::ConstantVector; + use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; #[test] fn test_replicate_primitive() { @@ -120,7 +121,6 @@ mod tests { macro_rules! impl_replicate_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ - use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -138,10 +138,33 @@ mod tests { }}; } + macro_rules! 
impl_replicate_timestamp_test { + ($unit: ident) => {{ + paste!{ + use $crate::vectors::[]; + use $crate::timestamp::[]; + let v = []::from_iterator((0..5).map([]::from)); + let offsets = [0, 1, 2, 3, 4]; + let v = v.replicate(&offsets); + assert_eq!(4, v.len()); + for i in 0..4 { + assert_eq!( + Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)), + v.get(i) + ); + } + } + }}; + } + #[test] fn test_replicate_date_like() { impl_replicate_date_like_test!(DateVector, Date, new); impl_replicate_date_like_test!(DateTimeVector, DateTime, new); - impl_replicate_date_like_test!(TimestampVector, Timestamp, from_millis); + + impl_replicate_timestamp_test!(Second); + impl_replicate_timestamp_test!(Millisecond); + impl_replicate_timestamp_test!(Microsecond); + impl_replicate_timestamp_test!(Nanosecond); } } diff --git a/src/datatypes/src/vectors/primitive.rs b/src/datatypes/src/vectors/primitive.rs index c49295630c..7829c31731 100644 --- a/src/datatypes/src/vectors/primitive.rs +++ b/src/datatypes/src/vectors/primitive.rs @@ -13,75 +13,111 @@ // limitations under the License. 
use std::any::Any; -use std::iter::FromIterator; -use std::slice::Iter; +use std::fmt; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, MutableArray, MutablePrimitiveArray, PrimitiveArray}; -use arrow::bitmap::utils::ZipValidity; +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder, +}; use serde_json::Value as JsonValue; -use snafu::{OptionExt, ResultExt}; +use snafu::OptionExt; -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{ConversionSnafu, Result, SerializeSnafu}; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; -use crate::types::{Primitive, PrimitiveElement}; +use crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, +}; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; +pub type UInt8Vector = PrimitiveVector; +pub type UInt16Vector = PrimitiveVector; +pub type UInt32Vector = PrimitiveVector; +pub type UInt64Vector = PrimitiveVector; + +pub type Int8Vector = PrimitiveVector; +pub type Int16Vector = PrimitiveVector; +pub type Int32Vector = PrimitiveVector; +pub type Int64Vector = PrimitiveVector; + +pub type Float32Vector = PrimitiveVector; +pub type Float64Vector = PrimitiveVector; + /// Vector for primitive data types. 
-#[derive(Debug, Clone, PartialEq)] -pub struct PrimitiveVector { - pub(crate) array: PrimitiveArray, +pub struct PrimitiveVector { + array: PrimitiveArray, } -impl PrimitiveVector { - pub fn new(array: PrimitiveArray) -> Self { +impl PrimitiveVector { + pub fn new(array: PrimitiveArray) -> Self { Self { array } } pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .clone(), - )) + let data = array + .as_ref() + .as_any() + .downcast_ref::>() + .with_context(|| error::ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })? + .data() + .clone(); + let concrete_array = PrimitiveArray::::from(data); + Ok(Self::new(concrete_array)) } - pub fn from_slice>(slice: P) -> Self { + pub fn from_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied(); Self { - array: PrimitiveArray::from_slice(slice), + array: PrimitiveArray::from_iter_values(iter), } } - pub fn from_vec(array: Vec) -> Self { + pub fn from_wrapper_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied().map(WrapperType::into_native); Self { - array: PrimitiveArray::from_vec(array), + array: PrimitiveArray::from_iter_values(iter), } } - pub fn from_values>(iter: I) -> Self { + pub fn from_vec(array: Vec) -> Self { Self { - array: PrimitiveArray::from_values(iter), + array: PrimitiveArray::from_iter_values(array), } } - pub(crate) fn as_arrow(&self) -> &dyn Array { + pub fn from_values>(iter: I) -> Self { + Self { + array: PrimitiveArray::from_iter_values(iter), + } + } + + pub(crate) fn as_arrow(&self) -> &PrimitiveArray { &self.array } - fn slice(&self, offset: usize, length: usize) -> Self { - Self::from(self.array.slice(offset, length)) + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: 
PrimitiveArray::from(data), + } + } + + // To distinguish with `Vector::slice()`. + fn get_slice(&self, offset: usize, length: usize) -> Self { + let data = self.array.data().slice(offset, length); + Self::from_array_data(data) } } -impl Vector for PrimitiveVector { +impl Vector for PrimitiveVector { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -99,11 +135,13 @@ impl Vector for PrimitiveVector { } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(PrimitiveArray::::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(PrimitiveArray::::from(data)) } fn validity(&self) -> Validity { @@ -111,7 +149,11 @@ impl Vector for PrimitiveVector { } fn memory_size(&self) -> usize { - self.array.values().len() * std::mem::size_of::() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -119,57 +161,80 @@ impl Vector for PrimitiveVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(self.slice(offset, length)) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { - vectors::impl_get_for_vector!(self.array, index) + if self.array.is_valid(index) { + // Safety: The index have been checked by `is_valid()`. + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() + } else { + Value::Null + } } fn get_ref(&self, index: usize) -> ValueRef { if self.array.is_valid(index) { // Safety: The index have been checked by `is_valid()`. 
- unsafe { self.array.value_unchecked(index).into_value_ref() } + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() } else { ValueRef::Null } } } -impl From> for PrimitiveVector { - fn from(array: PrimitiveArray) -> Self { +impl fmt::Debug for PrimitiveVector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("PrimitiveVector") + .field("array", &self.array) + .finish() + } +} + +impl From> for PrimitiveVector { + fn from(array: PrimitiveArray) -> Self { Self { array } } } -impl From>> for PrimitiveVector { - fn from(v: Vec>) -> Self { +impl From>> for PrimitiveVector { + fn from(v: Vec>) -> Self { Self { - array: PrimitiveArray::::from(v), + array: PrimitiveArray::from_iter(v), } } } -impl>> FromIterator for PrimitiveVector { - fn from_iter>(iter: I) -> Self { - Self { - array: MutablePrimitiveArray::::from_iter(iter).into(), - } +pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> { + iter: ArrayIter<&'a PrimitiveArray>, +} + +impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> { + type Item = Option; + + fn next(&mut self) -> Option> { + self.iter + .next() + .map(|item| item.map(T::Wrapper::from_native)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() } } -impl ScalarVector for PrimitiveVector -where - T: PrimitiveElement, -{ - type OwnedItem = T; - type RefItem<'a> = T; +impl ScalarVector for PrimitiveVector { + type OwnedItem = T::Wrapper; + type RefItem<'a> = T::Wrapper; type Iter<'a> = PrimitiveIter<'a, T>; type Builder = PrimitiveVectorBuilder; fn get_data(&self, idx: usize) -> Option> { if self.array.is_valid(idx) { - Some(self.array.value(idx)) + Some(T::Wrapper::from_native(self.array.value(idx))) } else { None } @@ -182,59 +247,47 @@ where } } -pub type UInt8Vector = PrimitiveVector; -pub type UInt16Vector = PrimitiveVector; -pub type UInt32Vector = PrimitiveVector; -pub type UInt64Vector = PrimitiveVector; - -pub type Int8Vector = 
PrimitiveVector; -pub type Int16Vector = PrimitiveVector; -pub type Int32Vector = PrimitiveVector; -pub type Int64Vector = PrimitiveVector; - -pub type Float32Vector = PrimitiveVector; -pub type Float64Vector = PrimitiveVector; - -pub struct PrimitiveIter<'a, T> { - iter: ZipValidity<'a, &'a T, Iter<'a, T>>, -} - -impl<'a, T: Copy> Iterator for PrimitiveIter<'a, T> { - type Item = Option; - - fn next(&mut self) -> Option> { - self.iter.next().map(|v| v.copied()) - } -} - -impl Serializable for PrimitiveVector { +impl Serializable for PrimitiveVector { fn serialize_to_json(&self) -> Result> { - self.array - .iter() - .map(serde_json::to_value) - .collect::>() - .context(SerializeSnafu) + let res = self + .iter_data() + .map(|v| match v { + None => serde_json::Value::Null, + // use WrapperType's Into bound instead of + // serde_json::to_value to facilitate customized serialization + // for WrapperType + Some(v) => v.into(), + }) + .collect::>(); + Ok(res) } } -pub struct PrimitiveVectorBuilder { - pub(crate) mutable_array: MutablePrimitiveArray, +impl PartialEq for PrimitiveVector { + fn eq(&self, other: &PrimitiveVector) -> bool { + self.array == other.array + } } -pub type UInt8VectorBuilder = PrimitiveVectorBuilder; -pub type UInt16VectorBuilder = PrimitiveVectorBuilder; -pub type UInt32VectorBuilder = PrimitiveVectorBuilder; -pub type UInt64VectorBuilder = PrimitiveVectorBuilder; +pub type UInt8VectorBuilder = PrimitiveVectorBuilder; +pub type UInt16VectorBuilder = PrimitiveVectorBuilder; +pub type UInt32VectorBuilder = PrimitiveVectorBuilder; +pub type UInt64VectorBuilder = PrimitiveVectorBuilder; -pub type Int8VectorBuilder = PrimitiveVectorBuilder; -pub type Int16VectorBuilder = PrimitiveVectorBuilder; -pub type Int32VectorBuilder = PrimitiveVectorBuilder; -pub type Int64VectorBuilder = PrimitiveVectorBuilder; +pub type Int8VectorBuilder = PrimitiveVectorBuilder; +pub type Int16VectorBuilder = PrimitiveVectorBuilder; +pub type Int32VectorBuilder = 
PrimitiveVectorBuilder; +pub type Int64VectorBuilder = PrimitiveVectorBuilder; -pub type Float32VectorBuilder = PrimitiveVectorBuilder; -pub type Float64VectorBuilder = PrimitiveVectorBuilder; +pub type Float32VectorBuilder = PrimitiveVectorBuilder; +pub type Float64VectorBuilder = PrimitiveVectorBuilder; -impl MutableVector for PrimitiveVectorBuilder { +/// Builder to build a primitive vector. +pub struct PrimitiveVectorBuilder { + mutable_array: PrimitiveBuilder, +} + +impl MutableVector for PrimitiveVectorBuilder { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -257,81 +310,62 @@ impl MutableVector for PrimitiveVectorBuilder { fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { let primitive = T::cast_value_ref(value)?; - self.mutable_array.push(primitive); + match primitive { + Some(v) => self.mutable_array.append_value(v.into_native()), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { let primitive = T::cast_vector(vector)?; // Slice the underlying array to avoid creating a new Arc. 
- let slice = primitive.slice(offset, length); - self.mutable_array.extend_trusted_len(slice.iter()); + let slice = primitive.get_slice(offset, length); + for v in slice.iter_data() { + self.push(v); + } Ok(()) } } impl ScalarVectorBuilder for PrimitiveVectorBuilder where - T: Scalar> + PrimitiveElement, - for<'a> T: ScalarRef<'a, ScalarType = T, VectorType = PrimitiveVector>, - for<'a> T: Scalar = T>, + T: LogicalPrimitiveType, + T::Wrapper: Scalar>, + for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>, + for<'a> T::Wrapper: Scalar = T::Wrapper>, { type VectorType = PrimitiveVector; fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutablePrimitiveArray::with_capacity(capacity), + mutable_array: PrimitiveBuilder::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array.push(value); + self.mutable_array + .append_option(value.map(|v| v.into_native())); } fn finish(&mut self) -> Self::VectorType { PrimitiveVector { - array: std::mem::take(&mut self.mutable_array).into(), + array: self.mutable_array.finish(), } } } -impl PrimitiveVectorBuilder { - fn with_type_capacity(data_type: ConcreteDataType, capacity: usize) -> Self { - Self { - mutable_array: MutablePrimitiveArray::with_capacity_from( - capacity, - data_type.as_arrow_type(), - ), - } - } -} - -pub(crate) fn replicate_primitive( +pub(crate) fn replicate_primitive( vector: &PrimitiveVector, offsets: &[usize], -) -> VectorRef { - Arc::new(replicate_primitive_with_type( - vector, - offsets, - T::build_data_type(), - )) -} - -pub(crate) fn replicate_primitive_with_type( - vector: &PrimitiveVector, - offsets: &[usize], - data_type: ConcreteDataType, ) -> PrimitiveVector { assert_eq!(offsets.len(), vector.len()); if offsets.is_empty() { - return vector.slice(0, 0); + return vector.get_slice(0, 0); } - let mut builder = PrimitiveVectorBuilder::::with_type_capacity( - data_type, - *offsets.last().unwrap() as usize, - ); + let mut builder = 
PrimitiveVectorBuilder::::with_capacity(*offsets.last().unwrap() as usize); let mut previous_offset = 0; @@ -339,14 +373,15 @@ pub(crate) fn replicate_primitive_with_type( let repeat_times = *offset - previous_offset; match value { Some(data) => { - builder.mutable_array.extend_trusted_len( - std::iter::repeat(*data) - .take(repeat_times) - .map(Option::Some), - ); + unsafe { + // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen. + builder + .mutable_array + .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times)); + } } None => { - builder.mutable_array.extend_constant(repeat_times, None); + builder.mutable_array.append_nulls(repeat_times); } } previous_offset = *offset; @@ -356,6 +391,7 @@ pub(crate) fn replicate_primitive_with_type( #[cfg(test)] mod tests { + use arrow::array::Int32Array; use arrow::datatypes::DataType as ArrowDataType; use serde_json; @@ -364,11 +400,11 @@ mod tests { use crate::serialize::Serializable; use crate::types::Int64Type; - fn check_vec(v: PrimitiveVector) { + fn check_vec(v: Int32Vector) { assert_eq!(4, v.len()); assert_eq!("Int32Vector", v.vector_type_name()); assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); + assert!(v.validity().is_all_valid()); assert!(!v.only_null()); for i in 0..4 { @@ -387,26 +423,26 @@ mod tests { #[test] fn test_from_values() { - let v = PrimitiveVector::::from_values(vec![1, 2, 3, 4]); + let v = Int32Vector::from_values(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_from_vec() { - let v = PrimitiveVector::::from_vec(vec![1, 2, 3, 4]); + let v = Int32Vector::from_vec(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_from_slice() { - let v = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); + let v = Int32Vector::from_slice(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_serialize_primitive_vector_with_null_to_json() { let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); 
+ let mut builder = Int32VectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } @@ -421,15 +457,15 @@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]); - let v = PrimitiveVector::from(arrow_array); + let arrow_array = Int32Array::from(vec![1, 2, 3, 4]); + let v = Int32Vector::from(arrow_array); check_vec(v); } #[test] fn test_primitive_vector_build_get() { let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + let mut builder = Int32VectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } @@ -448,29 +484,28 @@ mod tests { #[test] fn test_primitive_vector_validity() { let input = [Some(1i32), Some(2i32), None, None]; - let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + let mut builder = Int32VectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } let vector = builder.finish(); assert_eq!(2, vector.null_count()); let validity = vector.validity(); - let slots = validity.slots().unwrap(); - assert_eq!(2, slots.null_count()); - assert!(!slots.get_bit(2)); - assert!(!slots.get_bit(3)); + assert_eq!(2, validity.null_count()); + assert!(!validity.is_set(2)); + assert!(!validity.is_set(3)); - let vector = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); + let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]); assert_eq!(0, vector.null_count()); - assert_eq!(Validity::AllValid, vector.validity()); + assert!(vector.validity().is_all_valid()); } #[test] fn test_memory_size() { - let v = PrimitiveVector::::from_slice((0..5).collect::>()); - assert_eq!(20, v.memory_size()); - let v = PrimitiveVector::::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); - assert_eq!(40, v.memory_size()); + let v = Int32Vector::from_slice((0..5).collect::>()); + assert_eq!(64, v.memory_size()); + let v = Int64Vector::from(vec![Some(0i64), Some(1i64), 
Some(2i64), None, None]); + assert_eq!(128, v.memory_size()); } #[test] @@ -489,4 +524,29 @@ mod tests { let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9])); assert_eq!(expect, vector); } + + #[test] + fn test_from_wrapper_slice() { + macro_rules! test_from_wrapper_slice { + ($vec: ident, $ty: ident) => { + let from_wrapper_slice = $vec::from_wrapper_slice(&[ + $ty::from_native($ty::MAX), + $ty::from_native($ty::MIN), + ]); + let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]); + assert_eq!(from_wrapper_slice, from_slice); + }; + } + + test_from_wrapper_slice!(UInt8Vector, u8); + test_from_wrapper_slice!(Int8Vector, i8); + test_from_wrapper_slice!(UInt16Vector, u16); + test_from_wrapper_slice!(Int16Vector, i16); + test_from_wrapper_slice!(UInt32Vector, u32); + test_from_wrapper_slice!(Int32Vector, i32); + test_from_wrapper_slice!(UInt64Vector, u64); + test_from_wrapper_slice!(Int64Vector, i64); + test_from_wrapper_slice!(Float32Vector, f32); + test_from_wrapper_slice!(Float64Vector, f64); + } } diff --git a/src/datatypes/src/vectors/string.rs b/src/datatypes/src/vectors/string.rs index 638b04dd3e..252116b3b2 100644 --- a/src/datatypes/src/vectors/string.rs +++ b/src/datatypes/src/vectors/string.rs @@ -15,22 +15,19 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, MutableArray, Utf8ValuesIter}; -use arrow::bitmap::utils::ZipValidity; -use serde_json::Value as JsonValue; -use snafu::{OptionExt, ResultExt}; +use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; use crate::arrow_array::{MutableStringArray, StringArray}; use crate::data_type::ConcreteDataType; -use crate::error::{Result, SerializeSnafu}; +use crate::error::{self, Result}; use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; -use crate::types::StringType; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; -/// 
String array wrapper -#[derive(Debug, Clone, PartialEq)] +/// Vector of strings. +#[derive(Debug, PartialEq)] pub struct StringVector { array: StringArray, } @@ -39,6 +36,16 @@ impl StringVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: StringArray::from(data), + } + } } impl From for StringVector { @@ -50,19 +57,7 @@ impl From for StringVector { impl From>> for StringVector { fn from(data: Vec>) -> Self { Self { - array: StringArray::from(data), - } - } -} - -impl From> for StringVector { - fn from(data: Vec) -> Self { - Self { - array: StringArray::from( - data.into_iter() - .map(Option::Some) - .collect::>>(), - ), + array: StringArray::from_iter(data), } } } @@ -70,7 +65,31 @@ impl From> for StringVector { impl From>> for StringVector { fn from(data: Vec>) -> Self { Self { - array: StringArray::from(data), + array: StringArray::from_iter(data), + } + } +} + +impl From<&[Option]> for StringVector { + fn from(data: &[Option]) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From<&[Option<&str>]> for StringVector { + fn from(data: &[Option<&str>]) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From> for StringVector { + fn from(data: Vec) -> Self { + Self { + array: StringArray::from_iter(data.into_iter().map(Some)), } } } @@ -78,18 +97,14 @@ impl From>> for StringVector { impl From> for StringVector { fn from(data: Vec<&str>) -> Self { Self { - array: StringArray::from( - data.into_iter() - .map(Option::Some) - .collect::>>(), - ), + array: StringArray::from_iter(data.into_iter().map(Some)), } } } impl Vector for StringVector { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::String(StringType::default()) + ConcreteDataType::string_datatype() } fn vector_type_name(&self) -> String { @@ -105,11 +120,13 @@ impl Vector for StringVector 
{ } fn to_arrow_array(&self) -> ArrayRef { - Arc::new(self.array.clone()) + let data = self.to_array_data(); + Arc::new(StringArray::from(data)) } fn to_boxed_arrow_array(&self) -> Box { - Box::new(self.array.clone()) + let data = self.to_array_data(); + Box::new(StringArray::from(data)) } fn validity(&self) -> Validity { @@ -117,7 +134,11 @@ impl Vector for StringVector { } fn memory_size(&self) -> usize { - self.len() * std::mem::size_of::() + self.array.values().len() + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() } fn is_null(&self, row: usize) -> bool { @@ -125,7 +146,8 @@ impl Vector for StringVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self::from(self.array.slice(offset, length))) + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) } fn get(&self, index: usize) -> Value { @@ -140,7 +162,7 @@ impl Vector for StringVector { impl ScalarVector for StringVector { type OwnedItem = String; type RefItem<'a> = &'a str; - type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>; + type Iter<'a> = ArrayIter<&'a StringArray>; type Builder = StringVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -157,7 +179,7 @@ impl ScalarVector for StringVector { } pub struct StringVectorBuilder { - buffer: MutableStringArray, + mutable_array: MutableStringArray, } impl MutableVector for StringVectorBuilder { @@ -166,7 +188,7 @@ impl MutableVector for StringVectorBuilder { } fn len(&self) -> usize { - self.buffer.len() + self.mutable_array.len() } fn as_any(&self) -> &dyn Any { @@ -182,12 +204,15 @@ impl MutableVector for StringVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - self.buffer.push(value.as_string()?); + match value.as_string()? 
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self.buffer, vector, StringVector, offset, length) + vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length) } } @@ -196,30 +221,30 @@ impl ScalarVectorBuilder for StringVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - buffer: MutableStringArray::with_capacity(capacity), + mutable_array: MutableStringArray::with_capacity(capacity, 0), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer.push(value) + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } } fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: std::mem::take(&mut self.buffer).into(), + StringVector { + array: self.mutable_array.finish(), } } } impl Serializable for StringVector { - fn serialize_to_json(&self) -> crate::error::Result> { + fn serialize_to_json(&self) -> Result> { self.iter_data() - .map(|v| match v { - None => Ok(serde_json::Value::Null), - Some(s) => serde_json::to_value(s), - }) + .map(serde_json::to_value) .collect::>() - .context(SerializeSnafu) + .context(error::SerializeSnafu) } } @@ -227,60 +252,9 @@ vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector); #[cfg(test)] mod tests { - use arrow::datatypes::DataType as ArrowDataType; - use serde_json; + use arrow::datatypes::DataType; use super::*; - use crate::data_type::DataType; - - #[test] - fn test_string_vector_misc() { - let strs = vec!["hello", "greptime", "rust"]; - let v = StringVector::from(strs.clone()); - assert_eq!(3, v.len()); - assert_eq!("StringVector", v.vector_type_name()); - assert!(!v.is_const()); - assert_eq!(Validity::AllValid, v.validity()); - assert!(!v.only_null()); - assert_eq!(41, v.memory_size()); - - for (i, s) in 
strs.iter().enumerate() { - assert_eq!(Value::from(*s), v.get(i)); - assert_eq!(ValueRef::from(*s), v.get_ref(i)); - assert_eq!(Value::from(*s), v.try_get(i).unwrap()); - } - - let arrow_arr = v.to_arrow_array(); - assert_eq!(3, arrow_arr.len()); - assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_string_vector() { - let mut builder = StringVectorBuilder::with_capacity(3); - builder.push(Some("hello")); - builder.push(None); - builder.push(Some("world")); - let string_vector = builder.finish(); - let serialized = - serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["hello",null,"world"]"#, serialized); - } - - #[test] - fn test_from_arrow_array() { - let mut builder = MutableStringArray::new(); - builder.push(Some("A")); - builder.push(Some("B")); - builder.push::<&str>(None); - builder.push(Some("D")); - let string_array: StringArray = builder.into(); - let vector = StringVector::from(string_array); - assert_eq!( - r#"["A","B",null,"D"]"#, - serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), - ); - } #[test] fn test_string_vector_build_get() { @@ -310,7 +284,7 @@ mod tests { #[test] fn test_string_vector_builder() { - let mut builder = StringType::default().create_mutable_vector(3); + let mut builder = StringVectorBuilder::with_capacity(3); builder.push_value_ref(ValueRef::String("hello")).unwrap(); assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); @@ -324,4 +298,73 @@ mod tests { let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"])); assert_eq!(expect, vector); } + + #[test] + fn test_string_vector_misc() { + let strs = vec!["hello", "greptime", "rust"]; + let v = StringVector::from(strs.clone()); + assert_eq!(3, v.len()); + assert_eq!("StringVector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_valid()); + assert!(!v.only_null()); + assert_eq!(128, v.memory_size()); + + for (i, s) 
in strs.iter().enumerate() { + assert_eq!(Value::from(*s), v.get(i)); + assert_eq!(ValueRef::from(*s), v.get_ref(i)); + assert_eq!(Value::from(*s), v.try_get(i).unwrap()); + } + + let arrow_arr = v.to_arrow_array(); + assert_eq!(3, arrow_arr.len()); + assert_eq!(&DataType::Utf8, arrow_arr.data_type()); + } + + #[test] + fn test_serialize_string_vector() { + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let string_vector = builder.finish(); + let serialized = + serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["hello",null,"world"]"#, serialized); + } + + #[test] + fn test_from_arrow_array() { + let mut builder = MutableStringArray::new(); + builder.append_option(Some("A")); + builder.append_option(Some("B")); + builder.append_null(); + builder.append_option(Some("D")); + let string_array: StringArray = builder.finish(); + let vector = StringVector::from(string_array); + assert_eq!( + r#"["A","B",null,"D"]"#, + serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), + ); + } + + #[test] + fn test_from_non_option_string() { + let nul = String::from_utf8(vec![0]).unwrap(); + let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()]; + let vector = StringVector::from(corpus); + let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized); + + let corpus = vec![ + "🀀🀀🀀".to_string(), + "🀁🀁🀁".to_string(), + "🀂🀂🀂".to_string(), + "🀃🀃🀃".to_string(), + "🀆🀆".to_string(), + ]; + let vector = StringVector::from(corpus); + let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized); + } } diff --git a/src/datatypes/src/vectors/timestamp.rs b/src/datatypes/src/vectors/timestamp.rs index 62b8332c89..5d9f7f2ed1 100644 --- a/src/datatypes/src/vectors/timestamp.rs 
+++ b/src/datatypes/src/vectors/timestamp.rs @@ -12,308 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::any::Any; -use std::sync::Arc; - -use arrow::array::{Array, ArrayRef, PrimitiveArray}; -use common_time::timestamp::{TimeUnit, Timestamp}; -use snafu::OptionExt; - -use crate::data_type::{ConcreteDataType, DataType}; -use crate::error; -use crate::error::Result; -use crate::prelude::{ - MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, +use crate::types::{ + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, }; -use crate::serialize::Serializable; -use crate::types::TimestampType; -use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; -/// `TimestampVector` stores timestamp in millisecond since UNIX Epoch. -#[derive(Debug, Clone, PartialEq)] -pub struct TimestampVector { - array: PrimitiveVector, -} +pub type TimestampSecondVector = PrimitiveVector; +pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder; -impl TimestampVector { - pub fn new(array: PrimitiveArray) -> Self { - Self { - array: PrimitiveVector { array }, - } - } +pub type TimestampMillisecondVector = PrimitiveVector; +pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder; - pub fn try_from_arrow_array(array: impl AsRef) -> Result { - Ok(Self::new( - array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? 
- .clone(), - )) - } +pub type TimestampMicrosecondVector = PrimitiveVector; +pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder; - pub fn from_values>(iter: I) -> Self { - Self { - array: PrimitiveVector { - array: PrimitiveArray::from_values(iter), - }, - } - } - - pub(crate) fn as_arrow(&self) -> &dyn Array { - self.array.as_arrow() - } -} - -impl Vector for TimestampVector { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::timestamp_millis_datatype() - } - - fn vector_type_name(&self) -> String { - "TimestampVector".to_string() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn len(&self) -> usize { - self.array.len() - } - - fn to_arrow_array(&self) -> ArrayRef { - let validity = self.array.array.validity().cloned(); - let buffer = self.array.array.values().clone(); - Arc::new(PrimitiveArray::new( - TimestampType::new(TimeUnit::Millisecond).as_arrow_type(), - buffer, - validity, - )) - } - - fn to_boxed_arrow_array(&self) -> Box { - let validity = self.array.array.validity().cloned(); - let values = self.array.array.values().clone(); - Box::new(PrimitiveArray::new( - arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), - values, - validity, - )) - } - - fn validity(&self) -> Validity { - self.array.validity() - } - - fn memory_size(&self) -> usize { - self.array.memory_size() - } - - fn is_null(&self, row: usize) -> bool { - self.array.is_null(row) - } - - fn slice(&self, offset: usize, length: usize) -> VectorRef { - Arc::new(Self { - array: PrimitiveVector { - array: self.array.array.slice(offset, length), - }, - }) - } - - fn get(&self, index: usize) -> Value { - match self.array.get(index) { - Value::Null => Value::Null, - Value::Int64(v) => Value::Timestamp(Timestamp::from_millis(v)), - _ => { - unreachable!() - } - } - } - - fn get_ref(&self, index: usize) -> ValueRef { - match self.array.get(index) { - Value::Int64(v) => ValueRef::Timestamp(Timestamp::from_millis(v)), - Value::Null 
=> ValueRef::Null, - _ => unreachable!(), - } - } -} - -impl Serializable for TimestampVector { - fn serialize_to_json(&self) -> Result> { - Ok(self - .array - .iter_data() - .map(|v| match v { - None => serde_json::Value::Null, - Some(v) => v.into(), - }) - .collect::>()) - } -} - -impl ScalarVector for TimestampVector { - type OwnedItem = Timestamp; - type RefItem<'a> = Timestamp; - type Iter<'a> = TimestampDataIter<'a>; - type Builder = TimestampVectorBuilder; - - fn get_data(&self, idx: usize) -> Option> { - self.array.get_data(idx).map(Timestamp::from_millis) - } - - fn iter_data(&self) -> Self::Iter<'_> { - TimestampDataIter { - iter: self.array.iter_data(), - } - } -} - -pub struct TimestampDataIter<'a> { - iter: PrimitiveIter<'a, i64>, -} - -impl<'a> Iterator for TimestampDataIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - self.iter.next().map(|v| v.map(Timestamp::from_millis)) - } -} - -pub struct TimestampVectorBuilder { - buffer: PrimitiveVectorBuilder, -} - -impl MutableVector for TimestampVectorBuilder { - fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::timestamp_millis_datatype() - } - - fn len(&self) -> usize { - self.buffer.len() - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn to_vector(&mut self) -> VectorRef { - Arc::new(self.finish()) - } - - fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - // TODO(hl): vector and vector builder should also support customized time unit. - self.buffer.push( - value - .as_timestamp()? 
- .map(|t| t.convert_to(TimeUnit::Millisecond)), - ); - Ok(()) - } - - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - let concrete_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to convert vector from {} to DateVector", - vector.vector_type_name() - ), - })?; - - self.buffer - .extend_slice_of(&concrete_vector.array, offset, length)?; - Ok(()) - } -} - -impl ScalarVectorBuilder for TimestampVectorBuilder { - type VectorType = TimestampVector; - - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: PrimitiveVectorBuilder::with_capacity(capacity), - } - } - - /// Pushes a Timestamp value into vector builder. The timestamp must be with time unit - /// `Second`/`MilliSecond`/`Microsecond`. - fn push(&mut self, value: Option<::RefItem<'_>>) { - self.buffer - .push(value.map(|v| v.convert_to(TimeUnit::Millisecond))); - } - - fn finish(&mut self) -> Self::VectorType { - Self::VectorType { - array: self.buffer.finish(), - } - } -} - -pub(crate) fn replicate_timestamp(vector: &TimestampVector, offsets: &[usize]) -> VectorRef { - let array = crate::vectors::primitive::replicate_primitive_with_type( - &vector.array, - offsets, - vector.data_type(), - ); - Arc::new(TimestampVector { array }) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - pub fn test_build_timestamp_vector() { - let mut builder = TimestampVectorBuilder::with_capacity(3); - builder.push(Some(Timestamp::new(1, TimeUnit::Second))); - builder.push(None); - builder.push(Some(Timestamp::new(2, TimeUnit::Millisecond))); - - let vector = builder.finish(); - assert_eq!( - ConcreteDataType::timestamp_millis_datatype(), - vector.data_type() - ); - assert_eq!(3, vector.len()); - assert_eq!( - Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)), - vector.get(0) - ); - - assert_eq!(Value::Null, vector.get(1)); - assert_eq!( - Value::Timestamp(Timestamp::new(2, 
TimeUnit::Millisecond)), - vector.get(2) - ); - - assert_eq!( - vec![ - Some(Timestamp::new(1000, TimeUnit::Millisecond)), - None, - Some(Timestamp::new(2, TimeUnit::Millisecond)), - ], - vector.iter_data().collect::>() - ); - } - - #[test] - fn test_timestamp_from_arrow() { - let vector = - TimestampVector::from_slice(&[Timestamp::from_millis(1), Timestamp::from_millis(2)]); - let arrow = vector.as_arrow().slice(0, vector.len()); - let vector2 = TimestampVector::try_from_arrow_array(&arrow).unwrap(); - assert_eq!(vector, vector2); - } -} +pub type TimestampNanosecondVector = PrimitiveVector; +pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder; diff --git a/src/datatypes2/src/vectors/validity.rs b/src/datatypes/src/vectors/validity.rs similarity index 100% rename from src/datatypes2/src/vectors/validity.rs rename to src/datatypes/src/vectors/validity.rs diff --git a/src/datatypes2/Cargo.toml b/src/datatypes2/Cargo.toml index 34941606d4..1a9ac2348b 100644 --- a/src/datatypes2/Cargo.toml +++ b/src/datatypes2/Cargo.toml @@ -9,10 +9,11 @@ default = [] test = [] [dependencies] +arrow = "26.0.0" common-base = { path = "../common/base" } common-error = { path = "../common/error" } common-time = { path = "../common/time" } -datafusion-common = "14.0" +datafusion-common = "14.0.0" enum_dispatch = "0.3" num = "0.4" num-traits = "0.2" @@ -21,4 +22,3 @@ paste = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" snafu = { version = "0.7", features = ["backtraces"] } -arrow = "26.0" diff --git a/src/datatypes2/src/arrow_array.rs b/src/datatypes2/src/arrow_array.rs index 7405c8a665..3444598ede 100644 --- a/src/datatypes2/src/arrow_array.rs +++ b/src/datatypes2/src/arrow_array.rs @@ -12,18 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use arrow::array::{ - Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, -}; +use arrow::array::{self, Array, ListArray, PrimitiveArray}; use arrow::datatypes::DataType; -use common_time::timestamp::TimeUnit; -use common_time::Timestamp; +use common_time::timestamp::Timestamp; use snafu::OptionExt; -use crate::data_type::ConcreteDataType; use crate::error::{ConversionSnafu, Result}; +use crate::prelude::ConcreteDataType; use crate::value::{ListValue, Value}; pub type BinaryArray = arrow::array::LargeBinaryArray; @@ -41,7 +36,6 @@ macro_rules! cast_array { }; } -// TODO(yingwen): Remove this function. pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { if array.is_null(idx) { return Ok(Value::Null); @@ -49,46 +43,42 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { let result = match array.data_type() { DataType::Null => Value::Null, - DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)), - DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()), - DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)), - DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)), - DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)), - DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)), - DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)), - DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)), - DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)), - DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)), - DataType::Float32 => Value::Float32(cast_array!(array, Float32Array).value(idx).into()), - DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()), 
- DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()), - DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()), - DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()), - DataType::Timestamp(t, _) => match t { - arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampSecondArray).value(idx), - TimeUnit::Second, - )), - arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx), - TimeUnit::Millisecond, - )), - arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx), - TimeUnit::Microsecond, - )), - arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new( - cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx), - TimeUnit::Nanosecond, - )), - }, + DataType::Boolean => Value::Boolean(cast_array!(array, array::BooleanArray).value(idx)), + DataType::Binary | DataType::LargeBinary => { + Value::Binary(cast_array!(array, BinaryArray).value(idx).into()) + } + DataType::Int8 => Value::Int8(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::Int16 => Value::Int16(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::Int32 => Value::Int32(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::Int64 => Value::Int64(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::UInt8 => Value::UInt8(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::UInt16 => Value::UInt16(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::UInt32 => Value::UInt32(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::UInt64 => Value::UInt64(cast_array!(array, PrimitiveArray::).value(idx)), + DataType::Float32 => { + Value::Float32(cast_array!(array, 
PrimitiveArray::).value(idx).into()) + } + DataType::Float64 => { + Value::Float64(cast_array!(array, PrimitiveArray::).value(idx).into()) + } + DataType::Utf8 | DataType::LargeUtf8 => { + Value::String(cast_array!(array, StringArray).value(idx).into()) + } + DataType::Timestamp(t, _) => { + let value = cast_array!(array, PrimitiveArray::).value(idx); + let unit = match ConcreteDataType::from_arrow_time_unit(t) { + ConcreteDataType::Timestamp(t) => t.unit, + _ => unreachable!(), + }; + Value::Timestamp(Timestamp::new(value, unit)) + } DataType::List(_) => { - let array = cast_array!(array, ListArray).value(idx); - let item_type = ConcreteDataType::try_from(array.data_type())?; + let array = cast_array!(array, ListArray::).value(idx); + let inner_datatype = ConcreteDataType::try_from(array.data_type())?; let values = (0..array.len()) .map(|i| arrow_array_get(&*array, i)) .collect::>>()?; - Value::List(ListValue::new(Some(Box::new(values)), item_type)) + Value::List(ListValue::new(Some(Box::new(values)), inner_datatype)) } _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), }; @@ -98,74 +88,45 @@ pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { #[cfg(test)] mod test { - use std::sync::Arc; - use arrow::array::{ BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + MutableListArray, MutablePrimitiveArray, TryExtend, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; - use arrow::datatypes::Int32Type; + use arrow::buffer::Buffer; + use arrow::datatypes::{DataType, TimeUnit as ArrowTimeUnit}; use common_time::timestamp::{TimeUnit, Timestamp}; - use paste::paste; use super::*; - use crate::data_type::ConcreteDataType; - use crate::types::TimestampType; - - macro_rules! 
test_arrow_array_get_for_timestamps { - ( $($unit: ident), *) => { - $( - paste! { - let mut builder = arrow::array::[]::builder(3); - builder.append_value(1); - builder.append_value(0); - builder.append_value(-1); - let ts_array = Arc::new(builder.finish()) as Arc; - let v = arrow_array_get(&ts_array, 1).unwrap(); - assert_eq!( - ConcreteDataType::Timestamp(TimestampType::$unit( - $crate::types::[]::default(), - )), - v.data_type() - ); - } - )* - }; - } - - #[test] - fn test_timestamp_array() { - test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond]; - } + use crate::prelude::Vector; + use crate::vectors::TimestampVector; #[test] fn test_arrow_array_access() { - let array1 = BooleanArray::from(vec![true, true, false, false]); + let array1 = BooleanArray::from_slice(vec![true, true, false, false]); assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int8Array::from(vec![1, 2, 3, 4]); + let array1 = Int8Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt8Array::from(vec![1, 2, 3, 4]); + let array1 = UInt8Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int16Array::from(vec![1, 2, 3, 4]); + let array1 = Int16Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt16Array::from(vec![1, 2, 3, 4]); + let array1 = UInt16Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Int32Array::from(vec![1, 2, 3, 4]); + let array1 = Int32Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = UInt32Array::from(vec![1, 2, 3, 4]); + let array1 = UInt32Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); - let array = Int64Array::from(vec![1, 2, 3, 
4]); + let array = Int64Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); - let array1 = UInt64Array::from(vec![1, 2, 3, 4]); + let array1 = UInt64Array::from_vec(vec![1, 2, 3, 4]); assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); - let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]); + let array1 = Float32Array::from_vec(vec![1f32, 2f32, 3f32, 4f32]); assert_eq!( Value::Float32(2f32.into()), arrow_array_get(&array1, 1).unwrap() ); - let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]); + let array1 = Float64Array::from_vec(vec![1f64, 2f64, 3f64, 4f64]); assert_eq!( Value::Float64(2f64.into()), arrow_array_get(&array1, 1).unwrap() @@ -178,42 +139,55 @@ mod test { ); assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); - let array3 = LargeBinaryArray::from(vec![ + let array3 = super::BinaryArray::from(vec![ Some("hello".as_bytes()), None, Some("world".as_bytes()), ]); + assert_eq!( + Value::Binary("hello".as_bytes().into()), + arrow_array_get(&array3, 0).unwrap() + ); assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); - let array = TimestampSecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second))); - let array = TimestampMillisecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); + let vector = TimestampVector::new(Int64Array::from_vec(vec![1, 2, 3, 4])); + let array = vector.to_boxed_arrow_array(); + let value = arrow_array_get(&*array, 1).unwrap(); assert_eq!( value, Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) ); - let array = TimestampMicrosecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); - assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond)) + + let array4 = PrimitiveArray::::from_data( + DataType::Timestamp(ArrowTimeUnit::Millisecond, None), + 
Buffer::from_slice(&vec![1, 2, 3, 4]), + None, ); - let array = TimestampNanosecondArray::from(vec![1, 2, 3]); - let value = arrow_array_get(&array, 1).unwrap(); assert_eq!( - value, - Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond)) + Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), + arrow_array_get(&array4, 0).unwrap() + ); + + let array4 = PrimitiveArray::::from_data( + DataType::Timestamp(ArrowTimeUnit::Nanosecond, None), + Buffer::from_slice(&vec![1, 2, 3, 4]), + None, + ); + assert_eq!( + Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), + arrow_array_get(&array4, 0).unwrap() ); // test list array let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(1i32), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let arrow_array = ListArray::from_iter_primitive::(data); + + let mut arrow_array = MutableListArray::>::new(); + arrow_array.try_extend(data).unwrap(); + let arrow_array: ListArray = arrow_array.into(); let v0 = arrow_array_get(&arrow_array, 0).unwrap(); match v0 { diff --git a/src/datatypes2/src/data_type.rs b/src/datatypes2/src/data_type.rs index 0d06d566b6..e14a3d8e84 100644 --- a/src/datatypes2/src/data_type.rs +++ b/src/datatypes2/src/data_type.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; +use arrow::datatypes::DataType as ArrowDataType; use common_time::timestamp::TimeUnit; use paste::paste; use serde::{Deserialize, Serialize}; @@ -23,14 +23,13 @@ use crate::error::{self, Error, Result}; use crate::type_id::LogicalTypeId; use crate::types::{ BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, + Int32Type, Int64Type, Int8Type, ListType, NullType, 
StringType, TimestampType, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, }; use crate::value::Value; use crate::vectors::MutableVector; -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[enum_dispatch::enum_dispatch(DataType)] pub enum ConcreteDataType { Null(NullType), @@ -48,21 +47,17 @@ pub enum ConcreteDataType { Float32(Float32Type), Float64(Float64Type), - // String types: + // String types Binary(BinaryType), String(StringType), - // Date types: Date(DateType), DateTime(DateTimeType), Timestamp(TimestampType), - // Compound types: List(ListType), } -// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method -// returning all these properties to the `DataType` trait impl ConcreteDataType { pub fn is_float(&self) -> bool { matches!( @@ -75,7 +70,7 @@ impl ConcreteDataType { matches!(self, ConcreteDataType::Boolean(_)) } - pub fn is_stringifiable(&self) -> bool { + pub fn stringifiable(&self) -> bool { matches!( self, ConcreteDataType::String(_) @@ -108,6 +103,13 @@ impl ConcreteDataType { ) } + pub fn is_timestamp(&self) -> bool { + matches!( + self, + ConcreteDataType::Timestamp(_) | ConcreteDataType::Int64(_) + ) + } + pub fn numerics() -> Vec { vec![ ConcreteDataType::int8_datatype(), @@ -159,7 +161,7 @@ impl TryFrom<&ArrowDataType> for ConcreteDataType { ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), ArrowDataType::List(field) => Self::List(ListType::new( - ConcreteDataType::from_arrow_type(field.data_type()), + ConcreteDataType::from_arrow_type(&field.data_type), )), _ => { return error::UnsupportedArrowTypeSnafu { @@ -189,52 +191,38 @@ macro_rules! 
impl_new_concrete_type_functions { impl_new_concrete_type_functions!( Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, - Binary, Date, DateTime, String + Binary, String, Date, DateTime ); impl ConcreteDataType { - pub fn timestamp_second_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) - } - - pub fn timestamp_millisecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Millisecond( - TimestampMillisecondType::default(), - )) - } - - pub fn timestamp_microsecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Microsecond( - TimestampMicrosecondType::default(), - )) - } - - pub fn timestamp_nanosecond_datatype() -> Self { - ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) + pub fn list_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { + ConcreteDataType::List(ListType::new(inner_type)) } pub fn timestamp_datatype(unit: TimeUnit) -> Self { - match unit { - TimeUnit::Second => Self::timestamp_second_datatype(), - TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), - TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), - TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), - } + ConcreteDataType::Timestamp(TimestampType::new(unit)) + } + + pub fn timestamp_millis_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::new(TimeUnit::Millisecond)) } /// Converts from arrow timestamp unit to - pub fn from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { + // TODO(hl): maybe impl From for our timestamp ? 
+ pub fn from_arrow_time_unit(t: &arrow::datatypes::TimeUnit) -> Self { match t { - ArrowTimeUnit::Second => Self::timestamp_second_datatype(), - ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), - ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), - ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), + arrow::datatypes::TimeUnit::Second => Self::timestamp_datatype(TimeUnit::Second), + arrow::datatypes::TimeUnit::Millisecond => { + Self::timestamp_datatype(TimeUnit::Millisecond) + } + arrow::datatypes::TimeUnit::Microsecond => { + Self::timestamp_datatype(TimeUnit::Microsecond) + } + arrow::datatypes::TimeUnit::Nanosecond => { + Self::timestamp_datatype(TimeUnit::Nanosecond) + } } } - - pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(item_type)) - } } /// Data type abstraction. @@ -249,15 +237,11 @@ pub trait DataType: std::fmt::Debug + Send + Sync { /// Returns the default value of this type. fn default_value(&self) -> Value; - /// Convert this type as [arrow::datatypes::DataType]. + /// Convert this type as [arrow2::datatypes::DataType]. fn as_arrow_type(&self) -> ArrowDataType; - /// Creates a mutable vector with given `capacity` of this type. + /// Create a mutable vector with given `capacity` of this type. fn create_mutable_vector(&self, capacity: usize) -> Box; - - /// Returns true if the data type is compatible with timestamp type so we can - /// use it as a timestamp. 
- fn is_timestamp_compatible(&self) -> bool; } pub type DataTypeRef = Arc; @@ -340,6 +324,10 @@ mod tests { ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), ConcreteDataType::String(_) )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), + ConcreteDataType::String(_) + )); assert_eq!( ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( "item", @@ -357,48 +345,31 @@ mod tests { #[test] fn test_from_arrow_timestamp() { assert_eq!( - ConcreteDataType::timestamp_millisecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) + ConcreteDataType::timestamp_millis_datatype(), + ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Millisecond) ); assert_eq!( - ConcreteDataType::timestamp_microsecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) + ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond), + ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Microsecond) ); assert_eq!( - ConcreteDataType::timestamp_nanosecond_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) + ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond), + ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Nanosecond) ); assert_eq!( - ConcreteDataType::timestamp_second_datatype(), - ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) + ConcreteDataType::timestamp_datatype(TimeUnit::Second), + ConcreteDataType::from_arrow_time_unit(&arrow::datatypes::TimeUnit::Second) ); } #[test] - fn test_is_timestamp_compatible() { - assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() - ); - assert!( - ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() - ); - assert!( - 
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() - ); - assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); - assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); - assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); + fn test_is_timestamp() { + assert!(ConcreteDataType::timestamp_millis_datatype().is_timestamp()); + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp()); + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp()); + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp()); + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp()); + assert!(ConcreteDataType::int64_datatype().is_timestamp()); } #[test] @@ -406,81 +377,4 @@ mod tests { assert!(ConcreteDataType::null_datatype().is_null()); assert!(!ConcreteDataType::int32_datatype().is_null()); } - - #[test] - fn test_is_float() { - assert!(!ConcreteDataType::int32_datatype().is_float()); - assert!(ConcreteDataType::float32_datatype().is_float()); - assert!(ConcreteDataType::float64_datatype().is_float()); - } - - #[test] 
- fn test_is_boolean() { - assert!(!ConcreteDataType::int32_datatype().is_boolean()); - assert!(!ConcreteDataType::float32_datatype().is_boolean()); - assert!(ConcreteDataType::boolean_datatype().is_boolean()); - } - - #[test] - fn test_is_stringifiable() { - assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); - assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); - assert!(ConcreteDataType::string_datatype().is_stringifiable()); - assert!(ConcreteDataType::date_datatype().is_stringifiable()); - assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); - } - - #[test] - fn test_is_signed() { - assert!(ConcreteDataType::int8_datatype().is_signed()); - assert!(ConcreteDataType::int16_datatype().is_signed()); - assert!(ConcreteDataType::int32_datatype().is_signed()); - assert!(ConcreteDataType::int64_datatype().is_signed()); - assert!(ConcreteDataType::date_datatype().is_signed()); - assert!(ConcreteDataType::datetime_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); - assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); - - assert!(!ConcreteDataType::uint8_datatype().is_signed()); - assert!(!ConcreteDataType::uint16_datatype().is_signed()); - assert!(!ConcreteDataType::uint32_datatype().is_signed()); - assert!(!ConcreteDataType::uint64_datatype().is_signed()); - - assert!(!ConcreteDataType::float32_datatype().is_signed()); - assert!(!ConcreteDataType::float64_datatype().is_signed()); - } - - #[test] - fn 
test_is_unsigned() { - assert!(!ConcreteDataType::int8_datatype().is_unsigned()); - assert!(!ConcreteDataType::int16_datatype().is_unsigned()); - assert!(!ConcreteDataType::int32_datatype().is_unsigned()); - assert!(!ConcreteDataType::int64_datatype().is_unsigned()); - assert!(!ConcreteDataType::date_datatype().is_unsigned()); - assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); - assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); - - assert!(ConcreteDataType::uint8_datatype().is_unsigned()); - assert!(ConcreteDataType::uint16_datatype().is_unsigned()); - assert!(ConcreteDataType::uint32_datatype().is_unsigned()); - assert!(ConcreteDataType::uint64_datatype().is_unsigned()); - - assert!(!ConcreteDataType::float32_datatype().is_unsigned()); - assert!(!ConcreteDataType::float64_datatype().is_unsigned()); - } - - #[test] - fn test_numerics() { - let nums = ConcreteDataType::numerics(); - assert_eq!(10, nums.len()); - } } diff --git a/src/datatypes2/src/lib.rs b/src/datatypes2/src/lib.rs index 256d347eac..f6f6db112a 100644 --- a/src/datatypes2/src/lib.rs +++ b/src/datatypes2/src/lib.rs @@ -23,7 +23,6 @@ pub mod prelude; mod scalars; pub mod schema; pub mod serialize; -mod timestamp; pub mod type_id; pub mod types; pub mod value; diff --git a/src/datatypes2/src/macros.rs b/src/datatypes2/src/macros.rs index 37c0a42e3f..18be9fa375 100644 --- a/src/datatypes2/src/macros.rs +++ b/src/datatypes2/src/macros.rs @@ -12,9 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Some helper macros for datatypes, copied from databend. +///! Some helper macros for datatypes, copied from databend. +#[macro_export] +macro_rules! 
for_all_scalar_types { + ($macro:tt $(, $x:tt)*) => { + $macro! { + [$($x),*], + { i8 }, + { i16 }, + { i32 }, + { i64 }, + { u8 }, + { u16 }, + { u32 }, + { u64 }, + { f32 }, + { f64 }, + { bool }, + } + }; +} -/// Apply the macro rules to all primitive types. #[macro_export] macro_rules! for_all_primitive_types { ($macro:tt $(, $x:tt)*) => { @@ -34,8 +52,6 @@ macro_rules! for_all_primitive_types { }; } -/// Match the logical type and apply `$body` to all primitive types and -/// `nbody` to other types. #[macro_export] macro_rules! with_match_primitive_type_id { ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ @@ -46,21 +62,17 @@ macro_rules! with_match_primitive_type_id { } use $crate::type_id::LogicalTypeId; - use $crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, - }; match $key_type { - LogicalTypeId::Int8 => __with_ty__! { Int8Type }, - LogicalTypeId::Int16 => __with_ty__! { Int16Type }, - LogicalTypeId::Int32 => __with_ty__! { Int32Type }, - LogicalTypeId::Int64 => __with_ty__! { Int64Type }, - LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, - LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, - LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, - LogicalTypeId::UInt64 => __with_ty__! { UInt64Type }, - LogicalTypeId::Float32 => __with_ty__! { Float32Type }, - LogicalTypeId::Float64 => __with_ty__! { Float64Type }, + LogicalTypeId::Int8 => __with_ty__! { i8 }, + LogicalTypeId::Int16 => __with_ty__! { i16 }, + LogicalTypeId::Int32 => __with_ty__! { i32 }, + LogicalTypeId::Int64 => __with_ty__! { i64 }, + LogicalTypeId::UInt8 => __with_ty__! { u8 }, + LogicalTypeId::UInt16 => __with_ty__! { u16 }, + LogicalTypeId::UInt32 => __with_ty__! { u32 }, + LogicalTypeId::UInt64 => __with_ty__! { u64 }, + LogicalTypeId::Float32 => __with_ty__! { f32 }, + LogicalTypeId::Float64 => __with_ty__! 
{ f64 }, _ => $nbody, } diff --git a/src/datatypes2/src/prelude.rs b/src/datatypes2/src/prelude.rs index f6bd298316..014a40efaf 100644 --- a/src/datatypes2/src/prelude.rs +++ b/src/datatypes2/src/prelude.rs @@ -16,5 +16,8 @@ pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; pub use crate::macros::*; pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; pub use crate::type_id::LogicalTypeId; +pub use crate::types::Primitive; pub use crate::value::{Value, ValueRef}; -pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef}; +pub use crate::vectors::{ + Helper as VectorHelper, MutableVector, Validity, Vector, VectorBuilder, VectorRef, +}; diff --git a/src/datatypes2/src/scalars.rs b/src/datatypes2/src/scalars.rs index 327ebaa629..ddb8eff007 100644 --- a/src/datatypes2/src/scalars.rs +++ b/src/datatypes2/src/scalars.rs @@ -14,17 +14,11 @@ use std::any::Any; -use common_time::{Date, DateTime}; +use common_time::{Date, DateTime, Timestamp}; -use crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, - UInt64Type, UInt8Type, -}; -use crate::value::{ListValue, ListValueRef, Value}; -use crate::vectors::{ - BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector, - PrimitiveVector, StringVector, Vector, -}; +use crate::prelude::*; +use crate::value::{ListValue, ListValueRef}; +use crate::vectors::*; fn get_iter_capacity>(iter: &I) -> usize { match iter.size_hint() { @@ -41,7 +35,7 @@ where for<'a> Self::VectorType: ScalarVector = Self::RefType<'a>>, { type VectorType: ScalarVector; - type RefType<'a>: ScalarRef<'a, ScalarType = Self> + type RefType<'a>: ScalarRef<'a, ScalarType = Self, VectorType = Self::VectorType> where Self: 'a; /// Get a reference of the current value. 
@@ -52,6 +46,7 @@ where } pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a { + type VectorType: ScalarVector = Self>; /// The corresponding [`Scalar`] type. type ScalarType: Scalar = Self>; @@ -68,7 +63,7 @@ where { type OwnedItem: Scalar; /// The reference item of this vector. - type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem> + type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem, VectorType = Self> where Self: 'a; @@ -142,46 +137,47 @@ pub trait ScalarVectorBuilder: MutableVector { fn finish(&mut self) -> Self::VectorType; } -macro_rules! impl_scalar_for_native { - ($Native: ident, $DataType: ident) => { - impl Scalar for $Native { - type VectorType = PrimitiveVector<$DataType>; - type RefType<'a> = $Native; +macro_rules! impl_primitive_scalar_type { + ($native:ident) => { + impl Scalar for $native { + type VectorType = PrimitiveVector<$native>; + type RefType<'a> = $native; #[inline] - fn as_scalar_ref(&self) -> $Native { + fn as_scalar_ref(&self) -> $native { *self } #[allow(clippy::needless_lifetimes)] #[inline] - fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native { + fn upcast_gat<'short, 'long: 'short>(long: $native) -> $native { long } } /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`]. 
- impl<'a> ScalarRef<'a> for $Native { - type ScalarType = $Native; + impl<'a> ScalarRef<'a> for $native { + type VectorType = PrimitiveVector<$native>; + type ScalarType = $native; #[inline] - fn to_owned_scalar(&self) -> $Native { + fn to_owned_scalar(&self) -> $native { *self } } }; } -impl_scalar_for_native!(u8, UInt8Type); -impl_scalar_for_native!(u16, UInt16Type); -impl_scalar_for_native!(u32, UInt32Type); -impl_scalar_for_native!(u64, UInt64Type); -impl_scalar_for_native!(i8, Int8Type); -impl_scalar_for_native!(i16, Int16Type); -impl_scalar_for_native!(i32, Int32Type); -impl_scalar_for_native!(i64, Int64Type); -impl_scalar_for_native!(f32, Float32Type); -impl_scalar_for_native!(f64, Float64Type); +impl_primitive_scalar_type!(u8); +impl_primitive_scalar_type!(u16); +impl_primitive_scalar_type!(u32); +impl_primitive_scalar_type!(u64); +impl_primitive_scalar_type!(i8); +impl_primitive_scalar_type!(i16); +impl_primitive_scalar_type!(i32); +impl_primitive_scalar_type!(i64); +impl_primitive_scalar_type!(f32); +impl_primitive_scalar_type!(f64); impl Scalar for bool { type VectorType = BooleanVector; @@ -200,6 +196,7 @@ impl Scalar for bool { } impl<'a> ScalarRef<'a> for bool { + type VectorType = BooleanVector; type ScalarType = bool; #[inline] @@ -224,6 +221,7 @@ impl Scalar for String { } impl<'a> ScalarRef<'a> for &'a str { + type VectorType = StringVector; type ScalarType = String; #[inline] @@ -248,6 +246,7 @@ impl Scalar for Vec { } impl<'a> ScalarRef<'a> for &'a [u8] { + type VectorType = BinaryVector; type ScalarType = Vec; #[inline] @@ -270,6 +269,7 @@ impl Scalar for Date { } impl<'a> ScalarRef<'a> for Date { + type VectorType = DateVector; type ScalarType = Date; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -291,6 +291,7 @@ impl Scalar for DateTime { } impl<'a> ScalarRef<'a> for DateTime { + type VectorType = DateTimeVector; type ScalarType = DateTime; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -298,7 +299,27 @@ impl<'a> ScalarRef<'a> for 
DateTime { } } -// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. +impl Scalar for Timestamp { + type VectorType = TimestampVector; + type RefType<'a> = Timestamp; + + fn as_scalar_ref(&self) -> Self::RefType<'_> { + *self + } + + fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { + long + } +} + +impl<'a> ScalarRef<'a> for Timestamp { + type VectorType = TimestampVector; + type ScalarType = Timestamp; + + fn to_owned_scalar(&self) -> Self::ScalarType { + *self + } +} impl Scalar for ListValue { type VectorType = ListVector; @@ -314,6 +335,7 @@ impl Scalar for ListValue { } impl<'a> ScalarRef<'a> for ListValueRef<'a> { + type VectorType = ListVector; type ScalarType = ListValue; fn to_owned_scalar(&self) -> Self::ScalarType { @@ -335,9 +357,8 @@ impl<'a> ScalarRef<'a> for ListValueRef<'a> { #[cfg(test)] mod tests { use super::*; - use crate::data_type::ConcreteDataType; - use crate::timestamp::TimestampSecond; - use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector}; + use crate::vectors::binary::BinaryVector; + use crate::vectors::primitive::Int32Vector; fn build_vector_from_slice(items: &[Option>]) -> T { let mut builder = T::Builder::with_capacity(items.len()); @@ -433,11 +454,11 @@ mod tests { #[test] fn test_build_timestamp_vector() { - let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; - let vector: TimestampSecondVector = build_vector_from_slice(&expect); + let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; + let vector: TimestampVector = build_vector_from_slice(&expect); assert_vector_eq(&expect, &vector); let val = vector.get_data(0).unwrap(); assert_eq!(val, val.as_scalar_ref()); - assert_eq!(TimestampSecond::from(10), val.to_owned_scalar()); + assert_eq!(10, val.to_owned_scalar().value()); } } diff --git a/src/datatypes2/src/schema.rs b/src/datatypes2/src/schema.rs index 328fe0de24..a1792fd665 100644 --- 
a/src/datatypes2/src/schema.rs +++ b/src/datatypes2/src/schema.rs @@ -12,27 +12,128 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod column_schema; mod constraint; mod raw; use std::collections::HashMap; use std::sync::Arc; +pub use arrow::datatypes::Metadata; use arrow::datatypes::{Field, Schema as ArrowSchema}; +use serde::{Deserialize, Serialize}; use snafu::{ensure, ResultExt}; -use crate::data_type::DataType; -use crate::error::{self, Error, Result}; -pub use crate::schema::column_schema::{ColumnSchema, Metadata}; +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{self, DeserializeSnafu, Error, Result, SerializeSnafu}; pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::raw::RawSchema; +use crate::vectors::VectorRef; +/// Key used to store whether the column is time index in arrow field's metadata. +const TIME_INDEX_KEY: &str = "greptime:time_index"; /// Key used to store version number of the schema in metadata. const VERSION_KEY: &str = "greptime:version"; +/// Key used to store default constraint in arrow field's metadata. +const ARROW_FIELD_DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint"; + +/// Schema of a column, used as an immutable struct. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ColumnSchema { + pub name: String, + pub data_type: ConcreteDataType, + is_nullable: bool, + is_time_index: bool, + default_constraint: Option, + metadata: Metadata, +} + +impl ColumnSchema { + pub fn new>( + name: T, + data_type: ConcreteDataType, + is_nullable: bool, + ) -> ColumnSchema { + ColumnSchema { + name: name.into(), + data_type, + is_nullable, + is_time_index: false, + default_constraint: None, + metadata: Metadata::new(), + } + } + + #[inline] + pub fn is_time_index(&self) -> bool { + self.is_time_index + } + + #[inline] + pub fn is_nullable(&self) -> bool { + self.is_nullable + } + + #[inline] + pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> { + self.default_constraint.as_ref() + } + + #[inline] + pub fn metadata(&self) -> &Metadata { + &self.metadata + } + + pub fn with_time_index(mut self, is_time_index: bool) -> Self { + self.is_time_index = is_time_index; + if is_time_index { + self.metadata + .insert(TIME_INDEX_KEY.to_string(), "true".to_string()); + } else { + self.metadata.remove(TIME_INDEX_KEY); + } + self + } + + pub fn with_default_constraint( + mut self, + default_constraint: Option, + ) -> Result { + if let Some(constraint) = &default_constraint { + constraint.validate(&self.data_type, self.is_nullable)?; + } + + self.default_constraint = default_constraint; + Ok(self) + } + + /// Creates a new [`ColumnSchema`] with given metadata. + pub fn with_metadata(mut self, metadata: Metadata) -> Self { + self.metadata = metadata; + self + } + + pub fn create_default_vector(&self, num_rows: usize) -> Result> { + match &self.default_constraint { + Some(c) => c + .create_default_vector(&self.data_type, self.is_nullable, num_rows) + .map(Some), + None => { + if self.is_nullable { + // No default constraint, use null as default value. + // TODO(yingwen): Use NullVector once it supports setting logical type. 
+ ColumnDefaultConstraint::null_value() + .create_default_vector(&self.data_type, self.is_nullable, num_rows) + .map(Some) + } else { + Ok(None) + } + } + } + } +} /// A common schema, should be immutable. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct Schema { column_schemas: Vec, name_to_index: HashMap, @@ -130,7 +231,7 @@ impl Schema { } #[inline] - pub fn metadata(&self) -> &HashMap { + pub fn metadata(&self) -> &Metadata { &self.arrow_schema.metadata } } @@ -142,7 +243,7 @@ pub struct SchemaBuilder { fields: Vec, timestamp_index: Option, version: u32, - metadata: HashMap, + metadata: Metadata, } impl TryFrom> for SchemaBuilder { @@ -191,7 +292,7 @@ impl SchemaBuilder { self.metadata .insert(VERSION_KEY.to_string(), self.version.to_string()); - let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata); + let arrow_schema = ArrowSchema::from(self.fields).with_metadata(self.metadata); Ok(Schema { column_schemas: self.column_schemas, @@ -246,7 +347,7 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us let column_schema = &column_schemas[timestamp_index]; ensure!( - column_schema.data_type.is_timestamp_compatible(), + column_schema.data_type.is_timestamp(), error::InvalidTimestampIndexSnafu { index: timestamp_index, } @@ -263,6 +364,58 @@ fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: us pub type SchemaRef = Arc; +impl TryFrom<&Field> for ColumnSchema { + type Error = Error; + + fn try_from(field: &Field) -> Result { + let data_type = ConcreteDataType::try_from(&field.data_type)?; + let mut metadata = field.metadata.clone(); + let default_constraint = match metadata.remove(ARROW_FIELD_DEFAULT_CONSTRAINT_KEY) { + Some(json) => Some(serde_json::from_str(&json).context(DeserializeSnafu { json })?), + None => None, + }; + let is_time_index = metadata.contains_key(TIME_INDEX_KEY); + + Ok(ColumnSchema { + name: field.name.clone(), + data_type, + 
is_nullable: field.is_nullable, + is_time_index, + default_constraint, + metadata, + }) + } +} + +impl TryFrom<&ColumnSchema> for Field { + type Error = Error; + + fn try_from(column_schema: &ColumnSchema) -> Result { + let mut metadata = column_schema.metadata.clone(); + if let Some(value) = &column_schema.default_constraint { + // Adds an additional metadata to store the default constraint. + let old = metadata.insert( + ARROW_FIELD_DEFAULT_CONSTRAINT_KEY.to_string(), + serde_json::to_string(&value).context(SerializeSnafu)?, + ); + + ensure!( + old.is_none(), + error::DuplicateMetaSnafu { + key: ARROW_FIELD_DEFAULT_CONSTRAINT_KEY, + } + ); + } + + Ok(Field::new( + column_schema.name.clone(), + column_schema.data_type.as_arrow_type(), + column_schema.is_nullable(), + ) + .with_metadata(metadata)) + } +} + impl TryFrom> for Schema { type Error = Error; @@ -271,7 +424,7 @@ impl TryFrom> for Schema { let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len()); for field in &arrow_schema.fields { let column_schema = ColumnSchema::try_from(field)?; - name_to_index.insert(field.name().to_string(), column_schemas.len()); + name_to_index.insert(field.name.clone(), column_schemas.len()); column_schemas.push(column_schema); } @@ -312,7 +465,7 @@ impl TryFrom for Schema { } } -fn try_parse_version(metadata: &HashMap, key: &str) -> Result { +fn try_parse_version(metadata: &Metadata, key: &str) -> Result { if let Some(value) = metadata.get(key) { let version = value .parse() @@ -326,8 +479,127 @@ fn try_parse_version(metadata: &HashMap, key: &str) -> Result = default_constraint.clone().try_into().unwrap(); + let from_value = ColumnDefaultConstraint::try_from(&bytes[..]).unwrap(); + + assert_eq!(default_constraint, from_value); + } + + #[test] + fn test_column_schema_create_default_null() { + // Implicit default null. 
+ let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); + let v = column_schema.create_default_vector(5).unwrap().unwrap(); + assert_eq!(5, v.len()); + assert!(v.only_null()); + + // Explicit default null. + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) + .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) + .unwrap(); + let v = column_schema.create_default_vector(5).unwrap().unwrap(); + assert_eq!(5, v.len()); + assert!(v.only_null()); + } + + #[test] + fn test_column_schema_no_default() { + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false); + assert!(column_schema.create_default_vector(5).unwrap().is_none()); + } #[test] fn test_build_empty_schema() { @@ -382,12 +654,8 @@ mod tests { fn test_schema_with_timestamp() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), + ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas.clone()) .unwrap() diff --git a/src/datatypes2/src/schema/constraint.rs b/src/datatypes2/src/schema/constraint.rs index 4dd3ecc14b..3750fcebcf 100644 --- a/src/datatypes2/src/schema/constraint.rs +++ b/src/datatypes2/src/schema/constraint.rs @@ -22,7 +22,7 @@ use snafu::{ensure, ResultExt}; use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::value::Value; -use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; +use crate::vectors::{Int64Vector, TimestampVector, VectorRef}; const CURRENT_TIMESTAMP: &str = "current_timestamp()"; @@ -81,7 +81,7 @@ impl ColumnDefaultConstraint { error::UnsupportedDefaultExprSnafu { expr } ); ensure!( - data_type.is_timestamp_compatible(), + 
data_type.is_timestamp(), error::DefaultValueTypeSnafu { reason: "return value of the function must has timestamp type", } @@ -162,10 +162,8 @@ fn create_current_timestamp_vector( data_type: &ConcreteDataType, num_rows: usize, ) -> Result { - // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector - // to other data type and avoid this match. match data_type { - ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( + ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampVector::from_values( std::iter::repeat(util::current_time_millis()).take(num_rows), ))), ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( @@ -219,7 +217,7 @@ mod tests { fn test_validate_function_constraint() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); constraint - .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) + .validate(&ConcreteDataType::timestamp_millis_datatype(), false) .unwrap(); constraint .validate(&ConcreteDataType::boolean_datatype(), false) @@ -227,7 +225,7 @@ mod tests { let constraint = ColumnDefaultConstraint::Function("hello()".to_string()); constraint - .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) + .validate(&ConcreteDataType::timestamp_millis_datatype(), false) .unwrap_err(); } @@ -264,7 +262,7 @@ mod tests { fn test_create_default_vector_by_func() { let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); // Timestamp type. 
- let data_type = ConcreteDataType::timestamp_millisecond_datatype(); + let data_type = ConcreteDataType::timestamp_millis_datatype(); let v = constraint .create_default_vector(&data_type, false, 4) .unwrap(); @@ -288,7 +286,7 @@ mod tests { ); let constraint = ColumnDefaultConstraint::Function("no".to_string()); - let data_type = ConcreteDataType::timestamp_millisecond_datatype(); + let data_type = ConcreteDataType::timestamp_millis_datatype(); constraint .create_default_vector(&data_type, false, 4) .unwrap_err(); diff --git a/src/datatypes2/src/schema/raw.rs b/src/datatypes2/src/schema/raw.rs index 75f0853b4b..f415a1ab85 100644 --- a/src/datatypes2/src/schema/raw.rs +++ b/src/datatypes2/src/schema/raw.rs @@ -20,7 +20,7 @@ use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; /// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). /// /// This struct only contains necessary data to recover the Schema. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct RawSchema { pub column_schemas: Vec, pub timestamp_index: Option, @@ -56,12 +56,8 @@ mod tests { fn test_raw_convert() { let column_schemas = vec![ ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), + ColumnSchema::new("ts", ConcreteDataType::timestamp_millis_datatype(), false) + .with_time_index(true), ]; let schema = SchemaBuilder::try_from(column_schemas) .unwrap() diff --git a/src/datatypes2/src/type_id.rs b/src/datatypes2/src/type_id.rs index bcb7ea52b1..fa11430dec 100644 --- a/src/datatypes2/src/type_id.rs +++ b/src/datatypes2/src/type_id.rs @@ -42,10 +42,7 @@ pub enum LogicalTypeId { /// seconds/milliseconds/microseconds/nanoseconds, determined by precision. 
DateTime, - TimestampSecond, - TimestampMillisecond, - TimestampMicrosecond, - TimestampNanosecond, + Timestamp, List, } @@ -77,14 +74,7 @@ impl LogicalTypeId { LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), LogicalTypeId::Date => ConcreteDataType::date_datatype(), LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), - LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), - LogicalTypeId::TimestampMillisecond => { - ConcreteDataType::timestamp_millisecond_datatype() - } - LogicalTypeId::TimestampMicrosecond => { - ConcreteDataType::timestamp_microsecond_datatype() - } - LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), + LogicalTypeId::Timestamp => ConcreteDataType::timestamp_millis_datatype(), // to timestamp type with default time unit LogicalTypeId::List => { ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) } diff --git a/src/datatypes2/src/types.rs b/src/datatypes2/src/types.rs index 186704fdfd..aabeb59db3 100644 --- a/src/datatypes2/src/types.rs +++ b/src/datatypes2/src/types.rs @@ -14,24 +14,25 @@ mod binary_type; mod boolean_type; -mod date_type; -mod datetime_type; +mod date; +mod datetime; mod list_type; mod null_type; +mod primitive_traits; mod primitive_type; mod string_type; - -mod timestamp_type; +mod timestamp; pub use binary_type::BinaryType; pub use boolean_type::BooleanType; -pub use date_type::DateType; -pub use datetime_type::DateTimeType; +pub use date::DateType; +pub use datetime::DateTimeType; pub use list_type::ListType; pub use null_type::NullType; +pub use primitive_traits::{OrdPrimitive, Primitive}; pub use primitive_type::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, - NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, PrimitiveElement, + PrimitiveType, UInt16Type, UInt32Type, UInt64Type, 
UInt8Type, }; pub use string_type::StringType; -pub use timestamp_type::*; +pub use timestamp::TimestampType; diff --git a/src/datatypes2/src/types/binary_type.rs b/src/datatypes2/src/types/binary_type.rs index 0d06724fff..13922ff063 100644 --- a/src/datatypes2/src/types/binary_type.rs +++ b/src/datatypes2/src/types/binary_type.rs @@ -53,8 +53,4 @@ impl DataType for BinaryType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BinaryVectorBuilder::with_capacity(capacity)) } - - fn is_timestamp_compatible(&self) -> bool { - false - } } diff --git a/src/datatypes2/src/types/boolean_type.rs b/src/datatypes2/src/types/boolean_type.rs index 36d92169eb..4566f1d826 100644 --- a/src/datatypes2/src/types/boolean_type.rs +++ b/src/datatypes2/src/types/boolean_type.rs @@ -52,8 +52,4 @@ impl DataType for BooleanType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(BooleanVectorBuilder::with_capacity(capacity)) } - - fn is_timestamp_compatible(&self) -> bool { - false - } } diff --git a/src/datatypes/src/types/date.rs b/src/datatypes2/src/types/date.rs similarity index 100% rename from src/datatypes/src/types/date.rs rename to src/datatypes2/src/types/date.rs diff --git a/src/datatypes/src/types/datetime.rs b/src/datatypes2/src/types/datetime.rs similarity index 100% rename from src/datatypes/src/types/datetime.rs rename to src/datatypes2/src/types/datetime.rs diff --git a/src/datatypes2/src/types/list_type.rs b/src/datatypes2/src/types/list_type.rs index b9875ca362..1ada109011 100644 --- a/src/datatypes2/src/types/list_type.rs +++ b/src/datatypes2/src/types/list_type.rs @@ -15,17 +15,15 @@ use arrow::datatypes::{DataType as ArrowDataType, Field}; use serde::{Deserialize, Serialize}; -use crate::data_type::{ConcreteDataType, DataType}; -use crate::type_id::LogicalTypeId; -use crate::value::{ListValue, Value}; +use crate::prelude::*; +use crate::value::ListValue; use crate::vectors::{ListVectorBuilder, MutableVector}; /// Used to represent 
the List datatype. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ListType { - /// The type of List's item. - // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. - item_type: Box, + /// The type of List's inner data. + inner: Box, } impl Default for ListType { @@ -35,10 +33,9 @@ impl Default for ListType { } impl ListType { - /// Create a new `ListType` whose item's data type is `item_type`. - pub fn new(item_type: ConcreteDataType) -> Self { + pub fn new(datatype: ConcreteDataType) -> Self { ListType { - item_type: Box::new(item_type), + inner: Box::new(datatype), } } } @@ -53,24 +50,20 @@ impl DataType for ListType { } fn default_value(&self) -> Value { - Value::List(ListValue::new(None, *self.item_type.clone())) + Value::List(ListValue::new(None, *self.inner.clone())) } fn as_arrow_type(&self) -> ArrowDataType { - let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); + let field = Box::new(Field::new("item", self.inner.as_arrow_type(), true)); ArrowDataType::List(field) } fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(ListVectorBuilder::with_type_capacity( - *self.item_type.clone(), + *self.inner.clone(), capacity, )) } - - fn is_timestamp_compatible(&self) -> bool { - false - } } #[cfg(test)] diff --git a/src/datatypes2/src/types/null_type.rs b/src/datatypes2/src/types/null_type.rs index b9bb2dc752..a0b027dd14 100644 --- a/src/datatypes2/src/types/null_type.rs +++ b/src/datatypes2/src/types/null_type.rs @@ -27,7 +27,7 @@ pub struct NullType; impl NullType { pub fn arc() -> DataTypeRef { - Arc::new(NullType) + Arc::new(Self) } } @@ -51,8 +51,4 @@ impl DataType for NullType { fn create_mutable_vector(&self, _capacity: usize) -> Box { Box::new(NullVectorBuilder::default()) } - - fn is_timestamp_compatible(&self) -> bool { - false - } } diff --git a/src/datatypes/src/types/primitive_traits.rs 
b/src/datatypes2/src/types/primitive_traits.rs similarity index 100% rename from src/datatypes/src/types/primitive_traits.rs rename to src/datatypes2/src/types/primitive_traits.rs diff --git a/src/datatypes2/src/types/primitive_type.rs b/src/datatypes2/src/types/primitive_type.rs index e389ca13bf..b9f07ce82c 100644 --- a/src/datatypes2/src/types/primitive_type.rs +++ b/src/datatypes2/src/types/primitive_type.rs @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::Ordering; +use std::any::TypeId; +use std::marker::PhantomData; -use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType}; -use common_time::{Date, DateTime}; -use num::NumCast; +use arrow::array::PrimitiveArray; +use arrow::datatypes::DataType as ArrowDataType; +use paste::paste; use serde::{Deserialize, Serialize}; use snafu::OptionExt; @@ -24,226 +25,92 @@ use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Result}; use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder}; use crate::type_id::LogicalTypeId; -use crate::types::{DateTimeType, DateType}; +use crate::types::primitive_traits::Primitive; use crate::value::{Value, ValueRef}; use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector}; -/// Data types that can be used as arrow's native type. -pub trait NativeType: ArrowNativeType + NumCast { - /// Largest numeric type this primitive type can be cast to. - type LargestType: NativeType; +#[derive(Clone, Serialize, Deserialize)] +pub struct PrimitiveType { + #[serde(skip)] + _phantom: PhantomData, } -macro_rules! 
impl_native_type { - ($Type: ident, $LargestType: ident) => { - impl NativeType for $Type { - type LargestType = $LargestType; - } - }; +impl PartialEq> for PrimitiveType { + fn eq(&self, _other: &PrimitiveType) -> bool { + TypeId::of::() == TypeId::of::() + } } -impl_native_type!(u8, u64); -impl_native_type!(u16, u64); -impl_native_type!(u32, u64); -impl_native_type!(u64, u64); -impl_native_type!(i8, i64); -impl_native_type!(i16, i64); -impl_native_type!(i32, i64); -impl_native_type!(i64, i64); -impl_native_type!(f32, f64); -impl_native_type!(f64, f64); +impl Eq for PrimitiveType {} -/// Represents the wrapper type that wraps a native type using the `newtype pattern`, -/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native -/// type `i32`. -pub trait WrapperType: - Copy - + Scalar - + PartialEq - + Into - + Into> - + Serialize - + Into +/// A trait that provide helper methods for a primitive type to implementing the [PrimitiveVector]. +pub trait PrimitiveElement +where + for<'a> Self: Primitive + + Scalar> + + ScalarRef<'a, ScalarType = Self, VectorType = PrimitiveVector> + + Scalar = Self>, { - /// Logical primitive type that this wrapper type belongs to. - type LogicalType: LogicalPrimitiveType; - /// The underlying native type. - type Native: NativeType; - - /// Convert native type into this wrapper type. - fn from_native(value: Self::Native) -> Self; - - /// Convert this wrapper type into native type. - fn into_native(self) -> Self::Native; -} - -/// Trait bridging the logical primitive type with [ArrowPrimitiveType]. -pub trait LogicalPrimitiveType: 'static + Sized { - /// Arrow primitive type of this logical type. - type ArrowPrimitive: ArrowPrimitiveType; - /// Native (physical) type of this logical type. - type Native: NativeType; - /// Wrapper type that the vector returns. 
- type Wrapper: WrapperType - + for<'a> Scalar, RefType<'a> = Self::Wrapper> - + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>; - /// Construct the data type struct. fn build_data_type() -> ConcreteDataType; - /// Return the name of the type. - fn type_name() -> &'static str; + /// Returns the name of the type id. + fn type_name() -> String; /// Dynamic cast the vector to the concrete vector type. - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector>; + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray>; /// Cast value ref to the primitive type. - fn cast_value_ref(value: ValueRef) -> Result>; + fn cast_value_ref(value: ValueRef) -> Result>; } -/// A new type for [WrapperType], complement the `Ord` feature for it. Wrapping non ordered -/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that -/// require `Ord`. For example, in `Median` or `Percentile` UDAFs. -#[derive(Debug, Clone, Copy, PartialEq)] -pub struct OrdPrimitive(pub T); +macro_rules! impl_primitive_element { + ($Type:ident, $TypeId:ident) => { + paste::paste! 
{ + impl PrimitiveElement for $Type { + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::$TypeId(PrimitiveType::<$Type>::default()) + } -impl OrdPrimitive { - pub fn as_primitive(&self) -> T { - self.0 - } -} + fn type_name() -> String { + stringify!($TypeId).to_string() + } -impl Eq for OrdPrimitive {} + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveArray<$Type>> { + let primitive_vector = vector + .as_any() + .downcast_ref::>() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to cast {} to vector of primitive type {}", + vector.vector_type_name(), + stringify!($TypeId) + ), + })?; + Ok(&primitive_vector.array) + } -impl PartialOrd for OrdPrimitive { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for OrdPrimitive { - fn cmp(&self, other: &Self) -> Ordering { - Into::::into(self.0).cmp(&Into::::into(other.0)) - } -} - -impl From> for Value { - fn from(p: OrdPrimitive) -> Self { - p.0.into() - } -} - -macro_rules! 
impl_wrapper { - ($Type: ident, $LogicalType: ident) => { - impl WrapperType for $Type { - type LogicalType = $LogicalType; - type Native = $Type; - - fn from_native(value: Self::Native) -> Self { - value - } - - fn into_native(self) -> Self::Native { - self - } - } - }; -} - -impl_wrapper!(u8, UInt8Type); -impl_wrapper!(u16, UInt16Type); -impl_wrapper!(u32, UInt32Type); -impl_wrapper!(u64, UInt64Type); -impl_wrapper!(i8, Int8Type); -impl_wrapper!(i16, Int16Type); -impl_wrapper!(i32, Int32Type); -impl_wrapper!(i64, Int64Type); -impl_wrapper!(f32, Float32Type); -impl_wrapper!(f64, Float64Type); - -impl WrapperType for Date { - type LogicalType = DateType; - type Native = i32; - - fn from_native(value: i32) -> Self { - Date::new(value) - } - - fn into_native(self) -> i32 { - self.val() - } -} - -impl WrapperType for DateTime { - type LogicalType = DateTimeType; - type Native = i64; - - fn from_native(value: Self::Native) -> Self { - DateTime::new(value) - } - - fn into_native(self) -> Self::Native { - self.val() - } -} - -macro_rules! define_logical_primitive_type { - ($Native: ident, $TypeId: ident, $DataType: ident) => { - // We need to define it as an empty struct `struct DataType {}` instead of a struct-unit - // `struct DataType;` to ensure the serialized JSON string is compatible with previous - // implementation. 
- #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] - pub struct $DataType {} - - impl LogicalPrimitiveType for $DataType { - type ArrowPrimitive = arrow::datatypes::$DataType; - type Native = $Native; - type Wrapper = $Native; - - fn build_data_type() -> ConcreteDataType { - ConcreteDataType::$TypeId($DataType::default()) - } - - fn type_name() -> &'static str { - stringify!($TypeId) - } - - fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> { - vector - .as_any() - .downcast_ref::>() - .with_context(|| error::CastTypeSnafu { - msg: format!( - "Failed to cast {} to vector of primitive type {}", - vector.vector_type_name(), - stringify!($TypeId) - ), - }) - } - - fn cast_value_ref(value: ValueRef) -> Result> { - match value { - ValueRef::Null => Ok(None), - ValueRef::$TypeId(v) => Ok(Some(v.into())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value {:?} to primitive type {}", - other, - stringify!($TypeId), - ), + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::$TypeId(v) => Ok(Some(v.into())), + other => error::CastTypeSnafu { + msg: format!( + "Failed to cast value {:?} to primitive type {}", + other, + stringify!($TypeId), + ), + }.fail(), } - .fail(), } } } }; } -macro_rules! define_non_timestamp_primitive { - ($Native: ident, $TypeId: ident, $DataType: ident) => { - define_logical_primitive_type!($Native, $TypeId, $DataType); - - impl DataType for $DataType { +macro_rules! impl_numeric { + ($Type:ident, $TypeId:ident) => { + impl DataType for PrimitiveType<$Type> { fn name(&self) -> &str { stringify!($TypeId) } @@ -253,7 +120,7 @@ macro_rules! define_non_timestamp_primitive { } fn default_value(&self) -> Value { - $Native::default().into() + $Type::default().into() } fn as_arrow_type(&self) -> ArrowDataType { @@ -261,98 +128,61 @@ macro_rules! 
define_non_timestamp_primitive { } fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) + Box::new(PrimitiveVectorBuilder::<$Type>::with_capacity(capacity)) } + } - fn is_timestamp_compatible(&self) -> bool { - false + impl std::fmt::Debug for PrimitiveType<$Type> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.name()) } } + + impl Default for PrimitiveType<$Type> { + fn default() -> Self { + Self { + _phantom: PhantomData, + } + } + } + + impl_primitive_element!($Type, $TypeId); + + paste! { + pub type [<$TypeId Type>]=PrimitiveType<$Type>; + } }; } -define_non_timestamp_primitive!(u8, UInt8, UInt8Type); -define_non_timestamp_primitive!(u16, UInt16, UInt16Type); -define_non_timestamp_primitive!(u32, UInt32, UInt32Type); -define_non_timestamp_primitive!(u64, UInt64, UInt64Type); -define_non_timestamp_primitive!(i8, Int8, Int8Type); -define_non_timestamp_primitive!(i16, Int16, Int16Type); -define_non_timestamp_primitive!(i32, Int32, Int32Type); -define_non_timestamp_primitive!(f32, Float32, Float32Type); -define_non_timestamp_primitive!(f64, Float64, Float64Type); - -// Timestamp primitive: -define_logical_primitive_type!(i64, Int64, Int64Type); - -impl DataType for Int64Type { - fn name(&self) -> &str { - "Int64" - } - - fn logical_type_id(&self) -> LogicalTypeId { - LogicalTypeId::Int64 - } - - fn default_value(&self) -> Value { - Value::Int64(0) - } - - fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Int64 - } - - fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(PrimitiveVectorBuilder::::with_capacity(capacity)) - } - - fn is_timestamp_compatible(&self) -> bool { - true - } -} +impl_numeric!(u8, UInt8); +impl_numeric!(u16, UInt16); +impl_numeric!(u32, UInt32); +impl_numeric!(u64, UInt64); +impl_numeric!(i8, Int8); +impl_numeric!(i16, Int16); +impl_numeric!(i32, Int32); +impl_numeric!(i64, Int64); 
+impl_numeric!(f32, Float32); +impl_numeric!(f64, Float64); #[cfg(test)] mod tests { - use std::collections::BinaryHeap; - use super::*; #[test] - fn test_ord_primitive() { - struct Foo - where - T: WrapperType, - { - heap: BinaryHeap>, - } + fn test_eq() { + assert_eq!(UInt8Type::default(), UInt8Type::default()); + assert_eq!(UInt16Type::default(), UInt16Type::default()); + assert_eq!(UInt32Type::default(), UInt32Type::default()); + assert_eq!(UInt64Type::default(), UInt64Type::default()); + assert_eq!(Int8Type::default(), Int8Type::default()); + assert_eq!(Int16Type::default(), Int16Type::default()); + assert_eq!(Int32Type::default(), Int32Type::default()); + assert_eq!(Int64Type::default(), Int64Type::default()); + assert_eq!(Float32Type::default(), Float32Type::default()); + assert_eq!(Float64Type::default(), Float64Type::default()); - impl Foo - where - T: WrapperType, - { - fn push(&mut self, value: T) { - let value = OrdPrimitive::(value); - self.heap.push(value); - } - } - - macro_rules! 
test { - ($Type:ident) => { - let mut foo = Foo::<$Type> { - heap: BinaryHeap::new(), - }; - foo.push($Type::default()); - }; - } - - test!(u8); - test!(u16); - test!(u32); - test!(u64); - test!(i8); - test!(i16); - test!(i32); - test!(i64); - test!(f32); - test!(f64); + assert_ne!(Float32Type::default(), Float64Type::default()); + assert_ne!(Float32Type::default(), Int32Type::default()); } } diff --git a/src/datatypes2/src/types/string_type.rs b/src/datatypes2/src/types/string_type.rs index 799cbbbdd3..736a3faac9 100644 --- a/src/datatypes2/src/types/string_type.rs +++ b/src/datatypes2/src/types/string_type.rs @@ -18,10 +18,9 @@ use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::StringBytes; use serde::{Deserialize, Serialize}; -use crate::data_type::{DataType, DataTypeRef}; -use crate::prelude::ScalarVectorBuilder; -use crate::type_id::LogicalTypeId; -use crate::value::Value; +use crate::data_type::DataType; +use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; +use crate::scalars::ScalarVectorBuilder; use crate::vectors::{MutableVector, StringVectorBuilder}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -53,8 +52,4 @@ impl DataType for StringType { fn create_mutable_vector(&self, capacity: usize) -> Box { Box::new(StringVectorBuilder::with_capacity(capacity)) } - - fn is_timestamp_compatible(&self) -> bool { - false - } } diff --git a/src/datatypes/src/types/timestamp.rs b/src/datatypes2/src/types/timestamp.rs similarity index 100% rename from src/datatypes/src/types/timestamp.rs rename to src/datatypes2/src/types/timestamp.rs diff --git a/src/datatypes2/src/value.rs b/src/datatypes2/src/value.rs index bade88d419..d5e0ae3e9f 100644 --- a/src/datatypes2/src/value.rs +++ b/src/datatypes2/src/value.rs @@ -110,7 +110,6 @@ impl Value { /// # Panics /// Panics if the data type is not supported. pub fn data_type(&self) -> ConcreteDataType { - // TODO(yingwen): Implement this once all data types are implemented. 
match self { Value::Null => ConcreteDataType::null_datatype(), Value::Boolean(_) => ConcreteDataType::boolean_datatype(), @@ -126,10 +125,10 @@ impl Value { Value::Float64(_) => ConcreteDataType::float64_datatype(), Value::String(_) => ConcreteDataType::string_datatype(), Value::Binary(_) => ConcreteDataType::binary_datatype(), + Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), Value::Date(_) => ConcreteDataType::date_datatype(), Value::DateTime(_) => ConcreteDataType::datetime_datatype(), Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), - Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), } } @@ -194,12 +193,7 @@ impl Value { Value::List(_) => LogicalTypeId::List, Value::Date(_) => LogicalTypeId::Date, Value::DateTime(_) => LogicalTypeId::DateTime, - Value::Timestamp(t) => match t.unit() { - TimeUnit::Second => LogicalTypeId::TimestampSecond, - TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, - TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, - TimeUnit::Nanosecond => LogicalTypeId::TimestampNanosecond, - }, + Value::Timestamp(_) => LogicalTypeId::Timestamp, } } } @@ -283,9 +277,6 @@ impl_value_from!(Float32, f32); impl_value_from!(Float64, f64); impl_value_from!(String, StringBytes); impl_value_from!(Binary, Bytes); -impl_value_from!(Date, Date); -impl_value_from!(DateTime, DateTime); -impl_value_from!(Timestamp, Timestamp); impl From for Value { fn from(string: String) -> Value { @@ -305,6 +296,12 @@ impl From> for Value { } } +impl From for Value { + fn from(v: Timestamp) -> Self { + Value::Timestamp(v) + } +} + impl From<&[u8]> for Value { fn from(bytes: &[u8]) -> Value { Value::Binary(bytes.into()) @@ -340,7 +337,6 @@ impl TryFrom for serde_json::Value { } } -// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. /// List value. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ListValue { @@ -395,7 +391,6 @@ impl TryFrom for Value { fn try_from(v: ScalarValue) -> Result { let v = match v { - ScalarValue::Null => Value::Null, ScalarValue::Boolean(b) => Value::from(b), ScalarValue::Float32(f) => Value::from(f), ScalarValue::Float64(f) => Value::from(f), @@ -410,10 +405,8 @@ impl TryFrom for Value { ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { Value::from(s.map(StringBytes::from)) } - ScalarValue::Binary(b) - | ScalarValue::LargeBinary(b) - | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), - ScalarValue::List(vs, field) => { + ScalarValue::Binary(b) | ScalarValue::LargeBinary(b) => Value::from(b.map(Bytes::from)), + ScalarValue::List(vs, t) => { let items = if let Some(vs) = vs { let vs = vs .into_iter() @@ -423,7 +416,7 @@ impl TryFrom for Value { } else { None }; - let datatype = ConcreteDataType::try_from(field.data_type())?; + let datatype = t.as_ref().try_into()?; Value::List(ListValue::new(items, datatype)) } ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), @@ -442,13 +435,7 @@ impl TryFrom for Value { ScalarValue::TimestampNanosecond(t, _) => t .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) .unwrap_or(Value::Null), - ScalarValue::Decimal128(_, _, _) - | ScalarValue::Time64(_) - | ScalarValue::IntervalYearMonth(_) - | ScalarValue::IntervalDayTime(_) - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Struct(_, _) - | ScalarValue::Dictionary(_, _) => { + _ => { return error::UnsupportedArrowTypeSnafu { arrow_type: v.get_datatype(), } @@ -558,6 +545,15 @@ impl<'a> Ord for ValueRef<'a> { } } +/// A helper trait to convert copyable types to `ValueRef`. +/// +/// It could replace the usage of `Into>`, thus avoid confusion between `Into` +/// and `Into>` in generic codes. One typical usage is the [`Primitive`](crate::primitive_traits::Primitive) trait. 
+pub trait IntoValueRef<'a> { + /// Convert itself to [ValueRef]. + fn into_value_ref(self) -> ValueRef<'a>; +} + macro_rules! impl_value_ref_from { ($Variant:ident, $Type:ident) => { impl From<$Type> for ValueRef<'_> { @@ -566,6 +562,12 @@ macro_rules! impl_value_ref_from { } } + impl<'a> IntoValueRef<'a> for $Type { + fn into_value_ref(self) -> ValueRef<'a> { + ValueRef::$Variant(self.into()) + } + } + impl From> for ValueRef<'_> { fn from(value: Option<$Type>) -> Self { match value { @@ -574,6 +576,15 @@ macro_rules! impl_value_ref_from { } } } + + impl<'a> IntoValueRef<'a> for Option<$Type> { + fn into_value_ref(self) -> ValueRef<'a> { + match self { + Some(v) => ValueRef::$Variant(v.into()), + None => ValueRef::Null, + } + } + } }; } @@ -588,9 +599,6 @@ impl_value_ref_from!(Int32, i32); impl_value_ref_from!(Int64, i64); impl_value_ref_from!(Float32, f32); impl_value_ref_from!(Float64, f64); -impl_value_ref_from!(Date, Date); -impl_value_ref_from!(DateTime, DateTime); -impl_value_ref_from!(Timestamp, Timestamp); impl<'a> From<&'a str> for ValueRef<'a> { fn from(string: &'a str) -> ValueRef<'a> { @@ -620,7 +628,6 @@ impl<'a> From>> for ValueRef<'a> { /// if it becomes bottleneck. #[derive(Debug, Clone, Copy)] pub enum ListValueRef<'a> { - // TODO(yingwen): Consider replace this by VectorRef. 
Indexed { vector: &'a ListVector, idx: usize }, Ref { val: &'a ListValue }, } @@ -778,16 +785,19 @@ mod tests { Some(Box::new(vec![Value::Int32(1), Value::Null])), ConcreteDataType::int32_datatype() )), - ScalarValue::new_list( - Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), - ArrowDataType::Int32, + ScalarValue::List( + Some(Box::new(vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Int32(None) + ])), + Box::new(ArrowDataType::Int32) ) .try_into() .unwrap() ); assert_eq!( Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), - ScalarValue::new_list(None, ArrowDataType::UInt32) + ScalarValue::List(None, Box::new(ArrowDataType::UInt32)) .try_into() .unwrap() ); @@ -970,10 +980,6 @@ mod tests { ConcreteDataType::int32_datatype(), )), ); - check_type_and_value( - &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), - &Value::List(ListValue::default()), - ); check_type_and_value( &ConcreteDataType::date_datatype(), &Value::Date(Date::new(1)), @@ -983,7 +989,7 @@ mod tests { &Value::DateTime(DateTime::new(1)), ); check_type_and_value( - &ConcreteDataType::timestamp_millisecond_datatype(), + &ConcreteDataType::timestamp_millis_datatype(), &Value::Timestamp(Timestamp::from_millis(1)), ); } @@ -1202,6 +1208,59 @@ mod tests { assert!(wrong_value.as_list().is_err()); } + #[test] + fn test_into_value_ref() { + macro_rules! 
check_into_value_ref { + ($Variant: ident, $data: expr, $PrimitiveType: ident, $Wrapper: ident) => { + let data: $PrimitiveType = $data; + assert_eq!( + ValueRef::$Variant($Wrapper::from(data)), + data.into_value_ref() + ); + assert_eq!( + ValueRef::$Variant($Wrapper::from(data)), + ValueRef::from(data) + ); + assert_eq!( + ValueRef::$Variant($Wrapper::from(data)), + Some(data).into_value_ref() + ); + assert_eq!( + ValueRef::$Variant($Wrapper::from(data)), + ValueRef::from(Some(data)) + ); + let x: Option<$PrimitiveType> = None; + assert_eq!(ValueRef::Null, x.into_value_ref()); + assert_eq!(ValueRef::Null, x.into()); + }; + } + + macro_rules! check_primitive_into_value_ref { + ($Variant: ident, $data: expr, $PrimitiveType: ident) => { + check_into_value_ref!($Variant, $data, $PrimitiveType, $PrimitiveType) + }; + } + + check_primitive_into_value_ref!(Boolean, true, bool); + check_primitive_into_value_ref!(UInt8, 10, u8); + check_primitive_into_value_ref!(UInt16, 20, u16); + check_primitive_into_value_ref!(UInt32, 30, u32); + check_primitive_into_value_ref!(UInt64, 40, u64); + check_primitive_into_value_ref!(Int8, -10, i8); + check_primitive_into_value_ref!(Int16, -20, i16); + check_primitive_into_value_ref!(Int32, -30, i32); + check_primitive_into_value_ref!(Int64, -40, i64); + check_into_value_ref!(Float32, 10.0, f32, OrderedF32); + check_into_value_ref!(Float64, 10.0, f64, OrderedF64); + + let hello = "hello"; + assert_eq!( + ValueRef::Binary(hello.as_bytes()), + ValueRef::from(hello.as_bytes()) + ); + assert_eq!(ValueRef::String(hello), ValueRef::from(hello)); + } + #[test] fn test_display() { assert_eq!(Value::Null.to_string(), "Null"); @@ -1242,34 +1301,10 @@ mod tests { assert_eq!( Value::List(ListValue::new( Some(Box::new(vec![])), - ConcreteDataType::timestamp_second_datatype(), + ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond), )) .to_string(), - "TimestampSecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), 
- ConcreteDataType::timestamp_millisecond_datatype(), - )) - .to_string(), - "TimestampMillisecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_microsecond_datatype(), - )) - .to_string(), - "TimestampMicrosecondType[]" - ); - assert_eq!( - Value::List(ListValue::new( - Some(Box::new(vec![])), - ConcreteDataType::timestamp_nanosecond_datatype(), - )) - .to_string(), - "TimestampNanosecondType[]" + "Timestamp[]" ); } } diff --git a/src/datatypes2/src/vectors.rs b/src/datatypes2/src/vectors.rs index 38fa762d4b..6c9402849f 100644 --- a/src/datatypes2/src/vectors.rs +++ b/src/datatypes2/src/vectors.rs @@ -12,59 +12,68 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod binary; +pub mod boolean; +mod builder; +pub mod constant; +pub mod date; +pub mod datetime; +mod eq; +mod helper; +mod list; +pub mod mutable; +pub mod null; +mod operations; +pub mod primitive; +mod string; +mod timestamp; + use std::any::Any; use std::fmt::Debug; use std::sync::Arc; use arrow::array::{Array, ArrayRef}; +use arrow::bitmap::Bitmap; +pub use binary::*; +pub use boolean::*; +pub use builder::VectorBuilder; +pub use constant::*; +pub use date::*; +pub use datetime::*; +pub use helper::Helper; +pub use list::*; +pub use mutable::MutableVector; +pub use null::*; +pub use operations::VectorOp; +pub use primitive::*; use snafu::ensure; +pub use string::*; +pub use timestamp::*; use crate::data_type::ConcreteDataType; use crate::error::{self, Result}; use crate::serialize::Serializable; use crate::value::{Value, ValueRef}; -use crate::vectors::operations::VectorOp; -mod binary; -mod boolean; -mod constant; -mod date; -mod datetime; -mod eq; -mod helper; -mod list; -mod null; -mod operations; -mod primitive; -mod string; -mod timestamp; -mod validity; +#[derive(Debug, PartialEq)] +pub enum Validity<'a> { + /// Whether the array slot is valid or not (null). 
+ Slots(&'a Bitmap), + /// All slots are valid. + AllValid, + /// All slots are null. + AllNull, +} -pub use binary::{BinaryVector, BinaryVectorBuilder}; -pub use boolean::{BooleanVector, BooleanVectorBuilder}; -pub use constant::ConstantVector; -pub use date::{DateVector, DateVectorBuilder}; -pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; -pub use helper::Helper; -pub use list::{ListIter, ListVector, ListVectorBuilder}; -pub use null::{NullVector, NullVectorBuilder}; -pub use primitive::{ - Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, - Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, - Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, - UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, - UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, -}; -pub use string::{StringVector, StringVectorBuilder}; -pub use timestamp::{ - TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, - TimestampMillisecondVectorBuilder, TimestampNanosecondVector, TimestampNanosecondVectorBuilder, - TimestampSecondVector, TimestampSecondVectorBuilder, -}; -pub use validity::Validity; +impl<'a> Validity<'a> { + pub fn slots(&self) -> Option<&Bitmap> { + match self { + Validity::Slots(bitmap) => Some(bitmap), + _ => None, + } + } +} -// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify -// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. /// Vector of data values. pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// Returns the data type of the vector. @@ -101,7 +110,13 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { /// The number of null slots on this [`Vector`]. /// # Implementation /// This is `O(1)`. 
- fn null_count(&self) -> usize; + fn null_count(&self) -> usize { + match self.validity() { + Validity::Slots(bitmap) => bitmap.null_count(), + Validity::AllValid => 0, + Validity::AllNull => self.len(), + } + } /// Returns true when it's a ConstantColumn fn is_const(&self) -> bool { @@ -150,42 +165,6 @@ pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { pub type VectorRef = Arc; -/// Mutable vector that could be used to build an immutable vector. -pub trait MutableVector: Send + Sync { - /// Returns the data type of the vector. - fn data_type(&self) -> ConcreteDataType; - - /// Returns the length of the vector. - fn len(&self) -> usize; - - /// Returns whether the vector is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert to Any, to enable dynamic casting. - fn as_any(&self) -> &dyn Any; - - /// Convert to mutable Any, to enable dynamic casting. - fn as_mut_any(&mut self) -> &mut dyn Any; - - /// Convert `self` to an (immutable) [VectorRef] and reset `self`. - fn to_vector(&mut self) -> VectorRef; - - /// Push value ref to this mutable vector. - /// - /// Returns error if data type unmatch. - fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; - - /// Extend this mutable vector by slice of `vector`. - /// - /// Returns error if data type unmatch. - /// - /// # Panics - /// Panics if `offset + length > vector.len()`. - fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; -} - /// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. macro_rules! impl_try_from_arrow_array_for_vector { ($Array: ident, $Vector: ident) => { @@ -193,20 +172,16 @@ macro_rules! 
impl_try_from_arrow_array_for_vector { pub fn try_from_arrow_array( array: impl AsRef, ) -> crate::error::Result<$Vector> { - use snafu::OptionExt; - - let data = array - .as_ref() - .as_any() - .downcast_ref::<$Array>() - .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.as_ref().data_type()), - })? - .data() - .clone(); - - let concrete_array = $Array::from(data); - Ok($Vector::from(concrete_array)) + Ok($Vector::from( + array + .as_ref() + .as_any() + .downcast_ref::<$Array>() + .with_context(|| crate::error::ConversionSnafu { + from: std::format!("{:?}", array.as_ref().data_type()), + })? + .clone(), + )) } } }; @@ -214,7 +189,10 @@ macro_rules! impl_try_from_arrow_array_for_vector { macro_rules! impl_validity_for_vector { ($array: expr) => { - Validity::from_array_data($array.data()) + match $array.validity() { + Some(bitmap) => Validity::Slots(bitmap), + None => Validity::AllValid, + } }; } @@ -241,11 +219,10 @@ macro_rules! impl_get_ref_for_vector { } macro_rules! impl_extend_for_builder { - ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ + ($mutable_array: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ use snafu::OptionExt; - let sliced_vector = $vector.slice($offset, $length); - let concrete_vector = sliced_vector + let concrete_vector = $vector .as_any() .downcast_ref::<$VectorType>() .with_context(|| crate::error::CastTypeSnafu { @@ -255,9 +232,8 @@ macro_rules! 
impl_extend_for_builder { stringify!($VectorType) ), })?; - for value in concrete_vector.iter_data() { - $mutable_vector.push(value); - } + let slice = concrete_vector.array.slice($offset, $length); + $mutable_array.extend_trusted_len(slice.iter()); Ok(()) }}; } @@ -269,27 +245,27 @@ pub(crate) use { #[cfg(test)] pub mod tests { - use arrow::array::{Array, Int32Array, UInt8Array}; + use arrow::array::{Array, PrimitiveArray}; use serde_json; + use super::helper::Helper; use super::*; use crate::data_type::DataType; - use crate::types::{Int32Type, LogicalPrimitiveType}; - use crate::vectors::helper::Helper; + use crate::types::PrimitiveElement; #[test] fn test_df_columns_to_vector() { - let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3])); let vector = Helper::try_into_vector(df_column).unwrap(); assert_eq!( - Int32Type::build_data_type().as_arrow_type(), + i32::build_data_type().as_arrow_type(), vector.data_type().as_arrow_type() ); } #[test] fn test_serialize_i32_vector() { - let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + let df_column: Arc = Arc::new(PrimitiveArray::::from_slice(vec![1, 2, 3])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() @@ -299,7 +275,7 @@ pub mod tests { #[test] fn test_serialize_i8_vector() { - let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); + let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8])); let json_value = Helper::try_into_vector(df_column) .unwrap() .serialize_to_json() diff --git a/src/datatypes2/src/vectors/binary.rs b/src/datatypes2/src/vectors/binary.rs index 3b5defc8ec..7be3dc6a8e 100644 --- a/src/datatypes2/src/vectors/binary.rs +++ b/src/datatypes2/src/vectors/binary.rs @@ -15,8 +15,9 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; -use snafu::ResultExt; +use 
arrow::array::{Array, ArrayRef}; +use arrow::array::{ArrayIter, GenericByteArray}; +use snafu::{OptionExt, ResultExt}; use crate::arrow_array::{BinaryArray, MutableBinaryArray}; use crate::data_type::ConcreteDataType; @@ -36,16 +37,6 @@ impl BinaryVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> BinaryVector { - BinaryVector { - array: BinaryArray::from(data), - } - } } impl From for BinaryVector { @@ -57,7 +48,7 @@ impl From for BinaryVector { impl From>>> for BinaryVector { fn from(data: Vec>>) -> Self { Self { - array: BinaryArray::from_iter(data), + array: BinaryArray::from(data), } } } @@ -80,13 +71,11 @@ impl Vector for BinaryVector { } fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(BinaryArray::from(data)) + Arc::new(self.array.clone()) } fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(BinaryArray::from(data)) + Box::new(self.array.clone()) } fn validity(&self) -> Validity { @@ -94,11 +83,7 @@ impl Vector for BinaryVector { } fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() + self.array.values().len() + self.array.offsets().len() * std::mem::size_of::() } fn is_null(&self, row: usize) -> bool { @@ -106,8 +91,7 @@ impl Vector for BinaryVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) + Arc::new(Self::from(self.array.slice(offset, length))) } fn get(&self, index: usize) -> Value { @@ -164,15 +148,12 @@ impl MutableVector for BinaryVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_binary()? 
{ - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } + self.mutable_array.push(value.as_binary()?); Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) + vectors::impl_extend_for_builder!(self.mutable_array, vector, BinaryVector, offset, length) } } @@ -181,20 +162,17 @@ impl ScalarVectorBuilder for BinaryVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableBinaryArray::with_capacity(capacity, 0), + mutable_array: MutableBinaryArray::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } + self.mutable_array.push(value); } fn finish(&mut self) -> Self::VectorType { BinaryVector { - array: self.mutable_array.finish(), + array: std::mem::take(&mut self.mutable_array).into(), } } } @@ -227,17 +205,14 @@ mod tests { #[test] fn test_binary_vector_misc() { - let v = BinaryVector::from(BinaryArray::from_iter_values(&[ - vec![1, 2, 3], - vec![1, 2, 3], - ])); + let v = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); assert_eq!(2, v.len()); assert_eq!("BinaryVector", v.vector_type_name()); assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); + assert_eq!(Validity::AllValid, v.validity()); assert!(!v.only_null()); - assert_eq!(128, v.memory_size()); + assert_eq!(30, v.memory_size()); for i in 0..2 { assert!(!v.is_null(i)); @@ -252,10 +227,7 @@ mod tests { #[test] fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ - vec![1, 2, 3], - vec![1, 2, 3], - ])); + let vector = BinaryVector::from(BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]])); let json_value = vector.serialize_to_json().unwrap(); assert_eq!( @@ -281,8 +253,8 
@@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); - let original = BinaryArray::from(arrow_array.data().clone()); + let arrow_array = BinaryArray::from_slice(&[vec![1, 2, 3], vec![1, 2, 3]]); + let original = arrow_array.clone(); let vector = BinaryVector::from(arrow_array); assert_eq!(original, vector.array); } @@ -317,7 +289,7 @@ mod tests { builder.push(Some(b"world")); let vector = builder.finish(); assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); + assert_eq!(Validity::AllValid, vector.validity()); let mut builder = BinaryVectorBuilder::with_capacity(3); builder.push(Some(b"hello")); @@ -326,10 +298,9 @@ mod tests { let vector = builder.finish(); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - assert!(!validity.is_set(1)); - - assert_eq!(1, validity.null_count()); - assert!(!validity.is_set(1)); + let slots = validity.slots().unwrap(); + assert_eq!(1, slots.null_count()); + assert!(!slots.get_bit(1)); } #[test] diff --git a/src/datatypes2/src/vectors/boolean.rs b/src/datatypes2/src/vectors/boolean.rs index 2b4e5b8e10..11c40bd661 100644 --- a/src/datatypes2/src/vectors/boolean.rs +++ b/src/datatypes2/src/vectors/boolean.rs @@ -16,10 +16,9 @@ use std::any::Any; use std::borrow::Borrow; use std::sync::Arc; -use arrow::array::{ - Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, -}; -use snafu::ResultExt; +use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray}; +use arrow::bitmap::utils::{BitmapIter, ZipValidity}; +use snafu::{OptionExt, ResultExt}; use crate::data_type::ConcreteDataType; use crate::error::Result; @@ -42,26 +41,12 @@ impl BooleanVector { pub(crate) fn as_boolean_array(&self) -> &BooleanArray { &self.array } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> BooleanVector { - 
BooleanVector { - array: BooleanArray::from(data), - } - } - - pub(crate) fn false_count(&self) -> usize { - self.array.false_count() - } } impl From> for BooleanVector { fn from(data: Vec) -> Self { BooleanVector { - array: BooleanArray::from(data), + array: BooleanArray::from_slice(&data), } } } @@ -106,13 +91,11 @@ impl Vector for BooleanVector { } fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(BooleanArray::from(data)) + Arc::new(self.array.clone()) } fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(BooleanArray::from(data)) + Box::new(self.array.clone()) } fn validity(&self) -> Validity { @@ -120,11 +103,7 @@ impl Vector for BooleanVector { } fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() + self.array.values().as_slice().0.len() } fn is_null(&self, row: usize) -> bool { @@ -132,8 +111,7 @@ impl Vector for BooleanVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) + Arc::new(Self::from(self.array.slice(offset, length))) } fn get(&self, index: usize) -> Value { @@ -148,7 +126,7 @@ impl Vector for BooleanVector { impl ScalarVector for BooleanVector { type OwnedItem = bool; type RefItem<'a> = bool; - type Iter<'a> = ArrayIter<&'a BooleanArray>; + type Iter<'a> = ZipValidity<'a, bool, BitmapIter<'a>>; type Builder = BooleanVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -165,7 +143,7 @@ impl ScalarVector for BooleanVector { } pub struct BooleanVectorBuilder { - mutable_array: BooleanBuilder, + mutable_array: MutableBooleanArray, } impl MutableVector for BooleanVectorBuilder { @@ -190,15 +168,12 @@ impl MutableVector for BooleanVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_boolean()? 
{ - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } + self.mutable_array.push(value.as_boolean()?); Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length) + vectors::impl_extend_for_builder!(self.mutable_array, vector, BooleanVector, offset, length) } } @@ -207,20 +182,17 @@ impl ScalarVectorBuilder for BooleanVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: BooleanBuilder::with_capacity(capacity), + mutable_array: MutableBooleanArray::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } + self.mutable_array.push(value); } fn finish(&mut self) -> Self::VectorType { BooleanVector { - array: self.mutable_array.finish(), + array: std::mem::take(&mut self.mutable_array).into(), } } } @@ -253,9 +225,9 @@ mod tests { assert_eq!(9, v.len()); assert_eq!("BooleanVector", v.vector_type_name()); assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); + assert_eq!(Validity::AllValid, v.validity()); assert!(!v.only_null()); - assert_eq!(64, v.memory_size()); + assert_eq!(2, v.memory_size()); for (i, b) in bools.iter().enumerate() { assert!(!v.is_null(i)); @@ -344,12 +316,13 @@ mod tests { let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); assert_eq!(1, vector.null_count()); let validity = vector.validity(); - assert_eq!(1, validity.null_count()); - assert!(!validity.is_set(1)); + let slots = validity.slots().unwrap(); + assert_eq!(1, slots.null_count()); + assert!(!slots.get_bit(1)); let vector = BooleanVector::from(vec![true, false, false]); assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); + assert_eq!(Validity::AllValid, vector.validity()); } #[test] diff --git 
a/src/datatypes/src/vectors/builder.rs b/src/datatypes2/src/vectors/builder.rs similarity index 100% rename from src/datatypes/src/vectors/builder.rs rename to src/datatypes2/src/vectors/builder.rs diff --git a/src/datatypes2/src/vectors/constant.rs b/src/datatypes2/src/vectors/constant.rs index 87739e9131..d5522007a1 100644 --- a/src/datatypes2/src/vectors/constant.rs +++ b/src/datatypes2/src/vectors/constant.rs @@ -55,27 +55,6 @@ impl ConstantVector { pub fn get_constant_ref(&self) -> ValueRef { self.vector.get_ref(0) } - - pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef { - assert_eq!(offsets.len(), self.len()); - - if offsets.is_empty() { - return self.slice(0, 0); - } - - Arc::new(ConstantVector::new( - self.vector.clone(), - *offsets.last().unwrap(), - )) - } - - pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result { - let length = self.len() - filter.false_count(); - if length == self.len() { - return Ok(Arc::new(self.clone())); - } - Ok(Arc::new(ConstantVector::new(self.inner().clone(), length))) - } } impl Vector for ConstantVector { @@ -111,9 +90,9 @@ impl Vector for ConstantVector { fn validity(&self) -> Validity { if self.vector.is_null(0) { - Validity::all_null(self.length) + Validity::AllNull } else { - Validity::all_valid(self.length) + Validity::AllValid } } @@ -143,14 +122,6 @@ impl Vector for ConstantVector { fn get_ref(&self, _index: usize) -> ValueRef { self.vector.get_ref(0) } - - fn null_count(&self) -> usize { - if self.only_null() { - self.len() - } else { - 0 - } - } } impl fmt::Debug for ConstantVector { @@ -169,6 +140,33 @@ impl Serializable for ConstantVector { } } +pub(crate) fn replicate_constant(vector: &ConstantVector, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), vector.len()); + + if offsets.is_empty() { + return vector.slice(0, 0); + } + + Arc::new(ConstantVector::new( + vector.vector.clone(), + *offsets.last().unwrap(), + )) +} + +pub(crate) fn filter_constant( + vector: 
&ConstantVector, + filter: &BooleanVector, +) -> Result { + let length = filter.len() - filter.as_boolean_array().values().null_count(); + if length == vector.len() { + return Ok(Arc::new(vector.clone())); + } + Ok(Arc::new(ConstantVector::new( + vector.inner().clone(), + length, + ))) +} + #[cfg(test)] mod tests { use arrow::datatypes::DataType as ArrowDataType; @@ -184,9 +182,9 @@ mod tests { assert_eq!("ConstantVector", c.vector_type_name()); assert!(c.is_const()); assert_eq!(10, c.len()); - assert!(c.validity().is_all_valid()); + assert_eq!(Validity::AllValid, c.validity()); assert!(!c.only_null()); - assert_eq!(64, c.memory_size()); + assert_eq!(4, c.memory_size()); for i in 0..10 { assert!(!c.is_null(i)); diff --git a/src/datatypes2/src/vectors/date.rs b/src/datatypes2/src/vectors/date.rs index d0a66b80fb..0198b3622f 100644 --- a/src/datatypes2/src/vectors/date.rs +++ b/src/datatypes2/src/vectors/date.rs @@ -12,28 +12,258 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::types::DateType; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; +use std::any::Any; +use std::sync::Arc; -// Vector for [`Date`](common_time::Date). -pub type DateVector = PrimitiveVector; -// Builder to build DateVector. 
-pub type DateVectorBuilder = PrimitiveVectorBuilder; +use arrow::array::{Array, ArrayRef, PrimitiveArray}; +use common_time::date::Date; +use snafu::OptionExt; + +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::prelude::*; +use crate::scalars::ScalarVector; +use crate::serialize::Serializable; +use crate::vectors::{MutableVector, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; + +#[derive(Debug, Clone, PartialEq)] +pub struct DateVector { + array: PrimitiveVector, +} + +impl DateVector { + pub fn new(array: PrimitiveArray) -> Self { + Self { + array: PrimitiveVector { array }, + } + } + + pub fn try_from_arrow_array(array: impl AsRef) -> Result { + Ok(Self::new( + array + .as_ref() + .as_any() + .downcast_ref::>() + .with_context(|| error::ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })? + .clone(), + )) + } + + pub(crate) fn as_arrow(&self) -> &dyn Array { + self.array.as_arrow() + } +} + +impl Vector for DateVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::date_datatype() + } + + fn vector_type_name(&self) -> String { + "DateVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let validity = self.array.array.validity().cloned(); + let buffer = self.array.array.values().clone(); + Arc::new(PrimitiveArray::new( + arrow::datatypes::DataType::Date32, + buffer, + validity, + )) + } + + fn to_boxed_arrow_array(&self) -> Box { + let validity = self.array.array.validity().cloned(); + let buffer = self.array.array.values().clone(); + Box::new(PrimitiveArray::new( + arrow::datatypes::DataType::Date32, + buffer, + validity, + )) + } + + fn validity(&self) -> Validity { + self.array.validity() + } + + fn memory_size(&self) -> usize { + self.array.memory_size() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: 
usize, length: usize) -> VectorRef { + Arc::new(Self { + array: PrimitiveVector::new(self.array.array.slice(offset, length)), + }) + } + + fn get(&self, index: usize) -> Value { + match self.array.get(index) { + Value::Int32(v) => Value::Date(Date::new(v)), + Value::Null => Value::Null, + _ => { + unreachable!() + } + } + } + + fn get_ref(&self, index: usize) -> ValueRef { + match self.array.get(index) { + Value::Int32(v) => ValueRef::Date(Date::new(v)), + Value::Null => ValueRef::Null, + _ => { + unreachable!() + } + } + } +} + +impl From>> for DateVector { + fn from(data: Vec>) -> Self { + Self { + array: PrimitiveVector::::from(data), + } + } +} + +pub struct DateIter<'a> { + iter: PrimitiveIter<'a, i32>, +} + +impl<'a> Iterator for DateIter<'a> { + type Item = Option; + + fn next(&mut self) -> Option { + self.iter.next().map(|v| v.map(Date::new)) + } +} + +impl ScalarVector for DateVector { + type OwnedItem = Date; + type RefItem<'a> = Date; + type Iter<'a> = DateIter<'a>; + + type Builder = DateVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + self.array.get_data(idx).map(Date::new) + } + + fn iter_data(&self) -> Self::Iter<'_> { + DateIter { + iter: self.array.iter_data(), + } + } +} + +impl Serializable for DateVector { + fn serialize_to_json(&self) -> Result> { + Ok(self + .array + .iter_data() + .map(|v| v.map(Date::new)) + .map(|v| match v { + None => serde_json::Value::Null, + Some(v) => v.into(), + }) + .collect::>()) + } +} + +pub struct DateVectorBuilder { + buffer: PrimitiveVectorBuilder, +} + +impl MutableVector for DateVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::date_datatype() + } + + fn len(&self) -> usize { + self.buffer.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + 
self.buffer.push(value.as_date()?.map(|d| d.val())); + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + let concrete_vector = vector + .as_any() + .downcast_ref::() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to convert vector from {} to DateVector", + vector.vector_type_name() + ), + })?; + self.buffer + .extend_slice_of(&concrete_vector.array, offset, length)?; + Ok(()) + } +} + +impl ScalarVectorBuilder for DateVectorBuilder { + type VectorType = DateVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + buffer: PrimitiveVectorBuilder::with_capacity(capacity), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + self.buffer.push(value.map(|d| d.val())) + } + + fn finish(&mut self) -> Self::VectorType { + Self::VectorType { + array: self.buffer.finish(), + } + } +} + +pub(crate) fn replicate_date(vector: &DateVector, offsets: &[usize]) -> VectorRef { + let array = crate::vectors::primitive::replicate_primitive_with_type( + &vector.array, + offsets, + vector.data_type(), + ); + Arc::new(DateVector { array }) +} #[cfg(test)] mod tests { - use std::sync::Arc; - - use arrow::array::Array; - use common_time::date::Date; - use super::*; use crate::data_type::DataType; - use crate::scalars::{ScalarVector, ScalarVectorBuilder}; - use crate::serialize::Serializable; use crate::types::DateType; - use crate::value::{Value, ValueRef}; - use crate::vectors::{Vector, VectorRef}; #[test] fn test_build_date_vector() { @@ -58,7 +288,7 @@ mod tests { #[test] fn test_date_scalar() { - let vector = DateVector::from_slice(&[1, 2]); + let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]); assert_eq!(2, vector.len()); assert_eq!(Some(Date::new(1)), vector.get_data(0)); assert_eq!(Some(Date::new(2)), vector.get_data(1)); @@ -66,7 +296,7 @@ mod tests { #[test] fn test_date_vector_builder() { - let input = DateVector::from_slice(&[1, 2, 3]); + let input = 
DateVector::from_slice(&[Date::new(1), Date::new(2), Date::new(3)]); let mut builder = DateType::default().create_mutable_vector(3); builder @@ -79,25 +309,19 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateVector::from_slice(&[5, 2, 3])); + let expect: VectorRef = Arc::new(DateVector::from_slice(&[ + Date::new(5), + Date::new(2), + Date::new(3), + ])); assert_eq!(expect, vector); } #[test] fn test_date_from_arrow() { - let vector = DateVector::from_slice(&[1, 2]); + let vector = DateVector::from_slice(&[Date::new(1), Date::new(2)]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, vector2); } - - #[test] - fn test_serialize_date_vector() { - let vector = DateVector::from_slice(&[-1, 0, 1]); - let serialized_json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!( - r#"["1969-12-31","1970-01-01","1970-01-02"]"#, - serialized_json - ); - } } diff --git a/src/datatypes2/src/vectors/datetime.rs b/src/datatypes2/src/vectors/datetime.rs index a40a3e54d3..732e56004c 100644 --- a/src/datatypes2/src/vectors/datetime.rs +++ b/src/datatypes2/src/vectors/datetime.rs @@ -12,32 +12,264 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::types::DateTimeType; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; +use std::any::Any; +use std::sync::Arc; -/// Vector of [`DateTime`](common_time::Date) -pub type DateTimeVector = PrimitiveVector; -/// Builder for [`DateTimeVector`]. 
-pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; +use arrow::array::{Array, ArrayRef, PrimitiveArray}; +use common_time::datetime::DateTime; +use snafu::OptionExt; + +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::prelude::{ + MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, +}; +use crate::serialize::Serializable; +use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; + +#[derive(Debug, Clone, PartialEq)] +pub struct DateTimeVector { + array: PrimitiveVector, +} + +impl DateTimeVector { + pub fn new(array: PrimitiveArray) -> Self { + Self { + array: PrimitiveVector { array }, + } + } + + pub fn try_from_arrow_array(array: impl AsRef) -> Result { + Ok(Self::new( + array + .as_ref() + .as_any() + .downcast_ref::>() + .with_context(|| error::ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })? + .clone(), + )) + } + + pub(crate) fn as_arrow(&self) -> &dyn Array { + self.array.as_arrow() + } +} + +impl Vector for DateTimeVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::datetime_datatype() + } + + fn vector_type_name(&self) -> String { + "DateTimeVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let validity = self.array.array.validity().cloned(); + let buffer = self.array.array.values().clone(); + Arc::new(PrimitiveArray::new( + arrow::datatypes::DataType::Date64, + buffer, + validity, + )) + } + + fn to_boxed_arrow_array(&self) -> Box { + let validity = self.array.array.validity().cloned(); + let buffer = self.array.array.values().clone(); + Box::new(PrimitiveArray::new( + arrow::datatypes::DataType::Date64, + buffer, + validity, + )) + } + + fn validity(&self) -> Validity { + self.array.validity() + } + + fn memory_size(&self) -> usize { + self.array.memory_size() + } + + fn 
is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: usize, length: usize) -> VectorRef { + Arc::new(Self { + array: PrimitiveVector::new(self.array.array.slice(offset, length)), + }) + } + + fn get(&self, index: usize) -> Value { + match self.array.get(index) { + Value::Int64(v) => Value::DateTime(DateTime::new(v)), + Value::Null => Value::Null, + _ => { + unreachable!() + } + } + } + + fn get_ref(&self, index: usize) -> ValueRef { + match self.array.get(index) { + Value::Int64(v) => ValueRef::DateTime(DateTime::new(v)), + Value::Null => ValueRef::Null, + _ => { + unreachable!() + } + } + } +} + +impl Serializable for DateTimeVector { + fn serialize_to_json(&self) -> crate::Result> { + Ok(self + .array + .iter_data() + .map(|v| v.map(DateTime::new)) + .map(|v| match v { + None => serde_json::Value::Null, + Some(v) => v.into(), + }) + .collect::>()) + } +} + +impl From>> for DateTimeVector { + fn from(data: Vec>) -> Self { + Self { + array: PrimitiveVector::::from(data), + } + } +} + +pub struct DateTimeVectorBuilder { + buffer: PrimitiveVectorBuilder, +} + +impl ScalarVectorBuilder for DateTimeVectorBuilder { + type VectorType = DateTimeVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + buffer: PrimitiveVectorBuilder::with_capacity(capacity), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + self.buffer.push(value.map(|d| d.val())) + } + + fn finish(&mut self) -> Self::VectorType { + Self::VectorType { + array: self.buffer.finish(), + } + } +} + +impl MutableVector for DateTimeVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::datetime_datatype() + } + + fn len(&self) -> usize { + self.buffer.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + 
self.buffer.push(value.as_datetime()?.map(|d| d.val())); + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + let concrete_vector = vector + .as_any() + .downcast_ref::() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to convert vector from {} to DateVector", + vector.vector_type_name() + ), + })?; + self.buffer + .extend_slice_of(&concrete_vector.array, offset, length)?; + Ok(()) + } +} + +pub struct DateTimeIter<'a> { + iter: PrimitiveIter<'a, i64>, +} + +impl<'a> Iterator for DateTimeIter<'a> { + type Item = Option; + + fn next(&mut self) -> Option { + self.iter.next().map(|v| v.map(DateTime::new)) + } +} + +impl ScalarVector for DateTimeVector { + type OwnedItem = DateTime; + type RefItem<'a> = DateTime; + type Iter<'a> = DateTimeIter<'a>; + type Builder = DateTimeVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + self.array.get_data(idx).map(DateTime::new) + } + + fn iter_data(&self) -> Self::Iter<'_> { + DateTimeIter { + iter: self.array.iter_data(), + } + } +} + +pub(crate) fn replicate_datetime(vector: &DateTimeVector, offsets: &[usize]) -> VectorRef { + let array = crate::vectors::primitive::replicate_primitive_with_type( + &vector.array, + offsets, + vector.data_type(), + ); + Arc::new(DateTimeVector { array }) +} #[cfg(test)] mod tests { - use std::sync::Arc; - - use arrow::array::{Array, PrimitiveArray}; - use common_time::DateTime; - use datafusion_common::from_slice::FromSlice; + use std::assert_matches::assert_matches; use super::*; use crate::data_type::DataType; - use crate::prelude::{ - ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, - }; - use crate::serialize::Serializable; + use crate::types::DateTimeType; #[test] fn test_datetime_vector() { - let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); + let v = DateTimeVector::new(PrimitiveArray::from_vec(vec![1, 2, 3])); 
assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); assert_eq!(3, v.len()); assert_eq!("DateTimeVector", v.vector_type_name()); @@ -55,8 +287,9 @@ mod tests { assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); assert!(!v.is_null(0)); - assert_eq!(64, v.memory_size()); + assert_eq!(24, v.memory_size()); // size of i64 * 3 + assert_matches!(v.validity(), Validity::AllValid); if let Value::DateTime(d) = v.get(0) { assert_eq!(1, d.val()); } else { @@ -81,11 +314,8 @@ mod tests { assert_eq!(Value::Null, v.get(1)); assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); - let input = DateTimeVector::from_wrapper_slice(&[ - DateTime::new(1), - DateTime::new(2), - DateTime::new(3), - ]); + let input = + DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2), DateTime::new(3)]); let mut builder = DateTimeType::default().create_mutable_vector(3); builder @@ -98,7 +328,7 @@ mod tests { .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[ + let expect: VectorRef = Arc::new(DateTimeVector::from_slice(&[ DateTime::new(5), DateTime::new(2), DateTime::new(3), @@ -108,7 +338,7 @@ mod tests { #[test] fn test_datetime_from_arrow() { - let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]); + let vector = DateTimeVector::from_slice(&[DateTime::new(1), DateTime::new(2)]); let arrow = vector.as_arrow().slice(0, vector.len()); let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap(); assert_eq!(vector, vector2); diff --git a/src/datatypes2/src/vectors/eq.rs b/src/datatypes2/src/vectors/eq.rs index 55359026d4..d47167c3f9 100644 --- a/src/datatypes2/src/vectors/eq.rs +++ b/src/datatypes2/src/vectors/eq.rs @@ -15,12 +15,9 @@ use std::sync::Arc; use crate::data_type::DataType; -use crate::types::TimestampType; -use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BinaryVector, 
BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector, - StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, - TimestampNanosecondVector, TimestampSecondVector, Vector, + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, + PrimitiveVector, StringVector, TimestampVector, Vector, }; use crate::with_match_primitive_type_id; @@ -79,20 +76,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { String(_) => is_vector_eq!(StringVector, lhs, rhs), Date(_) => is_vector_eq!(DateVector, lhs, rhs), DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs), - Timestamp(t) => match t { - TimestampType::Second(_) => { - is_vector_eq!(TimestampSecondVector, lhs, rhs) - } - TimestampType::Millisecond(_) => { - is_vector_eq!(TimestampMillisecondVector, lhs, rhs) - } - TimestampType::Microsecond(_) => { - is_vector_eq!(TimestampMicrosecondVector, lhs, rhs) - } - TimestampType::Nanosecond(_) => { - is_vector_eq!(TimestampNanosecondVector, lhs, rhs) - } - }, + Timestamp(_) => is_vector_eq!(TimestampVector, lhs, rhs), List(_) => is_vector_eq!(ListVector, lhs, rhs), UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_) | Float32(_) | Float64(_) => { @@ -111,10 +95,13 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { #[cfg(test)] mod tests { + use arrow::array::{ListArray, MutableListArray, MutablePrimitiveArray, TryExtend}; + use super::*; use crate::vectors::{ - list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, - NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, + Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, + NullVector, TimestampVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, + VectorRef, }; fn assert_vector_ref_eq(vector: VectorRef) { @@ -145,21 +132,14 @@ mod tests { assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); 
assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)]))); assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)]))); - assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120]))); - assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ - 100, 120, - ]))); - assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ - 100, 120, - ]))); - assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampVector::from_values([100, 120]))); - let list_vector = list::tests::new_list_vector(&[ - Some(vec![Some(1), Some(2)]), - None, - Some(vec![Some(3), Some(4)]), - ]); - assert_vector_ref_eq(Arc::new(list_vector)); + let mut arrow_array = MutableListArray::>::new(); + arrow_array + .try_extend(vec![Some(vec![Some(1), Some(2), Some(3)])]) + .unwrap(); + let arrow_array: ListArray = arrow_array.into(); + assert_vector_ref_eq(Arc::new(ListVector::from(arrow_array))); assert_vector_ref_eq(Arc::new(NullVector::new(4))); assert_vector_ref_eq(Arc::new(StringVector::from(vec![ diff --git a/src/datatypes2/src/vectors/helper.rs b/src/datatypes2/src/vectors/helper.rs index f3236ca0ec..60a9f8511f 100644 --- a/src/datatypes2/src/vectors/helper.rs +++ b/src/datatypes2/src/vectors/helper.rs @@ -17,26 +17,19 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, StringArray}; +use arrow::array::Array; use arrow::compute; -use arrow::compute::kernels::comparison; -use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; +use arrow::datatypes::DataType as ArrowDataType; use datafusion_common::ScalarValue; use snafu::{OptionExt, ResultExt}; -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; -use crate::scalars::{Scalar, ScalarVectorBuilder}; -use crate::value::{ListValue, ListValueRef}; -use crate::vectors::{ - BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, 
Float32Vector, - Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, - ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector, - TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, - UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, -}; +use crate::arrow_array::StringArray; +use crate::error::{ConversionSnafu, Result, UnknownVectorSnafu}; +use crate::scalars::*; +use crate::vectors::date::DateVector; +use crate::vectors::datetime::DateTimeVector; +use crate::vectors::*; -/// Helper functions for `Vector`. pub struct Helper; impl Helper { @@ -54,7 +47,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| error::UnknownVectorSnafu { + .with_context(|| UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -68,7 +61,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::() - .with_context(|| error::UnknownVectorSnafu { + .with_context(|| UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -85,7 +78,7 @@ impl Helper { let arr = vector .as_mut_any() .downcast_mut() - .with_context(|| error::UnknownVectorSnafu { + .with_context(|| UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", ty, @@ -101,7 +94,7 @@ impl Helper { let arr = vector .as_any() .downcast_ref::<::VectorType>() - .with_context(|| error::UnknownVectorSnafu { + .with_context(|| UnknownVectorSnafu { msg: format!( "downcast vector error, vector type: {:?}, expected vector: {:?}", vector.vector_type_name(), @@ -112,9 +105,11 @@ impl Helper { } /// Try to cast an arrow scalar value into vector + /// + /// # Panics + /// Panic if given scalar value is not supported. 
pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { let vector = match value { - ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), ScalarValue::Boolean(v) => { ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) } @@ -148,29 +143,17 @@ impl Helper { ScalarValue::UInt64(v) => { ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) } - ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { + ScalarValue::Utf8(v) => { ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) } - ScalarValue::Binary(v) - | ScalarValue::LargeBinary(v) - | ScalarValue::FixedSizeBinary(_, v) => { + ScalarValue::LargeUtf8(v) => { + ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) + } + ScalarValue::Binary(v) => { ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) } - ScalarValue::List(v, field) => { - let item_type = ConcreteDataType::try_from(field.data_type())?; - let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); - if let Some(values) = v { - let values = values - .into_iter() - .map(ScalarValue::try_into) - .collect::>()?; - let list_value = ListValue::new(Some(Box::new(values)), item_type); - builder.push(Some(ListValueRef::Ref { val: &list_value })); - } else { - builder.push(None); - } - let list_vector = builder.to_vector(); - ConstantVector::new(list_vector, length) + ScalarValue::LargeBinary(v) => { + ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) } ScalarValue::Date32(v) => { ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) @@ -178,30 +161,8 @@ impl Helper { ScalarValue::Date64(v) => { ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) } - ScalarValue::TimestampSecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) - } - ScalarValue::TimestampMillisecond(v, _) => { - // Timezone is unimplemented now. 
- ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) - } - ScalarValue::TimestampMicrosecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) - } - ScalarValue::TimestampNanosecond(v, _) => { - // Timezone is unimplemented now. - ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) - } - ScalarValue::Decimal128(_, _, _) - | ScalarValue::Time64(_) - | ScalarValue::IntervalYearMonth(_) - | ScalarValue::IntervalDayTime(_) - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Struct(_, _) - | ScalarValue::Dictionary(_, _) => { - return error::ConversionSnafu { + _ => { + return ConversionSnafu { from: format!("Unsupported scalar value: {}", value), } .fail() @@ -219,7 +180,9 @@ impl Helper { Ok(match array.as_ref().data_type() { ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), - ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), + ArrowDataType::Binary | ArrowDataType::LargeBinary => { + Arc::new(BinaryVector::try_from_arrow_array(array)?) + } ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), @@ -230,80 +193,48 @@ impl Helper { ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), - ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { + Arc::new(StringVector::try_from_arrow_array(array)?) 
+ } ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), - ArrowDataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?), - TimeUnit::Millisecond => { - Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?) - } - TimeUnit::Microsecond => { - Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?) - } - TimeUnit::Nanosecond => { - Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) - } - }, - ArrowDataType::Float16 - | ArrowDataType::Time32(_) - | ArrowDataType::Time64(_) - | ArrowDataType::Duration(_) - | ArrowDataType::Interval(_) - | ArrowDataType::Binary - | ArrowDataType::FixedSizeBinary(_) - | ArrowDataType::LargeUtf8 - | ArrowDataType::LargeList(_) - | ArrowDataType::FixedSizeList(_, _) - | ArrowDataType::Struct(_) - | ArrowDataType::Union(_, _, _) - | ArrowDataType::Dictionary(_, _) - | ArrowDataType::Decimal128(_, _) - | ArrowDataType::Decimal256(_, _) - | ArrowDataType::Map(_, _) => { - unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) + ArrowDataType::Timestamp(_, _) => { + Arc::new(TimestampVector::try_from_arrow_array(array)?) } + _ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()), }) } - /// Try to cast slice of `arrays` to vectors. pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result> { arrays.iter().map(Self::try_into_vector).collect() } - /// Perform SQL like operation on `names` and a scalar `s`. 
pub fn like_utf8(names: Vec, s: &str) -> Result { - let array = StringArray::from(names); + let array = StringArray::from_slice(&names); - let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; + let filter = + compute::like::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; - let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?; + let result = compute::filter::filter(&array, &filter).context(error::ArrowComputeSnafu)?; Helper::try_into_vector(result) } } #[cfg(test)] mod tests { - use arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }; - use arrow::datatypes::{Field, Int32Type}; - use common_time::{Date, DateTime}; + use arrow::array::Int32Array; + use common_time::date::Date; + use common_time::datetime::DateTime; use super::*; - use crate::value::Value; - use crate::vectors::ConcreteDataType; #[test] fn test_try_into_vectors() { let arrays: Vec = vec![ - Arc::new(Int32Array::from(vec![1])), - Arc::new(Int32Array::from(vec![2])), - Arc::new(Int32Array::from(vec![3])), + Arc::new(Int32Array::from_vec(vec![1])), + Arc::new(Int32Array::from_vec(vec![2])), + Arc::new(Int32Array::from_vec(vec![3])), ]; let vectors = Helper::try_into_vectors(&arrays); assert!(vectors.is_ok()); @@ -315,10 +246,10 @@ mod tests { } #[test] - fn test_try_into_date_vector() { + pub fn test_try_into_date_vector() { let vector = DateVector::from(vec![Some(1), Some(2), None]); let arrow_array = vector.to_arrow_array(); - assert_eq!(&ArrowDataType::Date32, arrow_array.data_type()); + assert_eq!(&arrow::datatypes::DataType::Date32, arrow_array.data_type()); let vector_converted = Helper::try_into_vector(arrow_array).unwrap(); 
assert_eq!(vector.len(), vector_converted.len()); for i in 0..vector_converted.len() { @@ -327,7 +258,7 @@ mod tests { } #[test] - fn test_try_from_scalar_date_value() { + pub fn test_try_from_scalar_date_value() { let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap(); assert_eq!(ConcreteDataType::date_datatype(), vector.data_type()); assert_eq!(3, vector.len()); @@ -337,7 +268,7 @@ mod tests { } #[test] - fn test_try_from_scalar_datetime_value() { + pub fn test_try_from_scalar_datetime_value() { let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap(); assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type()); assert_eq!(3, vector.len()); @@ -346,28 +277,6 @@ mod tests { } } - #[test] - fn test_try_from_list_value() { - let value = ScalarValue::List( - Some(vec![ - ScalarValue::Int32(Some(1)), - ScalarValue::Int32(Some(2)), - ]), - Box::new(Field::new("item", ArrowDataType::Int32, true)), - ); - let vector = Helper::try_from_scalar_value(value, 3).unwrap(); - assert_eq!( - ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), - vector.data_type() - ); - assert_eq!(3, vector.len()); - for i in 0..vector.len() { - let v = vector.get(i); - let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap(); - assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items); - } - } - #[test] fn test_like_utf8() { fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { @@ -392,40 +301,4 @@ mod tests { let ret = Helper::like_utf8(names, "%").unwrap(); assert_vector(vec!["greptime", "hello", "public", "world"], &ret); } - - fn check_try_into_vector(array: impl Array + 'static) { - let array: ArrayRef = Arc::new(array); - let vector = Helper::try_into_vector(array.clone()).unwrap(); - assert_eq!(&array, &vector.to_arrow_array()); - } - - #[test] - fn test_try_into_vector() { - check_try_into_vector(NullArray::new(2)); - check_try_into_vector(BooleanArray::from(vec![true, false])); - 
check_try_into_vector(LargeBinaryArray::from(vec![ - "hello".as_bytes(), - "world".as_bytes(), - ])); - check_try_into_vector(Int8Array::from(vec![1, 2, 3])); - check_try_into_vector(Int16Array::from(vec![1, 2, 3])); - check_try_into_vector(Int32Array::from(vec![1, 2, 3])); - check_try_into_vector(Int64Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); - check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); - check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); - check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); - check_try_into_vector(StringArray::from(vec!["hello", "world"])); - check_try_into_vector(Date32Array::from(vec![1, 2, 3])); - check_try_into_vector(Date64Array::from(vec![1, 2, 3])); - let data = vec![None, Some(vec![Some(6), Some(7)])]; - let list_array = ListArray::from_iter_primitive::(data); - check_try_into_vector(list_array); - check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); - check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); - check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); - check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); - } } diff --git a/src/datatypes2/src/vectors/list.rs b/src/datatypes2/src/vectors/list.rs index 747e03557b..76d9dd8717 100644 --- a/src/datatypes2/src/vectors/list.rs +++ b/src/datatypes2/src/vectors/list.rs @@ -13,48 +13,39 @@ // limitations under the License. 
use std::any::Any; +use std::ops::Range; use std::sync::Arc; -use arrow::array::{ - Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray, -}; -use arrow::buffer::Buffer; +use arrow::array::{Array, ArrayRef, ListArray}; +use arrow::bitmap::utils::ZipValidity; +use arrow::bitmap::MutableBitmap; use arrow::datatypes::DataType as ArrowDataType; use serde_json::Value as JsonValue; +use snafu::prelude::*; -use crate::data_type::{ConcreteDataType, DataType}; use crate::error::Result; -use crate::scalars::{ScalarVector, ScalarVectorBuilder}; +use crate::prelude::*; use crate::serialize::Serializable; use crate::types::ListType; -use crate::value::{ListValue, ListValueRef, Value, ValueRef}; -use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef}; +use crate::value::{ListValue, ListValueRef}; +use crate::vectors::{impl_try_from_arrow_array_for_vector, impl_validity_for_vector}; + +type ArrowListArray = ListArray; /// Vector of Lists, basically backed by Arrow's `ListArray`. -#[derive(Debug, PartialEq)] +#[derive(Debug, Clone, PartialEq)] pub struct ListVector { - array: ListArray, - /// The datatype of the items in the list. - item_type: ConcreteDataType, + array: ArrowListArray, + inner_datatype: ConcreteDataType, } impl ListVector { - /// Iterate elements as [VectorRef]. - pub fn values_iter(&self) -> impl Iterator>> + '_ { - self.array - .iter() - .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose()) - } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self { - Self { - array: ListArray::from(data), - item_type, - } + /// Only iterate values in the [ListVector]. + /// + /// Be careful to use this method as it would ignore validity and replace null + /// by empty vector. 
+ pub fn values_iter(&self) -> Box> + '_> { + Box::new(self.array.values_iter().map(VectorHelper::try_into_vector)) } pub(crate) fn as_arrow(&self) -> &dyn Array { @@ -64,7 +55,7 @@ impl ListVector { impl Vector for ListVector { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::List(ListType::new(self.item_type.clone())) + ConcreteDataType::List(ListType::new(self.inner_datatype.clone())) } fn vector_type_name(&self) -> String { @@ -80,25 +71,21 @@ impl Vector for ListVector { } fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(ListArray::from(data)) + Arc::new(self.array.clone()) } fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(ListArray::from(data)) + Box::new(self.array.clone()) } fn validity(&self) -> Validity { - vectors::impl_validity_for_vector!(self.array) + impl_validity_for_vector!(self.array) } fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() + let offsets_bytes = self.array.offsets().len() * std::mem::size_of::(); + let value_refs_bytes = self.array.values().len() * std::mem::size_of::>(); + offsets_bytes + value_refs_bytes } fn is_null(&self, row: usize) -> bool { @@ -106,8 +93,7 @@ impl Vector for ListVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data_and_type(data, self.item_type.clone())) + Arc::new(ListVector::from(self.array.slice(offset, length))) } fn get(&self, index: usize) -> Value { @@ -116,7 +102,7 @@ impl Vector for ListVector { } let array = &self.array.value(index); - let vector = Helper::try_into_vector(array).unwrap_or_else(|_| { + let vector = VectorHelper::try_into_vector(array).unwrap_or_else(|_| { panic!( "arrow array with datatype {:?} cannot converted to our vector", array.data_type() @@ -127,7 +113,7 @@ impl Vector for ListVector { .collect::>(); 
Value::List(ListValue::new( Some(Box::new(values)), - self.item_type.clone(), + self.inner_datatype.clone(), )) } @@ -145,7 +131,7 @@ impl Serializable for ListVector { .iter() .map(|v| match v { None => Ok(JsonValue::Null), - Some(v) => Helper::try_into_vector(v) + Some(v) => VectorHelper::try_into_vector(v) .and_then(|v| v.serialize_to_json()) .map(JsonValue::Array), }) @@ -153,64 +139,70 @@ impl Serializable for ListVector { } } -impl From for ListVector { - fn from(array: ListArray) -> Self { - let item_type = ConcreteDataType::from_arrow_type(match array.data_type() { - ArrowDataType::List(field) => field.data_type(), - other => panic!( - "Try to create ListVector from an arrow array with type {:?}", - other - ), +impl From for ListVector { + fn from(array: ArrowListArray) -> Self { + let inner_datatype = ConcreteDataType::from_arrow_type(match array.data_type() { + ArrowDataType::List(field) => &field.data_type, + _ => unreachable!(), }); - Self { array, item_type } + Self { + array, + inner_datatype, + } } } -vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector); +impl_try_from_arrow_array_for_vector!(ArrowListArray, ListVector); -pub struct ListIter<'a> { +pub struct ListVectorIter<'a> { vector: &'a ListVector, - idx: usize, + iter: ZipValidity<'a, usize, Range>, } -impl<'a> ListIter<'a> { - fn new(vector: &'a ListVector) -> ListIter { - ListIter { vector, idx: 0 } +impl<'a> ListVectorIter<'a> { + pub fn new(vector: &'a ListVector) -> ListVectorIter<'a> { + let iter = ZipValidity::new( + 0..vector.len(), + vector.array.validity().as_ref().map(|x| x.iter()), + ); + + Self { vector, iter } } } -impl<'a> Iterator for ListIter<'a> { +impl<'a> Iterator for ListVectorIter<'a> { type Item = Option>; #[inline] fn next(&mut self) -> Option { - if self.idx >= self.vector.len() { - return None; - } - - let idx = self.idx; - self.idx += 1; - - if self.vector.is_null(idx) { - return Some(None); - } - - Some(Some(ListValueRef::Indexed { - vector: 
self.vector, - idx, - })) + self.iter.next().map(|idx_opt| { + idx_opt.map(|idx| ListValueRef::Indexed { + vector: self.vector, + idx, + }) + }) } #[inline] fn size_hint(&self) -> (usize, Option) { - (self.vector.len(), Some(self.vector.len())) + self.iter.size_hint() + } + + #[inline] + fn nth(&mut self, n: usize) -> Option { + self.iter.nth(n).map(|idx_opt| { + idx_opt.map(|idx| ListValueRef::Indexed { + vector: self.vector, + idx, + }) + }) } } impl ScalarVector for ListVector { type OwnedItem = ListValue; type RefItem<'a> = ListValueRef<'a>; - type Iter<'a> = ListIter<'a>; + type Iter<'a> = ListVectorIter<'a>; type Builder = ListVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -222,68 +214,86 @@ impl ScalarVector for ListVector { } fn iter_data(&self) -> Self::Iter<'_> { - ListIter::new(self) + ListVectorIter::new(self) } } -// Ports from arrow's GenericListBuilder. -// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs -/// [ListVector] builder. +// Some codes are ported from arrow2's MutableListArray. pub struct ListVectorBuilder { - item_type: ConcreteDataType, - offsets_builder: Int32BufferBuilder, - null_buffer_builder: NullBufferBuilder, - values_builder: Box, + inner_type: ConcreteDataType, + offsets: Vec, + values: Box, + validity: Option, } impl ListVectorBuilder { - /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity` - /// is the number of items to pre-allocate space for in this builder. - pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { - let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); - offsets_builder.append(0); - // The actual required capacity might be greater than the capacity of the `ListVector` - // if the child vector has more than one element. 
- let values_builder = item_type.create_mutable_vector(capacity); + pub fn with_type_capacity(inner_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(0); + // The actual required capacity might greater than the capacity of the `ListVector` + // if there exists child vector that has more than one element. + let values = inner_type.create_mutable_vector(capacity); ListVectorBuilder { - item_type, - offsets_builder, - null_buffer_builder: NullBufferBuilder::new(capacity), - values_builder, + inner_type, + offsets, + values, + validity: None, } } - /// Finish the current variable-length list vector slot. - fn finish_list(&mut self, is_valid: bool) { - self.offsets_builder - .append(i32::try_from(self.values_builder.len()).unwrap()); - self.null_buffer_builder.append(is_valid); + #[inline] + fn last_offset(&self) -> i32 { + *self.offsets.last().unwrap() } fn push_null(&mut self) { - self.finish_list(false); + self.offsets.push(self.last_offset()); + match &mut self.validity { + Some(validity) => validity.push(false), + None => self.init_validity(), + } + } + + fn init_validity(&mut self) { + let len = self.offsets.len() - 1; + + let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); + validity.extend_constant(len, true); + validity.set(len - 1, false); + self.validity = Some(validity) } fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> { if let Some(items) = list_value.items() { for item in &**items { - self.values_builder.push_value_ref(item.as_value_ref())?; + self.values.push_value_ref(item.as_value_ref())?; } } - - self.finish_list(true); + self.push_valid(); Ok(()) } + + /// Needs to be called when a valid value was extended to this builder. 
+ fn push_valid(&mut self) { + let size = self.values.len(); + let size = i32::try_from(size).unwrap(); + assert!(size >= *self.offsets.last().unwrap()); + + self.offsets.push(size); + if let Some(validity) = &mut self.validity { + validity.push(true) + } + } } impl MutableVector for ListVectorBuilder { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::list_datatype(self.item_type.clone()) + ConcreteDataType::list_datatype(self.inner_type.clone()) } fn len(&self) -> usize { - self.null_buffer_builder.len() + self.offsets.len() - 1 } fn as_any(&self) -> &dyn Any { @@ -338,181 +348,51 @@ impl ScalarVectorBuilder for ListVectorBuilder { self.push_value_ref(value.into()).unwrap_or_else(|e| { panic!( "Failed to push value, expect value type {:?}, err:{}", - self.item_type, e + self.inner_type, e ); }); } fn finish(&mut self) -> Self::VectorType { - let len = self.len(); - let values_vector = self.values_builder.to_vector(); - let values_arr = values_vector.to_arrow_array(); - let values_data = values_arr.data(); - - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.null_buffer_builder.finish(); - // Re-initialize the offsets_builder. - self.offsets_builder.append(0); - let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type(); - let array_data_builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data.clone()) - .null_bit_buffer(null_bit_buffer); - - let array_data = unsafe { array_data_builder.build_unchecked() }; - let array = ListArray::from(array_data); + let array = ArrowListArray::try_new( + ConcreteDataType::list_datatype(self.inner_type.clone()).as_arrow_type(), + std::mem::take(&mut self.offsets).into(), + self.values.to_vector().to_arrow_array(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .unwrap(); // The `ListVectorBuilder` itself should ensure it always builds a valid array. 
ListVector { array, - item_type: self.item_type.clone(), - } - } -} - -// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs -/// Builder for creating the null bit buffer. -/// This builder only materializes the buffer when we append `false`. -/// If you only append `true`s to the builder, what you get will be -/// `None` when calling [`finish`](#method.finish). -/// This optimization is **very** important for the performance. -#[derive(Debug)] -struct NullBufferBuilder { - bitmap_builder: Option, - /// Store the length of the buffer before materializing. - len: usize, - capacity: usize, -} - -impl NullBufferBuilder { - /// Creates a new empty builder. - /// `capacity` is the number of bits in the null buffer. - fn new(capacity: usize) -> Self { - Self { - bitmap_builder: None, - len: 0, - capacity, - } - } - - fn len(&self) -> usize { - if let Some(b) = &self.bitmap_builder { - b.len() - } else { - self.len - } - } - - /// Appends a `true` into the builder - /// to indicate that this item is not null. - #[inline] - fn append_non_null(&mut self) { - if let Some(buf) = self.bitmap_builder.as_mut() { - buf.append(true) - } else { - self.len += 1; - } - } - - /// Appends a `false` into the builder - /// to indicate that this item is null. - #[inline] - fn append_null(&mut self) { - self.materialize_if_needed(); - self.bitmap_builder.as_mut().unwrap().append(false); - } - - /// Appends a boolean value into the builder. - #[inline] - fn append(&mut self, not_null: bool) { - if not_null { - self.append_non_null() - } else { - self.append_null() - } - } - - /// Builds the null buffer and resets the builder. - /// Returns `None` if the builder only contains `true`s. 
- fn finish(&mut self) -> Option { - let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); - self.bitmap_builder = None; - self.len = 0; - buf - } - - #[inline] - fn materialize_if_needed(&mut self) { - if self.bitmap_builder.is_none() { - self.materialize() - } - } - - #[cold] - fn materialize(&mut self) { - if self.bitmap_builder.is_none() { - let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); - b.append_n(self.len, true); - self.bitmap_builder = Some(b); + inner_datatype: self.inner_type.clone(), } } } #[cfg(test)] -pub mod tests { - use arrow::array::{Int32Array, Int32Builder, ListBuilder}; +mod tests { + use arrow::array::{MutableListArray, MutablePrimitiveArray, TryExtend}; use serde_json::json; use super::*; - use crate::scalars::ScalarRef; use crate::types::ListType; - use crate::vectors::Int32Vector; - - pub fn new_list_vector(data: &[Option>>]) -> ListVector { - let mut builder = - ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); - for vec_opt in data { - if let Some(vec) = vec_opt { - let values = vec.iter().map(|v| Value::from(*v)).collect(); - let values = Some(Box::new(values)); - let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); - - builder.push(Some(ListValueRef::Ref { val: &list_value })); - } else { - builder.push(None); - } - } - - builder.finish() - } - - fn new_list_array(data: &[Option>>]) -> ListArray { - let mut builder = ListBuilder::new(Int32Builder::new()); - for vec_opt in data { - if let Some(vec) = vec_opt { - for value_opt in vec { - builder.values().append_option(*value_opt); - } - - builder.append(true); - } else { - builder.append(false); - } - } - - builder.finish() - } #[test] fn test_list_vector() { let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(1i32), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let list_vector = new_list_vector(&data); + let mut arrow_array = MutableListArray::>::new(); + 
arrow_array.try_extend(data).unwrap(); + let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = ListVector { + array: arrow_array.clone(), + inner_datatype: ConcreteDataType::int32_datatype(), + }; assert_eq!( ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), list_vector.data_type() @@ -523,34 +403,30 @@ pub mod tests { assert!(list_vector.is_null(1)); assert!(!list_vector.is_null(2)); - let arrow_array = new_list_array(&data); assert_eq!( arrow_array, - *list_vector + list_vector .to_arrow_array() .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap() + .clone() ); - let validity = list_vector.validity(); - assert!(!validity.is_all_null()); - assert!(!validity.is_all_valid()); - assert!(validity.is_set(0)); - assert!(!validity.is_set(1)); - assert!(validity.is_set(2)); - assert_eq!(256, list_vector.memory_size()); - - let slice = list_vector.slice(0, 2).to_arrow_array(); - let sliced_array = slice.as_any().downcast_ref::().unwrap(); assert_eq!( - Int32Array::from_iter_values([1, 2, 3]), - *sliced_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() + Validity::Slots(arrow_array.validity().unwrap()), + list_vector.validity() + ); + assert_eq!( + arrow_array.offsets().len() * std::mem::size_of::() + + arrow_array.values().len() * std::mem::size_of::>(), + list_vector.memory_size() + ); + + let slice = list_vector.slice(0, 2); + assert_eq!( + "ListArray[[1, 2, 3], None]", + format!("{:?}", slice.to_arrow_array()) ); - assert!(sliced_array.is_null(1)); assert_eq!( Value::List(ListValue::new( @@ -591,48 +467,52 @@ pub mod tests { #[test] fn test_from_arrow_array() { let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(1u32), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let arrow_array = new_list_array(&data); + let mut arrow_array = MutableListArray::>::new(); + arrow_array.try_extend(data).unwrap(); + let arrow_array: ArrowListArray = arrow_array.into(); let 
array_ref: ArrayRef = Arc::new(arrow_array); - let expect = new_list_vector(&data); - // Test try from ArrayRef let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); - assert_eq!(expect, list_vector); - - // Test from - let arrow_array = new_list_array(&data); - let list_vector = ListVector::from(arrow_array); - assert_eq!(expect, list_vector); + assert_eq!( + "ListVector { array: ListArray[[1, 2, 3], None, [4, None, 6]], inner_datatype: UInt32(UInt32) }", + format!("{:?}", list_vector) + ); } #[test] fn test_iter_list_vector_values() { let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(1i64), Some(2), Some(3)]), None, Some(vec![Some(4), None, Some(6)]), ]; - let list_vector = new_list_vector(&data); + let mut arrow_array = MutableListArray::>::new(); + arrow_array.try_extend(data).unwrap(); + let arrow_array: ArrowListArray = arrow_array.into(); + let list_vector = ListVector::from(arrow_array); assert_eq!( - ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), + ConcreteDataType::List(ListType::new(ConcreteDataType::int64_datatype())), list_vector.data_type() ); let mut iter = list_vector.values_iter(); assert_eq!( - Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, - *iter.next().unwrap().unwrap().unwrap() + "Int64[1, 2, 3]", + format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) ); - assert!(iter.next().unwrap().unwrap().is_none()); assert_eq!( - Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, - *iter.next().unwrap().unwrap().unwrap(), + "Int64[]", + format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) + ); + assert_eq!( + "Int64[4, None, 6]", + format!("{:?}", iter.next().unwrap().unwrap().to_arrow_array()) ); assert!(iter.next().is_none()) } @@ -640,18 +520,30 @@ pub mod tests { #[test] fn test_serialize_to_json() { let data = vec![ - Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(1i64), Some(2), Some(3)]), None, Some(vec![Some(4), 
None, Some(6)]), ]; - let list_vector = new_list_vector(&data); + let mut arrow_array = MutableListArray::>::new(); + arrow_array.try_extend(data).unwrap(); + let arrow_array: ArrowListArray = arrow_array.into(); + + let list_vector = ListVector::from(arrow_array); assert_eq!( vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], list_vector.serialize_to_json().unwrap() ); } + fn new_list_vector(data: Vec>>>) -> ListVector { + let mut arrow_array = MutableListArray::>::new(); + arrow_array.try_extend(data).unwrap(); + let arrow_array: ArrowListArray = arrow_array.into(); + + ListVector::from(arrow_array) + } + #[test] fn test_list_vector_builder() { let mut builder = @@ -675,14 +567,14 @@ pub mod tests { None, Some(vec![Some(7), Some(8), None]), ]; - let input = new_list_vector(&data); + let input = new_list_vector(data); builder.extend_slice_of(&input, 1, 2).unwrap(); assert!(builder .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) .is_err()); let vector = builder.to_vector(); - let expect: VectorRef = Arc::new(new_list_vector(&[ + let expect: VectorRef = Arc::new(new_list_vector(vec![ Some(vec![Some(4), None, Some(6)]), None, Some(vec![Some(7), Some(8), None]), @@ -707,7 +599,7 @@ pub mod tests { })); let vector = builder.finish(); - let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); + let expect = new_list_vector(vec![None, Some(vec![Some(4), None, Some(6)])]); assert_eq!(expect, vector); assert!(vector.get_data(0).is_none()); diff --git a/src/datatypes/src/vectors/mutable.rs b/src/datatypes2/src/vectors/mutable.rs similarity index 100% rename from src/datatypes/src/vectors/mutable.rs rename to src/datatypes2/src/vectors/mutable.rs diff --git a/src/datatypes2/src/vectors/null.rs b/src/datatypes2/src/vectors/null.rs index bb66e09b39..64974d99b0 100644 --- a/src/datatypes2/src/vectors/null.rs +++ b/src/datatypes2/src/vectors/null.rs @@ -16,7 +16,8 @@ use std::any::Any; use std::fmt; use std::sync::Arc; -use 
arrow::array::{Array, ArrayData, ArrayRef, NullArray}; +use arrow::array::{Array, ArrayRef, NullArray}; +use arrow::datatypes::DataType as ArrowDataType; use snafu::{ensure, OptionExt}; use crate::data_type::ConcreteDataType; @@ -26,28 +27,21 @@ use crate::types::NullType; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; -/// A vector where all elements are nulls. #[derive(PartialEq)] pub struct NullVector { array: NullArray, } -// TODO(yingwen): Support null vector with other logical types. impl NullVector { - /// Create a new `NullVector` with `n` elements. pub fn new(n: usize) -> Self { Self { - array: NullArray::new(n), + array: NullArray::new(ArrowDataType::Null, n), } } pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } } impl From for NullVector { @@ -74,28 +68,21 @@ impl Vector for NullVector { } fn to_arrow_array(&self) -> ArrayRef { - // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. 
- let data = self.to_array_data(); - Arc::new(NullArray::from(data)) + Arc::new(self.array.clone()) } fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(NullArray::from(data)) + Box::new(self.array.clone()) } fn validity(&self) -> Validity { - Validity::all_null(self.array.len()) + Validity::AllNull } fn memory_size(&self) -> usize { 0 } - fn null_count(&self) -> usize { - self.array.null_count() - } - fn is_null(&self, _row: usize) -> bool { true } @@ -230,7 +217,7 @@ mod tests { assert_eq!("NullVector", v.vector_type_name()); assert!(!v.is_const()); - assert!(v.validity().is_all_null()); + assert_eq!(Validity::AllNull, v.validity()); assert!(v.only_null()); for i in 0..32 { @@ -259,7 +246,7 @@ mod tests { #[test] fn test_null_vector_validity() { let vector = NullVector::new(5); - assert!(vector.validity().is_all_null()); + assert_eq!(Validity::AllNull, vector.validity()); assert_eq!(5, vector.null_count()); } diff --git a/src/datatypes2/src/vectors/operations.rs b/src/datatypes2/src/vectors/operations.rs index 70ddb4a031..e63f338a05 100644 --- a/src/datatypes2/src/vectors/operations.rs +++ b/src/datatypes2/src/vectors/operations.rs @@ -19,11 +19,10 @@ mod replicate; use common_base::BitVec; use crate::error::Result; -use crate::types::LogicalPrimitiveType; -use crate::vectors::constant::ConstantVector; +use crate::types::PrimitiveElement; use crate::vectors::{ - BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, - VectorRef, + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, ListVector, + NullVector, PrimitiveVector, StringVector, TimestampVector, Vector, VectorRef, }; /// Vector compute operations. @@ -60,10 +59,10 @@ pub trait VectorOp { } macro_rules! 
impl_scalar_vector_op { - ($($VectorType: ident),+) => {$( + ($( { $VectorType: ident, $replicate: ident } ),+) => {$( impl VectorOp for $VectorType { fn replicate(&self, offsets: &[usize]) -> VectorRef { - replicate::replicate_scalar(self, offsets) + replicate::$replicate(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { @@ -78,21 +77,28 @@ macro_rules! impl_scalar_vector_op { )+}; } -impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); +impl_scalar_vector_op!( + { BinaryVector, replicate_scalar }, + { BooleanVector, replicate_scalar }, + { ListVector, replicate_scalar }, + { StringVector, replicate_scalar }, + { DateVector, replicate_date }, + { DateTimeVector, replicate_datetime }, + { TimestampVector, replicate_timestamp } +); -impl VectorOp for PrimitiveVector { +impl VectorOp for ConstantVector { fn replicate(&self, offsets: &[usize]) -> VectorRef { - std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) + replicate::replicate_constant(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = - prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); - find_unique::find_unique_scalar(self, selected, prev_vector); + let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); + find_unique::find_unique_constant(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - filter::filter_non_constant!(self, PrimitiveVector, filter) + filter::filter_constant(self, filter) } } @@ -111,17 +117,21 @@ impl VectorOp for NullVector { } } -impl VectorOp for ConstantVector { +impl VectorOp for PrimitiveVector +where + T: PrimitiveElement, +{ fn replicate(&self, offsets: &[usize]) -> VectorRef { - self.replicate_vector(offsets) + replicate::replicate_primitive(self, offsets) } fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { - let prev_vector = 
prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); - find_unique::find_unique_constant(self, selected, prev_vector); + let prev_vector = + prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); + find_unique::find_unique_scalar(self, selected, prev_vector); } fn filter(&self, filter: &BooleanVector) -> Result { - self.filter_vector(filter) + filter::filter_non_constant!(self, PrimitiveVector, filter) } } diff --git a/src/datatypes2/src/vectors/operations/filter.rs b/src/datatypes2/src/vectors/operations/filter.rs index 8368a6afb4..7a9f514a16 100644 --- a/src/datatypes2/src/vectors/operations/filter.rs +++ b/src/datatypes2/src/vectors/operations/filter.rs @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub(crate) use crate::vectors::constant::filter_constant; + macro_rules! filter_non_constant { ($vector: expr, $VectorType: ty, $filter: ident) => {{ use std::sync::Arc; - use arrow::compute; use snafu::ResultExt; let arrow_array = $vector.as_arrow(); - let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) + let filtered = arrow::compute::filter::filter(arrow_array, $filter.as_boolean_array()) .context(crate::error::ArrowComputeSnafu)?; Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) }}; @@ -32,16 +33,9 @@ pub(crate) use filter_non_constant; mod tests { use std::sync::Arc; - use common_time::{Date, DateTime}; - use crate::scalars::ScalarVector; - use crate::timestamp::{ - TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, - }; - use crate::types::WrapperType; - use crate::vectors::constant::ConstantVector; use crate::vectors::{ - BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, + BooleanVector, ConstantVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, }; fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { @@ -111,6 +105,7 @@ mod tests { ($VectorType: 
ident, $ValueType: ident, $method: ident) => {{ use std::sync::Arc; + use common_time::$ValueType; use $crate::vectors::{$VectorType, VectorRef}; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -128,18 +123,6 @@ mod tests { fn test_filter_date_like() { impl_filter_date_like_test!(DateVector, Date, new); impl_filter_date_like_test!(DateTimeVector, DateTime, new); - - impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); - impl_filter_date_like_test!( - TimestampMillisecondVector, - TimestampMillisecond, - from_native - ); - impl_filter_date_like_test!( - TimestampMicrosecondVector, - TimestampMicrosecond, - from_native - ); - impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); + impl_filter_date_like_test!(TimestampVector, Timestamp, from_millis); } } diff --git a/src/datatypes2/src/vectors/operations/find_unique.rs b/src/datatypes2/src/vectors/operations/find_unique.rs index 7116a9e90d..d63a3c66b9 100644 --- a/src/datatypes2/src/vectors/operations/find_unique.rs +++ b/src/datatypes2/src/vectors/operations/find_unique.rs @@ -15,8 +15,7 @@ use common_base::BitVec; use crate::scalars::ScalarVector; -use crate::vectors::constant::ConstantVector; -use crate::vectors::{NullVector, Vector}; +use crate::vectors::{ConstantVector, NullVector, Vector}; // To implement `find_unique()` correctly, we need to keep in mind that always marks an element as // selected when it is different from the previous one, and leaves the `selected` unchanged @@ -71,7 +70,7 @@ pub(crate) fn find_unique_null( return; } - let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); + let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true); if is_first_not_duplicate { selected.set(0, true); } @@ -105,11 +104,8 @@ pub(crate) fn find_unique_constant( mod tests { use std::sync::Arc; - use common_time::{Date, DateTime}; - use super::*; - use crate::timestamp::*; - 
use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; + use crate::vectors::{Int32Vector, StringVector, VectorOp}; fn check_bitmap(expect: &[bool], selected: &BitVec) { let actual = selected.iter().collect::>(); @@ -125,7 +121,7 @@ mod tests { input: impl Iterator>, prev: Option<&[i32]>, ) { - let input = Int32Vector::from(input.collect::>()); + let input = Int32Vector::from_iter(input); let prev = prev.map(Int32Vector::from_slice); let mut selected = BitVec::repeat(false, input.len()); @@ -345,6 +341,7 @@ mod tests { macro_rules! impl_find_unique_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ + use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); @@ -359,9 +356,6 @@ mod tests { fn test_find_unique_date_like() { impl_find_unique_date_like_test!(DateVector, Date, new); impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); - impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); - impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); - impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); - impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); + impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis); } } diff --git a/src/datatypes2/src/vectors/operations/replicate.rs b/src/datatypes2/src/vectors/operations/replicate.rs index 8216517fc6..7fb93134ed 100644 --- a/src/datatypes2/src/vectors/operations/replicate.rs +++ b/src/datatypes2/src/vectors/operations/replicate.rs @@ -13,8 +13,12 @@ // limitations under the License. 
use crate::prelude::*; +pub(crate) use crate::vectors::constant::replicate_constant; +pub(crate) use crate::vectors::date::replicate_date; +pub(crate) use crate::vectors::datetime::replicate_datetime; pub(crate) use crate::vectors::null::replicate_null; pub(crate) use crate::vectors::primitive::replicate_primitive; +pub(crate) use crate::vectors::timestamp::replicate_timestamp; pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { assert_eq!(offsets.len(), c.len()); @@ -39,13 +43,8 @@ pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> Vec mod tests { use std::sync::Arc; - use common_time::timestamp::TimeUnit; - use common_time::{Date, DateTime, Timestamp}; - use paste::paste; - use super::*; - use crate::vectors::constant::ConstantVector; - use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; + use crate::vectors::{ConstantVector, Int32Vector, NullVector, StringVector, VectorOp}; #[test] fn test_replicate_primitive() { @@ -121,6 +120,7 @@ mod tests { macro_rules! impl_replicate_date_like_test { ($VectorType: ident, $ValueType: ident, $method: ident) => {{ + use common_time::$ValueType; use $crate::vectors::$VectorType; let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); @@ -138,33 +138,10 @@ mod tests { }}; } - macro_rules! 
impl_replicate_timestamp_test { - ($unit: ident) => {{ - paste!{ - use $crate::vectors::[]; - use $crate::timestamp::[]; - let v = []::from_iterator((0..5).map([]::from)); - let offsets = [0, 1, 2, 3, 4]; - let v = v.replicate(&offsets); - assert_eq!(4, v.len()); - for i in 0..4 { - assert_eq!( - Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)), - v.get(i) - ); - } - } - }}; - } - #[test] fn test_replicate_date_like() { impl_replicate_date_like_test!(DateVector, Date, new); impl_replicate_date_like_test!(DateTimeVector, DateTime, new); - - impl_replicate_timestamp_test!(Second); - impl_replicate_timestamp_test!(Millisecond); - impl_replicate_timestamp_test!(Microsecond); - impl_replicate_timestamp_test!(Nanosecond); + impl_replicate_date_like_test!(TimestampVector, Timestamp, from_millis); } } diff --git a/src/datatypes2/src/vectors/primitive.rs b/src/datatypes2/src/vectors/primitive.rs index 7829c31731..c49295630c 100644 --- a/src/datatypes2/src/vectors/primitive.rs +++ b/src/datatypes2/src/vectors/primitive.rs @@ -13,111 +13,75 @@ // limitations under the License. 
use std::any::Any; -use std::fmt; +use std::iter::FromIterator; +use std::slice::Iter; use std::sync::Arc; -use arrow::array::{ - Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder, -}; +use arrow::array::{Array, ArrayRef, MutableArray, MutablePrimitiveArray, PrimitiveArray}; +use arrow::bitmap::utils::ZipValidity; use serde_json::Value as JsonValue; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; -use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{ConversionSnafu, Result, SerializeSnafu}; use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; -use crate::types::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, -}; +use crate::types::{Primitive, PrimitiveElement}; use crate::value::{Value, ValueRef}; use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; -pub type UInt8Vector = PrimitiveVector; -pub type UInt16Vector = PrimitiveVector; -pub type UInt32Vector = PrimitiveVector; -pub type UInt64Vector = PrimitiveVector; - -pub type Int8Vector = PrimitiveVector; -pub type Int16Vector = PrimitiveVector; -pub type Int32Vector = PrimitiveVector; -pub type Int64Vector = PrimitiveVector; - -pub type Float32Vector = PrimitiveVector; -pub type Float64Vector = PrimitiveVector; - /// Vector for primitive data types. 
-pub struct PrimitiveVector { - array: PrimitiveArray, +#[derive(Debug, Clone, PartialEq)] +pub struct PrimitiveVector { + pub(crate) array: PrimitiveArray, } -impl PrimitiveVector { - pub fn new(array: PrimitiveArray) -> Self { +impl PrimitiveVector { + pub fn new(array: PrimitiveArray) -> Self { Self { array } } pub fn try_from_arrow_array(array: impl AsRef) -> Result { - let data = array - .as_ref() - .as_any() - .downcast_ref::>() - .with_context(|| error::ConversionSnafu { - from: format!("{:?}", array.as_ref().data_type()), - })? - .data() - .clone(); - let concrete_array = PrimitiveArray::::from(data); - Ok(Self::new(concrete_array)) + Ok(Self::new( + array + .as_ref() + .as_any() + .downcast_ref::>() + .with_context(|| ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })? + .clone(), + )) } - pub fn from_slice>(slice: P) -> Self { - let iter = slice.as_ref().iter().copied(); + pub fn from_slice>(slice: P) -> Self { Self { - array: PrimitiveArray::from_iter_values(iter), + array: PrimitiveArray::from_slice(slice), } } - pub fn from_wrapper_slice>(slice: P) -> Self { - let iter = slice.as_ref().iter().copied().map(WrapperType::into_native); + pub fn from_vec(array: Vec) -> Self { Self { - array: PrimitiveArray::from_iter_values(iter), + array: PrimitiveArray::from_vec(array), } } - pub fn from_vec(array: Vec) -> Self { + pub fn from_values>(iter: I) -> Self { Self { - array: PrimitiveArray::from_iter_values(array), + array: PrimitiveArray::from_values(iter), } } - pub fn from_values>(iter: I) -> Self { - Self { - array: PrimitiveArray::from_iter_values(iter), - } - } - - pub(crate) fn as_arrow(&self) -> &PrimitiveArray { + pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> Self { - Self { - array: PrimitiveArray::from(data), - } - } - - // To distinguish with `Vector::slice()`. 
- fn get_slice(&self, offset: usize, length: usize) -> Self { - let data = self.array.data().slice(offset, length); - Self::from_array_data(data) + fn slice(&self, offset: usize, length: usize) -> Self { + Self::from(self.array.slice(offset, length)) } } -impl Vector for PrimitiveVector { +impl Vector for PrimitiveVector { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -135,13 +99,11 @@ impl Vector for PrimitiveVector { } fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(PrimitiveArray::::from(data)) + Arc::new(self.array.clone()) } fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(PrimitiveArray::::from(data)) + Box::new(self.array.clone()) } fn validity(&self) -> Validity { @@ -149,11 +111,7 @@ impl Vector for PrimitiveVector { } fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() + self.array.values().len() * std::mem::size_of::() } fn is_null(&self, row: usize) -> bool { @@ -161,80 +119,57 @@ impl Vector for PrimitiveVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) + Arc::new(self.slice(offset, length)) } fn get(&self, index: usize) -> Value { - if self.array.is_valid(index) { - // Safety: The index have been checked by `is_valid()`. - let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; - wrapper.into() - } else { - Value::Null - } + vectors::impl_get_for_vector!(self.array, index) } fn get_ref(&self, index: usize) -> ValueRef { if self.array.is_valid(index) { // Safety: The index have been checked by `is_valid()`. 
- let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; - wrapper.into() + unsafe { self.array.value_unchecked(index).into_value_ref() } } else { ValueRef::Null } } } -impl fmt::Debug for PrimitiveVector { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("PrimitiveVector") - .field("array", &self.array) - .finish() - } -} - -impl From> for PrimitiveVector { - fn from(array: PrimitiveArray) -> Self { +impl From> for PrimitiveVector { + fn from(array: PrimitiveArray) -> Self { Self { array } } } -impl From>> for PrimitiveVector { - fn from(v: Vec>) -> Self { +impl From>> for PrimitiveVector { + fn from(v: Vec>) -> Self { Self { - array: PrimitiveArray::from_iter(v), + array: PrimitiveArray::::from(v), } } } -pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> { - iter: ArrayIter<&'a PrimitiveArray>, -} - -impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> { - type Item = Option; - - fn next(&mut self) -> Option> { - self.iter - .next() - .map(|item| item.map(T::Wrapper::from_native)) - } - - fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() +impl>> FromIterator for PrimitiveVector { + fn from_iter>(iter: I) -> Self { + Self { + array: MutablePrimitiveArray::::from_iter(iter).into(), + } } } -impl ScalarVector for PrimitiveVector { - type OwnedItem = T::Wrapper; - type RefItem<'a> = T::Wrapper; +impl ScalarVector for PrimitiveVector +where + T: PrimitiveElement, +{ + type OwnedItem = T; + type RefItem<'a> = T; type Iter<'a> = PrimitiveIter<'a, T>; type Builder = PrimitiveVectorBuilder; fn get_data(&self, idx: usize) -> Option> { if self.array.is_valid(idx) { - Some(T::Wrapper::from_native(self.array.value(idx))) + Some(self.array.value(idx)) } else { None } @@ -247,47 +182,59 @@ impl ScalarVector for PrimitiveVector { } } -impl Serializable for PrimitiveVector { +pub type UInt8Vector = PrimitiveVector; +pub type UInt16Vector = PrimitiveVector; +pub type UInt32Vector = 
PrimitiveVector; +pub type UInt64Vector = PrimitiveVector; + +pub type Int8Vector = PrimitiveVector; +pub type Int16Vector = PrimitiveVector; +pub type Int32Vector = PrimitiveVector; +pub type Int64Vector = PrimitiveVector; + +pub type Float32Vector = PrimitiveVector; +pub type Float64Vector = PrimitiveVector; + +pub struct PrimitiveIter<'a, T> { + iter: ZipValidity<'a, &'a T, Iter<'a, T>>, +} + +impl<'a, T: Copy> Iterator for PrimitiveIter<'a, T> { + type Item = Option; + + fn next(&mut self) -> Option> { + self.iter.next().map(|v| v.copied()) + } +} + +impl Serializable for PrimitiveVector { fn serialize_to_json(&self) -> Result> { - let res = self - .iter_data() - .map(|v| match v { - None => serde_json::Value::Null, - // use WrapperType's Into bound instead of - // serde_json::to_value to facilitate customized serialization - // for WrapperType - Some(v) => v.into(), - }) - .collect::>(); - Ok(res) + self.array + .iter() + .map(serde_json::to_value) + .collect::>() + .context(SerializeSnafu) } } -impl PartialEq for PrimitiveVector { - fn eq(&self, other: &PrimitiveVector) -> bool { - self.array == other.array - } +pub struct PrimitiveVectorBuilder { + pub(crate) mutable_array: MutablePrimitiveArray, } -pub type UInt8VectorBuilder = PrimitiveVectorBuilder; -pub type UInt16VectorBuilder = PrimitiveVectorBuilder; -pub type UInt32VectorBuilder = PrimitiveVectorBuilder; -pub type UInt64VectorBuilder = PrimitiveVectorBuilder; +pub type UInt8VectorBuilder = PrimitiveVectorBuilder; +pub type UInt16VectorBuilder = PrimitiveVectorBuilder; +pub type UInt32VectorBuilder = PrimitiveVectorBuilder; +pub type UInt64VectorBuilder = PrimitiveVectorBuilder; -pub type Int8VectorBuilder = PrimitiveVectorBuilder; -pub type Int16VectorBuilder = PrimitiveVectorBuilder; -pub type Int32VectorBuilder = PrimitiveVectorBuilder; -pub type Int64VectorBuilder = PrimitiveVectorBuilder; +pub type Int8VectorBuilder = PrimitiveVectorBuilder; +pub type Int16VectorBuilder = PrimitiveVectorBuilder; 
+pub type Int32VectorBuilder = PrimitiveVectorBuilder; +pub type Int64VectorBuilder = PrimitiveVectorBuilder; -pub type Float32VectorBuilder = PrimitiveVectorBuilder; -pub type Float64VectorBuilder = PrimitiveVectorBuilder; +pub type Float32VectorBuilder = PrimitiveVectorBuilder; +pub type Float64VectorBuilder = PrimitiveVectorBuilder; -/// Builder to build a primitive vector. -pub struct PrimitiveVectorBuilder { - mutable_array: PrimitiveBuilder, -} - -impl MutableVector for PrimitiveVectorBuilder { +impl MutableVector for PrimitiveVectorBuilder { fn data_type(&self) -> ConcreteDataType { T::build_data_type() } @@ -310,62 +257,81 @@ impl MutableVector for PrimitiveVectorBuilder { fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { let primitive = T::cast_value_ref(value)?; - match primitive { - Some(v) => self.mutable_array.append_value(v.into_native()), - None => self.mutable_array.append_null(), - } + self.mutable_array.push(primitive); Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { let primitive = T::cast_vector(vector)?; // Slice the underlying array to avoid creating a new Arc. 
- let slice = primitive.get_slice(offset, length); - for v in slice.iter_data() { - self.push(v); - } + let slice = primitive.slice(offset, length); + self.mutable_array.extend_trusted_len(slice.iter()); Ok(()) } } impl ScalarVectorBuilder for PrimitiveVectorBuilder where - T: LogicalPrimitiveType, - T::Wrapper: Scalar>, - for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>, - for<'a> T::Wrapper: Scalar = T::Wrapper>, + T: Scalar> + PrimitiveElement, + for<'a> T: ScalarRef<'a, ScalarType = T, VectorType = PrimitiveVector>, + for<'a> T: Scalar = T>, { type VectorType = PrimitiveVector; fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: PrimitiveBuilder::with_capacity(capacity), + mutable_array: MutablePrimitiveArray::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - self.mutable_array - .append_option(value.map(|v| v.into_native())); + self.mutable_array.push(value); } fn finish(&mut self) -> Self::VectorType { PrimitiveVector { - array: self.mutable_array.finish(), + array: std::mem::take(&mut self.mutable_array).into(), } } } -pub(crate) fn replicate_primitive( +impl PrimitiveVectorBuilder { + fn with_type_capacity(data_type: ConcreteDataType, capacity: usize) -> Self { + Self { + mutable_array: MutablePrimitiveArray::with_capacity_from( + capacity, + data_type.as_arrow_type(), + ), + } + } +} + +pub(crate) fn replicate_primitive( vector: &PrimitiveVector, offsets: &[usize], +) -> VectorRef { + Arc::new(replicate_primitive_with_type( + vector, + offsets, + T::build_data_type(), + )) +} + +pub(crate) fn replicate_primitive_with_type( + vector: &PrimitiveVector, + offsets: &[usize], + data_type: ConcreteDataType, ) -> PrimitiveVector { assert_eq!(offsets.len(), vector.len()); if offsets.is_empty() { - return vector.get_slice(0, 0); + return vector.slice(0, 0); } - let mut builder = PrimitiveVectorBuilder::::with_capacity(*offsets.last().unwrap() as usize); + let mut builder = 
PrimitiveVectorBuilder::::with_type_capacity( + data_type, + *offsets.last().unwrap() as usize, + ); let mut previous_offset = 0; @@ -373,15 +339,14 @@ pub(crate) fn replicate_primitive( let repeat_times = *offset - previous_offset; match value { Some(data) => { - unsafe { - // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen. - builder - .mutable_array - .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times)); - } + builder.mutable_array.extend_trusted_len( + std::iter::repeat(*data) + .take(repeat_times) + .map(Option::Some), + ); } None => { - builder.mutable_array.append_nulls(repeat_times); + builder.mutable_array.extend_constant(repeat_times, None); } } previous_offset = *offset; @@ -391,7 +356,6 @@ pub(crate) fn replicate_primitive( #[cfg(test)] mod tests { - use arrow::array::Int32Array; use arrow::datatypes::DataType as ArrowDataType; use serde_json; @@ -400,11 +364,11 @@ mod tests { use crate::serialize::Serializable; use crate::types::Int64Type; - fn check_vec(v: Int32Vector) { + fn check_vec(v: PrimitiveVector) { assert_eq!(4, v.len()); assert_eq!("Int32Vector", v.vector_type_name()); assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); + assert_eq!(Validity::AllValid, v.validity()); assert!(!v.only_null()); for i in 0..4 { @@ -423,26 +387,26 @@ mod tests { #[test] fn test_from_values() { - let v = Int32Vector::from_values(vec![1, 2, 3, 4]); + let v = PrimitiveVector::::from_values(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_from_vec() { - let v = Int32Vector::from_vec(vec![1, 2, 3, 4]); + let v = PrimitiveVector::::from_vec(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_from_slice() { - let v = Int32Vector::from_slice(vec![1, 2, 3, 4]); + let v = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); check_vec(v); } #[test] fn test_serialize_primitive_vector_with_null_to_json() { let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = 
Int32VectorBuilder::with_capacity(input.len()); + let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } @@ -457,15 +421,15 @@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = Int32Array::from(vec![1, 2, 3, 4]); - let v = Int32Vector::from(arrow_array); + let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]); + let v = PrimitiveVector::from(arrow_array); check_vec(v); } #[test] fn test_primitive_vector_build_get() { let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; - let mut builder = Int32VectorBuilder::with_capacity(input.len()); + let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } @@ -484,28 +448,29 @@ mod tests { #[test] fn test_primitive_vector_validity() { let input = [Some(1i32), Some(2i32), None, None]; - let mut builder = Int32VectorBuilder::with_capacity(input.len()); + let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); for v in input { builder.push(v); } let vector = builder.finish(); assert_eq!(2, vector.null_count()); let validity = vector.validity(); - assert_eq!(2, validity.null_count()); - assert!(!validity.is_set(2)); - assert!(!validity.is_set(3)); + let slots = validity.slots().unwrap(); + assert_eq!(2, slots.null_count()); + assert!(!slots.get_bit(2)); + assert!(!slots.get_bit(3)); - let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]); + let vector = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); assert_eq!(0, vector.null_count()); - assert!(vector.validity().is_all_valid()); + assert_eq!(Validity::AllValid, vector.validity()); } #[test] fn test_memory_size() { - let v = Int32Vector::from_slice((0..5).collect::>()); - assert_eq!(64, v.memory_size()); - let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); - assert_eq!(128, v.memory_size()); + let v = PrimitiveVector::::from_slice((0..5).collect::>()); + assert_eq!(20, v.memory_size()); + let v = 
PrimitiveVector::::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); + assert_eq!(40, v.memory_size()); } #[test] @@ -524,29 +489,4 @@ mod tests { let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9])); assert_eq!(expect, vector); } - - #[test] - fn test_from_wrapper_slice() { - macro_rules! test_from_wrapper_slice { - ($vec: ident, $ty: ident) => { - let from_wrapper_slice = $vec::from_wrapper_slice(&[ - $ty::from_native($ty::MAX), - $ty::from_native($ty::MIN), - ]); - let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]); - assert_eq!(from_wrapper_slice, from_slice); - }; - } - - test_from_wrapper_slice!(UInt8Vector, u8); - test_from_wrapper_slice!(Int8Vector, i8); - test_from_wrapper_slice!(UInt16Vector, u16); - test_from_wrapper_slice!(Int16Vector, i16); - test_from_wrapper_slice!(UInt32Vector, u32); - test_from_wrapper_slice!(Int32Vector, i32); - test_from_wrapper_slice!(UInt64Vector, u64); - test_from_wrapper_slice!(Int64Vector, i64); - test_from_wrapper_slice!(Float32Vector, f32); - test_from_wrapper_slice!(Float64Vector, f64); - } } diff --git a/src/datatypes2/src/vectors/string.rs b/src/datatypes2/src/vectors/string.rs index 252116b3b2..638b04dd3e 100644 --- a/src/datatypes2/src/vectors/string.rs +++ b/src/datatypes2/src/vectors/string.rs @@ -15,19 +15,22 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; -use snafu::ResultExt; +use arrow::array::{Array, ArrayRef, MutableArray, Utf8ValuesIter}; +use arrow::bitmap::utils::ZipValidity; +use serde_json::Value as JsonValue; +use snafu::{OptionExt, ResultExt}; use crate::arrow_array::{MutableStringArray, StringArray}; use crate::data_type::ConcreteDataType; -use crate::error::{self, Result}; +use crate::error::{Result, SerializeSnafu}; use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; +use crate::types::StringType; use crate::value::{Value, ValueRef}; use 
crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; -/// Vector of strings. -#[derive(Debug, PartialEq)] +/// String array wrapper +#[derive(Debug, Clone, PartialEq)] pub struct StringVector { array: StringArray, } @@ -36,16 +39,6 @@ impl StringVector { pub(crate) fn as_arrow(&self) -> &dyn Array { &self.array } - - fn to_array_data(&self) -> ArrayData { - self.array.data().clone() - } - - fn from_array_data(data: ArrayData) -> Self { - Self { - array: StringArray::from(data), - } - } } impl From for StringVector { @@ -57,31 +50,7 @@ impl From for StringVector { impl From>> for StringVector { fn from(data: Vec>) -> Self { Self { - array: StringArray::from_iter(data), - } - } -} - -impl From>> for StringVector { - fn from(data: Vec>) -> Self { - Self { - array: StringArray::from_iter(data), - } - } -} - -impl From<&[Option]> for StringVector { - fn from(data: &[Option]) -> Self { - Self { - array: StringArray::from_iter(data), - } - } -} - -impl From<&[Option<&str>]> for StringVector { - fn from(data: &[Option<&str>]) -> Self { - Self { - array: StringArray::from_iter(data), + array: StringArray::from(data), } } } @@ -89,7 +58,19 @@ impl From<&[Option<&str>]> for StringVector { impl From> for StringVector { fn from(data: Vec) -> Self { Self { - array: StringArray::from_iter(data.into_iter().map(Some)), + array: StringArray::from( + data.into_iter() + .map(Option::Some) + .collect::>>(), + ), + } + } +} + +impl From>> for StringVector { + fn from(data: Vec>) -> Self { + Self { + array: StringArray::from(data), } } } @@ -97,14 +78,18 @@ impl From> for StringVector { impl From> for StringVector { fn from(data: Vec<&str>) -> Self { Self { - array: StringArray::from_iter(data.into_iter().map(Some)), + array: StringArray::from( + data.into_iter() + .map(Option::Some) + .collect::>>(), + ), } } } impl Vector for StringVector { fn data_type(&self) -> ConcreteDataType { - ConcreteDataType::string_datatype() + ConcreteDataType::String(StringType::default()) } 
fn vector_type_name(&self) -> String { @@ -120,13 +105,11 @@ impl Vector for StringVector { } fn to_arrow_array(&self) -> ArrayRef { - let data = self.to_array_data(); - Arc::new(StringArray::from(data)) + Arc::new(self.array.clone()) } fn to_boxed_arrow_array(&self) -> Box { - let data = self.to_array_data(); - Box::new(StringArray::from(data)) + Box::new(self.array.clone()) } fn validity(&self) -> Validity { @@ -134,11 +117,7 @@ impl Vector for StringVector { } fn memory_size(&self) -> usize { - self.array.get_buffer_memory_size() - } - - fn null_count(&self) -> usize { - self.array.null_count() + self.len() * std::mem::size_of::() + self.array.values().len() } fn is_null(&self, row: usize) -> bool { @@ -146,8 +125,7 @@ impl Vector for StringVector { } fn slice(&self, offset: usize, length: usize) -> VectorRef { - let data = self.array.data().slice(offset, length); - Arc::new(Self::from_array_data(data)) + Arc::new(Self::from(self.array.slice(offset, length))) } fn get(&self, index: usize) -> Value { @@ -162,7 +140,7 @@ impl Vector for StringVector { impl ScalarVector for StringVector { type OwnedItem = String; type RefItem<'a> = &'a str; - type Iter<'a> = ArrayIter<&'a StringArray>; + type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>; type Builder = StringVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -179,7 +157,7 @@ impl ScalarVector for StringVector { } pub struct StringVectorBuilder { - mutable_array: MutableStringArray, + buffer: MutableStringArray, } impl MutableVector for StringVectorBuilder { @@ -188,7 +166,7 @@ impl MutableVector for StringVectorBuilder { } fn len(&self) -> usize { - self.mutable_array.len() + self.buffer.len() } fn as_any(&self) -> &dyn Any { @@ -204,15 +182,12 @@ impl MutableVector for StringVectorBuilder { } fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { - match value.as_string()? 
{ - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } + self.buffer.push(value.as_string()?); Ok(()) } fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { - vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length) + vectors::impl_extend_for_builder!(self.buffer, vector, StringVector, offset, length) } } @@ -221,30 +196,30 @@ impl ScalarVectorBuilder for StringVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableStringArray::with_capacity(capacity, 0), + buffer: MutableStringArray::with_capacity(capacity), } } fn push(&mut self, value: Option<::RefItem<'_>>) { - match value { - Some(v) => self.mutable_array.append_value(v), - None => self.mutable_array.append_null(), - } + self.buffer.push(value) } fn finish(&mut self) -> Self::VectorType { - StringVector { - array: self.mutable_array.finish(), + Self::VectorType { + array: std::mem::take(&mut self.buffer).into(), } } } impl Serializable for StringVector { - fn serialize_to_json(&self) -> Result> { + fn serialize_to_json(&self) -> crate::error::Result> { self.iter_data() - .map(serde_json::to_value) + .map(|v| match v { + None => Ok(serde_json::Value::Null), + Some(s) => serde_json::to_value(s), + }) .collect::>() - .context(error::SerializeSnafu) + .context(SerializeSnafu) } } @@ -252,9 +227,60 @@ vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector); #[cfg(test)] mod tests { - use arrow::datatypes::DataType; + use arrow::datatypes::DataType as ArrowDataType; + use serde_json; use super::*; + use crate::data_type::DataType; + + #[test] + fn test_string_vector_misc() { + let strs = vec!["hello", "greptime", "rust"]; + let v = StringVector::from(strs.clone()); + assert_eq!(3, v.len()); + assert_eq!("StringVector", v.vector_type_name()); + assert!(!v.is_const()); + assert_eq!(Validity::AllValid, v.validity()); + assert!(!v.only_null()); + 
assert_eq!(41, v.memory_size()); + + for (i, s) in strs.iter().enumerate() { + assert_eq!(Value::from(*s), v.get(i)); + assert_eq!(ValueRef::from(*s), v.get_ref(i)); + assert_eq!(Value::from(*s), v.try_get(i).unwrap()); + } + + let arrow_arr = v.to_arrow_array(); + assert_eq!(3, arrow_arr.len()); + assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type()); + } + + #[test] + fn test_serialize_string_vector() { + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let string_vector = builder.finish(); + let serialized = + serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["hello",null,"world"]"#, serialized); + } + + #[test] + fn test_from_arrow_array() { + let mut builder = MutableStringArray::new(); + builder.push(Some("A")); + builder.push(Some("B")); + builder.push::<&str>(None); + builder.push(Some("D")); + let string_array: StringArray = builder.into(); + let vector = StringVector::from(string_array); + assert_eq!( + r#"["A","B",null,"D"]"#, + serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), + ); + } #[test] fn test_string_vector_build_get() { @@ -284,7 +310,7 @@ mod tests { #[test] fn test_string_vector_builder() { - let mut builder = StringVectorBuilder::with_capacity(3); + let mut builder = StringType::default().create_mutable_vector(3); builder.push_value_ref(ValueRef::String("hello")).unwrap(); assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); @@ -298,73 +324,4 @@ mod tests { let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"])); assert_eq!(expect, vector); } - - #[test] - fn test_string_vector_misc() { - let strs = vec!["hello", "greptime", "rust"]; - let v = StringVector::from(strs.clone()); - assert_eq!(3, v.len()); - assert_eq!("StringVector", v.vector_type_name()); - assert!(!v.is_const()); - assert!(v.validity().is_all_valid()); - assert!(!v.only_null()); 
- assert_eq!(128, v.memory_size()); - - for (i, s) in strs.iter().enumerate() { - assert_eq!(Value::from(*s), v.get(i)); - assert_eq!(ValueRef::from(*s), v.get_ref(i)); - assert_eq!(Value::from(*s), v.try_get(i).unwrap()); - } - - let arrow_arr = v.to_arrow_array(); - assert_eq!(3, arrow_arr.len()); - assert_eq!(&DataType::Utf8, arrow_arr.data_type()); - } - - #[test] - fn test_serialize_string_vector() { - let mut builder = StringVectorBuilder::with_capacity(3); - builder.push(Some("hello")); - builder.push(None); - builder.push(Some("world")); - let string_vector = builder.finish(); - let serialized = - serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["hello",null,"world"]"#, serialized); - } - - #[test] - fn test_from_arrow_array() { - let mut builder = MutableStringArray::new(); - builder.append_option(Some("A")); - builder.append_option(Some("B")); - builder.append_null(); - builder.append_option(Some("D")); - let string_array: StringArray = builder.finish(); - let vector = StringVector::from(string_array); - assert_eq!( - r#"["A","B",null,"D"]"#, - serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), - ); - } - - #[test] - fn test_from_non_option_string() { - let nul = String::from_utf8(vec![0]).unwrap(); - let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()]; - let vector = StringVector::from(corpus); - let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized); - - let corpus = vec![ - "🀀🀀🀀".to_string(), - "🀁🀁🀁".to_string(), - "🀂🀂🀂".to_string(), - "🀃🀃🀃".to_string(), - "🀆🀆".to_string(), - ]; - let vector = StringVector::from(corpus); - let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); - assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized); - } } diff --git a/src/datatypes2/src/vectors/timestamp.rs b/src/datatypes2/src/vectors/timestamp.rs index 5d9f7f2ed1..62b8332c89 
100644 --- a/src/datatypes2/src/vectors/timestamp.rs +++ b/src/datatypes2/src/vectors/timestamp.rs @@ -12,20 +12,308 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::types::{ - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, PrimitiveArray}; +use common_time::timestamp::{TimeUnit, Timestamp}; +use snafu::OptionExt; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error; +use crate::error::Result; +use crate::prelude::{ + MutableVector, ScalarVector, ScalarVectorBuilder, Validity, Value, ValueRef, Vector, VectorRef, }; -use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; +use crate::serialize::Serializable; +use crate::types::TimestampType; +use crate::vectors::{PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder}; -pub type TimestampSecondVector = PrimitiveVector; -pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder; +/// `TimestampVector` stores timestamp in millisecond since UNIX Epoch. +#[derive(Debug, Clone, PartialEq)] +pub struct TimestampVector { + array: PrimitiveVector, +} -pub type TimestampMillisecondVector = PrimitiveVector; -pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder; +impl TimestampVector { + pub fn new(array: PrimitiveArray) -> Self { + Self { + array: PrimitiveVector { array }, + } + } -pub type TimestampMicrosecondVector = PrimitiveVector; -pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder; + pub fn try_from_arrow_array(array: impl AsRef) -> Result { + Ok(Self::new( + array + .as_ref() + .as_any() + .downcast_ref::>() + .with_context(|| error::ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })? 
+ .clone(), + )) + } -pub type TimestampNanosecondVector = PrimitiveVector; -pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder; + pub fn from_values>(iter: I) -> Self { + Self { + array: PrimitiveVector { + array: PrimitiveArray::from_values(iter), + }, + } + } + + pub(crate) fn as_arrow(&self) -> &dyn Array { + self.array.as_arrow() + } +} + +impl Vector for TimestampVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::timestamp_millis_datatype() + } + + fn vector_type_name(&self) -> String { + "TimestampVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let validity = self.array.array.validity().cloned(); + let buffer = self.array.array.values().clone(); + Arc::new(PrimitiveArray::new( + TimestampType::new(TimeUnit::Millisecond).as_arrow_type(), + buffer, + validity, + )) + } + + fn to_boxed_arrow_array(&self) -> Box { + let validity = self.array.array.validity().cloned(); + let values = self.array.array.values().clone(); + Box::new(PrimitiveArray::new( + arrow::datatypes::DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + values, + validity, + )) + } + + fn validity(&self) -> Validity { + self.array.validity() + } + + fn memory_size(&self) -> usize { + self.array.memory_size() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: usize, length: usize) -> VectorRef { + Arc::new(Self { + array: PrimitiveVector { + array: self.array.array.slice(offset, length), + }, + }) + } + + fn get(&self, index: usize) -> Value { + match self.array.get(index) { + Value::Null => Value::Null, + Value::Int64(v) => Value::Timestamp(Timestamp::from_millis(v)), + _ => { + unreachable!() + } + } + } + + fn get_ref(&self, index: usize) -> ValueRef { + match self.array.get(index) { + Value::Int64(v) => ValueRef::Timestamp(Timestamp::from_millis(v)), + Value::Null => 
ValueRef::Null, + _ => unreachable!(), + } + } +} + +impl Serializable for TimestampVector { + fn serialize_to_json(&self) -> Result> { + Ok(self + .array + .iter_data() + .map(|v| match v { + None => serde_json::Value::Null, + Some(v) => v.into(), + }) + .collect::>()) + } +} + +impl ScalarVector for TimestampVector { + type OwnedItem = Timestamp; + type RefItem<'a> = Timestamp; + type Iter<'a> = TimestampDataIter<'a>; + type Builder = TimestampVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + self.array.get_data(idx).map(Timestamp::from_millis) + } + + fn iter_data(&self) -> Self::Iter<'_> { + TimestampDataIter { + iter: self.array.iter_data(), + } + } +} + +pub struct TimestampDataIter<'a> { + iter: PrimitiveIter<'a, i64>, +} + +impl<'a> Iterator for TimestampDataIter<'a> { + type Item = Option; + + fn next(&mut self) -> Option { + self.iter.next().map(|v| v.map(Timestamp::from_millis)) + } +} + +pub struct TimestampVectorBuilder { + buffer: PrimitiveVectorBuilder, +} + +impl MutableVector for TimestampVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::timestamp_millis_datatype() + } + + fn len(&self) -> usize { + self.buffer.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + // TODO(hl): vector and vector builder should also support customized time unit. + self.buffer.push( + value + .as_timestamp()? 
+ .map(|t| t.convert_to(TimeUnit::Millisecond)), + ); + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + let concrete_vector = vector + .as_any() + .downcast_ref::() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to convert vector from {} to DateVector", + vector.vector_type_name() + ), + })?; + + self.buffer + .extend_slice_of(&concrete_vector.array, offset, length)?; + Ok(()) + } +} + +impl ScalarVectorBuilder for TimestampVectorBuilder { + type VectorType = TimestampVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + buffer: PrimitiveVectorBuilder::with_capacity(capacity), + } + } + + /// Pushes a Timestamp value into vector builder. The timestamp must be with time unit + /// `Second`/`MilliSecond`/`Microsecond`. + fn push(&mut self, value: Option<::RefItem<'_>>) { + self.buffer + .push(value.map(|v| v.convert_to(TimeUnit::Millisecond))); + } + + fn finish(&mut self) -> Self::VectorType { + Self::VectorType { + array: self.buffer.finish(), + } + } +} + +pub(crate) fn replicate_timestamp(vector: &TimestampVector, offsets: &[usize]) -> VectorRef { + let array = crate::vectors::primitive::replicate_primitive_with_type( + &vector.array, + offsets, + vector.data_type(), + ); + Arc::new(TimestampVector { array }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + pub fn test_build_timestamp_vector() { + let mut builder = TimestampVectorBuilder::with_capacity(3); + builder.push(Some(Timestamp::new(1, TimeUnit::Second))); + builder.push(None); + builder.push(Some(Timestamp::new(2, TimeUnit::Millisecond))); + + let vector = builder.finish(); + assert_eq!( + ConcreteDataType::timestamp_millis_datatype(), + vector.data_type() + ); + assert_eq!(3, vector.len()); + assert_eq!( + Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)), + vector.get(0) + ); + + assert_eq!(Value::Null, vector.get(1)); + assert_eq!( + Value::Timestamp(Timestamp::new(2, 
TimeUnit::Millisecond)), + vector.get(2) + ); + + assert_eq!( + vec![ + Some(Timestamp::new(1000, TimeUnit::Millisecond)), + None, + Some(Timestamp::new(2, TimeUnit::Millisecond)), + ], + vector.iter_data().collect::>() + ); + } + + #[test] + fn test_timestamp_from_arrow() { + let vector = + TimestampVector::from_slice(&[Timestamp::from_millis(1), Timestamp::from_millis(2)]); + let arrow = vector.as_arrow().slice(0, vector.len()); + let vector2 = TimestampVector::try_from_arrow_array(&arrow).unwrap(); + assert_eq!(vector, vector2); + } +}