diff --git a/src/datatypes/src/arrow_array.rs b/src/datatypes/src/arrow_array.rs index 456c85a994..27da7d29a5 100644 --- a/src/datatypes/src/arrow_array.rs +++ b/src/datatypes/src/arrow_array.rs @@ -1,6 +1,9 @@ -use arrow::array::{BinaryArray, MutableBinaryArray, MutableUtf8Array, Utf8Array}; +use arrow::array::{ + BinaryArray as ArrowBinaryArray, MutableBinaryArray as ArrowMutableBinaryArray, + MutableUtf8Array, Utf8Array, +}; -pub type LargeBinaryArray = BinaryArray; -pub type MutableLargeBinaryArray = MutableBinaryArray; -pub type MutableStringArray = MutableUtf8Array; -pub type StringArray = Utf8Array; +pub type BinaryArray = ArrowBinaryArray; +pub type MutableBinaryArray = ArrowMutableBinaryArray; +pub type MutableStringArray = MutableUtf8Array; +pub type StringArray = Utf8Array; diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs index 132a0f2e4a..6717b27e74 100644 --- a/src/datatypes/src/types/string_type.rs +++ b/src/datatypes/src/types/string_type.rs @@ -29,6 +29,6 @@ impl DataType for StringType { } fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Utf8 + ArrowDataType::LargeUtf8 } } diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index 2624512739..dd79dc4f0f 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -75,6 +75,9 @@ pub trait Vector: Send + Sync + Serializable { /// Returns the validity of the Array. fn validity(&self) -> Validity; + /// Returns the memory size of vector. + fn memory_size(&self) -> usize; + /// The number of null slots on this [`Vector`]. /// # Implementation /// This is `O(1)`. @@ -132,14 +135,15 @@ macro_rules! impl_try_from_arrow_array_for_vector { ($Array: ident, $Vector: ident) => { impl $Vector { pub fn try_from_arrow_array( - array: arrow::array::ArrayRef, + array: impl AsRef, ) -> crate::error::Result<$Vector> { Ok($Vector::from( array + .as_ref() .as_any() .downcast_ref::<$Array>() .with_context(|| crate::error::ConversionSnafu { - from: std::format!("{:?}", array.data_type()), + from: std::format!("{:?}", array.as_ref().data_type()), })? .clone(), )) diff --git a/src/datatypes/src/vectors/binary.rs b/src/datatypes/src/vectors/binary.rs index 074bd2140b..1a7a3aa724 100644 --- a/src/datatypes/src/vectors/binary.rs +++ b/src/datatypes/src/vectors/binary.rs @@ -1,13 +1,13 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, BinaryArray}; +use arrow::array::{Array, ArrayRef}; use arrow::array::{BinaryValueIter, MutableArray}; use arrow::bitmap::utils::ZipValidity; use snafu::OptionExt; use snafu::ResultExt; -use crate::arrow_array::{LargeBinaryArray, MutableLargeBinaryArray}; +use crate::arrow_array::{BinaryArray, MutableBinaryArray}; use crate::data_type::ConcreteDataType; use crate::error::Result; use crate::error::SerializeSnafu; @@ -19,11 +19,11 @@ use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; /// Vector of binary strings. #[derive(Debug)] pub struct BinaryVector { - array: LargeBinaryArray, + array: BinaryArray, } -impl From> for BinaryVector { - fn from(array: BinaryArray) -> Self { +impl From for BinaryVector { + fn from(array: BinaryArray) -> Self { Self { array } } } @@ -31,7 +31,7 @@ impl From> for BinaryVector { impl From>>> for BinaryVector { fn from(data: Vec>>) -> Self { Self { - array: LargeBinaryArray::from(data), + array: BinaryArray::from(data), } } } @@ -61,6 +61,10 @@ impl Vector for BinaryVector { vectors::impl_validity_for_vector!(self.array) } + fn memory_size(&self) -> usize { + self.array.values().len() + self.array.offsets().len() * std::mem::size_of::() + } + fn is_null(&self, row: usize) -> bool { self.array.is_null(row) } @@ -98,7 +102,7 @@ impl ScalarVector for BinaryVector { } pub struct BinaryVectorBuilder { - mutable_array: MutableLargeBinaryArray, + mutable_array: MutableBinaryArray, } impl MutableVector for BinaryVectorBuilder { @@ -128,7 +132,7 @@ impl ScalarVectorBuilder for BinaryVectorBuilder { fn with_capacity(capacity: usize) -> Self { Self { - mutable_array: MutableLargeBinaryArray::with_capacity(capacity), + mutable_array: MutableBinaryArray::with_capacity(capacity), } } @@ -155,7 +159,7 @@ impl Serializable for BinaryVector { } } -vectors::impl_try_from_arrow_array_for_vector!(LargeBinaryArray, BinaryVector); +vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector); #[cfg(test)] mod tests { @@ -164,21 +168,19 @@ mod tests { use serde_json; use super::*; - use crate::arrow_array::LargeBinaryArray; + use crate::arrow_array::BinaryArray; use crate::serialize::Serializable; #[test] fn test_binary_vector_misc() { - let v = BinaryVector::from(LargeBinaryArray::from_slice(&vec![ - vec![1, 2, 3], - vec![1, 2, 3], - ])); + let v = BinaryVector::from(BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]])); assert_eq!(2, v.len()); assert_eq!("BinaryVector", v.vector_type_name()); assert!(!v.is_const()); assert_eq!(Validity::AllValid, v.validity()); assert!(!v.only_null()); + assert_eq!(30, v.memory_size()); for i in 0..2 { assert!(!v.is_null(i)); @@ -192,10 +194,8 @@ mod tests { #[test] fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector::from(LargeBinaryArray::from_slice(&vec![ - vec![1, 2, 3], - vec![1, 2, 3], - ])); + let vector = + BinaryVector::from(BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]])); let json_value = vector.serialize_to_json().unwrap(); assert_eq!( @@ -221,7 +221,7 @@ mod tests { #[test] fn test_from_arrow_array() { - let arrow_array = LargeBinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]); + let arrow_array = BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]); let original = arrow_array.clone(); let vector = BinaryVector::from(arrow_array); assert_eq!(original, vector.array); diff --git a/src/datatypes/src/vectors/boolean.rs b/src/datatypes/src/vectors/boolean.rs index 6df850c333..2ba04c0be3 100644 --- a/src/datatypes/src/vectors/boolean.rs +++ b/src/datatypes/src/vectors/boolean.rs @@ -76,6 +76,10 @@ impl Vector for BooleanVector { vectors::impl_validity_for_vector!(self.array) } + fn memory_size(&self) -> usize { + self.array.values().as_slice().0.len() + } + fn is_null(&self, row: usize) -> bool { self.array.is_null(row) } @@ -179,13 +183,14 @@ mod tests { #[test] fn test_boolean_vector_misc() { - let bools = vec![true, false, true, true, false, false]; + let bools = vec![true, false, true, true, false, false, true, true, false]; let v = BooleanVector::from(bools.clone()); - assert_eq!(6, v.len()); + assert_eq!(9, v.len()); assert_eq!("BooleanVector", v.vector_type_name()); assert!(!v.is_const()); assert_eq!(Validity::AllValid, v.validity()); assert!(!v.only_null()); + assert_eq!(2, v.memory_size()); for (i, b) in bools.iter().enumerate() { assert!(!v.is_null(i)); @@ -193,7 +198,7 @@ mod tests { } let arrow_arr = v.to_arrow_array(); - assert_eq!(6, arrow_arr.len()); + assert_eq!(9, arrow_arr.len()); assert_eq!(&ArrowDataType::Boolean, arrow_arr.data_type()); } diff --git a/src/datatypes/src/vectors/constant.rs b/src/datatypes/src/vectors/constant.rs index ec58befd38..62e0dcafb8 100644 --- a/src/datatypes/src/vectors/constant.rs +++ b/src/datatypes/src/vectors/constant.rs @@ -66,6 +66,10 @@ impl Vector for ConstantVector { } } + fn memory_size(&self) -> usize { + self.vector.memory_size() + } + fn is_null(&self, _row: usize) -> bool { self.vector.is_null(0) } @@ -133,6 +137,7 @@ mod tests { assert_eq!(10, c.len()); assert_eq!(Validity::AllValid, c.validity()); assert!(!c.only_null()); + assert_eq!(4, c.memory_size()); for i in 0..10 { assert!(!c.is_null(i)); diff --git a/src/datatypes/src/vectors/helper.rs b/src/datatypes/src/vectors/helper.rs index 67ba77747a..7b3ac2abba 100644 --- a/src/datatypes/src/vectors/helper.rs +++ b/src/datatypes/src/vectors/helper.rs @@ -3,6 +3,7 @@ use std::any::Any; use std::sync::Arc; +use arrow::array::Array; use arrow::datatypes::DataType as ArrowDataType; use datafusion_common::ScalarValue; use snafu::OptionExt; @@ -151,8 +152,8 @@ impl Helper { /// /// # Panics /// Panic if given arrow data type is not supported. - pub fn try_into_vector(array: ArrayRef) -> Result { - Ok(match array.data_type() { + pub fn try_into_vector(array: impl AsRef) -> Result { + Ok(match array.as_ref().data_type() { ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), ArrowDataType::Binary | ArrowDataType::LargeBinary => { @@ -171,7 +172,7 @@ impl Helper { ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { Arc::new(StringVector::try_from_arrow_array(array)?) } - _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), + _ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()), }) } } diff --git a/src/datatypes/src/vectors/null.rs b/src/datatypes/src/vectors/null.rs index 0f3deec579..cb20bdcca0 100644 --- a/src/datatypes/src/vectors/null.rs +++ b/src/datatypes/src/vectors/null.rs @@ -58,6 +58,10 @@ impl Vector for NullVector { Validity::AllNull } + fn memory_size(&self) -> usize { + 0 + } + fn is_null(&self, _row: usize) -> bool { true } @@ -114,6 +118,7 @@ mod tests { let v = NullVector::new(32); assert_eq!(v.len(), 32); + assert_eq!(0, v.memory_size()); let arrow_arr = v.to_arrow_array(); assert_eq!(arrow_arr.null_count(), 32); diff --git a/src/datatypes/src/vectors/primitive.rs b/src/datatypes/src/vectors/primitive.rs index 5beb612781..03b027f39e 100644 --- a/src/datatypes/src/vectors/primitive.rs +++ b/src/datatypes/src/vectors/primitive.rs @@ -28,13 +28,15 @@ impl PrimitiveVector { pub fn new(array: PrimitiveArray) -> Self { Self { array } } - pub fn try_from_arrow_array(array: ArrayRef) -> Result { + + pub fn try_from_arrow_array(array: impl AsRef) -> Result { Ok(Self::new( array + .as_ref() .as_any() .downcast_ref::>() .with_context(|| ConversionSnafu { - from: format!("{:?}", array.data_type()), + from: format!("{:?}", array.as_ref().data_type()), })? .clone(), )) @@ -84,6 +86,10 @@ impl Vector for PrimitiveVector { vectors::impl_validity_for_vector!(self.array) } + fn memory_size(&self) -> usize { + self.array.values().len() * std::mem::size_of::() + } + fn is_null(&self, row: usize) -> bool { self.array.is_null(row) } @@ -283,6 +289,7 @@ impl Serializable for PrimitiveVector { #[cfg(test)] mod tests { + use arrow::datatypes::DataType as ArrowDataType; use serde_json; @@ -402,4 +409,12 @@ mod tests { assert_eq!(Value::Int32(i as i32 + 1), v.get(i)); } } + + #[test] + fn test_memory_size() { + let v = PrimitiveVector::::from_slice((0..5).collect::>()); + assert_eq!(20, v.memory_size()); + let v = PrimitiveVector::::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); + assert_eq!(40, v.memory_size()); + } } diff --git a/src/datatypes/src/vectors/string.rs b/src/datatypes/src/vectors/string.rs index 34eb175127..424e02ee7c 100644 --- a/src/datatypes/src/vectors/string.rs +++ b/src/datatypes/src/vectors/string.rs @@ -93,6 +93,10 @@ impl Vector for StringVector { vectors::impl_validity_for_vector!(self.array) } + fn memory_size(&self) -> usize { + self.len() * std::mem::size_of::() + self.array.values().len() + } + fn is_null(&self, row: usize) -> bool { self.array.is_null(row) } @@ -113,7 +117,7 @@ impl Vector for StringVector { impl ScalarVector for StringVector { type OwnedItem = String; type RefItem<'a> = &'a str; - type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>; + type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i64>>; type Builder = StringVectorBuilder; fn get_data(&self, idx: usize) -> Option> { @@ -205,6 +209,7 @@ mod tests { assert!(!v.is_const()); assert_eq!(Validity::AllValid, v.validity()); assert!(!v.only_null()); + assert_eq!(41, v.memory_size()); for (i, s) in strs.iter().enumerate() { assert_eq!(Value::from(*s), v.get(i)); @@ -213,7 +218,7 @@ mod tests { let arrow_arr = v.to_arrow_array(); assert_eq!(3, arrow_arr.len()); - assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type()); + assert_eq!(&ArrowDataType::LargeUtf8, arrow_arr.data_type()); } #[test]