feat: memory size of vector (#53)

* feat: improve try_into_vector function

* feat: impl memory_size function for vectors

* fix: forgot memory_size assertion in null vector test

* feat: use LargeUtf8 instead of Utf8 for string, and rename LargeBinaryArray to BinaryArray

* feat: memory_size only calculates heap size
This commit is contained in:
dennis zhuang
2022-06-28 11:06:53 +08:00
committed by GitHub
parent 379d2e2f50
commit b567cfb9bc
10 changed files with 80 additions and 37 deletions

View File

@@ -1,6 +1,9 @@
use arrow::array::{BinaryArray, MutableBinaryArray, MutableUtf8Array, Utf8Array};
use arrow::array::{
BinaryArray as ArrowBinaryArray, MutableBinaryArray as ArrowMutableBinaryArray,
MutableUtf8Array, Utf8Array,
};
pub type LargeBinaryArray = BinaryArray<i64>;
pub type MutableLargeBinaryArray = MutableBinaryArray<i64>;
pub type MutableStringArray = MutableUtf8Array<i32>;
pub type StringArray = Utf8Array<i32>;
pub type BinaryArray = ArrowBinaryArray<i64>;
pub type MutableBinaryArray = ArrowMutableBinaryArray<i64>;
pub type MutableStringArray = MutableUtf8Array<i64>;
pub type StringArray = Utf8Array<i64>;

View File

@@ -29,6 +29,6 @@ impl DataType for StringType {
}
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Utf8
ArrowDataType::LargeUtf8
}
}

View File

@@ -75,6 +75,9 @@ pub trait Vector: Send + Sync + Serializable {
/// Returns the validity of the Array.
fn validity(&self) -> Validity;
/// Returns the memory size of this vector, in bytes.
///
/// Only heap-allocated data is meant to be counted; exactly which buffers are
/// included is decided by each implementation.
fn memory_size(&self) -> usize;
/// The number of null slots on this [`Vector`].
/// # Implementation
/// This is `O(1)`.
@@ -132,14 +135,15 @@ macro_rules! impl_try_from_arrow_array_for_vector {
($Array: ident, $Vector: ident) => {
impl $Vector {
pub fn try_from_arrow_array(
array: arrow::array::ArrayRef,
array: impl AsRef<dyn arrow::array::Array>,
) -> crate::error::Result<$Vector> {
Ok($Vector::from(
array
.as_ref()
.as_any()
.downcast_ref::<$Array>()
.with_context(|| crate::error::ConversionSnafu {
from: std::format!("{:?}", array.data_type()),
from: std::format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))

View File

@@ -1,13 +1,13 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, BinaryArray};
use arrow::array::{Array, ArrayRef};
use arrow::array::{BinaryValueIter, MutableArray};
use arrow::bitmap::utils::ZipValidity;
use snafu::OptionExt;
use snafu::ResultExt;
use crate::arrow_array::{LargeBinaryArray, MutableLargeBinaryArray};
use crate::arrow_array::{BinaryArray, MutableBinaryArray};
use crate::data_type::ConcreteDataType;
use crate::error::Result;
use crate::error::SerializeSnafu;
@@ -19,11 +19,11 @@ use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
/// Vector of binary strings.
#[derive(Debug)]
pub struct BinaryVector {
array: LargeBinaryArray,
array: BinaryArray,
}
impl From<BinaryArray<i64>> for BinaryVector {
fn from(array: BinaryArray<i64>) -> Self {
impl From<BinaryArray> for BinaryVector {
fn from(array: BinaryArray) -> Self {
Self { array }
}
}
@@ -31,7 +31,7 @@ impl From<BinaryArray<i64>> for BinaryVector {
impl From<Vec<Option<Vec<u8>>>> for BinaryVector {
fn from(data: Vec<Option<Vec<u8>>>) -> Self {
Self {
array: LargeBinaryArray::from(data),
array: BinaryArray::from(data),
}
}
}
@@ -61,6 +61,10 @@ impl Vector for BinaryVector {
vectors::impl_validity_for_vector!(self.array)
}
/// Returns the number of bytes held by the value and offset buffers.
///
/// The validity bitmap is not included in the total.
fn memory_size(&self) -> usize {
    let value_bytes = self.array.values().len();
    let offset_bytes = self.array.offsets().len() * std::mem::size_of::<i64>();
    value_bytes + offset_bytes
}
fn is_null(&self, row: usize) -> bool {
self.array.is_null(row)
}
@@ -98,7 +102,7 @@ impl ScalarVector for BinaryVector {
}
pub struct BinaryVectorBuilder {
mutable_array: MutableLargeBinaryArray,
mutable_array: MutableBinaryArray,
}
impl MutableVector for BinaryVectorBuilder {
@@ -128,7 +132,7 @@ impl ScalarVectorBuilder for BinaryVectorBuilder {
fn with_capacity(capacity: usize) -> Self {
Self {
mutable_array: MutableLargeBinaryArray::with_capacity(capacity),
mutable_array: MutableBinaryArray::with_capacity(capacity),
}
}
@@ -155,7 +159,7 @@ impl Serializable for BinaryVector {
}
}
vectors::impl_try_from_arrow_array_for_vector!(LargeBinaryArray, BinaryVector);
vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector);
#[cfg(test)]
mod tests {
@@ -164,21 +168,19 @@ mod tests {
use serde_json;
use super::*;
use crate::arrow_array::LargeBinaryArray;
use crate::arrow_array::BinaryArray;
use crate::serialize::Serializable;
#[test]
fn test_binary_vector_misc() {
let v = BinaryVector::from(LargeBinaryArray::from_slice(&vec![
vec![1, 2, 3],
vec![1, 2, 3],
]));
let v = BinaryVector::from(BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]));
assert_eq!(2, v.len());
assert_eq!("BinaryVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
assert_eq!(30, v.memory_size());
for i in 0..2 {
assert!(!v.is_null(i));
@@ -192,10 +194,8 @@ mod tests {
#[test]
fn test_serialize_binary_vector_to_json() {
let vector = BinaryVector::from(LargeBinaryArray::from_slice(&vec![
vec![1, 2, 3],
vec![1, 2, 3],
]));
let vector =
BinaryVector::from(BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]));
let json_value = vector.serialize_to_json().unwrap();
assert_eq!(
@@ -221,7 +221,7 @@ mod tests {
#[test]
fn test_from_arrow_array() {
let arrow_array = LargeBinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]);
let arrow_array = BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]);
let original = arrow_array.clone();
let vector = BinaryVector::from(arrow_array);
assert_eq!(original, vector.array);

View File

@@ -76,6 +76,10 @@ impl Vector for BooleanVector {
vectors::impl_validity_for_vector!(self.array)
}
/// Returns the length, in bytes, of the byte slice backing the values bitmap.
///
/// Values are bit-packed, so e.g. nine booleans are reported as two bytes.
fn memory_size(&self) -> usize {
    let bitmap_parts = self.array.values().as_slice();
    bitmap_parts.0.len()
}
fn is_null(&self, row: usize) -> bool {
self.array.is_null(row)
}
@@ -179,13 +183,14 @@ mod tests {
#[test]
fn test_boolean_vector_misc() {
let bools = vec![true, false, true, true, false, false];
let bools = vec![true, false, true, true, false, false, true, true, false];
let v = BooleanVector::from(bools.clone());
assert_eq!(6, v.len());
assert_eq!(9, v.len());
assert_eq!("BooleanVector", v.vector_type_name());
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
assert_eq!(2, v.memory_size());
for (i, b) in bools.iter().enumerate() {
assert!(!v.is_null(i));
@@ -193,7 +198,7 @@ mod tests {
}
let arrow_arr = v.to_arrow_array();
assert_eq!(6, arrow_arr.len());
assert_eq!(9, arrow_arr.len());
assert_eq!(&ArrowDataType::Boolean, arrow_arr.data_type());
}

View File

@@ -66,6 +66,10 @@ impl Vector for ConstantVector {
}
}
/// Returns the memory size reported by the wrapped vector.
///
/// The result is not scaled by the constant vector's own length.
fn memory_size(&self) -> usize {
    let inner = &self.vector;
    inner.memory_size()
}
fn is_null(&self, _row: usize) -> bool {
self.vector.is_null(0)
}
@@ -133,6 +137,7 @@ mod tests {
assert_eq!(10, c.len());
assert_eq!(Validity::AllValid, c.validity());
assert!(!c.only_null());
assert_eq!(4, c.memory_size());
for i in 0..10 {
assert!(!c.is_null(i));

View File

@@ -3,6 +3,7 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::Array;
use arrow::datatypes::DataType as ArrowDataType;
use datafusion_common::ScalarValue;
use snafu::OptionExt;
@@ -151,8 +152,8 @@ impl Helper {
///
/// # Panics
/// Panic if given arrow data type is not supported.
pub fn try_into_vector(array: ArrayRef) -> Result<VectorRef> {
Ok(match array.data_type() {
pub fn try_into_vector(array: impl AsRef<dyn Array>) -> Result<VectorRef> {
Ok(match array.as_ref().data_type() {
ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?),
ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?),
ArrowDataType::Binary | ArrowDataType::LargeBinary => {
@@ -171,7 +172,7 @@ impl Helper {
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => {
Arc::new(StringVector::try_from_arrow_array(array)?)
}
_ => unimplemented!("Arrow array datatype: {:?}", array.data_type()),
_ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()),
})
}
}

View File

@@ -58,6 +58,10 @@ impl Vector for NullVector {
Validity::AllNull
}
/// Always reports zero bytes, regardless of the vector's length.
fn memory_size(&self) -> usize {
0
}
fn is_null(&self, _row: usize) -> bool {
true
}
@@ -114,6 +118,7 @@ mod tests {
let v = NullVector::new(32);
assert_eq!(v.len(), 32);
assert_eq!(0, v.memory_size());
let arrow_arr = v.to_arrow_array();
assert_eq!(arrow_arr.null_count(), 32);

View File

@@ -28,13 +28,15 @@ impl<T: Primitive> PrimitiveVector<T> {
pub fn new(array: PrimitiveArray<T>) -> Self {
Self { array }
}
pub fn try_from_arrow_array(array: ArrayRef) -> Result<Self> {
pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
Ok(Self::new(
array
.as_ref()
.as_any()
.downcast_ref::<PrimitiveArray<T>>()
.with_context(|| ConversionSnafu {
from: format!("{:?}", array.data_type()),
from: format!("{:?}", array.as_ref().data_type()),
})?
.clone(),
))
@@ -84,6 +86,10 @@ impl<T: Primitive + DataTypeBuilder> Vector for PrimitiveVector<T> {
vectors::impl_validity_for_vector!(self.array)
}
/// Returns the size in bytes of the values buffer: one `T` per slot.
///
/// The validity bitmap is not included in the total.
fn memory_size(&self) -> usize {
    let slot_width = std::mem::size_of::<T>();
    let slot_count = self.array.values().len();
    slot_count * slot_width
}
fn is_null(&self, row: usize) -> bool {
self.array.is_null(row)
}
@@ -283,6 +289,7 @@ impl<T: Primitive + DataTypeBuilder> Serializable for PrimitiveVector<T> {
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use serde_json;
@@ -402,4 +409,12 @@ mod tests {
assert_eq!(Value::Int32(i as i32 + 1), v.get(i));
}
}
#[test]
fn test_memory_size() {
    // Five `i32` slots: 5 * 4 bytes.
    let int32_vector = PrimitiveVector::<i32>::from_slice((0..5).collect::<Vec<i32>>());
    assert_eq!(20, int32_vector.memory_size());

    // Null slots are counted like any other slot: 5 * 8 bytes for `i64`.
    let int64_vector =
        PrimitiveVector::<i64>::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
    assert_eq!(40, int64_vector.memory_size());
}
}

View File

@@ -93,6 +93,10 @@ impl Vector for StringVector {
vectors::impl_validity_for_vector!(self.array)
}
/// Returns the byte length of the string data plus one `i64` per element.
///
/// NOTE(review): `BinaryVector::memory_size` sizes its offsets with
/// `offsets().len()`, whereas this uses `len()` — confirm whether the
/// difference is intentional.
fn memory_size(&self) -> usize {
    let offset_bytes = self.len() * std::mem::size_of::<i64>();
    let value_bytes = self.array.values().len();
    offset_bytes + value_bytes
}
fn is_null(&self, row: usize) -> bool {
self.array.is_null(row)
}
@@ -113,7 +117,7 @@ impl Vector for StringVector {
impl ScalarVector for StringVector {
type OwnedItem = String;
type RefItem<'a> = &'a str;
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>;
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i64>>;
type Builder = StringVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
@@ -205,6 +209,7 @@ mod tests {
assert!(!v.is_const());
assert_eq!(Validity::AllValid, v.validity());
assert!(!v.only_null());
assert_eq!(41, v.memory_size());
for (i, s) in strs.iter().enumerate() {
assert_eq!(Value::from(*s), v.get(i));
@@ -213,7 +218,7 @@ mod tests {
let arrow_arr = v.to_arrow_array();
assert_eq!(3, arrow_arr.len());
assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type());
assert_eq!(&ArrowDataType::LargeUtf8, arrow_arr.data_type());
}
#[test]