mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-01-07 22:02:56 +00:00
feat: memory size of vector (#53)
* feat: improve try_into_vector function * feat: impl memory_size function for vectors * fix: forgot memory_size assertion in null vector test * feat: use LargeUtf8 instead of Utf8 for string, and rename LargeBinaryArray to BinaryArray * feat: memory_size only calculates heap size
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
use arrow::array::{BinaryArray, MutableBinaryArray, MutableUtf8Array, Utf8Array};
|
||||
use arrow::array::{
|
||||
BinaryArray as ArrowBinaryArray, MutableBinaryArray as ArrowMutableBinaryArray,
|
||||
MutableUtf8Array, Utf8Array,
|
||||
};
|
||||
|
||||
pub type LargeBinaryArray = BinaryArray<i64>;
|
||||
pub type MutableLargeBinaryArray = MutableBinaryArray<i64>;
|
||||
pub type MutableStringArray = MutableUtf8Array<i32>;
|
||||
pub type StringArray = Utf8Array<i32>;
|
||||
pub type BinaryArray = ArrowBinaryArray<i64>;
|
||||
pub type MutableBinaryArray = ArrowMutableBinaryArray<i64>;
|
||||
pub type MutableStringArray = MutableUtf8Array<i64>;
|
||||
pub type StringArray = Utf8Array<i64>;
|
||||
|
||||
@@ -29,6 +29,6 @@ impl DataType for StringType {
|
||||
}
|
||||
|
||||
fn as_arrow_type(&self) -> ArrowDataType {
|
||||
ArrowDataType::Utf8
|
||||
ArrowDataType::LargeUtf8
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,6 +75,9 @@ pub trait Vector: Send + Sync + Serializable {
|
||||
/// Returns the validity of the Array.
|
||||
fn validity(&self) -> Validity;
|
||||
|
||||
/// Returns the memory size of this vector in bytes.
///
/// Only heap-allocated data is counted; each implementation decides which
/// of its buffers contribute to the total.
|
||||
fn memory_size(&self) -> usize;
|
||||
|
||||
/// The number of null slots on this [`Vector`].
|
||||
/// # Implementation
|
||||
/// This is `O(1)`.
|
||||
@@ -132,14 +135,15 @@ macro_rules! impl_try_from_arrow_array_for_vector {
|
||||
($Array: ident, $Vector: ident) => {
|
||||
impl $Vector {
|
||||
pub fn try_from_arrow_array(
|
||||
array: arrow::array::ArrayRef,
|
||||
array: impl AsRef<dyn arrow::array::Array>,
|
||||
) -> crate::error::Result<$Vector> {
|
||||
Ok($Vector::from(
|
||||
array
|
||||
.as_ref()
|
||||
.as_any()
|
||||
.downcast_ref::<$Array>()
|
||||
.with_context(|| crate::error::ConversionSnafu {
|
||||
from: std::format!("{:?}", array.data_type()),
|
||||
from: std::format!("{:?}", array.as_ref().data_type()),
|
||||
})?
|
||||
.clone(),
|
||||
))
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use std::any::Any;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{Array, ArrayRef, BinaryArray};
|
||||
use arrow::array::{Array, ArrayRef};
|
||||
use arrow::array::{BinaryValueIter, MutableArray};
|
||||
use arrow::bitmap::utils::ZipValidity;
|
||||
use snafu::OptionExt;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::arrow_array::{LargeBinaryArray, MutableLargeBinaryArray};
|
||||
use crate::arrow_array::{BinaryArray, MutableBinaryArray};
|
||||
use crate::data_type::ConcreteDataType;
|
||||
use crate::error::Result;
|
||||
use crate::error::SerializeSnafu;
|
||||
@@ -19,11 +19,11 @@ use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
|
||||
/// Vector of binary strings.
|
||||
#[derive(Debug)]
|
||||
pub struct BinaryVector {
|
||||
array: LargeBinaryArray,
|
||||
array: BinaryArray,
|
||||
}
|
||||
|
||||
impl From<BinaryArray<i64>> for BinaryVector {
|
||||
fn from(array: BinaryArray<i64>) -> Self {
|
||||
impl From<BinaryArray> for BinaryVector {
|
||||
fn from(array: BinaryArray) -> Self {
|
||||
Self { array }
|
||||
}
|
||||
}
|
||||
@@ -31,7 +31,7 @@ impl From<BinaryArray<i64>> for BinaryVector {
|
||||
impl From<Vec<Option<Vec<u8>>>> for BinaryVector {
|
||||
fn from(data: Vec<Option<Vec<u8>>>) -> Self {
|
||||
Self {
|
||||
array: LargeBinaryArray::from(data),
|
||||
array: BinaryArray::from(data),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -61,6 +61,10 @@ impl Vector for BinaryVector {
|
||||
vectors::impl_validity_for_vector!(self.array)
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
self.array.values().len() + self.array.offsets().len() * std::mem::size_of::<i64>()
|
||||
}
|
||||
|
||||
fn is_null(&self, row: usize) -> bool {
|
||||
self.array.is_null(row)
|
||||
}
|
||||
@@ -98,7 +102,7 @@ impl ScalarVector for BinaryVector {
|
||||
}
|
||||
|
||||
pub struct BinaryVectorBuilder {
|
||||
mutable_array: MutableLargeBinaryArray,
|
||||
mutable_array: MutableBinaryArray,
|
||||
}
|
||||
|
||||
impl MutableVector for BinaryVectorBuilder {
|
||||
@@ -128,7 +132,7 @@ impl ScalarVectorBuilder for BinaryVectorBuilder {
|
||||
|
||||
fn with_capacity(capacity: usize) -> Self {
|
||||
Self {
|
||||
mutable_array: MutableLargeBinaryArray::with_capacity(capacity),
|
||||
mutable_array: MutableBinaryArray::with_capacity(capacity),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,7 +159,7 @@ impl Serializable for BinaryVector {
|
||||
}
|
||||
}
|
||||
|
||||
vectors::impl_try_from_arrow_array_for_vector!(LargeBinaryArray, BinaryVector);
|
||||
vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector);
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -164,21 +168,19 @@ mod tests {
|
||||
use serde_json;
|
||||
|
||||
use super::*;
|
||||
use crate::arrow_array::LargeBinaryArray;
|
||||
use crate::arrow_array::BinaryArray;
|
||||
use crate::serialize::Serializable;
|
||||
|
||||
#[test]
|
||||
fn test_binary_vector_misc() {
|
||||
let v = BinaryVector::from(LargeBinaryArray::from_slice(&vec![
|
||||
vec![1, 2, 3],
|
||||
vec![1, 2, 3],
|
||||
]));
|
||||
let v = BinaryVector::from(BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]));
|
||||
|
||||
assert_eq!(2, v.len());
|
||||
assert_eq!("BinaryVector", v.vector_type_name());
|
||||
assert!(!v.is_const());
|
||||
assert_eq!(Validity::AllValid, v.validity());
|
||||
assert!(!v.only_null());
|
||||
assert_eq!(30, v.memory_size());
|
||||
|
||||
for i in 0..2 {
|
||||
assert!(!v.is_null(i));
|
||||
@@ -192,10 +194,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_serialize_binary_vector_to_json() {
|
||||
let vector = BinaryVector::from(LargeBinaryArray::from_slice(&vec![
|
||||
vec![1, 2, 3],
|
||||
vec![1, 2, 3],
|
||||
]));
|
||||
let vector =
|
||||
BinaryVector::from(BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]));
|
||||
|
||||
let json_value = vector.serialize_to_json().unwrap();
|
||||
assert_eq!(
|
||||
@@ -221,7 +221,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_from_arrow_array() {
|
||||
let arrow_array = LargeBinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]);
|
||||
let arrow_array = BinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]);
|
||||
let original = arrow_array.clone();
|
||||
let vector = BinaryVector::from(arrow_array);
|
||||
assert_eq!(original, vector.array);
|
||||
|
||||
@@ -76,6 +76,10 @@ impl Vector for BooleanVector {
|
||||
vectors::impl_validity_for_vector!(self.array)
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
self.array.values().as_slice().0.len()
|
||||
}
|
||||
|
||||
fn is_null(&self, row: usize) -> bool {
|
||||
self.array.is_null(row)
|
||||
}
|
||||
@@ -179,13 +183,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_boolean_vector_misc() {
|
||||
let bools = vec![true, false, true, true, false, false];
|
||||
let bools = vec![true, false, true, true, false, false, true, true, false];
|
||||
let v = BooleanVector::from(bools.clone());
|
||||
assert_eq!(6, v.len());
|
||||
assert_eq!(9, v.len());
|
||||
assert_eq!("BooleanVector", v.vector_type_name());
|
||||
assert!(!v.is_const());
|
||||
assert_eq!(Validity::AllValid, v.validity());
|
||||
assert!(!v.only_null());
|
||||
assert_eq!(2, v.memory_size());
|
||||
|
||||
for (i, b) in bools.iter().enumerate() {
|
||||
assert!(!v.is_null(i));
|
||||
@@ -193,7 +198,7 @@ mod tests {
|
||||
}
|
||||
|
||||
let arrow_arr = v.to_arrow_array();
|
||||
assert_eq!(6, arrow_arr.len());
|
||||
assert_eq!(9, arrow_arr.len());
|
||||
assert_eq!(&ArrowDataType::Boolean, arrow_arr.data_type());
|
||||
}
|
||||
|
||||
|
||||
@@ -66,6 +66,10 @@ impl Vector for ConstantVector {
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
self.vector.memory_size()
|
||||
}
|
||||
|
||||
fn is_null(&self, _row: usize) -> bool {
|
||||
self.vector.is_null(0)
|
||||
}
|
||||
@@ -133,6 +137,7 @@ mod tests {
|
||||
assert_eq!(10, c.len());
|
||||
assert_eq!(Validity::AllValid, c.validity());
|
||||
assert!(!c.only_null());
|
||||
assert_eq!(4, c.memory_size());
|
||||
|
||||
for i in 0..10 {
|
||||
assert!(!c.is_null(i));
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use std::any::Any;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::Array;
|
||||
use arrow::datatypes::DataType as ArrowDataType;
|
||||
use datafusion_common::ScalarValue;
|
||||
use snafu::OptionExt;
|
||||
@@ -151,8 +152,8 @@ impl Helper {
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the given arrow data type is not supported.
|
||||
pub fn try_into_vector(array: ArrayRef) -> Result<VectorRef> {
|
||||
Ok(match array.data_type() {
|
||||
pub fn try_into_vector(array: impl AsRef<dyn Array>) -> Result<VectorRef> {
|
||||
Ok(match array.as_ref().data_type() {
|
||||
ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?),
|
||||
ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?),
|
||||
ArrowDataType::Binary | ArrowDataType::LargeBinary => {
|
||||
@@ -171,7 +172,7 @@ impl Helper {
|
||||
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => {
|
||||
Arc::new(StringVector::try_from_arrow_array(array)?)
|
||||
}
|
||||
_ => unimplemented!("Arrow array datatype: {:?}", array.data_type()),
|
||||
_ => unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,6 +58,10 @@ impl Vector for NullVector {
|
||||
Validity::AllNull
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn is_null(&self, _row: usize) -> bool {
|
||||
true
|
||||
}
|
||||
@@ -114,6 +118,7 @@ mod tests {
|
||||
let v = NullVector::new(32);
|
||||
|
||||
assert_eq!(v.len(), 32);
|
||||
assert_eq!(0, v.memory_size());
|
||||
let arrow_arr = v.to_arrow_array();
|
||||
assert_eq!(arrow_arr.null_count(), 32);
|
||||
|
||||
|
||||
@@ -28,13 +28,15 @@ impl<T: Primitive> PrimitiveVector<T> {
|
||||
pub fn new(array: PrimitiveArray<T>) -> Self {
|
||||
Self { array }
|
||||
}
|
||||
pub fn try_from_arrow_array(array: ArrayRef) -> Result<Self> {
|
||||
|
||||
pub fn try_from_arrow_array(array: impl AsRef<dyn Array>) -> Result<Self> {
|
||||
Ok(Self::new(
|
||||
array
|
||||
.as_ref()
|
||||
.as_any()
|
||||
.downcast_ref::<PrimitiveArray<T>>()
|
||||
.with_context(|| ConversionSnafu {
|
||||
from: format!("{:?}", array.data_type()),
|
||||
from: format!("{:?}", array.as_ref().data_type()),
|
||||
})?
|
||||
.clone(),
|
||||
))
|
||||
@@ -84,6 +86,10 @@ impl<T: Primitive + DataTypeBuilder> Vector for PrimitiveVector<T> {
|
||||
vectors::impl_validity_for_vector!(self.array)
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
self.array.values().len() * std::mem::size_of::<T>()
|
||||
}
|
||||
|
||||
fn is_null(&self, row: usize) -> bool {
|
||||
self.array.is_null(row)
|
||||
}
|
||||
@@ -283,6 +289,7 @@ impl<T: Primitive + DataTypeBuilder> Serializable for PrimitiveVector<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use arrow::datatypes::DataType as ArrowDataType;
|
||||
use serde_json;
|
||||
|
||||
@@ -402,4 +409,12 @@ mod tests {
|
||||
assert_eq!(Value::Int32(i as i32 + 1), v.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn test_memory_size() {
    // Five `i32` values: 5 * 4 bytes.
    let ints = PrimitiveVector::<i32>::from_slice((0..5).collect::<Vec<i32>>());
    assert_eq!(20, ints.memory_size());

    // Five `i64` slots, two of them null: null slots are still counted, 5 * 8 bytes.
    let nullable =
        PrimitiveVector::<i64>::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]);
    assert_eq!(40, nullable.memory_size());
}
|
||||
}
|
||||
|
||||
@@ -93,6 +93,10 @@ impl Vector for StringVector {
|
||||
vectors::impl_validity_for_vector!(self.array)
|
||||
}
|
||||
|
||||
fn memory_size(&self) -> usize {
|
||||
self.len() * std::mem::size_of::<i64>() + self.array.values().len()
|
||||
}
|
||||
|
||||
fn is_null(&self, row: usize) -> bool {
|
||||
self.array.is_null(row)
|
||||
}
|
||||
@@ -113,7 +117,7 @@ impl Vector for StringVector {
|
||||
impl ScalarVector for StringVector {
|
||||
type OwnedItem = String;
|
||||
type RefItem<'a> = &'a str;
|
||||
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>;
|
||||
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i64>>;
|
||||
type Builder = StringVectorBuilder;
|
||||
|
||||
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
|
||||
@@ -205,6 +209,7 @@ mod tests {
|
||||
assert!(!v.is_const());
|
||||
assert_eq!(Validity::AllValid, v.validity());
|
||||
assert!(!v.only_null());
|
||||
assert_eq!(41, v.memory_size());
|
||||
|
||||
for (i, s) in strs.iter().enumerate() {
|
||||
assert_eq!(Value::from(*s), v.get(i));
|
||||
@@ -213,7 +218,7 @@ mod tests {
|
||||
|
||||
let arrow_arr = v.to_arrow_array();
|
||||
assert_eq!(3, arrow_arr.len());
|
||||
assert_eq!(&ArrowDataType::Utf8, arrow_arr.data_type());
|
||||
assert_eq!(&ArrowDataType::LargeUtf8, arrow_arr.data_type());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user