From 23f235524dbb58795d63e0cdee8cab895567597f Mon Sep 17 00:00:00 2001 From: evenyag Date: Wed, 1 Jun 2022 20:55:58 +0800 Subject: [PATCH] feat: Implements validity() and null_count() for Vector (#38) * feat: Add validity() to Vector * test(datatypes): Add more tests and fix get_data() not returns None for null --- src/common/recordbatch/Cargo.toml | 2 +- src/datatypes/Cargo.toml | 2 +- src/datatypes/src/prelude.rs | 2 +- src/datatypes/src/scalars.rs | 3 + src/datatypes/src/vectors.rs | 54 +++++++++--- src/datatypes/src/vectors/binary.rs | 95 ++++++++++++++++---- src/datatypes/src/vectors/boolean.rs | 115 ++++++++++++++----------- src/datatypes/src/vectors/null.rs | 48 ++++++----- src/datatypes/src/vectors/primitive.rs | 100 +++++++++++++++------ src/datatypes/src/vectors/string.rs | 61 ++++++++----- 10 files changed, 329 insertions(+), 153 deletions(-) diff --git a/src/common/recordbatch/Cargo.toml b/src/common/recordbatch/Cargo.toml index 1fe8f3bb87..d59ccf425b 100644 --- a/src/common/recordbatch/Cargo.toml +++ b/src/common/recordbatch/Cargo.toml @@ -24,6 +24,6 @@ version="0.10" features = ["io_csv", "io_json", "io_parquet", "io_parquet_compression", "io_ipc", "ahash", "compute", "serde_types"] [dev-dependencies] -serde_json = "1.0.81" +serde_json = "1.0" tokio = { version = "1.18", features = ["full"] } diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index 7a335dbb89..2ad7b8eb1a 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -14,5 +14,5 @@ common-error = { path = "../common/error" } enum_dispatch = "0.3" paste = "1.0" serde = { version = "1.0.136", features = ["derive"] } -serde_json = "1.0.79" +serde_json = "1.0" snafu = { version = "0.7", features = ["backtraces"] } diff --git a/src/datatypes/src/prelude.rs b/src/datatypes/src/prelude.rs index 811cf82ec8..6d1a2d5bfb 100644 --- a/src/datatypes/src/prelude.rs +++ b/src/datatypes/src/prelude.rs @@ -2,4 +2,4 @@ pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; pub use crate::scalars::{ScalarVector, ScalarVectorBuilder}; pub use crate::type_id::LogicalTypeId; pub use crate::value::Value; -pub use crate::vectors::{Vector, VectorRef}; +pub use crate::vectors::{Validity, Vector, VectorRef}; diff --git a/src/datatypes/src/scalars.rs b/src/datatypes/src/scalars.rs index 96cc69fc09..fb9910921b 100644 --- a/src/datatypes/src/scalars.rs +++ b/src/datatypes/src/scalars.rs @@ -20,6 +20,9 @@ pub trait ScalarVector: Vector { /// Returns the reference to an element at given position. /// /// Note: `get()` has bad performance, avoid call this function inside loop. + /// + /// # Panics + /// Panics if `idx >= self.len()`. fn get_data(&self, idx: usize) -> Option>; /// Returns iterator of current vector. diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index 9d17d0d1d7..969d96b8f4 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -8,6 +8,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::ArrayRef; +use arrow::bitmap::Bitmap; use arrow::datatypes::DataType as ArrowDataType; pub use binary::*; pub use boolean::*; @@ -24,6 +25,25 @@ pub use crate::vectors::{ UInt8Vector, }; +#[derive(Debug, PartialEq)] +pub enum Validity<'a> { + /// Whether the array slot is valid or not (null). + Slots(&'a Bitmap), + /// All slots are valid. + AllValid, + /// All slots are null. + AllNull, +} + +impl<'a> Validity<'a> { + pub fn slots(&self) -> Option<&Bitmap> { + match self { + Validity::Slots(bitmap) => Some(bitmap), + _ => None, + } + } +} + /// Vector of data values. pub trait Vector: Send + Sync + Serializable { /// Returns the data type of the vector. @@ -45,6 +65,20 @@ pub trait Vector: Send + Sync + Serializable { /// Convert this vector to a new arrow [ArrayRef]. fn to_arrow_array(&self) -> ArrayRef; + + /// Returns the validity of the Array. + fn validity(&self) -> Validity; + + /// The number of null slots on this [`Vector`]. + /// # Implementation + /// This is `O(1)`. + fn null_count(&self) -> usize { + match self.validity() { + Validity::Slots(bitmap) => bitmap.null_count(), + Validity::AllValid => 0, + Validity::AllNull => self.len(), + } + } } pub type VectorRef = Arc; @@ -101,16 +135,16 @@ macro_rules! impl_try_from_arrow_array_for_vector { pub(crate) use impl_try_from_arrow_array_for_vector; #[cfg(test)] -mod tests { +pub mod tests { use arrow::array::{Array, PrimitiveArray}; - use serde::Serialize; + use serde_json; use super::*; use crate::data_type::DataType; use crate::types::DataTypeBuilder; #[test] - pub fn test_df_columns_to_vector() { + fn test_df_columns_to_vector() { let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1, 2, 3])); let vector = try_into_vector(df_column).unwrap(); assert_eq!( @@ -120,28 +154,22 @@ mod tests { } #[test] - pub fn test_serialize_i32_vector() { + fn test_serialize_i32_vector() { let df_column: Arc = Arc::new(PrimitiveArray::::from_slice(vec![1, 2, 3])); let json_value = try_into_vector(df_column) .unwrap() .serialize_to_json() .unwrap(); - let mut output = vec![]; - let mut serializer = serde_json::ser::Serializer::new(&mut output); - json_value.serialize(&mut serializer).unwrap(); - assert_eq!(b"[1,2,3]", output.as_slice()); + assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); } #[test] - pub fn test_serialize_i8_vector() { + fn test_serialize_i8_vector() { let df_column: Arc = Arc::new(PrimitiveArray::from_slice(vec![1u8, 2u8, 3u8])); let json_value = try_into_vector(df_column) .unwrap() .serialize_to_json() .unwrap(); - let mut output = vec![]; - let mut serializer = serde_json::ser::Serializer::new(&mut output); - json_value.serialize(&mut serializer).unwrap(); - assert_eq!(b"[1,2,3]", output.as_slice()); + assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); } } diff --git a/src/datatypes/src/vectors/binary.rs b/src/datatypes/src/vectors/binary.rs index aea5d61e8a..4cb0b09357 100644 --- a/src/datatypes/src/vectors/binary.rs +++ b/src/datatypes/src/vectors/binary.rs @@ -2,7 +2,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::BinaryValueIter; -use arrow::array::{ArrayRef, BinaryArray}; +use arrow::array::{Array, ArrayRef, BinaryArray}; use arrow::bitmap::utils::ZipValidity; use snafu::OptionExt; use snafu::ResultExt; @@ -15,7 +15,7 @@ use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; use crate::types::BinaryType; use crate::vectors::impl_try_from_arrow_array_for_vector; -use crate::vectors::Vector; +use crate::vectors::{Validity, Vector}; /// Vector of binary strings. #[derive(Debug)] @@ -45,6 +45,13 @@ impl Vector for BinaryVector { fn to_arrow_array(&self) -> ArrayRef { Arc::new(self.array.clone()) } + + fn validity(&self) -> Validity { + match self.array.validity() { + Some(bitmap) => Validity::Slots(bitmap), + None => Validity::AllValid, + } + } } impl ScalarVector for BinaryVector { @@ -53,7 +60,7 @@ impl ScalarVector for BinaryVector { type Builder = BinaryVectorBuilder; fn get_data(&self, idx: usize) -> Option> { - if idx < self.len() { + if self.array.is_valid(idx) { Some(self.array.value(idx)) } else { None @@ -91,8 +98,7 @@ impl ScalarVectorBuilder for BinaryVectorBuilder { impl Serializable for BinaryVector { fn serialize_to_json(&self) -> Result> { - self.array - .iter() + self.iter_data() .map(|v| match v { None => Ok(serde_json::Value::Null), // if binary vector not present, map to NULL Some(vec) => serde_json::to_value(vec), @@ -106,30 +112,87 @@ impl_try_from_arrow_array_for_vector!(LargeBinaryArray, BinaryVector); #[cfg(test)] mod tests { - use serde::*; + use serde_json; - use super::BinaryVector; + use super::*; use crate::arrow_array::LargeBinaryArray; use crate::serialize::Serializable; #[test] - pub fn test_serialize_binary_vector_to_json() { - let vector = BinaryVector { - array: LargeBinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]), - }; + fn test_serialize_binary_vector_to_json() { + let vector = BinaryVector::from(LargeBinaryArray::from_slice(&vec![ + vec![1, 2, 3], + vec![1, 2, 3], + ])); let json_value = vector.serialize_to_json().unwrap(); - let mut output = vec![]; - let mut serializer = serde_json::ser::Serializer::new(&mut output); - json_value.serialize(&mut serializer).unwrap(); - assert_eq!("[[1,2,3],[1,2,3]]", String::from_utf8_lossy(&output)); + assert_eq!( + "[[1,2,3],[1,2,3]]", + serde_json::to_string(&json_value).unwrap() + ); } #[test] - pub fn test_from_arrow_array() { + fn test_serialize_binary_vector_with_null_to_json() { + let mut builder = BinaryVectorBuilder::with_capacity(4); + builder.push(Some(&[1, 2, 3])); + builder.push(None); + builder.push(Some(&[4, 5, 6])); + let vector = builder.finish(); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[[1,2,3],null,[4,5,6]]", + serde_json::to_string(&json_value).unwrap() + ); + } + + #[test] + fn test_from_arrow_array() { let arrow_array = LargeBinaryArray::from_slice(&vec![vec![1, 2, 3], vec![1, 2, 3]]); let original = arrow_array.clone(); let vector = BinaryVector::from(arrow_array); assert_eq!(original, vector.array); } + + #[test] + fn test_binary_vector_build_get() { + let mut builder = BinaryVectorBuilder::with_capacity(4); + builder.push(Some(b"hello")); + builder.push(Some(b"happy")); + builder.push(Some(b"world")); + builder.push(None); + + let vector = builder.finish(); + assert_eq!(b"hello", vector.get_data(0).unwrap()); + assert_eq!(None, vector.get_data(3)); + + let mut iter = vector.iter_data(); + assert_eq!(b"hello", iter.next().unwrap().unwrap()); + assert_eq!(b"happy", iter.next().unwrap().unwrap()); + assert_eq!(b"world", iter.next().unwrap().unwrap()); + assert_eq!(None, iter.next().unwrap()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_binary_vector_validity() { + let mut builder = BinaryVectorBuilder::with_capacity(4); + builder.push(Some(b"hello")); + builder.push(Some(b"world")); + let vector = builder.finish(); + assert_eq!(0, vector.null_count()); + assert_eq!(Validity::AllValid, vector.validity()); + + let mut builder = BinaryVectorBuilder::with_capacity(3); + builder.push(Some(b"hello")); + builder.push(None); + builder.push(Some(b"world")); + let vector = builder.finish(); + assert_eq!(1, vector.null_count()); + let validity = vector.validity(); + let slots = validity.slots().unwrap(); + assert_eq!(1, slots.null_count()); + assert!(!slots.get_bit(1)); + } } diff --git a/src/datatypes/src/vectors/boolean.rs b/src/datatypes/src/vectors/boolean.rs index d6ffcb6498..0f383087a8 100644 --- a/src/datatypes/src/vectors/boolean.rs +++ b/src/datatypes/src/vectors/boolean.rs @@ -2,7 +2,7 @@ use std::any::Any; use std::borrow::Borrow; use std::sync::Arc; -use arrow::array::{ArrayRef, BooleanArray, MutableBooleanArray}; +use arrow::array::{Array, ArrayRef, BooleanArray, MutableBooleanArray}; use arrow::bitmap::utils::{BitmapIter, ZipValidity}; use snafu::OptionExt; use snafu::ResultExt; @@ -13,7 +13,7 @@ use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; use crate::types::BooleanType; use crate::vectors::impl_try_from_arrow_array_for_vector; -use crate::vectors::Vector; +use crate::vectors::{Validity, Vector}; /// Vector of boolean. #[derive(Debug)] @@ -67,6 +67,13 @@ impl Vector for BooleanVector { fn to_arrow_array(&self) -> ArrayRef { Arc::new(self.array.clone()) } + + fn validity(&self) -> Validity { + match self.array.validity() { + Some(bitmap) => Validity::Slots(bitmap), + None => Validity::AllValid, + } + } } impl ScalarVector for BooleanVector { @@ -75,7 +82,7 @@ impl ScalarVector for BooleanVector { type Builder = BooleanVectorBuilder; fn get_data(&self, idx: usize) -> Option> { - if idx < self.len() { + if self.array.is_valid(idx) { Some(self.array.value(idx)) } else { None @@ -124,88 +131,92 @@ impl_try_from_arrow_array_for_vector!(BooleanArray, BooleanVector); #[cfg(test)] mod tests { - use serde::*; + use serde_json; use super::*; use crate::serialize::Serializable; #[test] - pub fn test_serialize_boolean_vector_to_json() { - let vector = BooleanVector { - array: BooleanArray::from_slice(&vec![true, false, true, true, false, false]), - }; + fn test_serialize_boolean_vector_to_json() { + let vector = BooleanVector::from(vec![true, false, true, true, false, false]); let json_value = vector.serialize_to_json().unwrap(); - let mut output = vec![]; - let mut serializer = serde_json::ser::Serializer::new(&mut output); - json_value.serialize(&mut serializer).unwrap(); assert_eq!( "[true,false,true,true,false,false]", - String::from_utf8_lossy(&output) + serde_json::to_string(&json_value).unwrap(), + ); + } + + #[test] + fn test_serialize_boolean_vector_with_null_to_json() { + let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[true,null,false]", + serde_json::to_string(&json_value).unwrap(), ); } #[test] fn test_boolean_vector_from_vec() { - let vec = BooleanVector::from(vec![false, true, false, true]); + let input = vec![false, true, false, true]; + let vec = BooleanVector::from(input.clone()); assert_eq!(4, vec.len()); - for i in 0..4 { - assert_eq!( - i == 1 || i == 3, - vec.get_data(i).unwrap(), - "failed at {}", - i - ) + for (i, v) in input.into_iter().enumerate() { + assert_eq!(Some(v), vec.get_data(i), "failed at {}", i) } } #[test] fn test_boolean_vector_from_iter() { - let v = vec![Some(false), Some(true), Some(false), Some(true)]; - let vec = v.into_iter().collect::(); + let input = vec![Some(false), Some(true), Some(false), Some(true)]; + let vec = input.iter().collect::(); assert_eq!(4, vec.len()); - for i in 0..3 { - assert_eq!( - i == 1 || i == 3, - vec.get_data(i).unwrap(), - "failed at {}", - i - ) + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vec.get_data(i), "failed at {}", i) } } #[test] fn test_boolean_vector_from_vec_option() { - let vec = BooleanVector::from(vec![Some(false), Some(true), None, Some(true)]); + let input = vec![Some(false), Some(true), None, Some(true)]; + let vec = BooleanVector::from(input.clone()); assert_eq!(4, vec.len()); - for i in 0..4 { - assert_eq!( - i == 1 || i == 3, - vec.get_data(i).unwrap(), - "failed at {}", - i - ) + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vec.get_data(i), "failed at {}", i) } } #[test] - fn test_boolean_vector_builder() { - let mut builder = BooleanVectorBuilder::with_capacity(4); - builder.push(Some(false)); - builder.push(Some(true)); - builder.push(Some(false)); - builder.push(Some(true)); + fn test_boolean_vector_build_get() { + let input = [Some(true), None, Some(false)]; + let mut builder = BooleanVectorBuilder::with_capacity(3); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + assert_eq!(input.len(), vector.len()); - let vec = builder.finish(); + let res: Vec<_> = vector.iter_data().collect(); + assert_eq!(input, &res[..]); - assert_eq!(4, vec.len()); - for i in 0..4 { - assert_eq!( - i == 1 || i == 3, - vec.get_data(i).unwrap(), - "failed at {}", - i - ) + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vector.get_data(i)); } } + + #[test] + fn test_boolean_vector_validity() { + let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); + assert_eq!(1, vector.null_count()); + let validity = vector.validity(); + let slots = validity.slots().unwrap(); + assert_eq!(1, slots.null_count()); + assert!(!slots.get_bit(1)); + + let vector = BooleanVector::from(vec![true, false, false]); + assert_eq!(0, vector.null_count()); + assert_eq!(Validity::AllValid, vector.validity()); + } } diff --git a/src/datatypes/src/vectors/null.rs b/src/datatypes/src/vectors/null.rs index 9e644954ee..16e7865b9f 100644 --- a/src/datatypes/src/vectors/null.rs +++ b/src/datatypes/src/vectors/null.rs @@ -6,14 +6,13 @@ use arrow::array::ArrayRef; use arrow::array::{Array, NullArray}; use arrow::datatypes::DataType as ArrowDataType; use snafu::OptionExt; -use snafu::ResultExt; use crate::data_type::ConcreteDataType; -use crate::error::{Result, SerializeSnafu}; +use crate::error::Result; use crate::serialize::Serializable; use crate::types::NullType; use crate::vectors::impl_try_from_arrow_array_for_vector; -use crate::vectors::Vector; +use crate::vectors::{Validity, Vector}; pub struct NullVector { array: NullArray, @@ -49,6 +48,10 @@ impl Vector for NullVector { fn to_arrow_array(&self) -> ArrayRef { Arc::new(self.array.clone()) } + + fn validity(&self) -> Validity { + Validity::AllNull + } } impl fmt::Debug for NullVector { @@ -57,14 +60,11 @@ impl fmt::Debug for NullVector { } } -const NULL_STR: &str = "NULL"; impl Serializable for NullVector { fn serialize_to_json(&self) -> Result> { - vec![NULL_STR.to_owned(); self.len()] - .into_iter() - .map(serde_json::to_value) - .collect::>() - .context(SerializeSnafu) + Ok(std::iter::repeat(serde_json::Value::Null) + .take(self.len()) + .collect()) } } @@ -72,16 +72,16 @@ impl_try_from_arrow_array_for_vector!(NullArray, NullVector); #[cfg(test)] mod tests { - use serde_json::Value as JsonValue; + use serde_json; use super::*; #[test] - fn test_null_array() { - let null_arr = NullVector::new(32); + fn test_null_vector() { + let vector = NullVector::new(32); - assert_eq!(null_arr.len(), 32); - let arrow_arr = null_arr.to_arrow_array(); + assert_eq!(vector.len(), 32); + let arrow_arr = vector.to_arrow_array(); assert_eq!(arrow_arr.null_count(), 32); let array2 = arrow_arr.slice(8, 16); @@ -90,21 +90,25 @@ mod tests { } #[test] - fn test_debug_null_array() { + fn test_debug_null_vector() { let array = NullVector::new(1024 * 1024); assert_eq!(format!("{:?}", array), "NullVector(1048576)"); } #[test] fn test_serialize_json() { - let null_vec = NullVector::new(3); + let vector = NullVector::new(3); + let json_value = vector.serialize_to_json().unwrap(); assert_eq!( - vec![ - JsonValue::from(NULL_STR), - JsonValue::from(NULL_STR), - JsonValue::from(NULL_STR), - ], - null_vec.serialize_to_json().unwrap() + "[null,null,null]", + serde_json::to_string(&json_value).unwrap() ); } + + #[test] + fn test_null_vector_validity() { + let vector = NullVector::new(5); + assert_eq!(Validity::AllNull, vector.validity()); + assert_eq!(5, vector.null_count()); + } } diff --git a/src/datatypes/src/vectors/primitive.rs b/src/datatypes/src/vectors/primitive.rs index dd3ed3214a..24fc44634b 100644 --- a/src/datatypes/src/vectors/primitive.rs +++ b/src/datatypes/src/vectors/primitive.rs @@ -3,7 +3,7 @@ use std::iter::FromIterator; use std::slice::Iter; use std::sync::Arc; -use arrow::array::{ArrayRef, MutablePrimitiveArray, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, MutablePrimitiveArray, PrimitiveArray}; use arrow::bitmap::utils::ZipValidity; use serde_json::Value as JsonValue; use snafu::{OptionExt, ResultExt}; @@ -14,7 +14,7 @@ use crate::error::{Result, SerializeSnafu}; use crate::scalars::{ScalarVector, ScalarVectorBuilder}; use crate::serialize::Serializable; use crate::types::{DataTypeBuilder, Primitive}; -use crate::vectors::Vector; +use crate::vectors::{Validity, Vector}; /// Vector for primitive data types. #[derive(Debug)] @@ -73,6 +73,13 @@ impl Vector for PrimitiveVector { fn to_arrow_array(&self) -> ArrayRef { Arc::new(self.array.clone()) } + + fn validity(&self) -> Validity { + match self.array.validity() { + Some(bitmap) => Validity::Slots(bitmap), + None => Validity::AllValid, + } + } } impl From> for PrimitiveVector { @@ -89,21 +96,13 @@ impl>> FromIterator for Pr } } -impl<'a, T: Primitive> PrimitiveVector { - /// implement iter for PrimitiveVector - #[inline] - pub fn iter(&'a self) -> std::slice::Iter<'a, T> { - self.array.values().iter() - } -} - impl ScalarVector for PrimitiveVector { type RefItem<'a> = T; type Iter<'a> = PrimitiveIter<'a, T>; type Builder = PrimitiveVectorBuilder; fn get_data(&self, idx: usize) -> Option> { - if idx < self.len() { + if self.array.is_valid(idx) { Some(self.array.value(idx)) } else { None @@ -168,7 +167,7 @@ impl ScalarVectorBuilder for PrimitiveVectorBuil impl Serializable for PrimitiveVector { fn serialize_to_json(&self) -> Result> { - self.iter() + self.iter_data() .map(serde_json::to_value) .collect::>() .context(SerializeSnafu) @@ -177,43 +176,92 @@ impl Serializable for PrimitiveVector { #[cfg(test)] mod tests { + use serde_json; + use super::*; use crate::serialize::Serializable; - fn assert_vec_eq(v: PrimitiveVector) { - assert_eq!( - vec![ - JsonValue::from(1i32), - JsonValue::from(2i32), - JsonValue::from(3i32), - JsonValue::from(4i32) - ], - v.serialize_to_json().unwrap() - ); + fn check_vec(v: PrimitiveVector) { + let json_value = v.serialize_to_json().unwrap(); + assert_eq!("[1,2,3,4]", serde_json::to_string(&json_value).unwrap(),); } #[test] fn test_from_values() { let v = PrimitiveVector::::from_values(vec![1, 2, 3, 4]); - assert_vec_eq(v); + check_vec(v); } #[test] fn test_from_vec() { let v = PrimitiveVector::::from_vec(vec![1, 2, 3, 4]); - assert_vec_eq(v); + check_vec(v); } #[test] fn test_from_slice() { let v = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); - assert_vec_eq(v); + check_vec(v); + } + + #[test] + fn test_serialize_primitive_vector_with_null_to_json() { + let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; + let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[1,2,null,4,null]", + serde_json::to_string(&json_value).unwrap(), + ); } #[test] fn test_from_arrow_array() { let arrow_array = PrimitiveArray::from_slice(vec![1, 2, 3, 4]); let v = PrimitiveVector::from(arrow_array); - assert_vec_eq(v); + check_vec(v); + } + + #[test] + fn test_primitive_vector_build_get() { + let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; + let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + assert_eq!(input.len(), vector.len()); + + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vector.get_data(i)); + } + + let res: Vec<_> = vector.iter_data().collect(); + assert_eq!(input, &res[..]); + } + + #[test] + fn test_primitive_vector_validity() { + let input = [Some(1i32), Some(2i32), None, None]; + let mut builder = PrimitiveVectorBuilder::with_capacity(input.len()); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + assert_eq!(2, vector.null_count()); + let validity = vector.validity(); + let slots = validity.slots().unwrap(); + assert_eq!(2, slots.null_count()); + assert!(!slots.get_bit(2)); + assert!(!slots.get_bit(3)); + + let vector = PrimitiveVector::::from_slice(vec![1, 2, 3, 4]); + assert_eq!(0, vector.null_count()); + assert_eq!(Validity::AllValid, vector.validity()); } } diff --git a/src/datatypes/src/vectors/string.rs b/src/datatypes/src/vectors/string.rs index 8aff125db4..d445693e52 100644 --- a/src/datatypes/src/vectors/string.rs +++ b/src/datatypes/src/vectors/string.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, Utf8ValuesIter}; +use arrow::array::{Array, ArrayRef, Utf8ValuesIter}; use arrow::bitmap::utils::ZipValidity; use serde_json::Value; use snafu::OptionExt; @@ -10,7 +10,7 @@ use snafu::ResultExt; use crate::arrow_array::{MutableStringArray, StringArray}; use crate::data_type::ConcreteDataType; use crate::error::SerializeSnafu; -use crate::prelude::{ScalarVectorBuilder, Vector}; +use crate::prelude::{ScalarVectorBuilder, Validity, Vector}; use crate::scalars::ScalarVector; use crate::serialize::Serializable; use crate::types::StringType; @@ -44,6 +44,13 @@ impl Vector for StringVector { fn to_arrow_array(&self) -> ArrayRef { Arc::new(self.array.clone()) } + + fn validity(&self) -> Validity { + match self.array.validity() { + Some(bitmap) => Validity::Slots(bitmap), + None => Validity::AllValid, + } + } } impl ScalarVector for StringVector { @@ -52,9 +59,10 @@ impl ScalarVector for StringVector { type Builder = StringVectorBuilder; fn get_data(&self, idx: usize) -> Option> { - match idx < self.array.len() { - true => Some(self.array.value(idx)), - false => None, + if self.array.is_valid(idx) { + Some(self.array.value(idx)) + } else { + None } } @@ -89,8 +97,7 @@ impl ScalarVectorBuilder for StringVectorBuilder { impl Serializable for StringVector { fn serialize_to_json(&self) -> crate::error::Result> { - self.array - .iter() + self.iter_data() .map(|v| match v { None => Ok(serde_json::Value::Null), Some(s) => serde_json::to_value(s), @@ -104,31 +111,24 @@ impl_try_from_arrow_array_for_vector!(StringArray, StringVector); #[cfg(test)] mod tests { + use serde_json; + use super::*; #[test] - pub fn test_serialize_string_vector() { + fn test_serialize_string_vector() { let mut builder = StringVectorBuilder::with_capacity(3); builder.push(Some("hello")); builder.push(None); builder.push(Some("world")); let string_vector = builder.finish(); - let serialized = serialize_to_json_string(string_vector.serialize_to_json().unwrap()); + let serialized = + serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); assert_eq!(r#"["hello",null,"world"]"#, serialized); } - pub fn serialize_to_json_string(val: T) -> String - where - T: serde::Serialize, - { - let mut output = vec![]; - let mut serializer = serde_json::Serializer::new(&mut output); - val.serialize(&mut serializer).unwrap(); - String::from_utf8_lossy(&output).into() - } - #[test] - pub fn test_from_arrow_array() { + fn test_from_arrow_array() { let mut builder = MutableStringArray::new(); builder.push(Some("A")); builder.push(Some("B")); @@ -138,7 +138,26 @@ mod tests { let vector = StringVector::from(string_array); assert_eq!( r#"["A","B",null,"D"]"#, - serialize_to_json_string(vector.serialize_to_json().unwrap()) + serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), ); } + + #[test] + fn test_string_vector_build_get() { + let mut builder = StringVectorBuilder::with_capacity(4); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let vector = builder.finish(); + + assert_eq!(Some("hello"), vector.get_data(0)); + assert_eq!(None, vector.get_data(1)); + assert_eq!(Some("world"), vector.get_data(2)); + + let mut iter = vector.iter_data(); + assert_eq!("hello", iter.next().unwrap().unwrap()); + assert_eq!(None, iter.next().unwrap()); + assert_eq!("world", iter.next().unwrap().unwrap()); + assert_eq!(None, iter.next()); + } }