From 519cbc832ad2632c69124751eb4424bf23533d06 Mon Sep 17 00:00:00 2001 From: "Lei, Huang" Date: Wed, 18 May 2022 14:49:36 +0800 Subject: [PATCH] feat: add StringVector datatype (#28) --- src/datatypes/src/lib.rs | 7 +- src/datatypes/src/types.rs | 2 + src/datatypes/src/types/string_type.rs | 34 +++++++ src/datatypes/src/vectors.rs | 1 + src/datatypes/src/vectors/string.rs | 119 +++++++++++++++++++++++++ 5 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 src/datatypes/src/types/string_type.rs create mode 100644 src/datatypes/src/vectors/string.rs diff --git a/src/datatypes/src/lib.rs b/src/datatypes/src/lib.rs index 10fb9ed36f..f81267247b 100644 --- a/src/datatypes/src/lib.rs +++ b/src/datatypes/src/lib.rs @@ -1,6 +1,7 @@ #![feature(generic_associated_types)] -use arrow::array::{BinaryArray, MutableBinaryArray}; +use arrow::array; +use arrow::array::{BinaryArray, MutableBinaryArray, Utf8Array}; mod data_type; pub mod prelude; @@ -12,6 +13,10 @@ pub mod vectors; pub type LargeBinaryArray = BinaryArray; pub type MutableLargeBinaryArray = MutableBinaryArray; + +pub type StringArray = Utf8Array; +pub type MutableStringArray = array::MutableUtf8Array; + pub mod schema; pub mod deserialize; diff --git a/src/datatypes/src/types.rs b/src/datatypes/src/types.rs index 61982e99f9..a8a97a3911 100644 --- a/src/datatypes/src/types.rs +++ b/src/datatypes/src/types.rs @@ -1,7 +1,9 @@ mod binary_type; mod primitive_traits; mod primitive_type; +mod string_type; pub use binary_type::BinaryType; pub use primitive_traits::Primitive; pub use primitive_type::{DataTypeBuilder, PrimitiveType}; +pub use string_type::StringType; diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs new file mode 100644 index 0000000000..d5a7e5dae8 --- /dev/null +++ b/src/datatypes/src/types/string_type.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use arrow::datatypes::DataType as ArrowDataType; +use common_base::bytes::StringBytes; + +use crate::data_type::DataType; +use crate::prelude::{DataTypeRef, LogicalTypeId, Value}; + +#[derive(Debug, Default)] +pub struct StringType; + +impl StringType { + pub fn arc() -> DataTypeRef { + Arc::new(Self) + } +} + +impl DataType for StringType { + fn name(&self) -> &str { + "String" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::String + } + + fn default_value(&self) -> Value { + StringBytes::default().into() + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Utf8 + } +} diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index ce1837e71a..17dd2a901b 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -1,5 +1,6 @@ pub mod binary; pub mod primitive; +mod string; use std::any::Any; use std::sync::Arc; diff --git a/src/datatypes/src/vectors/string.rs b/src/datatypes/src/vectors/string.rs new file mode 100644 index 0000000000..7f00f8d7b5 --- /dev/null +++ b/src/datatypes/src/vectors/string.rs @@ -0,0 +1,119 @@ +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, Utf8ValuesIter}; +use arrow::bitmap::utils::ZipValidity; +use serde_json::Value; +use snafu::ResultExt; + +use crate::data_type::DataTypeRef; +use crate::error::SerializeSnafu; +use crate::prelude::{ScalarVectorBuilder, Vector}; +use crate::scalars::ScalarVector; +use crate::serialize::Serializable; +use crate::types::StringType; +use crate::{MutableStringArray, StringArray}; + +/// String array wrapper +#[derive(Clone)] +pub struct StringVector { + array: StringArray, +} + +impl Vector for StringVector { + fn data_type(&self) -> DataTypeRef { + StringType::arc() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + Arc::new(self.array.clone()) + } +} + +impl ScalarVector for StringVector { + type RefItem<'a> = &'a str; + type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>; + type Builder = StringVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + match idx < self.array.len() { + true => Some(self.array.value(idx)), + false => None, + } + } + + fn iter_data(&self) -> Self::Iter<'_> { + self.array.iter() + } +} + +pub struct StringVectorBuilder { + buffer: MutableStringArray, +} + +impl ScalarVectorBuilder for StringVectorBuilder { + type VectorType = StringVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + buffer: MutableStringArray::with_capacity(capacity), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + self.buffer.push(value) + } + + fn finish(self) -> Self::VectorType { + Self::VectorType { + array: self.buffer.into(), + } + } +} + +impl Serializable for StringVector { + fn serialize_to_json(&self) -> crate::error::Result> { + self.array + .iter() + .map(|v| match v { + None => Ok(serde_json::Value::Null), + Some(s) => serde_json::to_value(s), + }) + .collect::>() + .context(SerializeSnafu) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + pub fn test_serialize_string_vector() { + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let string_vector = builder.finish(); + let serialized = serialize_to_json_string(string_vector.serialize_to_json().unwrap()); + assert_eq!(r#"["hello",null,"world"]"#, serialized); + } + + pub fn serialize_to_json_string(val: T) -> String + where + T: serde::Serialize, + { + let mut output = vec![]; + let mut serializer = serde_json::Serializer::new(&mut output); + val.serialize(&mut serializer).unwrap(); + String::from_utf8_lossy(&output).into() + } +}