feat: add StringVector datatype (#28)

This commit is contained in:
Lei, Huang
2022-05-18 14:49:36 +08:00
committed by GitHub
parent bd4fe1f5bc
commit 519cbc832a
5 changed files with 162 additions and 1 deletions

View File

@@ -1,6 +1,7 @@
#![feature(generic_associated_types)]
use arrow::array::{BinaryArray, MutableBinaryArray};
use arrow::array;
use arrow::array::{BinaryArray, MutableBinaryArray, Utf8Array};
mod data_type;
pub mod prelude;
@@ -12,6 +13,10 @@ pub mod vectors;
pub type LargeBinaryArray = BinaryArray<i64>;
pub type MutableLargeBinaryArray = MutableBinaryArray<i64>;
pub type StringArray = Utf8Array<i32>;
pub type MutableStringArray = array::MutableUtf8Array<i32>;
pub mod schema;
pub mod deserialize;

View File

@@ -1,7 +1,9 @@
mod binary_type;
mod primitive_traits;
mod primitive_type;
mod string_type;
pub use binary_type::BinaryType;
pub use primitive_traits::Primitive;
pub use primitive_type::{DataTypeBuilder, PrimitiveType};
pub use string_type::StringType;

View File

@@ -0,0 +1,34 @@
use std::sync::Arc;
use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::StringBytes;
use crate::data_type::DataType;
use crate::prelude::{DataTypeRef, LogicalTypeId, Value};
#[derive(Debug, Default)]
pub struct StringType;
impl StringType {
pub fn arc() -> DataTypeRef {
Arc::new(Self)
}
}
impl DataType for StringType {
fn name(&self) -> &str {
"String"
}
fn logical_type_id(&self) -> LogicalTypeId {
LogicalTypeId::String
}
fn default_value(&self) -> Value {
StringBytes::default().into()
}
fn as_arrow_type(&self) -> ArrowDataType {
ArrowDataType::Utf8
}
}

View File

@@ -1,5 +1,6 @@
pub mod binary;
pub mod primitive;
mod string;
use std::any::Any;
use std::sync::Arc;

View File

@@ -0,0 +1,119 @@
use std::any::Any;
use std::sync::Arc;
use arrow::array::{ArrayRef, Utf8ValuesIter};
use arrow::bitmap::utils::ZipValidity;
use serde_json::Value;
use snafu::ResultExt;
use crate::data_type::DataTypeRef;
use crate::error::SerializeSnafu;
use crate::prelude::{ScalarVectorBuilder, Vector};
use crate::scalars::ScalarVector;
use crate::serialize::Serializable;
use crate::types::StringType;
use crate::{MutableStringArray, StringArray};
/// String array wrapper
#[derive(Clone)]
pub struct StringVector {
array: StringArray,
}
impl Vector for StringVector {
fn data_type(&self) -> DataTypeRef {
StringType::arc()
}
fn as_any(&self) -> &dyn Any {
self
}
fn len(&self) -> usize {
self.array.len()
}
fn to_arrow_array(&self) -> ArrayRef {
Arc::new(self.array.clone())
}
}
impl ScalarVector for StringVector {
type RefItem<'a> = &'a str;
type Iter<'a> = ZipValidity<'a, &'a str, Utf8ValuesIter<'a, i32>>;
type Builder = StringVectorBuilder;
fn get_data(&self, idx: usize) -> Option<Self::RefItem<'_>> {
match idx < self.array.len() {
true => Some(self.array.value(idx)),
false => None,
}
}
fn iter_data(&self) -> Self::Iter<'_> {
self.array.iter()
}
}
pub struct StringVectorBuilder {
buffer: MutableStringArray,
}
impl ScalarVectorBuilder for StringVectorBuilder {
type VectorType = StringVector;
fn with_capacity(capacity: usize) -> Self {
Self {
buffer: MutableStringArray::with_capacity(capacity),
}
}
fn push(&mut self, value: Option<<Self::VectorType as ScalarVector>::RefItem<'_>>) {
self.buffer.push(value)
}
fn finish(self) -> Self::VectorType {
Self::VectorType {
array: self.buffer.into(),
}
}
}
impl Serializable for StringVector {
fn serialize_to_json(&self) -> crate::error::Result<Vec<Value>> {
self.array
.iter()
.map(|v| match v {
None => Ok(serde_json::Value::Null),
Some(s) => serde_json::to_value(s),
})
.collect::<serde_json::Result<_>>()
.context(SerializeSnafu)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
pub fn test_serialize_string_vector() {
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("hello"));
builder.push(None);
builder.push(Some("world"));
let string_vector = builder.finish();
let serialized = serialize_to_json_string(string_vector.serialize_to_json().unwrap());
assert_eq!(r#"["hello",null,"world"]"#, serialized);
}
pub fn serialize_to_json_string<T>(val: T) -> String
where
T: serde::Serialize,
{
let mut output = vec![];
let mut serializer = serde_json::Serializer::new(&mut output);
val.serialize(&mut serializer).unwrap();
String::from_utf8_lossy(&output).into()
}
}