feat: introduce vector type (#4964)

* feat: introduce vector type

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: fix prepared stmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: add grpc test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: parse vector value

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: column to row

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: sqlness

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: merge issue

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: add check for bytes size

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* Update tests/cases/standalone/common/types/vector/vector.sql

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>

* chore: update proto

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: simplify cargo

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: address comment

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Zhenchi
2024-11-12 16:28:44 +08:00
committed by GitHub
parent 84aa5b7b22
commit d616bd92ef
32 changed files with 1109 additions and 120 deletions

17
Cargo.lock generated
View File

@@ -1041,7 +1041,7 @@ dependencies = [
"bitflags 2.6.0",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"itertools 0.10.5",
"lazy_static",
"lazycell",
"log",
@@ -4583,7 +4583,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=67bb1d52bc1241972c368657e658592b1be7ead3#67bb1d52bc1241972c368657e658592b1be7ead3"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=75c5fb569183bb3d0fa1023df9c2214df722b9b1#75c5fb569183bb3d0fa1023df9c2214df722b9b1"
dependencies = [
"prost 0.12.6",
"serde",
@@ -5079,7 +5079,7 @@ dependencies = [
"httpdate",
"itoa",
"pin-project-lite",
"socket2 0.5.7",
"socket2 0.4.10",
"tokio",
"tower-service",
"tracing",
@@ -6069,7 +6069,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
dependencies = [
"cfg-if",
"windows-targets 0.52.6",
"windows-targets 0.48.5",
]
[[package]]
@@ -8810,7 +8810,7 @@ checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck 0.5.0",
"itertools 0.12.1",
"itertools 0.10.5",
"log",
"multimap",
"once_cell",
@@ -8862,7 +8862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
dependencies = [
"anyhow",
"itertools 0.12.1",
"itertools 0.10.5",
"proc-macro2",
"quote",
"syn 2.0.79",
@@ -9024,7 +9024,7 @@ dependencies = [
"indoc",
"libc",
"memoffset 0.9.1",
"parking_lot 0.12.3",
"parking_lot 0.11.2",
"portable-atomic",
"pyo3-build-config",
"pyo3-ffi",
@@ -12359,6 +12359,7 @@ dependencies = [
"frontend",
"futures",
"futures-util",
"hex",
"itertools 0.10.5",
"meta-client",
"meta-srv",
@@ -13968,7 +13969,7 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.59.0",
"windows-sys 0.48.0",
]
[[package]]

View File

@@ -117,11 +117,12 @@ datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "7823ef2f63663907edab46af0d51359900f608d6" }
derive_builder = "0.12"
dotenv = "0.15"
etcd-client = { version = "0.13" }
etcd-client = "0.13"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "67bb1d52bc1241972c368657e658592b1be7ead3" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "75c5fb569183bb3d0fa1023df9c2214df722b9b1" }
hex = "0.4"
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
@@ -151,7 +152,7 @@ raft-engine = { version = "0.4.1", default-features = false }
rand = "0.8"
ratelimit = "0.9"
regex = "1.8"
regex-automata = { version = "0.4" }
regex-automata = "0.4"
reqwest = { version = "0.12", default-features = false, features = [
"json",
"rustls-tls-native-roots",
@@ -182,11 +183,11 @@ strum = { version = "0.25", features = ["derive"] }
tempfile = "3"
tokio = { version = "1.40", features = ["full"] }
tokio-postgres = "0.7"
tokio-stream = { version = "0.1" }
tokio-stream = "0.1"
tokio-util = { version = "0.7", features = ["io-util", "compat"] }
toml = "0.8.8"
tonic = { version = "0.11", features = ["tls", "gzip", "zstd"] }
tower = { version = "0.4" }
tower = "0.4"
tracing-appender = "0.2"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
typetag = "0.2"

View File

@@ -36,15 +36,14 @@ use datatypes::vectors::{
TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt32Vector,
UInt64Vector, VectorRef,
};
use greptime_proto::v1;
use greptime_proto::v1::column_data_type_extension::TypeExt;
use greptime_proto::v1::ddl_request::Expr;
use greptime_proto::v1::greptime_request::Request;
use greptime_proto::v1::query_request::Query;
use greptime_proto::v1::value::ValueData;
use greptime_proto::v1::{
ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, JsonTypeExtension, QueryRequest,
Row, SemanticType,
self, ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, JsonTypeExtension,
QueryRequest, Row, SemanticType, VectorTypeExtension,
};
use paste::paste;
use snafu::prelude::*;
@@ -150,6 +149,17 @@ impl From<ColumnDataTypeWrapper> for ConcreteDataType {
ConcreteDataType::decimal128_default_datatype()
}
}
ColumnDataType::Vector => {
if let Some(TypeExt::VectorType(d)) = datatype_wrapper
.datatype_ext
.as_ref()
.and_then(|datatype_ext| datatype_ext.type_ext.as_ref())
{
ConcreteDataType::vector_datatype(d.dim)
} else {
ConcreteDataType::vector_default_datatype()
}
}
}
}
}
@@ -231,6 +241,15 @@ impl ColumnDataTypeWrapper {
}),
}
}
pub fn vector_datatype(dim: u32) -> Self {
ColumnDataTypeWrapper {
datatype: ColumnDataType::Vector,
datatype_ext: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim })),
}),
}
}
}
impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
@@ -271,6 +290,7 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
IntervalType::MonthDayNano(_) => ColumnDataType::IntervalMonthDayNano,
},
ConcreteDataType::Decimal128(_) => ColumnDataType::Decimal128,
ConcreteDataType::Vector(_) => ColumnDataType::Vector,
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_)
@@ -299,6 +319,15 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
None
}
}
ColumnDataType::Vector => {
datatype
.as_vector()
.map(|vector_type| ColumnDataTypeExtension {
type_ext: Some(TypeExt::VectorType(VectorTypeExtension {
dim: vector_type.dim as _,
})),
})
}
_ => None,
};
Ok(Self {
@@ -422,6 +451,10 @@ pub fn values_with_capacity(datatype: ColumnDataType, capacity: usize) -> Values
string_values: Vec::with_capacity(capacity),
..Default::default()
},
ColumnDataType::Vector => Values {
binary_values: Vec::with_capacity(capacity),
..Default::default()
},
}
}
@@ -673,6 +706,7 @@ pub fn pb_values_to_vector_ref(data_type: &ConcreteDataType, values: Values) ->
Decimal128::from_value_precision_scale(x.hi, x.lo, d.precision(), d.scale()).into()
}),
)),
ConcreteDataType::Vector(_) => Arc::new(BinaryVector::from_vec(values.binary_values)),
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_)
@@ -838,6 +872,7 @@ pub fn pb_values_to_values(data_type: &ConcreteDataType, values: Values) -> Vec<
))
})
.collect(),
ConcreteDataType::Vector(_) => values.binary_values.into_iter().map(|v| v.into()).collect(),
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_)
@@ -862,10 +897,7 @@ pub fn is_column_type_value_eq(
ColumnDataTypeWrapper::try_new(type_value, type_extension)
.map(|wrapper| {
let datatype = ConcreteDataType::from(wrapper);
(datatype == *expect_type)
// Json type leverage binary type in pb, so this is valid.
|| (datatype == ConcreteDataType::binary_datatype()
&& *expect_type == ConcreteDataType::json_datatype())
expect_type == &datatype
})
.unwrap_or(false)
}
@@ -1152,6 +1184,10 @@ mod tests {
let values = values_with_capacity(ColumnDataType::Decimal128, 2);
let values = values.decimal128_values;
assert_eq!(2, values.capacity());
let values = values_with_capacity(ColumnDataType::Vector, 2);
let values = values.binary_values;
assert_eq!(2, values.capacity());
}
#[test]
@@ -1239,7 +1275,11 @@ mod tests {
assert_eq!(
ConcreteDataType::decimal128_datatype(10, 2),
ColumnDataTypeWrapper::decimal128_datatype(10, 2).into()
)
);
assert_eq!(
ConcreteDataType::vector_datatype(3),
ColumnDataTypeWrapper::vector_datatype(3).into()
);
}
#[test]
@@ -1335,6 +1375,10 @@ mod tests {
.try_into()
.unwrap()
);
assert_eq!(
ColumnDataTypeWrapper::vector_datatype(3),
ConcreteDataType::vector_datatype(3).try_into().unwrap()
);
let result: Result<ColumnDataTypeWrapper> = ConcreteDataType::null_datatype().try_into();
assert!(result.is_err());

View File

@@ -218,6 +218,12 @@ pub fn values(arrays: &[VectorRef]) -> Result<Values> {
Decimal128Vector,
decimal128_values,
|x| { convert_to_pb_decimal128(x) }
),
(
ConcreteDataType::Vector(_),
BinaryVector,
binary_values,
|x| { x.into() }
)
)
}

View File

@@ -39,7 +39,7 @@ derive_builder.workspace = true
etcd-client.workspace = true
futures.workspace = true
futures-util.workspace = true
hex = { version = "0.4" }
hex.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
lazy_static.workspace = true

View File

@@ -36,7 +36,7 @@ use crate::types::{
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonType,
ListType, NullType, StringType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
UInt16Type, UInt32Type, UInt64Type, UInt8Type, VectorType,
};
use crate::value::Value;
use crate::vectors::MutableVector;
@@ -84,6 +84,9 @@ pub enum ConcreteDataType {
// JSON type:
Json(JsonType),
// Vector type:
Vector(VectorType),
}
impl fmt::Display for ConcreteDataType {
@@ -132,6 +135,7 @@ impl fmt::Display for ConcreteDataType {
ConcreteDataType::List(v) => write!(f, "{}", v.name()),
ConcreteDataType::Dictionary(v) => write!(f, "{}", v.name()),
ConcreteDataType::Json(v) => write!(f, "{}", v.name()),
ConcreteDataType::Vector(v) => write!(f, "{}", v.name()),
}
}
}
@@ -167,6 +171,7 @@ impl ConcreteDataType {
| ConcreteDataType::Decimal128(_)
| ConcreteDataType::Binary(_)
| ConcreteDataType::Json(_)
| ConcreteDataType::Vector(_)
)
}
@@ -225,6 +230,10 @@ impl ConcreteDataType {
matches!(self, ConcreteDataType::Json(_))
}
pub fn is_vector(&self) -> bool {
matches!(self, ConcreteDataType::Vector(_))
}
pub fn numerics() -> Vec<ConcreteDataType> {
vec![
ConcreteDataType::int8_datatype(),
@@ -334,6 +343,13 @@ impl ConcreteDataType {
}
}
pub fn as_vector(&self) -> Option<VectorType> {
match self {
ConcreteDataType::Vector(v) => Some(*v),
_ => None,
}
}
/// Checks if the data type can cast to another data type.
pub fn can_arrow_type_cast_to(&self, to_type: &ConcreteDataType) -> bool {
let array = arrow_array::new_empty_array(&self.as_arrow_type());
@@ -564,6 +580,14 @@ impl ConcreteDataType {
pub fn decimal128_default_datatype() -> ConcreteDataType {
Self::decimal128_datatype(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE)
}
pub fn vector_datatype(dim: u32) -> ConcreteDataType {
ConcreteDataType::Vector(VectorType::new(dim))
}
pub fn vector_default_datatype() -> ConcreteDataType {
Self::vector_datatype(0)
}
}
/// Data type abstraction.
@@ -757,6 +781,7 @@ mod tests {
assert!(ConcreteDataType::duration_microsecond_datatype().is_stringifiable());
assert!(ConcreteDataType::duration_nanosecond_datatype().is_stringifiable());
assert!(ConcreteDataType::decimal128_datatype(10, 2).is_stringifiable());
assert!(ConcreteDataType::vector_default_datatype().is_stringifiable());
}
#[test]
@@ -909,5 +934,9 @@ mod tests {
.to_string(),
"Dictionary<Int32, String>"
);
assert_eq!(
ConcreteDataType::vector_datatype(3).to_string(),
"Vector(3)"
);
}
}

View File

@@ -196,6 +196,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Invalid Vector: {}", msg))]
InvalidVector {
msg: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Value exceeds the precision {} bound", precision))]
ValueExceedsPrecision {
precision: u8,
@@ -213,6 +220,12 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to parse extended type in metadata: {}", value))]
ParseExtendedType {
value: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Invalid fulltext option: {}", msg))]
InvalidFulltextOption {
msg: String,
@@ -238,6 +251,7 @@ impl ErrorExt for Error {
| InvalidTimestampPrecision { .. }
| InvalidPrecisionOrScale { .. }
| InvalidJson { .. }
| InvalidVector { .. }
| InvalidFulltextOption { .. } => StatusCode::InvalidArguments,
ValueExceedsPrecision { .. }
@@ -253,7 +267,8 @@ impl ErrorExt for Error {
| ProjectArrowSchema { .. }
| ToScalarValue { .. }
| TryFromValue { .. }
| ConvertArrowArrayToScalars { .. } => StatusCode::Internal,
| ConvertArrowArrayToScalars { .. }
| ParseExtendedType { .. } => StatusCode::Internal,
}
}

View File

@@ -21,11 +21,12 @@ use std::fmt;
use std::sync::Arc;
use arrow::datatypes::{Field, Schema as ArrowSchema};
use column_schema::ColumnExtType;
use datafusion_common::DFSchemaRef;
use snafu::{ensure, ResultExt};
use crate::error::{self, DuplicateColumnSnafu, Error, ProjectArrowSchemaSnafu, Result};
use crate::prelude::DataType;
use crate::prelude::ConcreteDataType;
pub use crate::schema::column_schema::{
ColumnSchema, FulltextAnalyzer, FulltextOptions, Metadata,
COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE, COLUMN_FULLTEXT_OPT_KEY_ANALYZER,
@@ -263,9 +264,14 @@ fn collect_fields(column_schemas: &[ColumnSchema]) -> Result<FieldsAndIndices> {
}
let mut field = Field::try_from(column_schema)?;
// Json column performs the same as binary column in Arrow, so we need to mark it
if column_schema.data_type.is_json() {
let metadata = HashMap::from([(TYPE_KEY.to_string(), column_schema.data_type.name())]);
// Column with type Json or Vector performs the same as binary column in Arrow, so we need to mark it
let extype = match column_schema.data_type {
ConcreteDataType::Json(_) => Some(ColumnExtType::Json),
ConcreteDataType::Vector(d) => Some(ColumnExtType::Vector(d.dim)),
_ => None,
};
if let Some(extype) = extype {
let metadata = HashMap::from([(TYPE_KEY.to_string(), extype.to_string())]);
field = field.with_metadata(metadata);
}
fields.push(field);

View File

@@ -14,6 +14,7 @@
use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;
use arrow::datatypes::Field;
use serde::{Deserialize, Serialize};
@@ -21,10 +22,9 @@ use snafu::{ensure, ResultExt};
use sqlparser_derive::{Visit, VisitMut};
use crate::data_type::{ConcreteDataType, DataType};
use crate::error::{self, Error, InvalidFulltextOptionSnafu, Result};
use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result};
use crate::schema::constraint::ColumnDefaultConstraint;
use crate::schema::TYPE_KEY;
use crate::types::JSON_TYPE_NAME;
use crate::value::Value;
use crate::vectors::VectorRef;
@@ -300,17 +300,57 @@ impl ColumnSchema {
}
}
/// Column extended type set in column schema's metadata.
///
/// Json and Vector columns are physically stored as Arrow binary, so the
/// logical type is preserved by writing this marker (via `Display`) into the
/// field metadata and restoring it (via `FromStr`) when reading back.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ColumnExtType {
    /// Json type.
    Json,
    /// Vector type with dimension.
    Vector(u32),
}

impl fmt::Display for ColumnExtType {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            ColumnExtType::Json => write!(f, "Json"),
            ColumnExtType::Vector(dim) => write!(f, "Vector({})", dim),
        }
    }
}

impl FromStr for ColumnExtType {
    type Err = String;

    /// Parses the marker produced by `Display`: `"Json"` or `"Vector(<dim>)"`.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        if s == "Json" {
            return Ok(ColumnExtType::Json);
        }
        // Use strip_prefix/strip_suffix instead of the hard-coded byte index
        // `s[7..]` so the slice offset cannot silently drift out of sync with
        // the "Vector(" pattern.
        if let Some(dim) = s.strip_prefix("Vector(").and_then(|r| r.strip_suffix(')')) {
            return dim
                .parse::<u32>()
                .map(ColumnExtType::Vector)
                .map_err(|_| "Invalid dimension for Vector".to_string());
        }
        Err("Unknown variant".to_string())
    }
}
impl TryFrom<&Field> for ColumnSchema {
type Error = Error;
fn try_from(field: &Field) -> Result<ColumnSchema> {
let mut data_type = ConcreteDataType::try_from(field.data_type())?;
// Override the data type if it is specified in the metadata.
if field.metadata().contains_key(TYPE_KEY) {
data_type = match field.metadata().get(TYPE_KEY).unwrap().as_str() {
JSON_TYPE_NAME => ConcreteDataType::json_datatype(),
_ => data_type,
};
if let Some(s) = field.metadata().get(TYPE_KEY) {
let extype = ColumnExtType::from_str(s)
.map_err(|_| ParseExtendedTypeSnafu { value: s }.build())?;
match extype {
ColumnExtType::Json => {
data_type = ConcreteDataType::json_datatype();
}
ColumnExtType::Vector(dim) => {
data_type = ConcreteDataType::vector_datatype(dim);
}
}
}
let mut metadata = field.metadata().clone();
let default_constraint = match metadata.remove(DEFAULT_CONSTRAINT_KEY) {
@@ -661,5 +701,24 @@ mod tests {
column_schema.metadata.get(TYPE_KEY).unwrap(),
&ConcreteDataType::json_datatype().name()
);
let field = Field::new("test", ArrowDataType::Binary, true);
let field = field.with_metadata(Metadata::from([(
TYPE_KEY.to_string(),
ConcreteDataType::vector_datatype(3).name(),
)]));
let column_schema = ColumnSchema::try_from(&field).unwrap();
assert_eq!("test", column_schema.name);
assert_eq!(
ConcreteDataType::vector_datatype(3),
column_schema.data_type
);
assert!(column_schema.is_nullable);
assert!(!column_schema.is_time_index);
assert!(column_schema.default_constraint.is_none());
assert_eq!(
column_schema.metadata.get(TYPE_KEY).unwrap(),
&ConcreteDataType::vector_datatype(3).name()
);
}
}

View File

@@ -94,7 +94,7 @@ impl ColumnDefaultConstraint {
// Whether the value could be nullable has been checked before, only need
// to check the type compatibility here.
ensure!(
data_type.logical_type_id() == v.logical_type_id(),
value_type_match(data_type, v.data_type()),
error::DefaultValueTypeSnafu {
reason: format!(
"column has type {:?} but default value has type {:?}",
@@ -215,6 +215,17 @@ fn create_current_timestamp_vector(
}
}
/// Returns true when a default value of `value_type` is acceptable for a
/// column of `column_type`. Vector and Json columns are physically encoded
/// as binary, so a binary-typed value is also accepted for them.
fn value_type_match(column_type: &ConcreteDataType, value_type: ConcreteDataType) -> bool {
    // Exact logical-type match is always acceptable.
    if column_type.logical_type_id() == value_type.logical_type_id() {
        return true;
    }
    // Vector and Json type is encoded as binary
    matches!(
        (column_type, value_type),
        (
            ConcreteDataType::Vector(_) | ConcreteDataType::Json(_),
            ConcreteDataType::Binary(_)
        )
    )
}
#[cfg(test)]
mod tests {
use std::sync::Arc;

View File

@@ -70,6 +70,8 @@ pub enum LogicalTypeId {
Dictionary,
Json,
Vector,
}
impl LogicalTypeId {
@@ -129,6 +131,7 @@ impl LogicalTypeId {
LogicalTypeId::DurationNanosecond => ConcreteDataType::duration_nanosecond_datatype(),
LogicalTypeId::Decimal128 => ConcreteDataType::decimal128_default_datatype(),
LogicalTypeId::Json => ConcreteDataType::json_datatype(),
LogicalTypeId::Vector => ConcreteDataType::vector_default_datatype(),
}
}
}

View File

@@ -28,6 +28,7 @@ mod primitive_type;
mod string_type;
mod time_type;
mod timestamp_type;
mod vector_type;
pub use binary_type::BinaryType;
pub use boolean_type::BooleanType;
@@ -58,3 +59,4 @@ pub use timestamp_type::{
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, TimestampType,
};
pub use vector_type::{parse_string_to_vector_type_value, vector_type_value_to_string, VectorType};

View File

@@ -0,0 +1,237 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow::datatypes::DataType as ArrowDataType;
use common_base::bytes::Bytes;
use serde::{Deserialize, Serialize};
use crate::data_type::DataType;
use crate::error::{InvalidVectorSnafu, Result};
use crate::scalars::ScalarVectorBuilder;
use crate::type_id::LogicalTypeId;
use crate::value::Value;
use crate::vectors::{BinaryVectorBuilder, MutableVector};
/// `VectorType` is a data type for vector data with a fixed dimension.
/// The type of items in the vector is float32.
/// It is stored as binary data that contains the concatenated float32 values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub struct VectorType {
// Number of float32 elements in each vector value.
pub dim: u32,
}
impl VectorType {
/// Creates a vector type with the given dimension.
pub fn new(dim: u32) -> Self {
Self { dim }
}
}
impl DataType for VectorType {
    /// Human-readable name including the dimension, e.g. "Vector(3)".
    fn name(&self) -> String {
        format!("Vector({})", self.dim)
    }

    fn logical_type_id(&self) -> LogicalTypeId {
        LogicalTypeId::Vector
    }

    /// The default vector value is an empty byte payload.
    fn default_value(&self) -> Value {
        Value::from(Bytes::default())
    }

    /// Vector values are physically stored as Arrow binary data.
    fn as_arrow_type(&self) -> ArrowDataType {
        ArrowDataType::Binary
    }

    fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
        let builder = BinaryVectorBuilder::with_capacity(capacity);
        Box::new(builder)
    }

    /// Only binary values can be cast to the vector type; everything else is rejected.
    fn try_cast(&self, from: Value) -> Option<Value> {
        if let Value::Binary(bytes) = from {
            Some(Value::Binary(bytes))
        } else {
            None
        }
    }
}
/// Converts a vector type value to string
/// for example: [1.0, 2.0, 3.0] -> "[1.0,2.0,3.0]"
///
/// `val` must hold exactly `dim` native-endian float32 values, otherwise an
/// `InvalidVector` error is returned.
pub fn vector_type_value_to_string(val: &[u8], dim: u32) -> Result<String> {
    let expected_len = dim as usize * std::mem::size_of::<f32>();
    if val.len() != expected_len {
        return InvalidVectorSnafu {
            msg: format!(
                "Failed to convert Vector value to string: wrong byte size, expected {}, got {}",
                expected_len,
                val.len()
            ),
        }
        .fail();
    }
    if dim == 0 {
        return Ok("[]".to_string());
    }
    // Decode each element with `from_ne_bytes` instead of casting the byte
    // pointer to `*const f32`: a `&[u8]` is only 1-byte aligned, so the old
    // `slice::from_raw_parts(val.as_ptr() as *const f32, ..)` was undefined
    // behavior on misaligned input. Native-endian decoding matches the
    // encoding written by `parse_string_to_vector_type_value`.
    let mut s = String::from("[");
    for (i, chunk) in val.chunks_exact(std::mem::size_of::<f32>()).enumerate() {
        if i > 0 {
            s.push(',');
        }
        // `chunks_exact` guarantees exactly 4 bytes per chunk.
        let e = f32::from_ne_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
        s.push_str(&e.to_string());
    }
    s.push(']');
    Ok(s)
}
/// Parses a string to a vector type value
/// Valid input format: "[1.0,2.0,3.0]", "[1.0, 2.0, 3.0]"
///
/// Returns the native-endian concatenation of the parsed float32 elements.
/// Errors when the brackets are malformed, an element is not a valid
/// float32, or the element count differs from `dim`.
pub fn parse_string_to_vector_type_value(s: &str, dim: u32) -> Result<Vec<u8>> {
    // Trim the brackets
    let trimmed = s.trim();
    if !trimmed.starts_with('[') || !trimmed.ends_with(']') {
        return InvalidVectorSnafu {
            msg: format!("Failed to parse {s} to Vector value: not properly enclosed in brackets"),
        }
        .fail();
    }
    // Remove the brackets (both are ASCII, so the byte offsets are valid
    // char boundaries).
    let content = trimmed[1..trimmed.len() - 1].trim();
    if content.is_empty() {
        // An empty list is only valid for a zero-dimension vector.
        if dim != 0 {
            return InvalidVectorSnafu {
                msg: format!("Failed to parse {s} to Vector value: wrong dimension"),
            }
            .fail();
        }
        return Ok(vec![]);
    }
    let elements = content
        .split(',')
        .map(|e| {
            e.trim().parse::<f32>().map_err(|_| {
                InvalidVectorSnafu {
                    msg: format!(
                        "Failed to parse {s} to Vector value: elements are not all float32"
                    ),
                }
                .build()
            })
        })
        .collect::<Result<Vec<f32>>>()?;
    // Check dimension
    if elements.len() != dim as usize {
        return InvalidVectorSnafu {
            msg: format!("Failed to parse {s} to Vector value: wrong dimension"),
        }
        .fail();
    }
    // Serialize with `to_ne_bytes` instead of an unsafe pointer transmute:
    // the native-endian byte form is exactly the in-memory layout the old
    // `slice::from_raw_parts` cast produced, with no `unsafe` needed.
    let bytes = elements
        .iter()
        .flat_map(|e| e.to_ne_bytes())
        .collect::<Vec<u8>>();
    Ok(bytes)
}
// Unit tests for the string <-> vector value conversions above.
#[cfg(test)]
mod tests {
use super::*;
// Round-trips strings through parse + to_string and checks the normalized output.
#[test]
fn test_conversion_between_string_and_vector_type_value() {
let dim = 3;
let cases = [
("[1.0,2.0,3]", "[1,2,3]"),
("[0.0 , 0.0 , 0.0]", "[0,0,0]"),
("[3.4028235e38, -3.4028235e38, 1.1754944e-38]", "[340282350000000000000000000000000000000,-340282350000000000000000000000000000000,0.000000000000000000000000000000000000011754944]"),
];
for (s, expected) in cases.iter() {
let val = parse_string_to_vector_type_value(s, dim).unwrap();
let s = vector_type_value_to_string(&val, dim).unwrap();
assert_eq!(s, *expected);
}
// Zero-dimension vectors round-trip to "[]".
let dim = 0;
// NOTE(review): the last two cases read as identical here; they were
// possibly meant to differ in whitespace — confirm against the repo.
let cases = [("[]", "[]"), ("[ ]", "[]"), ("[ ]", "[]")];
for (s, expected) in cases.iter() {
let val = parse_string_to_vector_type_value(s, dim).unwrap();
let s = vector_type_value_to_string(&val, dim).unwrap();
assert_eq!(s, *expected);
}
}
// A byte payload whose length is not dim * 4 must be rejected.
#[test]
fn test_vector_type_value_to_string_wrong_byte_size() {
let dim = 3;
let val = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
let res = vector_type_value_to_string(&val, dim);
assert!(res.is_err());
let dim = 0;
let val = vec![1];
let res = vector_type_value_to_string(&val, dim);
assert!(res.is_err());
}
// Missing either bracket (or both) must be rejected.
#[test]
fn test_parse_string_to_vector_type_value_not_properly_enclosed_in_brackets() {
let dim = 3;
let s = "1.0,2.0,3.0";
let res = parse_string_to_vector_type_value(s, dim);
assert!(res.is_err());
let s = "[1.0,2.0,3.0";
let res = parse_string_to_vector_type_value(s, dim);
assert!(res.is_err());
let s = "1.0,2.0,3.0]";
let res = parse_string_to_vector_type_value(s, dim);
assert!(res.is_err());
}
// Element count not matching `dim` must be rejected.
#[test]
fn test_parse_string_to_vector_type_value_wrong_dimension() {
let dim = 3;
let s = "[1.0,2.0]";
let res = parse_string_to_vector_type_value(s, dim);
assert!(res.is_err());
}
// Non-float elements must be rejected.
#[test]
fn test_parse_string_to_vector_type_value_elements_are_not_all_float32() {
let dim = 3;
let s = "[1.0,2.0,ah]";
let res = parse_string_to_vector_type_value(s, dim);
assert!(res.is_err());
}
}

View File

@@ -520,7 +520,9 @@ pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValu
ConcreteDataType::UInt64(_) => ScalarValue::UInt64(None),
ConcreteDataType::Float32(_) => ScalarValue::Float32(None),
ConcreteDataType::Float64(_) => ScalarValue::Float64(None),
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => ScalarValue::Binary(None),
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) | ConcreteDataType::Vector(_) => {
ScalarValue::Binary(None)
}
ConcreteDataType::String(_) => ScalarValue::Utf8(None),
ConcreteDataType::Date(_) => ScalarValue::Date32(None),
ConcreteDataType::DateTime(_) => ScalarValue::Date64(None),

View File

@@ -20,9 +20,10 @@ use snafu::ResultExt;
use crate::arrow_array::{BinaryArray, MutableBinaryArray};
use crate::data_type::ConcreteDataType;
use crate::error::{self, Result};
use crate::error::{self, InvalidVectorSnafu, Result};
use crate::scalars::{ScalarVector, ScalarVectorBuilder};
use crate::serialize::Serializable;
use crate::types::parse_string_to_vector_type_value;
use crate::value::{Value, ValueRef};
use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef};
@@ -66,6 +67,40 @@ impl BinaryVector {
}
Ok(BinaryVector::from(vector))
}
/// Converts a `BinaryVector` holding either UTF-8 vector literals (e.g.
/// `"[1,2,3]"`) or raw float32 payloads into the canonical binary encoding
/// of a vector with dimension `dim`.
///
/// Nulls are kept as nulls. Returns an `InvalidVector` error when an element
/// is neither a parsable vector literal nor a byte payload of the expected
/// size.
pub fn convert_binary_to_vector(&self, dim: u32) -> Result<BinaryVector> {
    let arrow_array = self.to_arrow_array();
    // Expected payload size for an already-encoded vector value. Hoisted out
    // of the loop (it is loop-invariant) and reused below instead of
    // duplicating the `dim * size_of::<f32>()` expression.
    let bytes_size = dim as usize * std::mem::size_of::<f32>();
    let mut vector = vec![];
    for binary in arrow_array
        .as_any()
        .downcast_ref::<BinaryArray>()
        .unwrap()
        .iter()
    {
        let v = if let Some(binary) = binary {
            if let Ok(s) = String::from_utf8(binary.to_vec()) {
                // Textual form: parse and validate against `dim`.
                let v = parse_string_to_vector_type_value(&s, dim)?;
                Some(v)
            } else if binary.len() == bytes_size {
                // Already the raw float32 payload; keep as-is.
                Some(binary.to_vec())
            } else {
                return InvalidVectorSnafu {
                    msg: format!(
                        "Unexpected bytes size for vector value, expected {}, got {}",
                        bytes_size,
                        binary.len()
                    ),
                }
                .fail();
            }
        } else {
            None
        };
        vector.push(v);
    }
    Ok(BinaryVector::from(vector))
}
}
impl From<BinaryArray> for BinaryVector {
@@ -473,4 +508,45 @@ mod tests {
.unwrap_err();
assert_matches!(error, error::Error::InvalidJson { .. });
}
// Checks that `convert_binary_to_vector` parses textual vector literals into
// concatenated float32 payloads and passes nulls through unchanged.
#[test]
fn test_binary_vector_conversion() {
let dim = 3;
let vector = BinaryVector::from(vec![
Some(b"[1,2,3]".to_vec()),
Some(b"[4,5,6]".to_vec()),
Some(b"[7,8,9]".to_vec()),
None,
]);
// Expected encoding: each element as little-endian f32 bytes, concatenated.
let expected = BinaryVector::from(vec![
Some(
[1.0f32, 2.0, 3.0]
.iter()
.flat_map(|v| v.to_le_bytes())
.collect(),
),
Some(
[4.0f32, 5.0, 6.0]
.iter()
.flat_map(|v| v.to_le_bytes())
.collect(),
),
Some(
[7.0f32, 8.0, 9.0]
.iter()
.flat_map(|v| v.to_le_bytes())
.collect(),
),
None,
]);
let converted = vector.convert_binary_to_vector(dim).unwrap();
assert_eq!(converted.len(), expected.len());
// Only the three non-null entries are compared element-wise; the null at
// index 3 is covered by the length assertion above.
for i in 0..3 {
assert_eq!(
converted.get_ref(i).as_binary().unwrap().unwrap(),
expected.get_ref(i).as_binary().unwrap().unwrap()
);
}
}
}

View File

@@ -80,7 +80,7 @@ fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool {
match lhs.data_type() {
Null(_) => true,
Boolean(_) => is_vector_eq!(BooleanVector, lhs, rhs),
Binary(_) | Json(_) => is_vector_eq!(BinaryVector, lhs, rhs),
Binary(_) | Json(_) | Vector(_) => is_vector_eq!(BinaryVector, lhs, rhs),
String(_) => is_vector_eq!(StringVector, lhs, rhs),
Date(_) => is_vector_eq!(DateVector, lhs, rhs),
DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs),

View File

@@ -91,10 +91,17 @@ macro_rules! impl_scalar_vector_op {
}
fn cast(&self, to_type: &ConcreteDataType) -> Result<VectorRef> {
if to_type == &ConcreteDataType::json_datatype() {
if let Some(vector) = self.as_any().downcast_ref::<BinaryVector>() {
let json_vector = vector.convert_binary_to_json()?;
return Ok(Arc::new(json_vector) as VectorRef);
if let Some(vector) = self.as_any().downcast_ref::<BinaryVector>() {
match to_type {
ConcreteDataType::Json(_) => {
let json_vector = vector.convert_binary_to_json()?;
return Ok(Arc::new(json_vector) as VectorRef);
}
ConcreteDataType::Vector(d) => {
let vector = vector.convert_binary_to_vector(d.dim)?;
return Ok(Arc::new(vector) as VectorRef);
}
_ => {}
}
}
cast::cast_non_constant!(self, to_type)

View File

@@ -369,16 +369,23 @@ pub(crate) fn validate_proto_value(
column_schema: &ColumnSchema,
) -> Result<()> {
if let Some(value_type) = proto_value_type(value) {
let column_type = ColumnDataType::try_from(column_schema.datatype).map_err(|_| {
InvalidRequestSnafu {
region_id,
reason: format!(
"column {} has unknown type {}",
column_schema.column_name, column_schema.datatype
),
}
.build()
})?;
ensure!(
value_type as i32 == column_schema.datatype,
proto_value_type_match(column_type, value_type),
InvalidRequestSnafu {
region_id,
reason: format!(
"value has type {:?}, but column {} has type {:?}({})",
value_type,
column_schema.column_name,
ColumnDataType::try_from(column_schema.datatype),
column_schema.datatype,
value_type, column_schema.column_name, column_type, column_schema.datatype,
),
}
);
@@ -387,6 +394,14 @@ pub(crate) fn validate_proto_value(
Ok(())
}
/// Returns true when a proto value's declared type is acceptable for a
/// column of `column_type`. Vector columns carry their payload as binary on
/// the wire, so a binary value is also accepted for a vector column.
fn proto_value_type_match(column_type: ColumnDataType, value_type: ColumnDataType) -> bool {
    column_type == value_type
        || matches!(
            (column_type, value_type),
            (ColumnDataType::Vector, ColumnDataType::Binary)
        )
}
/// Oneshot output result sender.
#[derive(Debug)]
pub struct OutputTx(Sender<Result<AffectedRows>>);

View File

@@ -69,7 +69,9 @@ impl SortField {
ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => 9,
ConcreteDataType::Float32(_) => 5,
ConcreteDataType::Float64(_) => 9,
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => 11,
ConcreteDataType::Binary(_)
| ConcreteDataType::Json(_)
| ConcreteDataType::Vector(_) => 11,
ConcreteDataType::String(_) => 11, // a non-empty string takes at least 11 bytes.
ConcreteDataType::Date(_) => 5,
ConcreteDataType::DateTime(_) => 9,
@@ -165,7 +167,8 @@ impl SortField {
Time, time,
Duration, duration,
Decimal128, decimal128,
Json, binary
Json, binary,
Vector, binary
);
Ok(())
@@ -188,7 +191,7 @@ impl SortField {
Ok(Value::from(Option::<$f>::deserialize(deserializer).context(error::DeserializeFieldSnafu)?))
}
)*
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => Ok(Value::from(
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) | ConcreteDataType::Vector(_) => Ok(Value::from(
Option::<Vec<u8>>::deserialize(deserializer)
.context(error::DeserializeFieldSnafu)?
.map(Bytes::from),
@@ -273,7 +276,9 @@ impl SortField {
ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => 9,
ConcreteDataType::Float32(_) => 5,
ConcreteDataType::Float64(_) => 9,
ConcreteDataType::Binary(_) | ConcreteDataType::Json(_) => {
ConcreteDataType::Binary(_)
| ConcreteDataType::Json(_)
| ConcreteDataType::Vector(_) => {
// Now the encoder encode binary as a list of bytes so we can't use
// skip bytes.
let pos_before = deserializer.position();
@@ -606,6 +611,7 @@ mod tests {
ConcreteDataType::interval_day_time_datatype(),
ConcreteDataType::interval_month_day_nano_datatype(),
ConcreteDataType::decimal128_default_datatype(),
ConcreteDataType::vector_datatype(3),
],
vec![
Value::Boolean(true),
@@ -630,6 +636,7 @@ mod tests {
Value::IntervalDayTime(IntervalDayTime::new(1, 15)),
Value::IntervalMonthDayNano(IntervalMonthDayNano::new(1, 1, 15)),
Value::Decimal128(Decimal128::from(16)),
Value::Binary(Bytes::from(vec![0; 12])),
],
);
}

View File

@@ -25,6 +25,7 @@ use api::v1::{
RowDeleteRequest, RowInsertRequest, Rows, SemanticType, Value,
};
use common_base::BitVec;
use datatypes::prelude::ConcreteDataType;
use datatypes::vectors::VectorRef;
use snafu::prelude::*;
use snafu::ResultExt;
@@ -53,6 +54,7 @@ fn encode_string_to_jsonb_binary(value_data: ValueData) -> Result<ValueData> {
/// Prepares row insertion requests by converting any JSON values to binary JSONB format.
pub fn preprocess_row_insert_requests(requests: &mut Vec<RowInsertRequest>) -> Result<()> {
for request in requests {
validate_rows(&request.rows)?;
prepare_rows(&mut request.rows)?;
}
@@ -62,6 +64,7 @@ pub fn preprocess_row_insert_requests(requests: &mut Vec<RowInsertRequest>) -> R
/// Prepares row deletion requests by converting any JSON values to binary JSONB format.
pub fn preprocess_row_delete_requests(requests: &mut Vec<RowDeleteRequest>) -> Result<()> {
for request in requests {
validate_rows(&request.rows)?;
prepare_rows(&mut request.rows)?;
}
@@ -102,6 +105,58 @@ fn prepare_rows(rows: &mut Option<Rows>) -> Result<()> {
Ok(())
}
fn validate_rows(rows: &Option<Rows>) -> Result<()> {
let Some(rows) = rows else {
return Ok(());
};
for (col_idx, schema) in rows.schema.iter().enumerate() {
let column_type =
ColumnDataTypeWrapper::try_new(schema.datatype, schema.datatype_extension.clone())
.context(ColumnDataTypeSnafu)?
.into();
let ConcreteDataType::Vector(d) = column_type else {
return Ok(());
};
for row in &rows.rows {
let value = &row.values[col_idx].value_data;
if let Some(data) = value {
validate_vector_col(data, d.dim)?;
}
}
}
Ok(())
}
fn validate_vector_col(data: &ValueData, dim: u32) -> Result<()> {
let data = match data {
ValueData::BinaryValue(data) => data,
_ => {
return InvalidInsertRequestSnafu {
reason: "Expecting binary data for vector column.".to_string(),
}
.fail();
}
};
let expected_len = dim as usize * std::mem::size_of::<f32>();
if data.len() != expected_len {
return InvalidInsertRequestSnafu {
reason: format!(
"Expecting {} bytes of data for vector column, but got {}.",
expected_len,
data.len()
),
}
.fail();
}
Ok(())
}
pub fn columns_to_rows(columns: Vec<Column>, row_count: u32) -> Result<Rows> {
let row_count = row_count as usize;
let column_count = columns.len();
@@ -236,6 +291,7 @@ fn push_column_to_rows(column: Column, rows: &mut [Row]) -> Result<()> {
interval_month_day_nano_values
),
(Decimal128, Decimal128Value, decimal128_values),
(Vector, BinaryValue, binary_values),
);
Ok(())
@@ -264,12 +320,7 @@ pub fn column_schema(
) -> Result<Vec<ColumnSchema>> {
columns
.iter()
.map(|(column_name, vector)| {
let (datatype, datatype_extension) =
ColumnDataTypeWrapper::try_from(vector.data_type().clone())
.context(ColumnDataTypeSnafu)?
.to_parts();
.map(|(column_name, _vector)| {
let column_schema = table_info
.meta
.schema
@@ -278,6 +329,11 @@ pub fn column_schema(
msg: format!("unable to find column {column_name} in table schema"),
})?;
let (datatype, datatype_extension) =
ColumnDataTypeWrapper::try_from(column_schema.data_type.clone())
.context(ColumnDataTypeSnafu)?
.to_parts();
Ok(ColumnSchema {
column_name: column_name.clone(),
datatype: datatype as i32,
@@ -322,7 +378,7 @@ fn semantic_type(table_info: &TableInfo, column: &str) -> Result<SemanticType> {
#[cfg(test)]
mod tests {
use api::v1::column::Values;
use api::v1::SemanticType;
use api::v1::{SemanticType, VectorTypeExtension};
use common_base::bit_vec::prelude::*;
use super::*;
@@ -356,30 +412,57 @@ mod tests {
}),
..Default::default()
},
Column {
column_name: String::from("col3"),
datatype: ColumnDataType::Vector.into(),
semantic_type: SemanticType::Field.into(),
null_mask: vec![],
values: Some(Values {
binary_values: vec![vec![0; 4], vec![1; 4], vec![2; 4]],
..Default::default()
}),
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: 1 })),
}),
..Default::default()
},
];
let row_count = 3;
let result = columns_to_rows(columns, row_count);
let rows = result.unwrap();
assert_eq!(rows.schema.len(), 2);
assert_eq!(rows.schema.len(), 3);
assert_eq!(rows.schema[0].column_name, "col1");
assert_eq!(rows.schema[0].datatype, ColumnDataType::Int32 as i32);
assert_eq!(rows.schema[0].semantic_type, SemanticType::Field as i32);
assert_eq!(rows.schema[1].column_name, "col2");
assert_eq!(rows.schema[1].datatype, ColumnDataType::String as i32);
assert_eq!(rows.schema[1].semantic_type, SemanticType::Tag as i32);
assert_eq!(rows.schema[2].column_name, "col3");
assert_eq!(rows.schema[2].datatype, ColumnDataType::Vector as i32);
assert_eq!(rows.schema[2].semantic_type, SemanticType::Field as i32);
assert_eq!(
rows.schema[2].datatype_extension,
Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: 1 }))
})
);
assert_eq!(rows.rows.len(), 3);
assert_eq!(rows.rows[0].values.len(), 2);
assert_eq!(rows.rows[0].values.len(), 3);
assert_eq!(rows.rows[0].values[0].value_data, None);
assert_eq!(
rows.rows[0].values[1].value_data,
Some(ValueData::StringValue(String::from("value1")))
);
assert_eq!(
rows.rows[0].values[2].value_data,
Some(ValueData::BinaryValue(vec![0; 4]))
);
assert_eq!(rows.rows[1].values.len(), 2);
assert_eq!(rows.rows[1].values.len(), 3);
assert_eq!(
rows.rows[1].values[0].value_data,
Some(ValueData::I32Value(42))
@@ -388,13 +471,21 @@ mod tests {
rows.rows[1].values[1].value_data,
Some(ValueData::StringValue(String::from("value2")))
);
assert_eq!(
rows.rows[1].values[2].value_data,
Some(ValueData::BinaryValue(vec![1; 4]))
);
assert_eq!(rows.rows[2].values.len(), 2);
assert_eq!(rows.rows[2].values.len(), 3);
assert_eq!(rows.rows[2].values[0].value_data, None);
assert_eq!(
rows.rows[2].values[1].value_data,
Some(ValueData::StringValue(String::from("value3")))
);
assert_eq!(
rows.rows[2].values[2].value_data,
Some(ValueData::BinaryValue(vec![2; 4]))
);
// wrong type
let columns = vec![Column {
@@ -441,4 +532,37 @@ mod tests {
let row_count = 3;
assert!(columns_to_rows(columns, row_count).is_err());
}
#[test]
fn test_validate_vector_row_success() {
let data = ValueData::BinaryValue(vec![0; 4]);
let dim = 1;
assert!(validate_vector_col(&data, dim).is_ok());
let data = ValueData::BinaryValue(vec![0; 8]);
let dim = 2;
assert!(validate_vector_col(&data, dim).is_ok());
let data = ValueData::BinaryValue(vec![0; 12]);
let dim = 3;
assert!(validate_vector_col(&data, dim).is_ok());
}
#[test]
fn test_validate_vector_row_fail_wrong_type() {
let data = ValueData::I32Value(42);
let dim = 1;
assert!(validate_vector_col(&data, dim).is_err());
}
#[test]
fn test_validate_vector_row_fail_wrong_length() {
let data = ValueData::BinaryValue(vec![0; 8]);
let dim = 1;
assert!(validate_vector_col(&data, dim).is_err());
let data = ValueData::BinaryValue(vec![0; 4]);
let dim = 2;
assert!(validate_vector_col(&data, dim).is_err());
}
}

View File

@@ -562,6 +562,12 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Convert SQL value error"))]
ConvertSqlValue {
source: datatypes::error::Error,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -674,6 +680,8 @@ impl ErrorExt for Error {
ConvertScalarValue { source, .. } => source.status_code(),
ToJson { .. } => StatusCode::Internal,
ConvertSqlValue { source, .. } => source.status_code(),
}
}

View File

@@ -21,6 +21,7 @@ use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
use common_telemetry::{debug, error};
use datatypes::prelude::{ConcreteDataType, Value};
use datatypes::schema::SchemaRef;
use datatypes::types::vector_type_value_to_string;
use futures::StreamExt;
use opensrv_mysql::{
Column, ColumnFlags, ColumnType, ErrorKind, OkResponse, QueryResultWriter, RowWriter,
@@ -29,7 +30,7 @@ use session::context::QueryContextRef;
use snafu::prelude::*;
use tokio::io::AsyncWrite;
use crate::error::{self, Error, Result};
use crate::error::{self, ConvertSqlValueSnafu, Error, Result};
use crate::metrics::*;
/// Try to write multiple output to the writer if possible.
@@ -168,7 +169,7 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> {
&mut row_writer,
&record_batch,
query_context.clone(),
&column_def,
&query_result.schema,
)
.await?
}
@@ -192,10 +193,10 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> {
row_writer: &mut RowWriter<'_, W>,
recordbatch: &RecordBatch,
query_context: QueryContextRef,
column_def: &[Column],
schema: &SchemaRef,
) -> Result<()> {
for row in recordbatch.rows() {
for (value, column) in row.into_iter().zip(column_def.iter()) {
for (value, column) in row.into_iter().zip(schema.column_schemas().iter()) {
match value {
Value::Null => row_writer.write_col(None::<u8>)?,
Value::Boolean(v) => row_writer.write_col(v as i8)?,
@@ -210,10 +211,15 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> {
Value::Float32(v) => row_writer.write_col(v.0)?,
Value::Float64(v) => row_writer.write_col(v.0)?,
Value::String(v) => row_writer.write_col(v.as_utf8())?,
Value::Binary(v) => match column.coltype {
ColumnType::MYSQL_TYPE_JSON => {
Value::Binary(v) => match column.data_type {
ConcreteDataType::Json(_) => {
row_writer.write_col(jsonb::to_string(&v))?;
}
ConcreteDataType::Vector(d) => {
let s = vector_type_value_to_string(&v, d.dim)
.context(ConvertSqlValueSnafu)?;
row_writer.write_col(s)?;
}
_ => {
row_writer.write_col(v.deref())?;
}
@@ -295,6 +301,7 @@ pub(crate) fn create_mysql_column(
ConcreteDataType::Duration(_) => Ok(ColumnType::MYSQL_TYPE_TIME),
ConcreteDataType::Decimal128(_) => Ok(ColumnType::MYSQL_TYPE_DECIMAL),
ConcreteDataType::Json(_) => Ok(ColumnType::MYSQL_TYPE_JSON),
ConcreteDataType::Vector(_) => Ok(ColumnType::MYSQL_TYPE_STRING),
_ => error::UnsupportedDataTypeSnafu {
data_type,
reason: "not implemented",

View File

@@ -27,7 +27,7 @@ use datafusion_expr::LogicalPlan;
use datatypes::arrow::datatypes::DataType as ArrowDataType;
use datatypes::prelude::{ConcreteDataType, Value};
use datatypes::schema::Schema;
use datatypes::types::{IntervalType, TimestampType};
use datatypes::types::{vector_type_value_to_string, IntervalType, TimestampType};
use datatypes::value::ListValue;
use pgwire::api::portal::{Format, Portal};
use pgwire::api::results::{DataRowEncoder, FieldInfo};
@@ -364,6 +364,24 @@ fn encode_array(
.collect::<PgWireResult<Vec<Option<String>>>>()?;
builder.encode_field(&array)
}
&ConcreteDataType::Vector(d) => {
let array = value_list
.items()
.iter()
.map(|v| match v {
Value::Null => Ok(None),
Value::Binary(v) => {
let s = vector_type_value_to_string(v, d.dim)
.map_err(|e| PgWireError::ApiError(Box::new(e)))?;
Ok(Some(s))
}
_ => Err(PgWireError::ApiError(Box::new(Error::Internal {
err_msg: format!("Invalid list item type, find {v:?}, expected vector",),
}))),
})
.collect::<PgWireResult<Vec<Option<String>>>>()?;
builder.encode_field(&array)
}
_ => Err(PgWireError::ApiError(Box::new(Error::Internal {
err_msg: format!(
"cannot write array type {:?} in postgres protocol: unimplemented",
@@ -395,6 +413,11 @@ pub(super) fn encode_value(
Value::String(v) => builder.encode_field(&v.as_utf8()),
Value::Binary(v) => match datatype {
ConcreteDataType::Json(_) => builder.encode_field(&jsonb::to_string(v)),
ConcreteDataType::Vector(d) => {
let s = vector_type_value_to_string(v, d.dim)
.map_err(|e| PgWireError::ApiError(Box::new(e)))?;
builder.encode_field(&s)
}
_ => {
let bytea_output = query_ctx.configuration_parameter().postgres_bytea_output();
match *bytea_output {
@@ -499,6 +522,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
&ConcreteDataType::Json(_) => Ok(Type::JSON_ARRAY),
&ConcreteDataType::Duration(_)
| &ConcreteDataType::Dictionary(_)
| &ConcreteDataType::Vector(_)
| &ConcreteDataType::List(_) => server_error::UnsupportedDataTypeSnafu {
data_type: origin,
reason: "not implemented",
@@ -512,6 +536,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
}
.fail()
}
&ConcreteDataType::Vector(_) => Ok(Type::FLOAT4_ARRAY),
}
}

View File

@@ -325,6 +325,13 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Datatype error: {}", source))]
Datatype {
source: datatypes::error::Error,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
@@ -363,6 +370,7 @@ impl ErrorExt for Error {
SerializeColumnDefaultConstraint { source, .. } => source.status_code(),
ConvertToGrpcDataType { source, .. } => source.status_code(),
Datatype { source, .. } => source.status_code(),
ConvertToDfStatement { .. } => StatusCode::Internal,
ConvertSqlValue { .. } | ConvertValue { .. } => StatusCode::Unsupported,

View File

@@ -39,7 +39,7 @@ use crate::parser::{ParserContext, FLOW};
use crate::parsers::utils::validate_column_fulltext_create_option;
use crate::statements::create::{
Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable,
CreateTableLike, CreateView, Partitions, TableConstraint,
CreateTableLike, CreateView, Partitions, TableConstraint, VECTOR_OPT_DIM,
};
use crate::statements::statement::Statement;
use crate::statements::{
@@ -668,6 +668,31 @@ impl<'a> ParserContext<'a> {
column_type: &DataType,
column_extensions: &mut ColumnExtensions,
) -> Result<bool> {
if let DataType::Custom(name, tokens) = column_type
&& name.0.len() == 1
&& &name.0[0].value.to_uppercase() == "VECTOR"
{
ensure!(
tokens.len() == 1,
InvalidColumnOptionSnafu {
name: column_name.to_string(),
msg: "VECTOR type should have dimension",
}
);
let dimension =
tokens[0]
.parse::<u32>()
.ok()
.with_context(|| InvalidColumnOptionSnafu {
name: column_name.to_string(),
msg: "dimension should be a positive integer",
})?;
let options = HashMap::from_iter([(VECTOR_OPT_DIM.to_string(), dimension.to_string())]);
column_extensions.vector_options = Some(options.into());
}
if parser.parse_keyword(Keyword::FULLTEXT) {
ensure!(
column_extensions.fulltext_options.is_none(),

View File

@@ -42,10 +42,10 @@ use common_time::Timestamp;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::constraint::{CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_FN};
use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, COMMENT_KEY};
use datatypes::types::{cast, TimestampType};
use datatypes::types::{cast, parse_string_to_vector_type_value, TimestampType};
use datatypes::value::{OrderedF32, OrderedF64, Value};
use snafu::{ensure, OptionExt, ResultExt};
use sqlparser::ast::{ExactNumberInfo, UnaryOperator};
use sqlparser::ast::{ExactNumberInfo, Ident, ObjectName, UnaryOperator};
use crate::ast::{
ColumnDef, ColumnOption, ColumnOptionDef, DataType as SqlDataType, Expr, TimezoneInfo,
@@ -53,7 +53,7 @@ use crate::ast::{
};
use crate::error::{
self, ColumnTypeMismatchSnafu, ConvertSqlValueSnafu, ConvertToGrpcDataTypeSnafu,
ConvertValueSnafu, InvalidCastSnafu, InvalidSqlValueSnafu, InvalidUnaryOpSnafu,
ConvertValueSnafu, DatatypeSnafu, InvalidCastSnafu, InvalidSqlValueSnafu, InvalidUnaryOpSnafu,
ParseSqlValueSnafu, Result, SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu,
TimestampOverflowSnafu, UnsupportedDefaultValueSnafu, UnsupportedUnaryOpSnafu,
};
@@ -61,6 +61,8 @@ use crate::statements::create::Column;
pub use crate::statements::option_map::OptionMap;
pub use crate::statements::transform::{get_data_type_by_alias_name, transform_statements};
const VECTOR_TYPE_NAME: &str = "VECTOR";
fn parse_string_to_value(
column_name: &str,
s: String,
@@ -134,6 +136,10 @@ fn parse_string_to_value(
.fail()
}
}
ConcreteDataType::Vector(d) => {
let v = parse_string_to_vector_type_value(&s, d.dim).context(DatatypeSnafu)?;
Ok(Value::Binary(v.into()))
}
_ => {
unreachable!()
}
@@ -614,6 +620,20 @@ pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result<Co
}
},
SqlDataType::JSON => Ok(ConcreteDataType::json_datatype()),
// Vector type
SqlDataType::Custom(name, d)
if name.0.as_slice().len() == 1
&& name.0.as_slice()[0].value.to_ascii_uppercase() == VECTOR_TYPE_NAME
&& d.len() == 1 =>
{
let dim = d[0].parse().map_err(|e| {
error::ParseSqlValueSnafu {
msg: format!("Failed to parse vector dimension: {}", e),
}
.build()
})?;
Ok(ConcreteDataType::vector_datatype(dim))
}
_ => error::SqlTypeNotSupportedSnafu {
t: data_type.clone(),
}
@@ -651,6 +671,10 @@ pub fn concrete_data_type_to_sql_data_type(data_type: &ConcreteDataType) -> Resu
ExactNumberInfo::PrecisionAndScale(d.precision() as u64, d.scale() as u64),
)),
ConcreteDataType::Json(_) => Ok(SqlDataType::JSON),
ConcreteDataType::Vector(v) => Ok(SqlDataType::Custom(
ObjectName(vec![Ident::new(VECTOR_TYPE_NAME)]),
vec![v.dim.to_string()],
)),
ConcreteDataType::Duration(_)
| ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
@@ -766,6 +790,14 @@ mod tests {
SqlDataType::Interval,
ConcreteDataType::interval_month_day_nano_datatype(),
);
check_type(SqlDataType::JSON, ConcreteDataType::json_datatype());
check_type(
SqlDataType::Custom(
ObjectName(vec![Ident::new(VECTOR_TYPE_NAME)]),
vec!["3".to_string()],
),
ConcreteDataType::vector_datatype(3),
);
}
#[test]
@@ -1489,6 +1521,7 @@ mod tests {
])
.into(),
),
vector_options: None,
},
};
@@ -1501,7 +1534,7 @@ mod tests {
}
#[test]
pub fn test_parse_placeholder_value() {
fn test_parse_placeholder_value() {
assert!(sql_value_to_value(
"test",
&ConcreteDataType::string_datatype(),

View File

@@ -30,6 +30,7 @@ use crate::statements::OptionMap;
const LINE_SEP: &str = ",\n";
const COMMA_SEP: &str = ", ";
const INDENT: usize = 2;
pub const VECTOR_OPT_DIM: &str = "dim";
macro_rules! format_indent {
($fmt: expr, $arg: expr) => {
@@ -112,6 +113,8 @@ pub struct Column {
pub struct ColumnExtensions {
/// Fulltext options.
pub fulltext_options: Option<OptionMap>,
/// Vector options.
pub vector_options: Option<OptionMap>,
}
impl Column {
@@ -138,6 +141,13 @@ impl Column {
impl Display for Column {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
if let Some(vector_options) = &self.extensions.vector_options {
if let Some(dim) = vector_options.get(VECTOR_OPT_DIM) {
write!(f, "{} VECTOR({})", self.column_def.name, dim)?;
return Ok(());
}
}
write!(f, "{}", self.column_def)?;
if let Some(fulltext_options) = &self.extensions.fulltext_options {
if !fulltext_options.is_empty() {

View File

@@ -83,6 +83,7 @@ zstd.workspace = true
[dev-dependencies]
datafusion.workspace = true
datafusion-expr.workspace = true
hex.workspace = true
itertools.workspace = true
opentelemetry-proto.workspace = true
partition.workspace = true

View File

@@ -17,14 +17,16 @@ mod test {
use std::collections::HashMap;
use api::v1::column::Values;
use api::v1::column_data_type_extension::TypeExt;
use api::v1::ddl_request::Expr as DdlExpr;
use api::v1::greptime_request::Request;
use api::v1::query_request::Query;
use api::v1::region::QueryRequest as RegionQueryRequest;
use api::v1::{
alter_expr, AddColumn, AddColumns, AlterExpr, Column, ColumnDataType, ColumnDef,
CreateDatabaseExpr, CreateTableExpr, DdlRequest, DeleteRequest, DeleteRequests,
DropTableExpr, InsertRequest, InsertRequests, QueryRequest, SemanticType,
alter_expr, AddColumn, AddColumns, AlterExpr, Column, ColumnDataType,
ColumnDataTypeExtension, ColumnDef, CreateDatabaseExpr, CreateTableExpr, DdlRequest,
DeleteRequest, DeleteRequests, DropTableExpr, InsertRequest, InsertRequests, QueryRequest,
SemanticType, VectorTypeExtension,
};
use client::OutputData;
use common_catalog::consts::MITO_ENGINE;
@@ -204,6 +206,7 @@ CREATE TABLE {table_name} (
a INT,
b STRING,
c JSON,
d VECTOR(3),
ts TIMESTAMP,
TIME INDEX (ts),
PRIMARY KEY (a, b, c)
@@ -352,6 +355,27 @@ CREATE TABLE {table_name} (
r#"{ "id": 15, "transactions": [{ "amount": 500, "date": "2024-01-01" }, { "amount": -200, "date": "2024-02-01" }] }"#.to_string(),
r#"{ "id": 16, "transactions": [{ "amount": 500, "date": "2024-01-01" }] }"#.to_string(),
];
let vector_values = [
[1.0f32, 2.0, 3.0],
[4.0, 5.0, 6.0],
[7.0, 8.0, 9.0],
[10.0, 11.0, 12.0],
[13.0, 14.0, 15.0],
[16.0, 17.0, 18.0],
[19.0, 20.0, 21.0],
[22.0, 23.0, 24.0],
[25.0, 26.0, 27.0],
[28.0, 29.0, 30.0],
[31.0, 32.0, 33.0],
[34.0, 35.0, 36.0],
[37.0, 38.0, 39.0],
[40.0, 41.0, 42.0],
[43.0, 44.0, 45.0],
[46.0, 47.0, 48.0],
]
.iter()
.map(|x| x.iter().flat_map(|&f| f.to_le_bytes()).collect::<Vec<u8>>())
.collect::<Vec<_>>();
let insert = InsertRequest {
table_name: table_name.to_string(),
@@ -390,6 +414,19 @@ CREATE TABLE {table_name} (
datatype: ColumnDataType::Json as i32,
..Default::default()
},
Column {
column_name: "d".to_string(),
values: Some(Values {
binary_values: vector_values.clone(),
..Default::default()
}),
semantic_type: SemanticType::Field as i32,
datatype: ColumnDataType::Vector as i32,
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: 3 })),
}),
..Default::default()
},
Column {
column_name: "ts".to_string(),
values: Some(Values {
@@ -414,7 +451,7 @@ CREATE TABLE {table_name} (
let request = Request::Query(QueryRequest {
query: Some(Query::Sql(format!(
"SELECT ts, a, b, json_to_string(c) as c FROM {table_name} ORDER BY ts"
"SELECT ts, a, b, json_to_string(c) as c, d FROM {table_name} ORDER BY ts"
))),
});
let output = query(instance, request.clone()).await;
@@ -422,29 +459,53 @@ CREATE TABLE {table_name} (
unreachable!()
};
let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
let expected = r#"+---------------------+----+-------------------+---------------------------------------------------------------------------------------------------+
| ts | a | b | c |
+---------------------+----+-------------------+---------------------------------------------------------------------------------------------------+
| 2023-01-01T07:26:12 | 1 | ts: 1672557972000 | {"active":true,"age":30,"id":1,"name":"Alice"} |
| 2023-01-01T07:26:13 | 2 | ts: 1672557973000 | {"active":false,"balance":1234.56,"id":2,"name":"Bob"} |
| 2023-01-01T07:26:14 | 3 | ts: 1672557974000 | {"age":28,"id":3,"tags":["rust","testing","json"]} |
| 2023-01-01T07:26:15 | 4 | ts: 1672557975000 | {"id":4,"metadata":{"created_at":"2024-10-30T12:00:00Z","status":"inactive"}} |
| 2023-01-01T07:26:16 | 5 | ts: 1672557976000 | {"id":5,"name":null,"phone":"+1234567890"} |
| 2023-01-01T07:26:17 | | ts: 1672557977000 | {"active":true,"height":5.9,"id":6,"weight":72.5} |
| 2023-01-01T07:26:18 | 11 | ts: 1672557978000 | {"age":29,"id":7,"languages":["English","Spanish"]} |
| 2023-01-01T07:26:19 | 12 | ts: 1672557979000 | {"contact":{"email":"hank@example.com","phone":"+0987654321"},"id":8} |
| 2023-01-01T07:26:20 | 20 | ts: 1672557980000 | {"id":9,"preferences":{"notifications":true,"theme":"dark"}} |
| 2023-01-01T07:26:21 | 21 | ts: 1672557981000 | {"active":false,"id":10,"scores":[88,92,76]} |
| 2023-01-01T07:26:22 | 22 | ts: 1672557982000 | {"birthday":"1996-07-20","id":11,"location":{"city":"New York","zip":"10001"}} |
| 2023-01-01T07:26:23 | 23 | ts: 1672557983000 | {"id":12,"subscription":{"expires":"2025-01-01","type":"premium"}} |
| 2023-01-01T07:26:24 | 50 | ts: 1672557984000 | {"active":true,"id":13,"settings":{"brightness":0.6,"volume":0.8}} |
| 2023-01-01T07:26:25 | 51 | ts: 1672557985000 | {"id":14,"notes":["first note","second note"],"priority":1} |
| 2023-01-01T07:26:26 | 52 | ts: 1672557986000 | {"id":15,"transactions":[{"amount":500,"date":"2024-01-01"},{"amount":-200,"date":"2024-02-01"}]} |
| 2023-01-01T07:26:27 | 53 | ts: 1672557987000 | {"id":16,"transactions":[{"amount":500,"date":"2024-01-01"}]} |
+---------------------+----+-------------------+---------------------------------------------------------------------------------------------------+"#;
assert_eq!(recordbatches.pretty_print().unwrap(), expected);
let expected = r#"+---------------------+----+-------------------+---------------------------------------------------------------------------------------------------+--------------------------+
| ts | a | b | c | d |
+---------------------+----+-------------------+---------------------------------------------------------------------------------------------------+--------------------------+
| 2023-01-01T07:26:12 | 1 | ts: 1672557972000 | {"active":true,"age":30,"id":1,"name":"Alice"} | 0000803f0000004000004040 |
| 2023-01-01T07:26:13 | 2 | ts: 1672557973000 | {"active":false,"balance":1234.56,"id":2,"name":"Bob"} | 000080400000a0400000c040 |
| 2023-01-01T07:26:14 | 3 | ts: 1672557974000 | {"age":28,"id":3,"tags":["rust","testing","json"]} | 0000e0400000004100001041 |
| 2023-01-01T07:26:15 | 4 | ts: 1672557975000 | {"id":4,"metadata":{"created_at":"2024-10-30T12:00:00Z","status":"inactive"}} | 000020410000304100004041 |
| 2023-01-01T07:26:16 | 5 | ts: 1672557976000 | {"id":5,"name":null,"phone":"+1234567890"} | 000050410000604100007041 |
| 2023-01-01T07:26:17 | | ts: 1672557977000 | {"active":true,"height":5.9,"id":6,"weight":72.5} | 000080410000884100009041 |
| 2023-01-01T07:26:18 | 11 | ts: 1672557978000 | {"age":29,"id":7,"languages":["English","Spanish"]} | 000098410000a0410000a841 |
| 2023-01-01T07:26:19 | 12 | ts: 1672557979000 | {"contact":{"email":"hank@example.com","phone":"+0987654321"},"id":8} | 0000b0410000b8410000c041 |
| 2023-01-01T07:26:20 | 20 | ts: 1672557980000 | {"id":9,"preferences":{"notifications":true,"theme":"dark"}} | 0000c8410000d0410000d841 |
| 2023-01-01T07:26:21 | 21 | ts: 1672557981000 | {"active":false,"id":10,"scores":[88,92,76]} | 0000e0410000e8410000f041 |
| 2023-01-01T07:26:22 | 22 | ts: 1672557982000 | {"birthday":"1996-07-20","id":11,"location":{"city":"New York","zip":"10001"}} | 0000f8410000004200000442 |
| 2023-01-01T07:26:23 | 23 | ts: 1672557983000 | {"id":12,"subscription":{"expires":"2025-01-01","type":"premium"}} | 0000084200000c4200001042 |
| 2023-01-01T07:26:24 | 50 | ts: 1672557984000 | {"active":true,"id":13,"settings":{"brightness":0.6,"volume":0.8}} | 000014420000184200001c42 |
| 2023-01-01T07:26:25 | 51 | ts: 1672557985000 | {"id":14,"notes":["first note","second note"],"priority":1} | 000020420000244200002842 |
| 2023-01-01T07:26:26 | 52 | ts: 1672557986000 | {"id":15,"transactions":[{"amount":500,"date":"2024-01-01"},{"amount":-200,"date":"2024-02-01"}]} | 00002c420000304200003442 |
| 2023-01-01T07:26:27 | 53 | ts: 1672557987000 | {"id":16,"transactions":[{"amount":500,"date":"2024-01-01"}]} | 0000384200003c4200004042 |
+---------------------+----+-------------------+---------------------------------------------------------------------------------------------------+--------------------------+"#;
similar_asserts::assert_eq!(recordbatches.pretty_print().unwrap(), expected);
let new_grpc_delete_request = |a, b, c, ts, row_count| DeleteRequest {
// Checks if the encoded vector values are as expected.
let hex_repr_of_vector_values = vector_values.iter().map(hex::encode).collect::<Vec<_>>();
assert_eq!(
hex_repr_of_vector_values,
vec![
"0000803f0000004000004040",
"000080400000a0400000c040",
"0000e0400000004100001041",
"000020410000304100004041",
"000050410000604100007041",
"000080410000884100009041",
"000098410000a0410000a841",
"0000b0410000b8410000c041",
"0000c8410000d0410000d841",
"0000e0410000e8410000f041",
"0000f8410000004200000442",
"0000084200000c4200001042",
"000014420000184200001c42",
"000020420000244200002842",
"00002c420000304200003442",
"0000384200003c4200004042",
]
);
let new_grpc_delete_request = |a, b, c, d, ts, row_count| DeleteRequest {
table_name: table_name.to_string(),
key_columns: vec![
Column {
@@ -477,6 +538,19 @@ CREATE TABLE {table_name} (
datatype: ColumnDataType::Json as i32,
..Default::default()
},
Column {
column_name: "d".to_string(),
values: Some(Values {
binary_values: d,
..Default::default()
}),
semantic_type: SemanticType::Field as i32,
datatype: ColumnDataType::Vector as i32,
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: 3 })),
}),
..Default::default()
},
Column {
column_name: "ts".to_string(),
semantic_type: SemanticType::Timestamp as i32,
@@ -504,6 +578,12 @@ CREATE TABLE {table_name} (
r#"{ "id": 11, "birthday": "1996-07-20", "location": { "city": "New York", "zip": "10001" } }"#.to_string(),
r#"{ "id": 15, "transactions": [{ "amount": 500, "date": "2024-01-01" }, { "amount": -200, "date": "2024-02-01" }] }"#.to_string(),
],
vec![
[4.0f32, 5.0, 6.0].iter().flat_map(|f| f.to_le_bytes()).collect::<Vec<u8>>(),
[22.0f32, 23.0, 24.0].iter().flat_map(|f| f.to_le_bytes()).collect::<Vec<u8>>(),
[31.0f32, 32.0, 33.0].iter().flat_map(|f| f.to_le_bytes()).collect::<Vec<u8>>(),
[43.0f32, 44.0, 45.0].iter().flat_map(|f| f.to_le_bytes()).collect::<Vec<u8>>(),
],
vec![1672557973000, 1672557979000, 1672557982000, 1672557986000],
4,
);
@@ -518,6 +598,16 @@ CREATE TABLE {table_name} (
r#"{ "id": 16, "transactions": [{ "amount": 500, "date": "2024-01-01" }] }"#
.to_string(),
],
vec![
[7.0f32, 8.0, 9.0]
.iter()
.flat_map(|f| f.to_le_bytes())
.collect::<Vec<u8>>(),
[46.0f32, 47.0, 48.0]
.iter()
.flat_map(|f| f.to_le_bytes())
.collect::<Vec<u8>>(),
],
vec![1672557974000, 1672557987000],
2,
);
@@ -535,20 +625,20 @@ CREATE TABLE {table_name} (
unreachable!()
};
let recordbatches = RecordBatches::try_collect(stream).await.unwrap();
let expected = r#"+---------------------+----+-------------------+-------------------------------------------------------------------------------+
| ts | a | b | c |
+---------------------+----+-------------------+-------------------------------------------------------------------------------+
| 2023-01-01T07:26:12 | 1 | ts: 1672557972000 | {"active":true,"age":30,"id":1,"name":"Alice"} |
| 2023-01-01T07:26:15 | 4 | ts: 1672557975000 | {"id":4,"metadata":{"created_at":"2024-10-30T12:00:00Z","status":"inactive"}} |
| 2023-01-01T07:26:16 | 5 | ts: 1672557976000 | {"id":5,"name":null,"phone":"+1234567890"} |
| 2023-01-01T07:26:17 | | ts: 1672557977000 | {"active":true,"height":5.9,"id":6,"weight":72.5} |
| 2023-01-01T07:26:18 | 11 | ts: 1672557978000 | {"age":29,"id":7,"languages":["English","Spanish"]} |
| 2023-01-01T07:26:20 | 20 | ts: 1672557980000 | {"id":9,"preferences":{"notifications":true,"theme":"dark"}} |
| 2023-01-01T07:26:21 | 21 | ts: 1672557981000 | {"active":false,"id":10,"scores":[88,92,76]} |
| 2023-01-01T07:26:23 | 23 | ts: 1672557983000 | {"id":12,"subscription":{"expires":"2025-01-01","type":"premium"}} |
| 2023-01-01T07:26:24 | 50 | ts: 1672557984000 | {"active":true,"id":13,"settings":{"brightness":0.6,"volume":0.8}} |
| 2023-01-01T07:26:25 | 51 | ts: 1672557985000 | {"id":14,"notes":["first note","second note"],"priority":1} |
+---------------------+----+-------------------+-------------------------------------------------------------------------------+"#;
let expected = r#"+---------------------+----+-------------------+-------------------------------------------------------------------------------+--------------------------+
| ts | a | b | c | d |
+---------------------+----+-------------------+-------------------------------------------------------------------------------+--------------------------+
| 2023-01-01T07:26:12 | 1 | ts: 1672557972000 | {"active":true,"age":30,"id":1,"name":"Alice"} | 0000803f0000004000004040 |
| 2023-01-01T07:26:15 | 4 | ts: 1672557975000 | {"id":4,"metadata":{"created_at":"2024-10-30T12:00:00Z","status":"inactive"}} | 000020410000304100004041 |
| 2023-01-01T07:26:16 | 5 | ts: 1672557976000 | {"id":5,"name":null,"phone":"+1234567890"} | 000050410000604100007041 |
| 2023-01-01T07:26:17 | | ts: 1672557977000 | {"active":true,"height":5.9,"id":6,"weight":72.5} | 000080410000884100009041 |
| 2023-01-01T07:26:18 | 11 | ts: 1672557978000 | {"age":29,"id":7,"languages":["English","Spanish"]} | 000098410000a0410000a841 |
| 2023-01-01T07:26:20 | 20 | ts: 1672557980000 | {"id":9,"preferences":{"notifications":true,"theme":"dark"}} | 0000c8410000d0410000d841 |
| 2023-01-01T07:26:21 | 21 | ts: 1672557981000 | {"active":false,"id":10,"scores":[88,92,76]} | 0000e0410000e8410000f041 |
| 2023-01-01T07:26:23 | 23 | ts: 1672557983000 | {"id":12,"subscription":{"expires":"2025-01-01","type":"premium"}} | 0000084200000c4200001042 |
| 2023-01-01T07:26:24 | 50 | ts: 1672557984000 | {"active":true,"id":13,"settings":{"brightness":0.6,"volume":0.8}} | 000014420000184200001c42 |
| 2023-01-01T07:26:25 | 51 | ts: 1672557985000 | {"id":14,"notes":["first note","second note"],"priority":1} | 000020420000244200002842 |
+---------------------+----+-------------------+-------------------------------------------------------------------------------+--------------------------+"#;
similar_asserts::assert_eq!(recordbatches.pretty_print().unwrap(), expected);
}

View File

@@ -165,7 +165,7 @@ pub async fn test_mysql_crud(store_type: StorageType) {
.unwrap();
sqlx::query(
"create table demo(i bigint, ts timestamp time index default current_timestamp, d date default null, dt datetime default null, b blob default null, j json default null)",
"create table demo(i bigint, ts timestamp time index default current_timestamp, d date default null, dt datetime default null, b blob default null, j json default null, v vector(3) default null)",
)
.execute(&pool)
.await
@@ -178,7 +178,7 @@ pub async fn test_mysql_crud(store_type: StorageType) {
let d = NaiveDate::from_yo_opt(2015, 100).unwrap();
let hello = format!("hello{i}");
let bytes = hello.as_bytes();
let jsons = serde_json::json!({
let json = serde_json::json!({
"code": i,
"success": true,
"payload": {
@@ -189,19 +189,21 @@ pub async fn test_mysql_crud(store_type: StorageType) {
"homepage": null
}
});
sqlx::query("insert into demo values(?, ?, ?, ?, ?, ?)")
let vector = "[1,2,3]";
sqlx::query("insert into demo values(?, ?, ?, ?, ?, ?, ?)")
.bind(i)
.bind(i)
.bind(d)
.bind(dt)
.bind(bytes)
.bind(jsons)
.bind(json)
.bind(vector)
.execute(&pool)
.await
.unwrap();
}
let rows = sqlx::query("select i, d, dt, b, j from demo")
let rows = sqlx::query("select i, d, dt, b, j, v from demo")
.fetch_all(&pool)
.await
.unwrap();
@@ -213,6 +215,7 @@ pub async fn test_mysql_crud(store_type: StorageType) {
let dt: DateTime<Utc> = row.get("dt");
let bytes: Vec<u8> = row.get("b");
let json: serde_json::Value = row.get("j");
let vector: String = row.get("v");
assert_eq!(ret, i as i64);
let expected_d = NaiveDate::from_yo_opt(2015, 100).unwrap();
assert_eq!(expected_d, d);
@@ -239,6 +242,7 @@ pub async fn test_mysql_crud(store_type: StorageType) {
}
});
assert_eq!(json, expected_j);
assert_eq!(vector, "[1,2,3]");
}
let rows = sqlx::query("select i from demo where i=?")

View File

@@ -0,0 +1,92 @@
CREATE TABLE t (ts TIMESTAMP TIME INDEX, v VECTOR(3));
Affected Rows: 0
INSERT INTO t VALUES
(1, "[1.0, 2.0, 3.0]"),
(2, "[4.0, 5.0, 6.0]"),
(3, "[7.0, 8.0, 9.0]");
Affected Rows: 3
-- SQLNESS PROTOCOL MYSQL
SELECT * FROM t;
+----------------------------+---------+
| ts | v |
+----------------------------+---------+
| 1970-01-01 00:00:00.001000 | [1,2,3] |
| 1970-01-01 00:00:00.002000 | [4,5,6] |
| 1970-01-01 00:00:00.003000 | [7,8,9] |
+----------------------------+---------+
-- SQLNESS PROTOCOL POSTGRES
SELECT * FROM t;
+----------------------------+-----------+
| ts | v |
+----------------------------+-----------+
| 1970-01-01 00:00:00.001000 | "[1,2,3]" |
| 1970-01-01 00:00:00.002000 | "[4,5,6]" |
| 1970-01-01 00:00:00.003000 | "[7,8,9]" |
+----------------------------+-----------+
-- Unexpected dimension --
INSERT INTO t VALUES
(4, "[1.0]");
Error: 1004(InvalidArguments), Invalid Vector: Failed to parse [1.0] to Vector value: wrong dimension
-- Invalid vector value --
INSERT INTO t VALUES
(5, "1.0,2.0,3.0");
Error: 1004(InvalidArguments), Invalid Vector: Failed to parse 1.0,2.0,3.0 to Vector value: not properly enclosed in brackets
-- Invalid vector value --
INSERT INTO t VALUES
(6, "[30h, 40s, 50m]");
Error: 1004(InvalidArguments), Invalid Vector: Failed to parse [30h, 40s, 50m] to Vector value: elements are not all float32
CREATE TABLE t2 (ts TIMESTAMP TIME INDEX, v VECTOR(3) DEFAULT '[1.0, 2.0, 3.0]');
Affected Rows: 0
INSERT INTO t2 (ts) VALUES
(1),
(2),
(3);
Affected Rows: 3
-- SQLNESS PROTOCOL MYSQL
SELECT * FROM t2;
+----------------------------+---------+
| ts | v |
+----------------------------+---------+
| 1970-01-01 00:00:00.001000 | [1,2,3] |
| 1970-01-01 00:00:00.002000 | [1,2,3] |
| 1970-01-01 00:00:00.003000 | [1,2,3] |
+----------------------------+---------+
-- SQLNESS PROTOCOL POSTGRES
SELECT * FROM t2;
+----------------------------+-----------+
| ts | v |
+----------------------------+-----------+
| 1970-01-01 00:00:00.001000 | "[1,2,3]" |
| 1970-01-01 00:00:00.002000 | "[1,2,3]" |
| 1970-01-01 00:00:00.003000 | "[1,2,3]" |
+----------------------------+-----------+
DROP TABLE t;
Affected Rows: 0
DROP TABLE t2;
Affected Rows: 0

View File

@@ -0,0 +1,41 @@
CREATE TABLE t (ts TIMESTAMP TIME INDEX, v VECTOR(3));
INSERT INTO t VALUES
(1, "[1.0, 2.0, 3.0]"),
(2, "[4.0, 5.0, 6.0]"),
(3, "[7.0, 8.0, 9.0]");
-- SQLNESS PROTOCOL MYSQL
SELECT * FROM t;
-- SQLNESS PROTOCOL POSTGRES
SELECT * FROM t;
-- Unexpected dimension --
INSERT INTO t VALUES
(4, "[1.0]");
-- Invalid vector value --
INSERT INTO t VALUES
(5, "1.0,2.0,3.0");
-- Invalid vector value --
INSERT INTO t VALUES
(6, "[30h, 40s, 50m]");
CREATE TABLE t2 (ts TIMESTAMP TIME INDEX, v VECTOR(3) DEFAULT '[1.0, 2.0, 3.0]');
INSERT INTO t2 (ts) VALUES
(1),
(2),
(3);
-- SQLNESS PROTOCOL MYSQL
SELECT * FROM t2;
-- SQLNESS PROTOCOL POSTGRES
SELECT * FROM t2;
DROP TABLE t;
DROP TABLE t2;