From beb07fc8959fab3c37be19cd449f27c8f1b2fc1c Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Mon, 5 Dec 2022 19:59:23 +0800 Subject: [PATCH] feat: new datatypes subcrate based on the official arrow (#705) * feat: Init datatypes2 crate * chore: Remove some unimplemented types * feat: Implements PrimitiveType and PrimitiveVector for datatypes2 (#633) * feat: Implement primitive types and vectors * feat: Implement a wrapper type * feat: Remove VectorType from ScalarRef * feat: Move some trait bound from NativeType to WrapperType * feat: pub use primitive vectors and builders * feat: Returns error in try_from when type mismatch * feat: Impl PartialEq for some vectors * test: Pass vector tests * chore: Add license header * test: Pass more vector tests * feat: Implement some methods of vector Helper * test: Pass more tests * style: Fix clippy * chore: Add license header * feat: Remove IntoValueRef trait * feat: Add NativeType trait bound to WrapperType::Native * docs: Explain what is wrapper type * chore: Fix typos * refactor: LogicalPrimitiveType::type_name returns str * feat: Implements DateType and DateVector (#651) * feat: Implement DateType and DateVector * test: Pass more value and data type tests * chore: Address CR comments * test: Skip list value test * feat: datatypes2 datetime (#661) * feat: impl DateTime type and vector * fix: add license header * fix: CR comments and add more tests * fix: customized serialization for wrapper type * feat: Implements NullType and NullVector (#658) * feat: Implements NullType and NullVector * chore: Address CR comment Co-authored-by: Ruihang Xia * chore: Address CR comment Co-authored-by: Ruihang Xia * feat: Implements StringType and StringVector (#659) * feat: implement string vector Signed-off-by: Ruihang Xia * add more test and from Signed-off-by: Ruihang Xia * fix clippy Signed-off-by: Ruihang Xia * cover NUL Signed-off-by: Ruihang Xia Signed-off-by: Ruihang Xia * feat: impl datatypes2/timestamp (#686) * feat: add 
timestamp datatype and vectors * fix: cr comments and reformat code * chore: add some tests * feat: Implements ListType and ListVector (#681) * feat: Implement ListType and ListVector * test: Pass more tests * style: Fix clippy * chore: Fix comment * chore: Address CR comments * feat: impl constant vector (#680) * feat: impl constant vector Signed-off-by: Ruihang Xia * fix tests Signed-off-by: Ruihang Xia * Apply suggestions from code review Co-authored-by: Yingwen * rename fn names Signed-off-by: Ruihang Xia * remove println Signed-off-by: Ruihang Xia Signed-off-by: Ruihang Xia Co-authored-by: Yingwen * feat: Implements Validity (#684) * feat: Implements Validity * chore: remove pub from sub mod in vectors * feat: Implements schema for datatypes2 (#695) * feat: Add is_timestamp_compatible to DataType * feat: Implement ColumnSchema and Schema * feat: Impl RawSchema * chore: Remove useless codes and run more tests * chore: Fix clippy * feat: Impl from_arrow_time_unit and pass schema tests * chore: add more tests for timestamp (#702) * chore: add more tests for timestamp * chore: add replicate test for timestamps * feat: Implements helper methods for vectors/values (#703) * feat: Implement helper methods for vectors/values * chore: Address CR comments * chore: add more test for timestamp Signed-off-by: Ruihang Xia Co-authored-by: evenyag Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com> Co-authored-by: Lei, HUANG --- Cargo.lock | 248 +++- Cargo.toml | 1 + src/common/recordbatch/src/recordbatch.rs | 2 + src/common/time/src/timestamp.rs | 48 + src/datatypes2/Cargo.toml | 24 + src/datatypes2/src/arrow_array.rs | 242 ++++ src/datatypes2/src/data_type.rs | 486 +++++++ src/datatypes2/src/error.rs | 144 ++ src/datatypes2/src/lib.rs | 33 + src/datatypes2/src/macros.rs | 68 + src/datatypes2/src/prelude.rs | 20 + src/datatypes2/src/scalars.rs | 443 ++++++ src/datatypes2/src/schema.rs | 430 ++++++ src/datatypes2/src/schema/column_schema.rs | 305 ++++ 
src/datatypes2/src/schema/constraint.rs | 306 ++++ src/datatypes2/src/schema/raw.rs | 77 + src/datatypes2/src/serialize.rs | 20 + src/datatypes2/src/timestamp.rs | 135 ++ src/datatypes2/src/type_id.rs | 93 ++ src/datatypes2/src/types.rs | 37 + src/datatypes2/src/types/binary_type.rs | 60 + src/datatypes2/src/types/boolean_type.rs | 59 + src/datatypes2/src/types/date_type.rs | 90 ++ src/datatypes2/src/types/datetime_type.rs | 91 ++ src/datatypes2/src/types/list_type.rs | 95 ++ src/datatypes2/src/types/null_type.rs | 58 + src/datatypes2/src/types/primitive_type.rs | 358 +++++ src/datatypes2/src/types/string_type.rs | 60 + src/datatypes2/src/types/timestamp_type.rs | 140 ++ src/datatypes2/src/value.rs | 1275 +++++++++++++++++ src/datatypes2/src/vectors.rs | 309 ++++ src/datatypes2/src/vectors/binary.rs | 353 +++++ src/datatypes2/src/vectors/boolean.rs | 371 +++++ src/datatypes2/src/vectors/constant.rs | 218 +++ src/datatypes2/src/vectors/date.rs | 103 ++ src/datatypes2/src/vectors/datetime.rs | 116 ++ src/datatypes2/src/vectors/eq.rs | 228 +++ src/datatypes2/src/vectors/helper.rs | 431 ++++++ src/datatypes2/src/vectors/list.rs | 747 ++++++++++ src/datatypes2/src/vectors/null.rs | 282 ++++ src/datatypes2/src/vectors/operations.rs | 127 ++ .../src/vectors/operations/filter.rs | 145 ++ .../src/vectors/operations/find_unique.rs | 367 +++++ .../src/vectors/operations/replicate.rs | 170 +++ src/datatypes2/src/vectors/primitive.rs | 552 +++++++ src/datatypes2/src/vectors/string.rs | 370 +++++ src/datatypes2/src/vectors/timestamp.rs | 31 + src/datatypes2/src/vectors/validity.rs | 159 ++ 48 files changed, 10493 insertions(+), 34 deletions(-) create mode 100644 src/datatypes2/Cargo.toml create mode 100644 src/datatypes2/src/arrow_array.rs create mode 100644 src/datatypes2/src/data_type.rs create mode 100644 src/datatypes2/src/error.rs create mode 100644 src/datatypes2/src/lib.rs create mode 100644 src/datatypes2/src/macros.rs create mode 100644 src/datatypes2/src/prelude.rs 
create mode 100644 src/datatypes2/src/scalars.rs create mode 100644 src/datatypes2/src/schema.rs create mode 100644 src/datatypes2/src/schema/column_schema.rs create mode 100644 src/datatypes2/src/schema/constraint.rs create mode 100644 src/datatypes2/src/schema/raw.rs create mode 100644 src/datatypes2/src/serialize.rs create mode 100644 src/datatypes2/src/timestamp.rs create mode 100644 src/datatypes2/src/type_id.rs create mode 100644 src/datatypes2/src/types.rs create mode 100644 src/datatypes2/src/types/binary_type.rs create mode 100644 src/datatypes2/src/types/boolean_type.rs create mode 100644 src/datatypes2/src/types/date_type.rs create mode 100644 src/datatypes2/src/types/datetime_type.rs create mode 100644 src/datatypes2/src/types/list_type.rs create mode 100644 src/datatypes2/src/types/null_type.rs create mode 100644 src/datatypes2/src/types/primitive_type.rs create mode 100644 src/datatypes2/src/types/string_type.rs create mode 100644 src/datatypes2/src/types/timestamp_type.rs create mode 100644 src/datatypes2/src/value.rs create mode 100644 src/datatypes2/src/vectors.rs create mode 100644 src/datatypes2/src/vectors/binary.rs create mode 100644 src/datatypes2/src/vectors/boolean.rs create mode 100644 src/datatypes2/src/vectors/constant.rs create mode 100644 src/datatypes2/src/vectors/date.rs create mode 100644 src/datatypes2/src/vectors/datetime.rs create mode 100644 src/datatypes2/src/vectors/eq.rs create mode 100644 src/datatypes2/src/vectors/helper.rs create mode 100644 src/datatypes2/src/vectors/list.rs create mode 100644 src/datatypes2/src/vectors/null.rs create mode 100644 src/datatypes2/src/vectors/operations.rs create mode 100644 src/datatypes2/src/vectors/operations/filter.rs create mode 100644 src/datatypes2/src/vectors/operations/find_unique.rs create mode 100644 src/datatypes2/src/vectors/operations/replicate.rs create mode 100644 src/datatypes2/src/vectors/primitive.rs create mode 100644 src/datatypes2/src/vectors/string.rs create mode 100644 
src/datatypes2/src/vectors/timestamp.rs create mode 100644 src/datatypes2/src/vectors/validity.rs diff --git a/Cargo.lock b/Cargo.lock index 2badba5f8c..a6f9216b6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,6 +40,19 @@ dependencies = [ "version_check", ] +[[package]] +name = "ahash" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.2.7", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "0.7.19" @@ -182,8 +195,8 @@ dependencies = [ "bitflags", "chrono", "csv", - "flatbuffers", - "half", + "flatbuffers 2.1.1", + "half 1.8.2", "hex", "indexmap", "lazy_static", @@ -197,6 +210,72 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e24e2bcd431a4aa0ff003fdd2dc21c78cfb42f31459c89d2312c2746fe17a5ac" +dependencies = [ + "ahash 0.8.2", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "bitflags", + "chrono", + "csv", + "flatbuffers 22.9.29", + "half 2.1.0", + "hashbrown", + "indexmap", + "lazy_static", + "lexical-core", + "multiversion", + "num", + "regex", + "regex-syntax", + "serde_json", +] + +[[package]] +name = "arrow-array" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9044300874385f19e77cbf90911e239bd23630d8f23bb0f948f9067998a13b7" +dependencies = [ + "ahash 0.8.2", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half 2.1.0", + "hashbrown", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78476cbe9e3f808dcecab86afe42d573863c63e149c62e6e379ed2522743e626" +dependencies = [ + "half 2.1.0", + "num", +] + +[[package]] +name = "arrow-data" 
+version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d916feee158c485dad4f701cba31bc9a90a8db87d9df8e2aa8adc0c20a2bbb9" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half 2.1.0", + "num", +] + [[package]] name = "arrow-format" version = "0.4.0" @@ -207,13 +286,32 @@ dependencies = [ "serde", ] +[[package]] +name = "arrow-schema" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9406eb7834ca6bd8350d1baa515d18b9fcec487eddacfb62f5e19511f7bd37" + +[[package]] +name = "arrow-select" +version = "26.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6593a01586751c74498495d2f5a01fcd438102b52965c11dd98abf4ebcacef37" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + [[package]] name = "arrow2" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e387b20dd573a96f36b173d9027483898f944d696521afd74e2caa3c813d86e" dependencies = [ - "ahash", + "ahash 0.7.6", "arrow-format", "base64", "bytemuck", @@ -551,7 +649,7 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" name = "benchmarks" version = "0.1.0" dependencies = [ - "arrow", + "arrow 10.0.0", "clap 4.0.18", "client", "indicatif", @@ -961,7 +1059,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" dependencies = [ "ciborium-io", - "half", + "half 1.8.2", ] [[package]] @@ -1207,7 +1305,7 @@ dependencies = [ "common-function-macro", "common-query", "common-time", - "datafusion-common", + "datafusion-common 7.0.0", "datatypes", "libc", "num", @@ -1283,7 +1381,7 @@ dependencies = [ "common-recordbatch", "common-time", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-expr", "datatypes", "snafu", @@ -1297,7 +1395,7 @@ version = "0.1.0" dependencies = [ 
"common-error", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datatypes", "futures", "paste", @@ -1412,6 +1510,28 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "const-random" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +dependencies = [ + "getrandom 0.2.7", + "once_cell", + "proc-macro-hack", + "tiny-keccak", +] + [[package]] name = "constant_time_eq" version = "0.1.5" @@ -1724,12 +1844,12 @@ name = "datafusion" version = "7.0.0" source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" dependencies = [ - "ahash", + "ahash 0.7.6", "arrow2", "async-trait", "chrono", "comfy-table 5.0.1", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-expr", "datafusion-physical-expr", "futures", @@ -1744,7 +1864,7 @@ dependencies = [ "pin-project-lite", "rand 0.8.5", "smallvec", - "sqlparser", + "sqlparser 0.15.0", "tempfile", "tokio", "tokio-stream", @@ -1758,7 +1878,19 @@ dependencies = [ "arrow2", "ordered-float 2.10.0", "parquet2", - "sqlparser", + "sqlparser 0.15.0", +] + +[[package]] +name = "datafusion-common" +version = "14.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f1ffcbc1f040c9ab99f41db1c743d95aff267bb2e7286aaa010738b7402251" +dependencies = [ + "arrow 26.0.0", + "chrono", + "ordered-float 3.1.0", + "sqlparser 0.26.0", ] [[package]] @@ -1766,10 +1898,10 @@ name = "datafusion-expr" version = "7.0.0" source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" dependencies = 
[ - "ahash", + "ahash 0.7.6", "arrow2", - "datafusion-common", - "sqlparser", + "datafusion-common 7.0.0", + "sqlparser 0.15.0", ] [[package]] @@ -1777,12 +1909,12 @@ name = "datafusion-physical-expr" version = "7.0.0" source = "git+https://github.com/apache/arrow-datafusion.git?branch=arrow2#744b2626081db95a254fc882820fc7812f95aa51" dependencies = [ - "ahash", + "ahash 0.7.6", "arrow2", "blake2", "blake3", "chrono", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-expr", "hashbrown", "lazy_static", @@ -1818,7 +1950,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datatypes", "frontend", "futures", @@ -1857,7 +1989,26 @@ dependencies = [ "common-base", "common-error", "common-time", - "datafusion-common", + "datafusion-common 7.0.0", + "enum_dispatch", + "num", + "num-traits", + "ordered-float 3.1.0", + "paste", + "serde", + "serde_json", + "snafu", +] + +[[package]] +name = "datatypes2" +version = "0.1.0" +dependencies = [ + "arrow 26.0.0", + "common-base", + "common-error", + "common-time", + "datafusion-common 14.0.0", "enum_dispatch", "num", "num-traits", @@ -2159,6 +2310,16 @@ dependencies = [ "thiserror", ] +[[package]] +name = "flatbuffers" +version = "22.9.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce016b9901aef3579617931fbb2df8fc9a9f7cb95a16eb8acc8148209bb9e70" +dependencies = [ + "bitflags", + "thiserror", +] + [[package]] name = "flate2" version = "1.0.24" @@ -2215,7 +2376,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-expr", "datanode", "datatypes", @@ -2235,7 +2396,7 @@ dependencies = [ "session", "snafu", "sql", - "sqlparser", + "sqlparser 0.15.0", "store-api", "table", "tempdir", @@ -2517,6 +2678,16 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "half" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad6a9459c9c30b177b925162351f97e7d967c7ea8bab3b8352805327daf45554" +dependencies = [ + "crunchy", + "num-traits", +] + [[package]] name = "hash_hasher" version = "2.0.3" @@ -2529,7 +2700,7 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash", + "ahash 0.7.6", ] [[package]] @@ -3218,7 +3389,7 @@ version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b9b8653cec6897f73b519a43fba5ee3d50f62fe9af80b428accdcc093b4a849" dependencies = [ - "ahash", + "ahash 0.7.6", "metrics-macros", "portable-atomic", ] @@ -3324,7 +3495,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datatypes", "futures", "log-store", @@ -3884,7 +4055,7 @@ version = "10.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53e9c8fc20af9b92d85d42ec86e5217b2eaf1340fbba75c4b4296de764ea7921" dependencies = [ - "arrow", + "arrow 10.0.0", "base64", "brotli", "byteorder", @@ -4504,7 +4675,7 @@ dependencies = [ "common-telemetry", "common-time", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-physical-expr", "datatypes", "format_num", @@ -5035,7 +5206,7 @@ name = "rustpython-compiler-core" version = "0.1.2" source = "git+https://github.com/RustPython/RustPython?rev=02a1d1d#02a1d1d7db57afbb78049599c2585cc7cd59e6d3" dependencies = [ - "ahash", + "ahash 0.7.6", "indexmap", "itertools", "log", @@ -5077,7 +5248,7 @@ name = "rustpython-parser" version = "0.1.2" source = "git+https://github.com/RustPython/RustPython?rev=02a1d1d#02a1d1d7db57afbb78049599c2585cc7cd59e6d3" dependencies = [ - "ahash", + "ahash 0.7.6", 
"lalrpop-util", "log", "num-bigint", @@ -5106,7 +5277,7 @@ version = "0.1.2" source = "git+https://github.com/RustPython/RustPython?rev=02a1d1d#02a1d1d7db57afbb78049599c2585cc7cd59e6d3" dependencies = [ "adler32", - "ahash", + "ahash 0.7.6", "ascii", "atty", "bitflags", @@ -5118,7 +5289,7 @@ dependencies = [ "exitcode", "flate2", "getrandom 0.2.7", - "half", + "half 1.8.2", "hex", "hexf-parse", "indexmap", @@ -5343,7 +5514,7 @@ dependencies = [ "common-time", "console", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-expr", "datafusion-physical-expr", "datatypes", @@ -5428,7 +5599,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" dependencies = [ - "half", + "half 1.8.2", "serde", ] @@ -5775,7 +5946,7 @@ dependencies = [ "mito", "once_cell", "snafu", - "sqlparser", + "sqlparser 0.15.0", ] [[package]] @@ -5813,6 +5984,15 @@ dependencies = [ "log", ] +[[package]] +name = "sqlparser" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86be66ea0b2b22749cfa157d16e2e84bf793e626a3375f4d378dc289fa03affb" +dependencies = [ + "log", +] + [[package]] name = "sre-engine" version = "0.1.2" @@ -6118,7 +6298,7 @@ dependencies = [ "common-recordbatch", "common-telemetry", "datafusion", - "datafusion-common", + "datafusion-common 7.0.0", "datafusion-expr", "datatypes", "derive_builder", diff --git a/Cargo.toml b/Cargo.toml index d49c91a266..512f090648 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ members = [ "src/common/time", "src/datanode", "src/datatypes", + "src/datatypes2", "src/frontend", "src/log-store", "src/meta-client", diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index b768a2f0bc..5fc886f8b9 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -23,6 +23,7 @@ use 
snafu::ResultExt; use crate::error::{self, Result}; +// TODO(yingwen): We should hold vectors in the RecordBatch. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { pub schema: SchemaRef, @@ -103,6 +104,7 @@ impl<'a> Iterator for RecordBatchRowIterator<'a> { } else { let mut row = Vec::with_capacity(self.columns); + // TODO(yingwen): Get from the vector if RecordBatch also holds vectors. for col in 0..self.columns { let column_array = self.record_batch.df_recordbatch.column(col); match arrow_array_get(column_array.as_ref(), self.row_cursor) diff --git a/src/common/time/src/timestamp.rs b/src/common/time/src/timestamp.rs index fd0f148d96..5ff20f702b 100644 --- a/src/common/time/src/timestamp.rs +++ b/src/common/time/src/timestamp.rs @@ -147,6 +147,18 @@ impl From for Timestamp { } } +impl From for i64 { + fn from(t: Timestamp) -> Self { + t.value + } +} + +impl From for serde_json::Value { + fn from(d: Timestamp) -> Self { + serde_json::Value::String(d.to_iso8601_string()) + } +} + #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum TimeUnit { Second, @@ -197,6 +209,7 @@ impl Hash for Timestamp { #[cfg(test)] mod tests { use chrono::Offset; + use serde_json::Value; use super::*; @@ -318,4 +331,39 @@ mod tests { let ts = Timestamp::from_millis(ts_millis); assert_eq!("1969-12-31 23:59:58.999+0000", ts.to_iso8601_string()); } + + #[test] + fn test_serialize_to_json_value() { + assert_eq!( + "1970-01-01 00:00:01+0000", + match serde_json::Value::from(Timestamp::new(1, TimeUnit::Second)) { + Value::String(s) => s, + _ => unreachable!(), + } + ); + + assert_eq!( + "1970-01-01 00:00:00.001+0000", + match serde_json::Value::from(Timestamp::new(1, TimeUnit::Millisecond)) { + Value::String(s) => s, + _ => unreachable!(), + } + ); + + assert_eq!( + "1970-01-01 00:00:00.000001+0000", + match serde_json::Value::from(Timestamp::new(1, TimeUnit::Microsecond)) { + Value::String(s) => s, + _ => unreachable!(), + } + ); + + assert_eq!( + 
"1970-01-01 00:00:00.000000001+0000", + match serde_json::Value::from(Timestamp::new(1, TimeUnit::Nanosecond)) { + Value::String(s) => s, + _ => unreachable!(), + } + ); + } } diff --git a/src/datatypes2/Cargo.toml b/src/datatypes2/Cargo.toml new file mode 100644 index 0000000000..34941606d4 --- /dev/null +++ b/src/datatypes2/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "datatypes2" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[features] +default = [] +test = [] + +[dependencies] +common-base = { path = "../common/base" } +common-error = { path = "../common/error" } +common-time = { path = "../common/time" } +datafusion-common = "14.0" +enum_dispatch = "0.3" +num = "0.4" +num-traits = "0.2" +ordered-float = { version = "3.0", features = ["serde"] } +paste = "1.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +snafu = { version = "0.7", features = ["backtraces"] } +arrow = "26.0" diff --git a/src/datatypes2/src/arrow_array.rs b/src/datatypes2/src/arrow_array.rs new file mode 100644 index 0000000000..7405c8a665 --- /dev/null +++ b/src/datatypes2/src/arrow_array.rs @@ -0,0 +1,242 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use arrow::array::{ + Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, +}; +use arrow::datatypes::DataType; +use common_time::timestamp::TimeUnit; +use common_time::Timestamp; +use snafu::OptionExt; + +use crate::data_type::ConcreteDataType; +use crate::error::{ConversionSnafu, Result}; +use crate::value::{ListValue, Value}; + +pub type BinaryArray = arrow::array::LargeBinaryArray; +pub type MutableBinaryArray = arrow::array::LargeBinaryBuilder; +pub type StringArray = arrow::array::StringArray; +pub type MutableStringArray = arrow::array::StringBuilder; + +macro_rules! cast_array { + ($arr: ident, $CastType: ty) => { + $arr.as_any() + .downcast_ref::<$CastType>() + .with_context(|| ConversionSnafu { + from: format!("{:?}", $arr.data_type()), + })? + }; +} + +// TODO(yingwen): Remove this function. +pub fn arrow_array_get(array: &dyn Array, idx: usize) -> Result { + if array.is_null(idx) { + return Ok(Value::Null); + } + + let result = match array.data_type() { + DataType::Null => Value::Null, + DataType::Boolean => Value::Boolean(cast_array!(array, BooleanArray).value(idx)), + DataType::Binary => Value::Binary(cast_array!(array, BinaryArray).value(idx).into()), + DataType::Int8 => Value::Int8(cast_array!(array, Int8Array).value(idx)), + DataType::Int16 => Value::Int16(cast_array!(array, Int16Array).value(idx)), + DataType::Int32 => Value::Int32(cast_array!(array, Int32Array).value(idx)), + DataType::Int64 => Value::Int64(cast_array!(array, Int64Array).value(idx)), + DataType::UInt8 => Value::UInt8(cast_array!(array, UInt8Array).value(idx)), + DataType::UInt16 => Value::UInt16(cast_array!(array, UInt16Array).value(idx)), + DataType::UInt32 => Value::UInt32(cast_array!(array, UInt32Array).value(idx)), + DataType::UInt64 => Value::UInt64(cast_array!(array, UInt64Array).value(idx)), + DataType::Float32 => 
Value::Float32(cast_array!(array, Float32Array).value(idx).into()), + DataType::Float64 => Value::Float64(cast_array!(array, Float64Array).value(idx).into()), + DataType::Utf8 => Value::String(cast_array!(array, StringArray).value(idx).into()), + DataType::Date32 => Value::Date(cast_array!(array, Date32Array).value(idx).into()), + DataType::Date64 => Value::DateTime(cast_array!(array, Date64Array).value(idx).into()), + DataType::Timestamp(t, _) => match t { + arrow::datatypes::TimeUnit::Second => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampSecondArray).value(idx), + TimeUnit::Second, + )), + arrow::datatypes::TimeUnit::Millisecond => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampMillisecondArray).value(idx), + TimeUnit::Millisecond, + )), + arrow::datatypes::TimeUnit::Microsecond => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampMicrosecondArray).value(idx), + TimeUnit::Microsecond, + )), + arrow::datatypes::TimeUnit::Nanosecond => Value::Timestamp(Timestamp::new( + cast_array!(array, arrow::array::TimestampNanosecondArray).value(idx), + TimeUnit::Nanosecond, + )), + }, + DataType::List(_) => { + let array = cast_array!(array, ListArray).value(idx); + let item_type = ConcreteDataType::try_from(array.data_type())?; + let values = (0..array.len()) + .map(|i| arrow_array_get(&*array, i)) + .collect::>>()?; + Value::List(ListValue::new(Some(Box::new(values)), item_type)) + } + _ => unimplemented!("Arrow array datatype: {:?}", array.data_type()), + }; + + Ok(result) +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::array::{ + BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + LargeBinaryArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, + }; + use arrow::datatypes::Int32Type; + use 
common_time::timestamp::{TimeUnit, Timestamp}; + use paste::paste; + + use super::*; + use crate::data_type::ConcreteDataType; + use crate::types::TimestampType; + + macro_rules! test_arrow_array_get_for_timestamps { + ( $($unit: ident), *) => { + $( + paste! { + let mut builder = arrow::array::[]::builder(3); + builder.append_value(1); + builder.append_value(0); + builder.append_value(-1); + let ts_array = Arc::new(builder.finish()) as Arc; + let v = arrow_array_get(&ts_array, 1).unwrap(); + assert_eq!( + ConcreteDataType::Timestamp(TimestampType::$unit( + $crate::types::[]::default(), + )), + v.data_type() + ); + } + )* + }; + } + + #[test] + fn test_timestamp_array() { + test_arrow_array_get_for_timestamps![Second, Millisecond, Microsecond, Nanosecond]; + } + + #[test] + fn test_arrow_array_access() { + let array1 = BooleanArray::from(vec![true, true, false, false]); + assert_eq!(Value::Boolean(true), arrow_array_get(&array1, 1).unwrap()); + let array1 = Int8Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int8(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = UInt8Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt8(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = Int16Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int16(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = UInt16Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt16(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = Int32Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int32(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = UInt32Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt32(2), arrow_array_get(&array1, 1).unwrap()); + let array = Int64Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::Int64(2), arrow_array_get(&array, 1).unwrap()); + let array1 = UInt64Array::from(vec![1, 2, 3, 4]); + assert_eq!(Value::UInt64(2), arrow_array_get(&array1, 1).unwrap()); + let array1 = Float32Array::from(vec![1f32, 2f32, 3f32, 4f32]); + assert_eq!( + 
Value::Float32(2f32.into()), + arrow_array_get(&array1, 1).unwrap() + ); + let array1 = Float64Array::from(vec![1f64, 2f64, 3f64, 4f64]); + assert_eq!( + Value::Float64(2f64.into()), + arrow_array_get(&array1, 1).unwrap() + ); + + let array2 = StringArray::from(vec![Some("hello"), None, Some("world")]); + assert_eq!( + Value::String("hello".into()), + arrow_array_get(&array2, 0).unwrap() + ); + assert_eq!(Value::Null, arrow_array_get(&array2, 1).unwrap()); + + let array3 = LargeBinaryArray::from(vec![ + Some("hello".as_bytes()), + None, + Some("world".as_bytes()), + ]); + assert_eq!(Value::Null, arrow_array_get(&array3, 1).unwrap()); + + let array = TimestampSecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!(value, Value::Timestamp(Timestamp::new(2, TimeUnit::Second))); + let array = TimestampMillisecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!( + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Millisecond)) + ); + let array = TimestampMicrosecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!( + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Microsecond)) + ); + let array = TimestampNanosecondArray::from(vec![1, 2, 3]); + let value = arrow_array_get(&array, 1).unwrap(); + assert_eq!( + value, + Value::Timestamp(Timestamp::new(2, TimeUnit::Nanosecond)) + ); + + // test list array + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + let arrow_array = ListArray::from_iter_primitive::(data); + + let v0 = arrow_array_get(&arrow_array, 0).unwrap(); + match v0 { + Value::List(list) => { + assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); + let items = list.items().as_ref().unwrap(); + assert_eq!( + **items, + vec![Value::Int32(1), Value::Int32(2), Value::Int32(3)] + ); + } + _ => unreachable!(), + } + + assert_eq!(Value::Null, 
arrow_array_get(&arrow_array, 1).unwrap()); + let v2 = arrow_array_get(&arrow_array, 2).unwrap(); + match v2 { + Value::List(list) => { + assert!(matches!(list.datatype(), ConcreteDataType::Int32(_))); + let items = list.items().as_ref().unwrap(); + assert_eq!(**items, vec![Value::Int32(4), Value::Null, Value::Int32(6)]); + } + _ => unreachable!(), + } + } +} diff --git a/src/datatypes2/src/data_type.rs b/src/datatypes2/src/data_type.rs new file mode 100644 index 0000000000..0d06d566b6 --- /dev/null +++ b/src/datatypes2/src/data_type.rs @@ -0,0 +1,486 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; +use common_time::timestamp::TimeUnit; +use paste::paste; +use serde::{Deserialize, Serialize}; + +use crate::error::{self, Error, Result}; +use crate::type_id::LogicalTypeId; +use crate::types::{ + BinaryType, BooleanType, DateTimeType, DateType, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, ListType, NullType, StringType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use crate::value::Value; +use crate::vectors::MutableVector; + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[enum_dispatch::enum_dispatch(DataType)] +pub enum ConcreteDataType { + Null(NullType), + Boolean(BooleanType), + + // Numeric types: + Int8(Int8Type), + Int16(Int16Type), + Int32(Int32Type), + Int64(Int64Type), + UInt8(UInt8Type), + UInt16(UInt16Type), + UInt32(UInt32Type), + UInt64(UInt64Type), + Float32(Float32Type), + Float64(Float64Type), + + // String types: + Binary(BinaryType), + String(StringType), + + // Date types: + Date(DateType), + DateTime(DateTimeType), + Timestamp(TimestampType), + + // Compound types: + List(ListType), +} + +// TODO(yingwen): Refactor these `is_xxx()` methods, such as adding a `properties()` method +// returning all these properties to the `DataType` trait +impl ConcreteDataType { + pub fn is_float(&self) -> bool { + matches!( + self, + ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) + ) + } + + pub fn is_boolean(&self) -> bool { + matches!(self, ConcreteDataType::Boolean(_)) + } + + pub fn is_stringifiable(&self) -> bool { + matches!( + self, + ConcreteDataType::String(_) + | ConcreteDataType::Date(_) + | ConcreteDataType::DateTime(_) + | ConcreteDataType::Timestamp(_) + ) + } + + pub fn is_signed(&self) -> bool { + matches!( + self, + ConcreteDataType::Int8(_) + | 
ConcreteDataType::Int16(_) + | ConcreteDataType::Int32(_) + | ConcreteDataType::Int64(_) + | ConcreteDataType::Date(_) + | ConcreteDataType::DateTime(_) + | ConcreteDataType::Timestamp(_) + ) + } + + pub fn is_unsigned(&self) -> bool { + matches!( + self, + ConcreteDataType::UInt8(_) + | ConcreteDataType::UInt16(_) + | ConcreteDataType::UInt32(_) + | ConcreteDataType::UInt64(_) + ) + } + + pub fn numerics() -> Vec { + vec![ + ConcreteDataType::int8_datatype(), + ConcreteDataType::int16_datatype(), + ConcreteDataType::int32_datatype(), + ConcreteDataType::int64_datatype(), + ConcreteDataType::uint8_datatype(), + ConcreteDataType::uint16_datatype(), + ConcreteDataType::uint32_datatype(), + ConcreteDataType::uint64_datatype(), + ConcreteDataType::float32_datatype(), + ConcreteDataType::float64_datatype(), + ] + } + + /// Convert arrow data type to [ConcreteDataType]. + /// + /// # Panics + /// Panic if given arrow data type is not supported. + pub fn from_arrow_type(dt: &ArrowDataType) -> Self { + ConcreteDataType::try_from(dt).expect("Unimplemented type") + } + + pub fn is_null(&self) -> bool { + matches!(self, ConcreteDataType::Null(NullType)) + } +} + +impl TryFrom<&ArrowDataType> for ConcreteDataType { + type Error = Error; + + fn try_from(dt: &ArrowDataType) -> Result { + let concrete_type = match dt { + ArrowDataType::Null => Self::null_datatype(), + ArrowDataType::Boolean => Self::boolean_datatype(), + ArrowDataType::UInt8 => Self::uint8_datatype(), + ArrowDataType::UInt16 => Self::uint16_datatype(), + ArrowDataType::UInt32 => Self::uint32_datatype(), + ArrowDataType::UInt64 => Self::uint64_datatype(), + ArrowDataType::Int8 => Self::int8_datatype(), + ArrowDataType::Int16 => Self::int16_datatype(), + ArrowDataType::Int32 => Self::int32_datatype(), + ArrowDataType::Int64 => Self::int64_datatype(), + ArrowDataType::Float32 => Self::float32_datatype(), + ArrowDataType::Float64 => Self::float64_datatype(), + ArrowDataType::Date32 => Self::date_datatype(), + 
ArrowDataType::Date64 => Self::datetime_datatype(), + ArrowDataType::Timestamp(u, _) => ConcreteDataType::from_arrow_time_unit(u), + ArrowDataType::Binary | ArrowDataType::LargeBinary => Self::binary_datatype(), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => Self::string_datatype(), + ArrowDataType::List(field) => Self::List(ListType::new( + ConcreteDataType::try_from(field.data_type())?, + )), + _ => { + return error::UnsupportedArrowTypeSnafu { + arrow_type: dt.clone(), + } + .fail() + } + }; + + Ok(concrete_type) + } +} + +macro_rules! impl_new_concrete_type_functions { + ($($Type: ident), +) => { + paste! { + impl ConcreteDataType { + $( + pub fn [<$Type:lower _datatype>]() -> ConcreteDataType { + ConcreteDataType::$Type([<$Type Type>]::default()) + } + )+ + } + } + } +} + +impl_new_concrete_type_functions!( + Null, Boolean, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64, + Binary, Date, DateTime, String +); + +impl ConcreteDataType { + pub fn timestamp_second_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Second(TimestampSecondType::default())) + } + + pub fn timestamp_millisecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Millisecond( + TimestampMillisecondType::default(), + )) + } + + pub fn timestamp_microsecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Microsecond( + TimestampMicrosecondType::default(), + )) + } + + pub fn timestamp_nanosecond_datatype() -> Self { + ConcreteDataType::Timestamp(TimestampType::Nanosecond(TimestampNanosecondType::default())) + } + + pub fn timestamp_datatype(unit: TimeUnit) -> Self { + match unit { + TimeUnit::Second => Self::timestamp_second_datatype(), + TimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + TimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + TimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), + } + } + + /// Converts from arrow timestamp unit to the corresponding timestamp [ConcreteDataType]. + pub fn 
from_arrow_time_unit(t: &ArrowTimeUnit) -> Self { + match t { + ArrowTimeUnit::Second => Self::timestamp_second_datatype(), + ArrowTimeUnit::Millisecond => Self::timestamp_millisecond_datatype(), + ArrowTimeUnit::Microsecond => Self::timestamp_microsecond_datatype(), + ArrowTimeUnit::Nanosecond => Self::timestamp_nanosecond_datatype(), + } + } + + pub fn list_datatype(item_type: ConcreteDataType) -> ConcreteDataType { + ConcreteDataType::List(ListType::new(item_type)) + } +} + +/// Data type abstraction. +#[enum_dispatch::enum_dispatch] +pub trait DataType: std::fmt::Debug + Send + Sync { + /// Name of this data type. + fn name(&self) -> &str; + + /// Returns id of the Logical data type. + fn logical_type_id(&self) -> LogicalTypeId; + + /// Returns the default value of this type. + fn default_value(&self) -> Value; + + /// Convert this type as [arrow::datatypes::DataType]. + fn as_arrow_type(&self) -> ArrowDataType; + + /// Creates a mutable vector with given `capacity` of this type. + fn create_mutable_vector(&self, capacity: usize) -> Box; + + /// Returns true if the data type is compatible with timestamp type so we can + /// use it as a timestamp. 
+ fn is_timestamp_compatible(&self) -> bool; +} + +pub type DataTypeRef = Arc; + +#[cfg(test)] +mod tests { + use arrow::datatypes::Field; + + use super::*; + + #[test] + fn test_concrete_type_as_datatype_trait() { + let concrete_type = ConcreteDataType::boolean_datatype(); + + assert_eq!("Boolean", concrete_type.name()); + assert_eq!(Value::Boolean(false), concrete_type.default_value()); + assert_eq!(LogicalTypeId::Boolean, concrete_type.logical_type_id()); + assert_eq!(ArrowDataType::Boolean, concrete_type.as_arrow_type()); + } + + #[test] + fn test_from_arrow_type() { + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Null), + ConcreteDataType::Null(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Boolean), + ConcreteDataType::Boolean(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Binary), + ConcreteDataType::Binary(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::LargeBinary), + ConcreteDataType::Binary(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Int8), + ConcreteDataType::Int8(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Int16), + ConcreteDataType::Int16(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Int32), + ConcreteDataType::Int32(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Int64), + ConcreteDataType::Int64(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::UInt8), + ConcreteDataType::UInt8(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::UInt16), + ConcreteDataType::UInt16(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::UInt32), + ConcreteDataType::UInt32(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::UInt64), + ConcreteDataType::UInt64(_) + )); + assert!(matches!( + 
ConcreteDataType::from_arrow_type(&ArrowDataType::Float32), + ConcreteDataType::Float32(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Float64), + ConcreteDataType::Float64(_) + )); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Utf8), + ConcreteDataType::String(_) + )); + assert_eq!( + ConcreteDataType::from_arrow_type(&ArrowDataType::List(Box::new(Field::new( + "item", + ArrowDataType::Int32, + true, + )))), + ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())) + ); + assert!(matches!( + ConcreteDataType::from_arrow_type(&ArrowDataType::Date32), + ConcreteDataType::Date(_) + )); + } + + #[test] + fn test_from_arrow_timestamp() { + assert_eq!( + ConcreteDataType::timestamp_millisecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Millisecond) + ); + assert_eq!( + ConcreteDataType::timestamp_microsecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Microsecond) + ); + assert_eq!( + ConcreteDataType::timestamp_nanosecond_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Nanosecond) + ); + assert_eq!( + ConcreteDataType::timestamp_second_datatype(), + ConcreteDataType::from_arrow_time_unit(&ArrowTimeUnit::Second) + ); + } + + #[test] + fn test_is_timestamp_compatible() { + assert!(ConcreteDataType::timestamp_datatype(TimeUnit::Second).is_timestamp_compatible()); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Millisecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond).is_timestamp_compatible() + ); + assert!( + ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond).is_timestamp_compatible() + ); + assert!(ConcreteDataType::timestamp_second_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_timestamp_compatible()); 
+ assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_timestamp_compatible()); + assert!(ConcreteDataType::int64_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::null_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::binary_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::boolean_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::date_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::datetime_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::string_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::int32_datatype().is_timestamp_compatible()); + assert!(!ConcreteDataType::uint64_datatype().is_timestamp_compatible()); + } + + #[test] + fn test_is_null() { + assert!(ConcreteDataType::null_datatype().is_null()); + assert!(!ConcreteDataType::int32_datatype().is_null()); + } + + #[test] + fn test_is_float() { + assert!(!ConcreteDataType::int32_datatype().is_float()); + assert!(ConcreteDataType::float32_datatype().is_float()); + assert!(ConcreteDataType::float64_datatype().is_float()); + } + + #[test] + fn test_is_boolean() { + assert!(!ConcreteDataType::int32_datatype().is_boolean()); + assert!(!ConcreteDataType::float32_datatype().is_boolean()); + assert!(ConcreteDataType::boolean_datatype().is_boolean()); + } + + #[test] + fn test_is_stringifiable() { + assert!(!ConcreteDataType::int32_datatype().is_stringifiable()); + assert!(!ConcreteDataType::float32_datatype().is_stringifiable()); + assert!(ConcreteDataType::string_datatype().is_stringifiable()); + assert!(ConcreteDataType::date_datatype().is_stringifiable()); + assert!(ConcreteDataType::datetime_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_second_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_stringifiable()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_stringifiable()); + 
assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_stringifiable()); + } + + #[test] + fn test_is_signed() { + assert!(ConcreteDataType::int8_datatype().is_signed()); + assert!(ConcreteDataType::int16_datatype().is_signed()); + assert!(ConcreteDataType::int32_datatype().is_signed()); + assert!(ConcreteDataType::int64_datatype().is_signed()); + assert!(ConcreteDataType::date_datatype().is_signed()); + assert!(ConcreteDataType::datetime_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_second_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_millisecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_microsecond_datatype().is_signed()); + assert!(ConcreteDataType::timestamp_nanosecond_datatype().is_signed()); + + assert!(!ConcreteDataType::uint8_datatype().is_signed()); + assert!(!ConcreteDataType::uint16_datatype().is_signed()); + assert!(!ConcreteDataType::uint32_datatype().is_signed()); + assert!(!ConcreteDataType::uint64_datatype().is_signed()); + + assert!(!ConcreteDataType::float32_datatype().is_signed()); + assert!(!ConcreteDataType::float64_datatype().is_signed()); + } + + #[test] + fn test_is_unsigned() { + assert!(!ConcreteDataType::int8_datatype().is_unsigned()); + assert!(!ConcreteDataType::int16_datatype().is_unsigned()); + assert!(!ConcreteDataType::int32_datatype().is_unsigned()); + assert!(!ConcreteDataType::int64_datatype().is_unsigned()); + assert!(!ConcreteDataType::date_datatype().is_unsigned()); + assert!(!ConcreteDataType::datetime_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_second_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_millisecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_microsecond_datatype().is_unsigned()); + assert!(!ConcreteDataType::timestamp_nanosecond_datatype().is_unsigned()); + + assert!(ConcreteDataType::uint8_datatype().is_unsigned()); + assert!(ConcreteDataType::uint16_datatype().is_unsigned()); + 
assert!(ConcreteDataType::uint32_datatype().is_unsigned()); + assert!(ConcreteDataType::uint64_datatype().is_unsigned()); + + assert!(!ConcreteDataType::float32_datatype().is_unsigned()); + assert!(!ConcreteDataType::float64_datatype().is_unsigned()); + } + + #[test] + fn test_numerics() { + let nums = ConcreteDataType::numerics(); + assert_eq!(10, nums.len()); + } +} diff --git a/src/datatypes2/src/error.rs b/src/datatypes2/src/error.rs new file mode 100644 index 0000000000..50b49cf2b4 --- /dev/null +++ b/src/datatypes2/src/error.rs @@ -0,0 +1,144 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; + +use common_error::prelude::{ErrorCompat, ErrorExt, Snafu, StatusCode}; +use snafu::Backtrace; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Failed to serialize data, source: {}", source))] + Serialize { + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to deserialize data, source: {}, json: {}", source, json))] + Deserialize { + source: serde_json::Error, + backtrace: Backtrace, + json: String, + }, + + #[snafu(display("Failed to convert datafusion type: {}", from))] + Conversion { from: String, backtrace: Backtrace }, + + #[snafu(display("Bad array access, Index out of bounds: {}, size: {}", index, size))] + BadArrayAccess { + index: usize, + size: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Unknown vector, {}", msg))] + UnknownVector { msg: String, backtrace: Backtrace }, + + #[snafu(display("Unsupported arrow data type, type: {:?}", arrow_type))] + UnsupportedArrowType { + arrow_type: arrow::datatypes::DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Timestamp column {} not found", name,))] + TimestampNotFound { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to parse version in schema meta, value: {}, source: {}", + value, + source + ))] + ParseSchemaVersion { + value: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid timestamp index: {}", index))] + InvalidTimestampIndex { index: usize, backtrace: Backtrace }, + + #[snafu(display("Duplicate timestamp index, exists: {}, new: {}", exists, new))] + DuplicateTimestampIndex { + exists: usize, + new: usize, + backtrace: Backtrace, + }, + + #[snafu(display("{}", msg))] + CastType { msg: String, backtrace: Backtrace }, + + #[snafu(display("Arrow failed to compute, source: {}", source))] + ArrowCompute { + source: arrow::error::ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display("Unsupported column default 
constraint expression: {}", expr))] + UnsupportedDefaultExpr { expr: String, backtrace: Backtrace }, + + #[snafu(display("Default value should not be null for non null column"))] + NullDefault { backtrace: Backtrace }, + + #[snafu(display("Incompatible default value type, reason: {}", reason))] + DefaultValueType { + reason: String, + backtrace: Backtrace, + }, + + #[snafu(display("Duplicated metadata for {}", key))] + DuplicateMeta { key: String, backtrace: Backtrace }, +} + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + // Inner encoding and decoding error should not be exposed to users. + StatusCode::Internal + } + + fn backtrace_opt(&self) -> Option<&Backtrace> { + ErrorCompat::backtrace(self) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +pub type Result = std::result::Result; + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use snafu::ResultExt; + + use super::*; + + #[test] + pub fn test_error() { + let mut map = HashMap::new(); + map.insert(true, 1); + map.insert(false, 2); + + let result = serde_json::to_string(&map).context(SerializeSnafu); + assert!(result.is_err(), "serialize result is: {:?}", result); + let err = serde_json::to_string(&map) + .context(SerializeSnafu) + .err() + .unwrap(); + assert!(err.backtrace_opt().is_some()); + assert_eq!(StatusCode::Internal, err.status_code()); + } +} diff --git a/src/datatypes2/src/lib.rs b/src/datatypes2/src/lib.rs new file mode 100644 index 0000000000..256d347eac --- /dev/null +++ b/src/datatypes2/src/lib.rs @@ -0,0 +1,33 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#![feature(generic_associated_types)] +#![feature(assert_matches)] + +pub mod arrow_array; +pub mod data_type; +pub mod error; +pub mod macros; +pub mod prelude; +mod scalars; +pub mod schema; +pub mod serialize; +mod timestamp; +pub mod type_id; +pub mod types; +pub mod value; +pub mod vectors; + +pub use arrow; +pub use error::{Error, Result}; diff --git a/src/datatypes2/src/macros.rs b/src/datatypes2/src/macros.rs new file mode 100644 index 0000000000..37c0a42e3f --- /dev/null +++ b/src/datatypes2/src/macros.rs @@ -0,0 +1,68 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Some helper macros for datatypes, copied from databend. + +/// Apply the macro rules to all primitive types. +#[macro_export] +macro_rules! for_all_primitive_types { + ($macro:tt $(, $x:tt)*) => { + $macro! 
{ + [$($x),*], + { i8 }, + { i16 }, + { i32 }, + { i64 }, + { u8 }, + { u16 }, + { u32 }, + { u64 }, + { f32 }, + { f64 } + } + }; +} + +/// Match the logical type and apply `$body` to all primitive types and +/// `nbody` to other types. +#[macro_export] +macro_rules! with_match_primitive_type_id { + ($key_type:expr, | $_:tt $T:ident | $body:tt, $nbody:tt) => {{ + macro_rules! __with_ty__ { + ( $_ $T:ident ) => { + $body + }; + } + + use $crate::type_id::LogicalTypeId; + use $crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, + }; + match $key_type { + LogicalTypeId::Int8 => __with_ty__! { Int8Type }, + LogicalTypeId::Int16 => __with_ty__! { Int16Type }, + LogicalTypeId::Int32 => __with_ty__! { Int32Type }, + LogicalTypeId::Int64 => __with_ty__! { Int64Type }, + LogicalTypeId::UInt8 => __with_ty__! { UInt8Type }, + LogicalTypeId::UInt16 => __with_ty__! { UInt16Type }, + LogicalTypeId::UInt32 => __with_ty__! { UInt32Type }, + LogicalTypeId::UInt64 => __with_ty__! { UInt64Type }, + LogicalTypeId::Float32 => __with_ty__! { Float32Type }, + LogicalTypeId::Float64 => __with_ty__! { Float64Type }, + + _ => $nbody, + } + }}; +} diff --git a/src/datatypes2/src/prelude.rs b/src/datatypes2/src/prelude.rs new file mode 100644 index 0000000000..f6bd298316 --- /dev/null +++ b/src/datatypes2/src/prelude.rs @@ -0,0 +1,20 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +pub use crate::data_type::{ConcreteDataType, DataType, DataTypeRef}; +pub use crate::macros::*; +pub use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; +pub use crate::type_id::LogicalTypeId; +pub use crate::value::{Value, ValueRef}; +pub use crate::vectors::{MutableVector, Validity, Vector, VectorRef}; diff --git a/src/datatypes2/src/scalars.rs b/src/datatypes2/src/scalars.rs new file mode 100644 index 0000000000..327ebaa629 --- /dev/null +++ b/src/datatypes2/src/scalars.rs @@ -0,0 +1,443 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_time::{Date, DateTime}; + +use crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, + UInt64Type, UInt8Type, +}; +use crate::value::{ListValue, ListValueRef, Value}; +use crate::vectors::{ + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, MutableVector, + PrimitiveVector, StringVector, Vector, +}; + +fn get_iter_capacity>(iter: &I) -> usize { + match iter.size_hint() { + (_lower, Some(upper)) => upper, + (0, None) => 1024, + (lower, None) => lower, + } +} + +/// Owned scalar value +/// primitive types, bool, Vec ... 
+pub trait Scalar: 'static + Sized + Default + Any +where + for<'a> Self::VectorType: ScalarVector = Self::RefType<'a>>, +{ + type VectorType: ScalarVector; + type RefType<'a>: ScalarRef<'a, ScalarType = Self> + where + Self: 'a; + /// Get a reference of the current value. + fn as_scalar_ref(&self) -> Self::RefType<'_>; + + /// Upcast GAT type's lifetime. + fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short>; +} + +pub trait ScalarRef<'a>: std::fmt::Debug + Clone + Copy + Send + 'a { + /// The corresponding [`Scalar`] type. + type ScalarType: Scalar = Self>; + + /// Convert the reference into an owned value. + fn to_owned_scalar(&self) -> Self::ScalarType; +} + +/// A sub trait of Vector to add scalar operation support. +// This implementation refers to Databend's [ScalarColumn](https://github.com/datafuselabs/databend/blob/main/common/datavalues/src/scalars/type_.rs) +// and skyzh's [type-exercise-in-rust](https://github.com/skyzh/type-exercise-in-rust). +pub trait ScalarVector: Vector + Send + Sync + Sized + 'static +where + for<'a> Self::OwnedItem: Scalar = Self::RefItem<'a>>, +{ + type OwnedItem: Scalar; + /// The reference item of this vector. + type RefItem<'a>: ScalarRef<'a, ScalarType = Self::OwnedItem> + where + Self: 'a; + + /// Iterator type of this vector. + type Iter<'a>: Iterator>> + where + Self: 'a; + + /// Builder type to build this vector. + type Builder: ScalarVectorBuilder; + + /// Returns the reference to an element at given position. + /// + /// Note: `get()` has bad performance, avoid calling this function inside a loop. + /// + /// # Panics + /// Panics if `idx >= self.len()`. + fn get_data(&self, idx: usize) -> Option>; + + /// Returns iterator of current vector. 
+ fn iter_data(&self) -> Self::Iter<'_>; + + fn from_slice(data: &[Self::RefItem<'_>]) -> Self { + let mut builder = Self::Builder::with_capacity(data.len()); + for item in data { + builder.push(Some(*item)); + } + builder.finish() + } + + fn from_iterator<'a>(it: impl Iterator>) -> Self { + let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it)); + for item in it { + builder.push(Some(item)); + } + builder.finish() + } + + fn from_owned_iterator(it: impl Iterator>) -> Self { + let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it)); + for item in it { + match item { + Some(item) => builder.push(Some(item.as_scalar_ref())), + None => builder.push(None), + } + } + builder.finish() + } + + fn from_vec>(values: Vec) -> Self { + let it = values.into_iter(); + let mut builder = Self::Builder::with_capacity(get_iter_capacity(&it)); + for item in it { + builder.push(Some(item.into().as_scalar_ref())); + } + builder.finish() + } +} + +/// A trait over all vector builders. +pub trait ScalarVectorBuilder: MutableVector { + type VectorType: ScalarVector; + + /// Create a new builder with initial `capacity`. + fn with_capacity(capacity: usize) -> Self; + + /// Push a value into the builder. + fn push(&mut self, value: Option<::RefItem<'_>>); + + /// Finish build and return a new vector. + fn finish(&mut self) -> Self::VectorType; +} + +macro_rules! impl_scalar_for_native { + ($Native: ident, $DataType: ident) => { + impl Scalar for $Native { + type VectorType = PrimitiveVector<$DataType>; + type RefType<'a> = $Native; + + #[inline] + fn as_scalar_ref(&self) -> $Native { + *self + } + + #[allow(clippy::needless_lifetimes)] + #[inline] + fn upcast_gat<'short, 'long: 'short>(long: $Native) -> $Native { + long + } + } + + /// Implement [`ScalarRef`] for primitive types. Note that primitive types are both [`Scalar`] and [`ScalarRef`]. 
+ impl<'a> ScalarRef<'a> for $Native { + type ScalarType = $Native; + + #[inline] + fn to_owned_scalar(&self) -> $Native { + *self + } + } + }; +} + +impl_scalar_for_native!(u8, UInt8Type); +impl_scalar_for_native!(u16, UInt16Type); +impl_scalar_for_native!(u32, UInt32Type); +impl_scalar_for_native!(u64, UInt64Type); +impl_scalar_for_native!(i8, Int8Type); +impl_scalar_for_native!(i16, Int16Type); +impl_scalar_for_native!(i32, Int32Type); +impl_scalar_for_native!(i64, Int64Type); +impl_scalar_for_native!(f32, Float32Type); +impl_scalar_for_native!(f64, Float64Type); + +impl Scalar for bool { + type VectorType = BooleanVector; + type RefType<'a> = bool; + + #[inline] + fn as_scalar_ref(&self) -> bool { + *self + } + + #[allow(clippy::needless_lifetimes)] + #[inline] + fn upcast_gat<'short, 'long: 'short>(long: bool) -> bool { + long + } +} + +impl<'a> ScalarRef<'a> for bool { + type ScalarType = bool; + + #[inline] + fn to_owned_scalar(&self) -> bool { + *self + } +} + +impl Scalar for String { + type VectorType = StringVector; + type RefType<'a> = &'a str; + + #[inline] + fn as_scalar_ref(&self) -> &str { + self + } + + #[inline] + fn upcast_gat<'short, 'long: 'short>(long: &'long str) -> &'short str { + long + } +} + +impl<'a> ScalarRef<'a> for &'a str { + type ScalarType = String; + + #[inline] + fn to_owned_scalar(&self) -> String { + self.to_string() + } +} + +impl Scalar for Vec { + type VectorType = BinaryVector; + type RefType<'a> = &'a [u8]; + + #[inline] + fn as_scalar_ref(&self) -> &[u8] { + self + } + + #[inline] + fn upcast_gat<'short, 'long: 'short>(long: &'long [u8]) -> &'short [u8] { + long + } +} + +impl<'a> ScalarRef<'a> for &'a [u8] { + type ScalarType = Vec; + + #[inline] + fn to_owned_scalar(&self) -> Vec { + self.to_vec() + } +} + +impl Scalar for Date { + type VectorType = DateVector; + type RefType<'a> = Date; + + fn as_scalar_ref(&self) -> Self::RefType<'_> { + *self + } + + fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> 
Self::RefType<'short> { + long + } +} + +impl<'a> ScalarRef<'a> for Date { + type ScalarType = Date; + + fn to_owned_scalar(&self) -> Self::ScalarType { + *self + } +} + +impl Scalar for DateTime { + type VectorType = DateTimeVector; + type RefType<'a> = DateTime; + + fn as_scalar_ref(&self) -> Self::RefType<'_> { + *self + } + + fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { + long + } +} + +impl<'a> ScalarRef<'a> for DateTime { + type ScalarType = DateTime; + + fn to_owned_scalar(&self) -> Self::ScalarType { + *self + } +} + +// Timestamp types implement Scalar and ScalarRef in `src/timestamp.rs`. + +impl Scalar for ListValue { + type VectorType = ListVector; + type RefType<'a> = ListValueRef<'a>; + + fn as_scalar_ref(&self) -> Self::RefType<'_> { + ListValueRef::Ref { val: self } + } + + fn upcast_gat<'short, 'long: 'short>(long: Self::RefType<'long>) -> Self::RefType<'short> { + long + } +} + +impl<'a> ScalarRef<'a> for ListValueRef<'a> { + type ScalarType = ListValue; + + fn to_owned_scalar(&self) -> Self::ScalarType { + match self { + ListValueRef::Indexed { vector, idx } => match vector.get(*idx) { + // Normally should not get `Value::Null` if the `ListValueRef` comes + // from the iterator of the ListVector, but we avoid panicking and just + // return a default list value in such case since `ListValueRef` may + // be constructed manually. 
+ Value::Null => ListValue::default(), + Value::List(v) => v, + _ => unreachable!(), + }, + ListValueRef::Ref { val } => (*val).clone(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::data_type::ConcreteDataType; + use crate::timestamp::TimestampSecond; + use crate::vectors::{BinaryVector, Int32Vector, ListVectorBuilder, TimestampSecondVector}; + + fn build_vector_from_slice(items: &[Option>]) -> T { + let mut builder = T::Builder::with_capacity(items.len()); + for item in items { + builder.push(*item); + } + builder.finish() + } + + fn assert_vector_eq<'a, T: ScalarVector>(expect: &[Option>], vector: &'a T) + where + T::RefItem<'a>: PartialEq + std::fmt::Debug, + { + for (a, b) in expect.iter().zip(vector.iter_data()) { + assert_eq!(*a, b); + } + } + + #[test] + fn test_build_i32_vector() { + let expect = vec![Some(1), Some(2), Some(3), None, Some(5)]; + let vector: Int32Vector = build_vector_from_slice(&expect); + assert_vector_eq(&expect, &vector); + } + + #[test] + fn test_build_binary_vector() { + let expect: Vec> = vec![ + Some(b"a"), + Some(b"b"), + Some(b"c"), + None, + Some(b"e"), + Some(b""), + ]; + let vector: BinaryVector = build_vector_from_slice(&expect); + assert_vector_eq(&expect, &vector); + } + + #[test] + fn test_build_date_vector() { + let expect: Vec> = vec![ + Some(Date::new(0)), + Some(Date::new(-1)), + None, + Some(Date::new(1)), + ]; + let vector: DateVector = build_vector_from_slice(&expect); + assert_vector_eq(&expect, &vector); + } + + #[test] + fn test_date_scalar() { + let date = Date::new(1); + assert_eq!(date, date.as_scalar_ref()); + assert_eq!(date, date.to_owned_scalar()); + } + + #[test] + fn test_datetime_scalar() { + let dt = DateTime::new(123); + assert_eq!(dt, dt.as_scalar_ref()); + assert_eq!(dt, dt.to_owned_scalar()); + } + + #[test] + fn test_list_value_scalar() { + let list_value = ListValue::new( + Some(Box::new(vec![Value::Int32(123)])), + ConcreteDataType::int32_datatype(), + ); + let list_ref = 
ListValueRef::Ref { val: &list_value }; + assert_eq!(list_ref, list_value.as_scalar_ref()); + assert_eq!(list_value, list_ref.to_owned_scalar()); + + let mut builder = + ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 1); + builder.push(None); + builder.push(Some(list_value.as_scalar_ref())); + let vector = builder.finish(); + + let ref_on_vec = ListValueRef::Indexed { + vector: &vector, + idx: 0, + }; + assert_eq!(ListValue::default(), ref_on_vec.to_owned_scalar()); + let ref_on_vec = ListValueRef::Indexed { + vector: &vector, + idx: 1, + }; + assert_eq!(list_value, ref_on_vec.to_owned_scalar()); + } + + #[test] + fn test_build_timestamp_vector() { + let expect: Vec> = vec![Some(10.into()), None, Some(42.into())]; + let vector: TimestampSecondVector = build_vector_from_slice(&expect); + assert_vector_eq(&expect, &vector); + let val = vector.get_data(0).unwrap(); + assert_eq!(val, val.as_scalar_ref()); + assert_eq!(TimestampSecond::from(10), val.to_owned_scalar()); + } +} diff --git a/src/datatypes2/src/schema.rs b/src/datatypes2/src/schema.rs new file mode 100644 index 0000000000..328fe0de24 --- /dev/null +++ b/src/datatypes2/src/schema.rs @@ -0,0 +1,430 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod column_schema; +mod constraint; +mod raw; + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow::datatypes::{Field, Schema as ArrowSchema}; +use snafu::{ensure, ResultExt}; + +use crate::data_type::DataType; +use crate::error::{self, Error, Result}; +pub use crate::schema::column_schema::{ColumnSchema, Metadata}; +pub use crate::schema::constraint::ColumnDefaultConstraint; +pub use crate::schema::raw::RawSchema; + +/// Key used to store version number of the schema in metadata. +const VERSION_KEY: &str = "greptime:version"; + +/// A common schema, should be immutable. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Schema { + column_schemas: Vec, + name_to_index: HashMap, + arrow_schema: Arc, + /// Index of the timestamp key column. + /// + /// Timestamp key column is the column holds the timestamp and forms part of + /// the primary key. None means there is no timestamp key column. + timestamp_index: Option, + /// Version of the schema. + /// + /// Initial value is zero. The version should bump after altering schema. + version: u32, +} + +impl Schema { + /// Initial version of the schema. + pub const INITIAL_VERSION: u32 = 0; + + /// Create a schema from a vector of [ColumnSchema]. + /// + /// # Panics + /// Panics when ColumnSchema's `default_constraint` can't be serialized into json. + pub fn new(column_schemas: Vec) -> Schema { + // Builder won't fail in this case + SchemaBuilder::try_from(column_schemas) + .unwrap() + .build() + .unwrap() + } + + /// Try to Create a schema from a vector of [ColumnSchema]. 
+ pub fn try_new(column_schemas: Vec) -> Result { + SchemaBuilder::try_from(column_schemas)?.build() + } + + #[inline] + pub fn arrow_schema(&self) -> &Arc { + &self.arrow_schema + } + + #[inline] + pub fn column_schemas(&self) -> &[ColumnSchema] { + &self.column_schemas + } + + pub fn column_schema_by_name(&self, name: &str) -> Option<&ColumnSchema> { + self.name_to_index + .get(name) + .map(|index| &self.column_schemas[*index]) + } + + /// Retrieve the column's name by index + /// # Panics + /// This method **may** panic if the index is out of range of column schemas. + #[inline] + pub fn column_name_by_index(&self, idx: usize) -> &str { + &self.column_schemas[idx].name + } + + #[inline] + pub fn column_index_by_name(&self, name: &str) -> Option { + self.name_to_index.get(name).copied() + } + + #[inline] + pub fn contains_column(&self, name: &str) -> bool { + self.name_to_index.contains_key(name) + } + + #[inline] + pub fn num_columns(&self) -> usize { + self.column_schemas.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.column_schemas.is_empty() + } + + /// Returns index of the timestamp key column. 
+ #[inline] + pub fn timestamp_index(&self) -> Option { + self.timestamp_index + } + + #[inline] + pub fn timestamp_column(&self) -> Option<&ColumnSchema> { + self.timestamp_index.map(|idx| &self.column_schemas[idx]) + } + + #[inline] + pub fn version(&self) -> u32 { + self.version + } + + #[inline] + pub fn metadata(&self) -> &HashMap { + &self.arrow_schema.metadata + } +} + +#[derive(Default)] +pub struct SchemaBuilder { + column_schemas: Vec, + name_to_index: HashMap, + fields: Vec, + timestamp_index: Option, + version: u32, + metadata: HashMap, +} + +impl TryFrom> for SchemaBuilder { + type Error = Error; + + fn try_from(column_schemas: Vec) -> Result { + SchemaBuilder::try_from_columns(column_schemas) + } +} + +impl SchemaBuilder { + pub fn try_from_columns(column_schemas: Vec) -> Result { + let FieldsAndIndices { + fields, + name_to_index, + timestamp_index, + } = collect_fields(&column_schemas)?; + + Ok(Self { + column_schemas, + name_to_index, + fields, + timestamp_index, + ..Default::default() + }) + } + + pub fn version(mut self, version: u32) -> Self { + self.version = version; + self + } + + /// Add key value pair to metadata. + /// + /// Old metadata with same key would be overwritten. 
+ pub fn add_metadata(mut self, key: impl Into, value: impl Into) -> Self { + self.metadata.insert(key.into(), value.into()); + self + } + + pub fn build(mut self) -> Result { + if let Some(timestamp_index) = self.timestamp_index { + validate_timestamp_index(&self.column_schemas, timestamp_index)?; + } + + self.metadata + .insert(VERSION_KEY.to_string(), self.version.to_string()); + + let arrow_schema = ArrowSchema::new(self.fields).with_metadata(self.metadata); + + Ok(Schema { + column_schemas: self.column_schemas, + name_to_index: self.name_to_index, + arrow_schema: Arc::new(arrow_schema), + timestamp_index: self.timestamp_index, + version: self.version, + }) + } +} + +struct FieldsAndIndices { + fields: Vec, + name_to_index: HashMap, + timestamp_index: Option, +} + +fn collect_fields(column_schemas: &[ColumnSchema]) -> Result { + let mut fields = Vec::with_capacity(column_schemas.len()); + let mut name_to_index = HashMap::with_capacity(column_schemas.len()); + let mut timestamp_index = None; + for (index, column_schema) in column_schemas.iter().enumerate() { + if column_schema.is_time_index() { + ensure!( + timestamp_index.is_none(), + error::DuplicateTimestampIndexSnafu { + exists: timestamp_index.unwrap(), + new: index, + } + ); + timestamp_index = Some(index); + } + let field = Field::try_from(column_schema)?; + fields.push(field); + name_to_index.insert(column_schema.name.clone(), index); + } + + Ok(FieldsAndIndices { + fields, + name_to_index, + timestamp_index, + }) +} + +fn validate_timestamp_index(column_schemas: &[ColumnSchema], timestamp_index: usize) -> Result<()> { + ensure!( + timestamp_index < column_schemas.len(), + error::InvalidTimestampIndexSnafu { + index: timestamp_index, + } + ); + + let column_schema = &column_schemas[timestamp_index]; + ensure!( + column_schema.data_type.is_timestamp_compatible(), + error::InvalidTimestampIndexSnafu { + index: timestamp_index, + } + ); + ensure!( + column_schema.is_time_index(), + 
error::InvalidTimestampIndexSnafu { + index: timestamp_index, + } + ); + + Ok(()) +} + +pub type SchemaRef = Arc; + +impl TryFrom> for Schema { + type Error = Error; + + fn try_from(arrow_schema: Arc) -> Result { + let mut column_schemas = Vec::with_capacity(arrow_schema.fields.len()); + let mut name_to_index = HashMap::with_capacity(arrow_schema.fields.len()); + for field in &arrow_schema.fields { + let column_schema = ColumnSchema::try_from(field)?; + name_to_index.insert(field.name().to_string(), column_schemas.len()); + column_schemas.push(column_schema); + } + + let mut timestamp_index = None; + for (index, column_schema) in column_schemas.iter().enumerate() { + if column_schema.is_time_index() { + validate_timestamp_index(&column_schemas, index)?; + ensure!( + timestamp_index.is_none(), + error::DuplicateTimestampIndexSnafu { + exists: timestamp_index.unwrap(), + new: index, + } + ); + timestamp_index = Some(index); + } + } + + let version = try_parse_version(&arrow_schema.metadata, VERSION_KEY)?; + + Ok(Self { + column_schemas, + name_to_index, + arrow_schema, + timestamp_index, + version, + }) + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchema) -> Result { + let arrow_schema = Arc::new(arrow_schema); + + Schema::try_from(arrow_schema) + } +} + +fn try_parse_version(metadata: &HashMap, key: &str) -> Result { + if let Some(value) = metadata.get(key) { + let version = value + .parse() + .context(error::ParseSchemaVersionSnafu { value })?; + + Ok(version) + } else { + Ok(Schema::INITIAL_VERSION) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::data_type::ConcreteDataType; + + #[test] + fn test_build_empty_schema() { + let schema = SchemaBuilder::default().build().unwrap(); + assert_eq!(0, schema.num_columns()); + assert!(schema.is_empty()); + } + + #[test] + fn test_schema_no_timestamp() { + let column_schemas = vec![ + ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), false), + 
ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), true), + ]; + let schema = Schema::new(column_schemas.clone()); + + assert_eq!(2, schema.num_columns()); + assert!(!schema.is_empty()); + assert!(schema.timestamp_index().is_none()); + assert!(schema.timestamp_column().is_none()); + assert_eq!(Schema::INITIAL_VERSION, schema.version()); + + for column_schema in &column_schemas { + let found = schema.column_schema_by_name(&column_schema.name).unwrap(); + assert_eq!(column_schema, found); + } + assert!(schema.column_schema_by_name("col3").is_none()); + + let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap(); + + assert_eq!(schema, new_schema); + assert_eq!(column_schemas, schema.column_schemas()); + } + + #[test] + fn test_metadata() { + let column_schemas = vec![ColumnSchema::new( + "col1", + ConcreteDataType::int32_datatype(), + false, + )]; + let schema = SchemaBuilder::try_from(column_schemas) + .unwrap() + .add_metadata("k1", "v1") + .build() + .unwrap(); + + assert_eq!("v1", schema.metadata().get("k1").unwrap()); + } + + #[test] + fn test_schema_with_timestamp() { + let column_schemas = vec![ + ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ]; + let schema = SchemaBuilder::try_from(column_schemas.clone()) + .unwrap() + .version(123) + .build() + .unwrap(); + + assert_eq!(1, schema.timestamp_index().unwrap()); + assert_eq!(&column_schemas[1], schema.timestamp_column().unwrap()); + assert_eq!(123, schema.version()); + + let new_schema = Schema::try_from(schema.arrow_schema().clone()).unwrap(); + assert_eq!(1, schema.timestamp_index().unwrap()); + assert_eq!(schema, new_schema); + } + + #[test] + fn test_schema_wrong_timestamp() { + let column_schemas = vec![ + ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true) + .with_time_index(true), + ColumnSchema::new("col2", 
ConcreteDataType::float64_datatype(), false), + ]; + assert!(SchemaBuilder::try_from(column_schemas) + .unwrap() + .build() + .is_err()); + + let column_schemas = vec![ + ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), + ColumnSchema::new("col2", ConcreteDataType::float64_datatype(), false) + .with_time_index(true), + ]; + + assert!(SchemaBuilder::try_from(column_schemas) + .unwrap() + .build() + .is_err()); + } +} diff --git a/src/datatypes2/src/schema/column_schema.rs b/src/datatypes2/src/schema/column_schema.rs new file mode 100644 index 0000000000..0577ca6aff --- /dev/null +++ b/src/datatypes2/src/schema/column_schema.rs @@ -0,0 +1,305 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; + +use arrow::datatypes::Field; +use serde::{Deserialize, Serialize}; +use snafu::{ensure, ResultExt}; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{self, Error, Result}; +use crate::schema::constraint::ColumnDefaultConstraint; +use crate::vectors::VectorRef; + +pub type Metadata = BTreeMap; + +/// Key used to store whether the column is time index in arrow field's metadata. +const TIME_INDEX_KEY: &str = "greptime:time_index"; +/// Key used to store default constraint in arrow field's metadata. +const DEFAULT_CONSTRAINT_KEY: &str = "greptime:default_constraint"; + +/// Schema of a column, used as an immutable struct. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ColumnSchema { + pub name: String, + pub data_type: ConcreteDataType, + is_nullable: bool, + is_time_index: bool, + default_constraint: Option, + metadata: Metadata, +} + +impl ColumnSchema { + pub fn new>( + name: T, + data_type: ConcreteDataType, + is_nullable: bool, + ) -> ColumnSchema { + ColumnSchema { + name: name.into(), + data_type, + is_nullable, + is_time_index: false, + default_constraint: None, + metadata: Metadata::new(), + } + } + + #[inline] + pub fn is_time_index(&self) -> bool { + self.is_time_index + } + + #[inline] + pub fn is_nullable(&self) -> bool { + self.is_nullable + } + + #[inline] + pub fn default_constraint(&self) -> Option<&ColumnDefaultConstraint> { + self.default_constraint.as_ref() + } + + #[inline] + pub fn metadata(&self) -> &Metadata { + &self.metadata + } + + pub fn with_time_index(mut self, is_time_index: bool) -> Self { + self.is_time_index = is_time_index; + if is_time_index { + self.metadata + .insert(TIME_INDEX_KEY.to_string(), "true".to_string()); + } else { + self.metadata.remove(TIME_INDEX_KEY); + } + self + } + + pub fn with_default_constraint( + mut self, + default_constraint: Option, + ) -> Result { + if let Some(constraint) = &default_constraint { + constraint.validate(&self.data_type, self.is_nullable)?; + } + + self.default_constraint = default_constraint; + Ok(self) + } + + /// Creates a new [`ColumnSchema`] with given metadata. + pub fn with_metadata(mut self, metadata: Metadata) -> Self { + self.metadata = metadata; + self + } + + pub fn create_default_vector(&self, num_rows: usize) -> Result> { + match &self.default_constraint { + Some(c) => c + .create_default_vector(&self.data_type, self.is_nullable, num_rows) + .map(Some), + None => { + if self.is_nullable { + // No default constraint, use null as default value. + // TODO(yingwen): Use NullVector once it supports setting logical type. 
+ ColumnDefaultConstraint::null_value() + .create_default_vector(&self.data_type, self.is_nullable, num_rows) + .map(Some) + } else { + Ok(None) + } + } + } + } +} + +impl TryFrom<&Field> for ColumnSchema { + type Error = Error; + + fn try_from(field: &Field) -> Result { + let data_type = ConcreteDataType::try_from(field.data_type())?; + let mut metadata = field.metadata().cloned().unwrap_or_default(); + let default_constraint = match metadata.remove(DEFAULT_CONSTRAINT_KEY) { + Some(json) => { + Some(serde_json::from_str(&json).context(error::DeserializeSnafu { json })?) + } + None => None, + }; + let is_time_index = metadata.contains_key(TIME_INDEX_KEY); + + Ok(ColumnSchema { + name: field.name().clone(), + data_type, + is_nullable: field.is_nullable(), + is_time_index, + default_constraint, + metadata, + }) + } +} + +impl TryFrom<&ColumnSchema> for Field { + type Error = Error; + + fn try_from(column_schema: &ColumnSchema) -> Result { + let mut metadata = column_schema.metadata.clone(); + if let Some(value) = &column_schema.default_constraint { + // Adds an additional metadata to store the default constraint. 
+ let old = metadata.insert( + DEFAULT_CONSTRAINT_KEY.to_string(), + serde_json::to_string(&value).context(error::SerializeSnafu)?, + ); + + ensure!( + old.is_none(), + error::DuplicateMetaSnafu { + key: DEFAULT_CONSTRAINT_KEY, + } + ); + } + + Ok(Field::new( + &column_schema.name, + column_schema.data_type.as_arrow_type(), + column_schema.is_nullable(), + ) + .with_metadata(Some(metadata))) + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType as ArrowDataType; + + use super::*; + use crate::value::Value; + + #[test] + fn test_column_schema() { + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); + let field = Field::try_from(&column_schema).unwrap(); + assert_eq!("test", field.name()); + assert_eq!(ArrowDataType::Int32, *field.data_type()); + assert!(field.is_nullable()); + + let new_column_schema = ColumnSchema::try_from(&field).unwrap(); + assert_eq!(column_schema, new_column_schema); + } + + #[test] + fn test_column_schema_with_default_constraint() { + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) + .with_default_constraint(Some(ColumnDefaultConstraint::Value(Value::from(99)))) + .unwrap(); + assert!(column_schema + .metadata() + .get(DEFAULT_CONSTRAINT_KEY) + .is_none()); + + let field = Field::try_from(&column_schema).unwrap(); + assert_eq!("test", field.name()); + assert_eq!(ArrowDataType::Int32, *field.data_type()); + assert!(field.is_nullable()); + assert_eq!( + "{\"Value\":{\"Int32\":99}}", + field + .metadata() + .unwrap() + .get(DEFAULT_CONSTRAINT_KEY) + .unwrap() + ); + + let new_column_schema = ColumnSchema::try_from(&field).unwrap(); + assert_eq!(column_schema, new_column_schema); + } + + #[test] + fn test_column_schema_with_metadata() { + let mut metadata = Metadata::new(); + metadata.insert("k1".to_string(), "v1".to_string()); + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) + .with_metadata(metadata) + 
.with_default_constraint(Some(ColumnDefaultConstraint::null_value())) + .unwrap(); + assert_eq!("v1", column_schema.metadata().get("k1").unwrap()); + assert!(column_schema + .metadata() + .get(DEFAULT_CONSTRAINT_KEY) + .is_none()); + + let field = Field::try_from(&column_schema).unwrap(); + assert_eq!("v1", field.metadata().unwrap().get("k1").unwrap()); + assert!(field + .metadata() + .unwrap() + .get(DEFAULT_CONSTRAINT_KEY) + .is_some()); + + let new_column_schema = ColumnSchema::try_from(&field).unwrap(); + assert_eq!(column_schema, new_column_schema); + } + + #[test] + fn test_column_schema_with_duplicate_metadata() { + let mut metadata = Metadata::new(); + metadata.insert(DEFAULT_CONSTRAINT_KEY.to_string(), "v1".to_string()); + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) + .with_metadata(metadata) + .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) + .unwrap(); + Field::try_from(&column_schema).unwrap_err(); + } + + #[test] + fn test_column_schema_invalid_default_constraint() { + ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false) + .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) + .unwrap_err(); + } + + #[test] + fn test_column_default_constraint_try_into_from() { + let default_constraint = ColumnDefaultConstraint::Value(Value::from(42i64)); + + let bytes: Vec = default_constraint.clone().try_into().unwrap(); + let from_value = ColumnDefaultConstraint::try_from(&bytes[..]).unwrap(); + + assert_eq!(default_constraint, from_value); + } + + #[test] + fn test_column_schema_create_default_null() { + // Implicit default null. + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true); + let v = column_schema.create_default_vector(5).unwrap().unwrap(); + assert_eq!(5, v.len()); + assert!(v.only_null()); + + // Explicit default null. 
+ let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), true) + .with_default_constraint(Some(ColumnDefaultConstraint::null_value())) + .unwrap(); + let v = column_schema.create_default_vector(5).unwrap().unwrap(); + assert_eq!(5, v.len()); + assert!(v.only_null()); + } + + #[test] + fn test_column_schema_no_default() { + let column_schema = ColumnSchema::new("test", ConcreteDataType::int32_datatype(), false); + assert!(column_schema.create_default_vector(5).unwrap().is_none()); + } +} diff --git a/src/datatypes2/src/schema/constraint.rs b/src/datatypes2/src/schema/constraint.rs new file mode 100644 index 0000000000..4dd3ecc14b --- /dev/null +++ b/src/datatypes2/src/schema/constraint.rs @@ -0,0 +1,306 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +use common_time::util; +use serde::{Deserialize, Serialize}; +use snafu::{ensure, ResultExt}; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{self, Result}; +use crate::value::Value; +use crate::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; + +const CURRENT_TIMESTAMP: &str = "current_timestamp()"; + +/// Column's default constraint. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum ColumnDefaultConstraint { + // A function invocation + // TODO(dennis): we save the function expression here, maybe use a struct in future. 
+ Function(String), + // A value + Value(Value), +} + +impl TryFrom<&[u8]> for ColumnDefaultConstraint { + type Error = error::Error; + + fn try_from(bytes: &[u8]) -> Result { + let json = String::from_utf8_lossy(bytes); + serde_json::from_str(&json).context(error::DeserializeSnafu { json }) + } +} + +impl TryFrom for Vec { + type Error = error::Error; + + fn try_from(value: ColumnDefaultConstraint) -> std::result::Result { + let s = serde_json::to_string(&value).context(error::SerializeSnafu)?; + Ok(s.into_bytes()) + } +} + +impl Display for ColumnDefaultConstraint { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + ColumnDefaultConstraint::Function(expr) => write!(f, "{}", expr), + ColumnDefaultConstraint::Value(v) => write!(f, "{}", v), + } + } +} + +impl ColumnDefaultConstraint { + /// Returns a default null constraint. + pub fn null_value() -> ColumnDefaultConstraint { + ColumnDefaultConstraint::Value(Value::Null) + } + + /// Check whether the constraint is valid for columns with given `data_type` + /// and `is_nullable` attributes. + pub fn validate(&self, data_type: &ConcreteDataType, is_nullable: bool) -> Result<()> { + ensure!(is_nullable || !self.maybe_null(), error::NullDefaultSnafu); + + match self { + ColumnDefaultConstraint::Function(expr) => { + ensure!( + expr == CURRENT_TIMESTAMP, + error::UnsupportedDefaultExprSnafu { expr } + ); + ensure!( + data_type.is_timestamp_compatible(), + error::DefaultValueTypeSnafu { + reason: "return value of the function must has timestamp type", + } + ); + } + ColumnDefaultConstraint::Value(v) => { + if !v.is_null() { + // Whether the value could be nullable has been checked before, only need + // to check the type compatibility here. 
+ ensure!( + data_type.logical_type_id() == v.logical_type_id(), + error::DefaultValueTypeSnafu { + reason: format!( + "column has type {:?} but default value has type {:?}", + data_type.logical_type_id(), + v.logical_type_id() + ), + } + ); + } + } + } + + Ok(()) + } + + /// Create a vector that contains `num_rows` default values for given `data_type`. + /// + /// If `is_nullable` is `true`, then this method would returns error if the created + /// default value is null. + /// + /// # Panics + /// Panics if `num_rows == 0`. + pub fn create_default_vector( + &self, + data_type: &ConcreteDataType, + is_nullable: bool, + num_rows: usize, + ) -> Result { + assert!(num_rows > 0); + + match self { + ColumnDefaultConstraint::Function(expr) => { + // Functions should also ensure its return value is not null when + // is_nullable is true. + match &expr[..] { + // TODO(dennis): we only supports current_timestamp right now, + // it's better to use a expression framework in future. + CURRENT_TIMESTAMP => create_current_timestamp_vector(data_type, num_rows), + _ => error::UnsupportedDefaultExprSnafu { expr }.fail(), + } + } + ColumnDefaultConstraint::Value(v) => { + ensure!(is_nullable || !v.is_null(), error::NullDefaultSnafu); + + // TODO(yingwen): + // 1. For null value, we could use NullVector once it supports custom logical type. + // 2. For non null value, we could use ConstantVector, but it would cause all codes + // attempt to downcast the vector fail if they don't check whether the vector is const + // first. + let mut mutable_vector = data_type.create_mutable_vector(1); + mutable_vector.push_value_ref(v.as_value_ref())?; + let base_vector = mutable_vector.to_vector(); + Ok(base_vector.replicate(&[num_rows])) + } + } + } + + /// Returns true if this constraint might creates NULL. + fn maybe_null(&self) -> bool { + // Once we support more functions, we may return true if given function + // could return null. 
+ matches!(self, ColumnDefaultConstraint::Value(Value::Null)) + } +} + +fn create_current_timestamp_vector( + data_type: &ConcreteDataType, + num_rows: usize, +) -> Result { + // FIXME(yingwen): We should implements cast in VectorOp so we could cast the millisecond vector + // to other data type and avoid this match. + match data_type { + ConcreteDataType::Timestamp(_) => Ok(Arc::new(TimestampMillisecondVector::from_values( + std::iter::repeat(util::current_time_millis()).take(num_rows), + ))), + ConcreteDataType::Int64(_) => Ok(Arc::new(Int64Vector::from_values( + std::iter::repeat(util::current_time_millis()).take(num_rows), + ))), + _ => error::DefaultValueTypeSnafu { + reason: format!( + "Not support to assign current timestamp to {:?} type", + data_type + ), + } + .fail(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::Error; + use crate::vectors::Int32Vector; + + #[test] + fn test_null_default_constraint() { + let constraint = ColumnDefaultConstraint::null_value(); + assert!(constraint.maybe_null()); + let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); + assert!(!constraint.maybe_null()); + } + + #[test] + fn test_validate_null_constraint() { + let constraint = ColumnDefaultConstraint::null_value(); + let data_type = ConcreteDataType::int32_datatype(); + constraint.validate(&data_type, false).unwrap_err(); + constraint.validate(&data_type, true).unwrap(); + } + + #[test] + fn test_validate_value_constraint() { + let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); + let data_type = ConcreteDataType::int32_datatype(); + constraint.validate(&data_type, false).unwrap(); + constraint.validate(&data_type, true).unwrap(); + + constraint + .validate(&ConcreteDataType::uint32_datatype(), true) + .unwrap_err(); + } + + #[test] + fn test_validate_function_constraint() { + let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); + constraint + 
.validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) + .unwrap(); + constraint + .validate(&ConcreteDataType::boolean_datatype(), false) + .unwrap_err(); + + let constraint = ColumnDefaultConstraint::Function("hello()".to_string()); + constraint + .validate(&ConcreteDataType::timestamp_millisecond_datatype(), false) + .unwrap_err(); + } + + #[test] + fn test_create_default_vector_by_null() { + let constraint = ColumnDefaultConstraint::null_value(); + let data_type = ConcreteDataType::int32_datatype(); + constraint + .create_default_vector(&data_type, false, 10) + .unwrap_err(); + + let constraint = ColumnDefaultConstraint::null_value(); + let v = constraint + .create_default_vector(&data_type, true, 3) + .unwrap(); + assert_eq!(3, v.len()); + for i in 0..v.len() { + assert_eq!(Value::Null, v.get(i)); + } + } + + #[test] + fn test_create_default_vector_by_value() { + let constraint = ColumnDefaultConstraint::Value(Value::Int32(10)); + let data_type = ConcreteDataType::int32_datatype(); + let v = constraint + .create_default_vector(&data_type, false, 4) + .unwrap(); + let expect: VectorRef = Arc::new(Int32Vector::from_values(vec![10; 4])); + assert_eq!(expect, v); + } + + #[test] + fn test_create_default_vector_by_func() { + let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); + // Timestamp type. + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); + let v = constraint + .create_default_vector(&data_type, false, 4) + .unwrap(); + assert_eq!(4, v.len()); + assert!( + matches!(v.get(0), Value::Timestamp(_)), + "v {:?} is not timestamp", + v.get(0) + ); + + // Int64 type. 
+ let data_type = ConcreteDataType::int64_datatype(); + let v = constraint + .create_default_vector(&data_type, false, 4) + .unwrap(); + assert_eq!(4, v.len()); + assert!( + matches!(v.get(0), Value::Int64(_)), + "v {:?} is not timestamp", + v.get(0) + ); + + let constraint = ColumnDefaultConstraint::Function("no".to_string()); + let data_type = ConcreteDataType::timestamp_millisecond_datatype(); + constraint + .create_default_vector(&data_type, false, 4) + .unwrap_err(); + } + + #[test] + fn test_create_by_func_and_invalid_type() { + let constraint = ColumnDefaultConstraint::Function(CURRENT_TIMESTAMP.to_string()); + let data_type = ConcreteDataType::boolean_datatype(); + let err = constraint + .create_default_vector(&data_type, false, 4) + .unwrap_err(); + assert!(matches!(err, Error::DefaultValueType { .. }), "{:?}", err); + } +} diff --git a/src/datatypes2/src/schema/raw.rs b/src/datatypes2/src/schema/raw.rs new file mode 100644 index 0000000000..75f0853b4b --- /dev/null +++ b/src/datatypes2/src/schema/raw.rs @@ -0,0 +1,77 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; + +use crate::error::{Error, Result}; +use crate::schema::{ColumnSchema, Schema, SchemaBuilder}; + +/// Struct used to serialize and deserialize [`Schema`](crate::schema::Schema). +/// +/// This struct only contains necessary data to recover the Schema. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct RawSchema { + pub column_schemas: Vec, + pub timestamp_index: Option, + pub version: u32, +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(raw: RawSchema) -> Result { + SchemaBuilder::try_from(raw.column_schemas)? + .version(raw.version) + .build() + } +} + +impl From<&Schema> for RawSchema { + fn from(schema: &Schema) -> RawSchema { + RawSchema { + column_schemas: schema.column_schemas.clone(), + timestamp_index: schema.timestamp_index, + version: schema.version, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::data_type::ConcreteDataType; + + #[test] + fn test_raw_convert() { + let column_schemas = vec![ + ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ]; + let schema = SchemaBuilder::try_from(column_schemas) + .unwrap() + .version(123) + .build() + .unwrap(); + + let raw = RawSchema::from(&schema); + let schema_new = Schema::try_from(raw).unwrap(); + + assert_eq!(schema, schema_new); + } +} diff --git a/src/datatypes2/src/serialize.rs b/src/datatypes2/src/serialize.rs new file mode 100644 index 0000000000..1cbf04cedd --- /dev/null +++ b/src/datatypes2/src/serialize.rs @@ -0,0 +1,20 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::error::Result; + +pub trait Serializable: Send + Sync { + /// Serialize a column of value with given type to JSON value + fn serialize_to_json(&self) -> Result>; +} diff --git a/src/datatypes2/src/timestamp.rs b/src/datatypes2/src/timestamp.rs new file mode 100644 index 0000000000..f14e91a6c6 --- /dev/null +++ b/src/datatypes2/src/timestamp.rs @@ -0,0 +1,135 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +use common_time::timestamp::TimeUnit; +use common_time::Timestamp; +use paste::paste; +use serde::{Deserialize, Serialize}; + +use crate::prelude::{Scalar, Value, ValueRef}; +use crate::scalars::ScalarRef; +use crate::types::{ + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, WrapperType, +}; +use crate::vectors::{ + TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector, + TimestampSecondVector, +}; + +macro_rules! define_timestamp_with_unit { + ($unit: ident) => { + paste! 
{ + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] + pub struct [](pub Timestamp); + + impl [] { + pub fn new(val: i64) -> Self { + Self(Timestamp::new(val, TimeUnit::$unit)) + } + } + + impl Default for [] { + fn default() -> Self { + Self::new(0) + } + } + + impl From<[]> for Value { + fn from(t: []) -> Value { + Value::Timestamp(t.0) + } + } + + impl From<[]> for serde_json::Value { + fn from(t: []) -> Self { + t.0.into() + } + } + + impl From<[]> for ValueRef<'static> { + fn from(t: []) -> Self { + ValueRef::Timestamp(t.0) + } + } + + impl Scalar for [] { + type VectorType = []; + type RefType<'a> = []; + + fn as_scalar_ref(&self) -> Self::RefType<'_> { + *self + } + + fn upcast_gat<'short, 'long: 'short>( + long: Self::RefType<'long>, + ) -> Self::RefType<'short> { + long + } + } + + impl<'a> ScalarRef<'a> for [] { + type ScalarType = []; + + fn to_owned_scalar(&self) -> Self::ScalarType { + *self + } + } + + impl WrapperType for [] { + type LogicalType = []; + type Native = i64; + + fn from_native(value: Self::Native) -> Self { + Self::new(value) + } + + fn into_native(self) -> Self::Native { + self.0.into() + } + } + + impl From for [] { + fn from(val: i64) -> Self { + []::from_native(val) + } + } + } + }; +} + +define_timestamp_with_unit!(Second); +define_timestamp_with_unit!(Millisecond); +define_timestamp_with_unit!(Microsecond); +define_timestamp_with_unit!(Nanosecond); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_timestamp_scalar() { + let ts = TimestampSecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + let ts = TimestampMillisecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + let ts = TimestampMicrosecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + let ts = TimestampNanosecond::new(123); + assert_eq!(ts, ts.as_scalar_ref()); + assert_eq!(ts, ts.to_owned_scalar()); + } +} 
diff --git a/src/datatypes2/src/type_id.rs b/src/datatypes2/src/type_id.rs new file mode 100644 index 0000000000..bcb7ea52b1 --- /dev/null +++ b/src/datatypes2/src/type_id.rs @@ -0,0 +1,93 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Unique identifier for logical data type. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LogicalTypeId { + Null, + + // Numeric types: + Boolean, + Int8, + Int16, + Int32, + Int64, + UInt8, + UInt16, + UInt32, + UInt64, + Float32, + Float64, + + // String types: + String, + Binary, + + // Date & Time types: + /// Date representing the elapsed time since UNIX epoch (1970-01-01) + /// in days (32 bits). + Date, + /// Datetime representing the elapsed time since UNIX epoch (1970-01-01) in + /// seconds/milliseconds/microseconds/nanoseconds, determined by precision. + DateTime, + + TimestampSecond, + TimestampMillisecond, + TimestampMicrosecond, + TimestampNanosecond, + + List, +} + +impl LogicalTypeId { + /// Create ConcreteDataType based on this id. This method is for test only as it + /// would lost some info. + /// + /// # Panics + /// Panics if data type is not supported. 
+ #[cfg(any(test, feature = "test"))] + pub fn data_type(&self) -> crate::data_type::ConcreteDataType { + use crate::data_type::ConcreteDataType; + + match self { + LogicalTypeId::Null => ConcreteDataType::null_datatype(), + LogicalTypeId::Boolean => ConcreteDataType::boolean_datatype(), + LogicalTypeId::Int8 => ConcreteDataType::int8_datatype(), + LogicalTypeId::Int16 => ConcreteDataType::int16_datatype(), + LogicalTypeId::Int32 => ConcreteDataType::int32_datatype(), + LogicalTypeId::Int64 => ConcreteDataType::int64_datatype(), + LogicalTypeId::UInt8 => ConcreteDataType::uint8_datatype(), + LogicalTypeId::UInt16 => ConcreteDataType::uint16_datatype(), + LogicalTypeId::UInt32 => ConcreteDataType::uint32_datatype(), + LogicalTypeId::UInt64 => ConcreteDataType::uint64_datatype(), + LogicalTypeId::Float32 => ConcreteDataType::float32_datatype(), + LogicalTypeId::Float64 => ConcreteDataType::float64_datatype(), + LogicalTypeId::String => ConcreteDataType::string_datatype(), + LogicalTypeId::Binary => ConcreteDataType::binary_datatype(), + LogicalTypeId::Date => ConcreteDataType::date_datatype(), + LogicalTypeId::DateTime => ConcreteDataType::datetime_datatype(), + LogicalTypeId::TimestampSecond => ConcreteDataType::timestamp_second_datatype(), + LogicalTypeId::TimestampMillisecond => { + ConcreteDataType::timestamp_millisecond_datatype() + } + LogicalTypeId::TimestampMicrosecond => { + ConcreteDataType::timestamp_microsecond_datatype() + } + LogicalTypeId::TimestampNanosecond => ConcreteDataType::timestamp_nanosecond_datatype(), + LogicalTypeId::List => { + ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()) + } + } + } +} diff --git a/src/datatypes2/src/types.rs b/src/datatypes2/src/types.rs new file mode 100644 index 0000000000..186704fdfd --- /dev/null +++ b/src/datatypes2/src/types.rs @@ -0,0 +1,37 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod binary_type; +mod boolean_type; +mod date_type; +mod datetime_type; +mod list_type; +mod null_type; +mod primitive_type; +mod string_type; + +mod timestamp_type; + +pub use binary_type::BinaryType; +pub use boolean_type::BooleanType; +pub use date_type::DateType; +pub use datetime_type::DateTimeType; +pub use list_type::ListType; +pub use null_type::NullType; +pub use primitive_type::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, + NativeType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, +}; +pub use string_type::StringType; +pub use timestamp_type::*; diff --git a/src/datatypes2/src/types/binary_type.rs b/src/datatypes2/src/types/binary_type.rs new file mode 100644 index 0000000000..0d06724fff --- /dev/null +++ b/src/datatypes2/src/types/binary_type.rs @@ -0,0 +1,60 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow::datatypes::DataType as ArrowDataType; +use common_base::bytes::StringBytes; +use serde::{Deserialize, Serialize}; + +use crate::data_type::{DataType, DataTypeRef}; +use crate::scalars::ScalarVectorBuilder; +use crate::type_id::LogicalTypeId; +use crate::value::Value; +use crate::vectors::{BinaryVectorBuilder, MutableVector}; + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct BinaryType; + +impl BinaryType { + pub fn arc() -> DataTypeRef { + Arc::new(Self) + } +} + +impl DataType for BinaryType { + fn name(&self) -> &str { + "Binary" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Binary + } + + fn default_value(&self) -> Value { + StringBytes::default().into() + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::LargeBinary + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(BinaryVectorBuilder::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } +} diff --git a/src/datatypes2/src/types/boolean_type.rs b/src/datatypes2/src/types/boolean_type.rs new file mode 100644 index 0000000000..36d92169eb --- /dev/null +++ b/src/datatypes2/src/types/boolean_type.rs @@ -0,0 +1,59 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow::datatypes::DataType as ArrowDataType; +use serde::{Deserialize, Serialize}; + +use crate::data_type::{DataType, DataTypeRef}; +use crate::scalars::ScalarVectorBuilder; +use crate::type_id::LogicalTypeId; +use crate::value::Value; +use crate::vectors::{BooleanVectorBuilder, MutableVector}; + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct BooleanType; + +impl BooleanType { + pub fn arc() -> DataTypeRef { + Arc::new(Self) + } +} + +impl DataType for BooleanType { + fn name(&self) -> &str { + "Boolean" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Boolean + } + + fn default_value(&self) -> Value { + bool::default().into() + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Boolean + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(BooleanVectorBuilder::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } +} diff --git a/src/datatypes2/src/types/date_type.rs b/src/datatypes2/src/types/date_type.rs new file mode 100644 index 0000000000..052b837a3d --- /dev/null +++ b/src/datatypes2/src/types/date_type.rs @@ -0,0 +1,90 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use arrow::datatypes::{DataType as ArrowDataType, Date32Type}; +use common_time::Date; +use serde::{Deserialize, Serialize}; +use snafu::OptionExt; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{self, Result}; +use crate::scalars::ScalarVectorBuilder; +use crate::type_id::LogicalTypeId; +use crate::types::LogicalPrimitiveType; +use crate::value::{Value, ValueRef}; +use crate::vectors::{DateVector, DateVectorBuilder, MutableVector, Vector}; + +/// Data type for Date (YYYY-MM-DD). +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DateType; + +impl DataType for DateType { + fn name(&self) -> &str { + "Date" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Date + } + + fn default_value(&self) -> Value { + Value::Date(Default::default()) + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Date32 + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(DateVectorBuilder::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } +} + +impl LogicalPrimitiveType for DateType { + type ArrowPrimitive = Date32Type; + type Native = i32; + type Wrapper = Date; + + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::date_datatype() + } + + fn type_name() -> &'static str { + "Date" + } + + fn cast_vector(vector: &dyn Vector) -> Result<&DateVector> { + vector + .as_any() + .downcast_ref::() + .with_context(|| error::CastTypeSnafu { + msg: format!("Failed to cast {} to DateVector", vector.vector_type_name(),), + }) + } + + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::Date(v) => Ok(Some(v)), + other => error::CastTypeSnafu { + msg: format!("Failed to cast value {:?} to Date", other,), + } + .fail(), + } + } +} diff --git a/src/datatypes2/src/types/datetime_type.rs b/src/datatypes2/src/types/datetime_type.rs new file mode 100644 index 0000000000..d74a02effe --- 
/dev/null +++ b/src/datatypes2/src/types/datetime_type.rs @@ -0,0 +1,91 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow::datatypes::{DataType as ArrowDataType, Date64Type}; +use common_time::DateTime; +use serde::{Deserialize, Serialize}; +use snafu::OptionExt; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{self, Result}; +use crate::prelude::{LogicalTypeId, MutableVector, ScalarVectorBuilder, Value, ValueRef, Vector}; +use crate::types::LogicalPrimitiveType; +use crate::vectors::{DateTimeVector, DateTimeVectorBuilder, PrimitiveVector}; + +/// Data type for [`DateTime`]. 
+#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DateTimeType; + +impl DataType for DateTimeType { + fn name(&self) -> &str { + "DateTime" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::DateTime + } + + fn default_value(&self) -> Value { + Value::DateTime(DateTime::default()) + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Date64 + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(DateTimeVectorBuilder::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } +} + +impl LogicalPrimitiveType for DateTimeType { + type ArrowPrimitive = Date64Type; + type Native = i64; + type Wrapper = DateTime; + + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::datetime_datatype() + } + + fn type_name() -> &'static str { + "DateTime" + } + + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector> { + vector + .as_any() + .downcast_ref::() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to cast {} to DateTimeVector", + vector.vector_type_name() + ), + }) + } + + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::DateTime(v) => Ok(Some(v)), + other => error::CastTypeSnafu { + msg: format!("Failed to cast value {:?} to DateTime", other,), + } + .fail(), + } + } +} diff --git a/src/datatypes2/src/types/list_type.rs b/src/datatypes2/src/types/list_type.rs new file mode 100644 index 0000000000..b9875ca362 --- /dev/null +++ b/src/datatypes2/src/types/list_type.rs @@ -0,0 +1,95 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow::datatypes::{DataType as ArrowDataType, Field}; +use serde::{Deserialize, Serialize}; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::type_id::LogicalTypeId; +use crate::value::{ListValue, Value}; +use crate::vectors::{ListVectorBuilder, MutableVector}; + +/// Used to represent the List datatype. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ListType { + /// The type of List's item. + // Use Box to avoid recursive dependency, as enum ConcreteDataType depends on ListType. + item_type: Box, +} + +impl Default for ListType { + fn default() -> Self { + ListType::new(ConcreteDataType::null_datatype()) + } +} + +impl ListType { + /// Create a new `ListType` whose item's data type is `item_type`. 
+ pub fn new(item_type: ConcreteDataType) -> Self { + ListType { + item_type: Box::new(item_type), + } + } +} + +impl DataType for ListType { + fn name(&self) -> &str { + "List" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::List + } + + fn default_value(&self) -> Value { + Value::List(ListValue::new(None, *self.item_type.clone())) + } + + fn as_arrow_type(&self) -> ArrowDataType { + let field = Box::new(Field::new("item", self.item_type.as_arrow_type(), true)); + ArrowDataType::List(field) + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(ListVectorBuilder::with_type_capacity( + *self.item_type.clone(), + capacity, + )) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::value::ListValue; + + #[test] + fn test_list_type() { + let t = ListType::new(ConcreteDataType::boolean_datatype()); + assert_eq!("List", t.name()); + assert_eq!(LogicalTypeId::List, t.logical_type_id()); + assert_eq!( + Value::List(ListValue::new(None, ConcreteDataType::boolean_datatype())), + t.default_value() + ); + assert_eq!( + ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Boolean, true))), + t.as_arrow_type() + ); + } +} diff --git a/src/datatypes2/src/types/null_type.rs b/src/datatypes2/src/types/null_type.rs new file mode 100644 index 0000000000..b9bb2dc752 --- /dev/null +++ b/src/datatypes2/src/types/null_type.rs @@ -0,0 +1,58 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow::datatypes::DataType as ArrowDataType; +use serde::{Deserialize, Serialize}; + +use crate::data_type::{DataType, DataTypeRef}; +use crate::type_id::LogicalTypeId; +use crate::value::Value; +use crate::vectors::{MutableVector, NullVectorBuilder}; + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct NullType; + +impl NullType { + pub fn arc() -> DataTypeRef { + Arc::new(NullType) + } +} + +impl DataType for NullType { + fn name(&self) -> &str { + "Null" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Null + } + + fn default_value(&self) -> Value { + Value::Null + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Null + } + + fn create_mutable_vector(&self, _capacity: usize) -> Box { + Box::new(NullVectorBuilder::default()) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } +} diff --git a/src/datatypes2/src/types/primitive_type.rs b/src/datatypes2/src/types/primitive_type.rs new file mode 100644 index 0000000000..e389ca13bf --- /dev/null +++ b/src/datatypes2/src/types/primitive_type.rs @@ -0,0 +1,358 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::cmp::Ordering; + +use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType as ArrowDataType}; +use common_time::{Date, DateTime}; +use num::NumCast; +use serde::{Deserialize, Serialize}; +use snafu::OptionExt; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::{self, Result}; +use crate::scalars::{Scalar, ScalarRef, ScalarVectorBuilder}; +use crate::type_id::LogicalTypeId; +use crate::types::{DateTimeType, DateType}; +use crate::value::{Value, ValueRef}; +use crate::vectors::{MutableVector, PrimitiveVector, PrimitiveVectorBuilder, Vector}; + +/// Data types that can be used as arrow's native type. +pub trait NativeType: ArrowNativeType + NumCast { + /// Largest numeric type this primitive type can be cast to. + type LargestType: NativeType; +} + +macro_rules! impl_native_type { + ($Type: ident, $LargestType: ident) => { + impl NativeType for $Type { + type LargestType = $LargestType; + } + }; +} + +impl_native_type!(u8, u64); +impl_native_type!(u16, u64); +impl_native_type!(u32, u64); +impl_native_type!(u64, u64); +impl_native_type!(i8, i64); +impl_native_type!(i16, i64); +impl_native_type!(i32, i64); +impl_native_type!(i64, i64); +impl_native_type!(f32, f64); +impl_native_type!(f64, f64); + +/// Represents the wrapper type that wraps a native type using the `newtype pattern`, +/// such as [Date](`common_time::Date`) is a wrapper type for the underlying native +/// type `i32`. +pub trait WrapperType: + Copy + + Scalar + + PartialEq + + Into + + Into> + + Serialize + + Into +{ + /// Logical primitive type that this wrapper type belongs to. + type LogicalType: LogicalPrimitiveType; + /// The underlying native type. + type Native: NativeType; + + /// Convert native type into this wrapper type. + fn from_native(value: Self::Native) -> Self; + + /// Convert this wrapper type into native type. + fn into_native(self) -> Self::Native; +} + +/// Trait bridging the logical primitive type with [ArrowPrimitiveType]. 
+pub trait LogicalPrimitiveType: 'static + Sized { + /// Arrow primitive type of this logical type. + type ArrowPrimitive: ArrowPrimitiveType; + /// Native (physical) type of this logical type. + type Native: NativeType; + /// Wrapper type that the vector returns. + type Wrapper: WrapperType + + for<'a> Scalar, RefType<'a> = Self::Wrapper> + + for<'a> ScalarRef<'a, ScalarType = Self::Wrapper>; + + /// Construct the data type struct. + fn build_data_type() -> ConcreteDataType; + + /// Return the name of the type. + fn type_name() -> &'static str; + + /// Dynamic cast the vector to the concrete vector type. + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector>; + + /// Cast value ref to the primitive type. + fn cast_value_ref(value: ValueRef) -> Result>; +} + +/// A new type for [WrapperType], complement the `Ord` feature for it. Wrapping non ordered +/// primitive types like `f32` and `f64` in `OrdPrimitive` can make them be used in places that +/// require `Ord`. For example, in `Median` or `Percentile` UDAFs. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct OrdPrimitive(pub T); + +impl OrdPrimitive { + pub fn as_primitive(&self) -> T { + self.0 + } +} + +impl Eq for OrdPrimitive {} + +impl PartialOrd for OrdPrimitive { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for OrdPrimitive { + fn cmp(&self, other: &Self) -> Ordering { + Into::::into(self.0).cmp(&Into::::into(other.0)) + } +} + +impl From> for Value { + fn from(p: OrdPrimitive) -> Self { + p.0.into() + } +} + +macro_rules! 
impl_wrapper { + ($Type: ident, $LogicalType: ident) => { + impl WrapperType for $Type { + type LogicalType = $LogicalType; + type Native = $Type; + + fn from_native(value: Self::Native) -> Self { + value + } + + fn into_native(self) -> Self::Native { + self + } + } + }; +} + +impl_wrapper!(u8, UInt8Type); +impl_wrapper!(u16, UInt16Type); +impl_wrapper!(u32, UInt32Type); +impl_wrapper!(u64, UInt64Type); +impl_wrapper!(i8, Int8Type); +impl_wrapper!(i16, Int16Type); +impl_wrapper!(i32, Int32Type); +impl_wrapper!(i64, Int64Type); +impl_wrapper!(f32, Float32Type); +impl_wrapper!(f64, Float64Type); + +impl WrapperType for Date { + type LogicalType = DateType; + type Native = i32; + + fn from_native(value: i32) -> Self { + Date::new(value) + } + + fn into_native(self) -> i32 { + self.val() + } +} + +impl WrapperType for DateTime { + type LogicalType = DateTimeType; + type Native = i64; + + fn from_native(value: Self::Native) -> Self { + DateTime::new(value) + } + + fn into_native(self) -> Self::Native { + self.val() + } +} + +macro_rules! define_logical_primitive_type { + ($Native: ident, $TypeId: ident, $DataType: ident) => { + // We need to define it as an empty struct `struct DataType {}` instead of a struct-unit + // `struct DataType;` to ensure the serialized JSON string is compatible with previous + // implementation. 
+ #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] + pub struct $DataType {} + + impl LogicalPrimitiveType for $DataType { + type ArrowPrimitive = arrow::datatypes::$DataType; + type Native = $Native; + type Wrapper = $Native; + + fn build_data_type() -> ConcreteDataType { + ConcreteDataType::$TypeId($DataType::default()) + } + + fn type_name() -> &'static str { + stringify!($TypeId) + } + + fn cast_vector(vector: &dyn Vector) -> Result<&PrimitiveVector<$DataType>> { + vector + .as_any() + .downcast_ref::>() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to cast {} to vector of primitive type {}", + vector.vector_type_name(), + stringify!($TypeId) + ), + }) + } + + fn cast_value_ref(value: ValueRef) -> Result> { + match value { + ValueRef::Null => Ok(None), + ValueRef::$TypeId(v) => Ok(Some(v.into())), + other => error::CastTypeSnafu { + msg: format!( + "Failed to cast value {:?} to primitive type {}", + other, + stringify!($TypeId), + ), + } + .fail(), + } + } + } + }; +} + +macro_rules! 
define_non_timestamp_primitive { + ($Native: ident, $TypeId: ident, $DataType: ident) => { + define_logical_primitive_type!($Native, $TypeId, $DataType); + + impl DataType for $DataType { + fn name(&self) -> &str { + stringify!($TypeId) + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::$TypeId + } + + fn default_value(&self) -> Value { + $Native::default().into() + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::$TypeId + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(PrimitiveVectorBuilder::<$DataType>::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + false + } + } + }; +} + +define_non_timestamp_primitive!(u8, UInt8, UInt8Type); +define_non_timestamp_primitive!(u16, UInt16, UInt16Type); +define_non_timestamp_primitive!(u32, UInt32, UInt32Type); +define_non_timestamp_primitive!(u64, UInt64, UInt64Type); +define_non_timestamp_primitive!(i8, Int8, Int8Type); +define_non_timestamp_primitive!(i16, Int16, Int16Type); +define_non_timestamp_primitive!(i32, Int32, Int32Type); +define_non_timestamp_primitive!(f32, Float32, Float32Type); +define_non_timestamp_primitive!(f64, Float64, Float64Type); + +// Timestamp primitive: +define_logical_primitive_type!(i64, Int64, Int64Type); + +impl DataType for Int64Type { + fn name(&self) -> &str { + "Int64" + } + + fn logical_type_id(&self) -> LogicalTypeId { + LogicalTypeId::Int64 + } + + fn default_value(&self) -> Value { + Value::Int64(0) + } + + fn as_arrow_type(&self) -> ArrowDataType { + ArrowDataType::Int64 + } + + fn create_mutable_vector(&self, capacity: usize) -> Box { + Box::new(PrimitiveVectorBuilder::::with_capacity(capacity)) + } + + fn is_timestamp_compatible(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use std::collections::BinaryHeap; + + use super::*; + + #[test] + fn test_ord_primitive() { + struct Foo + where + T: WrapperType, + { + heap: BinaryHeap>, + } + + impl Foo + where + T: WrapperType, + { + fn 
push(&mut self, value: T) { + let value = OrdPrimitive::(value); + self.heap.push(value); + } + } + + macro_rules! test { + ($Type:ident) => { + let mut foo = Foo::<$Type> { + heap: BinaryHeap::new(), + }; + foo.push($Type::default()); + }; + } + + test!(u8); + test!(u16); + test!(u32); + test!(u64); + test!(i8); + test!(i16); + test!(i32); + test!(i64); + test!(f32); + test!(f64); + } +} diff --git a/src/datatypes2/src/types/string_type.rs b/src/datatypes2/src/types/string_type.rs new file mode 100644 index 0000000000..799cbbbdd3 --- /dev/null +++ b/src/datatypes2/src/types/string_type.rs @@ -0,0 +1,60 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::sync::Arc;
+
+use arrow::datatypes::DataType as ArrowDataType;
+use common_base::bytes::StringBytes;
+use serde::{Deserialize, Serialize};
+
+use crate::data_type::{DataType, DataTypeRef};
+use crate::prelude::ScalarVectorBuilder;
+use crate::type_id::LogicalTypeId;
+use crate::value::Value;
+use crate::vectors::{MutableVector, StringVectorBuilder};
+
+/// Logical data type for string values.
+///
+/// It is a field-less marker type: it reports [`LogicalTypeId::String`] and is
+/// represented as `Utf8` on the Arrow side.
+#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct StringType;
+
+impl StringType {
+    /// Returns a new `StringType` wrapped in an [`Arc`] as a [`DataTypeRef`].
+    pub fn arc() -> DataTypeRef {
+        Arc::new(Self)
+    }
+}
+
+impl DataType for StringType {
+    /// Name of this type: always `"String"`.
+    fn name(&self) -> &str {
+        "String"
+    }
+
+    /// Logical type id of this type: always [`LogicalTypeId::String`].
+    fn logical_type_id(&self) -> LogicalTypeId {
+        LogicalTypeId::String
+    }
+
+    /// Default value of this type: `StringBytes::default()` converted into a [`Value`]
+    /// (presumably the empty string -- confirm against `StringBytes`).
+    fn default_value(&self) -> Value {
+        StringBytes::default().into()
+    }
+
+    /// Arrow counterpart of this type: `ArrowDataType::Utf8`.
+    fn as_arrow_type(&self) -> ArrowDataType {
+        ArrowDataType::Utf8
+    }
+
+    /// Creates a boxed [`StringVectorBuilder`] constructed with the given `capacity`.
+    ///
+    /// The builder is handed back as a `MutableVector` trait object.
+    fn create_mutable_vector(&self, capacity: usize) -> Box<dyn MutableVector> {
+        Box::new(StringVectorBuilder::with_capacity(capacity))
+    }
+
+    /// Strings are never timestamp compatible.
+    fn is_timestamp_compatible(&self) -> bool {
+        false
+    }
+}
diff --git a/src/datatypes2/src/types/timestamp_type.rs b/src/datatypes2/src/types/timestamp_type.rs
new file mode 100644
index 0000000000..fe86eeb8fd
--- /dev/null
+++ b/src/datatypes2/src/types/timestamp_type.rs
@@ -0,0 +1,140 @@
+// Copyright 2022 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use arrow::datatypes::{
+    DataType as ArrowDataType, TimeUnit as ArrowTimeUnit,
+    TimestampMicrosecondType as ArrowTimestampMicrosecondType,
+    TimestampMillisecondType as ArrowTimestampMillisecondType,
+    TimestampNanosecondType as ArrowTimestampNanosecondType,
+    TimestampSecondType as ArrowTimestampSecondType,
+};
+use common_time::timestamp::TimeUnit;
+use common_time::Timestamp;
+use enum_dispatch::enum_dispatch;
+use paste::paste;
+use serde::{Deserialize, Serialize};
+use snafu::OptionExt;
+
+use crate::data_type::ConcreteDataType;
+use crate::error;
+use crate::prelude::{
+    DataType, LogicalTypeId, MutableVector, ScalarVectorBuilder, Value, ValueRef, Vector,
+};
+use crate::timestamp::{
+    TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
+};
+use crate::types::LogicalPrimitiveType;
+use crate::vectors::{
+    PrimitiveVector, TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder,
+    TimestampMillisecondVector, TimestampMillisecondVectorBuilder, TimestampNanosecondVector,
+    TimestampNanosecondVectorBuilder, TimestampSecondVector, TimestampSecondVectorBuilder,
+};
+
+/// Timestamp data type, with one variant per supported time unit.
+///
+/// Each variant wraps the unit-specific type generated by
+/// `impl_data_type_for_timestamp!` below. The enum is annotated with
+/// `#[enum_dispatch(DataType)]`, so `DataType` calls are presumably forwarded to
+/// the wrapped type.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[enum_dispatch(DataType)]
+pub enum TimestampType {
+    Second(TimestampSecondType),
+    Millisecond(TimestampMillisecondType),
+    Microsecond(TimestampMicrosecondType),
+    Nanosecond(TimestampNanosecondType),
+}
+
+/// Generates, for one time unit `$unit`, a unit struct together with its
+/// `DataType` and `LogicalPrimitiveType` implementations.
+///
+/// NOTE(review): every bare `[]` in this macro looks like a `paste!` `[<...>]`
+/// identifier expression whose contents were lost when this patch text was
+/// extracted; the bare `Box` and `crate::Result>` return types likewise appear to
+/// have lost their type arguments. Restore them from the upstream file before
+/// trying to compile this.
+macro_rules! impl_data_type_for_timestamp {
+    ($unit: ident) => {
+        paste! {
+            #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
+            pub struct [];
+
+            impl DataType for [] {
+                fn name(&self) -> &str {
+                    stringify!([])
+                }
+
+                fn logical_type_id(&self) -> LogicalTypeId {
+                    LogicalTypeId::[]
+                }
+
+                /// The default value is the timestamp `0` expressed in `$unit`.
+                fn default_value(&self) -> Value {
+                    Value::Timestamp(Timestamp::new(0, TimeUnit::$unit))
+                }
+
+                /// Maps to Arrow's `Timestamp` type for `$unit`, with `None` as its second field.
+                fn as_arrow_type(&self) -> ArrowDataType {
+                    ArrowDataType::Timestamp(ArrowTimeUnit::$unit, None)
+                }
+
+                /// Creates a boxed vector builder constructed with the given `capacity`.
+                fn create_mutable_vector(&self, capacity: usize) -> Box {
+                    Box::new([]::with_capacity(capacity))
+                }
+
+                /// Timestamp types are always timestamp compatible.
+                fn is_timestamp_compatible(&self) -> bool {
+                    true
+                }
+            }
+
+
+            impl LogicalPrimitiveType for [] {
+                type ArrowPrimitive = [];
+                // The native representation of every timestamp unit is `i64`.
+                type Native = i64;
+                type Wrapper = [];
+
+                /// Builds the `ConcreteDataType::Timestamp` value for `$unit`.
+                fn build_data_type() -> ConcreteDataType {
+                    ConcreteDataType::Timestamp(TimestampType::$unit(
+                        []::default(),
+                    ))
+                }
+
+                fn type_name() -> &'static str {
+                    stringify!([])
+                }
+
+                /// Downcasts `vector` through `as_any()`; fails with a `CastType` error
+                /// naming the actual vector type when the downcast does not succeed.
+                fn cast_vector(vector: &dyn Vector) -> crate::Result<&PrimitiveVector> {
+                    vector
+                        .as_any()
+                        .downcast_ref::<[]>()
+                        .with_context(|| error::CastTypeSnafu {
+                            msg: format!(
+                                "Failed to cast {} to {}",
+                                vector.vector_type_name(), stringify!([])
+                            ),
+                        })
+                }
+
+                /// Converts a `ValueRef` into the wrapper type for `$unit`.
+                ///
+                /// `ValueRef::Null` yields `Ok(None)`. A `ValueRef::Timestamp` is accepted
+                /// only when its unit equals `$unit`; a timestamp with another unit, or any
+                /// other variant, yields a `CastType` error.
+                fn cast_value_ref(value: ValueRef) -> crate::Result> {
+                    match value {
+                        ValueRef::Null => Ok(None),
+                        ValueRef::Timestamp(t) => match t.unit() {
+                            TimeUnit::$unit => Ok(Some([](t))),
+                            other => error::CastTypeSnafu {
+                                msg: format!(
+                                    "Failed to cast Timestamp value with different unit {:?} to {}",
+                                    other, stringify!([])
+                                ),
+                            }
+                            .fail(),
+                        },
+                        other => error::CastTypeSnafu {
+                            msg: format!("Failed to cast value {:?} to {}", other, stringify!([])),
+                        }
+                        .fail(),
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl_data_type_for_timestamp!(Nanosecond);
+impl_data_type_for_timestamp!(Second);
+impl_data_type_for_timestamp!(Millisecond);
+impl_data_type_for_timestamp!(Microsecond);
diff --git a/src/datatypes2/src/value.rs b/src/datatypes2/src/value.rs
new file mode 100644
index 0000000000..bade88d419
--- /dev/null
+++
b/src/datatypes2/src/value.rs @@ -0,0 +1,1275 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::fmt::{Display, Formatter}; + +use common_base::bytes::{Bytes, StringBytes}; +use common_time::date::Date; +use common_time::datetime::DateTime; +use common_time::timestamp::{TimeUnit, Timestamp}; +use datafusion_common::ScalarValue; +pub use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; + +use crate::error::{self, Result}; +use crate::prelude::*; +use crate::type_id::LogicalTypeId; +use crate::vectors::ListVector; + +pub type OrderedF32 = OrderedFloat; +pub type OrderedF64 = OrderedFloat; + +/// Value holds a single arbitrary value of any [DataType](crate::data_type::DataType). +/// +/// Comparison between values with different types (except Null) is not allowed.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum Value { + Null, + + // Numeric types: + Boolean(bool), + UInt8(u8), + UInt16(u16), + UInt32(u32), + UInt64(u64), + Int8(i8), + Int16(i16), + Int32(i32), + Int64(i64), + Float32(OrderedF32), + Float64(OrderedF64), + + // String types: + String(StringBytes), + Binary(Bytes), + + // Date & Time types: + Date(Date), + DateTime(DateTime), + Timestamp(Timestamp), + + List(ListValue), +} + +impl Display for Value { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Value::Null => write!(f, "{}", self.data_type().name()), + Value::Boolean(v) => write!(f, "{}", v), + Value::UInt8(v) => write!(f, "{}", v), + Value::UInt16(v) => write!(f, "{}", v), + Value::UInt32(v) => write!(f, "{}", v), + Value::UInt64(v) => write!(f, "{}", v), + Value::Int8(v) => write!(f, "{}", v), + Value::Int16(v) => write!(f, "{}", v), + Value::Int32(v) => write!(f, "{}", v), + Value::Int64(v) => write!(f, "{}", v), + Value::Float32(v) => write!(f, "{}", v), + Value::Float64(v) => write!(f, "{}", v), + Value::String(v) => write!(f, "{}", v.as_utf8()), + Value::Binary(v) => { + let hex = v + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(""); + write!(f, "{}", hex) + } + Value::Date(v) => write!(f, "{}", v), + Value::DateTime(v) => write!(f, "{}", v), + Value::Timestamp(v) => write!(f, "{}", v.to_iso8601_string()), + Value::List(v) => { + let default = Box::new(vec![]); + let items = v.items().as_ref().unwrap_or(&default); + let items = items + .iter() + .map(|i| i.to_string()) + .collect::>() + .join(", "); + write!(f, "{}[{}]", v.datatype.name(), items) + } + } + } +} + +impl Value { + /// Returns data type of the value. + /// + /// # Panics + /// Panics if the data type is not supported. + pub fn data_type(&self) -> ConcreteDataType { + // TODO(yingwen): Implement this once all data types are implemented. 
+ match self { + Value::Null => ConcreteDataType::null_datatype(), + Value::Boolean(_) => ConcreteDataType::boolean_datatype(), + Value::UInt8(_) => ConcreteDataType::uint8_datatype(), + Value::UInt16(_) => ConcreteDataType::uint16_datatype(), + Value::UInt32(_) => ConcreteDataType::uint32_datatype(), + Value::UInt64(_) => ConcreteDataType::uint64_datatype(), + Value::Int8(_) => ConcreteDataType::int8_datatype(), + Value::Int16(_) => ConcreteDataType::int16_datatype(), + Value::Int32(_) => ConcreteDataType::int32_datatype(), + Value::Int64(_) => ConcreteDataType::int64_datatype(), + Value::Float32(_) => ConcreteDataType::float32_datatype(), + Value::Float64(_) => ConcreteDataType::float64_datatype(), + Value::String(_) => ConcreteDataType::string_datatype(), + Value::Binary(_) => ConcreteDataType::binary_datatype(), + Value::Date(_) => ConcreteDataType::date_datatype(), + Value::DateTime(_) => ConcreteDataType::datetime_datatype(), + Value::Timestamp(v) => ConcreteDataType::timestamp_datatype(v.unit()), + Value::List(list) => ConcreteDataType::list_datatype(list.datatype().clone()), + } + } + + /// Returns true if this is a null value. + pub fn is_null(&self) -> bool { + matches!(self, Value::Null) + } + + /// Cast itself to [ListValue]. + pub fn as_list(&self) -> Result> { + match self { + Value::Null => Ok(None), + Value::List(v) => Ok(Some(v)), + other => error::CastTypeSnafu { + msg: format!("Failed to cast {:?} to list value", other), + } + .fail(), + } + } + + /// Cast itself to [ValueRef]. 
+ pub fn as_value_ref(&self) -> ValueRef { + match self { + Value::Null => ValueRef::Null, + Value::Boolean(v) => ValueRef::Boolean(*v), + Value::UInt8(v) => ValueRef::UInt8(*v), + Value::UInt16(v) => ValueRef::UInt16(*v), + Value::UInt32(v) => ValueRef::UInt32(*v), + Value::UInt64(v) => ValueRef::UInt64(*v), + Value::Int8(v) => ValueRef::Int8(*v), + Value::Int16(v) => ValueRef::Int16(*v), + Value::Int32(v) => ValueRef::Int32(*v), + Value::Int64(v) => ValueRef::Int64(*v), + Value::Float32(v) => ValueRef::Float32(*v), + Value::Float64(v) => ValueRef::Float64(*v), + Value::String(v) => ValueRef::String(v.as_utf8()), + Value::Binary(v) => ValueRef::Binary(v), + Value::Date(v) => ValueRef::Date(*v), + Value::DateTime(v) => ValueRef::DateTime(*v), + Value::List(v) => ValueRef::List(ListValueRef::Ref { val: v }), + Value::Timestamp(v) => ValueRef::Timestamp(*v), + } + } + + /// Returns the logical type of the value. + pub fn logical_type_id(&self) -> LogicalTypeId { + match self { + Value::Null => LogicalTypeId::Null, + Value::Boolean(_) => LogicalTypeId::Boolean, + Value::UInt8(_) => LogicalTypeId::UInt8, + Value::UInt16(_) => LogicalTypeId::UInt16, + Value::UInt32(_) => LogicalTypeId::UInt32, + Value::UInt64(_) => LogicalTypeId::UInt64, + Value::Int8(_) => LogicalTypeId::Int8, + Value::Int16(_) => LogicalTypeId::Int16, + Value::Int32(_) => LogicalTypeId::Int32, + Value::Int64(_) => LogicalTypeId::Int64, + Value::Float32(_) => LogicalTypeId::Float32, + Value::Float64(_) => LogicalTypeId::Float64, + Value::String(_) => LogicalTypeId::String, + Value::Binary(_) => LogicalTypeId::Binary, + Value::List(_) => LogicalTypeId::List, + Value::Date(_) => LogicalTypeId::Date, + Value::DateTime(_) => LogicalTypeId::DateTime, + Value::Timestamp(t) => match t.unit() { + TimeUnit::Second => LogicalTypeId::TimestampSecond, + TimeUnit::Millisecond => LogicalTypeId::TimestampMillisecond, + TimeUnit::Microsecond => LogicalTypeId::TimestampMicrosecond, + TimeUnit::Nanosecond => 
LogicalTypeId::TimestampNanosecond, + }, + } + } +} + +macro_rules! impl_ord_for_value_like { + ($Type: ident, $left: ident, $right: ident) => { + if $left.is_null() && !$right.is_null() { + return Ordering::Less; + } else if !$left.is_null() && $right.is_null() { + return Ordering::Greater; + } else { + match ($left, $right) { + ($Type::Null, $Type::Null) => Ordering::Equal, + ($Type::Boolean(v1), $Type::Boolean(v2)) => v1.cmp(v2), + ($Type::UInt8(v1), $Type::UInt8(v2)) => v1.cmp(v2), + ($Type::UInt16(v1), $Type::UInt16(v2)) => v1.cmp(v2), + ($Type::UInt32(v1), $Type::UInt32(v2)) => v1.cmp(v2), + ($Type::UInt64(v1), $Type::UInt64(v2)) => v1.cmp(v2), + ($Type::Int8(v1), $Type::Int8(v2)) => v1.cmp(v2), + ($Type::Int16(v1), $Type::Int16(v2)) => v1.cmp(v2), + ($Type::Int32(v1), $Type::Int32(v2)) => v1.cmp(v2), + ($Type::Int64(v1), $Type::Int64(v2)) => v1.cmp(v2), + ($Type::Float32(v1), $Type::Float32(v2)) => v1.cmp(v2), + ($Type::Float64(v1), $Type::Float64(v2)) => v1.cmp(v2), + ($Type::String(v1), $Type::String(v2)) => v1.cmp(v2), + ($Type::Binary(v1), $Type::Binary(v2)) => v1.cmp(v2), + ($Type::Date(v1), $Type::Date(v2)) => v1.cmp(v2), + ($Type::DateTime(v1), $Type::DateTime(v2)) => v1.cmp(v2), + ($Type::Timestamp(v1), $Type::Timestamp(v2)) => v1.cmp(v2), + ($Type::List(v1), $Type::List(v2)) => v1.cmp(v2), + _ => panic!( + "Cannot compare different values {:?} and {:?}", + $left, $right + ), + } + } + }; +} + +impl PartialOrd for Value { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Value { + fn cmp(&self, other: &Self) -> Ordering { + impl_ord_for_value_like!(Value, self, other) + } +} + +macro_rules! 
impl_value_from { + ($Variant: ident, $Type: ident) => { + impl From<$Type> for Value { + fn from(value: $Type) -> Self { + Value::$Variant(value.into()) + } + } + + impl From> for Value { + fn from(value: Option<$Type>) -> Self { + match value { + Some(v) => Value::$Variant(v.into()), + None => Value::Null, + } + } + } + }; +} + +impl_value_from!(Boolean, bool); +impl_value_from!(UInt8, u8); +impl_value_from!(UInt16, u16); +impl_value_from!(UInt32, u32); +impl_value_from!(UInt64, u64); +impl_value_from!(Int8, i8); +impl_value_from!(Int16, i16); +impl_value_from!(Int32, i32); +impl_value_from!(Int64, i64); +impl_value_from!(Float32, f32); +impl_value_from!(Float64, f64); +impl_value_from!(String, StringBytes); +impl_value_from!(Binary, Bytes); +impl_value_from!(Date, Date); +impl_value_from!(DateTime, DateTime); +impl_value_from!(Timestamp, Timestamp); + +impl From for Value { + fn from(string: String) -> Value { + Value::String(string.into()) + } +} + +impl From<&str> for Value { + fn from(string: &str) -> Value { + Value::String(string.into()) + } +} + +impl From> for Value { + fn from(bytes: Vec) -> Value { + Value::Binary(bytes.into()) + } +} + +impl From<&[u8]> for Value { + fn from(bytes: &[u8]) -> Value { + Value::Binary(bytes.into()) + } +} + +impl TryFrom for serde_json::Value { + type Error = serde_json::Error; + + fn try_from(value: Value) -> serde_json::Result { + let json_value = match value { + Value::Null => serde_json::Value::Null, + Value::Boolean(v) => serde_json::Value::Bool(v), + Value::UInt8(v) => serde_json::Value::from(v), + Value::UInt16(v) => serde_json::Value::from(v), + Value::UInt32(v) => serde_json::Value::from(v), + Value::UInt64(v) => serde_json::Value::from(v), + Value::Int8(v) => serde_json::Value::from(v), + Value::Int16(v) => serde_json::Value::from(v), + Value::Int32(v) => serde_json::Value::from(v), + Value::Int64(v) => serde_json::Value::from(v), + Value::Float32(v) => serde_json::Value::from(v.0), + Value::Float64(v) => 
serde_json::Value::from(v.0), + Value::String(bytes) => serde_json::Value::String(bytes.as_utf8().to_string()), + Value::Binary(bytes) => serde_json::to_value(bytes)?, + Value::Date(v) => serde_json::Value::Number(v.val().into()), + Value::DateTime(v) => serde_json::Value::Number(v.val().into()), + Value::List(v) => serde_json::to_value(v)?, + Value::Timestamp(v) => serde_json::to_value(v.value())?, + }; + + Ok(json_value) + } +} + +// TODO(yingwen): Consider removing the `datatype` field from `ListValue`. +/// List value. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ListValue { + /// List of nested Values (boxed to reduce size_of(Value)) + #[allow(clippy::box_collection)] + items: Option>>, + /// Inner values datatype, to distinguish empty lists of different datatypes. + /// Restricted by DataFusion, cannot use null datatype for empty list. + datatype: ConcreteDataType, +} + +impl Eq for ListValue {} + +impl ListValue { + pub fn new(items: Option>>, datatype: ConcreteDataType) -> Self { + Self { items, datatype } + } + + pub fn items(&self) -> &Option>> { + &self.items + } + + pub fn datatype(&self) -> &ConcreteDataType { + &self.datatype + } +} + +impl Default for ListValue { + fn default() -> ListValue { + ListValue::new(None, ConcreteDataType::null_datatype()) + } +} + +impl PartialOrd for ListValue { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ListValue { + fn cmp(&self, other: &Self) -> Ordering { + assert_eq!( + self.datatype, other.datatype, + "Cannot compare different datatypes!" 
+ ); + self.items.cmp(&other.items) + } +} + +impl TryFrom for Value { + type Error = error::Error; + + fn try_from(v: ScalarValue) -> Result { + let v = match v { + ScalarValue::Null => Value::Null, + ScalarValue::Boolean(b) => Value::from(b), + ScalarValue::Float32(f) => Value::from(f), + ScalarValue::Float64(f) => Value::from(f), + ScalarValue::Int8(i) => Value::from(i), + ScalarValue::Int16(i) => Value::from(i), + ScalarValue::Int32(i) => Value::from(i), + ScalarValue::Int64(i) => Value::from(i), + ScalarValue::UInt8(u) => Value::from(u), + ScalarValue::UInt16(u) => Value::from(u), + ScalarValue::UInt32(u) => Value::from(u), + ScalarValue::UInt64(u) => Value::from(u), + ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => { + Value::from(s.map(StringBytes::from)) + } + ScalarValue::Binary(b) + | ScalarValue::LargeBinary(b) + | ScalarValue::FixedSizeBinary(_, b) => Value::from(b.map(Bytes::from)), + ScalarValue::List(vs, field) => { + let items = if let Some(vs) = vs { + let vs = vs + .into_iter() + .map(ScalarValue::try_into) + .collect::>()?; + Some(Box::new(vs)) + } else { + None + }; + let datatype = ConcreteDataType::try_from(field.data_type())?; + Value::List(ListValue::new(items, datatype)) + } + ScalarValue::Date32(d) => d.map(|x| Value::Date(Date::new(x))).unwrap_or(Value::Null), + ScalarValue::Date64(d) => d + .map(|x| Value::DateTime(DateTime::new(x))) + .unwrap_or(Value::Null), + ScalarValue::TimestampSecond(t, _) => t + .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Second))) + .unwrap_or(Value::Null), + ScalarValue::TimestampMillisecond(t, _) => t + .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Millisecond))) + .unwrap_or(Value::Null), + ScalarValue::TimestampMicrosecond(t, _) => t + .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Microsecond))) + .unwrap_or(Value::Null), + ScalarValue::TimestampNanosecond(t, _) => t + .map(|x| Value::Timestamp(Timestamp::new(x, TimeUnit::Nanosecond))) + .unwrap_or(Value::Null), + 
ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { + return error::UnsupportedArrowTypeSnafu { + arrow_type: v.get_datatype(), + } + .fail() + } + }; + Ok(v) + } +} + +/// Reference to [Value]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ValueRef<'a> { + Null, + + // Numeric types: + Boolean(bool), + UInt8(u8), + UInt16(u16), + UInt32(u32), + UInt64(u64), + Int8(i8), + Int16(i16), + Int32(i32), + Int64(i64), + Float32(OrderedF32), + Float64(OrderedF64), + + // String types: + String(&'a str), + Binary(&'a [u8]), + + // Date & Time types: + Date(Date), + DateTime(DateTime), + Timestamp(Timestamp), + List(ListValueRef<'a>), +} + +macro_rules! impl_as_for_value_ref { + ($value: ident, $Variant: ident) => { + match $value { + ValueRef::Null => Ok(None), + ValueRef::$Variant(v) => Ok(Some(*v)), + other => error::CastTypeSnafu { + msg: format!( + "Failed to cast value ref {:?} to {}", + other, + stringify!($Variant) + ), + } + .fail(), + } + }; +} + +impl<'a> ValueRef<'a> { + /// Returns true if this is null. + pub fn is_null(&self) -> bool { + matches!(self, ValueRef::Null) + } + + /// Cast itself to binary slice. + pub fn as_binary(&self) -> Result> { + impl_as_for_value_ref!(self, Binary) + } + + /// Cast itself to string slice. + pub fn as_string(&self) -> Result> { + impl_as_for_value_ref!(self, String) + } + + /// Cast itself to boolean. + pub fn as_boolean(&self) -> Result> { + impl_as_for_value_ref!(self, Boolean) + } + + /// Cast itself to [Date]. + pub fn as_date(&self) -> Result> { + impl_as_for_value_ref!(self, Date) + } + + /// Cast itself to [DateTime]. 
+ pub fn as_datetime(&self) -> Result> { + impl_as_for_value_ref!(self, DateTime) + } + + pub fn as_timestamp(&self) -> Result> { + impl_as_for_value_ref!(self, Timestamp) + } + + /// Cast itself to [ListValueRef]. + pub fn as_list(&self) -> Result> { + impl_as_for_value_ref!(self, List) + } +} + +impl<'a> PartialOrd for ValueRef<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> Ord for ValueRef<'a> { + fn cmp(&self, other: &Self) -> Ordering { + impl_ord_for_value_like!(ValueRef, self, other) + } +} + +macro_rules! impl_value_ref_from { + ($Variant:ident, $Type:ident) => { + impl From<$Type> for ValueRef<'_> { + fn from(value: $Type) -> Self { + ValueRef::$Variant(value.into()) + } + } + + impl From> for ValueRef<'_> { + fn from(value: Option<$Type>) -> Self { + match value { + Some(v) => ValueRef::$Variant(v.into()), + None => ValueRef::Null, + } + } + } + }; +} + +impl_value_ref_from!(Boolean, bool); +impl_value_ref_from!(UInt8, u8); +impl_value_ref_from!(UInt16, u16); +impl_value_ref_from!(UInt32, u32); +impl_value_ref_from!(UInt64, u64); +impl_value_ref_from!(Int8, i8); +impl_value_ref_from!(Int16, i16); +impl_value_ref_from!(Int32, i32); +impl_value_ref_from!(Int64, i64); +impl_value_ref_from!(Float32, f32); +impl_value_ref_from!(Float64, f64); +impl_value_ref_from!(Date, Date); +impl_value_ref_from!(DateTime, DateTime); +impl_value_ref_from!(Timestamp, Timestamp); + +impl<'a> From<&'a str> for ValueRef<'a> { + fn from(string: &'a str) -> ValueRef<'a> { + ValueRef::String(string) + } +} + +impl<'a> From<&'a [u8]> for ValueRef<'a> { + fn from(bytes: &'a [u8]) -> ValueRef<'a> { + ValueRef::Binary(bytes) + } +} + +impl<'a> From>> for ValueRef<'a> { + fn from(list: Option) -> ValueRef { + match list { + Some(v) => ValueRef::List(v), + None => ValueRef::Null, + } + } +} + +/// Reference to a [ListValue]. 
+/// +/// Now comparison still requires some allocation (call of `to_value()`) and +/// might be avoidable by downcasting and comparing the underlying array slice +/// if it becomes bottleneck. +#[derive(Debug, Clone, Copy)] +pub enum ListValueRef<'a> { + // TODO(yingwen): Consider replace this by VectorRef. + Indexed { vector: &'a ListVector, idx: usize }, + Ref { val: &'a ListValue }, +} + +impl<'a> ListValueRef<'a> { + /// Convert self to [Value]. This method would clone the underlying data. + fn to_value(self) -> Value { + match self { + ListValueRef::Indexed { vector, idx } => vector.get(idx), + ListValueRef::Ref { val } => Value::List(val.clone()), + } + } +} + +impl<'a> PartialEq for ListValueRef<'a> { + fn eq(&self, other: &Self) -> bool { + self.to_value().eq(&other.to_value()) + } +} + +impl<'a> Eq for ListValueRef<'a> {} + +impl<'a> Ord for ListValueRef<'a> { + fn cmp(&self, other: &Self) -> Ordering { + // Respect the order of `Value` by converting into value before comparison. 
+ self.to_value().cmp(&other.to_value()) + } +} + +impl<'a> PartialOrd for ListValueRef<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType as ArrowDataType; + use num_traits::Float; + + use super::*; + + #[test] + fn test_try_from_scalar_value() { + assert_eq!( + Value::Boolean(true), + ScalarValue::Boolean(Some(true)).try_into().unwrap() + ); + assert_eq!( + Value::Boolean(false), + ScalarValue::Boolean(Some(false)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Boolean(None).try_into().unwrap()); + + assert_eq!( + Value::Float32(1.0f32.into()), + ScalarValue::Float32(Some(1.0f32)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Float32(None).try_into().unwrap()); + + assert_eq!( + Value::Float64(2.0f64.into()), + ScalarValue::Float64(Some(2.0f64)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Float64(None).try_into().unwrap()); + + assert_eq!( + Value::Int8(i8::MAX), + ScalarValue::Int8(Some(i8::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Int8(None).try_into().unwrap()); + + assert_eq!( + Value::Int16(i16::MAX), + ScalarValue::Int16(Some(i16::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Int16(None).try_into().unwrap()); + + assert_eq!( + Value::Int32(i32::MAX), + ScalarValue::Int32(Some(i32::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Int32(None).try_into().unwrap()); + + assert_eq!( + Value::Int64(i64::MAX), + ScalarValue::Int64(Some(i64::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Int64(None).try_into().unwrap()); + + assert_eq!( + Value::UInt8(u8::MAX), + ScalarValue::UInt8(Some(u8::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::UInt8(None).try_into().unwrap()); + + assert_eq!( + Value::UInt16(u16::MAX), + ScalarValue::UInt16(Some(u16::MAX)).try_into().unwrap() + ); + 
assert_eq!(Value::Null, ScalarValue::UInt16(None).try_into().unwrap()); + + assert_eq!( + Value::UInt32(u32::MAX), + ScalarValue::UInt32(Some(u32::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::UInt32(None).try_into().unwrap()); + + assert_eq!( + Value::UInt64(u64::MAX), + ScalarValue::UInt64(Some(u64::MAX)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::UInt64(None).try_into().unwrap()); + + assert_eq!( + Value::from("hello"), + ScalarValue::Utf8(Some("hello".to_string())) + .try_into() + .unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Utf8(None).try_into().unwrap()); + + assert_eq!( + Value::from("large_hello"), + ScalarValue::LargeUtf8(Some("large_hello".to_string())) + .try_into() + .unwrap() + ); + assert_eq!( + Value::Null, + ScalarValue::LargeUtf8(None).try_into().unwrap() + ); + + assert_eq!( + Value::from("world".as_bytes()), + ScalarValue::Binary(Some("world".as_bytes().to_vec())) + .try_into() + .unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Binary(None).try_into().unwrap()); + + assert_eq!( + Value::from("large_world".as_bytes()), + ScalarValue::LargeBinary(Some("large_world".as_bytes().to_vec())) + .try_into() + .unwrap() + ); + assert_eq!( + Value::Null, + ScalarValue::LargeBinary(None).try_into().unwrap() + ); + + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![Value::Int32(1), Value::Null])), + ConcreteDataType::int32_datatype() + )), + ScalarValue::new_list( + Some(vec![ScalarValue::Int32(Some(1)), ScalarValue::Int32(None)]), + ArrowDataType::Int32, + ) + .try_into() + .unwrap() + ); + assert_eq!( + Value::List(ListValue::new(None, ConcreteDataType::uint32_datatype())), + ScalarValue::new_list(None, ArrowDataType::UInt32) + .try_into() + .unwrap() + ); + + assert_eq!( + Value::Date(Date::new(123)), + ScalarValue::Date32(Some(123)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Date32(None).try_into().unwrap()); + + assert_eq!( + Value::DateTime(DateTime::new(456)), 
+ ScalarValue::Date64(Some(456)).try_into().unwrap() + ); + assert_eq!(Value::Null, ScalarValue::Date64(None).try_into().unwrap()); + + assert_eq!( + Value::Timestamp(Timestamp::new(1, TimeUnit::Second)), + ScalarValue::TimestampSecond(Some(1), None) + .try_into() + .unwrap() + ); + assert_eq!( + Value::Null, + ScalarValue::TimestampSecond(None, None).try_into().unwrap() + ); + + assert_eq!( + Value::Timestamp(Timestamp::new(1, TimeUnit::Millisecond)), + ScalarValue::TimestampMillisecond(Some(1), None) + .try_into() + .unwrap() + ); + assert_eq!( + Value::Null, + ScalarValue::TimestampMillisecond(None, None) + .try_into() + .unwrap() + ); + + assert_eq!( + Value::Timestamp(Timestamp::new(1, TimeUnit::Microsecond)), + ScalarValue::TimestampMicrosecond(Some(1), None) + .try_into() + .unwrap() + ); + assert_eq!( + Value::Null, + ScalarValue::TimestampMicrosecond(None, None) + .try_into() + .unwrap() + ); + + assert_eq!( + Value::Timestamp(Timestamp::new(1, TimeUnit::Nanosecond)), + ScalarValue::TimestampNanosecond(Some(1), None) + .try_into() + .unwrap() + ); + assert_eq!( + Value::Null, + ScalarValue::TimestampNanosecond(None, None) + .try_into() + .unwrap() + ); + + let result: Result = ScalarValue::Decimal128(Some(1), 0, 0).try_into(); + result + .unwrap_err() + .to_string() + .contains("Unsupported arrow data type, type: Decimal(0, 0)"); + } + + #[test] + fn test_value_from_inner() { + assert_eq!(Value::Boolean(true), Value::from(true)); + assert_eq!(Value::Boolean(false), Value::from(false)); + + assert_eq!(Value::UInt8(u8::MIN), Value::from(u8::MIN)); + assert_eq!(Value::UInt8(u8::MAX), Value::from(u8::MAX)); + + assert_eq!(Value::UInt16(u16::MIN), Value::from(u16::MIN)); + assert_eq!(Value::UInt16(u16::MAX), Value::from(u16::MAX)); + + assert_eq!(Value::UInt32(u32::MIN), Value::from(u32::MIN)); + assert_eq!(Value::UInt32(u32::MAX), Value::from(u32::MAX)); + + assert_eq!(Value::UInt64(u64::MIN), Value::from(u64::MIN)); + assert_eq!(Value::UInt64(u64::MAX), 
Value::from(u64::MAX)); + + assert_eq!(Value::Int8(i8::MIN), Value::from(i8::MIN)); + assert_eq!(Value::Int8(i8::MAX), Value::from(i8::MAX)); + + assert_eq!(Value::Int16(i16::MIN), Value::from(i16::MIN)); + assert_eq!(Value::Int16(i16::MAX), Value::from(i16::MAX)); + + assert_eq!(Value::Int32(i32::MIN), Value::from(i32::MIN)); + assert_eq!(Value::Int32(i32::MAX), Value::from(i32::MAX)); + + assert_eq!(Value::Int64(i64::MIN), Value::from(i64::MIN)); + assert_eq!(Value::Int64(i64::MAX), Value::from(i64::MAX)); + + assert_eq!( + Value::Float32(OrderedFloat(f32::MIN)), + Value::from(f32::MIN) + ); + assert_eq!( + Value::Float32(OrderedFloat(f32::MAX)), + Value::from(f32::MAX) + ); + + assert_eq!( + Value::Float64(OrderedFloat(f64::MIN)), + Value::from(f64::MIN) + ); + assert_eq!( + Value::Float64(OrderedFloat(f64::MAX)), + Value::from(f64::MAX) + ); + + let string_bytes = StringBytes::from("hello"); + assert_eq!( + Value::String(string_bytes.clone()), + Value::from(string_bytes) + ); + + let bytes = Bytes::from(b"world".as_slice()); + assert_eq!(Value::Binary(bytes.clone()), Value::from(bytes)); + } + + fn check_type_and_value(data_type: &ConcreteDataType, value: &Value) { + assert_eq!(*data_type, value.data_type()); + assert_eq!(data_type.logical_type_id(), value.logical_type_id()); + } + + #[test] + fn test_value_datatype() { + check_type_and_value(&ConcreteDataType::boolean_datatype(), &Value::Boolean(true)); + check_type_and_value(&ConcreteDataType::uint8_datatype(), &Value::UInt8(u8::MIN)); + check_type_and_value( + &ConcreteDataType::uint16_datatype(), + &Value::UInt16(u16::MIN), + ); + check_type_and_value( + &ConcreteDataType::uint16_datatype(), + &Value::UInt16(u16::MAX), + ); + check_type_and_value( + &ConcreteDataType::uint32_datatype(), + &Value::UInt32(u32::MIN), + ); + check_type_and_value( + &ConcreteDataType::uint64_datatype(), + &Value::UInt64(u64::MIN), + ); + check_type_and_value(&ConcreteDataType::int8_datatype(), &Value::Int8(i8::MIN)); + 
check_type_and_value(&ConcreteDataType::int16_datatype(), &Value::Int16(i16::MIN)); + check_type_and_value(&ConcreteDataType::int32_datatype(), &Value::Int32(i32::MIN)); + check_type_and_value(&ConcreteDataType::int64_datatype(), &Value::Int64(i64::MIN)); + check_type_and_value( + &ConcreteDataType::float32_datatype(), + &Value::Float32(OrderedFloat(f32::MIN)), + ); + check_type_and_value( + &ConcreteDataType::float64_datatype(), + &Value::Float64(OrderedFloat(f64::MIN)), + ); + check_type_and_value( + &ConcreteDataType::string_datatype(), + &Value::String(StringBytes::from("hello")), + ); + check_type_and_value( + &ConcreteDataType::binary_datatype(), + &Value::Binary(Bytes::from(b"world".as_slice())), + ); + check_type_and_value( + &ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), + &Value::List(ListValue::new( + Some(Box::new(vec![Value::Int32(10)])), + ConcreteDataType::int32_datatype(), + )), + ); + check_type_and_value( + &ConcreteDataType::list_datatype(ConcreteDataType::null_datatype()), + &Value::List(ListValue::default()), + ); + check_type_and_value( + &ConcreteDataType::date_datatype(), + &Value::Date(Date::new(1)), + ); + check_type_and_value( + &ConcreteDataType::datetime_datatype(), + &Value::DateTime(DateTime::new(1)), + ); + check_type_and_value( + &ConcreteDataType::timestamp_millisecond_datatype(), + &Value::Timestamp(Timestamp::from_millis(1)), + ); + } + + #[test] + fn test_value_from_string() { + let hello = "hello".to_string(); + assert_eq!( + Value::String(StringBytes::from(hello.clone())), + Value::from(hello) + ); + + let world = "world"; + assert_eq!(Value::String(StringBytes::from(world)), Value::from(world)); + } + + #[test] + fn test_value_from_bytes() { + let hello = b"hello".to_vec(); + assert_eq!( + Value::Binary(Bytes::from(hello.clone())), + Value::from(hello) + ); + + let world: &[u8] = b"world"; + assert_eq!(Value::Binary(Bytes::from(world)), Value::from(world)); + } + + fn to_json(value: Value) -> 
serde_json::Value { + value.try_into().unwrap() + } + + #[test] + fn test_to_json_value() { + assert_eq!(serde_json::Value::Null, to_json(Value::Null)); + assert_eq!(serde_json::Value::Bool(true), to_json(Value::Boolean(true))); + assert_eq!( + serde_json::Value::Number(20u8.into()), + to_json(Value::UInt8(20)) + ); + assert_eq!( + serde_json::Value::Number(20i8.into()), + to_json(Value::Int8(20)) + ); + assert_eq!( + serde_json::Value::Number(2000u16.into()), + to_json(Value::UInt16(2000)) + ); + assert_eq!( + serde_json::Value::Number(2000i16.into()), + to_json(Value::Int16(2000)) + ); + assert_eq!( + serde_json::Value::Number(3000u32.into()), + to_json(Value::UInt32(3000)) + ); + assert_eq!( + serde_json::Value::Number(3000i32.into()), + to_json(Value::Int32(3000)) + ); + assert_eq!( + serde_json::Value::Number(4000u64.into()), + to_json(Value::UInt64(4000)) + ); + assert_eq!( + serde_json::Value::Number(4000i64.into()), + to_json(Value::Int64(4000)) + ); + assert_eq!( + serde_json::Value::from(125.0f32), + to_json(Value::Float32(125.0.into())) + ); + assert_eq!( + serde_json::Value::from(125.0f64), + to_json(Value::Float64(125.0.into())) + ); + assert_eq!( + serde_json::Value::String(String::from("hello")), + to_json(Value::String(StringBytes::from("hello"))) + ); + assert_eq!( + serde_json::Value::from(b"world".as_slice()), + to_json(Value::Binary(Bytes::from(b"world".as_slice()))) + ); + assert_eq!( + serde_json::Value::Number(5000i32.into()), + to_json(Value::Date(Date::new(5000))) + ); + assert_eq!( + serde_json::Value::Number(5000i64.into()), + to_json(Value::DateTime(DateTime::new(5000))) + ); + + assert_eq!( + serde_json::Value::Number(1.into()), + to_json(Value::Timestamp(Timestamp::from_millis(1))) + ); + + let json_value: serde_json::Value = + serde_json::from_str(r#"{"items":[{"Int32":123}],"datatype":{"Int32":{}}}"#).unwrap(); + assert_eq!( + json_value, + to_json(Value::List(ListValue { + items: Some(Box::new(vec![Value::Int32(123)])), + datatype: 
ConcreteDataType::int32_datatype(), + })) + ); + } + + #[test] + fn test_null_value() { + assert!(Value::Null.is_null()); + assert!(!Value::Boolean(true).is_null()); + assert!(Value::Null < Value::Boolean(false)); + assert!(Value::Boolean(true) > Value::Null); + assert!(Value::Null < Value::Int32(10)); + assert!(Value::Int32(10) > Value::Null); + } + + #[test] + fn test_null_value_ref() { + assert!(ValueRef::Null.is_null()); + assert!(!ValueRef::Boolean(true).is_null()); + assert!(ValueRef::Null < ValueRef::Boolean(false)); + assert!(ValueRef::Boolean(true) > ValueRef::Null); + assert!(ValueRef::Null < ValueRef::Int32(10)); + assert!(ValueRef::Int32(10) > ValueRef::Null); + } + + #[test] + fn test_as_value_ref() { + macro_rules! check_as_value_ref { + ($Variant: ident, $data: expr) => { + let value = Value::$Variant($data); + let value_ref = value.as_value_ref(); + let expect_ref = ValueRef::$Variant($data); + + assert_eq!(expect_ref, value_ref); + }; + } + + assert_eq!(ValueRef::Null, Value::Null.as_value_ref()); + check_as_value_ref!(Boolean, true); + check_as_value_ref!(UInt8, 123); + check_as_value_ref!(UInt16, 123); + check_as_value_ref!(UInt32, 123); + check_as_value_ref!(UInt64, 123); + check_as_value_ref!(Int8, -12); + check_as_value_ref!(Int16, -12); + check_as_value_ref!(Int32, -12); + check_as_value_ref!(Int64, -12); + check_as_value_ref!(Float32, OrderedF32::from(16.0)); + check_as_value_ref!(Float64, OrderedF64::from(16.0)); + check_as_value_ref!(Timestamp, Timestamp::from_millis(1)); + + assert_eq!( + ValueRef::String("hello"), + Value::String("hello".into()).as_value_ref() + ); + assert_eq!( + ValueRef::Binary(b"hello"), + Value::Binary("hello".as_bytes().into()).as_value_ref() + ); + + check_as_value_ref!(Date, Date::new(103)); + check_as_value_ref!(DateTime, DateTime::new(1034)); + + let list = ListValue { + items: None, + datatype: ConcreteDataType::int32_datatype(), + }; + assert_eq!( + ValueRef::List(ListValueRef::Ref { val: &list }), + 
Value::List(list.clone()).as_value_ref() + ); + } + + #[test] + fn test_value_ref_as() { + macro_rules! check_as_null { + ($method: ident) => { + assert_eq!(None, ValueRef::Null.$method().unwrap()); + }; + } + + check_as_null!(as_binary); + check_as_null!(as_string); + check_as_null!(as_boolean); + check_as_null!(as_date); + check_as_null!(as_datetime); + check_as_null!(as_list); + + macro_rules! check_as_correct { + ($data: expr, $Variant: ident, $method: ident) => { + assert_eq!(Some($data), ValueRef::$Variant($data).$method().unwrap()); + }; + } + + check_as_correct!("hello", String, as_string); + check_as_correct!("hello".as_bytes(), Binary, as_binary); + check_as_correct!(true, Boolean, as_boolean); + check_as_correct!(Date::new(123), Date, as_date); + check_as_correct!(DateTime::new(12), DateTime, as_datetime); + let list = ListValue { + items: None, + datatype: ConcreteDataType::int32_datatype(), + }; + check_as_correct!(ListValueRef::Ref { val: &list }, List, as_list); + + let wrong_value = ValueRef::Int32(12345); + assert!(wrong_value.as_binary().is_err()); + assert!(wrong_value.as_string().is_err()); + assert!(wrong_value.as_boolean().is_err()); + assert!(wrong_value.as_date().is_err()); + assert!(wrong_value.as_datetime().is_err()); + assert!(wrong_value.as_list().is_err()); + } + + #[test] + fn test_display() { + assert_eq!(Value::Null.to_string(), "Null"); + assert_eq!(Value::UInt8(8).to_string(), "8"); + assert_eq!(Value::UInt16(16).to_string(), "16"); + assert_eq!(Value::UInt32(32).to_string(), "32"); + assert_eq!(Value::UInt64(64).to_string(), "64"); + assert_eq!(Value::Int8(-8).to_string(), "-8"); + assert_eq!(Value::Int16(-16).to_string(), "-16"); + assert_eq!(Value::Int32(-32).to_string(), "-32"); + assert_eq!(Value::Int64(-64).to_string(), "-64"); + assert_eq!(Value::Float32((-32.123).into()).to_string(), "-32.123"); + assert_eq!(Value::Float64((-64.123).into()).to_string(), "-64.123"); + 
assert_eq!(Value::Float64(OrderedF64::infinity()).to_string(), "inf"); + assert_eq!(Value::Float64(OrderedF64::nan()).to_string(), "NaN"); + assert_eq!(Value::String(StringBytes::from("123")).to_string(), "123"); + assert_eq!( + Value::Binary(Bytes::from(vec![1, 2, 3])).to_string(), + "010203" + ); + assert_eq!(Value::Date(Date::new(0)).to_string(), "1970-01-01"); + assert_eq!( + Value::DateTime(DateTime::new(0)).to_string(), + "1970-01-01 00:00:00" + ); + assert_eq!( + Value::Timestamp(Timestamp::new(1000, TimeUnit::Millisecond)).to_string(), + "1970-01-01 00:00:01+0000" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![Value::Int8(1), Value::Int8(2)])), + ConcreteDataType::int8_datatype(), + )) + .to_string(), + "Int8[1, 2]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_second_datatype(), + )) + .to_string(), + "TimestampSecondType[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_millisecond_datatype(), + )) + .to_string(), + "TimestampMillisecondType[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_microsecond_datatype(), + )) + .to_string(), + "TimestampMicrosecondType[]" + ); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![])), + ConcreteDataType::timestamp_nanosecond_datatype(), + )) + .to_string(), + "TimestampNanosecondType[]" + ); + } +} diff --git a/src/datatypes2/src/vectors.rs b/src/datatypes2/src/vectors.rs new file mode 100644 index 0000000000..38fa762d4b --- /dev/null +++ b/src/datatypes2/src/vectors.rs @@ -0,0 +1,309 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::fmt::Debug; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef}; +use snafu::ensure; + +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::serialize::Serializable; +use crate::value::{Value, ValueRef}; +use crate::vectors::operations::VectorOp; + +mod binary; +mod boolean; +mod constant; +mod date; +mod datetime; +mod eq; +mod helper; +mod list; +mod null; +mod operations; +mod primitive; +mod string; +mod timestamp; +mod validity; + +pub use binary::{BinaryVector, BinaryVectorBuilder}; +pub use boolean::{BooleanVector, BooleanVectorBuilder}; +pub use constant::ConstantVector; +pub use date::{DateVector, DateVectorBuilder}; +pub use datetime::{DateTimeVector, DateTimeVectorBuilder}; +pub use helper::Helper; +pub use list::{ListIter, ListVector, ListVectorBuilder}; +pub use null::{NullVector, NullVectorBuilder}; +pub use primitive::{ + Float32Vector, Float32VectorBuilder, Float64Vector, Float64VectorBuilder, Int16Vector, + Int16VectorBuilder, Int32Vector, Int32VectorBuilder, Int64Vector, Int64VectorBuilder, + Int8Vector, Int8VectorBuilder, PrimitiveIter, PrimitiveVector, PrimitiveVectorBuilder, + UInt16Vector, UInt16VectorBuilder, UInt32Vector, UInt32VectorBuilder, UInt64Vector, + UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder, +}; +pub use string::{StringVector, StringVectorBuilder}; +pub use timestamp::{ + TimestampMicrosecondVector, TimestampMicrosecondVectorBuilder, TimestampMillisecondVector, + TimestampMillisecondVectorBuilder, TimestampNanosecondVector, 
TimestampNanosecondVectorBuilder, + TimestampSecondVector, TimestampSecondVectorBuilder, +}; +pub use validity::Validity; + +// TODO(yingwen): arrow 28.0 implements Clone for all arrays, we could upgrade to it and simplify +// some codes in methods such as `to_arrow_array()` and `to_boxed_arrow_array()`. +/// Vector of data values. +pub trait Vector: Send + Sync + Serializable + Debug + VectorOp { + /// Returns the data type of the vector. + /// + /// This may require heap allocation. + fn data_type(&self) -> ConcreteDataType; + + fn vector_type_name(&self) -> String; + + /// Returns the vector as [Any](std::any::Any) so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; + + /// Returns number of elements in the vector. + fn len(&self) -> usize; + + /// Returns whether the vector is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Convert this vector to a new arrow [ArrayRef]. + fn to_arrow_array(&self) -> ArrayRef; + + /// Convert this vector to a new boxed arrow [Array]. + fn to_boxed_arrow_array(&self) -> Box; + + /// Returns the validity of the Array. + fn validity(&self) -> Validity; + + /// Returns the memory size of vector. + fn memory_size(&self) -> usize; + + /// The number of null slots on this [`Vector`]. + /// # Implementation + /// This is `O(1)`. + fn null_count(&self) -> usize; + + /// Returns true when it's a ConstantColumn + fn is_const(&self) -> bool { + false + } + + /// Returns whether row is null. + fn is_null(&self, row: usize) -> bool; + + /// If the only value vector can contain is NULL. + fn only_null(&self) -> bool { + self.null_count() == self.len() + } + + /// Slices the `Vector`, returning a new `VectorRef`. + /// + /// # Panics + /// This function panics if `offset + length > self.len()`. + fn slice(&self, offset: usize, length: usize) -> VectorRef; + + /// Returns the clone of value at `index`. + /// + /// # Panics + /// Panic if `index` is out of bound. 
+ fn get(&self, index: usize) -> Value; + + /// Returns the clone of value at `index` or error if `index` + /// is out of bound. + fn try_get(&self, index: usize) -> Result { + ensure!( + index < self.len(), + error::BadArrayAccessSnafu { + index, + size: self.len() + } + ); + Ok(self.get(index)) + } + + /// Returns the reference of value at `index`. + /// + /// # Panics + /// Panic if `index` is out of bound. + fn get_ref(&self, index: usize) -> ValueRef; +} + +pub type VectorRef = Arc; + +/// Mutable vector that could be used to build an immutable vector. +pub trait MutableVector: Send + Sync { + /// Returns the data type of the vector. + fn data_type(&self) -> ConcreteDataType; + + /// Returns the length of the vector. + fn len(&self) -> usize; + + /// Returns whether the vector is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Convert to Any, to enable dynamic casting. + fn as_any(&self) -> &dyn Any; + + /// Convert to mutable Any, to enable dynamic casting. + fn as_mut_any(&mut self) -> &mut dyn Any; + + /// Convert `self` to an (immutable) [VectorRef] and reset `self`. + fn to_vector(&mut self) -> VectorRef; + + /// Push value ref to this mutable vector. + /// + /// Returns error if data type unmatch. + fn push_value_ref(&mut self, value: ValueRef) -> Result<()>; + + /// Extend this mutable vector by slice of `vector`. + /// + /// Returns error if data type unmatch. + /// + /// # Panics + /// Panics if `offset + length > vector.len()`. + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()>; +} + +/// Helper to define `try_from_arrow_array(array: arrow::array::ArrayRef)` function. +macro_rules! 
impl_try_from_arrow_array_for_vector { + ($Array: ident, $Vector: ident) => { + impl $Vector { + pub fn try_from_arrow_array( + array: impl AsRef, + ) -> crate::error::Result<$Vector> { + use snafu::OptionExt; + + let data = array + .as_ref() + .as_any() + .downcast_ref::<$Array>() + .with_context(|| crate::error::ConversionSnafu { + from: std::format!("{:?}", array.as_ref().data_type()), + })? + .data() + .clone(); + + let concrete_array = $Array::from(data); + Ok($Vector::from(concrete_array)) + } + } + }; +} + +macro_rules! impl_validity_for_vector { + ($array: expr) => { + Validity::from_array_data($array.data()) + }; +} + +macro_rules! impl_get_for_vector { + ($array: expr, $index: ident) => { + if $array.is_valid($index) { + // Safety: The index have been checked by `is_valid()`. + unsafe { $array.value_unchecked($index).into() } + } else { + Value::Null + } + }; +} + +macro_rules! impl_get_ref_for_vector { + ($array: expr, $index: ident) => { + if $array.is_valid($index) { + // Safety: The index have been checked by `is_valid()`. + unsafe { $array.value_unchecked($index).into() } + } else { + ValueRef::Null + } + }; +} + +macro_rules! 
impl_extend_for_builder { + ($mutable_vector: expr, $vector: ident, $VectorType: ident, $offset: ident, $length: ident) => {{ + use snafu::OptionExt; + + let sliced_vector = $vector.slice($offset, $length); + let concrete_vector = sliced_vector + .as_any() + .downcast_ref::<$VectorType>() + .with_context(|| crate::error::CastTypeSnafu { + msg: format!( + "Failed to cast vector from {} to {}", + $vector.vector_type_name(), + stringify!($VectorType) + ), + })?; + for value in concrete_vector.iter_data() { + $mutable_vector.push(value); + } + Ok(()) + }}; +} + +pub(crate) use { + impl_extend_for_builder, impl_get_for_vector, impl_get_ref_for_vector, + impl_try_from_arrow_array_for_vector, impl_validity_for_vector, +}; + +#[cfg(test)] +pub mod tests { + use arrow::array::{Array, Int32Array, UInt8Array}; + use serde_json; + + use super::*; + use crate::data_type::DataType; + use crate::types::{Int32Type, LogicalPrimitiveType}; + use crate::vectors::helper::Helper; + + #[test] + fn test_df_columns_to_vector() { + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + let vector = Helper::try_into_vector(df_column).unwrap(); + assert_eq!( + Int32Type::build_data_type().as_arrow_type(), + vector.data_type().as_arrow_type() + ); + } + + #[test] + fn test_serialize_i32_vector() { + let df_column: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + let json_value = Helper::try_into_vector(df_column) + .unwrap() + .serialize_to_json() + .unwrap(); + assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); + } + + #[test] + fn test_serialize_i8_vector() { + let df_column: Arc = Arc::new(UInt8Array::from(vec![1, 2, 3])); + let json_value = Helper::try_into_vector(df_column) + .unwrap() + .serialize_to_json() + .unwrap(); + assert_eq!("[1,2,3]", serde_json::to_string(&json_value).unwrap()); + } +} diff --git a/src/datatypes2/src/vectors/binary.rs b/src/datatypes2/src/vectors/binary.rs new file mode 100644 index 0000000000..3b5defc8ec --- /dev/null +++ 
b/src/datatypes2/src/vectors/binary.rs @@ -0,0 +1,353 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; + +use crate::arrow_array::{BinaryArray, MutableBinaryArray}; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; +use crate::serialize::Serializable; +use crate::value::{Value, ValueRef}; +use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; + +/// Vector of binary strings. 
+#[derive(Debug, PartialEq)] +pub struct BinaryVector { + array: BinaryArray, +} + +impl BinaryVector { + pub(crate) fn as_arrow(&self) -> &dyn Array { + &self.array + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BinaryVector { + BinaryVector { + array: BinaryArray::from(data), + } + } +} + +impl From for BinaryVector { + fn from(array: BinaryArray) -> Self { + Self { array } + } +} + +impl From>>> for BinaryVector { + fn from(data: Vec>>) -> Self { + Self { + array: BinaryArray::from_iter(data), + } + } +} + +impl Vector for BinaryVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::binary_datatype() + } + + fn vector_type_name(&self) -> String { + "BinaryVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let data = self.to_array_data(); + Arc::new(BinaryArray::from(data)) + } + + fn to_boxed_arrow_array(&self) -> Box { + let data = self.to_array_data(); + Box::new(BinaryArray::from(data)) + } + + fn validity(&self) -> Validity { + vectors::impl_validity_for_vector!(self.array) + } + + fn memory_size(&self) -> usize { + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: usize, length: usize) -> VectorRef { + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) + } + + fn get(&self, index: usize) -> Value { + vectors::impl_get_for_vector!(self.array, index) + } + + fn get_ref(&self, index: usize) -> ValueRef { + vectors::impl_get_ref_for_vector!(self.array, index) + } +} + +impl ScalarVector for BinaryVector { + type OwnedItem = Vec; + type RefItem<'a> = &'a [u8]; + type Iter<'a> = ArrayIter<&'a BinaryArray>; + type Builder = BinaryVectorBuilder; + + fn 
get_data(&self, idx: usize) -> Option> { + if self.array.is_valid(idx) { + Some(self.array.value(idx)) + } else { + None + } + } + + fn iter_data(&self) -> Self::Iter<'_> { + self.array.iter() + } +} + +pub struct BinaryVectorBuilder { + mutable_array: MutableBinaryArray, +} + +impl MutableVector for BinaryVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::binary_datatype() + } + + fn len(&self) -> usize { + self.mutable_array.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + match value.as_binary()? { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + vectors::impl_extend_for_builder!(self, vector, BinaryVector, offset, length) + } +} + +impl ScalarVectorBuilder for BinaryVectorBuilder { + type VectorType = BinaryVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + mutable_array: MutableBinaryArray::with_capacity(capacity, 0), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } + } + + fn finish(&mut self) -> Self::VectorType { + BinaryVector { + array: self.mutable_array.finish(), + } + } +} + +impl Serializable for BinaryVector { + fn serialize_to_json(&self) -> Result> { + self.iter_data() + .map(|v| match v { + None => Ok(serde_json::Value::Null), // if binary vector not present, map to NULL + Some(vec) => serde_json::to_value(vec), + }) + .collect::>() + .context(error::SerializeSnafu) + } +} + +vectors::impl_try_from_arrow_array_for_vector!(BinaryArray, BinaryVector); + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType as 
ArrowDataType; + use common_base::bytes::Bytes; + use serde_json; + + use super::*; + use crate::arrow_array::BinaryArray; + use crate::data_type::DataType; + use crate::serialize::Serializable; + use crate::types::BinaryType; + + #[test] + fn test_binary_vector_misc() { + let v = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); + + assert_eq!(2, v.len()); + assert_eq!("BinaryVector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_valid()); + assert!(!v.only_null()); + assert_eq!(128, v.memory_size()); + + for i in 0..2 { + assert!(!v.is_null(i)); + assert_eq!(Value::Binary(Bytes::from(vec![1, 2, 3])), v.get(i)); + assert_eq!(ValueRef::Binary(&[1, 2, 3]), v.get_ref(i)); + } + + let arrow_arr = v.to_arrow_array(); + assert_eq!(2, arrow_arr.len()); + assert_eq!(&ArrowDataType::LargeBinary, arrow_arr.data_type()); + } + + #[test] + fn test_serialize_binary_vector_to_json() { + let vector = BinaryVector::from(BinaryArray::from_iter_values(&[ + vec![1, 2, 3], + vec![1, 2, 3], + ])); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[[1,2,3],[1,2,3]]", + serde_json::to_string(&json_value).unwrap() + ); + } + + #[test] + fn test_serialize_binary_vector_with_null_to_json() { + let mut builder = BinaryVectorBuilder::with_capacity(4); + builder.push(Some(&[1, 2, 3])); + builder.push(None); + builder.push(Some(&[4, 5, 6])); + let vector = builder.finish(); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[[1,2,3],null,[4,5,6]]", + serde_json::to_string(&json_value).unwrap() + ); + } + + #[test] + fn test_from_arrow_array() { + let arrow_array = BinaryArray::from_iter_values(&[vec![1, 2, 3], vec![1, 2, 3]]); + let original = BinaryArray::from(arrow_array.data().clone()); + let vector = BinaryVector::from(arrow_array); + assert_eq!(original, vector.array); + } + + #[test] + fn test_binary_vector_build_get() { + let mut builder = 
BinaryVectorBuilder::with_capacity(4); + builder.push(Some(b"hello")); + builder.push(Some(b"happy")); + builder.push(Some(b"world")); + builder.push(None); + + let vector = builder.finish(); + assert_eq!(b"hello", vector.get_data(0).unwrap()); + assert_eq!(None, vector.get_data(3)); + + assert_eq!(Value::Binary(b"hello".as_slice().into()), vector.get(0)); + assert_eq!(Value::Null, vector.get(3)); + + let mut iter = vector.iter_data(); + assert_eq!(b"hello", iter.next().unwrap().unwrap()); + assert_eq!(b"happy", iter.next().unwrap().unwrap()); + assert_eq!(b"world", iter.next().unwrap().unwrap()); + assert_eq!(None, iter.next().unwrap()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_binary_vector_validity() { + let mut builder = BinaryVectorBuilder::with_capacity(4); + builder.push(Some(b"hello")); + builder.push(Some(b"world")); + let vector = builder.finish(); + assert_eq!(0, vector.null_count()); + assert!(vector.validity().is_all_valid()); + + let mut builder = BinaryVectorBuilder::with_capacity(3); + builder.push(Some(b"hello")); + builder.push(None); + builder.push(Some(b"world")); + let vector = builder.finish(); + assert_eq!(1, vector.null_count()); + let validity = vector.validity(); + assert!(!validity.is_set(1)); + + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); + } + + #[test] + fn test_binary_vector_builder() { + let input = BinaryVector::from_slice(&[b"world", b"one", b"two"]); + + let mut builder = BinaryType::default().create_mutable_vector(3); + builder + .push_value_ref(ValueRef::Binary("hello".as_bytes())) + .unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(BinaryVector::from_slice(&[b"hello", b"one", b"two"])); + assert_eq!(expect, vector); + } +} diff 
--git a/src/datatypes2/src/vectors/boolean.rs b/src/datatypes2/src/vectors/boolean.rs new file mode 100644 index 0000000000..2b4e5b8e10 --- /dev/null +++ b/src/datatypes2/src/vectors/boolean.rs @@ -0,0 +1,371 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::borrow::Borrow; +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, BooleanArray, BooleanBuilder, +}; +use snafu::ResultExt; + +use crate::data_type::ConcreteDataType; +use crate::error::Result; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; +use crate::serialize::Serializable; +use crate::value::{Value, ValueRef}; +use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; + +/// Vector of boolean. 
+#[derive(Debug, PartialEq)] +pub struct BooleanVector { + array: BooleanArray, +} + +impl BooleanVector { + pub(crate) fn as_arrow(&self) -> &dyn Array { + &self.array + } + + pub(crate) fn as_boolean_array(&self) -> &BooleanArray { + &self.array + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> BooleanVector { + BooleanVector { + array: BooleanArray::from(data), + } + } + + pub(crate) fn false_count(&self) -> usize { + self.array.false_count() + } +} + +impl From> for BooleanVector { + fn from(data: Vec) -> Self { + BooleanVector { + array: BooleanArray::from(data), + } + } +} + +impl From for BooleanVector { + fn from(array: BooleanArray) -> Self { + Self { array } + } +} + +impl From>> for BooleanVector { + fn from(data: Vec>) -> Self { + BooleanVector { + array: BooleanArray::from(data), + } + } +} + +impl>> FromIterator for BooleanVector { + fn from_iter>(iter: I) -> Self { + BooleanVector { + array: BooleanArray::from_iter(iter), + } + } +} + +impl Vector for BooleanVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::boolean_datatype() + } + + fn vector_type_name(&self) -> String { + "BooleanVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let data = self.to_array_data(); + Arc::new(BooleanArray::from(data)) + } + + fn to_boxed_arrow_array(&self) -> Box { + let data = self.to_array_data(); + Box::new(BooleanArray::from(data)) + } + + fn validity(&self) -> Validity { + vectors::impl_validity_for_vector!(self.array) + } + + fn memory_size(&self) -> usize { + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: usize, length: usize) -> VectorRef { + let data = self.array.data().slice(offset, length); + 
Arc::new(Self::from_array_data(data)) + } + + fn get(&self, index: usize) -> Value { + vectors::impl_get_for_vector!(self.array, index) + } + + fn get_ref(&self, index: usize) -> ValueRef { + vectors::impl_get_ref_for_vector!(self.array, index) + } +} + +impl ScalarVector for BooleanVector { + type OwnedItem = bool; + type RefItem<'a> = bool; + type Iter<'a> = ArrayIter<&'a BooleanArray>; + type Builder = BooleanVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + if self.array.is_valid(idx) { + Some(self.array.value(idx)) + } else { + None + } + } + + fn iter_data(&self) -> Self::Iter<'_> { + self.array.iter() + } +} + +pub struct BooleanVectorBuilder { + mutable_array: BooleanBuilder, +} + +impl MutableVector for BooleanVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::boolean_datatype() + } + + fn len(&self) -> usize { + self.mutable_array.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + match value.as_boolean()? 
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + vectors::impl_extend_for_builder!(self, vector, BooleanVector, offset, length) + } +} + +impl ScalarVectorBuilder for BooleanVectorBuilder { + type VectorType = BooleanVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + mutable_array: BooleanBuilder::with_capacity(capacity), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } + } + + fn finish(&mut self) -> Self::VectorType { + BooleanVector { + array: self.mutable_array.finish(), + } + } +} + +impl Serializable for BooleanVector { + fn serialize_to_json(&self) -> Result> { + self.iter_data() + .map(serde_json::to_value) + .collect::>() + .context(crate::error::SerializeSnafu) + } +} + +vectors::impl_try_from_arrow_array_for_vector!(BooleanArray, BooleanVector); + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType as ArrowDataType; + use serde_json; + + use super::*; + use crate::data_type::DataType; + use crate::serialize::Serializable; + use crate::types::BooleanType; + + #[test] + fn test_boolean_vector_misc() { + let bools = vec![true, false, true, true, false, false, true, true, false]; + let v = BooleanVector::from(bools.clone()); + assert_eq!(9, v.len()); + assert_eq!("BooleanVector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_valid()); + assert!(!v.only_null()); + assert_eq!(64, v.memory_size()); + + for (i, b) in bools.iter().enumerate() { + assert!(!v.is_null(i)); + assert_eq!(Value::Boolean(*b), v.get(i)); + assert_eq!(ValueRef::Boolean(*b), v.get_ref(i)); + } + + let arrow_arr = v.to_arrow_array(); + assert_eq!(9, arrow_arr.len()); + assert_eq!(&ArrowDataType::Boolean, arrow_arr.data_type()); + } + + #[test] + 
fn test_serialize_boolean_vector_to_json() { + let vector = BooleanVector::from(vec![true, false, true, true, false, false]); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[true,false,true,true,false,false]", + serde_json::to_string(&json_value).unwrap(), + ); + } + + #[test] + fn test_serialize_boolean_vector_with_null_to_json() { + let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[true,null,false]", + serde_json::to_string(&json_value).unwrap(), + ); + } + + #[test] + fn test_boolean_vector_from_vec() { + let input = vec![false, true, false, true]; + let vec = BooleanVector::from(input.clone()); + assert_eq!(4, vec.len()); + for (i, v) in input.into_iter().enumerate() { + assert_eq!(Some(v), vec.get_data(i), "failed at {}", i) + } + } + + #[test] + fn test_boolean_vector_from_iter() { + let input = vec![Some(false), Some(true), Some(false), Some(true)]; + let vec = input.iter().collect::(); + assert_eq!(4, vec.len()); + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vec.get_data(i), "failed at {}", i) + } + } + + #[test] + fn test_boolean_vector_from_vec_option() { + let input = vec![Some(false), Some(true), None, Some(true)]; + let vec = BooleanVector::from(input.clone()); + assert_eq!(4, vec.len()); + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vec.get_data(i), "failed at {}", i) + } + } + + #[test] + fn test_boolean_vector_build_get() { + let input = [Some(true), None, Some(false)]; + let mut builder = BooleanVectorBuilder::with_capacity(3); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + assert_eq!(input.len(), vector.len()); + + let res: Vec<_> = vector.iter_data().collect(); + assert_eq!(input, &res[..]); + + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vector.get_data(i)); + assert_eq!(Value::from(v), vector.get(i)); + } + } + + #[test] + fn 
test_boolean_vector_validity() { + let vector = BooleanVector::from(vec![Some(true), None, Some(false)]); + assert_eq!(1, vector.null_count()); + let validity = vector.validity(); + assert_eq!(1, validity.null_count()); + assert!(!validity.is_set(1)); + + let vector = BooleanVector::from(vec![true, false, false]); + assert_eq!(0, vector.null_count()); + assert!(vector.validity().is_all_valid()); + } + + #[test] + fn test_boolean_vector_builder() { + let input = BooleanVector::from_slice(&[true, false, true]); + + let mut builder = BooleanType::default().create_mutable_vector(3); + builder.push_value_ref(ValueRef::Boolean(true)).unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(BooleanVector::from_slice(&[true, false, true])); + assert_eq!(expect, vector); + } +} diff --git a/src/datatypes2/src/vectors/constant.rs b/src/datatypes2/src/vectors/constant.rs new file mode 100644 index 0000000000..87739e9131 --- /dev/null +++ b/src/datatypes2/src/vectors/constant.rs @@ -0,0 +1,218 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef}; +use snafu::ResultExt; + +use crate::data_type::ConcreteDataType; +use crate::error::{Result, SerializeSnafu}; +use crate::serialize::Serializable; +use crate::value::{Value, ValueRef}; +use crate::vectors::{BooleanVector, Helper, Validity, Vector, VectorRef}; + +#[derive(Clone)] +pub struct ConstantVector { + length: usize, + vector: VectorRef, +} + +impl ConstantVector { + /// Create a new [ConstantVector]. + /// + /// # Panics + /// Panics if `vector.len() != 1`. + pub fn new(vector: VectorRef, length: usize) -> Self { + assert_eq!(1, vector.len()); + + // Avoid const recursion. + if vector.is_const() { + let vec: &ConstantVector = unsafe { Helper::static_cast(&vector) }; + return Self::new(vec.inner().clone(), length); + } + Self { vector, length } + } + + pub fn inner(&self) -> &VectorRef { + &self.vector + } + + /// Returns the constant value. + pub fn get_constant_ref(&self) -> ValueRef { + self.vector.get_ref(0) + } + + pub(crate) fn replicate_vector(&self, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), self.len()); + + if offsets.is_empty() { + return self.slice(0, 0); + } + + Arc::new(ConstantVector::new( + self.vector.clone(), + *offsets.last().unwrap(), + )) + } + + pub(crate) fn filter_vector(&self, filter: &BooleanVector) -> Result { + let length = self.len() - filter.false_count(); + if length == self.len() { + return Ok(Arc::new(self.clone())); + } + Ok(Arc::new(ConstantVector::new(self.inner().clone(), length))) + } +} + +impl Vector for ConstantVector { + fn data_type(&self) -> ConcreteDataType { + self.vector.data_type() + } + + fn vector_type_name(&self) -> String { + "ConstantVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.length + } + + fn to_arrow_array(&self) -> ArrayRef { + let v = self.vector.replicate(&[self.length]); + v.to_arrow_array() + } + + fn 
to_boxed_arrow_array(&self) -> Box { + let v = self.vector.replicate(&[self.length]); + v.to_boxed_arrow_array() + } + + fn is_const(&self) -> bool { + true + } + + fn validity(&self) -> Validity { + if self.vector.is_null(0) { + Validity::all_null(self.length) + } else { + Validity::all_valid(self.length) + } + } + + fn memory_size(&self) -> usize { + self.vector.memory_size() + } + + fn is_null(&self, _row: usize) -> bool { + self.vector.is_null(0) + } + + fn only_null(&self) -> bool { + self.vector.is_null(0) + } + + fn slice(&self, _offset: usize, length: usize) -> VectorRef { + Arc::new(Self { + vector: self.vector.clone(), + length, + }) + } + + fn get(&self, _index: usize) -> Value { + self.vector.get(0) + } + + fn get_ref(&self, _index: usize) -> ValueRef { + self.vector.get_ref(0) + } + + fn null_count(&self) -> usize { + if self.only_null() { + self.len() + } else { + 0 + } + } +} + +impl fmt::Debug for ConstantVector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "ConstantVector([{:?}; {}])", self.get(0), self.len()) + } +} + +impl Serializable for ConstantVector { + fn serialize_to_json(&self) -> Result> { + std::iter::repeat(self.get(0)) + .take(self.len()) + .map(serde_json::Value::try_from) + .collect::>() + .context(SerializeSnafu) + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType as ArrowDataType; + + use super::*; + use crate::vectors::Int32Vector; + + #[test] + fn test_constant_vector_misc() { + let a = Int32Vector::from_slice(vec![1]); + let c = ConstantVector::new(Arc::new(a), 10); + + assert_eq!("ConstantVector", c.vector_type_name()); + assert!(c.is_const()); + assert_eq!(10, c.len()); + assert!(c.validity().is_all_valid()); + assert!(!c.only_null()); + assert_eq!(64, c.memory_size()); + + for i in 0..10 { + assert!(!c.is_null(i)); + assert_eq!(Value::Int32(1), c.get(i)); + } + + let arrow_arr = c.to_arrow_array(); + assert_eq!(10, arrow_arr.len()); + assert_eq!(&ArrowDataType::Int32, 
arrow_arr.data_type()); + } + + #[test] + fn test_debug_null_array() { + let a = Int32Vector::from_slice(vec![1]); + let c = ConstantVector::new(Arc::new(a), 10); + + let s = format!("{:?}", c); + assert_eq!(s, "ConstantVector([Int32(1); 10])"); + } + + #[test] + fn test_serialize_json() { + let a = Int32Vector::from_slice(vec![1]); + let c = ConstantVector::new(Arc::new(a), 10); + + let s = serde_json::to_string(&c.serialize_to_json().unwrap()).unwrap(); + assert_eq!(s, "[1,1,1,1,1,1,1,1,1,1]"); + } +} diff --git a/src/datatypes2/src/vectors/date.rs b/src/datatypes2/src/vectors/date.rs new file mode 100644 index 0000000000..d0a66b80fb --- /dev/null +++ b/src/datatypes2/src/vectors/date.rs @@ -0,0 +1,103 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::types::DateType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; + +// Vector for [`Date`](common_time::Date). +pub type DateVector = PrimitiveVector; +// Builder to build DateVector. 
+pub type DateVectorBuilder = PrimitiveVectorBuilder; + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::array::Array; + use common_time::date::Date; + + use super::*; + use crate::data_type::DataType; + use crate::scalars::{ScalarVector, ScalarVectorBuilder}; + use crate::serialize::Serializable; + use crate::types::DateType; + use crate::value::{Value, ValueRef}; + use crate::vectors::{Vector, VectorRef}; + + #[test] + fn test_build_date_vector() { + let mut builder = DateVectorBuilder::with_capacity(4); + builder.push(Some(Date::new(1))); + builder.push(None); + builder.push(Some(Date::new(-1))); + let vector = builder.finish(); + assert_eq!(3, vector.len()); + assert_eq!(Value::Date(Date::new(1)), vector.get(0)); + assert_eq!(ValueRef::Date(Date::new(1)), vector.get_ref(0)); + assert_eq!(Some(Date::new(1)), vector.get_data(0)); + assert_eq!(None, vector.get_data(1)); + assert_eq!(Value::Null, vector.get(1)); + assert_eq!(ValueRef::Null, vector.get_ref(1)); + assert_eq!(Some(Date::new(-1)), vector.get_data(2)); + let mut iter = vector.iter_data(); + assert_eq!(Some(Date::new(1)), iter.next().unwrap()); + assert_eq!(None, iter.next().unwrap()); + assert_eq!(Some(Date::new(-1)), iter.next().unwrap()); + } + + #[test] + fn test_date_scalar() { + let vector = DateVector::from_slice(&[1, 2]); + assert_eq!(2, vector.len()); + assert_eq!(Some(Date::new(1)), vector.get_data(0)); + assert_eq!(Some(Date::new(2)), vector.get_data(1)); + } + + #[test] + fn test_date_vector_builder() { + let input = DateVector::from_slice(&[1, 2, 3]); + + let mut builder = DateType::default().create_mutable_vector(3); + builder + .push_value_ref(ValueRef::Date(Date::new(5))) + .unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = 
Arc::new(DateVector::from_slice(&[5, 2, 3])); + assert_eq!(expect, vector); + } + + #[test] + fn test_date_from_arrow() { + let vector = DateVector::from_slice(&[1, 2]); + let arrow = vector.as_arrow().slice(0, vector.len()); + let vector2 = DateVector::try_from_arrow_array(&arrow).unwrap(); + assert_eq!(vector, vector2); + } + + #[test] + fn test_serialize_date_vector() { + let vector = DateVector::from_slice(&[-1, 0, 1]); + let serialized_json = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!( + r#"["1969-12-31","1970-01-01","1970-01-02"]"#, + serialized_json + ); + } +} diff --git a/src/datatypes2/src/vectors/datetime.rs b/src/datatypes2/src/vectors/datetime.rs new file mode 100644 index 0000000000..a40a3e54d3 --- /dev/null +++ b/src/datatypes2/src/vectors/datetime.rs @@ -0,0 +1,116 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::types::DateTimeType; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; + +/// Vector of [`DateTime`](common_time::Date) +pub type DateTimeVector = PrimitiveVector; +/// Builder for [`DateTimeVector`]. 
+pub type DateTimeVectorBuilder = PrimitiveVectorBuilder; + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::array::{Array, PrimitiveArray}; + use common_time::DateTime; + use datafusion_common::from_slice::FromSlice; + + use super::*; + use crate::data_type::DataType; + use crate::prelude::{ + ConcreteDataType, ScalarVector, ScalarVectorBuilder, Value, ValueRef, Vector, VectorRef, + }; + use crate::serialize::Serializable; + + #[test] + fn test_datetime_vector() { + let v = DateTimeVector::new(PrimitiveArray::from_slice(&[1, 2, 3])); + assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); + assert_eq!(3, v.len()); + assert_eq!("DateTimeVector", v.vector_type_name()); + assert_eq!( + &arrow::datatypes::DataType::Date64, + v.to_arrow_array().data_type() + ); + + assert_eq!(Some(DateTime::new(1)), v.get_data(0)); + assert_eq!(Value::DateTime(DateTime::new(1)), v.get(0)); + assert_eq!(ValueRef::DateTime(DateTime::new(1)), v.get_ref(0)); + + let mut iter = v.iter_data(); + assert_eq!(Some(DateTime::new(1)), iter.next().unwrap()); + assert_eq!(Some(DateTime::new(2)), iter.next().unwrap()); + assert_eq!(Some(DateTime::new(3)), iter.next().unwrap()); + assert!(!v.is_null(0)); + assert_eq!(64, v.memory_size()); + + if let Value::DateTime(d) = v.get(0) { + assert_eq!(1, d.val()); + } else { + unreachable!() + } + assert_eq!( + "[\"1970-01-01 00:00:01\",\"1970-01-01 00:00:02\",\"1970-01-01 00:00:03\"]", + serde_json::to_string(&v.serialize_to_json().unwrap()).unwrap() + ); + } + + #[test] + fn test_datetime_vector_builder() { + let mut builder = DateTimeVectorBuilder::with_capacity(3); + builder.push(Some(DateTime::new(1))); + builder.push(None); + builder.push(Some(DateTime::new(-1))); + + let v = builder.finish(); + assert_eq!(ConcreteDataType::datetime_datatype(), v.data_type()); + assert_eq!(Value::DateTime(DateTime::new(1)), v.get(0)); + assert_eq!(Value::Null, v.get(1)); + assert_eq!(Value::DateTime(DateTime::new(-1)), v.get(2)); + + let 
input = DateTimeVector::from_wrapper_slice(&[ + DateTime::new(1), + DateTime::new(2), + DateTime::new(3), + ]); + + let mut builder = DateTimeType::default().create_mutable_vector(3); + builder + .push_value_ref(ValueRef::DateTime(DateTime::new(5))) + .unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(DateTimeVector::from_wrapper_slice(&[ + DateTime::new(5), + DateTime::new(2), + DateTime::new(3), + ])); + assert_eq!(expect, vector); + } + + #[test] + fn test_datetime_from_arrow() { + let vector = DateTimeVector::from_wrapper_slice(&[DateTime::new(1), DateTime::new(2)]); + let arrow = vector.as_arrow().slice(0, vector.len()); + let vector2 = DateTimeVector::try_from_arrow_array(&arrow).unwrap(); + assert_eq!(vector, vector2); + } +} diff --git a/src/datatypes2/src/vectors/eq.rs b/src/datatypes2/src/vectors/eq.rs new file mode 100644 index 0000000000..55359026d4 --- /dev/null +++ b/src/datatypes2/src/vectors/eq.rs @@ -0,0 +1,228 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use crate::data_type::DataType; +use crate::types::TimestampType; +use crate::vectors::constant::ConstantVector; +use crate::vectors::{ + BinaryVector, BooleanVector, DateTimeVector, DateVector, ListVector, PrimitiveVector, + StringVector, TimestampMicrosecondVector, TimestampMillisecondVector, + TimestampNanosecondVector, TimestampSecondVector, Vector, +}; +use crate::with_match_primitive_type_id; + +impl Eq for dyn Vector + '_ {} + +impl PartialEq for dyn Vector + '_ { + fn eq(&self, other: &dyn Vector) -> bool { + equal(self, other) + } +} + +impl PartialEq for Arc { + fn eq(&self, other: &dyn Vector) -> bool { + equal(&**self, other) + } +} + +macro_rules! is_vector_eq { + ($VectorType: ident, $lhs: ident, $rhs: ident) => {{ + let lhs = $lhs.as_any().downcast_ref::<$VectorType>().unwrap(); + let rhs = $rhs.as_any().downcast_ref::<$VectorType>().unwrap(); + + lhs == rhs + }}; +} + +fn equal(lhs: &dyn Vector, rhs: &dyn Vector) -> bool { + if lhs.data_type() != rhs.data_type() || lhs.len() != rhs.len() { + return false; + } + + if lhs.is_const() || rhs.is_const() { + // Length has been checked before, so we only need to compare inner + // vector here. 
+ return equal( + &**lhs + .as_any() + .downcast_ref::() + .unwrap() + .inner(), + &**lhs + .as_any() + .downcast_ref::() + .unwrap() + .inner(), + ); + } + + use crate::data_type::ConcreteDataType::*; + + let lhs_type = lhs.data_type(); + match lhs.data_type() { + Null(_) => true, + Boolean(_) => is_vector_eq!(BooleanVector, lhs, rhs), + Binary(_) => is_vector_eq!(BinaryVector, lhs, rhs), + String(_) => is_vector_eq!(StringVector, lhs, rhs), + Date(_) => is_vector_eq!(DateVector, lhs, rhs), + DateTime(_) => is_vector_eq!(DateTimeVector, lhs, rhs), + Timestamp(t) => match t { + TimestampType::Second(_) => { + is_vector_eq!(TimestampSecondVector, lhs, rhs) + } + TimestampType::Millisecond(_) => { + is_vector_eq!(TimestampMillisecondVector, lhs, rhs) + } + TimestampType::Microsecond(_) => { + is_vector_eq!(TimestampMicrosecondVector, lhs, rhs) + } + TimestampType::Nanosecond(_) => { + is_vector_eq!(TimestampNanosecondVector, lhs, rhs) + } + }, + List(_) => is_vector_eq!(ListVector, lhs, rhs), + UInt8(_) | UInt16(_) | UInt32(_) | UInt64(_) | Int8(_) | Int16(_) | Int32(_) | Int64(_) + | Float32(_) | Float64(_) => { + with_match_primitive_type_id!(lhs_type.logical_type_id(), |$T| { + let lhs = lhs.as_any().downcast_ref::>().unwrap(); + let rhs = rhs.as_any().downcast_ref::>().unwrap(); + + lhs == rhs + }, + { + unreachable!("should not compare {} with {}", lhs.vector_type_name(), rhs.vector_type_name()) + }) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::vectors::{ + list, Float32Vector, Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, + NullVector, UInt16Vector, UInt32Vector, UInt64Vector, UInt8Vector, VectorRef, + }; + + fn assert_vector_ref_eq(vector: VectorRef) { + let rhs = vector.clone(); + assert_eq!(vector, rhs); + assert_dyn_vector_eq(&*vector, &*rhs); + } + + fn assert_dyn_vector_eq(lhs: &dyn Vector, rhs: &dyn Vector) { + assert_eq!(lhs, rhs); + } + + fn assert_vector_ref_ne(lhs: VectorRef, rhs: VectorRef) { + 
assert_ne!(lhs, rhs); + } + + #[test] + fn test_vector_eq() { + assert_vector_ref_eq(Arc::new(BinaryVector::from(vec![ + Some(b"hello".to_vec()), + Some(b"world".to_vec()), + ]))); + assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); + assert_vector_ref_eq(Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![true])), + 5, + ))); + assert_vector_ref_eq(Arc::new(BooleanVector::from(vec![true, false]))); + assert_vector_ref_eq(Arc::new(DateVector::from(vec![Some(100), Some(120)]))); + assert_vector_ref_eq(Arc::new(DateTimeVector::from(vec![Some(100), Some(120)]))); + assert_vector_ref_eq(Arc::new(TimestampSecondVector::from_values([100, 120]))); + assert_vector_ref_eq(Arc::new(TimestampMillisecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampMicrosecondVector::from_values([ + 100, 120, + ]))); + assert_vector_ref_eq(Arc::new(TimestampNanosecondVector::from_values([100, 120]))); + + let list_vector = list::tests::new_list_vector(&[ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), Some(4)]), + ]); + assert_vector_ref_eq(Arc::new(list_vector)); + + assert_vector_ref_eq(Arc::new(NullVector::new(4))); + assert_vector_ref_eq(Arc::new(StringVector::from(vec![ + Some("hello"), + Some("world"), + ]))); + + assert_vector_ref_eq(Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(UInt8Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(Int16Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(UInt16Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(UInt32Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(Int64Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(UInt64Vector::from_slice(&[1, 2, 3, 4]))); + assert_vector_ref_eq(Arc::new(Float32Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]))); + 
assert_vector_ref_eq(Arc::new(Float64Vector::from_slice(&[1.0, 2.0, 3.0, 4.0]))); + } + + #[test] + fn test_vector_ne() { + assert_vector_ref_ne( + Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])), + Arc::new(Int32Vector::from_slice(&[1, 2])), + ); + assert_vector_ref_ne( + Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])), + Arc::new(Int8Vector::from_slice(&[1, 2, 3, 4])), + ); + assert_vector_ref_ne( + Arc::new(Int32Vector::from_slice(&[1, 2, 3, 4])), + Arc::new(BooleanVector::from(vec![true, true])), + ); + assert_vector_ref_ne( + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![true])), + 5, + )), + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![true])), + 4, + )), + ); + assert_vector_ref_ne( + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![true])), + 5, + )), + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![false])), + 4, + )), + ); + assert_vector_ref_ne( + Arc::new(ConstantVector::new( + Arc::new(BooleanVector::from(vec![true])), + 5, + )), + Arc::new(ConstantVector::new( + Arc::new(Int32Vector::from_slice(vec![1])), + 4, + )), + ); + assert_vector_ref_ne(Arc::new(NullVector::new(5)), Arc::new(NullVector::new(8))); + } +} diff --git a/src/datatypes2/src/vectors/helper.rs b/src/datatypes2/src/vectors/helper.rs new file mode 100644 index 0000000000..f3236ca0ec --- /dev/null +++ b/src/datatypes2/src/vectors/helper.rs @@ -0,0 +1,431 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Vector helper functions, inspired by databend Series mod + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, StringArray}; +use arrow::compute; +use arrow::compute::kernels::comparison; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; +use datafusion_common::ScalarValue; +use snafu::{OptionExt, ResultExt}; + +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{Scalar, ScalarVectorBuilder}; +use crate::value::{ListValue, ListValueRef}; +use crate::vectors::{ + BinaryVector, BooleanVector, ConstantVector, DateTimeVector, DateVector, Float32Vector, + Float64Vector, Int16Vector, Int32Vector, Int64Vector, Int8Vector, ListVector, + ListVectorBuilder, MutableVector, NullVector, StringVector, TimestampMicrosecondVector, + TimestampMillisecondVector, TimestampNanosecondVector, TimestampSecondVector, UInt16Vector, + UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef, +}; + +/// Helper functions for `Vector`. +pub struct Helper; + +impl Helper { + /// Get a pointer to the underlying data of this vectors. + /// Can be useful for fast comparisons. + /// # Safety + /// Assumes that the `vector` is T. 
+ pub unsafe fn static_cast(vector: &VectorRef) -> &T { + let object = vector.as_ref(); + debug_assert!(object.as_any().is::()); + &*(object as *const dyn Vector as *const T) + } + + pub fn check_get_scalar(vector: &VectorRef) -> Result<&::VectorType> { + let arr = vector + .as_any() + .downcast_ref::<::VectorType>() + .with_context(|| error::UnknownVectorSnafu { + msg: format!( + "downcast vector error, vector type: {:?}, expected vector: {:?}", + vector.vector_type_name(), + std::any::type_name::(), + ), + }); + arr + } + + pub fn check_get(vector: &VectorRef) -> Result<&T> { + let arr = vector + .as_any() + .downcast_ref::() + .with_context(|| error::UnknownVectorSnafu { + msg: format!( + "downcast vector error, vector type: {:?}, expected vector: {:?}", + vector.vector_type_name(), + std::any::type_name::(), + ), + }); + arr + } + + pub fn check_get_mutable_vector( + vector: &mut dyn MutableVector, + ) -> Result<&mut T> { + let ty = vector.data_type(); + let arr = vector + .as_mut_any() + .downcast_mut() + .with_context(|| error::UnknownVectorSnafu { + msg: format!( + "downcast vector error, vector type: {:?}, expected vector: {:?}", + ty, + std::any::type_name::(), + ), + }); + arr + } + + pub fn check_get_scalar_vector( + vector: &VectorRef, + ) -> Result<&::VectorType> { + let arr = vector + .as_any() + .downcast_ref::<::VectorType>() + .with_context(|| error::UnknownVectorSnafu { + msg: format!( + "downcast vector error, vector type: {:?}, expected vector: {:?}", + vector.vector_type_name(), + std::any::type_name::(), + ), + }); + arr + } + + /// Try to cast an arrow scalar value into vector + pub fn try_from_scalar_value(value: ScalarValue, length: usize) -> Result { + let vector = match value { + ScalarValue::Null => ConstantVector::new(Arc::new(NullVector::new(1)), length), + ScalarValue::Boolean(v) => { + ConstantVector::new(Arc::new(BooleanVector::from(vec![v])), length) + } + ScalarValue::Float32(v) => { + 
ConstantVector::new(Arc::new(Float32Vector::from(vec![v])), length) + } + ScalarValue::Float64(v) => { + ConstantVector::new(Arc::new(Float64Vector::from(vec![v])), length) + } + ScalarValue::Int8(v) => { + ConstantVector::new(Arc::new(Int8Vector::from(vec![v])), length) + } + ScalarValue::Int16(v) => { + ConstantVector::new(Arc::new(Int16Vector::from(vec![v])), length) + } + ScalarValue::Int32(v) => { + ConstantVector::new(Arc::new(Int32Vector::from(vec![v])), length) + } + ScalarValue::Int64(v) => { + ConstantVector::new(Arc::new(Int64Vector::from(vec![v])), length) + } + ScalarValue::UInt8(v) => { + ConstantVector::new(Arc::new(UInt8Vector::from(vec![v])), length) + } + ScalarValue::UInt16(v) => { + ConstantVector::new(Arc::new(UInt16Vector::from(vec![v])), length) + } + ScalarValue::UInt32(v) => { + ConstantVector::new(Arc::new(UInt32Vector::from(vec![v])), length) + } + ScalarValue::UInt64(v) => { + ConstantVector::new(Arc::new(UInt64Vector::from(vec![v])), length) + } + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { + ConstantVector::new(Arc::new(StringVector::from(vec![v])), length) + } + ScalarValue::Binary(v) + | ScalarValue::LargeBinary(v) + | ScalarValue::FixedSizeBinary(_, v) => { + ConstantVector::new(Arc::new(BinaryVector::from(vec![v])), length) + } + ScalarValue::List(v, field) => { + let item_type = ConcreteDataType::try_from(field.data_type())?; + let mut builder = ListVectorBuilder::with_type_capacity(item_type.clone(), 1); + if let Some(values) = v { + let values = values + .into_iter() + .map(ScalarValue::try_into) + .collect::>()?; + let list_value = ListValue::new(Some(Box::new(values)), item_type); + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + let list_vector = builder.to_vector(); + ConstantVector::new(list_vector, length) + } + ScalarValue::Date32(v) => { + ConstantVector::new(Arc::new(DateVector::from(vec![v])), length) + } + ScalarValue::Date64(v) => { + 
ConstantVector::new(Arc::new(DateTimeVector::from(vec![v])), length) + } + ScalarValue::TimestampSecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampSecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMillisecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMillisecondVector::from(vec![v])), length) + } + ScalarValue::TimestampMicrosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampMicrosecondVector::from(vec![v])), length) + } + ScalarValue::TimestampNanosecond(v, _) => { + // Timezone is unimplemented now. + ConstantVector::new(Arc::new(TimestampNanosecondVector::from(vec![v])), length) + } + ScalarValue::Decimal128(_, _, _) + | ScalarValue::Time64(_) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::IntervalMonthDayNano(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Dictionary(_, _) => { + return error::ConversionSnafu { + from: format!("Unsupported scalar value: {}", value), + } + .fail() + } + }; + + Ok(Arc::new(vector)) + } + + /// Try to cast an arrow array into vector + /// + /// # Panics + /// Panic if given arrow data type is not supported. 
+ pub fn try_into_vector(array: impl AsRef) -> Result { + Ok(match array.as_ref().data_type() { + ArrowDataType::Null => Arc::new(NullVector::try_from_arrow_array(array)?), + ArrowDataType::Boolean => Arc::new(BooleanVector::try_from_arrow_array(array)?), + ArrowDataType::LargeBinary => Arc::new(BinaryVector::try_from_arrow_array(array)?), + ArrowDataType::Int8 => Arc::new(Int8Vector::try_from_arrow_array(array)?), + ArrowDataType::Int16 => Arc::new(Int16Vector::try_from_arrow_array(array)?), + ArrowDataType::Int32 => Arc::new(Int32Vector::try_from_arrow_array(array)?), + ArrowDataType::Int64 => Arc::new(Int64Vector::try_from_arrow_array(array)?), + ArrowDataType::UInt8 => Arc::new(UInt8Vector::try_from_arrow_array(array)?), + ArrowDataType::UInt16 => Arc::new(UInt16Vector::try_from_arrow_array(array)?), + ArrowDataType::UInt32 => Arc::new(UInt32Vector::try_from_arrow_array(array)?), + ArrowDataType::UInt64 => Arc::new(UInt64Vector::try_from_arrow_array(array)?), + ArrowDataType::Float32 => Arc::new(Float32Vector::try_from_arrow_array(array)?), + ArrowDataType::Float64 => Arc::new(Float64Vector::try_from_arrow_array(array)?), + ArrowDataType::Utf8 => Arc::new(StringVector::try_from_arrow_array(array)?), + ArrowDataType::Date32 => Arc::new(DateVector::try_from_arrow_array(array)?), + ArrowDataType::Date64 => Arc::new(DateTimeVector::try_from_arrow_array(array)?), + ArrowDataType::List(_) => Arc::new(ListVector::try_from_arrow_array(array)?), + ArrowDataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => Arc::new(TimestampSecondVector::try_from_arrow_array(array)?), + TimeUnit::Millisecond => { + Arc::new(TimestampMillisecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Microsecond => { + Arc::new(TimestampMicrosecondVector::try_from_arrow_array(array)?) + } + TimeUnit::Nanosecond => { + Arc::new(TimestampNanosecondVector::try_from_arrow_array(array)?) 
+ } + }, + ArrowDataType::Float16 + | ArrowDataType::Time32(_) + | ArrowDataType::Time64(_) + | ArrowDataType::Duration(_) + | ArrowDataType::Interval(_) + | ArrowDataType::Binary + | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::LargeUtf8 + | ArrowDataType::LargeList(_) + | ArrowDataType::FixedSizeList(_, _) + | ArrowDataType::Struct(_) + | ArrowDataType::Union(_, _, _) + | ArrowDataType::Dictionary(_, _) + | ArrowDataType::Decimal128(_, _) + | ArrowDataType::Decimal256(_, _) + | ArrowDataType::Map(_, _) => { + unimplemented!("Arrow array datatype: {:?}", array.as_ref().data_type()) + } + }) + } + + /// Try to cast slice of `arrays` to vectors. + pub fn try_into_vectors(arrays: &[ArrayRef]) -> Result> { + arrays.iter().map(Self::try_into_vector).collect() + } + + /// Perform SQL like operation on `names` and a scalar `s`. + pub fn like_utf8(names: Vec, s: &str) -> Result { + let array = StringArray::from(names); + + let filter = comparison::like_utf8_scalar(&array, s).context(error::ArrowComputeSnafu)?; + + let result = compute::filter(&array, &filter).context(error::ArrowComputeSnafu)?; + Helper::try_into_vector(result) + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{ + ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, LargeBinaryArray, ListArray, NullArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + }; + use arrow::datatypes::{Field, Int32Type}; + use common_time::{Date, DateTime}; + + use super::*; + use crate::value::Value; + use crate::vectors::ConcreteDataType; + + #[test] + fn test_try_into_vectors() { + let arrays: Vec = vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(Int32Array::from(vec![2])), + Arc::new(Int32Array::from(vec![3])), + ]; + let vectors = Helper::try_into_vectors(&arrays); + assert!(vectors.is_ok()); + let vectors = 
vectors.unwrap(); + vectors.iter().for_each(|v| assert_eq!(1, v.len())); + assert_eq!(Value::Int32(1), vectors[0].get(0)); + assert_eq!(Value::Int32(2), vectors[1].get(0)); + assert_eq!(Value::Int32(3), vectors[2].get(0)); + } + + #[test] + fn test_try_into_date_vector() { + let vector = DateVector::from(vec![Some(1), Some(2), None]); + let arrow_array = vector.to_arrow_array(); + assert_eq!(&ArrowDataType::Date32, arrow_array.data_type()); + let vector_converted = Helper::try_into_vector(arrow_array).unwrap(); + assert_eq!(vector.len(), vector_converted.len()); + for i in 0..vector_converted.len() { + assert_eq!(vector.get(i), vector_converted.get(i)); + } + } + + #[test] + fn test_try_from_scalar_date_value() { + let vector = Helper::try_from_scalar_value(ScalarValue::Date32(Some(42)), 3).unwrap(); + assert_eq!(ConcreteDataType::date_datatype(), vector.data_type()); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + assert_eq!(Value::Date(Date::new(42)), vector.get(i)); + } + } + + #[test] + fn test_try_from_scalar_datetime_value() { + let vector = Helper::try_from_scalar_value(ScalarValue::Date64(Some(42)), 3).unwrap(); + assert_eq!(ConcreteDataType::datetime_datatype(), vector.data_type()); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + assert_eq!(Value::DateTime(DateTime::new(42)), vector.get(i)); + } + } + + #[test] + fn test_try_from_list_value() { + let value = ScalarValue::List( + Some(vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Int32(Some(2)), + ]), + Box::new(Field::new("item", ArrowDataType::Int32, true)), + ); + let vector = Helper::try_from_scalar_value(value, 3).unwrap(); + assert_eq!( + ConcreteDataType::list_datatype(ConcreteDataType::int32_datatype()), + vector.data_type() + ); + assert_eq!(3, vector.len()); + for i in 0..vector.len() { + let v = vector.get(i); + let items = v.as_list().unwrap().unwrap().items().as_ref().unwrap(); + assert_eq!(vec![Value::Int32(1), Value::Int32(2)], **items); + } + } + + #[test] + 
fn test_like_utf8() { + fn assert_vector(expected: Vec<&str>, actual: &VectorRef) { + let actual = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(*actual, StringVector::from(expected)); + } + + let names: Vec = vec!["greptime", "hello", "public", "world"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let ret = Helper::like_utf8(names.clone(), "%ll%").unwrap(); + assert_vector(vec!["hello"], &ret); + + let ret = Helper::like_utf8(names.clone(), "%time").unwrap(); + assert_vector(vec!["greptime"], &ret); + + let ret = Helper::like_utf8(names.clone(), "%ld").unwrap(); + assert_vector(vec!["world"], &ret); + + let ret = Helper::like_utf8(names, "%").unwrap(); + assert_vector(vec!["greptime", "hello", "public", "world"], &ret); + } + + fn check_try_into_vector(array: impl Array + 'static) { + let array: ArrayRef = Arc::new(array); + let vector = Helper::try_into_vector(array.clone()).unwrap(); + assert_eq!(&array, &vector.to_arrow_array()); + } + + #[test] + fn test_try_into_vector() { + check_try_into_vector(NullArray::new(2)); + check_try_into_vector(BooleanArray::from(vec![true, false])); + check_try_into_vector(LargeBinaryArray::from(vec![ + "hello".as_bytes(), + "world".as_bytes(), + ])); + check_try_into_vector(Int8Array::from(vec![1, 2, 3])); + check_try_into_vector(Int16Array::from(vec![1, 2, 3])); + check_try_into_vector(Int32Array::from(vec![1, 2, 3])); + check_try_into_vector(Int64Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt8Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt16Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt32Array::from(vec![1, 2, 3])); + check_try_into_vector(UInt64Array::from(vec![1, 2, 3])); + check_try_into_vector(Float32Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(Float64Array::from(vec![1.0, 2.0, 3.0])); + check_try_into_vector(StringArray::from(vec!["hello", "world"])); + check_try_into_vector(Date32Array::from(vec![1, 2, 3])); + 
check_try_into_vector(Date64Array::from(vec![1, 2, 3])); + let data = vec![None, Some(vec![Some(6), Some(7)])]; + let list_array = ListArray::from_iter_primitive::(data); + check_try_into_vector(list_array); + check_try_into_vector(TimestampSecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMillisecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampMicrosecondArray::from(vec![1, 2, 3])); + check_try_into_vector(TimestampNanosecondArray::from(vec![1, 2, 3])); + } +} diff --git a/src/datatypes2/src/vectors/list.rs b/src/datatypes2/src/vectors/list.rs new file mode 100644 index 0000000000..747e03557b --- /dev/null +++ b/src/datatypes2/src/vectors/list.rs @@ -0,0 +1,747 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayData, ArrayRef, BooleanBufferBuilder, Int32BufferBuilder, ListArray, +}; +use arrow::buffer::Buffer; +use arrow::datatypes::DataType as ArrowDataType; +use serde_json::Value as JsonValue; + +use crate::data_type::{ConcreteDataType, DataType}; +use crate::error::Result; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; +use crate::serialize::Serializable; +use crate::types::ListType; +use crate::value::{ListValue, ListValueRef, Value, ValueRef}; +use crate::vectors::{self, Helper, MutableVector, Validity, Vector, VectorRef}; + +/// Vector of Lists, basically backed by Arrow's `ListArray`. 
+#[derive(Debug, PartialEq)] +pub struct ListVector { + array: ListArray, + /// The datatype of the items in the list. + item_type: ConcreteDataType, +} + +impl ListVector { + /// Iterate elements as [VectorRef]. + pub fn values_iter(&self) -> impl Iterator>> + '_ { + self.array + .iter() + .map(|value_opt| value_opt.map(Helper::try_into_vector).transpose()) + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data_and_type(data: ArrayData, item_type: ConcreteDataType) -> Self { + Self { + array: ListArray::from(data), + item_type, + } + } + + pub(crate) fn as_arrow(&self) -> &dyn Array { + &self.array + } +} + +impl Vector for ListVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::List(ListType::new(self.item_type.clone())) + } + + fn vector_type_name(&self) -> String { + "ListVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let data = self.to_array_data(); + Arc::new(ListArray::from(data)) + } + + fn to_boxed_arrow_array(&self) -> Box { + let data = self.to_array_data(); + Box::new(ListArray::from(data)) + } + + fn validity(&self) -> Validity { + vectors::impl_validity_for_vector!(self.array) + } + + fn memory_size(&self) -> usize { + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: usize, length: usize) -> VectorRef { + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data_and_type(data, self.item_type.clone())) + } + + fn get(&self, index: usize) -> Value { + if !self.array.is_valid(index) { + return Value::Null; + } + + let array = &self.array.value(index); + let vector = Helper::try_into_vector(array).unwrap_or_else(|_| { + panic!( + "arrow array with datatype {:?} cannot converted to our 
vector", + array.data_type() + ) + }); + let values = (0..vector.len()) + .map(|i| vector.get(i)) + .collect::>(); + Value::List(ListValue::new( + Some(Box::new(values)), + self.item_type.clone(), + )) + } + + fn get_ref(&self, index: usize) -> ValueRef { + ValueRef::List(ListValueRef::Indexed { + vector: self, + idx: index, + }) + } +} + +impl Serializable for ListVector { + fn serialize_to_json(&self) -> Result> { + self.array + .iter() + .map(|v| match v { + None => Ok(JsonValue::Null), + Some(v) => Helper::try_into_vector(v) + .and_then(|v| v.serialize_to_json()) + .map(JsonValue::Array), + }) + .collect() + } +} + +impl From for ListVector { + fn from(array: ListArray) -> Self { + let item_type = ConcreteDataType::from_arrow_type(match array.data_type() { + ArrowDataType::List(field) => field.data_type(), + other => panic!( + "Try to create ListVector from an arrow array with type {:?}", + other + ), + }); + Self { array, item_type } + } +} + +vectors::impl_try_from_arrow_array_for_vector!(ListArray, ListVector); + +pub struct ListIter<'a> { + vector: &'a ListVector, + idx: usize, +} + +impl<'a> ListIter<'a> { + fn new(vector: &'a ListVector) -> ListIter { + ListIter { vector, idx: 0 } + } +} + +impl<'a> Iterator for ListIter<'a> { + type Item = Option>; + + #[inline] + fn next(&mut self) -> Option { + if self.idx >= self.vector.len() { + return None; + } + + let idx = self.idx; + self.idx += 1; + + if self.vector.is_null(idx) { + return Some(None); + } + + Some(Some(ListValueRef::Indexed { + vector: self.vector, + idx, + })) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.vector.len(), Some(self.vector.len())) + } +} + +impl ScalarVector for ListVector { + type OwnedItem = ListValue; + type RefItem<'a> = ListValueRef<'a>; + type Iter<'a> = ListIter<'a>; + type Builder = ListVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + if self.array.is_valid(idx) { + Some(ListValueRef::Indexed { vector: self, idx }) + } else { + None + 
} + } + + fn iter_data(&self) -> Self::Iter<'_> { + ListIter::new(self) + } +} + +// Ports from arrow's GenericListBuilder. +// See https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/generic_list_builder.rs +/// [ListVector] builder. +pub struct ListVectorBuilder { + item_type: ConcreteDataType, + offsets_builder: Int32BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: Box, +} + +impl ListVectorBuilder { + /// Creates a new [`ListVectorBuilder`]. `item_type` is the data type of the list item, `capacity` + /// is the number of items to pre-allocate space for in this builder. + pub fn with_type_capacity(item_type: ConcreteDataType, capacity: usize) -> ListVectorBuilder { + let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); + offsets_builder.append(0); + // The actual required capacity might be greater than the capacity of the `ListVector` + // if the child vector has more than one element. + let values_builder = item_type.create_mutable_vector(capacity); + + ListVectorBuilder { + item_type, + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + } + } + + /// Finish the current variable-length list vector slot. 
+ fn finish_list(&mut self, is_valid: bool) { + self.offsets_builder + .append(i32::try_from(self.values_builder.len()).unwrap()); + self.null_buffer_builder.append(is_valid); + } + + fn push_null(&mut self) { + self.finish_list(false); + } + + fn push_list_value(&mut self, list_value: &ListValue) -> Result<()> { + if let Some(items) = list_value.items() { + for item in &**items { + self.values_builder.push_value_ref(item.as_value_ref())?; + } + } + + self.finish_list(true); + Ok(()) + } +} + +impl MutableVector for ListVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::list_datatype(self.item_type.clone()) + } + + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + if let Some(list_ref) = value.as_list()? { + match list_ref { + ListValueRef::Indexed { vector, idx } => match vector.get(idx).as_list()? { + Some(list_value) => self.push_list_value(list_value)?, + None => self.push_null(), + }, + ListValueRef::Ref { val } => self.push_list_value(val)?, + } + } else { + self.push_null(); + } + + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + for idx in offset..offset + length { + let value = vector.get_ref(idx); + self.push_value_ref(value)?; + } + + Ok(()) + } +} + +impl ScalarVectorBuilder for ListVectorBuilder { + type VectorType = ListVector; + + fn with_capacity(_capacity: usize) -> Self { + panic!("Must use ListVectorBuilder::with_type_capacity()"); + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + // We expect the input ListValue has the same inner type as the builder when using + // push(), so just panic if `push_value_ref()` returns error, which indicate an + // invalid input value type. 
+ self.push_value_ref(value.into()).unwrap_or_else(|e| { + panic!( + "Failed to push value, expect value type {:?}, err:{}", + self.item_type, e + ); + }); + } + + fn finish(&mut self) -> Self::VectorType { + let len = self.len(); + let values_vector = self.values_builder.to_vector(); + let values_arr = values_vector.to_arrow_array(); + let values_data = values_arr.data(); + + let offset_buffer = self.offsets_builder.finish(); + let null_bit_buffer = self.null_buffer_builder.finish(); + // Re-initialize the offsets_builder. + self.offsets_builder.append(0); + let data_type = ConcreteDataType::list_datatype(self.item_type.clone()).as_arrow_type(); + let array_data_builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data.clone()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data_builder.build_unchecked() }; + let array = ListArray::from(array_data); + + ListVector { + array, + item_type: self.item_type.clone(), + } + } +} + +// Ports from https://github.com/apache/arrow-rs/blob/94565bca99b5d9932a3e9a8e094aaf4e4384b1e5/arrow-array/src/builder/null_buffer_builder.rs +/// Builder for creating the null bit buffer. +/// This builder only materializes the buffer when we append `false`. +/// If you only append `true`s to the builder, what you get will be +/// `None` when calling [`finish`](#method.finish). +/// This optimization is **very** important for the performance. +#[derive(Debug)] +struct NullBufferBuilder { + bitmap_builder: Option, + /// Store the length of the buffer before materializing. + len: usize, + capacity: usize, +} + +impl NullBufferBuilder { + /// Creates a new empty builder. + /// `capacity` is the number of bits in the null buffer. 
+ fn new(capacity: usize) -> Self { + Self { + bitmap_builder: None, + len: 0, + capacity, + } + } + + fn len(&self) -> usize { + if let Some(b) = &self.bitmap_builder { + b.len() + } else { + self.len + } + } + + /// Appends a `true` into the builder + /// to indicate that this item is not null. + #[inline] + fn append_non_null(&mut self) { + if let Some(buf) = self.bitmap_builder.as_mut() { + buf.append(true) + } else { + self.len += 1; + } + } + + /// Appends a `false` into the builder + /// to indicate that this item is null. + #[inline] + fn append_null(&mut self) { + self.materialize_if_needed(); + self.bitmap_builder.as_mut().unwrap().append(false); + } + + /// Appends a boolean value into the builder. + #[inline] + fn append(&mut self, not_null: bool) { + if not_null { + self.append_non_null() + } else { + self.append_null() + } + } + + /// Builds the null buffer and resets the builder. + /// Returns `None` if the builder only contains `true`s. + fn finish(&mut self) -> Option { + let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); + self.bitmap_builder = None; + self.len = 0; + buf + } + + #[inline] + fn materialize_if_needed(&mut self) { + if self.bitmap_builder.is_none() { + self.materialize() + } + } + + #[cold] + fn materialize(&mut self) { + if self.bitmap_builder.is_none() { + let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity)); + b.append_n(self.len, true); + self.bitmap_builder = Some(b); + } + } +} + +#[cfg(test)] +pub mod tests { + use arrow::array::{Int32Array, Int32Builder, ListBuilder}; + use serde_json::json; + + use super::*; + use crate::scalars::ScalarRef; + use crate::types::ListType; + use crate::vectors::Int32Vector; + + pub fn new_list_vector(data: &[Option>>]) -> ListVector { + let mut builder = + ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 8); + for vec_opt in data { + if let Some(vec) = vec_opt { + let values = vec.iter().map(|v| Value::from(*v)).collect(); + let values = 
Some(Box::new(values)); + let list_value = ListValue::new(values, ConcreteDataType::int32_datatype()); + + builder.push(Some(ListValueRef::Ref { val: &list_value })); + } else { + builder.push(None); + } + } + + builder.finish() + } + + fn new_list_array(data: &[Option>>]) -> ListArray { + let mut builder = ListBuilder::new(Int32Builder::new()); + for vec_opt in data { + if let Some(vec) = vec_opt { + for value_opt in vec { + builder.values().append_option(*value_opt); + } + + builder.append(true); + } else { + builder.append(false); + } + } + + builder.finish() + } + + #[test] + fn test_list_vector() { + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + + let list_vector = new_list_vector(&data); + + assert_eq!( + ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), + list_vector.data_type() + ); + assert_eq!("ListVector", list_vector.vector_type_name()); + assert_eq!(3, list_vector.len()); + assert!(!list_vector.is_null(0)); + assert!(list_vector.is_null(1)); + assert!(!list_vector.is_null(2)); + + let arrow_array = new_list_array(&data); + assert_eq!( + arrow_array, + *list_vector + .to_arrow_array() + .as_any() + .downcast_ref::() + .unwrap() + ); + let validity = list_vector.validity(); + assert!(!validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert!(validity.is_set(0)); + assert!(!validity.is_set(1)); + assert!(validity.is_set(2)); + assert_eq!(256, list_vector.memory_size()); + + let slice = list_vector.slice(0, 2).to_arrow_array(); + let sliced_array = slice.as_any().downcast_ref::().unwrap(); + assert_eq!( + Int32Array::from_iter_values([1, 2, 3]), + *sliced_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + ); + assert!(sliced_array.is_null(1)); + + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![ + Value::Int32(1), + Value::Int32(2), + Value::Int32(3) + ])), + ConcreteDataType::int32_datatype() + )), + list_vector.get(0) + ); + 
let value_ref = list_vector.get_ref(0); + assert!(matches!( + value_ref, + ValueRef::List(ListValueRef::Indexed { .. }) + )); + let value_ref = list_vector.get_ref(1); + if let ValueRef::List(ListValueRef::Indexed { idx, .. }) = value_ref { + assert_eq!(1, idx); + } else { + unreachable!() + } + assert_eq!(Value::Null, list_vector.get(1)); + assert_eq!( + Value::List(ListValue::new( + Some(Box::new(vec![ + Value::Int32(4), + Value::Null, + Value::Int32(6) + ])), + ConcreteDataType::int32_datatype() + )), + list_vector.get(2) + ); + } + + #[test] + fn test_from_arrow_array() { + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + + let arrow_array = new_list_array(&data); + let array_ref: ArrayRef = Arc::new(arrow_array); + let expect = new_list_vector(&data); + + // Test try from ArrayRef + let list_vector = ListVector::try_from_arrow_array(array_ref).unwrap(); + assert_eq!(expect, list_vector); + + // Test from + let arrow_array = new_list_array(&data); + let list_vector = ListVector::from(arrow_array); + assert_eq!(expect, list_vector); + } + + #[test] + fn test_iter_list_vector_values() { + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + + let list_vector = new_list_vector(&data); + + assert_eq!( + ConcreteDataType::List(ListType::new(ConcreteDataType::int32_datatype())), + list_vector.data_type() + ); + let mut iter = list_vector.values_iter(); + assert_eq!( + Arc::new(Int32Vector::from_slice(&[1, 2, 3])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap() + ); + assert!(iter.next().unwrap().unwrap().is_none()); + assert_eq!( + Arc::new(Int32Vector::from(vec![Some(4), None, Some(6)])) as VectorRef, + *iter.next().unwrap().unwrap().unwrap(), + ); + assert!(iter.next().is_none()) + } + + #[test] + fn test_serialize_to_json() { + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), None, Some(6)]), + ]; + + 
let list_vector = new_list_vector(&data); + assert_eq!( + vec![json!([1, 2, 3]), json!(null), json!([4, null, 6]),], + list_vector.serialize_to_json().unwrap() + ); + } + + #[test] + fn test_list_vector_builder() { + let mut builder = + ListType::new(ConcreteDataType::int32_datatype()).create_mutable_vector(3); + builder + .push_value_ref(ValueRef::List(ListValueRef::Ref { + val: &ListValue::new( + Some(Box::new(vec![ + Value::Int32(4), + Value::Null, + Value::Int32(6), + ])), + ConcreteDataType::int32_datatype(), + ), + })) + .unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + + let data = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ]; + let input = new_list_vector(&data); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(new_list_vector(&[ + Some(vec![Some(4), None, Some(6)]), + None, + Some(vec![Some(7), Some(8), None]), + ])); + assert_eq!(expect, vector); + } + + #[test] + fn test_list_vector_for_scalar() { + let mut builder = + ListVectorBuilder::with_type_capacity(ConcreteDataType::int32_datatype(), 2); + builder.push(None); + builder.push(Some(ListValueRef::Ref { + val: &ListValue::new( + Some(Box::new(vec![ + Value::Int32(4), + Value::Null, + Value::Int32(6), + ])), + ConcreteDataType::int32_datatype(), + ), + })); + let vector = builder.finish(); + + let expect = new_list_vector(&[None, Some(vec![Some(4), None, Some(6)])]); + assert_eq!(expect, vector); + + assert!(vector.get_data(0).is_none()); + assert_eq!( + ListValueRef::Indexed { + vector: &vector, + idx: 1 + }, + vector.get_data(1).unwrap() + ); + assert_eq!( + *vector.get(1).as_list().unwrap().unwrap(), + vector.get_data(1).unwrap().to_owned_scalar() + ); + + let mut iter = vector.iter_data(); + assert!(iter.next().unwrap().is_none()); + 
assert_eq!( + ListValueRef::Indexed { + vector: &vector, + idx: 1 + }, + iter.next().unwrap().unwrap() + ); + assert!(iter.next().is_none()); + + let mut iter = vector.iter_data(); + assert_eq!(2, iter.size_hint().0); + assert_eq!( + ListValueRef::Indexed { + vector: &vector, + idx: 1 + }, + iter.nth(1).unwrap().unwrap() + ); + } +} diff --git a/src/datatypes2/src/vectors/null.rs b/src/datatypes2/src/vectors/null.rs new file mode 100644 index 0000000000..bb66e09b39 --- /dev/null +++ b/src/datatypes2/src/vectors/null.rs @@ -0,0 +1,282 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::array::{Array, ArrayData, ArrayRef, NullArray}; +use snafu::{ensure, OptionExt}; + +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::serialize::Serializable; +use crate::types::NullType; +use crate::value::{Value, ValueRef}; +use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; + +/// A vector where all elements are nulls. +#[derive(PartialEq)] +pub struct NullVector { + array: NullArray, +} + +// TODO(yingwen): Support null vector with other logical types. +impl NullVector { + /// Create a new `NullVector` with `n` elements. 
+ pub fn new(n: usize) -> Self { + Self { + array: NullArray::new(n), + } + } + + pub(crate) fn as_arrow(&self) -> &dyn Array { + &self.array + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } +} + +impl From for NullVector { + fn from(array: NullArray) -> Self { + Self { array } + } +} + +impl Vector for NullVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::Null(NullType::default()) + } + + fn vector_type_name(&self) -> String { + "NullVector".to_string() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + // TODO(yingwen): Replaced by clone after upgrading to arrow 28.0. + let data = self.to_array_data(); + Arc::new(NullArray::from(data)) + } + + fn to_boxed_arrow_array(&self) -> Box { + let data = self.to_array_data(); + Box::new(NullArray::from(data)) + } + + fn validity(&self) -> Validity { + Validity::all_null(self.array.len()) + } + + fn memory_size(&self) -> usize { + 0 + } + + fn null_count(&self) -> usize { + self.array.null_count() + } + + fn is_null(&self, _row: usize) -> bool { + true + } + + fn only_null(&self) -> bool { + true + } + + fn slice(&self, _offset: usize, length: usize) -> VectorRef { + Arc::new(Self::new(length)) + } + + fn get(&self, _index: usize) -> Value { + // Skips bound check for null array. + Value::Null + } + + fn get_ref(&self, _index: usize) -> ValueRef { + // Skips bound check for null array. 
+ ValueRef::Null + } +} + +impl fmt::Debug for NullVector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "NullVector({})", self.len()) + } +} + +impl Serializable for NullVector { + fn serialize_to_json(&self) -> Result> { + Ok(std::iter::repeat(serde_json::Value::Null) + .take(self.len()) + .collect()) + } +} + +vectors::impl_try_from_arrow_array_for_vector!(NullArray, NullVector); + +#[derive(Default)] +pub struct NullVectorBuilder { + length: usize, +} + +impl MutableVector for NullVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::null_datatype() + } + + fn len(&self) -> usize { + self.length + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + let vector = Arc::new(NullVector::new(self.length)); + self.length = 0; + vector + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + ensure!( + value.is_null(), + error::CastTypeSnafu { + msg: format!("Failed to cast value ref {:?} to null", value), + } + ); + + self.length += 1; + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + vector + .as_any() + .downcast_ref::() + .with_context(|| error::CastTypeSnafu { + msg: format!( + "Failed to convert vector from {} to NullVector", + vector.vector_type_name() + ), + })?; + assert!( + offset + length <= vector.len(), + "offset {} + length {} must less than {}", + offset, + length, + vector.len() + ); + + self.length += length; + Ok(()) + } +} + +pub(crate) fn replicate_null(vector: &NullVector, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), vector.len()); + + if offsets.is_empty() { + return vector.slice(0, 0); + } + + Arc::new(NullVector::new(*offsets.last().unwrap())) +} + +#[cfg(test)] +mod tests { + use serde_json; + + use super::*; + use crate::data_type::DataType; + + #[test] + fn test_null_vector_misc() { + let v = 
NullVector::new(32); + + assert_eq!(v.len(), 32); + assert_eq!(0, v.memory_size()); + let arrow_arr = v.to_arrow_array(); + assert_eq!(arrow_arr.null_count(), 32); + + let array2 = arrow_arr.slice(8, 16); + assert_eq!(array2.len(), 16); + assert_eq!(array2.null_count(), 16); + + assert_eq!("NullVector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_null()); + assert!(v.only_null()); + + for i in 0..32 { + assert!(v.is_null(i)); + assert_eq!(Value::Null, v.get(i)); + assert_eq!(ValueRef::Null, v.get_ref(i)); + } + } + + #[test] + fn test_debug_null_vector() { + let array = NullVector::new(1024 * 1024); + assert_eq!(format!("{:?}", array), "NullVector(1048576)"); + } + + #[test] + fn test_serialize_json() { + let vector = NullVector::new(3); + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[null,null,null]", + serde_json::to_string(&json_value).unwrap() + ); + } + + #[test] + fn test_null_vector_validity() { + let vector = NullVector::new(5); + assert!(vector.validity().is_all_null()); + assert_eq!(5, vector.null_count()); + } + + #[test] + fn test_null_vector_builder() { + let mut builder = NullType::default().create_mutable_vector(3); + builder.push_value_ref(ValueRef::Null).unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + + let input = NullVector::new(3); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(input); + assert_eq!(expect, vector); + } +} diff --git a/src/datatypes2/src/vectors/operations.rs b/src/datatypes2/src/vectors/operations.rs new file mode 100644 index 0000000000..70ddb4a031 --- /dev/null +++ b/src/datatypes2/src/vectors/operations.rs @@ -0,0 +1,127 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod filter; +mod find_unique; +mod replicate; + +use common_base::BitVec; + +use crate::error::Result; +use crate::types::LogicalPrimitiveType; +use crate::vectors::constant::ConstantVector; +use crate::vectors::{ + BinaryVector, BooleanVector, ListVector, NullVector, PrimitiveVector, StringVector, Vector, + VectorRef, +}; + +/// Vector compute operations. +pub trait VectorOp { + /// Copies each element according `offsets` parameter. + /// - `i-th` element should be copied `offsets[i] - offsets[i - 1]` times + /// - `0-th` element would be copied `offsets[0]` times + /// + /// # Panics + /// Panics if `offsets.len() != self.len()`. + fn replicate(&self, offsets: &[usize]) -> VectorRef; + + /// Mark `i-th` bit of `selected` to `true` if the `i-th` element of `self` is unique, which + /// means there is no elements behind it have same value as it. + /// + /// The caller should ensure + /// 1. the length of `selected` bitmap is equal to `vector.len()`. + /// 2. `vector` and `prev_vector` are sorted. + /// + /// If there are multiple duplicate elements, this function retains the **first** element. + /// The first element is considered as unique if the first element of `self` is different + /// from its previous element, that is the last element of `prev_vector`. + /// + /// # Panics + /// Panics if + /// - `selected.len() < self.len()`. + /// - `prev_vector` and `self` have different data types. 
+ fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>); + + /// Filters the vector, returns elements matching the `filter` (i.e. where the values are true). + /// + /// Note that the nulls of `filter` are interpreted as `false` will lead to these elements being masked out. + fn filter(&self, filter: &BooleanVector) -> Result; +} + +macro_rules! impl_scalar_vector_op { + ($($VectorType: ident),+) => {$( + impl VectorOp for $VectorType { + fn replicate(&self, offsets: &[usize]) -> VectorRef { + replicate::replicate_scalar(self, offsets) + } + + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { + let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap()); + find_unique::find_unique_scalar(self, selected, prev_vector); + } + + fn filter(&self, filter: &BooleanVector) -> Result { + filter::filter_non_constant!(self, $VectorType, filter) + } + } + )+}; +} + +impl_scalar_vector_op!(BinaryVector, BooleanVector, ListVector, StringVector); + +impl VectorOp for PrimitiveVector { + fn replicate(&self, offsets: &[usize]) -> VectorRef { + std::sync::Arc::new(replicate::replicate_primitive(self, offsets)) + } + + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { + let prev_vector = + prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); + find_unique::find_unique_scalar(self, selected, prev_vector); + } + + fn filter(&self, filter: &BooleanVector) -> Result { + filter::filter_non_constant!(self, PrimitiveVector, filter) + } +} + +impl VectorOp for NullVector { + fn replicate(&self, offsets: &[usize]) -> VectorRef { + replicate::replicate_null(self, offsets) + } + + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { + let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); + find_unique::find_unique_null(self, selected, prev_vector); + } + + fn filter(&self, filter: &BooleanVector) -> Result { + 
filter::filter_non_constant!(self, NullVector, filter) + } +} + +impl VectorOp for ConstantVector { + fn replicate(&self, offsets: &[usize]) -> VectorRef { + self.replicate_vector(offsets) + } + + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { + let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); + find_unique::find_unique_constant(self, selected, prev_vector); + } + + fn filter(&self, filter: &BooleanVector) -> Result { + self.filter_vector(filter) + } +} diff --git a/src/datatypes2/src/vectors/operations/filter.rs b/src/datatypes2/src/vectors/operations/filter.rs new file mode 100644 index 0000000000..8368a6afb4 --- /dev/null +++ b/src/datatypes2/src/vectors/operations/filter.rs @@ -0,0 +1,145 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +macro_rules! 
filter_non_constant { + ($vector: expr, $VectorType: ty, $filter: ident) => {{ + use std::sync::Arc; + + use arrow::compute; + use snafu::ResultExt; + + let arrow_array = $vector.as_arrow(); + let filtered = compute::filter(arrow_array, $filter.as_boolean_array()) + .context(crate::error::ArrowComputeSnafu)?; + Ok(Arc::new(<$VectorType>::try_from_arrow_array(filtered)?)) + }}; +} + +pub(crate) use filter_non_constant; + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_time::{Date, DateTime}; + + use crate::scalars::ScalarVector; + use crate::timestamp::{ + TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond, + }; + use crate::types::WrapperType; + use crate::vectors::constant::ConstantVector; + use crate::vectors::{ + BooleanVector, Int32Vector, NullVector, StringVector, VectorOp, VectorRef, + }; + + fn check_filter_primitive(expect: &[i32], input: &[i32], filter: &[bool]) { + let v = Int32Vector::from_slice(&input); + let filter = BooleanVector::from_slice(filter); + let out = v.filter(&filter).unwrap(); + + let expect: VectorRef = Arc::new(Int32Vector::from_slice(&expect)); + assert_eq!(expect, out); + } + + #[test] + fn test_filter_primitive() { + check_filter_primitive(&[], &[], &[]); + check_filter_primitive(&[5], &[5], &[true]); + check_filter_primitive(&[], &[5], &[false]); + check_filter_primitive(&[], &[5, 6], &[false, false]); + check_filter_primitive(&[5, 6], &[5, 6], &[true, true]); + check_filter_primitive(&[], &[5, 6, 7], &[false, false, false]); + check_filter_primitive(&[5], &[5, 6, 7], &[true, false, false]); + check_filter_primitive(&[6], &[5, 6, 7], &[false, true, false]); + check_filter_primitive(&[7], &[5, 6, 7], &[false, false, true]); + check_filter_primitive(&[5, 7], &[5, 6, 7], &[true, false, true]); + } + + fn check_filter_constant(expect_length: usize, input_length: usize, filter: &[bool]) { + let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[123])), input_length); + let filter = 
BooleanVector::from_slice(filter); + let out = v.filter(&filter).unwrap(); + + assert!(out.is_const()); + assert_eq!(expect_length, out.len()); + } + + #[test] + fn test_filter_constant() { + check_filter_constant(0, 0, &[]); + check_filter_constant(1, 1, &[true]); + check_filter_constant(0, 1, &[false]); + check_filter_constant(1, 2, &[false, true]); + check_filter_constant(2, 2, &[true, true]); + check_filter_constant(1, 4, &[false, false, false, true]); + check_filter_constant(2, 4, &[false, true, false, true]); + } + + #[test] + fn test_filter_scalar() { + let v = StringVector::from_slice(&["0", "1", "2", "3"]); + let filter = BooleanVector::from_slice(&[false, true, false, true]); + let out = v.filter(&filter).unwrap(); + + let expect: VectorRef = Arc::new(StringVector::from_slice(&["1", "3"])); + assert_eq!(expect, out); + } + + #[test] + fn test_filter_null() { + let v = NullVector::new(5); + let filter = BooleanVector::from_slice(&[false, true, false, true, true]); + let out = v.filter(&filter).unwrap(); + + let expect: VectorRef = Arc::new(NullVector::new(3)); + assert_eq!(expect, out); + } + + macro_rules! 
impl_filter_date_like_test { + ($VectorType: ident, $ValueType: ident, $method: ident) => {{ + use std::sync::Arc; + + use $crate::vectors::{$VectorType, VectorRef}; + + let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); + let filter = BooleanVector::from_slice(&[false, true, false, true, true]); + let out = v.filter(&filter).unwrap(); + + let expect: VectorRef = Arc::new($VectorType::from_iterator( + [1, 3, 4].into_iter().map($ValueType::$method), + )); + assert_eq!(expect, out); + }}; + } + + #[test] + fn test_filter_date_like() { + impl_filter_date_like_test!(DateVector, Date, new); + impl_filter_date_like_test!(DateTimeVector, DateTime, new); + + impl_filter_date_like_test!(TimestampSecondVector, TimestampSecond, from_native); + impl_filter_date_like_test!( + TimestampMillisecondVector, + TimestampMillisecond, + from_native + ); + impl_filter_date_like_test!( + TimestampMicrosecondVector, + TimestampMicrosecond, + from_native + ); + impl_filter_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from_native); + } +} diff --git a/src/datatypes2/src/vectors/operations/find_unique.rs b/src/datatypes2/src/vectors/operations/find_unique.rs new file mode 100644 index 0000000000..7116a9e90d --- /dev/null +++ b/src/datatypes2/src/vectors/operations/find_unique.rs @@ -0,0 +1,367 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_base::BitVec; + +use crate::scalars::ScalarVector; +use crate::vectors::constant::ConstantVector; +use crate::vectors::{NullVector, Vector}; + +// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as +// selected when it is different from the previous one, and leaves the `selected` unchanged +// in any other case. +pub(crate) fn find_unique_scalar<'a, T: ScalarVector>( + vector: &'a T, + selected: &'a mut BitVec, + prev_vector: Option<&'a T>, +) where + T::RefItem<'a>: PartialEq, +{ + assert!(selected.len() >= vector.len()); + + if vector.is_empty() { + return; + } + + for ((i, current), next) in vector + .iter_data() + .enumerate() + .zip(vector.iter_data().skip(1)) + { + if current != next { + // If next element is a different element, we mark it as selected. + selected.set(i + 1, true); + } + } + + // Marks first element as selected if it is different from previous element, otherwise + // keep selected bitmap unchanged. + let is_first_not_duplicate = prev_vector + .map(|pv| { + if pv.is_empty() { + true + } else { + let last = pv.get_data(pv.len() - 1); + last != vector.get_data(0) + } + }) + .unwrap_or(true); + if is_first_not_duplicate { + selected.set(0, true); + } +} + +pub(crate) fn find_unique_null( + vector: &NullVector, + selected: &mut BitVec, + prev_vector: Option<&NullVector>, +) { + if vector.is_empty() { + return; + } + + let is_first_not_duplicate = prev_vector.map(NullVector::is_empty).unwrap_or(true); + if is_first_not_duplicate { + selected.set(0, true); + } +} + +pub(crate) fn find_unique_constant( + vector: &ConstantVector, + selected: &mut BitVec, + prev_vector: Option<&ConstantVector>, +) { + if vector.is_empty() { + return; + } + + let is_first_not_duplicate = prev_vector + .map(|pv| { + if pv.is_empty() { + true + } else { + vector.get_constant_ref() != pv.get_constant_ref() + } + }) + .unwrap_or(true); + + if is_first_not_duplicate { + selected.set(0, true); + } +} + +#[cfg(test)] 
+mod tests { + use std::sync::Arc; + + use common_time::{Date, DateTime}; + + use super::*; + use crate::timestamp::*; + use crate::vectors::{Int32Vector, StringVector, Vector, VectorOp}; + + fn check_bitmap(expect: &[bool], selected: &BitVec) { + let actual = selected.iter().collect::>(); + assert_eq!(expect, actual); + } + + fn check_find_unique_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) { + check_find_unique_scalar_opt(expect, input.iter().map(|v| Some(*v)), prev); + } + + fn check_find_unique_scalar_opt( + expect: &[bool], + input: impl Iterator>, + prev: Option<&[i32]>, + ) { + let input = Int32Vector::from(input.collect::>()); + let prev = prev.map(Int32Vector::from_slice); + + let mut selected = BitVec::repeat(false, input.len()); + input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); + + check_bitmap(expect, &selected); + } + + #[test] + fn test_find_unique_scalar() { + check_find_unique_scalar(&[], &[], None); + check_find_unique_scalar(&[true], &[1], None); + check_find_unique_scalar(&[true, false], &[1, 1], None); + check_find_unique_scalar(&[true, true], &[1, 2], None); + check_find_unique_scalar(&[true, true, true, true], &[1, 2, 3, 4], None); + check_find_unique_scalar(&[true, false, true, false], &[1, 1, 3, 3], None); + check_find_unique_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None); + + check_find_unique_scalar(&[true], &[5], Some(&[])); + check_find_unique_scalar(&[true], &[5], Some(&[3])); + check_find_unique_scalar(&[false], &[5], Some(&[5])); + check_find_unique_scalar(&[false], &[5], Some(&[4, 5])); + check_find_unique_scalar(&[false, true], &[5, 6], Some(&[4, 5])); + check_find_unique_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5])); + check_find_unique_scalar( + &[false, true, false, true, true], + &[5, 6, 6, 7, 8], + Some(&[4, 5]), + ); + + check_find_unique_scalar_opt( + &[true, true, false, true, false], + [Some(1), Some(2), Some(2), None, None].into_iter(), + None, + ); + } + + 
#[test] + fn test_find_unique_scalar_multi_times_with_prev() { + let prev = Int32Vector::from_slice(&[1]); + + let v1 = Int32Vector::from_slice(&[2, 3, 4]); + let mut selected = BitVec::repeat(false, v1.len()); + v1.find_unique(&mut selected, Some(&prev)); + + // Though element in v2 are the same as prev, but we should still keep them. + let v2 = Int32Vector::from_slice(&[1, 1, 1]); + v2.find_unique(&mut selected, Some(&prev)); + + check_bitmap(&[true, true, true], &selected); + } + + fn new_bitmap(bits: &[bool]) -> BitVec { + BitVec::from_iter(bits) + } + + #[test] + fn test_find_unique_scalar_with_prev() { + let prev = Int32Vector::from_slice(&[1]); + + let mut selected = new_bitmap(&[true, false, true, false]); + let v = Int32Vector::from_slice(&[2, 3, 4, 5]); + v.find_unique(&mut selected, Some(&prev)); + // All elements are different. + check_bitmap(&[true, true, true, true], &selected); + + let mut selected = new_bitmap(&[true, false, true, false]); + let v = Int32Vector::from_slice(&[1, 2, 3, 4]); + v.find_unique(&mut selected, Some(&prev)); + // Though first element is duplicate, but we keep the flag unchanged. + check_bitmap(&[true, true, true, true], &selected); + + // Same case as above, but now `prev` is None. + let mut selected = new_bitmap(&[true, false, true, false]); + let v = Int32Vector::from_slice(&[1, 2, 3, 4]); + v.find_unique(&mut selected, None); + check_bitmap(&[true, true, true, true], &selected); + + // Same case as above, but now `prev` is empty. + let mut selected = new_bitmap(&[true, false, true, false]); + let v = Int32Vector::from_slice(&[1, 2, 3, 4]); + v.find_unique(&mut selected, Some(&Int32Vector::from_slice(&[]))); + check_bitmap(&[true, true, true, true], &selected); + + let mut selected = new_bitmap(&[false, false, false, false]); + let v = Int32Vector::from_slice(&[2, 2, 4, 5]); + v.find_unique(&mut selected, Some(&prev)); + // only v[1] is duplicate. 
+ check_bitmap(&[true, false, true, true], &selected); + } + + fn check_find_unique_null(len: usize) { + let input = NullVector::new(len); + let mut selected = BitVec::repeat(false, input.len()); + input.find_unique(&mut selected, None); + + let mut expect = vec![false; len]; + if !expect.is_empty() { + expect[0] = true; + } + check_bitmap(&expect, &selected); + + let mut selected = BitVec::repeat(false, input.len()); + let prev = Some(NullVector::new(1)); + input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); + let expect = vec![false; len]; + check_bitmap(&expect, &selected); + } + + #[test] + fn test_find_unique_null() { + for len in 0..5 { + check_find_unique_null(len); + } + } + + #[test] + fn test_find_unique_null_with_prev() { + let prev = NullVector::new(1); + + // Keep flags unchanged. + let mut selected = new_bitmap(&[true, false, true, false]); + let v = NullVector::new(4); + v.find_unique(&mut selected, Some(&prev)); + check_bitmap(&[true, false, true, false], &selected); + + // Keep flags unchanged. + let mut selected = new_bitmap(&[false, false, true, false]); + v.find_unique(&mut selected, Some(&prev)); + check_bitmap(&[false, false, true, false], &selected); + + // Prev is None, select first element. + let mut selected = new_bitmap(&[false, false, true, false]); + v.find_unique(&mut selected, None); + check_bitmap(&[true, false, true, false], &selected); + + // Prev is empty, select first element. 
+ let mut selected = new_bitmap(&[false, false, true, false]); + v.find_unique(&mut selected, Some(&NullVector::new(0))); + check_bitmap(&[true, false, true, false], &selected); + } + + fn check_find_unique_constant(len: usize) { + let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len); + let mut selected = BitVec::repeat(false, len); + input.find_unique(&mut selected, None); + + let mut expect = vec![false; len]; + if !expect.is_empty() { + expect[0] = true; + } + check_bitmap(&expect, &selected); + + let mut selected = BitVec::repeat(false, len); + let prev = Some(ConstantVector::new( + Arc::new(Int32Vector::from_slice(&[8])), + 1, + )); + input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); + let expect = vec![false; len]; + check_bitmap(&expect, &selected); + } + + #[test] + fn test_find_unique_constant() { + for len in 0..5 { + check_find_unique_constant(len); + } + } + + #[test] + fn test_find_unique_constant_with_prev() { + let prev = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 1); + + // Keep flags unchanged. + let mut selected = new_bitmap(&[true, false, true, false]); + let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 4); + v.find_unique(&mut selected, Some(&prev)); + check_bitmap(&[true, false, true, false], &selected); + + // Keep flags unchanged. + let mut selected = new_bitmap(&[false, false, true, false]); + v.find_unique(&mut selected, Some(&prev)); + check_bitmap(&[false, false, true, false], &selected); + + // Prev is None, select first element. + let mut selected = new_bitmap(&[false, false, true, false]); + v.find_unique(&mut selected, None); + check_bitmap(&[true, false, true, false], &selected); + + // Prev is empty, select first element. 
+ let mut selected = new_bitmap(&[false, false, true, false]); + v.find_unique( + &mut selected, + Some(&ConstantVector::new( + Arc::new(Int32Vector::from_slice(&[1])), + 0, + )), + ); + check_bitmap(&[true, false, true, false], &selected); + + // Different constant vector. + let mut selected = new_bitmap(&[false, false, true, false]); + let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[2])), 4); + v.find_unique(&mut selected, Some(&prev)); + check_bitmap(&[true, false, true, false], &selected); + } + + #[test] + fn test_find_unique_string() { + let input = StringVector::from_slice(&["a", "a", "b", "c"]); + let mut selected = BitVec::repeat(false, 4); + input.find_unique(&mut selected, None); + let expect = vec![true, false, true, true]; + check_bitmap(&expect, &selected); + } + + macro_rules! impl_find_unique_date_like_test { + ($VectorType: ident, $ValueType: ident, $method: ident) => {{ + use $crate::vectors::$VectorType; + + let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); + let mut selected = BitVec::repeat(false, 4); + v.find_unique(&mut selected, None); + let expect = vec![true, false, true, true]; + check_bitmap(&expect, &selected); + }}; + } + + #[test] + fn test_find_unique_date_like() { + impl_find_unique_date_like_test!(DateVector, Date, new); + impl_find_unique_date_like_test!(DateTimeVector, DateTime, new); + impl_find_unique_date_like_test!(TimestampSecondVector, TimestampSecond, from); + impl_find_unique_date_like_test!(TimestampMillisecondVector, TimestampMillisecond, from); + impl_find_unique_date_like_test!(TimestampMicrosecondVector, TimestampMicrosecond, from); + impl_find_unique_date_like_test!(TimestampNanosecondVector, TimestampNanosecond, from); + } +} diff --git a/src/datatypes2/src/vectors/operations/replicate.rs b/src/datatypes2/src/vectors/operations/replicate.rs new file mode 100644 index 0000000000..8216517fc6 --- /dev/null +++ b/src/datatypes2/src/vectors/operations/replicate.rs @@ 
-0,0 +1,170 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::prelude::*; +pub(crate) use crate::vectors::null::replicate_null; +pub(crate) use crate::vectors::primitive::replicate_primitive; + +pub(crate) fn replicate_scalar(c: &C, offsets: &[usize]) -> VectorRef { + assert_eq!(offsets.len(), c.len()); + + if offsets.is_empty() { + return c.slice(0, 0); + } + let mut builder = <::Builder>::with_capacity(c.len()); + + let mut previous_offset = 0; + for (i, offset) in offsets.iter().enumerate() { + let data = c.get_data(i); + for _ in previous_offset..*offset { + builder.push(data); + } + previous_offset = *offset; + } + builder.to_vector() +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_time::timestamp::TimeUnit; + use common_time::{Date, DateTime, Timestamp}; + use paste::paste; + + use super::*; + use crate::vectors::constant::ConstantVector; + use crate::vectors::{Int32Vector, NullVector, StringVector, VectorOp}; + + #[test] + fn test_replicate_primitive() { + let v = Int32Vector::from_iterator(0..5); + let offsets = [0, 1, 2, 3, 4]; + + let v = v.replicate(&offsets); + assert_eq!(4, v.len()); + + for i in 0..4 { + assert_eq!(Value::Int32(i as i32 + 1), v.get(i)); + } + } + + #[test] + fn test_replicate_nullable_primitive() { + let v = Int32Vector::from(vec![None, Some(1), None, Some(2)]); + let offsets = [2, 4, 6, 8]; + let v = v.replicate(&offsets); + assert_eq!(8, v.len()); + + let 
expect: VectorRef = Arc::new(Int32Vector::from(vec![ + None, + None, + Some(1), + Some(1), + None, + None, + Some(2), + Some(2), + ])); + assert_eq!(expect, v); + } + + #[test] + fn test_replicate_scalar() { + let v = StringVector::from_slice(&["0", "1", "2", "3"]); + let offsets = [1, 3, 5, 6]; + + let v = v.replicate(&offsets); + assert_eq!(6, v.len()); + + let expect: VectorRef = Arc::new(StringVector::from_slice(&["0", "1", "1", "2", "2", "3"])); + assert_eq!(expect, v); + } + + #[test] + fn test_replicate_constant() { + let v = Arc::new(StringVector::from_slice(&["hello"])); + let cv = ConstantVector::new(v.clone(), 2); + let offsets = [1, 4]; + + let cv = cv.replicate(&offsets); + assert_eq!(4, cv.len()); + + let expect: VectorRef = Arc::new(ConstantVector::new(v, 4)); + assert_eq!(expect, cv); + } + + #[test] + fn test_replicate_null() { + let v = NullVector::new(0); + let offsets = []; + let v = v.replicate(&offsets); + assert!(v.is_empty()); + + let v = NullVector::new(3); + let offsets = [1, 3, 5]; + + let v = v.replicate(&offsets); + assert_eq!(5, v.len()); + } + + macro_rules! impl_replicate_date_like_test { + ($VectorType: ident, $ValueType: ident, $method: ident) => {{ + use $crate::vectors::$VectorType; + + let v = $VectorType::from_iterator((0..5).map($ValueType::$method)); + let offsets = [0, 1, 2, 3, 4]; + + let v = v.replicate(&offsets); + assert_eq!(4, v.len()); + + for i in 0..4 { + assert_eq!( + Value::$ValueType($ValueType::$method((i as i32 + 1).into())), + v.get(i) + ); + } + }}; + } + + macro_rules! 
impl_replicate_timestamp_test { + ($unit: ident) => {{ + paste!{ + use $crate::vectors::[]; + use $crate::timestamp::[]; + let v = []::from_iterator((0..5).map([]::from)); + let offsets = [0, 1, 2, 3, 4]; + let v = v.replicate(&offsets); + assert_eq!(4, v.len()); + for i in 0..4 { + assert_eq!( + Value::Timestamp(Timestamp::new(i as i64 + 1, TimeUnit::$unit)), + v.get(i) + ); + } + } + }}; + } + + #[test] + fn test_replicate_date_like() { + impl_replicate_date_like_test!(DateVector, Date, new); + impl_replicate_date_like_test!(DateTimeVector, DateTime, new); + + impl_replicate_timestamp_test!(Second); + impl_replicate_timestamp_test!(Millisecond); + impl_replicate_timestamp_test!(Microsecond); + impl_replicate_timestamp_test!(Nanosecond); + } +} diff --git a/src/datatypes2/src/vectors/primitive.rs b/src/datatypes2/src/vectors/primitive.rs new file mode 100644 index 0000000000..7829c31731 --- /dev/null +++ b/src/datatypes2/src/vectors/primitive.rs @@ -0,0 +1,552 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef, PrimitiveArray, PrimitiveBuilder, +}; +use serde_json::Value as JsonValue; +use snafu::OptionExt; + +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{Scalar, ScalarRef, ScalarVector, ScalarVectorBuilder}; +use crate::serialize::Serializable; +use crate::types::{ + Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LogicalPrimitiveType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, WrapperType, +}; +use crate::value::{Value, ValueRef}; +use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; + +pub type UInt8Vector = PrimitiveVector; +pub type UInt16Vector = PrimitiveVector; +pub type UInt32Vector = PrimitiveVector; +pub type UInt64Vector = PrimitiveVector; + +pub type Int8Vector = PrimitiveVector; +pub type Int16Vector = PrimitiveVector; +pub type Int32Vector = PrimitiveVector; +pub type Int64Vector = PrimitiveVector; + +pub type Float32Vector = PrimitiveVector; +pub type Float64Vector = PrimitiveVector; + +/// Vector for primitive data types. +pub struct PrimitiveVector { + array: PrimitiveArray, +} + +impl PrimitiveVector { + pub fn new(array: PrimitiveArray) -> Self { + Self { array } + } + + pub fn try_from_arrow_array(array: impl AsRef) -> Result { + let data = array + .as_ref() + .as_any() + .downcast_ref::>() + .with_context(|| error::ConversionSnafu { + from: format!("{:?}", array.as_ref().data_type()), + })? 
+ .data() + .clone(); + let concrete_array = PrimitiveArray::::from(data); + Ok(Self::new(concrete_array)) + } + + pub fn from_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied(); + Self { + array: PrimitiveArray::from_iter_values(iter), + } + } + + pub fn from_wrapper_slice>(slice: P) -> Self { + let iter = slice.as_ref().iter().copied().map(WrapperType::into_native); + Self { + array: PrimitiveArray::from_iter_values(iter), + } + } + + pub fn from_vec(array: Vec) -> Self { + Self { + array: PrimitiveArray::from_iter_values(array), + } + } + + pub fn from_values>(iter: I) -> Self { + Self { + array: PrimitiveArray::from_iter_values(iter), + } + } + + pub(crate) fn as_arrow(&self) -> &PrimitiveArray { + &self.array + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: PrimitiveArray::from(data), + } + } + + // To distinguish with `Vector::slice()`. + fn get_slice(&self, offset: usize, length: usize) -> Self { + let data = self.array.data().slice(offset, length); + Self::from_array_data(data) + } +} + +impl Vector for PrimitiveVector { + fn data_type(&self) -> ConcreteDataType { + T::build_data_type() + } + + fn vector_type_name(&self) -> String { + format!("{}Vector", T::type_name()) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let data = self.to_array_data(); + Arc::new(PrimitiveArray::::from(data)) + } + + fn to_boxed_arrow_array(&self) -> Box { + let data = self.to_array_data(); + Box::new(PrimitiveArray::::from(data)) + } + + fn validity(&self) -> Validity { + vectors::impl_validity_for_vector!(self.array) + } + + fn memory_size(&self) -> usize { + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, 
offset: usize, length: usize) -> VectorRef { + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) + } + + fn get(&self, index: usize) -> Value { + if self.array.is_valid(index) { + // Safety: The index have been checked by `is_valid()`. + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() + } else { + Value::Null + } + } + + fn get_ref(&self, index: usize) -> ValueRef { + if self.array.is_valid(index) { + // Safety: The index have been checked by `is_valid()`. + let wrapper = unsafe { T::Wrapper::from_native(self.array.value_unchecked(index)) }; + wrapper.into() + } else { + ValueRef::Null + } + } +} + +impl fmt::Debug for PrimitiveVector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("PrimitiveVector") + .field("array", &self.array) + .finish() + } +} + +impl From> for PrimitiveVector { + fn from(array: PrimitiveArray) -> Self { + Self { array } + } +} + +impl From>> for PrimitiveVector { + fn from(v: Vec>) -> Self { + Self { + array: PrimitiveArray::from_iter(v), + } + } +} + +pub struct PrimitiveIter<'a, T: LogicalPrimitiveType> { + iter: ArrayIter<&'a PrimitiveArray>, +} + +impl<'a, T: LogicalPrimitiveType> Iterator for PrimitiveIter<'a, T> { + type Item = Option; + + fn next(&mut self) -> Option> { + self.iter + .next() + .map(|item| item.map(T::Wrapper::from_native)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +impl ScalarVector for PrimitiveVector { + type OwnedItem = T::Wrapper; + type RefItem<'a> = T::Wrapper; + type Iter<'a> = PrimitiveIter<'a, T>; + type Builder = PrimitiveVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + if self.array.is_valid(idx) { + Some(T::Wrapper::from_native(self.array.value(idx))) + } else { + None + } + } + + fn iter_data(&self) -> Self::Iter<'_> { + PrimitiveIter { + iter: self.array.iter(), + } + } +} + +impl Serializable for PrimitiveVector { + fn 
serialize_to_json(&self) -> Result> { + let res = self + .iter_data() + .map(|v| match v { + None => serde_json::Value::Null, + // use WrapperType's Into bound instead of + // serde_json::to_value to facilitate customized serialization + // for WrapperType + Some(v) => v.into(), + }) + .collect::>(); + Ok(res) + } +} + +impl PartialEq for PrimitiveVector { + fn eq(&self, other: &PrimitiveVector) -> bool { + self.array == other.array + } +} + +pub type UInt8VectorBuilder = PrimitiveVectorBuilder; +pub type UInt16VectorBuilder = PrimitiveVectorBuilder; +pub type UInt32VectorBuilder = PrimitiveVectorBuilder; +pub type UInt64VectorBuilder = PrimitiveVectorBuilder; + +pub type Int8VectorBuilder = PrimitiveVectorBuilder; +pub type Int16VectorBuilder = PrimitiveVectorBuilder; +pub type Int32VectorBuilder = PrimitiveVectorBuilder; +pub type Int64VectorBuilder = PrimitiveVectorBuilder; + +pub type Float32VectorBuilder = PrimitiveVectorBuilder; +pub type Float64VectorBuilder = PrimitiveVectorBuilder; + +/// Builder to build a primitive vector. +pub struct PrimitiveVectorBuilder { + mutable_array: PrimitiveBuilder, +} + +impl MutableVector for PrimitiveVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + T::build_data_type() + } + + fn len(&self) -> usize { + self.mutable_array.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + let primitive = T::cast_value_ref(value)?; + match primitive { + Some(v) => self.mutable_array.append_value(v.into_native()), + None => self.mutable_array.append_null(), + } + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + let primitive = T::cast_vector(vector)?; + // Slice the underlying array to avoid creating a new Arc. 
+ let slice = primitive.get_slice(offset, length); + for v in slice.iter_data() { + self.push(v); + } + Ok(()) + } +} + +impl ScalarVectorBuilder for PrimitiveVectorBuilder +where + T: LogicalPrimitiveType, + T::Wrapper: Scalar>, + for<'a> T::Wrapper: ScalarRef<'a, ScalarType = T::Wrapper>, + for<'a> T::Wrapper: Scalar = T::Wrapper>, +{ + type VectorType = PrimitiveVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + mutable_array: PrimitiveBuilder::with_capacity(capacity), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + self.mutable_array + .append_option(value.map(|v| v.into_native())); + } + + fn finish(&mut self) -> Self::VectorType { + PrimitiveVector { + array: self.mutable_array.finish(), + } + } +} + +pub(crate) fn replicate_primitive( + vector: &PrimitiveVector, + offsets: &[usize], +) -> PrimitiveVector { + assert_eq!(offsets.len(), vector.len()); + + if offsets.is_empty() { + return vector.get_slice(0, 0); + } + + let mut builder = PrimitiveVectorBuilder::::with_capacity(*offsets.last().unwrap() as usize); + + let mut previous_offset = 0; + + for (offset, value) in offsets.iter().zip(vector.array.iter()) { + let repeat_times = *offset - previous_offset; + match value { + Some(data) => { + unsafe { + // Safety: std::iter::Repeat and std::iter::Take implement TrustedLen. 
+ builder + .mutable_array + .append_trusted_len_iter(std::iter::repeat(data).take(repeat_times)); + } + } + None => { + builder.mutable_array.append_nulls(repeat_times); + } + } + previous_offset = *offset; + } + builder.finish() +} + +#[cfg(test)] +mod tests { + use arrow::array::Int32Array; + use arrow::datatypes::DataType as ArrowDataType; + use serde_json; + + use super::*; + use crate::data_type::DataType; + use crate::serialize::Serializable; + use crate::types::Int64Type; + + fn check_vec(v: Int32Vector) { + assert_eq!(4, v.len()); + assert_eq!("Int32Vector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_valid()); + assert!(!v.only_null()); + + for i in 0..4 { + assert!(!v.is_null(i)); + assert_eq!(Value::Int32(i as i32 + 1), v.get(i)); + assert_eq!(ValueRef::Int32(i as i32 + 1), v.get_ref(i)); + } + + let json_value = v.serialize_to_json().unwrap(); + assert_eq!("[1,2,3,4]", serde_json::to_string(&json_value).unwrap(),); + + let arrow_arr = v.to_arrow_array(); + assert_eq!(4, arrow_arr.len()); + assert_eq!(&ArrowDataType::Int32, arrow_arr.data_type()); + } + + #[test] + fn test_from_values() { + let v = Int32Vector::from_values(vec![1, 2, 3, 4]); + check_vec(v); + } + + #[test] + fn test_from_vec() { + let v = Int32Vector::from_vec(vec![1, 2, 3, 4]); + check_vec(v); + } + + #[test] + fn test_from_slice() { + let v = Int32Vector::from_slice(vec![1, 2, 3, 4]); + check_vec(v); + } + + #[test] + fn test_serialize_primitive_vector_with_null_to_json() { + let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; + let mut builder = Int32VectorBuilder::with_capacity(input.len()); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + + let json_value = vector.serialize_to_json().unwrap(); + assert_eq!( + "[1,2,null,4,null]", + serde_json::to_string(&json_value).unwrap(), + ); + } + + #[test] + fn test_from_arrow_array() { + let arrow_array = Int32Array::from(vec![1, 2, 3, 4]); + let v = 
Int32Vector::from(arrow_array); + check_vec(v); + } + + #[test] + fn test_primitive_vector_build_get() { + let input = [Some(1i32), Some(2i32), None, Some(4i32), None]; + let mut builder = Int32VectorBuilder::with_capacity(input.len()); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + assert_eq!(input.len(), vector.len()); + + for (i, v) in input.into_iter().enumerate() { + assert_eq!(v, vector.get_data(i)); + assert_eq!(Value::from(v), vector.get(i)); + } + + let res: Vec<_> = vector.iter_data().collect(); + assert_eq!(input, &res[..]); + } + + #[test] + fn test_primitive_vector_validity() { + let input = [Some(1i32), Some(2i32), None, None]; + let mut builder = Int32VectorBuilder::with_capacity(input.len()); + for v in input { + builder.push(v); + } + let vector = builder.finish(); + assert_eq!(2, vector.null_count()); + let validity = vector.validity(); + assert_eq!(2, validity.null_count()); + assert!(!validity.is_set(2)); + assert!(!validity.is_set(3)); + + let vector = Int32Vector::from_slice(vec![1, 2, 3, 4]); + assert_eq!(0, vector.null_count()); + assert!(vector.validity().is_all_valid()); + } + + #[test] + fn test_memory_size() { + let v = Int32Vector::from_slice((0..5).collect::>()); + assert_eq!(64, v.memory_size()); + let v = Int64Vector::from(vec![Some(0i64), Some(1i64), Some(2i64), None, None]); + assert_eq!(128, v.memory_size()); + } + + #[test] + fn test_primitive_vector_builder() { + let mut builder = Int64Type::default().create_mutable_vector(3); + builder.push_value_ref(ValueRef::Int64(123)).unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + + let input = Int64Vector::from_slice(&[7, 8, 9]); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(Int64Vector::from_slice(&[123, 8, 9])); + assert_eq!(expect, vector); + } + + #[test] + fn 
test_from_wrapper_slice() { + macro_rules! test_from_wrapper_slice { + ($vec: ident, $ty: ident) => { + let from_wrapper_slice = $vec::from_wrapper_slice(&[ + $ty::from_native($ty::MAX), + $ty::from_native($ty::MIN), + ]); + let from_slice = $vec::from_slice(&[$ty::MAX, $ty::MIN]); + assert_eq!(from_wrapper_slice, from_slice); + }; + } + + test_from_wrapper_slice!(UInt8Vector, u8); + test_from_wrapper_slice!(Int8Vector, i8); + test_from_wrapper_slice!(UInt16Vector, u16); + test_from_wrapper_slice!(Int16Vector, i16); + test_from_wrapper_slice!(UInt32Vector, u32); + test_from_wrapper_slice!(Int32Vector, i32); + test_from_wrapper_slice!(UInt64Vector, u64); + test_from_wrapper_slice!(Int64Vector, i64); + test_from_wrapper_slice!(Float32Vector, f32); + test_from_wrapper_slice!(Float64Vector, f64); + } +} diff --git a/src/datatypes2/src/vectors/string.rs b/src/datatypes2/src/vectors/string.rs new file mode 100644 index 0000000000..252116b3b2 --- /dev/null +++ b/src/datatypes2/src/vectors/string.rs @@ -0,0 +1,370 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{Array, ArrayBuilder, ArrayData, ArrayIter, ArrayRef}; +use snafu::ResultExt; + +use crate::arrow_array::{MutableStringArray, StringArray}; +use crate::data_type::ConcreteDataType; +use crate::error::{self, Result}; +use crate::scalars::{ScalarVector, ScalarVectorBuilder}; +use crate::serialize::Serializable; +use crate::value::{Value, ValueRef}; +use crate::vectors::{self, MutableVector, Validity, Vector, VectorRef}; + +/// Vector of strings. +#[derive(Debug, PartialEq)] +pub struct StringVector { + array: StringArray, +} + +impl StringVector { + pub(crate) fn as_arrow(&self) -> &dyn Array { + &self.array + } + + fn to_array_data(&self) -> ArrayData { + self.array.data().clone() + } + + fn from_array_data(data: ArrayData) -> Self { + Self { + array: StringArray::from(data), + } + } +} + +impl From for StringVector { + fn from(array: StringArray) -> Self { + Self { array } + } +} + +impl From>> for StringVector { + fn from(data: Vec>) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From>> for StringVector { + fn from(data: Vec>) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From<&[Option]> for StringVector { + fn from(data: &[Option]) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From<&[Option<&str>]> for StringVector { + fn from(data: &[Option<&str>]) -> Self { + Self { + array: StringArray::from_iter(data), + } + } +} + +impl From> for StringVector { + fn from(data: Vec) -> Self { + Self { + array: StringArray::from_iter(data.into_iter().map(Some)), + } + } +} + +impl From> for StringVector { + fn from(data: Vec<&str>) -> Self { + Self { + array: StringArray::from_iter(data.into_iter().map(Some)), + } + } +} + +impl Vector for StringVector { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::string_datatype() + } + + fn vector_type_name(&self) -> String { + "StringVector".to_string() + } + + fn 
as_any(&self) -> &dyn Any { + self + } + + fn len(&self) -> usize { + self.array.len() + } + + fn to_arrow_array(&self) -> ArrayRef { + let data = self.to_array_data(); + Arc::new(StringArray::from(data)) + } + + fn to_boxed_arrow_array(&self) -> Box { + let data = self.to_array_data(); + Box::new(StringArray::from(data)) + } + + fn validity(&self) -> Validity { + vectors::impl_validity_for_vector!(self.array) + } + + fn memory_size(&self) -> usize { + self.array.get_buffer_memory_size() + } + + fn null_count(&self) -> usize { + self.array.null_count() + } + + fn is_null(&self, row: usize) -> bool { + self.array.is_null(row) + } + + fn slice(&self, offset: usize, length: usize) -> VectorRef { + let data = self.array.data().slice(offset, length); + Arc::new(Self::from_array_data(data)) + } + + fn get(&self, index: usize) -> Value { + vectors::impl_get_for_vector!(self.array, index) + } + + fn get_ref(&self, index: usize) -> ValueRef { + vectors::impl_get_ref_for_vector!(self.array, index) + } +} + +impl ScalarVector for StringVector { + type OwnedItem = String; + type RefItem<'a> = &'a str; + type Iter<'a> = ArrayIter<&'a StringArray>; + type Builder = StringVectorBuilder; + + fn get_data(&self, idx: usize) -> Option> { + if self.array.is_valid(idx) { + Some(self.array.value(idx)) + } else { + None + } + } + + fn iter_data(&self) -> Self::Iter<'_> { + self.array.iter() + } +} + +pub struct StringVectorBuilder { + mutable_array: MutableStringArray, +} + +impl MutableVector for StringVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::string_datatype() + } + + fn len(&self) -> usize { + self.mutable_array.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + Arc::new(self.finish()) + } + + fn push_value_ref(&mut self, value: ValueRef) -> Result<()> { + match value.as_string()? 
{ + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } + Ok(()) + } + + fn extend_slice_of(&mut self, vector: &dyn Vector, offset: usize, length: usize) -> Result<()> { + vectors::impl_extend_for_builder!(self, vector, StringVector, offset, length) + } +} + +impl ScalarVectorBuilder for StringVectorBuilder { + type VectorType = StringVector; + + fn with_capacity(capacity: usize) -> Self { + Self { + mutable_array: MutableStringArray::with_capacity(capacity, 0), + } + } + + fn push(&mut self, value: Option<::RefItem<'_>>) { + match value { + Some(v) => self.mutable_array.append_value(v), + None => self.mutable_array.append_null(), + } + } + + fn finish(&mut self) -> Self::VectorType { + StringVector { + array: self.mutable_array.finish(), + } + } +} + +impl Serializable for StringVector { + fn serialize_to_json(&self) -> Result> { + self.iter_data() + .map(serde_json::to_value) + .collect::>() + .context(error::SerializeSnafu) + } +} + +vectors::impl_try_from_arrow_array_for_vector!(StringArray, StringVector); + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType; + + use super::*; + + #[test] + fn test_string_vector_build_get() { + let mut builder = StringVectorBuilder::with_capacity(4); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let vector = builder.finish(); + + assert_eq!(Some("hello"), vector.get_data(0)); + assert_eq!(None, vector.get_data(1)); + assert_eq!(Some("world"), vector.get_data(2)); + + // Get out of bound + assert!(vector.try_get(3).is_err()); + + assert_eq!(Value::String("hello".into()), vector.get(0)); + assert_eq!(Value::Null, vector.get(1)); + assert_eq!(Value::String("world".into()), vector.get(2)); + + let mut iter = vector.iter_data(); + assert_eq!("hello", iter.next().unwrap().unwrap()); + assert_eq!(None, iter.next().unwrap()); + assert_eq!("world", iter.next().unwrap().unwrap()); + assert_eq!(None, iter.next()); + } + + #[test] + fn 
test_string_vector_builder() { + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push_value_ref(ValueRef::String("hello")).unwrap(); + assert!(builder.push_value_ref(ValueRef::Int32(123)).is_err()); + + let input = StringVector::from_slice(&["world", "one", "two"]); + builder.extend_slice_of(&input, 1, 2).unwrap(); + assert!(builder + .extend_slice_of(&crate::vectors::Int32Vector::from_slice(&[13]), 0, 1) + .is_err()); + let vector = builder.to_vector(); + + let expect: VectorRef = Arc::new(StringVector::from_slice(&["hello", "one", "two"])); + assert_eq!(expect, vector); + } + + #[test] + fn test_string_vector_misc() { + let strs = vec!["hello", "greptime", "rust"]; + let v = StringVector::from(strs.clone()); + assert_eq!(3, v.len()); + assert_eq!("StringVector", v.vector_type_name()); + assert!(!v.is_const()); + assert!(v.validity().is_all_valid()); + assert!(!v.only_null()); + assert_eq!(128, v.memory_size()); + + for (i, s) in strs.iter().enumerate() { + assert_eq!(Value::from(*s), v.get(i)); + assert_eq!(ValueRef::from(*s), v.get_ref(i)); + assert_eq!(Value::from(*s), v.try_get(i).unwrap()); + } + + let arrow_arr = v.to_arrow_array(); + assert_eq!(3, arrow_arr.len()); + assert_eq!(&DataType::Utf8, arrow_arr.data_type()); + } + + #[test] + fn test_serialize_string_vector() { + let mut builder = StringVectorBuilder::with_capacity(3); + builder.push(Some("hello")); + builder.push(None); + builder.push(Some("world")); + let string_vector = builder.finish(); + let serialized = + serde_json::to_string(&string_vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["hello",null,"world"]"#, serialized); + } + + #[test] + fn test_from_arrow_array() { + let mut builder = MutableStringArray::new(); + builder.append_option(Some("A")); + builder.append_option(Some("B")); + builder.append_null(); + builder.append_option(Some("D")); + let string_array: StringArray = builder.finish(); + let vector = StringVector::from(string_array); + assert_eq!( + 
r#"["A","B",null,"D"]"#, + serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(), + ); + } + + #[test] + fn test_from_non_option_string() { + let nul = String::from_utf8(vec![0]).unwrap(); + let corpus = vec!["😅😅😅", "😍😍😍😍", "🥵🥵", nul.as_str()]; + let vector = StringVector::from(corpus); + let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["😅😅😅","😍😍😍😍","🥵🥵","\u0000"]"#, serialized); + + let corpus = vec![ + "🀀🀀🀀".to_string(), + "🀁🀁🀁".to_string(), + "🀂🀂🀂".to_string(), + "🀃🀃🀃".to_string(), + "🀆🀆".to_string(), + ]; + let vector = StringVector::from(corpus); + let serialized = serde_json::to_string(&vector.serialize_to_json().unwrap()).unwrap(); + assert_eq!(r#"["🀀🀀🀀","🀁🀁🀁","🀂🀂🀂","🀃🀃🀃","🀆🀆"]"#, serialized); + } +} diff --git a/src/datatypes2/src/vectors/timestamp.rs b/src/datatypes2/src/vectors/timestamp.rs new file mode 100644 index 0000000000..5d9f7f2ed1 --- /dev/null +++ b/src/datatypes2/src/vectors/timestamp.rs @@ -0,0 +1,31 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::types::{ + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, +}; +use crate::vectors::{PrimitiveVector, PrimitiveVectorBuilder}; + +pub type TimestampSecondVector = PrimitiveVector; +pub type TimestampSecondVectorBuilder = PrimitiveVectorBuilder; + +pub type TimestampMillisecondVector = PrimitiveVector; +pub type TimestampMillisecondVectorBuilder = PrimitiveVectorBuilder; + +pub type TimestampMicrosecondVector = PrimitiveVector; +pub type TimestampMicrosecondVectorBuilder = PrimitiveVectorBuilder; + +pub type TimestampNanosecondVector = PrimitiveVector; +pub type TimestampNanosecondVectorBuilder = PrimitiveVectorBuilder; diff --git a/src/datatypes2/src/vectors/validity.rs b/src/datatypes2/src/vectors/validity.rs new file mode 100644 index 0000000000..01c7faa789 --- /dev/null +++ b/src/datatypes2/src/vectors/validity.rs @@ -0,0 +1,159 @@ +// Copyright 2022 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow::array::ArrayData; +use arrow::bitmap::Bitmap; + +#[derive(Debug, PartialEq)] +enum ValidityKind<'a> { + /// Whether the array slot is valid or not (null). + Slots { + bitmap: &'a Bitmap, + len: usize, + null_count: usize, + }, + /// All slots are valid. + AllValid { len: usize }, + /// All slots are null. + AllNull { len: usize }, +} + +/// Validity of a vector. 
+#[derive(Debug, PartialEq)] +pub struct Validity<'a> { + kind: ValidityKind<'a>, +} + +impl<'a> Validity<'a> { + /// Creates a `Validity` from [`ArrayData`]. + pub fn from_array_data(data: &'a ArrayData) -> Validity<'a> { + match data.null_bitmap() { + Some(bitmap) => Validity { + kind: ValidityKind::Slots { + bitmap, + len: data.len(), + null_count: data.null_count(), + }, + }, + None => Validity::all_valid(data.len()), + } + } + + /// Returns `Validity` that all elements are valid. + pub fn all_valid(len: usize) -> Validity<'a> { + Validity { + kind: ValidityKind::AllValid { len }, + } + } + + /// Returns `Validity` that all elements are null. + pub fn all_null(len: usize) -> Validity<'a> { + Validity { + kind: ValidityKind::AllNull { len }, + } + } + + /// Returns whether `i-th` bit is set. + pub fn is_set(&self, i: usize) -> bool { + match self.kind { + ValidityKind::Slots { bitmap, .. } => bitmap.is_set(i), + ValidityKind::AllValid { len } => i < len, + ValidityKind::AllNull { .. } => false, + } + } + + /// Returns true if all bits are null. + pub fn is_all_null(&self) -> bool { + match self.kind { + ValidityKind::Slots { + len, null_count, .. + } => len == null_count, + ValidityKind::AllValid { .. } => false, + ValidityKind::AllNull { .. } => true, + } + } + + /// Returns true if all bits are valid. + pub fn is_all_valid(&self) -> bool { + match self.kind { + ValidityKind::Slots { null_count, .. } => null_count == 0, + ValidityKind::AllValid { .. } => true, + ValidityKind::AllNull { .. } => false, + } + } + + /// The number of null slots on this [`Vector`]. + pub fn null_count(&self) -> usize { + match self.kind { + ValidityKind::Slots { null_count, .. } => null_count, + ValidityKind::AllValid { .. 
} => 0, + ValidityKind::AllNull { len } => len, + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, Int32Array}; + + use super::*; + + #[test] + fn test_all_valid() { + let validity = Validity::all_valid(5); + assert!(validity.is_all_valid()); + assert!(!validity.is_all_null()); + assert_eq!(0, validity.null_count()); + for i in 0..5 { + assert!(validity.is_set(i)); + } + assert!(!validity.is_set(5)); + } + + #[test] + fn test_all_null() { + let validity = Validity::all_null(5); + assert!(validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert_eq!(5, validity.null_count()); + for i in 0..5 { + assert!(!validity.is_set(i)); + } + assert!(!validity.is_set(5)); + } + + #[test] + fn test_from_array_data() { + let array = Int32Array::from_iter([None, Some(1), None]); + let validity = Validity::from_array_data(array.data()); + assert_eq!(2, validity.null_count()); + assert!(!validity.is_set(0)); + assert!(validity.is_set(1)); + assert!(!validity.is_set(2)); + assert!(!validity.is_all_null()); + assert!(!validity.is_all_valid()); + + let array = Int32Array::from_iter([None, None]); + let validity = Validity::from_array_data(array.data()); + assert!(validity.is_all_null()); + assert!(!validity.is_all_valid()); + assert_eq!(2, validity.null_count()); + + let array = Int32Array::from_iter_values([1, 2]); + let validity = Validity::from_array_data(array.data()); + assert!(!validity.is_all_null()); + assert!(validity.is_all_valid()); + assert_eq!(0, validity.null_count()); + } +}